From 93c85f542b8d6fad2f1313444b96af2252f8be57 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Wed, 25 Mar 2026 21:00:00 +0100 Subject: [PATCH 01/60] feat(core): eval-aware types, events, and DI fixes - EvalPayload, EvalScoreEntry on TestResult for eval case results - EVAL_SUITE_END event emitted by core runner - is_eval flag on TestRegistration/TestItem - KindFilterPlugin for protest run vs protest eval - get_type_hints_compat: PEP 563 + TYPE_CHECKING support in all DI sites - Async fixture teardown on same event loop (no more loop mismatch) - Fixture resolution time excluded from test duration - Log records captured on TestResult for --show-logs --- protest/core/collector.py | 11 +- protest/core/execution/test_executor.py | 43 +++++-- protest/core/outcome.py | 146 +++++++++--------------- protest/di/container.py | 34 ++---- protest/di/hints.py | 62 ++++++++++ protest/di/validation.py | 9 +- protest/entities/events.py | 30 ++++- protest/events/types.py | 2 + protest/execution/capture.py | 16 +++ protest/plugin.py | 6 + 10 files changed, 222 insertions(+), 137 deletions(-) create mode 100644 protest/di/hints.py diff --git a/protest/core/collector.py b/protest/core/collector.py index 74dd75d..24356a8 100644 --- a/protest/core/collector.py +++ b/protest/core/collector.py @@ -2,7 +2,7 @@ from inspect import signature from itertools import groupby, product -from typing import TYPE_CHECKING, Annotated, Any, get_args, get_origin, get_type_hints +from typing import TYPE_CHECKING, Annotated, Any, get_args, get_origin from protest.di.decorators import get_fixture_marker, unwrap_fixture from protest.di.markers import Use @@ -18,10 +18,9 @@ def _extract_use_fixtures(func: Callable[..., Any]) -> list[FixtureCallable]: """Extract fixtures referenced via Use() markers in function parameters.""" - try: - type_hints = get_type_hints(func, include_extras=True) - except Exception: - type_hints = {} + from protest.di.hints import 
get_type_hints_compat + + type_hints = get_type_hints_compat(func) fixtures: list[FixtureCallable] = [] for param_name in signature(func).parameters: @@ -164,6 +163,7 @@ def _expand_registration( xfail=test_reg.xfail, timeout=test_reg.timeout, retry=test_reg.retry, + is_eval=test_reg.is_eval, ) ] @@ -188,6 +188,7 @@ def _expand_registration( xfail=test_reg.xfail, timeout=test_reg.timeout, retry=test_reg.retry, + is_eval=test_reg.is_eval, ) ) diff --git a/protest/core/execution/test_executor.py b/protest/core/execution/test_executor.py index 8fa92a3..8b475c6 100644 --- a/protest/core/execution/test_executor.py +++ b/protest/core/execution/test_executor.py @@ -7,7 +7,7 @@ import time from contextlib import AsyncExitStack, asynccontextmanager from inspect import signature -from typing import TYPE_CHECKING, Any, get_type_hints +from typing import TYPE_CHECKING, Any from protest.core.collector import get_transitive_fixtures from protest.core.outcome import OutcomeBuilder, TestExecutionResult @@ -20,11 +20,13 @@ TestStartInfo, TestTeardownInfo, ) +from protest.entities.events import EvalPayload from protest.events.types import Event from protest.exceptions import FixtureError from protest.execution.async_bridge import ensure_async from protest.execution.capture import ( CaptureCurrentTest, + get_current_log_records, reset_current_node_id, set_current_node_id, ) @@ -112,8 +114,6 @@ async def _run_test( # noqa: PLR0912 - complex test execution flow, refactoring ) ) - start = time.perf_counter() - try: kwargs = await self._resolve_test_kwargs(item, ctx) except Exception as exc: @@ -122,13 +122,15 @@ async def _run_test( # noqa: PLR0912 - complex test execution flow, refactoring test_name=test_name, node_id=node_id, suite_path=item.suite_path, - duration=time.perf_counter() - start, + duration=0, output=buffer.getvalue(), error=exc, is_fixture_error=True, ) ) + start = time.perf_counter() + # Conditional skip (callable) - evaluated AFTER fixture resolution if item.skip and 
item.skip.is_conditional: try: @@ -162,26 +164,33 @@ async def _run_test( # noqa: PLR0912 - complex test execution flow, refactoring previous_errors: list[Exception] = [] error: Exception | None = None is_fixture_error = False + eval_payload: EvalPayload | None = None attempt = 1 # Initialized here; always overwritten by loop for attempt in range(1, max_attempts + 1): error = None is_fixture_error = False + eval_payload = None try: if item.timeout is not None: try: - await asyncio.wait_for( + return_value = await asyncio.wait_for( ensure_async(item.func, **kwargs), timeout=item.timeout, ) except asyncio.TimeoutError: - # Only wrap timeout from wait_for, not from test code raise asyncio.TimeoutError( f"Test exceeded timeout of {item.timeout}s" ) from None else: - await ensure_async(item.func, **kwargs) + return_value = await ensure_async(item.func, **kwargs) + + # For eval items: capture EvalPayload and determine pass/fail + if item.is_eval and isinstance(return_value, EvalPayload): + eval_payload = return_value + if not eval_payload.passed: + error = _build_eval_error(eval_payload) except FixtureError as exc: error = exc.original is_fixture_error = True @@ -231,6 +240,9 @@ async def _run_test( # noqa: PLR0912 - complex test execution flow, refactoring attempt=attempt, max_attempts=max_attempts, previous_errors=tuple(previous_errors), + is_eval=item.is_eval, + eval_payload=eval_payload, + log_records=tuple(get_current_log_records()), ) ) @@ -243,10 +255,9 @@ async def _resolve_test_kwargs( func_signature = signature(item.func) kwargs: dict[str, Any] = dict(item.case_kwargs) - try: - type_hints = get_type_hints(item.func, include_extras=True) - except Exception: - type_hints = {} + from protest.di.hints import get_type_hints_compat + + type_hints = get_type_hints_compat(item.func) for param_name, param in func_signature.parameters.items(): if param_name in kwargs: @@ -346,3 +357,13 @@ async def _acquire_fixture_semaphores( for _, sem in sems_sorted: await 
stack.enter_async_context(_semaphore_context(sem)) yield + + +def _build_eval_error(payload: EvalPayload) -> AssertionError: + """Build a descriptive AssertionError from failed eval scores.""" + failed = [ + f"{name}={entry.value}" + for name, entry in payload.scores.items() + if not entry.passed + ] + return AssertionError(f"{', '.join(failed)}") diff --git a/protest/core/outcome.py b/protest/core/outcome.py index b89a7bb..0018812 100644 --- a/protest/core/outcome.py +++ b/protest/core/outcome.py @@ -1,11 +1,17 @@ """Test outcome classification and building.""" +from __future__ import annotations + from dataclasses import dataclass from enum import Enum, auto +from typing import TYPE_CHECKING, Any from protest.entities import SuitePath, TestCounts, TestOutcome, TestResult from protest.events.types import Event +if TYPE_CHECKING: + from protest.entities.events import EvalPayload + class OutcomeType(Enum): """Classification of test execution outcomes.""" @@ -35,13 +41,16 @@ class TestExecutionResult: attempt: int = 1 max_attempts: int = 1 previous_errors: tuple[Exception, ...] = () + is_eval: bool = False + eval_payload: EvalPayload | None = None + log_records: tuple[Any, ...] 
= () class OutcomeBuilder: """Builds TestOutcome from test execution results.""" def build(self, exec_result: TestExecutionResult) -> TestOutcome: - """Build a TestOutcome from execution result by classifying and constructing.""" + """Build a TestOutcome from execution result.""" outcome_type = self._classify(exec_result) match outcome_type: @@ -59,7 +68,6 @@ def build(self, exec_result: TestExecutionResult) -> TestOutcome: return self._build_fail(exec_result) def _classify(self, exec_result: TestExecutionResult) -> OutcomeType: - """Classify execution result into outcome type.""" match ( exec_result.skip_reason, exec_result.error, @@ -79,91 +87,49 @@ def _classify(self, exec_result: TestExecutionResult) -> OutcomeType: case _: return OutcomeType.FAIL - def _build_skip(self, exec_result: TestExecutionResult) -> TestOutcome: - result = TestResult( - name=exec_result.test_name, - node_id=exec_result.node_id, - suite_path=exec_result.suite_path, - skip_reason=exec_result.skip_reason, - timeout=exec_result.timeout, - attempt=exec_result.attempt, - max_attempts=exec_result.max_attempts, - previous_errors=exec_result.previous_errors, - ) - return TestOutcome(result, TestCounts(skipped=1), Event.TEST_SKIP) - - def _build_pass(self, exec_result: TestExecutionResult) -> TestOutcome: - result = TestResult( - name=exec_result.test_name, - node_id=exec_result.node_id, - suite_path=exec_result.suite_path, - duration=exec_result.duration, - output=exec_result.output, - timeout=exec_result.timeout, - attempt=exec_result.attempt, - max_attempts=exec_result.max_attempts, - previous_errors=exec_result.previous_errors, - ) - return TestOutcome(result, TestCounts(passed=1), Event.TEST_PASS) - - def _build_xpass(self, exec_result: TestExecutionResult) -> TestOutcome: - result = TestResult( - name=exec_result.test_name, - node_id=exec_result.node_id, - suite_path=exec_result.suite_path, - duration=exec_result.duration, - output=exec_result.output, - 
xfail_reason=exec_result.xfail_reason, - timeout=exec_result.timeout, - attempt=exec_result.attempt, - max_attempts=exec_result.max_attempts, - previous_errors=exec_result.previous_errors, - ) - return TestOutcome(result, TestCounts(xpassed=1), Event.TEST_XPASS) - - def _build_error(self, exec_result: TestExecutionResult) -> TestOutcome: - result = TestResult( - name=exec_result.test_name, - node_id=exec_result.node_id, - suite_path=exec_result.suite_path, - error=exec_result.error, - duration=exec_result.duration, - output=exec_result.output, - is_fixture_error=True, - timeout=exec_result.timeout, - attempt=exec_result.attempt, - max_attempts=exec_result.max_attempts, - previous_errors=exec_result.previous_errors, - ) - return TestOutcome(result, TestCounts(errored=1), Event.TEST_FAIL) - - def _build_xfail(self, exec_result: TestExecutionResult) -> TestOutcome: - result = TestResult( - name=exec_result.test_name, - node_id=exec_result.node_id, - suite_path=exec_result.suite_path, - error=exec_result.error, - duration=exec_result.duration, - output=exec_result.output, - xfail_reason=exec_result.xfail_reason, - timeout=exec_result.timeout, - attempt=exec_result.attempt, - max_attempts=exec_result.max_attempts, - previous_errors=exec_result.previous_errors, - ) - return TestOutcome(result, TestCounts(xfailed=1), Event.TEST_XFAIL) - - def _build_fail(self, exec_result: TestExecutionResult) -> TestOutcome: - result = TestResult( - name=exec_result.test_name, - node_id=exec_result.node_id, - suite_path=exec_result.suite_path, - error=exec_result.error, - duration=exec_result.duration, - output=exec_result.output, - timeout=exec_result.timeout, - attempt=exec_result.attempt, - max_attempts=exec_result.max_attempts, - previous_errors=exec_result.previous_errors, - ) - return TestOutcome(result, TestCounts(failed=1), Event.TEST_FAIL) + def _base_kwargs(self, er: TestExecutionResult) -> dict[str, object]: + """Common TestResult kwargs from an execution result.""" + return { 
+ "name": er.test_name, + "node_id": er.node_id, + "suite_path": er.suite_path, + "duration": er.duration, + "output": er.output, + "timeout": er.timeout, + "attempt": er.attempt, + "max_attempts": er.max_attempts, + "previous_errors": er.previous_errors, + "is_eval": er.is_eval, + "eval_payload": er.eval_payload, + "log_records": er.log_records, + } + + def _build_skip(self, er: TestExecutionResult) -> TestOutcome: + kw = self._base_kwargs(er) + kw.update(duration=0, output="", skip_reason=er.skip_reason) + return TestOutcome(TestResult(**kw), TestCounts(skipped=1), Event.TEST_SKIP) # type: ignore[arg-type] + + def _build_pass(self, er: TestExecutionResult) -> TestOutcome: + return TestOutcome( + TestResult(**self._base_kwargs(er)), TestCounts(passed=1), Event.TEST_PASS + ) # type: ignore[arg-type] + + def _build_xpass(self, er: TestExecutionResult) -> TestOutcome: + kw = self._base_kwargs(er) + kw["xfail_reason"] = er.xfail_reason + return TestOutcome(TestResult(**kw), TestCounts(xpassed=1), Event.TEST_XPASS) # type: ignore[arg-type] + + def _build_error(self, er: TestExecutionResult) -> TestOutcome: + kw = self._base_kwargs(er) + kw.update(error=er.error, is_fixture_error=True) + return TestOutcome(TestResult(**kw), TestCounts(errored=1), Event.TEST_FAIL) # type: ignore[arg-type] + + def _build_xfail(self, er: TestExecutionResult) -> TestOutcome: + kw = self._base_kwargs(er) + kw.update(error=er.error, xfail_reason=er.xfail_reason) + return TestOutcome(TestResult(**kw), TestCounts(xfailed=1), Event.TEST_XFAIL) # type: ignore[arg-type] + + def _build_fail(self, er: TestExecutionResult) -> TestOutcome: + kw = self._base_kwargs(er) + kw["error"] = er.error + return TestOutcome(TestResult(**kw), TestCounts(failed=1), Event.TEST_FAIL) # type: ignore[arg-type] diff --git a/protest/di/container.py b/protest/di/container.py index 8ab6e49..5c38571 100644 --- a/protest/di/container.py +++ b/protest/di/container.py @@ -11,7 +11,6 @@ Any, get_args, get_origin, - 
get_type_hints, overload, ) @@ -741,8 +740,9 @@ async def _run_teardown_interruptible( """Run exit stack teardown, interruptible by cancellation event. Returns True if cancelled (should abort), False if completed normally. - Teardown runs in a thread pool so sync blocking code doesn't freeze - the event loop, allowing us to detect and respond to cancellation. + Teardown runs on the SAME event loop as fixture setup — creating a + new loop would break async resources (drivers, connections) that hold + references to the original loop. """ if interrupt_event is None: await exit_stack.__aexit__(exc_type, exc_val, exc_tb) @@ -751,23 +751,10 @@ async def _run_teardown_interruptible( if interrupt_event.is_set(): return True - # Run teardown in thread pool so sync code doesn't block event loop - loop = asyncio.get_running_loop() - - def run_sync_teardown() -> None: - # Create a new event loop for the thread to run async teardowns - new_loop = asyncio.new_event_loop() - try: - new_loop.run_until_complete( - exit_stack.__aexit__(exc_type, exc_val, exc_tb) - ) - finally: - new_loop.close() - - async def run_in_thread() -> None: - await loop.run_in_executor(None, run_sync_teardown) - - teardown_task = asyncio.create_task(run_in_thread()) + # Run teardown on the same loop, race with cancellation + teardown_task = asyncio.create_task( + exit_stack.__aexit__(exc_type, exc_val, exc_tb) + ) wait_cancel = asyncio.create_task(interrupt_event.wait()) done, _ = await asyncio.wait( @@ -793,10 +780,9 @@ def _analyze_and_store_dependencies( actual_func = unwrap_fixture(func) func_signature = signature(actual_func) - try: - type_hints = get_type_hints(actual_func, include_extras=True) - except Exception: - type_hints = {} + from protest.di.hints import get_type_hints_compat + + type_hints = get_type_hints_compat(actual_func) dependencies: dict[str, FixtureCallable] = {} for param_name, param in func_signature.parameters.items(): diff --git a/protest/di/hints.py b/protest/di/hints.py new 
file mode 100644 index 0000000..ede4c12 --- /dev/null +++ b/protest/di/hints.py @@ -0,0 +1,62 @@ +"""Type hints resolution with PEP 563 / TYPE_CHECKING compatibility. + +Shared by the core DI system and evals runner. Handles two failure modes: + +1. Local fixtures — ``from __future__ import annotations`` stringifies + annotations; names defined in local scopes aren't in ``func.__globals__``. + Fix: collect locals from the call stack. + +2. TYPE_CHECKING-only types — e.g. ``AsyncDriver`` imported only under + ``if TYPE_CHECKING:``. Fix: substitute ``Any`` for each unresolvable + name. The type itself is irrelevant for DI; only the ``Use(...)`` + marker inside ``Annotated[...]`` matters. +""" + +from __future__ import annotations + +import inspect +import re +from typing import Any, get_type_hints + + +def get_type_hints_compat(func: Any) -> dict[str, Any]: + """Resolve type hints with PEP 563 / TYPE_CHECKING fallbacks.""" + try: + return get_type_hints(func, include_extras=True) + except Exception: + pass + + # Build a namespace from the entire call stack (covers local fixtures). + localns: dict[str, Any] = {} + try: + for frame_info in inspect.stack(): + localns.update(frame_info.frame.f_locals) + except Exception: + pass + + try: + return get_type_hints(func, localns=localns, include_extras=True) + except Exception: + pass + + # TYPE_CHECKING fallback: substitute Any for unresolvable names. 
+ return _get_type_hints_substituting_any(func, localns) + + +def _get_type_hints_substituting_any( + func: Any, + localns: dict[str, Any], +) -> dict[str, Any]: + """Retry get_type_hints, replacing each NameError'd name with Any.""" + localns = dict(localns) + for _ in range(20): + try: + return get_type_hints(func, localns=localns, include_extras=True) + except NameError as exc: + match = re.search(r"name '(\w+)' is not defined", str(exc)) + if not match: + break + localns[match.group(1)] = Any + except Exception: + break + return {} diff --git a/protest/di/validation.py b/protest/di/validation.py index 2d6cd18..d716397 100644 --- a/protest/di/validation.py +++ b/protest/di/validation.py @@ -3,7 +3,7 @@ from __future__ import annotations from inspect import signature -from typing import TYPE_CHECKING, Annotated, Any, get_args, get_origin, get_type_hints +from typing import TYPE_CHECKING, Annotated, Any, get_args, get_origin from protest.di.markers import ForEach, From from protest.exceptions import ParameterizedFixtureError @@ -15,10 +15,9 @@ def _extract_from_params(func: Callable[..., Any]) -> dict[str, ForEach[Any]]: """Extract parameters annotated with From(source).""" - try: - type_hints = get_type_hints(func, include_extras=True) - except Exception: - type_hints = {} + from protest.di.hints import get_type_hints_compat + + type_hints = get_type_hints_compat(func) result: dict[str, ForEach[Any]] = {} for param_name in signature(func).parameters: diff --git a/protest/entities/events.py b/protest/entities/events.py index f87d9d9..d76434c 100644 --- a/protest/entities/events.py +++ b/protest/entities/events.py @@ -1,13 +1,36 @@ from __future__ import annotations -from dataclasses import dataclass -from typing import TYPE_CHECKING +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from protest.entities import FixtureScope, SuitePath from protest.events.types import Event +@dataclass(frozen=True, slots=True) +class 
EvalScoreEntry: + """A single score entry from an evaluator.""" + + value: float | bool | str + passed: bool = True + + +@dataclass(frozen=True, slots=True) +class EvalPayload: + """Structured payload for eval results, carried on TestResult.""" + + case_name: str + passed: bool + task_duration: float + inputs: Any = None + output: Any = None + expected_output: Any = None + scores: dict[str, EvalScoreEntry] = field(default_factory=dict) + case_hash: str = "" + eval_hash: str = "" + + @dataclass(frozen=True, slots=True) class TestCounts: passed: int = 0 @@ -43,6 +66,9 @@ class TestResult: attempt: int = 1 max_attempts: int = 1 previous_errors: tuple[Exception, ...] = () + is_eval: bool = False + eval_payload: EvalPayload | None = None + log_records: tuple[Any, ...] = () @dataclass(frozen=True, slots=True) diff --git a/protest/events/types.py b/protest/events/types.py index 8f4d1fc..05d9fa2 100644 --- a/protest/events/types.py +++ b/protest/events/types.py @@ -16,6 +16,7 @@ class Event(Enum): SUITE_SETUP_DONE = "suite_setup_done" SUITE_TEARDOWN_START = "suite_teardown_start" SUITE_END = "suite_end" + EVAL_SUITE_END = "eval_suite_end" TEST_START = "test_start" TEST_ACQUIRED = "test_acquired" TEST_SETUP_DONE = "test_setup_done" @@ -34,3 +35,4 @@ class Event(Enum): FIXTURE_TEARDOWN_START = "fixture_teardown_start" FIXTURE_TEARDOWN_DONE = "fixture_teardown_done" SESSION_INTERRUPTED = "session_interrupted" + USER_PRINT = "user_print" diff --git a/protest/execution/capture.py b/protest/execution/capture.py index d05fe00..2e258a7 100644 --- a/protest/execution/capture.py +++ b/protest/execution/capture.py @@ -19,6 +19,7 @@ ) _current_node_id: ContextVar[str | None] = ContextVar("current_node_id", default=None) +_event_bus_ref: ContextVar[object | None] = ContextVar("event_bus_ref", default=None) @dataclass(slots=True) @@ -100,6 +101,21 @@ def get_session_teardown_output() -> str: return _session_teardown.buffer.getvalue() if _session_teardown.buffer else "" +def 
set_event_bus(bus: object) -> Token[object | None]: + """Set event bus reference for console.print() access.""" + return _event_bus_ref.set(bus) + + +def reset_event_bus(token: Token[object | None]) -> None: + """Reset event bus reference.""" + _event_bus_ref.reset(token) + + +def get_event_bus() -> object | None: + """Get current event bus (for console.print).""" + return _event_bus_ref.get() + + class TaskAwareStream: def __init__(self, original_stream: TextIO, show_output: bool = False) -> None: self._original = original_stream diff --git a/protest/plugin.py b/protest/plugin.py index 6833b03..9589fff 100644 --- a/protest/plugin.py +++ b/protest/plugin.py @@ -142,6 +142,12 @@ def on_suite_teardown_start(self, path: SuitePath) -> None | Awaitable[None]: def on_suite_end(self, result: SuiteResult) -> None | Awaitable[None]: """Suite ends (after fixture teardown).""" + def on_eval_suite_end(self, report: Any) -> None | Awaitable[None]: + """Eval suite finished — aggregated report with scores/stats.""" + + def on_user_print(self, data: Any) -> None | Awaitable[None]: + """User-initiated print via protest.console.print().""" + # ───────────────────────────────────────────────────────────────────── # Fixture lifecycle # ───────────────────────────────────────────────────────────────────── From 5041457c4561f137c797a97563467db34c701ef3 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Thu, 26 Mar 2026 20:00:00 +0100 Subject: [PATCH 02/60] feat(evals): native eval system with @session.eval() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An eval is a test that returns a scored value. Uses ForEach/From for parametrization — no separate EvalSuite/EvalCase framework. 
- @session.eval(evaluators=[...]) decorator - @evaluator decorator with partial-application binding - EvalSession(model=) for eval-focused sessions - EvalContext passed to evaluators - Scoring v2: evaluators return bool or dataclass - Annotated[bool, Verdict] → pass/fail - Annotated[float, Metric] → stats aggregation - Annotated[str, Reason] → displayed on failure - EvalCase dataclass for typed ForEach data - Built-in evaluators: contains_keywords, not_empty, max_length, etc. - EvalHistoryPlugin listens to EVAL_SUITE_END - EvalResultsWriter for per-case .md files - Evaluator exception → error (not fail) --- protest/core/runner.py | 76 ++++++++++++++ protest/core/session.py | 115 +++++++++++++++++++-- protest/core/suite.py | 39 ++++++- protest/evals/__init__.py | 28 +++++ protest/evals/evaluator.py | 176 ++++++++++++++++++++++++++++++++ protest/evals/evaluators.py | 143 ++++++++++++++++++++++++++ protest/evals/hashing.py | 51 +++++++++ protest/evals/history.py | 159 +++++++++++++++++++++++++++++ protest/evals/results_writer.py | 155 ++++++++++++++++++++++++++++ protest/evals/session.py | 44 ++++++++ protest/evals/types.py | 168 ++++++++++++++++++++++++++++++ protest/evals/wrapper.py | 176 ++++++++++++++++++++++++++++++++ protest/filters/kind.py | 36 +++++++ 13 files changed, 1358 insertions(+), 8 deletions(-) create mode 100644 protest/evals/__init__.py create mode 100644 protest/evals/evaluator.py create mode 100644 protest/evals/evaluators.py create mode 100644 protest/evals/hashing.py create mode 100644 protest/evals/history.py create mode 100644 protest/evals/results_writer.py create mode 100644 protest/evals/session.py create mode 100644 protest/evals/types.py create mode 100644 protest/evals/wrapper.py create mode 100644 protest/filters/kind.py diff --git a/protest/core/runner.py b/protest/core/runner.py index 0347c2d..70669d0 100644 --- a/protest/core/runner.py +++ b/protest/core/runner.py @@ -1,7 +1,10 @@ """Test runner orchestration.""" +from __future__ 
import annotations + import asyncio import time +from typing import TYPE_CHECKING, Any from protest.core.collector import Collector from protest.core.execution import ParallelExecutor, SuiteManager, TestExecutor @@ -22,6 +25,10 @@ from protest.execution.context import cancellation_event from protest.execution.interrupt import InterruptHandler +if TYPE_CHECKING: + from protest.entities.events import TestResult + from protest.evals.types import EvalCaseResult + class TestRunner: """Executes tests with parallel support and fixture lifecycle management. @@ -36,6 +43,7 @@ def __init__(self, session: ProTestSession) -> None: self._interrupt_handler = InterruptHandler() self._interrupted = False self._force_interrupt_emitted = False + self._eval_results: dict[str, list[EvalCaseResult]] = {} # Extracted components self._suite_manager = SuiteManager(session) @@ -61,10 +69,23 @@ def run(self) -> RunResult: self._interrupt_handler.uninstall() loop.close() + def _collect_eval_result(self, result: TestResult) -> None: + """Internal handler: collect eval results from TEST_PASS/FAIL events.""" + if not result.is_eval or result.eval_payload is None: + return + suite_name = result.suite_path.root_name if result.suite_path else "evals" + case_result = _build_eval_case_result(result) + self._eval_results.setdefault(suite_name, []).append(case_result) + async def _main_loop(self) -> bool: """The main async loop for running tests.""" session_start = time.perf_counter() + # Register internal eval collector before tests run + self._eval_results.clear() + self._session.events.on(Event.TEST_PASS, self._collect_eval_result) + self._session.events.on(Event.TEST_FAIL, self._collect_eval_result) + collector = Collector() items = collector.collect(self._session) @@ -79,9 +100,12 @@ async def _main_loop(self) -> bool: total_counts = TestCounts() # Inject cancellation event into context for teardown awareness + from protest.execution.capture import reset_event_bus, set_event_bus + cancel_token = 
cancellation_event.set( self._interrupt_handler.force_teardown_event ) + bus_token = set_event_bus(self._session.events) try: with GlobalCapturePatch(show_output=not self._session.capture): async with self._session: @@ -112,6 +136,8 @@ async def _main_loop(self) -> bool: ): suite_result = self._suite_manager.build_result(suite_path) await self._session.events.emit(Event.SUITE_END, suite_result) + # Emit EVAL_SUITE_END for eval suites + await self._emit_eval_suite_end(suite_path) await self._session.events.emit(Event.SESSION_TEARDOWN_START) finally: @@ -124,6 +150,7 @@ async def _main_loop(self) -> bool: await self._session.events.emit(Event.SESSION_INTERRUPTED, True) self._force_interrupt_emitted = True cancellation_event.reset(cancel_token) + reset_event_bus(bus_token) if self._interrupt_handler.should_stop_new_tests: self._interrupted = True @@ -151,8 +178,57 @@ async def _main_loop(self) -> bool: await self._session.events.wait_pending() await self._session.events.emit(Event.SESSION_COMPLETE, session_result) + # Unregister eval collector + self._session.events.off(Event.TEST_PASS, self._collect_eval_result) + self._session.events.off(Event.TEST_FAIL, self._collect_eval_result) + return ( total_counts.failed == 0 and total_counts.errored == 0 and total_counts.xpassed == 0 ) + + async def _emit_eval_suite_end(self, suite_path: Any) -> None: + """Emit EVAL_SUITE_END if this suite_path corresponds to an eval suite.""" + from protest.evals.types import EvalSuiteReport + + suite_name = ( + suite_path.root_name + if hasattr(suite_path, "root_name") + else str(suite_path) + ) + eval_cases = self._eval_results.get(suite_name) + if not eval_cases: + return + report = EvalSuiteReport( + suite_name=suite_name, + cases=tuple(eval_cases), + duration=sum(c.duration for c in eval_cases), + ) + await self._session.events.emit(Event.EVAL_SUITE_END, report) + + +def _build_eval_case_result(result: TestResult) -> EvalCaseResult: + """Build EvalCaseResult from a TestResult with 
eval_payload.""" + from protest.evals.types import EvalCaseResult, EvalScore + + payload = result.eval_payload + assert payload is not None + return EvalCaseResult( + case_name=payload.case_name or "", + node_id=result.node_id, + scores=tuple( + EvalScore( + name=name, + value=entry.value, + ) + for name, entry in payload.scores.items() + ), + duration=payload.task_duration, + passed=not (result.error is not None or not payload.passed), + inputs=payload.inputs, + output=payload.output, + expected_output=payload.expected_output, + case_hash=payload.case_hash, + eval_hash=payload.eval_hash, + ) diff --git a/protest/core/session.py b/protest/core/session.py index 778dbb3..3224028 100644 --- a/protest/core/session.py +++ b/protest/core/session.py @@ -1,14 +1,16 @@ from __future__ import annotations -from typing import TYPE_CHECKING, TypeVar +from typing import TYPE_CHECKING, Any, TypeVar if TYPE_CHECKING: from collections.abc import Callable + from pathlib import Path from types import TracebackType from protest.compat import Self from protest.core.suite import ProTestSuite from protest.entities import FixtureCallable + from protest.evals.types import JudgeInfo, ModelInfo from protest.plugin import PluginBase, PluginContext from protest.cache.plugin import CachePlugin @@ -31,6 +33,7 @@ from protest.exceptions import InvalidMaxConcurrencyError from protest.execution.capture import set_session_teardown_capture from protest.filters.keyword import KeywordFilterPlugin +from protest.filters.kind import KindFilterPlugin from protest.filters.suite import SuiteFilterPlugin from protest.reporting.ascii import AsciiReporter from protest.reporting.ctrf import CTRFReporter @@ -54,7 +57,13 @@ class ProTestSession: concurrency: Number of parallel test workers (default: 1). 
""" - def __init__(self, concurrency: int = 1) -> None: + def __init__( + self, + concurrency: int = 1, + history: bool = False, + history_dir: Path | None = None, + metadata: dict[str, Any] | None = None, + ) -> None: if concurrency < 1: raise InvalidMaxConcurrencyError(concurrency) @@ -72,6 +81,11 @@ def __init__(self, concurrency: int = 1) -> None: self._capture: bool = True self._setup_duration: float = 0 self._teardown_duration: float = 0 + self._history = history + self._history_dir = history_dir + self._metadata: dict[str, Any] = dict(metadata) if metadata else {} + self._eval_model: ModelInfo | None = None # set by EvalSession + self._eval_judge: JudgeInfo | None = None # set by EvalSession async def resolve_autouse(self) -> None: """Resolve all session autouse fixtures at session start.""" @@ -104,6 +118,18 @@ def capture(self) -> bool: def capture(self, value: bool) -> None: self._capture = value + @property + def history(self) -> bool: + return self._history + + @property + def history_dir(self) -> Path | None: + return self._history_dir + + @property + def metadata(self) -> dict[str, Any]: + return self._metadata + @property def setup_duration(self) -> float: """Duration of session setup (available after resolve_autouse).""" @@ -151,6 +177,7 @@ def test( skip_reason: str = "Skipped", xfail: bool | str | Xfail | None = None, retry: int | Retry | None = None, + is_eval: bool = False, ) -> Callable[[FuncT], FuncT]: def decorator(func: FuncT) -> FuncT: if timeout is not None and timeout < 0: @@ -168,21 +195,64 @@ def decorator(func: FuncT) -> FuncT: xfail=norm_xfail, timeout=timeout, retry=norm_retry, + is_eval=is_eval, ) ) return func return decorator + def eval( + self, + evaluators: list[Any] | None = None, + expected_key: str = "expected", + tags: list[str] | None = None, + timeout: float | None = None, + name: str | None = None, + model: Any = None, + ) -> Callable[[FuncT], FuncT]: + """Register a scored eval test. 
+ + Creates an implicit eval suite named after the function. + The decorated function's return value is passed to evaluators. + Use with ForEach/From for parametrization:: + + @session.eval(evaluators=[my_scorer], model=ModelInfo(name="qwen")) + async def my_eval(case: Annotated[dict, From(cases)]) -> str: + return await run(case["q"]) + """ + from protest.core.suite import ProTestSuite + from protest.evals.wrapper import make_eval_wrapper + + def decorator(func: FuncT) -> FuncT: + suite_name = name or func.__name__ + suite_meta: dict[str, Any] = {} + resolved_model = model or getattr(self, "_eval_model", None) + if resolved_model: + suite_meta["model"] = resolved_model.name + suite_meta["provider"] = getattr(resolved_model, "provider", None) + suite = ProTestSuite( + name=suite_name, + tags=list(tags or []), + kind="eval", + metadata=suite_meta, + ) + wrapper = make_eval_wrapper( + func, + evaluators or [], + expected_key, + ) + suite.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) + self.add_suite(suite) + return func + + return decorator + def add_suite(self, suite: ProTestSuite) -> None: """Add a suite to this session.""" suite._attach_to_session(self) self._suites.append(suite) - def include_suite(self, suite: ProTestSuite) -> None: - """Alias for add_suite (backward compatibility).""" - self.add_suite(suite) - def bind( self, fn: FixtureCallable, @@ -246,6 +316,7 @@ def default_plugin_classes() -> list[type[PluginBase]]: TagFilterPlugin, SuiteFilterPlugin, KeywordFilterPlugin, + KindFilterPlugin, RichReporter, AsciiReporter, CTRFReporter, @@ -256,6 +327,10 @@ def register_default_plugins(self) -> None: """Register all standard ProTest plugins for CLI discovery.""" for plugin_class in self.default_plugin_classes(): self.use(plugin_class) + if self._history: + from protest.history.plugin import HistoryPlugin + + self.register_plugin(HistoryPlugin(history_dir=self._history_dir)) @property def plugin_classes(self) -> list[type[PluginBase]]: @@ -294,6 
+369,34 @@ def activate_plugins(self, ctx: PluginContext) -> None: if instance is not None: self.register_plugin(instance) + # Auto-wire eval support if any suite has kind="eval" + if any(s.kind == "eval" for s in self._suites): + self._wire_eval_support() + + def _wire_eval_support(self) -> None: + """Wire eval history + results writer plugins (no EvalPlugin).""" + from protest.evals.history import EvalHistoryPlugin + from protest.evals.results_writer import EvalResultsWriter + + judge_dict = None + if self._eval_judge: + judge_dict = { + "name": self._eval_judge.name, + "provider": getattr(self._eval_judge, "provider", None), + "evaluators": list(getattr(self._eval_judge, "evaluators", ())), + } + + history = EvalHistoryPlugin( + history_dir=self._history_dir, + model=self._eval_model, + judge=judge_dict, + metadata=self._metadata, + ) + self.register_plugin(history) + + writer = EvalResultsWriter(history_dir=self._history_dir) + self.register_plugin(writer) + async def __aenter__(self) -> Self: self._register_fixtures() await self._resolver.__aenter__() diff --git a/protest/core/suite.py b/protest/core/suite.py index 1176842..dfb64c3 100644 --- a/protest/core/suite.py +++ b/protest/core/suite.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, TypeVar +from typing import TYPE_CHECKING, Any, TypeVar from protest.di.decorators import unwrap_fixture @@ -42,18 +42,22 @@ class ProTestSuite: description: Optional description for documentation purposes. 
""" - def __init__( + def __init__( # noqa: PLR0913 self, name: str, max_concurrency: int | None = None, tags: list[str] | None = None, description: str | None = None, + kind: str = "test", + metadata: dict[str, Any] | None = None, ) -> None: if max_concurrency is not None and max_concurrency < 1: raise InvalidMaxConcurrencyError(max_concurrency) self._name = name + self._kind = kind self._description = description + self._metadata: dict[str, Any] = dict(metadata) if metadata else {} self._session: ProTestSession | None = None self._parent_suite: ProTestSuite | None = None self._tests: list[TestRegistration] = [] @@ -70,6 +74,14 @@ def name(self) -> str: def description(self) -> str | None: return self._description + @property + def kind(self) -> str: + return self._kind + + @property + def suite_metadata(self) -> dict[str, Any]: + return self._metadata + @property def full_path(self) -> SuitePath: """Return hierarchical path: Parent::Child::GrandChild.""" @@ -122,6 +134,7 @@ def test( # noqa: PLR0913 - test decorator requires flexible params skip_reason: str = "Skipped", xfail: bool | str | Xfail | None = None, retry: int | Retry | None = None, + is_eval: bool = False, ) -> Callable[[FuncT], FuncT]: def decorator(func: FuncT) -> FuncT: if timeout is not None and timeout < 0: @@ -139,12 +152,34 @@ def decorator(func: FuncT) -> FuncT: xfail=norm_xfail, timeout=timeout, retry=norm_retry, + is_eval=is_eval, ) ) return func return decorator + def eval( + self, + evaluators: list[Any] | None = None, + expected_key: str = "expected", + tags: list[str] | None = None, + timeout: float | None = None, + ) -> Callable[[FuncT], FuncT]: + """Register a scored eval test on this suite.""" + from protest.evals.wrapper import make_eval_wrapper + + def decorator(func: FuncT) -> FuncT: + wrapper = make_eval_wrapper( + func, + evaluators or [], + expected_key, + ) + self.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) + return func + + return decorator + def add_suite(self, 
suite: ProTestSuite) -> None: """Add a child suite. Child can access parent's fixtures.""" parent_effective = self.effective_max_concurrency diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py new file mode 100644 index 0000000..17b35c9 --- /dev/null +++ b/protest/evals/__init__.py @@ -0,0 +1,28 @@ +"""ProTest evals — native eval support.""" + +from protest.evals.evaluator import EvalCase, EvalContext, Metric, Reason, Verdict, evaluator +from protest.evals.session import EvalSession +from protest.evals.types import ( + EvalCaseResult, + EvalScore, + EvalSuiteReport, + JudgeInfo, + ModelInfo, + ScoreStats, +) + +__all__ = [ + "EvalCase", + "EvalCaseResult", + "EvalContext", + "Metric", + "EvalScore", + "EvalSession", + "EvalSuiteReport", + "JudgeInfo", + "ModelInfo", + "Reason", + "ScoreStats", + "Verdict", + "evaluator", +] diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py new file mode 100644 index 0000000..336df8d --- /dev/null +++ b/protest/evals/evaluator.py @@ -0,0 +1,176 @@ +"""Evaluator primitives — functions, not classes. + +An evaluator is a callable that receives an EvalContext and returns a score. +The @evaluator decorator adds partial-application ergonomics: + + @evaluator + def contains_keywords(ctx: EvalContext, keywords: list[str]) -> ContainsKeywordsResult: + found = sum(1 for k in keywords if k.lower() in ctx.output.lower()) + return ContainsKeywordsResult(keyword_recall=found / len(keywords), ...) + + # Bind params → returns a callable(ctx) via functools.partial + evaluators=[contains_keywords(keywords=["paris", "france"])] + + # No params → use directly + @evaluator + def not_empty(ctx: EvalContext) -> bool: + return bool(ctx.output.strip()) + +Async evaluators are supported: + + @evaluator + async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: + ... + +Evaluators return either bool (simple verdict) or a dataclass (structured result). 
+The framework reads fields by type: +- bool → verdict (pass/fail = all(bool_fields)) +- float → metric (aggregated in stats) +- str → reason (displayed on failure) +""" + +from __future__ import annotations + +import asyncio +import dataclasses +import functools +import inspect +from dataclasses import dataclass, field +from typing import Any, Generic, TypeVar + +I = TypeVar("I") +O = TypeVar("O") + + +@dataclass +class EvalContext(Generic[I, O]): + """Context passed to evaluator functions.""" + + name: str + inputs: I + output: O + expected_output: O | None + metadata: Any + duration: float + + +@dataclass +class EvalCase: + """Typed container for eval case data in ForEach. + + Usage:: + + cases = ForEach([ + EvalCase(inputs="Who is Marie?", expected="Marie, Resistance", name="lookup"), + EvalCase(inputs="Who is Pierre?", expected="Pierre, arrest"), + ]) + + @session.eval(evaluators=[contains_facts]) + def my_eval(case: Annotated[EvalCase, From(cases)]) -> str: + return ask(case.inputs) + """ + + inputs: Any + expected: Any = None + name: str = "" + evaluators: list[Any] = field(default_factory=list) + metadata: dict[str, Any] = field(default_factory=dict) + + def __repr__(self) -> str: + return self.name or f"EvalCase({self.inputs!r})" + + +class Metric: + """Annotate a float/int field as a metric for stats aggregation.""" + + +class Verdict: + """Annotate a bool field as a verdict for pass/fail.""" + + +class Reason: + """Annotate a str field as a reason displayed on failure.""" + + +def extract_scores_from_result(result: Any, evaluator_name: str) -> list[Any]: + """Extract EvalScore instances from an evaluator result. + + For bool returns: a single verdict named after the evaluator. + For dataclass returns: only fields annotated with Metric/Verdict/Reason + are extracted. Unannotated fields are ignored (free metadata). + + Raises: + TypeError: If result is not bool or dataclass. 
+ """ + from typing import Annotated, get_args, get_origin, get_type_hints + + from protest.evals.types import EvalScore + + if isinstance(result, bool): + return [EvalScore(name=evaluator_name, value=result)] + + if dataclasses.is_dataclass(result) and not isinstance(result, type): + scores = [] + hints = get_type_hints(type(result), include_extras=True) + for f in dataclasses.fields(result): + ann = hints.get(f.name) + if ann is None or get_origin(ann) is not Annotated: + continue + for meta in get_args(ann)[1:]: + if isinstance(meta, type) and issubclass(meta, (Metric, Verdict, Reason)): + scores.append(EvalScore(name=f.name, value=getattr(result, f.name))) + break + return scores + + type_name = type(result).__name__ + raise TypeError( + f"Evaluator must return bool or dataclass, got {type_name}" + ) + + +def evaluator(fn: Any) -> Any: + """Decorator that turns a function into a protest evaluator. + + The decorated function can be called two ways: + + 1. ``evaluator_fn(ctx)`` — evaluate directly + 2. ``evaluator_fn(keyword=value, ...)`` — returns a bound evaluator (partial) + + This is just ``functools.partial`` with nicer ergonomics: when the first + positional argument is an ``EvalContext``, the function evaluates. Otherwise, + all arguments are bound and the result is a new callable expecting only ``ctx``. 
+ """ + sig = inspect.signature(fn) + params = list(sig.parameters.values()) + has_extra_params = len(params) > 1 + + @functools.wraps(fn) + def wrapper(*args: Any, **kwargs: Any) -> Any: + # Direct call: first positional arg is an EvalContext + if args and isinstance(args[0], EvalContext): + return fn(*args, **kwargs) + # Bind params → return partial + if has_extra_params and kwargs: + bound = functools.partial(fn, **kwargs) + # Preserve async detection on the partial + bound._is_async_evaluator = asyncio.iscoroutinefunction(fn) # type: ignore[attr-defined] + bound.__name__ = fn.__name__ # type: ignore[attr-defined] + bound.__qualname__ = fn.__qualname__ # type: ignore[attr-defined] + return bound + # No args at all — if no extra params, this IS the evaluator + if not has_extra_params and not args and not kwargs: + return fn + return fn(*args, **kwargs) + + wrapper._is_evaluator = True # type: ignore[attr-defined] + wrapper._is_async_evaluator = asyncio.iscoroutinefunction(fn) # type: ignore[attr-defined] + return wrapper + + +def is_async_evaluator(fn: Any) -> bool: + """Check if an evaluator (or partial thereof) is async.""" + if hasattr(fn, "_is_async_evaluator"): + return fn._is_async_evaluator + if isinstance(fn, functools.partial): + return asyncio.iscoroutinefunction(fn.func) + return asyncio.iscoroutinefunction(fn) diff --git a/protest/evals/evaluators.py b/protest/evals/evaluators.py new file mode 100644 index 0000000..b9b1475 --- /dev/null +++ b/protest/evals/evaluators.py @@ -0,0 +1,143 @@ +"""Built-in evaluators for common eval patterns. + +Evaluators return either bool (simple verdict) or a dataclass with +annotated fields: Annotated[bool, Verdict], Annotated[float, Metric], +Annotated[str, Reason]. Unannotated fields are ignored by the runner. 
+""" + +from __future__ import annotations + +import json as json_module +import re +from dataclasses import dataclass +from typing import Annotated + +from protest.evals.evaluator import EvalContext, Metric, Verdict, evaluator + + +@dataclass(frozen=True, slots=True) +class ContainsKeywordsResult: + keyword_recall: Annotated[float, Metric] + all_keywords_present: Annotated[bool, Verdict] + + +@dataclass(frozen=True, slots=True) +class DoesNotContainResult: + no_forbidden_words: Annotated[bool, Verdict] + + +@dataclass(frozen=True, slots=True) +class MaxLengthResult: + conciseness: Annotated[float, Metric] + within_limit: Annotated[bool, Verdict] + + +@dataclass(frozen=True, slots=True) +class JsonValidResult: + valid_json: Annotated[bool, Verdict] + has_required_keys: Annotated[bool, Verdict] + + +@dataclass(frozen=True, slots=True) +class WordOverlapResult: + overlap: Annotated[float, Metric] + + +@evaluator +def contains_keywords(ctx: EvalContext, keywords: list[str], min_recall: float = 0.0) -> ContainsKeywordsResult: + """Check that the output contains expected keywords (case-insensitive).""" + output_lower = ctx.output.lower() + found = sum(1 for kw in keywords if kw.lower() in output_lower) + total = len(keywords) + recall = found / total if total else 1.0 + return ContainsKeywordsResult( + keyword_recall=recall, + all_keywords_present=recall >= min_recall if min_recall > 0 else found == total, + ) + + +@evaluator +def contains_expected(ctx: EvalContext, case_sensitive: bool = False) -> bool: + """Check that the output contains expected_output as a substring.""" + if ctx.expected_output is None: + return True + if case_sensitive: + return ctx.expected_output in ctx.output + return ctx.expected_output.lower() in ctx.output.lower() + + +@evaluator +def does_not_contain( + ctx: EvalContext, forbidden: list[str], case_sensitive: bool = False +) -> DoesNotContainResult: + """Check that the output does not contain forbidden words.""" + output = ctx.output if 
case_sensitive else ctx.output.lower() + found = [w for w in forbidden if (w if case_sensitive else w.lower()) in output] + return DoesNotContainResult(no_forbidden_words=len(found) == 0) + + +@evaluator +def not_empty(ctx: EvalContext) -> bool: + """Check that the output is not empty or whitespace-only.""" + if ctx.output is None: + return False + if isinstance(ctx.output, str): + return len(ctx.output.strip()) > 0 + return True + + +@evaluator +def max_length(ctx: EvalContext, max_chars: int = 500) -> MaxLengthResult: + """Check that the output doesn't exceed a character limit.""" + length = len(ctx.output) + return MaxLengthResult( + conciseness=min(1.0, max_chars / max(length, 1)), + within_limit=length <= max_chars, + ) + + +@evaluator +def min_length(ctx: EvalContext, min_chars: int = 1) -> bool: + """Check that the output meets a minimum length.""" + return len(ctx.output) >= min_chars + + +@evaluator +def matches_regex(ctx: EvalContext, pattern: str, flags: int = 0) -> bool: + """Check that the output matches a regex pattern.""" + return bool(re.search(pattern, ctx.output, flags)) + + +@evaluator +def json_valid( + ctx: EvalContext, required_keys: list[str] | None = None +) -> JsonValidResult: + """Check that the output is valid JSON, optionally with required keys.""" + if required_keys is None: + required_keys = [] + try: + parsed = json_module.loads(ctx.output) + except (json_module.JSONDecodeError, TypeError): + return JsonValidResult(valid_json=False, has_required_keys=False) + + has_keys = ( + all(k in parsed for k in required_keys) + if required_keys and isinstance(parsed, dict) + else True + ) + return JsonValidResult(valid_json=True, has_required_keys=has_keys) + + +@evaluator +def word_overlap(ctx: EvalContext) -> WordOverlapResult: + """Compute word overlap between output and expected_output (tracking-only).""" + if ctx.expected_output is None: + return WordOverlapResult(overlap=1.0) + expected = str(ctx.expected_output) + expected_words = 
set(expected.lower().split()) + output_words = set(ctx.output.lower().split()) + if not expected_words: + return WordOverlapResult(overlap=1.0) + return WordOverlapResult( + overlap=len(expected_words & output_words) / len(expected_words), + ) diff --git a/protest/evals/hashing.py b/protest/evals/hashing.py new file mode 100644 index 0000000..0f0f5e9 --- /dev/null +++ b/protest/evals/hashing.py @@ -0,0 +1,51 @@ +"""Content hashing for eval cases — detect when cases or scoring change.""" + +from __future__ import annotations + +import dataclasses +import hashlib +import json +from typing import Any + +HASH_LENGTH = 12 + + +def compute_case_hash(inputs: Any, expected_output: Any) -> str: + """Hash the case content (inputs + expected_output).""" + data = {"inputs": _canonical(inputs), "expected": _canonical(expected_output)} + return _hash(data) + + +def compute_eval_hash( + evaluators: list[Any], +) -> str: + """Hash the scoring config (evaluators only).""" + data = { + "evaluators": [_canonical(e) for e in evaluators], + } + return _hash(data) + + +def _hash(data: Any) -> str: + raw = json.dumps(data, sort_keys=True, default=str) + return hashlib.sha256(raw.encode()).hexdigest()[:HASH_LENGTH] + + +def _canonical(obj: Any) -> Any: + """Convert an object to a canonical JSON-serializable form.""" + if obj is None or isinstance(obj, (bool, int, float, str)): + return obj + if isinstance(obj, (list, tuple)): + return [_canonical(item) for item in obj] + if isinstance(obj, dict): + return {str(k): _canonical(v) for k, v in sorted(obj.items())} + # Pydantic models + if hasattr(obj, "model_dump"): + return _canonical(obj.model_dump(mode="json")) + # Dataclasses — iterate without deepcopy to support non-picklable fields + if dataclasses.is_dataclass(obj) and not isinstance(obj, type): + return { + f.name: _canonical(getattr(obj, f.name)) for f in dataclasses.fields(obj) + } + # Fallback + return repr(obj) diff --git a/protest/evals/history.py b/protest/evals/history.py new 
file mode 100644 index 0000000..f7f2544 --- /dev/null +++ b/protest/evals/history.py @@ -0,0 +1,159 @@ +"""EvalHistoryPlugin — persists eval run results as JSONL with model/scores.""" + +from __future__ import annotations + +import uuid +from datetime import datetime, timezone +from typing import TYPE_CHECKING, Any + +from protest.history.collector import collect_env_info, collect_git_info +from protest.history.storage import DEFAULT_HISTORY_DIR, HISTORY_FILE, append_entry +from protest.plugin import PluginBase + +if TYPE_CHECKING: + from pathlib import Path + + from protest.evals.types import EvalCaseResult, EvalSuiteReport, ModelInfo + from protest.plugin import PluginContext + + +class EvalHistoryPlugin(PluginBase): + """Persists eval results to JSONL with model/judge/scores metadata. + + Listens to EVAL_SUITE_END events (emitted by the core runner). + """ + + name = "eval-history" + description = "Eval history tracking" + + def __init__( + self, + *, + history_dir: Path | None = None, + model: ModelInfo | None = None, + judge: dict[str, Any] | None = None, + metadata: dict[str, Any] | None = None, + ) -> None: + self._history_dir = history_dir or DEFAULT_HISTORY_DIR + self._history_file = self._history_dir / HISTORY_FILE + self._model = model + self._judge = judge + self._metadata = dict(metadata) if metadata else {} + self._reports: dict[str, EvalSuiteReport] = {} + + _suite_metadata: dict[str, dict[str, Any]] + + @classmethod + def activate(cls, ctx: PluginContext) -> EvalHistoryPlugin | None: + return None # Wired explicitly by session + + def setup(self, session: Any) -> None: + """Collect per-suite metadata from session.""" + self._suite_metadata = {} + for suite in getattr(session, "suites", []): + if getattr(suite, "kind", "test") == "eval": + self._suite_metadata[suite.name] = getattr(suite, "suite_metadata", {}) + + def on_eval_suite_end(self, report: EvalSuiteReport) -> None: + """Collect suite reports as they arrive.""" + 
self._reports[report.suite_name] = report + + def on_session_end(self, _result: Any) -> None: + """Write all collected reports to history.""" + if not self._reports: + return + entry = _build_entry( + self._reports, + self._model, + self._judge, + self._metadata, + self._suite_metadata, + ) + append_entry(self._history_file, entry) + + def load_entries(self, n: int | None = None) -> list[dict[str, Any]]: + """Load entries from history file.""" + from protest.history.storage import load_history + + return load_history(history_dir=self._history_dir, n=n, evals_only=True) + + +def _build_entry( + reports: dict[str, EvalSuiteReport], + model: ModelInfo | None, + judge: dict[str, Any] | None, + metadata: dict[str, Any] | None = None, + all_suite_metadata: dict[str, dict[str, Any]] | None = None, +) -> dict[str, Any]: + """Build a complete history entry covering all suites in the session.""" + suites_data: dict[str, Any] = {} + all_score_stats: list[Any] = [] + + for suite_name, report in reports.items(): + sm = (all_suite_metadata or {}).get(suite_name, {}) + suite_model = sm.get("model") or (model.name if model else None) + suite_provider = sm.get("provider") or (model.provider if model else None) + suites_data[suite_name] = { + "kind": "eval", + "model": suite_model, + "provider": suite_provider, + "total_cases": report.total_count, + "passed": report.passed_count, + "failed": report.failed_count, + "pass_rate": round(report.pass_rate, 4), + "duration": round(report.duration, 2), + "cases": {c.case_name: _serialize_case(c) for c in report.cases}, + } + all_score_stats.extend(report.all_score_stats()) + + scores_summary = { + s.name: { + "mean": round(s.mean, 4), + "median": round(s.median, 4), + "p5": round(s.p5, 4), + "p95": round(s.p95, 4), + "min": round(s.min, 4), + "max": round(s.max, 4), + "count": s.count, + } + for s in all_score_stats + } + + return { + "run_id": str(uuid.uuid4()), + "timestamp": datetime.now(tz=timezone.utc).isoformat(), + "git": 
collect_git_info(), + "environment": collect_env_info(), + "metadata": dict(metadata) if metadata else {}, + "evals": { + "model": model.name if model else None, + "provider": model.provider if model else None, + "judge": judge, + "scores_summary": scores_summary, + }, + "suites": suites_data, + } + + +def _serialize_case(case: EvalCaseResult) -> dict[str, Any]: + entry: dict[str, Any] = { + "passed": case.passed, + "duration": round(case.duration, 3), + "scores": {s.name: s.value for s in case.scores if s.is_metric}, + "case_hash": case.case_hash, + "eval_hash": case.eval_hash, + } + labels = {s.name: s.value for s in case.scores if isinstance(s.value, str)} + if labels: + entry["labels"] = labels + assertions = {s.name: s.value for s in case.scores if isinstance(s.value, bool)} + if assertions: + entry["assertions"] = assertions + return entry + + +def load_previous_run(history_dir: Any = None) -> dict[str, Any] | None: + """Load the most recent eval run from history.""" + from protest.history.storage import load_previous_run as _load + + return _load(history_dir=history_dir, evals_only=True) diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py new file mode 100644 index 0000000..0054e25 --- /dev/null +++ b/protest/evals/results_writer.py @@ -0,0 +1,155 @@ +"""EvalResultsWriter — writes per-case eval results as markdown files. + +Listens to TEST_PASS/FAIL events, filters for eval cases, and writes +a markdown file for each case to .protest/results/_/. 
+""" + +from __future__ import annotations + +import re +from datetime import datetime, timezone +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from protest.plugin import PluginBase + +if TYPE_CHECKING: + from protest.entities.events import TestResult + from protest.evals.types import EvalCaseResult, EvalScore + from protest.plugin import PluginContext + +DEFAULT_RESULTS_DIR = Path(".protest") / "results" + + +class EvalResultsWriter(PluginBase): + """Writes per-case eval result files as markdown.""" + + name = "eval-results-writer" + description = "Write eval case result files" + + def __init__(self, history_dir: Path | None = None) -> None: + self._results_base = ( + (history_dir / "results") if history_dir else DEFAULT_RESULTS_DIR + ) + self._run_dirs: dict[str, Path] = {} + + @classmethod + def activate(cls, ctx: PluginContext) -> EvalResultsWriter | None: + return None # Wired explicitly by session + + def on_test_pass(self, result: TestResult) -> None: + self._maybe_write(result, passed=True) + + def on_test_fail(self, result: TestResult) -> None: + self._maybe_write(result, passed=False) + + def _maybe_write(self, result: TestResult, *, passed: bool) -> None: + if not result.is_eval or result.eval_payload is None: + return + suite_name = result.suite_path.root_name if result.suite_path else "evals" + case_result = _build_case_result(result, passed) + self._write_case_file(case_result, suite_name) + + def _write_case_file(self, case_result: EvalCaseResult, suite_name: str) -> None: + if suite_name not in self._run_dirs: + self._run_dirs[suite_name] = _make_run_dir(suite_name, self._results_base) + _write_case_file(case_result, self._run_dirs[suite_name]) + + def on_eval_suite_end(self, report: Any) -> None: + """Print results dir path for the suite.""" + from protest.evals.types import EvalSuiteReport + + if not isinstance(report, EvalSuiteReport): + return + run_dir = self._run_dirs.get(report.suite_name) + if run_dir: + print(f" 
Results: {run_dir}") + + +def _build_case_result(result: TestResult, passed: bool) -> EvalCaseResult: + """Build EvalCaseResult from a TestResult with eval_payload.""" + from protest.evals.types import EvalCaseResult, EvalScore + + payload = result.eval_payload + assert payload is not None + return EvalCaseResult( + case_name=payload.case_name or "", + node_id=result.node_id, + scores=tuple( + EvalScore( + name=name, + value=entry.value, + ) + for name, entry in payload.scores.items() + ), + duration=payload.task_duration, + passed=passed, + inputs=payload.inputs, + output=payload.output, + expected_output=payload.expected_output, + case_hash=payload.case_hash, + eval_hash=payload.eval_hash, + ) + + +# --------------------------------------------------------------------------- +# File writing helpers +# --------------------------------------------------------------------------- + + +def _make_run_dir(suite_name: str, base_dir: Path | None = None) -> Path: + """Create and return the timestamped directory for this run.""" + base = base_dir or DEFAULT_RESULTS_DIR + ts = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") + safe_suite = re.sub(r"[^\w\-]", "_", suite_name) + run_dir = base / f"{safe_suite}_{ts}" + run_dir.mkdir(parents=True, exist_ok=True) + return run_dir + + +def _write_case_file(case: EvalCaseResult, run_dir: Path) -> None: + """Write a markdown file for a single eval case.""" + safe_name = re.sub(r"[^\w\-]", "_", case.case_name) + path = run_dir / f"{safe_name}.md" + path.write_text(_render_case(case), encoding="utf-8") + + +def _render_case(case: EvalCaseResult) -> str: + status = "PASS ✓" if case.passed else "FAIL ✗" + duration = ( + f"{case.duration * 1000:.0f}ms" + if case.duration < 1 + else f"{case.duration:.2f}s" + ) + lines: list[str] = [ + f"# {case.case_name} — {status} ({duration})", + "", + ] + + lines += ["## Input", "", _format_value(case.inputs), ""] + lines += ["## Output", "", _format_value(case.output), ""] + lines += ["## 
Expected", "", _format_value(case.expected_output), ""]
+
+    if case.scores:
+        lines += ["## Scores", ""]
+        for score in case.scores:
+            lines.append(_format_score(score))
+        lines.append("")
+
+    return "\n".join(lines)
+
+
+def _format_score(score: EvalScore) -> str:
+    if score.is_metric:
+        icon = "·"
+    else:
+        icon = "✓" if score.passed else "✗"
+    return f"- **{score.name}**: {score.value} {icon}"
+
+
+def _format_value(value: Any) -> str:
+    if value is None:
+        return "_none_"
+    if isinstance(value, str):
+        return value if value.strip() else "_empty string_"
+    return f"```\n{value!r}\n```"
diff --git a/protest/evals/session.py b/protest/evals/session.py
new file mode 100644
index 0000000..82bea35
--- /dev/null
+++ b/protest/evals/session.py
@@ -0,0 +1,44 @@
+"""EvalSession — a session dedicated to evals."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from protest.core.session import ProTestSession
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from protest.evals.types import JudgeInfo, ModelInfo
+
+
+class EvalSession(ProTestSession):
+    """Session dedicated to evals.
+ + Usage:: + + session = EvalSession(model=ModelInfo(name="qwen-2.5")) + + @session.eval(evaluators=[contains_facts]) + async def chatbot(case: Annotated[dict, From(cases)]) -> str: + return await ask(case["q"]) + """ + + def __init__( + self, + *, + model: ModelInfo | None = None, + judge: JudgeInfo | None = None, + concurrency: int = 1, + history: bool = True, + history_dir: Path | None = None, + metadata: dict[str, Any] | None = None, + ) -> None: + super().__init__( + concurrency=concurrency, + history=history, + history_dir=history_dir, + metadata=metadata, + ) + self._eval_model = model + self._eval_judge = judge diff --git a/protest/evals/types.py b/protest/evals/types.py new file mode 100644 index 0000000..24082f1 --- /dev/null +++ b/protest/evals/types.py @@ -0,0 +1,168 @@ +"""Types for eval results, scores, and run context.""" + +from __future__ import annotations + +import statistics +from dataclasses import dataclass, field +from typing import Any + + +@dataclass(frozen=True, slots=True) +class ModelInfo: + """Metadata about the model being evaluated.""" + + name: str + provider: str | None = None + temperature: float | None = None + extra: dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_agent(cls, agent: Any) -> ModelInfo: + """Extract model info from a pydantic-ai Agent (duck-typed).""" + model = getattr(agent, "model", None) + if model is None: + msg = "Agent has no model configured" + raise ValueError(msg) + if isinstance(model, str): + return cls(name=model) + model_name = getattr(model, "model_name", None) + if callable(model_name): + return cls(name=str(model_name())) + return cls(name=str(getattr(model, "name", None) or model)) + + +@dataclass(frozen=True, slots=True) +class JudgeInfo: + """Metadata about the LLM judge used for evaluation.""" + + name: str + provider: str | None = None + evaluators: tuple[str, ...] 
= () + extra: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True, slots=True) +class EvalScore: + """A single named value from an evaluator result. + + Values are categorized by type: + - bool → verdict (pass/fail) + - float → metric (aggregated in stats) + - str → reason (displayed on failure) + """ + + name: str + value: float | bool | str + + @property + def is_verdict(self) -> bool: + return isinstance(self.value, bool) + + @property + def is_metric(self) -> bool: + return isinstance(self.value, (int, float)) and not isinstance(self.value, bool) + + @property + def is_reason(self) -> bool: + return isinstance(self.value, str) + + @property + def passed(self) -> bool: + if isinstance(self.value, bool): + return self.value + return True # metrics and reasons always "pass" + + +@dataclass(frozen=True, slots=True) +class EvalCaseResult: + """Complete result of evaluating a single case.""" + + case_name: str + node_id: str + scores: tuple[EvalScore, ...] + duration: float + passed: bool + inputs: Any = None + output: Any = None + expected_output: Any = None + case_hash: str = "" + eval_hash: str = "" + + @property + def numeric_scores(self) -> dict[str, float]: + return {s.name: float(s.value) for s in self.scores if s.is_metric} + + @property + def failed_scores(self) -> tuple[EvalScore, ...]: + return tuple(s for s in self.scores if not s.passed) + + +@dataclass(frozen=True, slots=True) +class ScoreStats: + """Aggregated statistics for a named score across cases.""" + + name: str + mean: float + median: float + p5: float + p95: float + min: float + max: float + count: int + + @classmethod + def from_values(cls, name: str, values: list[float]) -> ScoreStats: + if not values: + return cls(name=name, mean=0, median=0, p5=0, p95=0, min=0, max=0, count=0) + sv = sorted(values) + n = len(sv) + return cls( + name=name, + mean=statistics.mean(sv), + median=statistics.median(sv), + p5=sv[max(0, int(n * 0.05))], + p95=sv[min(n - 1, int(n * 0.95))], + 
min=sv[0], + max=sv[-1], + count=n, + ) + + +@dataclass(frozen=True, slots=True) +class EvalSuiteReport: + """Aggregated report for a suite of eval cases.""" + + suite_name: str + cases: tuple[EvalCaseResult, ...] + duration: float + + @property + def passed_count(self) -> int: + return sum(1 for c in self.cases if c.passed) + + @property + def failed_count(self) -> int: + return sum(1 for c in self.cases if not c.passed) + + @property + def total_count(self) -> int: + return len(self.cases) + + @property + def pass_rate(self) -> float: + return self.passed_count / self.total_count if self.cases else 0.0 + + def score_names(self) -> set[str]: + return {s.name for c in self.cases for s in c.scores if s.is_metric} + + def score_stats(self, name: str) -> ScoreStats: + values = [ + float(s.value) + for c in self.cases + for s in c.scores + if s.name == name and s.is_metric + ] + return ScoreStats.from_values(name, values) + + def all_score_stats(self) -> list[ScoreStats]: + return [self.score_stats(n) for n in sorted(self.score_names())] diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py new file mode 100644 index 0000000..c9087b6 --- /dev/null +++ b/protest/evals/wrapper.py @@ -0,0 +1,176 @@ +"""Eval wrapper — turns a function into a scored eval test. + +The wrapper intercepts the return value, runs evaluators, and returns +an EvalPayload. The rest of the pipeline (executor, outcome builder, +reporters) handles it like any eval test. 
+""" + +from __future__ import annotations + +import asyncio +import functools +import time +from typing import Any + +from protest.entities.events import EvalPayload, EvalScoreEntry +from protest.evals.evaluator import EvalContext, extract_scores_from_result +from protest.evals.types import EvalScore + + +def make_eval_wrapper( + func: Any, + evaluators: list[Any], + expected_key: str, +) -> Any: + """Wrap a function to run evaluators on its return value.""" + + @functools.wraps(func) + async def eval_wrapper(**kwargs: Any) -> EvalPayload: + expected = _extract_expected(kwargs, expected_key) + case_name = _extract_case_name(kwargs, func.__name__) + inputs = _extract_inputs(kwargs) + metadata = _extract_metadata(kwargs) + + start = time.perf_counter() + if asyncio.iscoroutinefunction(func): + output = await func(**kwargs) + else: + output = func(**kwargs) + task_duration = time.perf_counter() - start + + all_evaluators = list(evaluators) + per_case = _extract_per_case_evaluators(kwargs) + all_evaluators.extend(per_case) + + scores = await run_evaluators( + all_evaluators, + case_name, + inputs, + output, + expected, + metadata, + task_duration, + ) + + from protest.evals.hashing import compute_case_hash, compute_eval_hash + + return EvalPayload( + case_name=case_name, + passed=all(s.passed for s in scores), + task_duration=task_duration, + inputs=inputs, + output=output, + expected_output=expected, + scores={ + s.name: EvalScoreEntry( + value=s.value, + passed=s.passed, + ) + for s in scores + }, + case_hash=compute_case_hash(inputs, expected), + eval_hash=compute_eval_hash(all_evaluators), + ) + + return eval_wrapper + + +# --------------------------------------------------------------------------- +# Extract helpers — pull data from case_kwargs (dict or dataclass) +# --------------------------------------------------------------------------- + + +def _get(obj: Any, key: str, default: Any = None) -> Any: + """Get a value from a dict or dataclass by key/attr 
name.""" + if isinstance(obj, dict): + return obj.get(key, default) + return getattr(obj, key, default) + + +def _is_case_data(v: Any) -> bool: + """Check if a value looks like case data (dict or has 'expected'/'q'/'inputs').""" + if isinstance(v, dict): + return True + return hasattr(v, "expected") or hasattr(v, "q") or hasattr(v, "inputs") + + +def _extract_expected(kwargs: dict[str, Any], key: str) -> Any: + for v in kwargs.values(): + if _is_case_data(v): + val = _get(v, key) + if val is not None: + return val + return None + + +def _extract_case_name(kwargs: dict[str, Any], fallback: str) -> str: + for v in kwargs.values(): + if _is_case_data(v): + name = _get(v, "name") + if name: + return name + return fallback + + +def _extract_inputs(kwargs: dict[str, Any]) -> Any: + for v in kwargs.values(): + if _is_case_data(v): + return _get(v, "inputs") or _get(v, "q") or _get(v, "input") + return None + + +def _extract_metadata(kwargs: dict[str, Any]) -> Any: + for v in kwargs.values(): + if _is_case_data(v): + val = _get(v, "metadata") + if val is not None: + return val + return None + + +def _extract_per_case_evaluators(kwargs: dict[str, Any]) -> list[Any]: + for v in kwargs.values(): + if _is_case_data(v): + evs = _get(v, "evaluators") + if evs: + return list(evs) + return [] + + +# --------------------------------------------------------------------------- +# Evaluator execution +# --------------------------------------------------------------------------- + + +async def run_evaluators( + evaluators: list[Any], + case_name: str, + inputs: Any, + output: Any, + expected_output: Any, + metadata: Any, + duration: float, +) -> list[EvalScore]: + """Run evaluators and convert results to EvalScores.""" + ctx = EvalContext( + name=case_name, + inputs=inputs, + output=output, + expected_output=expected_output, + metadata=metadata, + duration=duration, + ) + + scores: list[EvalScore] = [] + for ev in evaluators: + evaluator_name = getattr(ev, "__name__", 
type(ev).__name__) + try: + raw = ev(ctx) + result = await raw if asyncio.iscoroutine(raw) else raw + scores.extend(extract_scores_from_result(result, evaluator_name)) + except Exception as exc: + from protest.exceptions import FixtureError + + raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc + + return scores diff --git a/protest/filters/kind.py b/protest/filters/kind.py new file mode 100644 index 0000000..859e7dd --- /dev/null +++ b/protest/filters/kind.py @@ -0,0 +1,36 @@ +"""KindFilterPlugin — filters tests by suite kind (test/eval).""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from protest.plugin import PluginBase + +if TYPE_CHECKING: + from protest.entities import TestItem + from protest.plugin import PluginContext + + +class KindFilterPlugin(PluginBase): + """Filters collected tests by suite kind ('test' or 'eval').""" + + name = "kind-filter" + description = "Filter by suite kind" + + def __init__(self, kind: str) -> None: + self._kind = kind + + @classmethod + def activate(cls, ctx: PluginContext) -> KindFilterPlugin | None: + kind = ctx.get("kind_filter") + if kind: + return cls(kind=kind) + return None + + def on_collection_finish(self, items: list[TestItem]) -> list[TestItem]: + return [item for item in items if self._matches(item)] + + def _matches(self, item: TestItem) -> bool: + if item.suite is None: + return self._kind == "test" + return item.suite.kind == self._kind From 4310e577e1234a57589a94a00609d12264e8fab8 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 27 Mar 2026 19:00:00 +0100 Subject: [PATCH 03/60] feat(reporters): Rich eval table, multi-model history, console.print - on_eval_suite_end: Rich table for scores, plain text for ASCII - Scores inline in -v, --show-output for inputs/output/expected - --show-logs flag for captured log records - Fixture setup time always displayed - protest history --runs: per-suite breakdown with model - 
protest.console.print(): progress output bypassing capture - Lifecycle messages bypass capture (no re-display on fail) - Output truncated at 20 lines with pointer to full output - Case id in lifecycle messages (chatbot[lookup] not chatbot) --- protest/__init__.py | 1 + protest/cli/history.py | 522 +++++++++++++++++++++++++++++ protest/cli/main.py | 60 ++-- protest/console.py | 70 ++++ protest/reporting/ascii.py | 48 ++- protest/reporting/rich_reporter.py | 202 ++++++++++- 6 files changed, 859 insertions(+), 44 deletions(-) create mode 100644 protest/cli/history.py create mode 100644 protest/console.py diff --git a/protest/__init__.py b/protest/__init__.py index 6590311..9317c9f 100644 --- a/protest/__init__.py +++ b/protest/__init__.py @@ -1,3 +1,4 @@ +from protest import console from protest.api import collect_tests, list_tags, run_session from protest.assertions import ExceptionInfo, RaisesContext, raises, warns from protest.core.session import ProTestSession diff --git a/protest/cli/history.py b/protest/cli/history.py new file mode 100644 index 0000000..f9eb7ac --- /dev/null +++ b/protest/cli/history.py @@ -0,0 +1,522 @@ +"""CLI command: protest history — browse run history.""" + +from __future__ import annotations + +import argparse +import sys +from typing import Any + + +def handle_history_command(argv: list[str]) -> None: + """Entry point for `protest history`.""" + parser = argparse.ArgumentParser( + prog="protest history", description="Browse run history" + ) + parser.add_argument( + "--tail", "-n", type=int, default=10, help="Number of entries (default: 10)" + ) + parser.add_argument("--model", type=str, default=None, help="Filter by model name") + parser.add_argument("--suite", type=str, default=None, help="Filter by suite name") + parser.add_argument("--runs", action="store_true", help="Show run-by-run list") + parser.add_argument( + "--show", + nargs="?", + const=0, + type=int, + default=None, + metavar="N", + help="Detailed panel for Nth most recent 
run (0=latest)", + ) + parser.add_argument( + "--compare", action="store_true", help="Compare 2 most recent runs" + ) + parser.add_argument("--evals", action="store_true", help="Eval runs only") + parser.add_argument("--tests", action="store_true", help="Test runs only") + parser.add_argument( + "--clean-dirty", + action="store_true", + help="Remove runs with uncommitted changes on current commit.", + ) + parser.add_argument( + "--path", type=str, default=None, help="History directory (default: .protest/)" + ) + + args = parser.parse_args(argv) + from pathlib import Path + + from protest.history.storage import clean_dirty, load_history + + history_dir = Path(args.path) if args.path else None + + if args.clean_dirty: + removed = clean_dirty(history_dir=history_dir) + print( + f"Removed {removed} dirty entries." + if removed + else "No dirty entries to clean." + ) + sys.exit(0) + + entries = load_history( + history_dir=history_dir, + model=args.model, + suite=args.suite, + evals_only=args.evals, + tests_only=args.tests, + ) + if not entries: + print("No history found.") + sys.exit(0) + + out = _get_output() + if args.compare: + if len(entries) < 2: + print("Need at least 2 runs to compare.") + sys.exit(1) + out.compare(entries[-1], entries[-2]) + elif args.show is not None: + idx = args.show + if idx >= len(entries): + print(f"Only {len(entries)} entries available.") + sys.exit(1) + out.detail(entries[-(idx + 1)]) + elif args.runs: + out.runs(entries[-args.tail :]) + else: + out.stats(entries) + + +# --------------------------------------------------------------------------- +# Output abstraction — Rich if available, plain text fallback +# --------------------------------------------------------------------------- + + +class _Output: + """Base output — plain text.""" + + def stats(self, entries: list[dict[str, Any]]) -> None: + suites = _aggregate_suites(entries) + if not suites: + print("No suite data found.") + return + print(f"\n {'Suite':<22} {'Kind':<6} 
{'Runs':>4} {'Pass rate':<16} {'Flaky'}") + for name in sorted(suites): + s = suites[name] + rate_str = _format_rate(s["pass_rates"]) + flaky_n = len(s["flaky"]) + print( + f" {name:<22} {s['kind']:<6} {s['n_runs']:>4} {rate_str:<16} {flaky_n or ''}" + ) + print() + + def runs(self, entries: list[dict[str, Any]]) -> None: + for i, e in enumerate(entries): + p, t, r = _entry_stats(e) + git = (e.get("git") or {}).get("commit_short", "?") + ts = e.get("timestamp", "?")[:16] + print(f"\n #{len(entries) - i:<3} {ts} {p}/{t} ({r * 100:.0f}%) {git}") + for sn, sd in e.get("suites", {}).items(): + if not isinstance(sd, dict): + continue + sp = sd.get("passed", 0) + st = sd.get("total_cases", 0) + sr = sp / st * 100 if st else 0 + model = sd.get("model") or "-" + print(f" {sn:<20} {sp}/{st} ({sr:.0f}%) {model}") + print() + + def detail(self, entry: dict[str, Any]) -> None: + kind = "EVAL" if entry.get("evals") else "TEST" + git = entry.get("git") or {} + ts = entry.get("timestamp", "?")[:19] + print( + f"\n {kind} run {ts} {git.get('commit_short', '?')} @ {git.get('branch', '?')}" + ) + for sn, sd in entry.get("suites", {}).items(): + if not isinstance(sd, dict): + continue + suite_model = sd.get("model") + model_str = f" [{suite_model}]" if suite_model else "" + print( + f"\n Suite: {sn} {sd.get('passed', 0)}/{sd.get('total_cases', 0)}{model_str}" + ) + for cn, cd in sd.get("cases", {}).items(): + if not isinstance(cd, dict): + continue + m = "+" if cd.get("passed") else "-" + print(f" {m} {cn} ({_fmt_dur(cd.get('duration', 0))})") + print() + + def compare(self, current: dict[str, Any], previous: dict[str, Any]) -> None: + cm = _get_display_model(current) + pm = _get_display_model(previous) + _, _, cr = _entry_stats(current) + _, _, pr = _entry_stats(previous) + if cm == pm: + print(f"\n Model: {cm}") + else: + print(f"\n Model: {pm} → {cm}") + print(f" Pass rate: {pr * 100:.0f}% → {cr * 100:.0f}%") + changes = _classify_changes(_all_cases(current), _all_cases(previous)) 
+ _print_changes(changes) + + +class _RichOutput(_Output): + """Rich output with colors, tables, panels.""" + + def __init__(self) -> None: + from rich.console import Console + + self.console = Console(highlight=False) + + def stats(self, entries: list[dict[str, Any]]) -> None: + from rich.table import Table + + suites = _aggregate_suites(entries) + if not suites: + self.console.print("No suite data found.") + return + table = Table(show_header=True, header_style="bold", box=None, pad_edge=False) + table.add_column("Suite", min_width=12, no_wrap=True) + table.add_column("Kind", width=5) + table.add_column("Runs", justify="right", width=4) + table.add_column("Pass rate", min_width=14, no_wrap=True) + table.add_column("Scores", no_wrap=True) + table.add_column("Flaky", width=5) + + for name in sorted(suites): + s = suites[name] + kind = s["kind"] + kind_color = "cyan" if kind == "eval" else "blue" + rate_str = _rich_rate(s["pass_rates"]) + score_arrows = _rich_score_arrows(s.get("score_values", {})) + flaky_n = len(s["flaky"]) + flaky_str = f"[yellow]{flaky_n}[/]" if flaky_n else "" + table.add_row( + name, + f"[{kind_color}]{kind}[/]", + str(s["n_runs"]), + rate_str, + score_arrows, + flaky_str, + ) + + self.console.print() + self.console.print(table) + self.console.print() + + def runs(self, entries: list[dict[str, Any]]) -> None: + self.console.print() + for i, e in enumerate(entries): + p, t, r = _entry_stats(e) + git = (e.get("git") or {}).get("commit_short", "?") + ts = e.get("timestamp", "?")[:16] + rate_color = "green" if r >= 0.8 else "yellow" if r >= 0.5 else "red" + self.console.print( + f" [dim]#{len(entries) - i:<3}[/] {ts} " + f"[{rate_color}]{p}/{t} ({r * 100:.0f}%)[/] [dim]{git}[/]" + ) + for sn, sd in e.get("suites", {}).items(): + if not isinstance(sd, dict): + continue + sp = sd.get("passed", 0) + st = sd.get("total_cases", 0) + sr = sp / st * 100 if st else 0 + sc = "green" if sr >= 80 else "yellow" if sr >= 50 else "red" + model = sd.get("model") 
or "-" + self.console.print( + f" {sn:<20} [{sc}]{sp}/{st} ({sr:.0f}%)[/] [cyan]{model}[/]" + ) + self.console.print() + + def detail(self, entry: dict[str, Any]) -> None: + from rich.panel import Panel + from rich.text import Text + + kind = "EVAL" if entry.get("evals") else "TEST" + git = entry.get("git") or {} + ts = entry.get("timestamp", "?")[:19] + evals_info = entry.get("evals") or {} + + lines = Text() + lines.append(f"{kind} run", style="bold") + lines.append(f" {ts} ", style="dim") + lines.append( + f"{git.get('commit_short', '?')} @ {git.get('branch', '?')}\n", style="dim" + ) + + # Scores summary + for sn, stats in evals_info.get("scores_summary", {}).items(): + mean = stats.get("mean", 0) + color = "green" if mean >= 0.8 else "yellow" if mean >= 0.5 else "red" + lines.append(f" {sn}: ", style="dim") + lines.append(f"mean={mean:.2f}", style=color) + lines.append( + f" p50={stats.get('median', 0):.2f} p95={stats.get('p95', 0):.2f}\n", + style="dim", + ) + + for sn, sd in entry.get("suites", {}).items(): + if not isinstance(sd, dict): + continue + p, t = sd.get("passed", 0), sd.get("total_cases", 0) + lines.append("\nSuite: ", style="bold") + lines.append(sn) + pc = "green" if p == t else "yellow" if p >= t * 0.5 else "red" + lines.append(f" {p}/{t}", style=pc) + suite_model = sd.get("model") + if suite_model: + lines.append(f" [{suite_model}]", style="cyan") + lines.append(f" {_fmt_dur(sd.get('duration', 0))}\n", style="dim") + for cn, cd in sd.get("cases", {}).items(): + if not isinstance(cd, dict): + continue + if cd.get("passed"): + lines.append(" + ", style="green") + else: + lines.append(" - ", style="red") + lines.append(cn) + lines.append(f" ({_fmt_dur(cd.get('duration', 0))})\n", style="dim") + + self.console.print() + self.console.print( + Panel(lines, title="[bold]Run Detail[/]", border_style="cyan") + ) + + def compare(self, current: dict[str, Any], previous: dict[str, Any]) -> None: + from rich.panel import Panel + from rich.text import Text 
+ + cm = _get_display_model(current) + pm = _get_display_model(previous) + _, _, cr = _entry_stats(current) + _, _, pr = _entry_stats(previous) + delta = cr - pr + + lines = Text() + if cm == pm: + lines.append(f"Model: {cm}\n", style="cyan") + else: + lines.append(f"Model: {pm} → {cm}\n", style="cyan") + + lines.append("Pass rate: ") + lines.append(f"{pr * 100:.0f}%", style="dim") + lines.append(" → ") + rc = "green" if delta > 0 else "red" if delta < 0 else "" + lines.append(f"{cr * 100:.0f}%", style=rc) + if abs(delta) >= 0.001: + lines.append(f" ({delta * 100:+.0f}%)", style=rc) + lines.append("\n\n") + + changes = _classify_changes(_all_cases(current), _all_cases(previous)) + labels = [ + ("fixed", "Fixed", "green", "+"), + ("regressed", "Regressions", "red", "-"), + ("modified", "Modified", "yellow", "⟳"), + ("new", "New", "cyan", "*"), + ] + has_any = False + for key, label, color, marker in labels: + items = changes[key] + if items: + has_any = True + lines.append(f"{label} ({len(items)}):\n", style=color) + for n in items: + lines.append(f" {marker} {n}\n") + lines.append("\n") + if not has_any: + lines.append("No changes.\n", style="dim") + + self.console.print() + self.console.print( + Panel(lines, title="[bold]Run Comparison[/]", border_style="cyan") + ) + + +def _get_output() -> _Output: + try: + return _RichOutput() + except ImportError: + return _Output() + + +# --------------------------------------------------------------------------- +# Rich helpers +# --------------------------------------------------------------------------- + + +def _rich_rate(rates: list[float]) -> str: + if len(rates) >= 2: + first, last = rates[0], rates[-1] + delta = last - first + if delta > 0.01: + return f"[dim]{first * 100:.0f}%[/] [green]↗ {last * 100:.0f}%[/]" + if delta < -0.01: + return f"[dim]{first * 100:.0f}%[/] [red]↘ {last * 100:.0f}%[/]" + return f"{last * 100:.0f}%" + if rates: + return f"{rates[0] * 100:.0f}%" + return "-" + + +def 
_rich_score_arrows(score_values: dict[str, list[float]]) -> str: + """Score trend arrows: ↗↘→ per score.""" + parts: list[str] = [] + for _name, values in sorted(score_values.items()): + if len(values) >= 2: + d = values[-1] - values[0] + if d > 0.01: + parts.append("[green]↗[/]") + elif d < -0.01: + parts.append("[red]↘[/]") + else: + parts.append("[dim]→[/]") + return "".join(parts) + + +# --------------------------------------------------------------------------- +# Data helpers +# --------------------------------------------------------------------------- + + +def _format_rate(rates: list[float]) -> str: + if len(rates) >= 2: + first, last = rates[0], rates[-1] + delta = last - first + arrow = "↗" if delta > 0.01 else "↘" if delta < -0.01 else "→" + return f"{first * 100:.0f}% {arrow} {last * 100:.0f}%" + if rates: + return f"{rates[0] * 100:.0f}%" + return "-" + + +def _aggregate_suites(entries: list[dict[str, Any]]) -> dict[str, dict[str, Any]]: + suites: dict[str, dict[str, Any]] = {} + for entry in entries: + for name, data in entry.get("suites", {}).items(): + if not isinstance(data, dict): + continue + if name not in suites: + suites[name] = { + "kind": data.get("kind", "test"), + "n_runs": 0, + "pass_rates": [], + "flaky": {}, + "cases_seen": {}, + "score_values": {}, + } + s = suites[name] + s["n_runs"] += 1 + total = data.get("total_cases", 0) + passed = data.get("passed", 0) + if total: + s["pass_rates"].append(passed / total) + _track_cases(s, data.get("cases", {})) + + for s in suites.values(): + s["flaky"] = { + cn: cs["fails"] + for cn, cs in s["cases_seen"].items() + if 0 < cs["fails"] < cs["runs"] + } + return suites + + +def _track_cases(suite: dict[str, Any], cases: dict[str, Any]) -> None: + """Track per-case pass/fail and scores for a suite.""" + for cn, cd in cases.items(): + if not isinstance(cd, dict): + continue + if cn not in suite["cases_seen"]: + suite["cases_seen"][cn] = {"runs": 0, "fails": 0} + suite["cases_seen"][cn]["runs"] += 1 
+ if not cd.get("passed", True): + suite["cases_seen"][cn]["fails"] += 1 + for sn, sv in cd.get("scores", {}).items(): + if isinstance(sv, (int, float)): + if sn not in suite["score_values"]: + suite["score_values"][sn] = [] + suite["score_values"][sn].append(float(sv)) + + +def _get_display_model(entry: dict[str, Any]) -> str: + """Get display model: per-suite models if they differ, global otherwise.""" + suite_models = { + sd.get("model") + for sd in entry.get("suites", {}).values() + if isinstance(sd, dict) and sd.get("model") + } + if len(suite_models) > 1: + return ", ".join(sorted(suite_models)) + if suite_models: + return next(iter(suite_models)) + return (entry.get("evals") or {}).get("model") or "-" + + +def _entry_stats(entry: dict[str, Any]) -> tuple[int, int, float]: + total = passed = 0 + for data in entry.get("suites", {}).values(): + if isinstance(data, dict): + total += data.get("total_cases", 0) + passed += data.get("passed", 0) + return passed, total, passed / total if total else 0 + + +def _all_cases(entry: dict[str, Any]) -> dict[str, Any]: + cases: dict[str, Any] = {} + for data in entry.get("suites", {}).values(): + if isinstance(data, dict): + cases.update(data.get("cases", {})) + return cases + + +def _classify_changes( + curr_cases: dict[str, Any], + prev_cases: dict[str, Any], +) -> dict[str, list[str]]: + result: dict[str, list[str]] = { + "fixed": [], + "regressed": [], + "modified": [], + "new": [], + } + for name, curr in curr_cases.items(): + prev = prev_cases.get(name) + if prev is None: + result["new"].append(name) + elif curr.get("case_hash") and curr["case_hash"] != prev.get("case_hash"): + result["modified"].append(f"{name} (case modified)") + elif curr.get("eval_hash") and curr["eval_hash"] != prev.get("eval_hash"): + result["modified"].append(f"{name} (scoring modified)") + elif curr.get("passed") and not prev.get("passed"): + result["fixed"].append(name) + elif not curr.get("passed") and prev.get("passed"): + 
result["regressed"].append(name) + return result + + +def _print_changes(changes: dict[str, list[str]]) -> None: + labels = { + "fixed": ("Fixed", "+"), + "regressed": ("Regressions", "-"), + "modified": ("Modified", "⟳"), + "new": ("New", "*"), + } + has_any = False + for key, (label, marker) in labels.items(): + if changes[key]: + has_any = True + print(f"\n {label} ({len(changes[key])}):") + for n in changes[key]: + print(f" {marker} {n}") + if not has_any: + print(" No changes.") + print() + + +def _fmt_dur(seconds: float) -> str: + if seconds < 1: + return f"{seconds * 1000:.0f}ms" + if seconds < 60: + return f"{seconds:.1f}s" + return f"{int(seconds // 60)}m{seconds % 60:.0f}s" diff --git a/protest/cli/main.py b/protest/cli/main.py index a913e7f..0ee6f2a 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -2,7 +2,7 @@ import argparse import sys -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from protest.core.session import ProTestSession @@ -103,19 +103,21 @@ def main() -> None: _print_help() return - if command == "tags": - _handle_tags_command() + commands: dict[str, Any] = { + "tags": _handle_tags_command, + "run": lambda: _handle_run_command(kind_filter="test"), + "eval": lambda: _handle_run_command(kind_filter="eval"), + "history": _handle_history_command, + "live": _handle_live_command, + } + + handler = commands.get(command) + if handler: + handler() return - if command == "run": - _handle_run_command() - return - - if command == "live": - _handle_live_command() - return - - print(f"Error: Unknown command '{command}'. Use 'run', 'tags', or 'live'.") + valid = ", ".join(f"'{c}'" for c in commands) + print(f"Error: Unknown command '{command}'. 
Use {valid}.") sys.exit(1) @@ -143,9 +145,11 @@ def _print_help() -> None: """Print main help.""" print("ProTest - Async-first Python test framework\n") print("Commands:") - print(" run Run tests") - print(" live Start live reporter server") - print(" tags Tag inspection commands") + print(" run Run tests") + print(" eval Run evaluations") + print(" history Browse run history") + print(" live Start live reporter server") + print(" tags Tag inspection commands") print(HELP_EPILOG) @@ -228,8 +232,15 @@ def _create_run_parser() -> argparse.ArgumentParser: return parser -def _handle_run_command() -> None: - """Handle 'protest run' subcommand with two-phase parsing.""" +def _handle_history_command() -> None: + """Handle 'protest history' subcommand.""" + from protest.cli.history import handle_history_command + + handle_history_command(sys.argv[2:]) + + +def _handle_run_command(kind_filter: str | None = None) -> None: + """Handle 'protest run' / 'protest eval' with two-phase parsing.""" from protest.loader import LoadError, load_session, parse_target argv = sys.argv[2:] @@ -275,13 +286,14 @@ def _handle_run_command() -> None: from protest.reporting.verbosity import Verbosity effective_verbosity = Verbosity.QUIET if args.quiet else args.verbosity - ctx = PluginContext( - args={ - **vars(args), - "target_suite": suite_filter, - "verbosity": effective_verbosity, - } - ) + ctx_args: dict[str, Any] = { + **vars(args), + "target_suite": suite_filter, + "verbosity": effective_verbosity, + } + if kind_filter: + ctx_args["kind_filter"] = kind_filter + ctx = PluginContext(args=ctx_args) # Phase 6: Run tests (api.run_session handles plugin activation) run_tests(session, ctx, collect_only=args.collect_only) diff --git a/protest/console.py b/protest/console.py new file mode 100644 index 0000000..9270c16 --- /dev/null +++ b/protest/console.py @@ -0,0 +1,70 @@ +"""protest.console — progress output that bypasses test capture. 
+ +Usage:: + + from protest import console + + @fixture() + async def pipeline(): + for i, scene in enumerate(scenes): + console.print(f"[bold]pipeline:[/] importing {scene.name} ({i+1}/{len(scenes)})") + await import_scene(scene) + + # Raw mode — no markup processing + console.print("debug: raw bytes here", raw=True) + +Messages go through the event bus → reporters display them inline. +If no event bus is available (outside a protest session), falls back to stderr. +""" + +from __future__ import annotations + +import re +import sys + + +def print(msg: str, *, raw: bool = False) -> None: + """Print a message that bypasses test capture. + + Goes through the event bus so reporters display it at the right place. + Supports Rich markup (stripped for ASCII reporter). + + Args: + msg: The message to print. Supports Rich markup unless raw=True. + raw: If True, no markup processing — message passed as-is. + """ + from protest.execution.capture import get_event_bus + + bus = get_event_bus() + if bus is None: + _fallback_print(msg, raw) + return + + from protest.events.types import Event + + # Call handlers directly (sync, bypasses async emit). + # This ensures messages appear immediately, not after the test. + for handler_entry in bus._handlers.get(Event.USER_PRINT, []): + try: + handler_entry.func((msg, raw)) + except Exception: + pass + + +def _fallback_print(msg: str, raw: bool) -> None: + """Fallback when no event bus — write to real stderr (bypassing capture).""" + text = msg if raw else strip_markup(msg) + # sys.stderr may be wrapped by TaskAwareStream — get the original + stream = getattr(sys.stderr, "_original", sys.stderr) + stream.write(text + "\n") + stream.flush() + + +def strip_markup(msg: str) -> str: + """Strip Rich markup tags from a string. + + Handles escaped brackets (``\\[text]`` → ``[text]``). 
+ """ + msg = msg.replace("\\[", "\x00") + msg = re.sub(r"\[/?[^\]]*\]", "", msg) + return msg.replace("\x00", "[") diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index 9ff7211..a52c509 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -1,5 +1,6 @@ import traceback from pathlib import Path +from typing import Any from typing_extensions import Self @@ -123,7 +124,7 @@ def on_fixture_setup_start(self, info: FixtureInfo) -> None: print(f" -> fixture '{info.name}' setup... ({info.scope.value})") def on_fixture_setup_done(self, info: FixtureInfo) -> None: - if self._verbosity >= Verbosity.FIXTURES: + if self._verbosity >= Verbosity.NORMAL: print( f" -> fixture '{info.name}' ready ({_format_duration(info.duration)})" ) @@ -140,11 +141,19 @@ def on_fixture_teardown_done(self, info: FixtureInfo) -> None: def on_test_setup_done(self, info: TestStartInfo) -> None: if self._verbosity >= Verbosity.FIXTURES: - print(f" > {info.name} setup done") + self._print_bypass(f" > {info.name} setup done") def on_test_teardown_start(self, info: TestTeardownInfo) -> None: if self._verbosity >= Verbosity.FIXTURES: - print(f" < {info.name} teardown...") + self._print_bypass(f" < {info.name} teardown...") + + @staticmethod + def _print_bypass(msg: str) -> None: + import sys + + stream = getattr(sys.stdout, "_original", sys.stdout) + stream.write(msg + "\n") + stream.flush() def on_test_retry(self, info: TestRetryInfo) -> None: delay_msg = f", retrying in {info.delay}s" if info.delay > 0 else "" @@ -250,6 +259,39 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: for line in result.output.rstrip().splitlines(): print(f" {line}") + def on_user_print(self, data: Any) -> None: + import sys + + from protest.console import strip_markup + + msg, raw = data + text = msg if raw else strip_markup(msg) + stream = getattr(sys.stdout, "_original", sys.stdout) + stream.write(f" | {text}\n") + stream.flush() + + def 
on_eval_suite_end(self, report: Any) -> None: + from protest.evals.types import EvalSuiteReport + + if not isinstance(report, EvalSuiteReport): + return + stats = report.all_score_stats() + print() + print(f" Eval: {report.suite_name} ({report.total_count} cases)") + if stats: + max_name = max(len(s.name) for s in stats) + print(" " + "─" * 60) + for s in stats: + print( + f" {s.name:<{max_name}} " + f"mean={s.mean:.2f} p50={s.median:.2f} " + f"p5={s.p5:.2f} p95={s.p95:.2f}" + ) + print(" " + "─" * 60) + rate_pct = report.pass_rate * 100 + print(f" Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)") + print() + def on_session_complete(self, result: SessionResult) -> None: if self._failed_results or self._error_results: self._print_failure_summary() diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 2931e6b..8f263d9 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -1,6 +1,7 @@ import traceback from argparse import ArgumentParser from pathlib import Path +from typing import Any from rich.console import Console # type: ignore[import-not-found] from typing_extensions import Self @@ -24,12 +25,17 @@ from protest.reporting.verbosity import Verbosity +def _short_label(name: str, node_id: str) -> str: + """name + [case_id] from node_id.""" + if "[" in node_id: + suffix = node_id[node_id.index("[") :] + return f"{name}{suffix}" + return name + + def _format_test_name(result: TestResult) -> str: - if "[" in result.node_id: - suffix = result.node_id[result.node_id.index("[") :] - escaped_suffix = suffix.replace("[", "\\[") - return f"{result.name}{escaped_suffix}" - return result.name + label = _short_label(result.name, result.node_id) + return label.replace("[", "\\[") MIN_DURATION_THRESHOLD = 0.001 @@ -43,15 +49,38 @@ def _format_duration(seconds: float) -> str: return f"{seconds:.2f}s" +def _format_eval_scores_inline(result: TestResult) -> str: + """Format eval scores for 
inline display (e.g. ' bg_score=0.8 char_id=1.0').""" + if not result.eval_payload: + return "" + parts = [] + for name, entry in result.eval_payload.scores.items(): + val = entry.value + if isinstance(val, bool): + parts.append(f"{name}={'✓' if val else '✗'}") + elif isinstance(val, float): + parts.append(f"{name}={val:.2f}") + else: + parts.append(f"{name}={val}") + return f" [dim]{' '.join(parts)}[/]" if parts else "" + + class RichReporter(PluginBase): """Rich console reporter with colors.""" name = "rich-reporter" description = "Rich console reporter with colors" - def __init__(self, verbosity: int = 0) -> None: + def __init__( + self, + verbosity: int = 0, + show_logs: str | None = None, + show_output: bool = False, + ) -> None: self.console = Console(highlight=False) self._verbosity = verbosity + self._show_logs = show_logs + self._show_output = show_output self._failed_results: list[TestResult] = [] self._error_results: list[TestResult] = [] @@ -71,16 +100,80 @@ def add_cli_options(cls, parser: ArgumentParser) -> None: action="store_true", help="Disable colors (plain ASCII output)", ) + group.add_argument( + "--show-logs", + dest="show_logs", + nargs="?", + const="INFO", + default=None, + metavar="LEVEL", + help="Show captured log records (default: INFO+)", + ) + group.add_argument( + "--show-output", + dest="show_output", + action="store_true", + help="Show eval inputs/output/expected per case", + ) @classmethod def activate(cls, ctx: PluginContext) -> Self | None: if ctx.get("no_color", False): return None - return cls(verbosity=ctx.get("verbosity", 0)) + return cls( + verbosity=ctx.get("verbosity", 0), + show_logs=ctx.get("show_logs"), + show_output=ctx.get("show_output", False), + ) def _print(self, message: str) -> None: self.console.print(message) + def _print_eval_detail(self, result: TestResult) -> None: + """Print eval inputs/output/expected for -vv verbosity.""" + p = result.eval_payload + if not p: + return + if p.inputs is not None: + inp = 
str(p.inputs)[:200] + self._print(f"[dim] │ inputs: {inp}[/]") + if p.output is not None: + out = str(p.output)[:200] + self._print(f"[dim] │ output: {out}[/]") + if p.expected_output is not None: + exp = str(p.expected_output)[:200] + self._print(f"[dim] │ expected: {exp}[/]") + + def _maybe_show_logs(self, result: TestResult) -> None: + """Show captured log records if --show-logs is active.""" + if not self._show_logs or not result.log_records: + return + import logging + + min_level = getattr(logging, self._show_logs.upper(), logging.INFO) + for record in result.log_records: + if record.levelno >= min_level: + level = record.levelname + color = ( + "red" + if record.levelno >= logging.ERROR + else "yellow" + if record.levelno >= logging.WARNING + else "dim" + ) + self._print( + f"[{color}] LOG [{level}] {record.name}: {record.getMessage()}[/]" + ) + + def _print_bypass(self, message: str) -> None: + """Print bypassing capture (for lifecycle messages emitted during tests).""" + import sys + + from rich.console import Console + + stream = getattr(sys.stdout, "_original", sys.stdout) + Console(file=stream, highlight=False).print(message) + def on_collection_finish(self, items: list[TestItem]) -> list[TestItem]: return items @@ -128,7 +221,7 @@ def on_fixture_setup_start(self, info: FixtureInfo) -> None: self._print(f"[dim] ↳ fixture '{info.name}' setup... 
{scope_str}[/]") def on_fixture_setup_done(self, info: FixtureInfo) -> None: - if self._verbosity >= Verbosity.FIXTURES: + if self._verbosity >= Verbosity.NORMAL: self._print( f"[dim] ↳ fixture '{info.name}' ready ({_format_duration(info.duration)})[/]" ) @@ -145,11 +238,13 @@ def on_fixture_teardown_done(self, info: FixtureInfo) -> None: def on_test_setup_done(self, info: TestStartInfo) -> None: if self._verbosity >= Verbosity.FIXTURES: - self._print(f"[dim] → {info.name} setup done[/]") + label = _short_label(info.name, info.node_id).replace("[", "\\[") + self._print_bypass(f"[dim] → {label} setup done[/]") def on_test_teardown_start(self, info: TestTeardownInfo) -> None: if self._verbosity >= Verbosity.FIXTURES: - self._print(f"[dim] ← {info.name} teardown...[/]") + label = _short_label(info.name, info.node_id).replace("[", "\\[") + self._print_bypass(f"[dim] ← {label} teardown...[/]") def on_test_retry(self, info: TestRetryInfo) -> None: delay_msg = f", retrying in {info.delay}s" if info.delay > 0 else "" @@ -169,7 +264,13 @@ def on_test_pass(self, result: TestResult) -> None: retry_suffix = ( f" [dim]\\[attempt {result.attempt}/{result.max_attempts}][/]" ) - self._print(f" [green]✓[/] {name} [dim]({duration})[/]{retry_suffix}") + scores_str = _format_eval_scores_inline(result) if result.is_eval else "" + self._print( + f" [green]✓[/] {name} [dim]({duration})[/]{scores_str}{retry_suffix}" + ) + if self._show_output and result.is_eval: + self._print_eval_detail(result) + self._maybe_show_logs(result) def on_test_fail(self, result: TestResult) -> None: name = _format_test_name(result) @@ -197,8 +298,17 @@ def on_test_fail(self, result: TestResult) -> None: self._print(f" [red]✗[/] {name}: {result.error}{retry_suffix}") if result.output: - for line in result.output.rstrip().splitlines(): + lines = result.output.rstrip().splitlines() + max_lines = 20 + for line in lines[:max_lines]: self._print(f"[dim] │ {line}[/]") + if len(lines) > max_lines: + self._print( + 
f"[dim] │ ... ({len(lines) - max_lines} more lines in .protest/last_run_stdout)[/]" + ) + if result.is_eval: + self._print_eval_detail(result) # always show on fail + self._maybe_show_logs(result) def on_test_skip(self, result: TestResult) -> None: self._skipped += 1 @@ -249,14 +359,16 @@ def _format_traceback(self, error: Exception) -> str: return "".join(lines) def _print_failure_summary(self) -> None: - if self._failed_results: + non_eval_failures = [r for r in self._failed_results if not r.is_eval] + if non_eval_failures: self._print("\n[bold red]═══ FAILURES ═══[/]") - for result in self._failed_results: + for result in non_eval_failures: self._print_failure_detail(result, is_error=False) - if self._error_results: + non_eval_errors = [r for r in self._error_results if not r.is_eval] + if non_eval_errors: self._print("\n[bold yellow]═══ ERRORS ═══[/]") - for result in self._error_results: + for result in non_eval_errors: self._print_failure_detail(result, is_error=True) def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: @@ -281,8 +393,64 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: escaped_line = line.replace("[", "\\[") self._print(f"[dim]{escaped_line}[/]") + def on_user_print(self, data: Any) -> None: + import sys + + from rich.console import Console + + msg, raw = data + # Write to the real stdout, bypassing capture + stream = getattr(sys.stdout, "_original", sys.stdout) + c = Console(file=stream, highlight=False) + if raw: + c.print(msg, markup=False) + else: + c.print(f"[dim] │[/] {msg}") + + def on_eval_suite_end(self, report: Any) -> None: + from rich.table import Table + + from protest.evals.types import EvalSuiteReport + + if not isinstance(report, EvalSuiteReport): + return + stats = report.all_score_stats() + self._print("") + if stats: + table = Table( + title=f"Eval: {report.suite_name} ({report.total_count} cases)", + show_header=True, + header_style="bold cyan", + padding=(0, 1), 
+ ) + table.add_column("Score", style="cyan", no_wrap=True) + table.add_column("mean", justify="right") + table.add_column("p50", justify="right") + table.add_column("p5", justify="right", style="dim") + table.add_column("p95", justify="right", style="dim") + for s in stats: + table.add_row( + s.name, + f"{s.mean:.2f}", + f"{s.median:.2f}", + f"{s.p5:.2f}", + f"{s.p95:.2f}", + ) + self.console.print(table) + else: + self._print( + f" [cyan]Eval: {report.suite_name} ({report.total_count} cases)[/]" + ) + rate_pct = report.pass_rate * 100 + color = "green" if rate_pct >= 100 else "yellow" if rate_pct >= 50 else "red" + self._print( + f" [{color}]Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)[/]" + ) + def on_session_complete(self, result: SessionResult) -> None: - if self._failed_results or self._error_results: + has_non_eval_failures = any(not r.is_eval for r in self._failed_results) + has_non_eval_errors = any(not r.is_eval for r in self._error_results) + if has_non_eval_failures or has_non_eval_errors: self._print_failure_summary() total = ( From 82f736b304fa3a48bf21b051017d022827dc569f Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sun, 29 Mar 2026 20:00:00 +0200 Subject: [PATCH 04/60] feat: tests, examples, and documentation - 1063 tests (56 eval-specific) - Yorkshire chatbot example with @session.eval + ForEach - History module: JSONL storage, git info, env info - docs/evals.md: full guide (scoring, evaluators, CLI, history) - docs/core-concepts/console.md: console.print guide --- docs/core-concepts/console.md | 49 ++ docs/evals.md | 356 ++++++++ examples/yorkshire/app/chatbot.py | 93 +++ examples/yorkshire/evals/__init__.py | 0 examples/yorkshire/evals/dataset.py | 122 +++ examples/yorkshire/evals/evaluators.py | 5 + examples/yorkshire/evals/session.py | 29 + examples/yorkshire/session.py | 52 ++ mkdocs.yml | 2 + protest/history/__init__.py | 17 + protest/history/collector.py | 81 ++ 
protest/history/plugin.py | 98 +++ protest/history/storage.py | 135 +++ tests/core/test_collector.py | 6 +- tests/core/test_parametrize.py | 2 +- tests/core/test_skip.py | 2 +- tests/core/test_skipif.py | 2 +- tests/core/test_xfail.py | 2 +- tests/evals/test_e2e.py | 1064 ++++++++++++++++++++++++ tests/evals/test_hashing.py | 72 ++ 20 files changed, 2182 insertions(+), 7 deletions(-) create mode 100644 docs/core-concepts/console.md create mode 100644 docs/evals.md create mode 100644 examples/yorkshire/app/chatbot.py create mode 100644 examples/yorkshire/evals/__init__.py create mode 100644 examples/yorkshire/evals/dataset.py create mode 100644 examples/yorkshire/evals/evaluators.py create mode 100644 examples/yorkshire/evals/session.py create mode 100644 examples/yorkshire/session.py create mode 100644 protest/history/__init__.py create mode 100644 protest/history/collector.py create mode 100644 protest/history/plugin.py create mode 100644 protest/history/storage.py create mode 100644 tests/evals/test_e2e.py create mode 100644 tests/evals/test_hashing.py diff --git a/docs/core-concepts/console.md b/docs/core-concepts/console.md new file mode 100644 index 0000000..b172246 --- /dev/null +++ b/docs/core-concepts/console.md @@ -0,0 +1,49 @@ +# Console Output + +Print progress and debug messages that bypass test capture. + +## The Problem + +`print()` inside tests and fixtures is captured by ProTest. During long-running fixtures (pipeline imports, graph seeding), there's no visible feedback. + +## `console.print` + +```python +from protest import console + +@fixture() +async def pipeline(): + for i, scene in enumerate(scenes): + console.print(f"[cyan]pipeline:[/] importing {scene.name} ({i+1}/{len(scenes)})") + await import_scene(scene) + return driver +``` + +Messages appear inline in the reporter output, between test results. + +## Rich Markup + +`console.print` supports Rich markup. The Rich reporter renders colors; the ASCII reporter strips tags. 
+ +```python +console.print(f"[bold green]done[/] in {duration:.1f}s") +console.print(f"[yellow]warning:[/] slow query ({elapsed:.2f}s)") +``` + +## Raw Mode + +Skip markup processing with `raw=True`: + +```python +console.print("debug: raw bytes here", raw=True) +``` + +The message is passed as-is to both reporters. + +## How It Works + +`console.print` sends a `USER_PRINT` event through the event bus. The reporter receives it and writes to the real stdout (bypassing test capture). This means: + +- Messages appear immediately, not buffered until test end +- Works with `-n 4` (concurrent tests) — the event bus serializes per plugin +- No interference with test capture or `result.output` diff --git a/docs/evals.md b/docs/evals.md new file mode 100644 index 0000000..11895aa --- /dev/null +++ b/docs/evals.md @@ -0,0 +1,356 @@ +# Evals + +Evaluate LLM outputs with scored metrics, thresholds, and historical tracking. + +## What is an Eval? + +A test produces **pass/fail**. An eval produces **scores** — numeric values (0.0–1.0) that measure output quality. Scores are aggregated across cases, tracked over time, and compared between runs. + +ProTest evals use the same infrastructure as tests: fixtures, DI, parallelism, tags. An eval is a test that returns a value, scored by evaluators. 
+ +## Quick Start + +```python +# evals/session.py +from typing import Annotated + +from protest import ForEach, From +from protest.evals import EvalCase, EvalSession, evaluator +from protest.evals.evaluators import contains_keywords + +cases = ForEach([ + EvalCase(inputs="Who is Marie?", expected="Marie, Resistance", name="lookup"), + EvalCase(inputs="What is 2+2?", expected="4", name="math"), +]) + +session = EvalSession() + +@session.eval(evaluators=[contains_keywords(keywords=["Marie"])]) +async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: + return await my_agent(case.inputs) +``` + +```bash +protest eval evals.session:session +``` + +## How It Works + +`@session.eval()` wraps a function to run evaluators on its return value: + +1. Your function receives case data via `ForEach`/`From` (same as parameterized tests) +2. It returns the output (string, object, anything) +3. ProTest passes the output to evaluators → scores +4. Scores determine pass/fail via thresholds +5. Aggregated stats appear in the terminal + +The rest of the pipeline — fixtures, DI, parallelism, reporters — works identically to tests. + +## EvalSession + +`EvalSession` is a session configured for evals. History is enabled by default. + +```python +from protest.evals import EvalSession, ModelInfo + +session = EvalSession( + model=ModelInfo(name="gpt-4o-mini"), # tracked in history + concurrency=4, # parallel eval cases + metadata={"version": "1.0"}, # stored in history +) +``` + +## EvalCase + +Typed dataclass for eval case data. Provides IDE autocompletion instead of untyped dicts. 
+ +```python +from protest.evals import EvalCase + +cases = ForEach([ + EvalCase(inputs="What is 2+2?", expected="4", name="math"), + EvalCase(inputs="Who is Napoleon?", expected="emperor, France", name="history"), +]) +``` + +| Field | Type | Description | +|-------|------|-------------| +| `inputs` | `Any` | Input to your task function | +| `expected` | `Any` | Expected output (passed to evaluators as `ctx.expected_output`) | +| `name` | `str` | Case identifier (used in test IDs and history) | +| `evaluators` | `list` | Per-case evaluators (added to suite-level ones) | +| `metadata` | `dict` | Arbitrary metadata | + +## Evaluators + +An evaluator is a function decorated with `@evaluator` that receives an `EvalContext` and returns a verdict. + +### Return Types + +Evaluators return `bool` (simple verdict) or a `dataclass` (structured result). The framework reads fields by type: + +| Field Type | Role | +|------------|------| +| `bool` | Verdict — pass/fail (`all(bool_fields)`) | +| `float` | Metric — aggregated in stats (mean/p50/p95) | +| `str` | Reason — displayed on failure, stored in history | + +Returning `float`, `dict`, or any other type raises `TypeError`. 
+ +### Simple Evaluator + +```python +@evaluator +def not_empty(ctx: EvalContext) -> bool: + return bool(ctx.output.strip()) +``` + +### Structured Evaluator + +```python +from dataclasses import dataclass + +@dataclass +class KeywordScores: + keyword_recall: float # metric → stats + all_present: bool # verdict → pass/fail + detail: str = "" # reason → shown on failure + +@evaluator +def keyword_check(ctx: EvalContext, keywords: list[str], min_recall: float = 0.5) -> KeywordScores: + found = [k for k in keywords if k.lower() in ctx.output.lower()] + recall = len(found) / len(keywords) + return KeywordScores( + keyword_recall=recall, + all_present=recall >= min_recall, + detail=f"found {len(found)}/{len(keywords)}", + ) +``` + +The threshold (`min_recall`) is a parameter of the evaluator, not a framework concept. The evaluator decides the verdict. + +### Async (LLM Judge) + +```python +@dataclass +class JudgeResult: + accuracy: float + accurate_enough: bool + reason: str = "" + +@evaluator +async def llm_judge(ctx: EvalContext, rubric: str = "", min_score: float = 0.7) -> JudgeResult: + result = await judge_agent.run(f"Evaluate: {ctx.output}\nCriteria: {rubric}") + score = parse_score(result) + return JudgeResult(accuracy=score, accurate_enough=score >= min_score, reason=result.explanation) +``` + +### Per-Case Thresholds + +Different thresholds per case = different evaluator bindings: + +```python +EvalCase(inputs="easy lookup", evaluators=[keyword_check(keywords=["paris"], min_recall=0.9)]), +EvalCase(inputs="hard causal", evaluators=[keyword_check(keywords=["paris"], min_recall=0.3)]), +``` + +### Using Evaluators + +```python +# No params → use directly +evaluators=[not_empty] + +# With params → call to bind +evaluators=[keyword_check(keywords=["python", "async"], min_recall=0.75)] + +# Per-case evaluators (added to suite-level) +EvalCase(inputs="...", evaluators=[llm_judge(rubric="Check factual accuracy")]) +``` + +### EvalContext + +| Field | Type | 
Description | +|-------|------|-------------| +| `name` | `str` | Case name | +| `inputs` | `I` | Case inputs | +| `output` | `O` | Task return value | +| `expected_output` | `O \| None` | From `EvalCase.expected` | +| `metadata` | `Any` | From `EvalCase.metadata` | +| `duration` | `float` | Task execution time (seconds) | + +### Built-in Evaluators + +| Evaluator | Params | Returns | +|-----------|--------|---------| +| `contains_keywords` | `keywords, min_recall=0.0` | `keyword_recall: float`, `all_keywords_present: bool` | +| `contains_expected` | `case_sensitive=False` | `bool` | +| `does_not_contain` | `forbidden` | `no_forbidden_words: bool` | +| `not_empty` | — | `bool` | +| `max_length` | `max_chars=500` | `conciseness: float`, `within_limit: bool` | +| `min_length` | `min_chars=1` | `bool` | +| `matches_regex` | `pattern` | `bool` | +| `json_valid` | `required_keys=[]` | `valid_json: bool`, `has_required_keys: bool` | +| `word_overlap` | — | `overlap: float` (tracking-only) | + +## Fixtures + +Evals use the same fixture system as tests. Expensive setup (database, pipeline, graph) runs once and is shared across all cases. + +```python +@fixture() +async def pipeline(): + driver = await build_pipeline() # 3 minutes, once + yield driver + await driver.close() + +session.bind(pipeline) + +@session.eval(evaluators=[my_scorer]) +async def pipeline_eval( + case: Annotated[EvalCase, From(cases)], + driver: Annotated[AsyncDriver, Use(pipeline)], +) -> QueryResult: + return await query(driver, case.inputs) +``` + +## ModelInfo + +`ModelInfo` is a **label for history tracking** — it does not configure or route to any model. It records which model produced the results so you can compare runs. + +```python +session = EvalSession(model=ModelInfo(name="qwen-2.5")) +``` + +## Evaluator Errors + +If an evaluator raises an exception (e.g. LLM judge timeout), the case is marked as **error** (not fail). The stack trace appears in the output. 
Scores from other evaluators that ran before the error are lost. + +> **Tip:** For non-deterministic evaluators (LLM judges), catch exceptions in the evaluator and return a score indicating failure rather than letting them propagate. + +## Multi-Model Sessions + +Track which model produced each eval suite's results: + +```python +pipeline_model = ModelInfo(name="qwen-2.5") +chat_model = ModelInfo(name="mistral-7b") + +session = EvalSession(model=pipeline_model) + +@session.eval(evaluators=[...], name="pipeline", model=pipeline_model) +async def pipeline_eval(case, driver) -> str: ... + +@session.eval(evaluators=[...], name="chatbot", model=chat_model) +async def chatbot_eval(case, deps) -> str: ... +``` + +`protest history --runs` shows the model per suite: + +``` +#1 2026-03-28T09:14 57/81 (70%) cb6f7bc + pipeline 29/39 (74%) qwen-2.5 + chatbot 10/21 (48%) mistral-7b +``` + +## CLI + +```bash +# Run evals +protest eval evals.session:session + +# Parallelism +protest eval evals.session:session -n 4 + +# Filter by tag +protest eval evals.session:session --tag chatbot + +# Filter by name +protest eval evals.session:session -k "lookup" + +# Re-run failures only +protest eval evals.session:session --last-failed + +# Verbosity: scores inline +protest eval evals.session:session -v + +# Show eval inputs/output/expected on passing cases +protest eval evals.session:session --show-output + +# Show captured log records +protest eval evals.session:session --show-logs +protest eval evals.session:session --show-logs=DEBUG +``` + +Flags are independent and combinable: `-v --show-output --show-logs`. + +> **Note:** Failed eval cases always show inputs/output/expected — no flag needed. 
+
+## Output
+
+### Default
+
+```
+ ✓ chatbot[lookup] (3.39s) facts_score=1.00 facts_ok=✓
+ ✗ chatbot[causal]: facts_ok=False, LLMJudge=False
+
+ Eval: chatbot (26 cases)
+┏━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┓
+┃ Score       ┃ mean ┃ p50  ┃ p5   ┃ p95  ┃
+┡━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━┩
+│ facts_score │ 0.37 │ 0.00 │ 0.00 │ 1.00 │
+└─────────────┴──────┴──────┴──────┴──────┘
+ Passed: 14/26 (53.8%)
+ Results: .protest/results/chatbot_20260329_091422
+```
+
+### Per-Case Results
+
+Each eval case writes a markdown file to `.protest/results/<suite>_<timestamp>/`:
+
+```
+.protest/results/chatbot_20260329_091422/
+├── lookup.md
+├── causal.md
+└── negative.md
+```
+
+## History
+
+Eval results are persisted as JSONL in `.protest/history.jsonl`. Track trends across runs.
+
+```bash
+# Run list with per-suite breakdown
+protest history --evals --runs
+
+# Detailed view of latest run
+protest history --evals --show
+
+# Compare last two runs (fixed/regressed/new)
+protest history --evals --compare
+```
+
+### Integrity Hashes
+
+Each case in history carries two hashes:
+
+- **`case_hash`** — hash of inputs + expected output. Changes when the test data changes.
+- **`eval_hash`** — hash of evaluators + thresholds. Changes when the scoring criteria change.
+
+`protest history --compare` uses these hashes to detect modified cases vs regressions. If a case's `eval_hash` changed between runs, it's reported as "scoring modified" rather than a real regression.
+
+## Progress Output
+
+For long-running fixtures, use `console.print` to show progress without polluting test capture:
+
+```python
+from protest import console
+
+@fixture()
+async def pipeline():
+    for i, scene in enumerate(scenes):
+        console.print(f"[cyan]pipeline:[/] importing {scene.name} ({i+1}/{len(scenes)})")
+        await import_scene(scene)
+    return driver
+```
+
+Messages appear inline in the reporter output. Rich markup is supported (stripped for ASCII).
diff --git a/examples/yorkshire/app/chatbot.py b/examples/yorkshire/app/chatbot.py new file mode 100644 index 0000000..dedc1e4 --- /dev/null +++ b/examples/yorkshire/app/chatbot.py @@ -0,0 +1,93 @@ +"""Yorkshire Terrier Expert Chatbot — fake LLM for eval demos. + +Simulates a RAG chatbot with realistic imperfections: +- Sometimes misses keywords (simulates retrieval failures) +- Occasionally adds irrelevant info (simulates hallucination) +- Response quality varies (simulates LLM non-determinism) +""" + +from __future__ import annotations + +import random + +# Knowledge base — what a real RAG system would retrieve +YORKSHIRE_FACTS = { + "size": "Yorkshire Terriers typically weigh between 2-3 kg. They come in teacup, mini, and standard sizes.", + "grooming": "Yorkies with long coats need daily brushing. Seniors over 6 years need extra grooming care. Regular baths every 2-3 weeks.", + "temperament": "Yorkies are bold, confident, and affectionate. Despite their small size, they are courageous and sometimes stubborn.", + "health": "Common health issues include dental problems, patellar luxation, and tracheal collapse. Regular vet checkups recommended.", + "training": "Yorkies are intelligent but can be stubborn. Positive reinforcement works best. Start training early for best results.", + "diet": "Small breed formula recommended. Feed 2-3 small meals per day. Avoid chocolate, grapes, and onions.", + "exercise": "30 minutes of daily exercise is sufficient. Short walks and indoor play. Avoid extreme temperatures.", + "jobs": "Historically bred as ratters. Modern Yorkies excel as therapy dogs, influencers, and loyal companions.", + "puppies": "Yorkshire puppies need extra care until 12 months. Socialization is critical in the first 6 months.", + "seniors": "Senior Yorkies (8+ years) may slow down. Adjust exercise and diet. 
More frequent vet visits recommended.", +} + + +def yorkshire_chatbot(question: str) -> str: + """Fake chatbot that answers questions about Yorkshire Terriers. + + Simulates a RAG pipeline: keyword matching → fact retrieval → response generation. + No LLM calls — pure string matching for deterministic eval testing. + """ + question_lower = question.lower() + + # Find relevant facts by keyword matching + relevant_facts: list[str] = [] + for topic, fact in YORKSHIRE_FACTS.items(): + if topic in question_lower or any( + word in question_lower for word in topic.split() + ): + relevant_facts.append(fact) + + # Check for specific question patterns + if "weight" in question_lower or "how heavy" in question_lower: + relevant_facts.append(YORKSHIRE_FACTS["size"]) + if "brush" in question_lower or "coat" in question_lower: + relevant_facts.append(YORKSHIRE_FACTS["grooming"]) + if "eat" in question_lower or "food" in question_lower or "feed" in question_lower: + relevant_facts.append(YORKSHIRE_FACTS["diet"]) + if "walk" in question_lower or "active" in question_lower: + relevant_facts.append(YORKSHIRE_FACTS["exercise"]) + if "old" in question_lower or "aging" in question_lower: + relevant_facts.append(YORKSHIRE_FACTS["seniors"]) + if ( + "puppy" in question_lower + or "baby" in question_lower + or "young" in question_lower + ): + relevant_facts.append(YORKSHIRE_FACTS["puppies"]) + + # Deduplicate while preserving order + seen: set[str] = set() + unique_facts = [] + for fact in relevant_facts: + if fact not in seen: + seen.add(fact) + unique_facts.append(fact) + + if not unique_facts: + return "I'm not sure about that. I specialize in Yorkshire Terrier care and health." + + response = " ".join(unique_facts) + + # Simulate LLM imperfections + # ~20% chance: drop a sentence (simulates retrieval miss) + if random.random() < 0.2 and ". " in response: # noqa: S311, PLR2004 + sentences = response.split(". 
") + drop_idx = random.randint(0, len(sentences) - 1) # noqa: S311 + sentences.pop(drop_idx) + response = ". ".join(sentences) + + # ~10% chance: add irrelevant filler (simulates rambling) + if random.random() < 0.1: # noqa: S311, PLR2004 + response += " By the way, Yorkshire Terriers were originally bred in Yorkshire, England during the 19th century." + + # ~5% chance: return a vague non-answer (simulates confusion) + if random.random() < 0.05: # noqa: S311, PLR2004 + response = ( + "That's a great question about Yorkies! There are many factors to consider." + ) + + return response diff --git a/examples/yorkshire/evals/__init__.py b/examples/yorkshire/evals/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/yorkshire/evals/dataset.py b/examples/yorkshire/evals/dataset.py new file mode 100644 index 0000000..7153ab6 --- /dev/null +++ b/examples/yorkshire/evals/dataset.py @@ -0,0 +1,122 @@ +"""Dataset for the Yorkshire chatbot evals.""" + +from __future__ import annotations + +from protest import ForEach +from protest.evals.evaluators import ( + contains_keywords, + does_not_contain, + max_length, + not_empty, +) + +yorkshire_cases = ForEach( + [ + # --- Factual recall --- + { + "name": "weight_question", + "inputs": "How much does a Yorkshire Terrier weigh?", + "expected": "2-3 kg", + "metadata": {"tags": ["factual", "size"]}, + "evaluators": [ + contains_keywords(keywords=["2-3 kg", "teacup", "mini", "standard"]) + ], + }, + { + "name": "grooming_basics", + "inputs": "How often should I brush my Yorkie?", + "expected": "daily brushing for long coats", + "metadata": {"tags": ["factual", "grooming"]}, + "evaluators": [contains_keywords(keywords=["daily", "brushing", "long"])], + }, + { + "name": "diet_advice", + "inputs": "What should I feed my Yorkshire Terrier?", + "expected": "small breed formula, 2-3 meals", + "metadata": {"tags": ["factual", "diet"]}, + "evaluators": [ + contains_keywords(keywords=["small breed", "meals", "avoid"]) + 
], + }, + { + "name": "exercise_needs", + "inputs": "How much exercise does a Yorkie need?", + "expected": "30 minutes daily", + "metadata": {"tags": ["factual", "exercise"]}, + "evaluators": [contains_keywords(keywords=["30 minutes", "walk"])], + }, + # --- Temperament --- + { + "name": "personality", + "inputs": "What is the temperament of a Yorkshire Terrier?", + "expected": "bold, confident, affectionate", + "metadata": {"tags": ["factual", "temperament"]}, + "evaluators": [ + contains_keywords(keywords=["bold", "confident", "affectionate"]) + ], + }, + # --- Age-specific --- + { + "name": "puppy_care", + "inputs": "How do I care for a Yorkshire puppy?", + "expected": "extra care, socialization", + "metadata": {"tags": ["factual", "puppies"]}, + "evaluators": [contains_keywords(keywords=["12 months", "socialization"])], + }, + { + "name": "senior_care", + "inputs": "My Yorkie is getting old, what should I change?", + "expected": "adjust exercise, more vet visits", + "metadata": {"tags": ["factual", "seniors"]}, + "evaluators": [contains_keywords(keywords=["senior", "exercise", "vet"])], + }, + # --- Hallucination checks --- + { + "name": "no_cat_advice", + "inputs": "Tell me about Yorkshire Terrier health", + "expected": "dental problems, patellar luxation", + "metadata": {"tags": ["safety"]}, + "evaluators": [ + does_not_contain(forbidden=["cat", "feline", "persian"]), + contains_keywords(keywords=["dental", "health"]), + ], + }, + { + "name": "no_made_up_breeds", + "inputs": "What jobs can a Yorkie do?", + "expected": "therapy dogs, companions", + "metadata": {"tags": ["safety"]}, + "evaluators": [ + does_not_contain(forbidden=["labrador", "golden retriever", "poodle"]), + contains_keywords(keywords=["therapy", "companion"]), + ], + }, + # --- Edge cases --- + { + "name": "unknown_topic", + "inputs": "What is the GDP of France?", + "expected": "I'm not sure", + "metadata": {"tags": ["edge_case"]}, + "evaluators": [contains_keywords(keywords=["not sure", 
"specialize"])], + }, + { + "name": "empty_question", + "inputs": "", + "expected": "I'm not sure", + "metadata": {"tags": ["edge_case"]}, + "evaluators": [contains_keywords(keywords=["not sure"])], + }, + # --- Known weak spot (chatbot doesn't know about training treats) --- + { + "name": "training_treats", + "inputs": "What treats are best for training a Yorkie?", + "expected": "small soft treats, positive reinforcement", + "metadata": {"tags": ["factual", "training"]}, + "evaluators": [ + contains_keywords(keywords=["treats", "small", "soft", "reward"]) + ], + }, + ] +) + +suite_evaluators = [not_empty, max_length(max_chars=500)] diff --git a/examples/yorkshire/evals/evaluators.py b/examples/yorkshire/evals/evaluators.py new file mode 100644 index 0000000..b07153d --- /dev/null +++ b/examples/yorkshire/evals/evaluators.py @@ -0,0 +1,5 @@ +"""Yorkshire-specific evaluators. + +Generic evaluators come from protest.evals.evaluators. +Only project-specific ones live here. +""" diff --git a/examples/yorkshire/evals/session.py b/examples/yorkshire/evals/session.py new file mode 100644 index 0000000..7779f66 --- /dev/null +++ b/examples/yorkshire/evals/session.py @@ -0,0 +1,29 @@ +"""Yorkshire Chatbot Evals — evaluate the fake Yorkshire expert chatbot. 
+ +Run with: + protest eval examples.yorkshire.evals.session:session + protest eval examples.yorkshire.evals.session:session -n 4 + protest eval examples.yorkshire.evals.session:session --tag safety + protest eval examples.yorkshire.evals.session:session --last-failed + protest history --evals --show +""" + +from typing import Annotated + +from examples.yorkshire.app.chatbot import yorkshire_chatbot +from examples.yorkshire.evals.dataset import ( + suite_evaluators, + yorkshire_cases, +) +from protest import From +from protest.evals import EvalSession, ModelInfo + +session = EvalSession( + model=ModelInfo(name="yorkshire-chatbot-v1", provider="local"), + metadata={"version": "1.0", "type": "keyword-matching"}, +) + + +@session.eval(evaluators=suite_evaluators) +def yorkshire_eval(case: Annotated[dict, From(yorkshire_cases)]) -> str: + return yorkshire_chatbot(case["inputs"]) diff --git a/examples/yorkshire/session.py b/examples/yorkshire/session.py new file mode 100644 index 0000000..7b8c3c3 --- /dev/null +++ b/examples/yorkshire/session.py @@ -0,0 +1,52 @@ +"""Yorkshire Terrier Unified Session — tests + evals in one session. 
+ +Run all (tests + evals): + protest run examples.yorkshire.session:session + +Run only tests: + protest run examples.yorkshire.session:session + (protest run filters to kind=test by default) + +Run only evals: + protest eval examples.yorkshire.session:session +""" + +from examples.yorkshire.app.chatbot import yorkshire_chatbot +from examples.yorkshire.evals.dataset import dataset +from examples.yorkshire.tests.fixtures import ( + configure_kennel_logging, + kennel, + yorkshire, +) +from examples.yorkshire.tests.plugins import BarkPlugin +from examples.yorkshire.tests.suites.adults import adults_suite +from examples.yorkshire.tests.suites.custom_factory import custom_factory_suite +from examples.yorkshire.tests.suites.legacy.suite import legacy_suite +from examples.yorkshire.tests.suites.puppies.suite import puppies_suite +from examples.yorkshire.tests.suites.rate_limited import rate_limited_suite +from examples.yorkshire.tests.suites.seniors.suite import seniors_suite +from examples.yorkshire.tests.suites.showcase.suite import showcase_suite +from protest import ProTestSession +from protest.evals import ModelInfo + +session = ProTestSession(concurrency=4, history=True) +session.use(BarkPlugin) +session.bind(configure_kennel_logging, autouse=True) +session.bind(kennel) +session.bind(yorkshire) + +# Tests +session.add_suite(puppies_suite) +session.add_suite(adults_suite) +session.add_suite(seniors_suite) +session.add_suite(legacy_suite) +session.add_suite(showcase_suite) +session.add_suite(rate_limited_suite) +session.add_suite(custom_factory_suite) + +# Evals +session.configure_evals(model=ModelInfo(name="yorkshire-chatbot-v1", provider="local")) +session.register_dataset( + dataset, + task=yorkshire_chatbot, +) diff --git a/mkdocs.yml b/mkdocs.yml index 93864db..a643afe 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -65,6 +65,8 @@ nav: - Tags: core-concepts/tags.md - Dependency Injection: core-concepts/dependency-injection.md - Reporters: core-concepts/reporters.md 
+ - Console Output: core-concepts/console.md + - Evals: evals.md - Guides: - Best Practices: best-practices.md - Project Organization: guides/project-organization.md diff --git a/protest/history/__init__.py b/protest/history/__init__.py new file mode 100644 index 0000000..5183cf7 --- /dev/null +++ b/protest/history/__init__.py @@ -0,0 +1,17 @@ +"""History module — run tracking for tests and evals.""" + +from protest.history.storage import ( + HISTORY_FILE, + append_entry, + clean_dirty, + load_history, + load_previous_run, +) + +__all__ = [ + "HISTORY_FILE", + "append_entry", + "clean_dirty", + "load_history", + "load_previous_run", +] diff --git a/protest/history/collector.py b/protest/history/collector.py new file mode 100644 index 0000000..e81eefd --- /dev/null +++ b/protest/history/collector.py @@ -0,0 +1,81 @@ +"""Metadata collection: git info, environment, CI detection.""" + +from __future__ import annotations + +import os +import platform +import subprocess +import sys +from typing import Any + + +def collect_git_info() -> dict[str, Any] | None: + """Collect git context. 
Returns None if not in a git repo.""" + try: + commit = _git("rev-parse", "HEAD") + return { + "commit": commit, + "commit_short": commit[:7] if commit else None, + "branch": _git("rev-parse", "--abbrev-ref", "HEAD"), + "dirty": bool(_git("status", "--porcelain")), + "author": _git("log", "-1", "--format=%an"), + "commit_message": _git("log", "-1", "--format=%s"), + } + except (FileNotFoundError, subprocess.CalledProcessError): + return None + + +def collect_env_info() -> dict[str, Any]: + """Collect environment metadata.""" + ci_provider = detect_ci_provider() + return { + "python_version": platform.python_version(), + "protest_version": _get_pkg_version("protest"), + "pydantic_evals_version": _get_pkg_version("pydantic-evals"), + "hostname": platform.node(), + "os": sys.platform, + "ci": ci_provider is not None, + "ci_provider": ci_provider, + } + + +_CI_PROVIDERS: dict[str, str] = { + "GITHUB_ACTIONS": "github-actions", + "GITLAB_CI": "gitlab-ci", + "CIRCLECI": "circleci", + "BUILDKITE": "buildkite", + "TRAVIS": "travis-ci", +} + + +def detect_ci_provider() -> str | None: + """Detect CI provider from standard environment variables.""" + env = os.environ + for var, name in _CI_PROVIDERS.items(): + if env.get(var) == "true": + return name + if env.get("JENKINS_URL"): + return "jenkins" + if env.get("CI") == "true": + return "unknown" + return None + + +def _git(*args: str) -> str: + result = subprocess.run( + ["git", *args], # noqa: S607 + capture_output=True, + text=True, + timeout=5, + check=True, + ) + return result.stdout.strip() + + +def _get_pkg_version(name: str) -> str | None: + try: + from importlib.metadata import version + + return version(name) + except Exception: + return None diff --git a/protest/history/plugin.py b/protest/history/plugin.py new file mode 100644 index 0000000..4fe80f6 --- /dev/null +++ b/protest/history/plugin.py @@ -0,0 +1,98 @@ +"""HistoryPlugin — persists test run results as JSONL.""" + +from __future__ import annotations + 
+import uuid +from datetime import datetime, timezone +from typing import TYPE_CHECKING, Any + +from protest.history.collector import collect_env_info, collect_git_info +from protest.history.storage import DEFAULT_HISTORY_DIR, HISTORY_FILE, append_entry +from protest.plugin import PluginBase + +if TYPE_CHECKING: + from pathlib import Path + + from protest.entities.events import SessionResult, TestResult + from protest.plugin import PluginContext + + +class HistoryPlugin(PluginBase): + """Persists test results to JSONL for run-over-run tracking.""" + + name = "history" + description = "Test history tracking" + + def __init__(self, history_dir: Path | None = None) -> None: + self._history_dir = history_dir or DEFAULT_HISTORY_DIR + self._history_file = self._history_dir / HISTORY_FILE + self._suites: dict[str, dict[str, dict[str, Any]]] = {} + self._suite_kinds: dict[str, str] = {} + self._default_suite_name: str = "tests" + self._history_enabled: bool = False + self._metadata: dict[str, Any] = {} + + @classmethod + def activate(cls, ctx: PluginContext) -> HistoryPlugin | None: + return None # Wired explicitly by session + + def setup(self, session: Any) -> None: + self._history_enabled = getattr(session, "history", False) + self._metadata = dict(getattr(session, "metadata", None) or {}) + for suite in getattr(session, "suites", []): + self._suite_kinds[suite.name] = getattr(suite, "kind", "test") + if not self._default_suite_name or self._default_suite_name == "tests": + self._default_suite_name = suite.name + + def on_test_pass(self, result: TestResult) -> None: + if result.is_eval: + return + self._record(result, passed=True) + + def on_test_fail(self, result: TestResult) -> None: + if result.is_eval: + return + self._record(result, passed=False) + + def on_session_end(self, _result: SessionResult) -> None: + if not self._history_enabled or not self._suites: + return + + suites_data: dict[str, Any] = {} + for suite_name, cases in self._suites.items(): + total = 
len(cases) + passed = sum(1 for c in cases.values() if c["passed"]) + suites_data[suite_name] = { + "kind": self._suite_kinds.get(suite_name, "test"), + "total_cases": total, + "passed": passed, + "failed": total - passed, + "pass_rate": round(passed / total, 4) if total else 0, + "duration": round(sum(c["duration"] for c in cases.values()), 2), + "cases": cases, + } + + entry: dict[str, Any] = { + "run_id": str(uuid.uuid4()), + "timestamp": datetime.now(tz=timezone.utc).isoformat(), + "git": collect_git_info(), + "environment": collect_env_info(), + "metadata": self._metadata, + "evals": None, + "suites": suites_data, + } + append_entry(self._history_file, entry) + + def _record(self, result: TestResult, *, passed: bool) -> None: + suite_name = self._get_suite_name(result) + if suite_name not in self._suites: + self._suites[suite_name] = {} + self._suites[suite_name][result.name] = { + "passed": passed, + "duration": round(result.duration, 3), + } + + def _get_suite_name(self, result: TestResult) -> str: + if result.suite_path: + return result.suite_path.root_name + return self._default_suite_name diff --git a/protest/history/storage.py b/protest/history/storage.py new file mode 100644 index 0000000..78d35b9 --- /dev/null +++ b/protest/history/storage.py @@ -0,0 +1,135 @@ +"""JSONL history storage: load, append, filter, clean.""" + +from __future__ import annotations + +import json +import subprocess +from pathlib import Path +from typing import Any + +DEFAULT_HISTORY_DIR = Path(".protest") +HISTORY_FILE = "history.jsonl" + + +def load_history( + history_dir: Path | None = None, + n: int | None = None, + model: str | None = None, + suite: str | None = None, + evals_only: bool = False, + tests_only: bool = False, +) -> list[dict[str, Any]]: + """Load history entries with optional filtering.""" + path = (history_dir or DEFAULT_HISTORY_DIR) / HISTORY_FILE + if not path.exists(): + return [] + + entries: list[dict[str, Any]] = [] + for line in 
path.read_text().strip().splitlines(): + try: + entry = json.loads(line) + except json.JSONDecodeError: + continue + if evals_only and not _has_suite_kind(entry, "eval"): + continue + if tests_only and not _has_suite_kind(entry, "test"): + continue + if model and (entry.get("evals") or {}).get("model") != model: + continue + if suite and suite not in entry.get("suites", {}): + continue + entries.append(entry) + + entries.sort(key=lambda e: e.get("timestamp", "")) + if n is not None: + entries = entries[-n:] + return entries + + +def _has_suite_kind(entry: dict[str, Any], kind: str) -> bool: + """Check if entry has at least one suite with the given kind.""" + suites = entry.get("suites", {}) + for suite_data in suites.values(): + if isinstance(suite_data, dict) and suite_data.get("kind") == kind: + return True + # Legacy fallback: entries without kind field + if not any(isinstance(s, dict) and "kind" in s for s in suites.values()): + if kind == "eval": + return entry.get("evals") is not None + if kind == "test": + return entry.get("evals") is None + return False + + +def append_entry(path: Path, entry: dict[str, Any]) -> None: + """Append a single JSON entry to a JSONL file. + + Note: no file locking — concurrent writes from separate processes + could corrupt the file. In practice, protest runs are single-process + (async workers share the same process). If concurrent CI jobs write + to the same history file, consider using separate history_dir per job. 
+ """ + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "a") as f: + f.write(json.dumps(entry, default=str) + "\n") + + +def load_previous_run( + history_dir: Path | None = None, + evals_only: bool = False, +) -> dict[str, Any] | None: + """Load the most recent history entry.""" + path = (history_dir or DEFAULT_HISTORY_DIR) / HISTORY_FILE + if not path.exists(): + return None + lines = path.read_text().strip().splitlines() + for line in reversed(lines): + try: + entry = json.loads(line) + except json.JSONDecodeError: + continue + if evals_only and entry.get("evals") is None: + continue + return entry + return None + + +def clean_dirty(history_dir: Path | None = None) -> int: + """Remove entries where git.dirty=True AND git.commit matches current HEAD. + + Returns the number of entries removed. + """ + path = (history_dir or DEFAULT_HISTORY_DIR) / HISTORY_FILE + if not path.exists(): + return 0 + + try: + current_commit = subprocess.run( + ["git", "rev-parse", "HEAD"], # noqa: S607 + capture_output=True, + text=True, + timeout=5, + check=True, + ).stdout.strip() + except (FileNotFoundError, subprocess.CalledProcessError): + return 0 + + lines = path.read_text().strip().splitlines() + kept: list[str] = [] + removed = 0 + + for line in lines: + try: + entry = json.loads(line) + except json.JSONDecodeError: + kept.append(line) + continue + git = entry.get("git") or {} + if git.get("dirty") and git.get("commit") == current_commit: + removed += 1 + else: + kept.append(line) + + if removed: + path.write_text("\n".join(kept) + "\n" if kept else "") + return removed diff --git a/tests/core/test_collector.py b/tests/core/test_collector.py index 6b02ad7..9ba8719 100644 --- a/tests/core/test_collector.py +++ b/tests/core/test_collector.py @@ -88,7 +88,7 @@ def test_collect_suite_tests(self) -> None: """Collects tests from suites.""" session = ProTestSession() suite = ProTestSuite("my_suite") - session.include_suite(suite) + session.add_suite(suite) 
@suite.test() def suite_test() -> None: @@ -107,7 +107,7 @@ def test_collect_mixed_tests(self) -> None: """Collects both standalone and suite tests.""" session = ProTestSession() suite = ProTestSuite("my_suite") - session.include_suite(suite) + session.add_suite(suite) @session.test() def standalone_test() -> None: @@ -129,7 +129,7 @@ def test_collect_generates_correct_node_ids(self) -> None: """Collected items have correct node_ids.""" session = ProTestSession() suite = ProTestSuite("MySuite") - session.include_suite(suite) + session.add_suite(suite) @session.test() def standalone() -> None: diff --git a/tests/core/test_parametrize.py b/tests/core/test_parametrize.py index ec567db..df8a9ac 100644 --- a/tests/core/test_parametrize.py +++ b/tests/core/test_parametrize.py @@ -190,7 +190,7 @@ def test_triple( def test_structured_data_for_reporters(self) -> None: session = ProTestSession() suite = ProTestSuite("API") - session.include_suite(suite) + session.add_suite(suite) users = ForEach(["alice"], ids=lambda u: u) diff --git a/tests/core/test_skip.py b/tests/core/test_skip.py index 437e47d..71cddb1 100644 --- a/tests/core/test_skip.py +++ b/tests/core/test_skip.py @@ -54,7 +54,7 @@ def test_normal() -> None: def test_suite_skip_decorator(self) -> None: session = ProTestSession() suite = ProTestSuite("test") - session.include_suite(suite) + session.add_suite(suite) @suite.test(skip="Suite test skipped") def test_skipped() -> None: diff --git a/tests/core/test_skipif.py b/tests/core/test_skipif.py index 65fe632..4e24388 100644 --- a/tests/core/test_skipif.py +++ b/tests/core/test_skipif.py @@ -74,7 +74,7 @@ def test_skipped() -> None: def test_suite_skip_with_callable(self) -> None: session = ProTestSession() suite = ProTestSuite("test") - session.include_suite(suite) + session.add_suite(suite) @suite.test(skip=lambda: True, skip_reason="Suite conditional skip") def test_skipped() -> None: diff --git a/tests/core/test_xfail.py b/tests/core/test_xfail.py index 
8451e23..4cf1d0a 100644 --- a/tests/core/test_xfail.py +++ b/tests/core/test_xfail.py @@ -57,7 +57,7 @@ def test_normal() -> None: def test_suite_xfail_decorator(self) -> None: session = ProTestSession() suite = ProTestSuite("test") - session.include_suite(suite) + session.add_suite(suite) @suite.test(xfail="Suite test xfailed") def test_xfailed() -> None: diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py new file mode 100644 index 0000000..5fbb4e8 --- /dev/null +++ b/tests/evals/test_e2e.py @@ -0,0 +1,1064 @@ +"""End-to-end tests for ProTest evals integration. + +These tests define the PUBLIC API contract. They test what the user sees: +- Session setup (EvalSession, @session.eval with ForEach/From) +- CLI behavior (protest run vs protest eval) +- Output format (scores table, trends, failure messages) +- History (JSONL format, stats, significance, clean-dirty) +- Built-in evaluators + +Implementation can change freely as long as these tests pass. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Annotated, Any + +from protest import ForEach, From, ProTestSession +from protest.core.runner import TestRunner +from protest.evals import EvalContext, EvalSession, Metric, ModelInfo, Verdict, evaluator +from protest.evals.evaluators import ( + contains_expected, + contains_keywords, + does_not_contain, + json_valid, + matches_regex, + max_length, + min_length, + not_empty, + word_overlap, +) + +# --------------------------------------------------------------------------- +# Fixtures: deterministic evaluators + task +# --------------------------------------------------------------------------- + + +@dataclass(frozen=True, slots=True) +class FakeAccuracyResult: + """Structured result for fake accuracy evaluator.""" + + accuracy: Annotated[float, Metric] + matches_expected: Annotated[bool, Verdict] + + +@evaluator +def fake_accuracy(ctx: EvalContext) -> FakeAccuracyResult: + 
if ctx.expected_output and ctx.expected_output.lower() in ctx.output.lower(): + return FakeAccuracyResult(accuracy=1.0, matches_expected=True) + return FakeAccuracyResult(accuracy=0.0, matches_expected=False) + + +@evaluator +async def async_fake_accuracy(ctx: EvalContext) -> FakeAccuracyResult: + """Async evaluator — simulates LLMJudge which calls an async LLM API.""" + # Simulate async I/O (e.g. LLM call) without actually blocking + if ctx.expected_output and ctx.expected_output.lower() in ctx.output.lower(): + return FakeAccuracyResult(accuracy=1.0, matches_expected=True) + return FakeAccuracyResult(accuracy=0.0, matches_expected=False) + + +def echo_task(text: str) -> str: + return f"Echo: {text}" + + +async def async_echo_task(text: str) -> str: + return f"Async: {text}" + + +basic_cases = ForEach( + [ + {"inputs": "hello world", "expected": "hello", "name": "case_pass"}, + {"inputs": "xyz", "expected": "notfound", "name": "case_fail"}, + ], + ids=lambda c: c["name"], +) + + +# --------------------------------------------------------------------------- +# Session setup +# --------------------------------------------------------------------------- + + +class TestEvalSession: + """EvalSession setup: constructor with model=, @session.eval.""" + + def test_add_eval_creates_eval_kind(self) -> None: + session = EvalSession() + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + # The session should have a suite with kind=eval + assert len(session._suites) > 0 + assert any(s.kind == "eval" for s in session._suites) + + def test_model_set_via_constructor(self) -> None: + session = EvalSession(model=ModelInfo(name="test-model")) + assert session._eval_model is not None + assert session._eval_model.name == "test-model" + + def test_metadata_on_constructor(self) -> None: + session = EvalSession(metadata={"env": "test"}) + assert session.metadata["env"] == "test" + + def 
test_eval_with_bool_verdict(self) -> None: + """Evaluator with bool field: case_fail has matches_expected=False -> fail.""" + session = EvalSession() + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + result = runner.run() + # case_pass returns matches_expected=True -> pass + # case_fail returns matches_expected=False -> fail + assert result.success is False + + def test_async_task_works(self) -> None: + session = EvalSession() + + @session.eval(evaluators=[fake_accuracy]) + async def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return await async_echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + def test_async_evaluator_does_not_crash(self) -> None: + """Regression: async evaluator called via evaluate_sync raised 'event loop already running'.""" + single_case = ForEach( + [ + {"inputs": "hello world", "expected": "hello", "name": "c1"}, + ], + ids=lambda c: c["name"], + ) + + session = EvalSession() + + @session.eval(evaluators=[async_fake_accuracy]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + result = runner.run() + assert result.success is True + + +# --------------------------------------------------------------------------- +# Kind filtering (protest run vs protest eval) +# --------------------------------------------------------------------------- + + +class TestKindFiltering: + """Suites have kind, filtering works.""" + + def test_test_suite_has_kind_test(self) -> None: + from protest.core.suite import ProTestSuite + + suite = ProTestSuite("my_tests") + assert suite.kind == "test" + + def test_eval_suite_has_kind_eval(self) -> None: + session = EvalSession() + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return 
echo_task(case["inputs"]) + + assert any(s.kind == "eval" for s in session._suites) + + def test_kind_filter_keeps_only_matching(self) -> None: + from protest.core.suite import ProTestSuite + from protest.filters.kind import KindFilterPlugin + + test_suite = ProTestSuite("tests") + eval_suite = ProTestSuite("evals", kind="eval") + + session = ProTestSession() + + @test_suite.test() + def test_one() -> None: + pass + + @eval_suite.test(is_eval=True) + def eval_one() -> None: + pass + + session.add_suite(test_suite) + session.add_suite(eval_suite) + + from protest.core.collector import Collector + + items = Collector().collect(session) + assert len(items) == 2 + + # Filter to eval only + plugin = KindFilterPlugin(kind="eval") + filtered = plugin.on_collection_finish(items) + assert len(filtered) == 1 + assert filtered[0].suite.kind == "eval" + + def test_unified_session_runs_tests_only(self) -> None: + """protest run behavior: only kind=test suites.""" + from protest.core.suite import ProTestSuite + + session = ProTestSession() + + test_suite = ProTestSuite("unit") + results: list[str] = [] + + @test_suite.test() + def test_a() -> None: + results.append("test") + + session.add_suite(test_suite) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + from protest.api import run_session + from protest.plugin import PluginContext + + ctx = PluginContext(args={"kind_filter": "test"}) + run_session(session, ctx=ctx) + + assert "test" in results + + def test_unified_session_runs_evals_only(self) -> None: + """protest eval behavior: only kind=eval suites.""" + from protest.core.suite import ProTestSuite + + session = ProTestSession() + + test_suite = ProTestSuite("unit") + test_ran = [] + + @test_suite.test() + def test_a() -> None: + test_ran.append(True) + + session.add_suite(test_suite) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, 
From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + from protest.api import run_session + from protest.plugin import PluginContext + + ctx = PluginContext(args={"kind_filter": "eval"}) + run_session(session, ctx=ctx) + + assert len(test_ran) == 0 # test suite was filtered out + + +# --------------------------------------------------------------------------- +# Output format +# --------------------------------------------------------------------------- + + +class TestEvalOutput: + """What the user sees in the terminal. + + These tests verify output by reading the EvalPlugin report directly, + since ProTest captures stdout during test runs. + """ + + def test_report_contains_score_stats(self) -> None: + from protest.evals.types import EvalSuiteReport + from protest.plugin import PluginBase + + reports: list[EvalSuiteReport] = [] + + class ReportCapture(PluginBase): + name = "report-capture" + description = "Captures eval reports" + + def on_eval_suite_end(self, report: Any) -> None: + reports.append(report) + + session = EvalSession() + session.register_plugin(ReportCapture()) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + assert len(reports) == 1 + stats = reports[0].all_score_stats() + assert len(stats) > 0 + assert any(s.name == "accuracy" for s in stats) + + def test_report_has_pass_count(self) -> None: + from protest.evals.types import EvalSuiteReport + from protest.plugin import PluginBase + + reports: list[EvalSuiteReport] = [] + + class ReportCapture(PluginBase): + name = "report-capture" + description = "Captures eval reports" + + def on_eval_suite_end(self, report: Any) -> None: + reports.append(report) + + session = EvalSession() + session.register_plugin(ReportCapture()) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return 
echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + assert len(reports) == 1 + assert reports[0].total_count == 2 + + def test_failed_eval_has_error_with_score_details(self) -> None: + """When an eval case fails, the error message includes score details.""" + from protest.plugin import PluginBase + + errors: list[Any] = [] + + class ErrorCollector(PluginBase): + name = "error-collector" + + def on_test_fail(self, result: Any) -> None: + if result.error: + errors.append(str(result.error)) + + session = EvalSession() + session.register_plugin(ErrorCollector()) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + from protest.api import run_session + + run_session(session) + + # case_fail has matches_expected=False + assert any("matches_expected=" in e for e in errors) + + +# --------------------------------------------------------------------------- +# EvalPayload flow +# --------------------------------------------------------------------------- + + +class TestEvalPayloadFlow: + """EvalPayload flows through the framework correctly.""" + + def test_test_result_has_eval_payload(self) -> None: + from protest.plugin import PluginBase + + collected: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_pass(self, result: Any) -> None: + collected.append(result) + + def on_test_fail(self, result: Any) -> None: + collected.append(result) + + session = EvalSession() + session.register_plugin(Collector()) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + assert len(collected) == 2 + for result in collected: + assert result.is_eval is True + assert result.eval_payload is not None + assert result.eval_payload.case_name in ("case_pass", "case_fail") + assert "accuracy" in 
result.eval_payload.scores + assert "matches_expected" in result.eval_payload.scores + + def test_lifecycle_events_have_case_id_in_node_id(self) -> None: + """setup_done/teardown_start events carry node_id with [case_id].""" + from protest.plugin import PluginBase + + setup_ids: list[str] = [] + teardown_ids: list[str] = [] + + class LifecycleCollector(PluginBase): + name = "lifecycle-collector" + + def on_test_setup_done(self, info: Any) -> None: + setup_ids.append(info.node_id) + + def on_test_teardown_start(self, info: Any) -> None: + teardown_ids.append(info.node_id) + + session = EvalSession() + session.register_plugin(LifecycleCollector()) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + assert len(setup_ids) == 2 + for node_id in setup_ids: + assert "[" in node_id, f"node_id missing case id: {node_id}" + for node_id in teardown_ids: + assert "[" in node_id, f"node_id missing case id: {node_id}" + + def test_evaluator_exception_is_error_not_fail(self) -> None: + """An evaluator that raises is treated as error (infra), not test fail.""" + from protest.plugin import PluginBase + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_fail(self, result: Any) -> None: + results.append(result) + + @evaluator + def crashing_evaluator(ctx: EvalContext) -> bool: + raise RuntimeError("LLM judge timeout") + + single_case = ForEach( + [ + {"inputs": "hello", "expected": "hello", "name": "c1"}, + ], + ids=lambda c: c["name"], + ) + + session = EvalSession() + session.register_plugin(Collector()) + + @session.eval(evaluators=[crashing_evaluator]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + assert len(results) == 1 + assert results[0].is_fixture_error is True + assert "LLM judge 
timeout" in str(results[0].error) + + def test_non_eval_test_has_no_payload(self) -> None: + from protest.plugin import PluginBase + + collected: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_pass(self, result: Any) -> None: + collected.append(result) + + session = ProTestSession() + session.register_plugin(Collector()) + + @session.test() + def regular_test() -> None: + assert True + + runner = TestRunner(session) + runner.run() + + assert len(collected) == 1 + assert collected[0].is_eval is False + assert collected[0].eval_payload is None + + +# --------------------------------------------------------------------------- +# History +# --------------------------------------------------------------------------- + + +class TestHistory: + """JSONL history format and querying.""" + + def _run_eval(self, tmp_path: Path) -> None: + from protest.api import run_session + + session = EvalSession(model=ModelInfo(name="test-model"), history_dir=tmp_path) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + run_session(session) + + def test_history_file_created(self, tmp_path: Path) -> None: + self._run_eval(tmp_path) + assert (tmp_path / "history.jsonl").exists() + + def test_history_entry_format(self, tmp_path: Path) -> None: + self._run_eval(tmp_path) + lines = (tmp_path / "history.jsonl").read_text().strip().splitlines() + entry = json.loads(lines[0]) + + # Required top-level keys + assert "run_id" in entry + assert "timestamp" in entry + assert "git" in entry + assert "environment" in entry + assert "metadata" in entry + assert "evals" in entry + assert "suites" in entry + + # Evals block + assert entry["evals"] is not None + assert entry["evals"]["model"] == "test-model" + + # Suites with kind + suites = entry["suites"] + assert len(suites) == 1 + suite_name = next(iter(suites)) + suite = suites[suite_name] + assert suite["kind"] == 
"eval" + assert "total_cases" in suite + assert "passed" in suite + assert "cases" in suite + + def test_history_test_run_has_null_evals(self, tmp_path: Path) -> None: + from protest.api import run_session + + session = ProTestSession(history=True, history_dir=tmp_path) + + @session.test() + def test_simple() -> None: + pass + + run_session(session) + + lines = (tmp_path / "history.jsonl").read_text().strip().splitlines() + entry = json.loads(lines[0]) + assert entry["evals"] is None + + def test_history_multiple_runs_append(self, tmp_path: Path) -> None: + self._run_eval(tmp_path) + self._run_eval(tmp_path) + lines = (tmp_path / "history.jsonl").read_text().strip().splitlines() + assert len(lines) == 2 + + def test_history_metadata_included(self, tmp_path: Path) -> None: + from protest.api import run_session + + session = EvalSession( + history_dir=tmp_path, + metadata={"env": "test", "version": "1.0"}, + ) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + run_session(session) + + lines = (tmp_path / "history.jsonl").read_text().strip().splitlines() + entry = json.loads(lines[0]) + assert entry["metadata"]["env"] == "test" + + +# --------------------------------------------------------------------------- +# History: clean-dirty +# --------------------------------------------------------------------------- + + +class TestCleanDirty: + """protest history --clean-dirty behavior.""" + + def test_clean_dirty_removes_current_head_only(self, tmp_path: Path) -> None: + # Entry with current HEAD + dirty + import subprocess + + from protest.history.storage import append_entry, clean_dirty + + try: + current_commit = subprocess.run( + ["git", "rev-parse", "HEAD"], + capture_output=True, + text=True, + timeout=5, + check=True, + ).stdout.strip() + except (FileNotFoundError, subprocess.CalledProcessError): + return # skip if not in a git repo + + path = tmp_path / 
"history.jsonl" + + # Dirty entry on current HEAD -> should be removed + append_entry( + path, {"git": {"commit": current_commit, "dirty": True}, "suites": {}} + ) + # Dirty entry on old commit -> should be preserved + append_entry(path, {"git": {"commit": "old123", "dirty": True}, "suites": {}}) + # Clean entry on current HEAD -> should be preserved + append_entry( + path, {"git": {"commit": current_commit, "dirty": False}, "suites": {}} + ) + + removed = clean_dirty(history_dir=tmp_path) + assert removed == 1 + + lines = path.read_text().strip().splitlines() + assert len(lines) == 2 + + +# --------------------------------------------------------------------------- +# Case hashing +# --------------------------------------------------------------------------- + + +class TestCaseHashing: + """Content hashing for eval integrity.""" + + def test_case_hash_stored_in_history(self, tmp_path: Path) -> None: + """History entries include case_hash and eval_hash per case.""" + from protest.api import run_session + + session = EvalSession(history_dir=tmp_path) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + run_session(session) + + lines = (tmp_path / "history.jsonl").read_text().strip().splitlines() + entry = json.loads(lines[0]) + suites = entry["suites"] + suite = next(iter(suites.values())) + case = next(iter(suite["cases"].values())) + assert "case_hash" in case + assert "eval_hash" in case + assert len(case["case_hash"]) > 0 + assert len(case["eval_hash"]) > 0 + + def test_case_hash_changes_on_input_change(self) -> None: + """Different inputs -> different case_hash.""" + from protest.evals.hashing import compute_case_hash + + h1 = compute_case_hash("hello world", "expected") + h2 = compute_case_hash("hello world modified", "expected") + assert h1 != h2 + + def test_case_hash_stable_for_same_input(self) -> None: + """Same inputs -> same case_hash (deterministic).""" + 
from protest.evals.hashing import compute_case_hash + + h1 = compute_case_hash("hello world", "expected") + h2 = compute_case_hash("hello world", "expected") + assert h1 == h2 + + def test_eval_hash_changes_on_evaluator_change(self) -> None: + """Different evaluators -> different eval_hash.""" + from protest.evals.hashing import compute_eval_hash + + e1 = contains_keywords(keywords=["hello"]) + e2 = contains_keywords(keywords=["hello", "world"]) + h1 = compute_eval_hash([e1]) + h2 = compute_eval_hash([e2]) + assert h1 != h2 + + +# --------------------------------------------------------------------------- +# Built-in evaluators +# --------------------------------------------------------------------------- + + +class TestBuiltinEvaluators: + """All built-in evaluators work correctly through protest-native API.""" + + def _make_ctx(self, output: str, expected: str | None = None) -> EvalContext: + """Minimal EvalContext for evaluator testing.""" + return EvalContext( + name="test", + inputs="", + output=output, + expected_output=expected, + metadata=None, + duration=0.0, + ) + + def test_contains_keywords(self) -> None: + e = contains_keywords(keywords=["hello", "world"]) + result = e(self._make_ctx("Hello World")) + assert result.keyword_recall == 1.0 + assert result.all_keywords_present is True + + def test_contains_expected(self) -> None: + e = contains_expected + assert e(self._make_ctx("Hello World", "world")) is True + assert e(self._make_ctx("Hello", "world")) is False + + def test_does_not_contain(self) -> None: + e = does_not_contain(forbidden=["cat", "dog"]) + assert e(self._make_ctx("Yorkshire")).no_forbidden_words is True + assert e(self._make_ctx("I like cats")).no_forbidden_words is False + + def test_not_empty(self) -> None: + assert not_empty(self._make_ctx("hello")) is True + assert not_empty(self._make_ctx("")) is False + assert not_empty(self._make_ctx(" ")) is False + + def test_max_length(self) -> None: + e = max_length(max_chars=5) + result = 
e(self._make_ctx("hi")) + assert result.within_limit is True + result = e(self._make_ctx("this is too long")) + assert result.within_limit is False + + def test_min_length(self) -> None: + assert min_length(min_chars=3)(self._make_ctx("hello")) is True + assert min_length(min_chars=10)(self._make_ctx("hi")) is False + + def test_matches_regex(self) -> None: + e = matches_regex(pattern=r"\d{3}-\d{4}") + assert e(self._make_ctx("Call 555-1234")) is True + assert e(self._make_ctx("no numbers")) is False + + def test_json_valid(self) -> None: + e = json_valid(required_keys=["name"]) + result = e(self._make_ctx('{"name": "Rex"}')) + assert result.valid_json is True + assert result.has_required_keys is True + result = e(self._make_ctx("not json")) + assert result.valid_json is False + + def test_word_overlap(self) -> None: + e = word_overlap + assert e(self._make_ctx("hello world", "hello world")).overlap == 1.0 + assert e(self._make_ctx("hello there", "hello world")).overlap == 0.5 + assert e(self._make_ctx("foo", "hello world")).overlap == 0.0 + + +# --------------------------------------------------------------------------- +# Scoring v2: bool verdict, tracking-only metrics +# --------------------------------------------------------------------------- + + +class TestScoringV2: + """Scoring v2: evaluators return bool or dataclass.""" + + def test_bool_evaluator_pass(self) -> None: + """Evaluator returning True -> case passes.""" + from protest.plugin import PluginBase + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_pass(self, result: Any) -> None: + results.append(result) + + def on_test_fail(self, result: Any) -> None: + results.append(result) + + single_case = ForEach( + [ + {"inputs": "hello world", "expected": "hello", "name": "c1"}, + ], + ids=lambda c: c["name"], + ) + + session = EvalSession() + session.register_plugin(Collector()) + + @session.eval(evaluators=[not_empty]) + def eval_echo(case: Annotated[dict, 
From(single_case)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + result = runner.run() + + assert result.success is True + assert len(results) == 1 + assert results[0].eval_payload.scores["not_empty"].value is True + + def test_dataclass_without_bool_is_tracking_only(self) -> None: + """Dataclass with only float fields -> tracking-only, always passes.""" + from protest.plugin import PluginBase + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_pass(self, result: Any) -> None: + results.append(result) + + def on_test_fail(self, result: Any) -> None: + results.append(result) + + single_case = ForEach( + [ + {"inputs": "foo", "expected": "bar baz", "name": "c1"}, + ], + ids=lambda c: c["name"], + ) + + session = EvalSession() + session.register_plugin(Collector()) + + @session.eval(evaluators=[word_overlap]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + result = runner.run() + + # word_overlap returns only float -> tracking-only, always passes + assert result.success is True + + def test_float_return_raises_type_error(self) -> None: + """Evaluator returning naked float -> TypeError (caught as fixture error).""" + from protest.plugin import PluginBase + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_fail(self, result: Any) -> None: + results.append(result) + + @evaluator + def bad_evaluator(ctx: EvalContext) -> float: + return 0.5 + + single_case = ForEach( + [{"inputs": "hello", "expected": "hello", "name": "c1"}], + ids=lambda c: c["name"], + ) + + session = EvalSession() + session.register_plugin(Collector()) + + @session.eval(evaluators=[bad_evaluator]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + assert len(results) == 1 + assert 
results[0].is_fixture_error is True + + +# --------------------------------------------------------------------------- +# Results files per run +# --------------------------------------------------------------------------- + + +class TestResultsFiles: + """Per-case markdown files written to .protest/results/_/.""" + + def _run_eval(self, tmp_path: Path) -> Path: + from protest.evals.results_writer import EvalResultsWriter + + results_dir = tmp_path / "results" + session = EvalSession() + writer = EvalResultsWriter(history_dir=tmp_path) + session.register_plugin(writer) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + return results_dir + + def test_results_dir_created(self, tmp_path: Path) -> None: + results_dir = self._run_eval(tmp_path) + assert results_dir.exists() + + def test_one_file_per_case(self, tmp_path: Path) -> None: + results_dir = self._run_eval(tmp_path) + run_dirs = list(results_dir.iterdir()) + assert len(run_dirs) == 1 + case_files = list(run_dirs[0].iterdir()) + assert len(case_files) == 2 # case_pass + case_fail + + def test_case_file_contains_output(self, tmp_path: Path) -> None: + results_dir = self._run_eval(tmp_path) + run_dir = next(results_dir.iterdir()) + pass_file = next(f for f in run_dir.iterdir() if "pass" in f.name) + content = pass_file.read_text() + assert "Echo:" in content # task output + assert "PASS" in content + + def test_case_file_contains_scores(self, tmp_path: Path) -> None: + results_dir = self._run_eval(tmp_path) + run_dir = next(results_dir.iterdir()) + pass_file = next(f for f in run_dir.iterdir() if "pass" in f.name) + content = pass_file.read_text() + assert "accuracy" in content + + def test_case_file_contains_inputs(self, tmp_path: Path) -> None: + results_dir = self._run_eval(tmp_path) + run_dir = next(results_dir.iterdir()) + pass_file = next(f for f in 
run_dir.iterdir() if "pass" in f.name) + content = pass_file.read_text() + assert "hello world" in content # from case inputs + + +# --------------------------------------------------------------------------- +# Multi-dataset history (regression: all suites were merged under one name) +# --------------------------------------------------------------------------- + + +class TestMultiDatasetHistory: + """Multiple @session.eval calls produce distinct suites in history.""" + + def _run_multi(self, tmp_path: Path) -> dict[str, Any]: + from protest.api import run_session + + pipeline_cases = ForEach( + [ + {"inputs": "hello", "expected": "hello", "name": "c1"}, + ], + ids=lambda c: c["name"], + ) + + ingest_cases = ForEach( + [ + {"inputs": "world", "expected": "world", "name": "c2"}, + ], + ids=lambda c: c["name"], + ) + + session = EvalSession(history_dir=tmp_path) + + @session.eval(evaluators=[fake_accuracy]) + def pipeline(case: Annotated[dict, From(pipeline_cases)]) -> str: + return echo_task(case["inputs"]) + + @session.eval(evaluators=[fake_accuracy]) + def ingest(case: Annotated[dict, From(ingest_cases)]) -> str: + return echo_task(case["inputs"]) + + run_session(session) + + history = (tmp_path / "history.jsonl").read_text().splitlines() + return json.loads(history[-1]) + + def test_two_datasets_produce_two_suites_in_history(self, tmp_path: Path) -> None: + entry = self._run_multi(tmp_path) + assert "pipeline" in entry["suites"] + assert "ingest" in entry["suites"] + + def test_each_suite_has_its_own_cases(self, tmp_path: Path) -> None: + entry = self._run_multi(tmp_path) + assert "c1" in entry["suites"]["pipeline"]["cases"] + assert "c2" in entry["suites"]["ingest"]["cases"] + + +# --------------------------------------------------------------------------- +# DI fixture injection dans les taches eval +# --------------------------------------------------------------------------- + + +class TestEvalTaskFixtures: + """@session.eval() peut utiliser des fixtures 
protest via Use().""" + + def test_task_without_fixtures_still_works(self) -> None: + # basic_cases has one match (case_pass) and one mismatch (case_fail) + # fake_accuracy returns matches_expected=False for case_fail -> fail + session = EvalSession() + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + result = runner.run() + assert result.success is False # case_fail has matches_expected=False + + def test_task_with_session_fixture_is_injected(self) -> None: + """Une fixture session-scoped est injectee dans task via Use().""" + from protest import Use, fixture + + @fixture() + def prefix_service() -> str: + return "PREFIX" + + single_case = ForEach( + [ + {"inputs": "hello", "expected": "PREFIX:hello", "name": "c1"}, + ], + ids=lambda c: c["name"], + ) + + session = EvalSession() + session.bind(prefix_service) + + @session.eval(evaluators=[fake_accuracy]) + async def eval_prefixed( + case: Annotated[dict, From(single_case)], + svc: Annotated[str, Use(prefix_service)], + ) -> str: + return f"{svc}:{case['inputs']}" + + runner = TestRunner(session) + result = runner.run() + + # fake_accuracy retourne 1.0 (output contient expected) -> passe + assert result.success is True + + def test_session_fixture_resolved_once_for_all_cases(self) -> None: + """Une session fixture ne doit etre appelee qu'une fois meme avec N cas.""" + from protest import Use, fixture + + call_count = 0 + + @fixture() + def expensive_resource() -> str: + nonlocal call_count + call_count += 1 + return "resource" + + multi_cases = ForEach( + [ + {"inputs": "a", "expected": "resource:a", "name": "c1"}, + {"inputs": "b", "expected": "resource:b", "name": "c2"}, + {"inputs": "c", "expected": "resource:c", "name": "c3"}, + ], + ids=lambda c: c["name"], + ) + + session = EvalSession() + session.bind(expensive_resource) + + @session.eval(evaluators=[fake_accuracy]) + async def 
eval_resource( + case: Annotated[dict, From(multi_cases)], + res: Annotated[str, Use(expensive_resource)], + ) -> str: + return f"{res}:{case['inputs']}" + + runner = TestRunner(session) + runner.run() + + assert call_count == 1 # fixture resolue une seule fois diff --git a/tests/evals/test_hashing.py b/tests/evals/test_hashing.py new file mode 100644 index 0000000..bc53e1f --- /dev/null +++ b/tests/evals/test_hashing.py @@ -0,0 +1,72 @@ +"""Tests for protest.evals.hashing — including non-picklable dataclass fields.""" + +from __future__ import annotations + +import dataclasses +import threading + +from protest.evals.hashing import _canonical, compute_eval_hash + +# --------------------------------------------------------------------------- +# _canonical — dataclass handling +# --------------------------------------------------------------------------- + + +@dataclasses.dataclass +class SimpleEvaluator: + threshold: float + name: str = "simple" + + +@dataclasses.dataclass +class NestedEvaluator: + inner: SimpleEvaluator + weight: float = 1.0 + + +@dataclasses.dataclass +class LockHoldingEvaluator: + """Simulates evaluators like LLMJudge that hold non-picklable resources.""" + + name: str + _lock: threading.Lock = dataclasses.field(default_factory=threading.Lock) + + +class TestCanonicalDataclass: + def test_simple_dataclass_is_serialized(self) -> None: + ev = SimpleEvaluator(threshold=0.8) + result = _canonical(ev) + assert result == {"threshold": 0.8, "name": "simple"} + + def test_nested_dataclass_is_serialized_recursively(self) -> None: + ev = NestedEvaluator(inner=SimpleEvaluator(threshold=0.5), weight=2.0) + result = _canonical(ev) + assert result == {"inner": {"threshold": 0.5, "name": "simple"}, "weight": 2.0} + + def test_dataclass_with_lock_does_not_crash(self) -> None: + """Regression: dataclasses.asdict() deepcopy fails on threading.Lock.""" + ev = LockHoldingEvaluator(name="llm_judge") + # Must not raise — lock falls back to repr() + result = 
_canonical(ev) + assert result["name"] == "llm_judge" + assert "_lock" in result + + +class TestComputeEvalHash: + def test_identical_evaluators_produce_same_hash(self) -> None: + ev = SimpleEvaluator(threshold=0.8) + h1 = compute_eval_hash([ev]) + h2 = compute_eval_hash([ev]) + assert h1 == h2 + + def test_different_thresholds_produce_different_hashes(self) -> None: + ev_a = SimpleEvaluator(threshold=0.8) + ev_b = SimpleEvaluator(threshold=0.9) + assert compute_eval_hash([ev_a]) != compute_eval_hash([ev_b]) + + def test_evaluator_with_lock_does_not_crash(self) -> None: + """Regression for non-picklable evaluator fields.""" + ev = LockHoldingEvaluator(name="llm_judge") + # Should not raise TypeError about cannot pickle '_thread.lock' + hash_val = compute_eval_hash([ev]) + assert len(hash_val) == 12 From 29204bc831f738938c65409591b5d013b795a0e3 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sun, 29 Mar 2026 20:30:00 +0200 Subject: [PATCH 05/60] chore: entity exports, pyproject config --- docs/evals.md | 97 +++++++--- protest/entities/__init__.py | 4 + protest/entities/core.py | 2 + protest/entities/suite_path.py | 5 + pyproject.toml | 20 ++ uv.lock | 327 ++++++++++++++++++++++++++++++++- 6 files changed, 425 insertions(+), 30 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index 11895aa..1ff3235 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -1,6 +1,6 @@ # Evals -Evaluate LLM outputs with scored metrics, thresholds, and historical tracking. +Evaluate LLM outputs with scored metrics and historical tracking. ## What is an Eval? 
@@ -15,7 +15,7 @@ ProTest evals use the same infrastructure as tests: fixtures, DI, parallelism, t from typing import Annotated from protest import ForEach, From -from protest.evals import EvalCase, EvalSession, evaluator +from protest.evals import EvalCase, EvalSession, ModelInfo, evaluator from protest.evals.evaluators import contains_keywords cases = ForEach([ @@ -23,7 +23,7 @@ cases = ForEach([ EvalCase(inputs="What is 2+2?", expected="4", name="math"), ]) -session = EvalSession() +session = EvalSession(model=ModelInfo(name="gpt-4o-mini")) @session.eval(evaluators=[contains_keywords(keywords=["Marie"])]) async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: @@ -41,7 +41,7 @@ protest eval evals.session:session 1. Your function receives case data via `ForEach`/`From` (same as parameterized tests) 2. It returns the output (string, object, anything) 3. ProTest passes the output to evaluators → scores -4. Scores determine pass/fail via thresholds +4. Bool verdicts determine pass/fail 5. Aggregated stats appear in the terminal The rest of the pipeline — fixtures, DI, parallelism, reporters — works identically to tests. @@ -87,15 +87,44 @@ An evaluator is a function decorated with `@evaluator` that receives an `EvalCon ### Return Types -Evaluators return `bool` (simple verdict) or a `dataclass` (structured result). The framework reads fields by type: +Evaluators return `bool` (simple verdict) or a `dataclass` (structured result). 
In dataclasses, annotate fields to tell the framework what each one is: -| Field Type | Role | +```python +from typing import Annotated +from protest.evals import Metric, Verdict, Reason +``` + +| Annotation | Role | |------------|------| -| `bool` | Verdict — pass/fail (`all(bool_fields)`) | -| `float` | Metric — aggregated in stats (mean/p50/p95) | -| `str` | Reason — displayed on failure, stored in history | +| `Annotated[bool, Verdict]` | Verdict — pass/fail (`all(verdicts)`) | +| `Annotated[float, Metric]` | Metric — aggregated in stats (mean/p50/p95) | +| `Annotated[int, Metric]` | Metric — converted to float | +| `Annotated[str, Reason]` | Reason — displayed on failure, stored in history | + +Unannotated fields are ignored by the runner — free metadata. + +Returning `float`, `dict`, or any other non-dataclass/non-bool type raises `TypeError`. + +### Tracking-Only Evaluators + +A dataclass with `Metric` fields but no `Verdict` is tracking-only. The case always passes for this evaluator — it measures without gating. + +```python +@dataclass +class OverlapMetrics: + overlap: Annotated[float, Metric] + +@evaluator +def word_overlap(ctx: EvalContext) -> OverlapMetrics: + ... +``` + +In the terminal, tracking evaluators show with `·` instead of `✓`/`✗`: -Returning `float`, `dict`, or any other type raises `TypeError`. 
+``` +✓ chatbot[lookup] (1.2s) keyword_recall=0.95 all_present=✓ +· chatbot[lookup] overlap=0.80 +``` ### Simple Evaluator @@ -109,12 +138,14 @@ def not_empty(ctx: EvalContext) -> bool: ```python from dataclasses import dataclass +from typing import Annotated +from protest.evals import Metric, Verdict, Reason @dataclass class KeywordScores: - keyword_recall: float # metric → stats - all_present: bool # verdict → pass/fail - detail: str = "" # reason → shown on failure + keyword_recall: Annotated[float, Metric] + all_present: Annotated[bool, Verdict] + detail: Annotated[str, Reason] = "" @evaluator def keyword_check(ctx: EvalContext, keywords: list[str], min_recall: float = 0.5) -> KeywordScores: @@ -134,9 +165,9 @@ The threshold (`min_recall`) is a parameter of the evaluator, not a framework co ```python @dataclass class JudgeResult: - accuracy: float - accurate_enough: bool - reason: str = "" + accuracy: Annotated[float, Metric] + accurate_enough: Annotated[bool, Verdict] + reason: Annotated[str, Reason] = "" @evaluator async def llm_judge(ctx: EvalContext, rubric: str = "", min_score: float = 0.7) -> JudgeResult: @@ -223,9 +254,13 @@ session = EvalSession(model=ModelInfo(name="qwen-2.5")) ## Evaluator Errors -If an evaluator raises an exception (e.g. LLM judge timeout), the case is marked as **error** (not fail). The stack trace appears in the output. Scores from other evaluators that ran before the error are lost. +If an evaluator raises an exception (e.g. LLM judge timeout), the case is marked as **error** (not fail). The stack trace appears in the output. + +> **Tip:** For non-deterministic evaluators (LLM judges), catch exceptions in the evaluator and return a verdict indicating failure rather than letting them propagate. + +## Name Collisions -> **Tip:** For non-deterministic evaluators (LLM judges), catch exceptions in the evaluator and return a score indicating failure rather than letting them propagate. 
+If two evaluators return dataclasses with the same field name (e.g. both have `accuracy`), the runner prefixes with the evaluator name when it detects a conflict: `llm_judge.accuracy`, `fact_check.accuracy`. ## Multi-Model Sessions @@ -290,16 +325,20 @@ Flags are independent and combinable: `-v --show-output --show-logs`. ### Default ``` - ✓ chatbot[lookup] (3.39s) facts_score=1.00 facts_ok=✓ - ✗ chatbot[causal]: facts_ok=False, LLMJudge=False - - Eval: chatbot (26 cases) -┏━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┓ -┃ Score ┃ mean ┃ p50 ┃ p5 ┃ p95 ┃ -┡━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━┩ -│ facts_score │ 0.37 │ 0.00 │ 0.00 │ 1.00 │ -└─────────────┴──────┴──────┴──────┴──────┘ - Passed: 14/26 (53.8%) + ✓ chatbot[lookup] (1.2s) keyword_recall=1.00 all_keywords_present=✓ + ✗ chatbot[math]: all_keywords_present=False + │ inputs: What is 2+2? + │ output: The answer is 4. + │ expected: 4 + │ detail: found 0/1 + + Eval: chatbot (2 cases) +┏━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┓ +┃ Score ┃ mean ┃ p50 ┃ p5 ┃ p95 ┃ +┡━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━┩ +│ keyword_recall │ 0.50 │ 0.50 │ 0.00 │ 1.00 │ +└─────────────────┴──────┴──────┴──────┴──────┘ + Passed: 1/2 (50.0%) Results: .protest/results/chatbot_20260329_091422 ``` @@ -334,7 +373,7 @@ protest history --evals --compare Each case in history carries two hashes: - **`case_hash`** — hash of inputs + expected output. Changes when the test data changes. -- **`eval_hash`** — hash of evaluators + thresholds. Changes when the scoring criteria change. +- **`eval_hash`** — hash of evaluators. Changes when the scoring criteria change. `protest history --compare` uses these hashes to detect modified cases vs regressions. If a case's `eval_hash` changed between runs, it's reported as "scoring modified" rather than a real regression. 
diff --git a/protest/entities/__init__.py b/protest/entities/__init__.py index ec91eb9..30bd04e 100644 --- a/protest/entities/__init__.py +++ b/protest/entities/__init__.py @@ -10,6 +10,8 @@ format_fixture_scope, ) from protest.entities.events import ( + EvalPayload, + EvalScoreEntry, FixtureInfo, HandlerInfo, RunResult, @@ -31,6 +33,8 @@ from protest.entities.xfail import Xfail, normalize_xfail __all__ = [ + "EvalPayload", + "EvalScoreEntry", "Fixture", "FixtureCallable", "FixtureInfo", diff --git a/protest/entities/core.py b/protest/entities/core.py index 465c5d3..f5efa22 100644 --- a/protest/entities/core.py +++ b/protest/entities/core.py @@ -49,6 +49,7 @@ class TestRegistration: xfail: Xfail | None = None timeout: float | None = None retry: Retry | None = None + is_eval: bool = False @dataclass(frozen=True, slots=True) @@ -111,6 +112,7 @@ class TestItem: xfail: Xfail | None = None timeout: float | None = None retry: Retry | None = None + is_eval: bool = False @property def test_name(self) -> str: diff --git a/protest/entities/suite_path.py b/protest/entities/suite_path.py index 38c78a2..4b7223e 100644 --- a/protest/entities/suite_path.py +++ b/protest/entities/suite_path.py @@ -58,6 +58,11 @@ def lower(self) -> str: """Return lowercase string representation for case-insensitive comparison.""" return self._path.lower() + @property + def root_name(self) -> str: + """Return the top-level suite name: 'A::B::C' -> 'A'.""" + return self.parts[0] if self.parts else "" + def __str__(self) -> str: return self._path diff --git a/pyproject.toml b/pyproject.toml index 090118c..6b25e2e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,9 @@ rich = [ web = [ "websockets>=12.0", ] +evals = [ + "pydantic-evals>=0.1", +] [tool.ruff] @@ -100,6 +103,23 @@ ignore = [ "PLC0415", # lazy import for optional rich dependency "PLR0913", # many args is deliberate API design ] +"protest/core/execution/test_executor.py" = [ + "PLR0915", # _run_test is inherently complex (retry 
loop + eval capture) +] +"protest/history/**" = [ + "PLC0415", # lazy imports + "S603", # subprocess git calls are safe + "PLR0913", # load_history has many filter params by design +] +"protest/cli/history.py" = [ + "T201", # print for CLI output + "PLC0415", # lazy imports +] +"protest/evals/**" = [ + "T201", # print for eval reporting + "PLC0415", # lazy imports for optional pydantic-evals dependency + "PLR0913", # adapter functions have many params by design +] "protest/reporting/ascii.py" = [ "T201", # print is the purpose of this module ] diff --git a/uv.lock b/uv.lock index d7c8a6d..aa650bb 100644 --- a/uv.lock +++ b/uv.lock @@ -2,6 +2,29 @@ version = 1 revision = 3 requires-python = ">=3.10" +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, +] + +[[package]] +name = "anyio" +version = "4.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "idna" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/14/2c5dd9f512b66549ae92767a9c7b330ae88e1932ca57876909410251fe13/anyio-4.13.0.tar.gz", hash = "sha256:334b70e641fd2221c1505b3890c69882fe4a2df910cba14d97019b90b24439dc", size = 231622, upload-time = "2026-03-24T12:59:09.671Z" 
} +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353, upload-time = "2026-03-24T12:59:08.246Z" }, +] + [[package]] name = "attrs" version = "25.4.0" @@ -305,6 +328,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/91/7216b27286936c16f5b4d0c530087e4a54eead683e6b0b73dd0c64844af6/filelock-3.20.0-py3-none-any.whl", hash = "sha256:339b4732ffda5cd79b13f4e2711a31b0365ce445d95d243bb996273d072546a2", size = 16054, upload-time = "2025-10-08T18:03:48.35Z" }, ] +[[package]] +name = "genai-prices" +version = "0.0.56" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/44/6b/94b3018a672c7775edfb485f0fed8f6068fba75e49b067e8a1ac5eb96764/genai_prices-0.0.56.tar.gz", hash = "sha256:ac24b16a84d0ab97539bfa48dfa4649689de8e3ce71c12ebacef29efb1998045", size = 65872, upload-time = "2026-03-20T20:33:00.732Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a3/f6/8ef7e4c286deb2709d11ca96a5237caae3ef4876ab3c48095856cfd2df30/genai_prices-0.0.56-py3-none-any.whl", hash = "sha256:dbe86be8f3f556bed1b72209ed36851fec8b01793b3b220f42921a4e7da945f6", size = 68966, upload-time = "2026-03-20T20:33:02.555Z" }, +] + [[package]] name = "ghp-import" version = "2.1.0" @@ -317,6 +353,52 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f7/ec/67fbef5d497f86283db54c22eec6f6140243aae73265799baaaa19cd17fb/ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619", size = 11034, upload-time = "2022-05-02T15:47:14.552Z" }, ] +[[package]] +name = "griffelib" +version = "2.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/9d/82/74f4a3310cdabfbb10da554c3a672847f1ed33c6f61dd472681ce7f1fe67/griffelib-2.0.2.tar.gz", hash = "sha256:3cf20b3bc470e83763ffbf236e0076b1211bac1bc67de13daf494640f2de707e", size = 166461, upload-time = "2026-03-27T11:34:51.091Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/8c/c9138d881c79aa0ea9ed83cbd58d5ca75624378b38cee225dcf5c42cc91f/griffelib-2.0.2-py3-none-any.whl", hash = "sha256:925c857658fb1ba40c0772c37acbc2ab650bd794d9c1b9726922e36ea4117ea1", size = 142357, upload-time = "2026-03-27T11:34:46.275Z" }, +] + +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, +] + +[[package]] +name = "httpcore" +version = "1.0.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time 
= "2025-04-24T22:06:20.566Z" }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, +] + [[package]] name = "identify" version = "2.6.15" @@ -335,6 +417,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, ] +[[package]] +name = "importlib-metadata" +version = "8.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, +] + [[package]] name 
= "iniconfig" version = "2.1.0" @@ -383,6 +477,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, ] +[[package]] +name = "logfire-api" +version = "4.31.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/08/a2/8d5a3c1c282d5f2bd9f5e9ddd5288d1414a53301ce389af9016b6d82bd50/logfire_api-4.31.0.tar.gz", hash = "sha256:fc4b01257ebd4ce297ad374ed201eb1a9213b999f6ae6df45cfca5bd0ef378f8", size = 77838, upload-time = "2026-03-27T19:00:47.545Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/27/9372b7492b3e146908d520f8599909311cd930175801ad219171fafc6f3e/logfire_api-4.31.0-py3-none-any.whl", hash = "sha256:3c1f502fd4eb8ef0996427a5cf275fd8f327f38600650a1f53071a8171c812db", size = 123402, upload-time = "2026-03-27T19:00:44.952Z" }, +] + [[package]] name = "markdown" version = "3.10" @@ -585,6 +688,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, ] +[[package]] +name = "opentelemetry-api" +version = "1.40.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "importlib-metadata" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2c/1d/4049a9e8698361cc1a1aa03a6c59e4fa4c71e0c0f94a30f988a6876a2ae6/opentelemetry_api-1.40.0.tar.gz", hash = "sha256:159be641c0b04d11e9ecd576906462773eb97ae1b657730f0ecf64d32071569f", size = 70851, upload-time = "2026-03-04T14:17:21.555Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/5f/bf/93795954016c522008da367da292adceed71cca6ee1717e1d64c83089099/opentelemetry_api-1.40.0-py3-none-any.whl", hash = "sha256:82dd69331ae74b06f6a874704be0cfaa49a1650e1537d4a813b86ecef7d0ecf9", size = 68676, upload-time = "2026-03-04T14:17:01.24Z" }, +] + [[package]] name = "packaging" version = "24.2" @@ -655,6 +771,9 @@ dependencies = [ ] [package.optional-dependencies] +evals = [ + { name = "pydantic-evals" }, +] rich = [ { name = "rich" }, ] @@ -681,11 +800,12 @@ docs = [ [package.metadata] requires-dist = [ + { name = "pydantic-evals", marker = "extra == 'evals'", specifier = ">=0.1" }, { name = "rich", marker = "extra == 'rich'", specifier = ">=13.0" }, { name = "typing-extensions", specifier = ">=4.15.0" }, { name = "websockets", marker = "extra == 'web'", specifier = ">=12.0" }, ] -provides-extras = ["rich", "web"] +provides-extras = ["rich", "web", "evals"] [package.metadata.requires-dev] dev = [ @@ -704,6 +824,190 @@ docs = [ { name = "mkdocs-material", specifier = ">=9.7.0" }, ] +[[package]] +name = "pydantic" +version = "2.12.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, +] + +[[package]] +name = "pydantic-ai-slim" +version = "1.73.0" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "genai-prices" }, + { name = "griffelib" }, + { name = "httpx" }, + { name = "opentelemetry-api" }, + { name = "pydantic" }, + { name = "pydantic-graph" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6a/1b/a5e18c7c721a3cfce5b17f86cb99e4142fcb70f38ea6d2b8963c2df445e1/pydantic_ai_slim-1.73.0.tar.gz", hash = "sha256:758d5bedb4b4f484c433672639bfc87af216a38453b1539ae10928a9ca62ff62", size = 497208, upload-time = "2026-03-27T03:49:49.459Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/3b/6aa1874cd0ccbc83c17c8eb308834bf004c8d4344c27cd8048851d4b284d/pydantic_ai_slim-1.73.0-py3-none-any.whl", hash = "sha256:f7176ce6c78539e1070d7e22549186862c2f6e6ea8b05b3aaad8a1942ba1ff4f", size = 638701, upload-time = "2026-03-27T03:49:42.804Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.41.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/90/32c9941e728d564b411d574d8ee0cf09b12ec978cb22b294995bae5549a5/pydantic_core-2.41.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:77b63866ca88d804225eaa4af3e664c5faf3568cea95360d21f4725ab6e07146", size = 2107298, upload-time = "2025-11-04T13:39:04.116Z" }, + { url = "https://files.pythonhosted.org/packages/fb/a8/61c96a77fe28993d9a6fb0f4127e05430a267b235a124545d79fea46dd65/pydantic_core-2.41.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dfa8a0c812ac681395907e71e1274819dec685fec28273a28905df579ef137e2", size = 
1901475, upload-time = "2025-11-04T13:39:06.055Z" }, + { url = "https://files.pythonhosted.org/packages/5d/b6/338abf60225acc18cdc08b4faef592d0310923d19a87fba1faf05af5346e/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5921a4d3ca3aee735d9fd163808f5e8dd6c6972101e4adbda9a4667908849b97", size = 1918815, upload-time = "2025-11-04T13:39:10.41Z" }, + { url = "https://files.pythonhosted.org/packages/d1/1c/2ed0433e682983d8e8cba9c8d8ef274d4791ec6a6f24c58935b90e780e0a/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25c479382d26a2a41b7ebea1043564a937db462816ea07afa8a44c0866d52f9", size = 2065567, upload-time = "2025-11-04T13:39:12.244Z" }, + { url = "https://files.pythonhosted.org/packages/b3/24/cf84974ee7d6eae06b9e63289b7b8f6549d416b5c199ca2d7ce13bbcf619/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f547144f2966e1e16ae626d8ce72b4cfa0caedc7fa28052001c94fb2fcaa1c52", size = 2230442, upload-time = "2025-11-04T13:39:13.962Z" }, + { url = "https://files.pythonhosted.org/packages/fd/21/4e287865504b3edc0136c89c9c09431be326168b1eb7841911cbc877a995/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f52298fbd394f9ed112d56f3d11aabd0d5bd27beb3084cc3d8ad069483b8941", size = 2350956, upload-time = "2025-11-04T13:39:15.889Z" }, + { url = "https://files.pythonhosted.org/packages/a8/76/7727ef2ffa4b62fcab916686a68a0426b9b790139720e1934e8ba797e238/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:100baa204bb412b74fe285fb0f3a385256dad1d1879f0a5cb1499ed2e83d132a", size = 2068253, upload-time = "2025-11-04T13:39:17.403Z" }, + { url = "https://files.pythonhosted.org/packages/d5/8c/a4abfc79604bcb4c748e18975c44f94f756f08fb04218d5cb87eb0d3a63e/pydantic_core-2.41.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:05a2c8852530ad2812cb7914dc61a1125dc4e06252ee98e5638a12da6cc6fb6c", size = 2177050, upload-time = "2025-11-04T13:39:19.351Z" }, + { url = "https://files.pythonhosted.org/packages/67/b1/de2e9a9a79b480f9cb0b6e8b6ba4c50b18d4e89852426364c66aa82bb7b3/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:29452c56df2ed968d18d7e21f4ab0ac55e71dc59524872f6fc57dcf4a3249ed2", size = 2147178, upload-time = "2025-11-04T13:39:21Z" }, + { url = "https://files.pythonhosted.org/packages/16/c1/dfb33f837a47b20417500efaa0378adc6635b3c79e8369ff7a03c494b4ac/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:d5160812ea7a8a2ffbe233d8da666880cad0cbaf5d4de74ae15c313213d62556", size = 2341833, upload-time = "2025-11-04T13:39:22.606Z" }, + { url = "https://files.pythonhosted.org/packages/47/36/00f398642a0f4b815a9a558c4f1dca1b4020a7d49562807d7bc9ff279a6c/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:df3959765b553b9440adfd3c795617c352154e497a4eaf3752555cfb5da8fc49", size = 2321156, upload-time = "2025-11-04T13:39:25.843Z" }, + { url = "https://files.pythonhosted.org/packages/7e/70/cad3acd89fde2010807354d978725ae111ddf6d0ea46d1ea1775b5c1bd0c/pydantic_core-2.41.5-cp310-cp310-win32.whl", hash = "sha256:1f8d33a7f4d5a7889e60dc39856d76d09333d8a6ed0f5f1190635cbec70ec4ba", size = 1989378, upload-time = "2025-11-04T13:39:27.92Z" }, + { url = "https://files.pythonhosted.org/packages/76/92/d338652464c6c367e5608e4488201702cd1cbb0f33f7b6a85a60fe5f3720/pydantic_core-2.41.5-cp310-cp310-win_amd64.whl", hash = "sha256:62de39db01b8d593e45871af2af9e497295db8d73b085f6bfd0b18c83c70a8f9", size = 2013622, upload-time = "2025-11-04T13:39:29.848Z" }, + { url = "https://files.pythonhosted.org/packages/e8/72/74a989dd9f2084b3d9530b0915fdda64ac48831c30dbf7c72a41a5232db8/pydantic_core-2.41.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a3a52f6156e73e7ccb0f8cced536adccb7042be67cb45f9562e12b319c119da6", size = 2105873, upload-time = 
"2025-11-04T13:39:31.373Z" }, + { url = "https://files.pythonhosted.org/packages/12/44/37e403fd9455708b3b942949e1d7febc02167662bf1a7da5b78ee1ea2842/pydantic_core-2.41.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f3bf998340c6d4b0c9a2f02d6a400e51f123b59565d74dc60d252ce888c260b", size = 1899826, upload-time = "2025-11-04T13:39:32.897Z" }, + { url = "https://files.pythonhosted.org/packages/33/7f/1d5cab3ccf44c1935a359d51a8a2a9e1a654b744b5e7f80d41b88d501eec/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:378bec5c66998815d224c9ca994f1e14c0c21cb95d2f52b6021cc0b2a58f2a5a", size = 1917869, upload-time = "2025-11-04T13:39:34.469Z" }, + { url = "https://files.pythonhosted.org/packages/6e/6a/30d94a9674a7fe4f4744052ed6c5e083424510be1e93da5bc47569d11810/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e7b576130c69225432866fe2f4a469a85a54ade141d96fd396dffcf607b558f8", size = 2063890, upload-time = "2025-11-04T13:39:36.053Z" }, + { url = "https://files.pythonhosted.org/packages/50/be/76e5d46203fcb2750e542f32e6c371ffa9b8ad17364cf94bb0818dbfb50c/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6cb58b9c66f7e4179a2d5e0f849c48eff5c1fca560994d6eb6543abf955a149e", size = 2229740, upload-time = "2025-11-04T13:39:37.753Z" }, + { url = "https://files.pythonhosted.org/packages/d3/ee/fed784df0144793489f87db310a6bbf8118d7b630ed07aa180d6067e653a/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88942d3a3dff3afc8288c21e565e476fc278902ae4d6d134f1eeda118cc830b1", size = 2350021, upload-time = "2025-11-04T13:39:40.94Z" }, + { url = "https://files.pythonhosted.org/packages/c8/be/8fed28dd0a180dca19e72c233cbf58efa36df055e5b9d90d64fd1740b828/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31d95a179f8d64d90f6831d71fa93290893a33148d890ba15de25642c5d075b", 
size = 2066378, upload-time = "2025-11-04T13:39:42.523Z" }, + { url = "https://files.pythonhosted.org/packages/b0/3b/698cf8ae1d536a010e05121b4958b1257f0b5522085e335360e53a6b1c8b/pydantic_core-2.41.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c1df3d34aced70add6f867a8cf413e299177e0c22660cc767218373d0779487b", size = 2175761, upload-time = "2025-11-04T13:39:44.553Z" }, + { url = "https://files.pythonhosted.org/packages/b8/ba/15d537423939553116dea94ce02f9c31be0fa9d0b806d427e0308ec17145/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4009935984bd36bd2c774e13f9a09563ce8de4abaa7226f5108262fa3e637284", size = 2146303, upload-time = "2025-11-04T13:39:46.238Z" }, + { url = "https://files.pythonhosted.org/packages/58/7f/0de669bf37d206723795f9c90c82966726a2ab06c336deba4735b55af431/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:34a64bc3441dc1213096a20fe27e8e128bd3ff89921706e83c0b1ac971276594", size = 2340355, upload-time = "2025-11-04T13:39:48.002Z" }, + { url = "https://files.pythonhosted.org/packages/e5/de/e7482c435b83d7e3c3ee5ee4451f6e8973cff0eb6007d2872ce6383f6398/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c9e19dd6e28fdcaa5a1de679aec4141f691023916427ef9bae8584f9c2fb3b0e", size = 2319875, upload-time = "2025-11-04T13:39:49.705Z" }, + { url = "https://files.pythonhosted.org/packages/fe/e6/8c9e81bb6dd7560e33b9053351c29f30c8194b72f2d6932888581f503482/pydantic_core-2.41.5-cp311-cp311-win32.whl", hash = "sha256:2c010c6ded393148374c0f6f0bf89d206bf3217f201faa0635dcd56bd1520f6b", size = 1987549, upload-time = "2025-11-04T13:39:51.842Z" }, + { url = "https://files.pythonhosted.org/packages/11/66/f14d1d978ea94d1bc21fc98fcf570f9542fe55bfcc40269d4e1a21c19bf7/pydantic_core-2.41.5-cp311-cp311-win_amd64.whl", hash = "sha256:76ee27c6e9c7f16f47db7a94157112a2f3a00e958bc626e2f4ee8bec5c328fbe", size = 2011305, upload-time = "2025-11-04T13:39:53.485Z" }, + { url = 
"https://files.pythonhosted.org/packages/56/d8/0e271434e8efd03186c5386671328154ee349ff0354d83c74f5caaf096ed/pydantic_core-2.41.5-cp311-cp311-win_arm64.whl", hash = "sha256:4bc36bbc0b7584de96561184ad7f012478987882ebf9f9c389b23f432ea3d90f", size = 1972902, upload-time = "2025-11-04T13:39:56.488Z" }, + { url = "https://files.pythonhosted.org/packages/5f/5d/5f6c63eebb5afee93bcaae4ce9a898f3373ca23df3ccaef086d0233a35a7/pydantic_core-2.41.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7", size = 2110990, upload-time = "2025-11-04T13:39:58.079Z" }, + { url = "https://files.pythonhosted.org/packages/aa/32/9c2e8ccb57c01111e0fd091f236c7b371c1bccea0fa85247ac55b1e2b6b6/pydantic_core-2.41.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0", size = 1896003, upload-time = "2025-11-04T13:39:59.956Z" }, + { url = "https://files.pythonhosted.org/packages/68/b8/a01b53cb0e59139fbc9e4fda3e9724ede8de279097179be4ff31f1abb65a/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69", size = 1919200, upload-time = "2025-11-04T13:40:02.241Z" }, + { url = "https://files.pythonhosted.org/packages/38/de/8c36b5198a29bdaade07b5985e80a233a5ac27137846f3bc2d3b40a47360/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75", size = 2052578, upload-time = "2025-11-04T13:40:04.401Z" }, + { url = "https://files.pythonhosted.org/packages/00/b5/0e8e4b5b081eac6cb3dbb7e60a65907549a1ce035a724368c330112adfdd/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05", size = 2208504, upload-time = "2025-11-04T13:40:06.072Z" }, + { url = 
"https://files.pythonhosted.org/packages/77/56/87a61aad59c7c5b9dc8caad5a41a5545cba3810c3e828708b3d7404f6cef/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc", size = 2335816, upload-time = "2025-11-04T13:40:07.835Z" }, + { url = "https://files.pythonhosted.org/packages/0d/76/941cc9f73529988688a665a5c0ecff1112b3d95ab48f81db5f7606f522d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c", size = 2075366, upload-time = "2025-11-04T13:40:09.804Z" }, + { url = "https://files.pythonhosted.org/packages/d3/43/ebef01f69baa07a482844faaa0a591bad1ef129253ffd0cdaa9d8a7f72d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5", size = 2171698, upload-time = "2025-11-04T13:40:12.004Z" }, + { url = "https://files.pythonhosted.org/packages/b1/87/41f3202e4193e3bacfc2c065fab7706ebe81af46a83d3e27605029c1f5a6/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c", size = 2132603, upload-time = "2025-11-04T13:40:13.868Z" }, + { url = "https://files.pythonhosted.org/packages/49/7d/4c00df99cb12070b6bccdef4a195255e6020a550d572768d92cc54dba91a/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294", size = 2329591, upload-time = "2025-11-04T13:40:15.672Z" }, + { url = "https://files.pythonhosted.org/packages/cc/6a/ebf4b1d65d458f3cda6a7335d141305dfa19bdc61140a884d165a8a1bbc7/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1", size = 2319068, upload-time = "2025-11-04T13:40:17.532Z" }, + { url = 
"https://files.pythonhosted.org/packages/49/3b/774f2b5cd4192d5ab75870ce4381fd89cf218af999515baf07e7206753f0/pydantic_core-2.41.5-cp312-cp312-win32.whl", hash = "sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d", size = 1985908, upload-time = "2025-11-04T13:40:19.309Z" }, + { url = "https://files.pythonhosted.org/packages/86/45/00173a033c801cacf67c190fef088789394feaf88a98a7035b0e40d53dc9/pydantic_core-2.41.5-cp312-cp312-win_amd64.whl", hash = "sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815", size = 2020145, upload-time = "2025-11-04T13:40:21.548Z" }, + { url = "https://files.pythonhosted.org/packages/f9/22/91fbc821fa6d261b376a3f73809f907cec5ca6025642c463d3488aad22fb/pydantic_core-2.41.5-cp312-cp312-win_arm64.whl", hash = "sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3", size = 1976179, upload-time = "2025-11-04T13:40:23.393Z" }, + { url = "https://files.pythonhosted.org/packages/87/06/8806241ff1f70d9939f9af039c6c35f2360cf16e93c2ca76f184e76b1564/pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9", size = 2120403, upload-time = "2025-11-04T13:40:25.248Z" }, + { url = "https://files.pythonhosted.org/packages/94/02/abfa0e0bda67faa65fef1c84971c7e45928e108fe24333c81f3bfe35d5f5/pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34", size = 1896206, upload-time = "2025-11-04T13:40:27.099Z" }, + { url = "https://files.pythonhosted.org/packages/15/df/a4c740c0943e93e6500f9eb23f4ca7ec9bf71b19e608ae5b579678c8d02f/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0", size = 1919307, upload-time = "2025-11-04T13:40:29.806Z" }, + { url = 
"https://files.pythonhosted.org/packages/9a/e3/6324802931ae1d123528988e0e86587c2072ac2e5394b4bc2bc34b61ff6e/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33", size = 2063258, upload-time = "2025-11-04T13:40:33.544Z" }, + { url = "https://files.pythonhosted.org/packages/c9/d4/2230d7151d4957dd79c3044ea26346c148c98fbf0ee6ebd41056f2d62ab5/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e", size = 2214917, upload-time = "2025-11-04T13:40:35.479Z" }, + { url = "https://files.pythonhosted.org/packages/e6/9f/eaac5df17a3672fef0081b6c1bb0b82b33ee89aa5cec0d7b05f52fd4a1fa/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2", size = 2332186, upload-time = "2025-11-04T13:40:37.436Z" }, + { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" }, + { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, upload-time = "2025-11-04T13:40:42.809Z" }, + { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = 
"2025-11-04T13:40:44.752Z" }, + { url = "https://files.pythonhosted.org/packages/3a/8d/34318ef985c45196e004bc46c6eab2eda437e744c124ef0dbe1ff2c9d06b/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" }, + { url = "https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" }, + { url = "https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" }, + { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" }, + { url = "https://files.pythonhosted.org/packages/73/7d/f2f9db34af103bea3e09735bb40b021788a5e834c81eedb541991badf8f5/pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd", size = 1981005, upload-time = "2025-11-04T13:40:54.734Z" }, + { url = "https://files.pythonhosted.org/packages/ea/28/46b7c5c9635ae96ea0fbb779e271a38129df2550f763937659ee6c5dbc65/pydantic_core-2.41.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a", size = 2119622, upload-time = "2025-11-04T13:40:56.68Z" }, + { url = 
"https://files.pythonhosted.org/packages/74/1a/145646e5687e8d9a1e8d09acb278c8535ebe9e972e1f162ed338a622f193/pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14", size = 1891725, upload-time = "2025-11-04T13:40:58.807Z" }, + { url = "https://files.pythonhosted.org/packages/23/04/e89c29e267b8060b40dca97bfc64a19b2a3cf99018167ea1677d96368273/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1", size = 1915040, upload-time = "2025-11-04T13:41:00.853Z" }, + { url = "https://files.pythonhosted.org/packages/84/a3/15a82ac7bd97992a82257f777b3583d3e84bdb06ba6858f745daa2ec8a85/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66", size = 2063691, upload-time = "2025-11-04T13:41:03.504Z" }, + { url = "https://files.pythonhosted.org/packages/74/9b/0046701313c6ef08c0c1cf0e028c67c770a4e1275ca73131563c5f2a310a/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869", size = 2213897, upload-time = "2025-11-04T13:41:05.804Z" }, + { url = "https://files.pythonhosted.org/packages/8a/cd/6bac76ecd1b27e75a95ca3a9a559c643b3afcd2dd62086d4b7a32a18b169/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" }, + { url = "https://files.pythonhosted.org/packages/4c/d2/ef2074dc020dd6e109611a8be4449b98cd25e1b9b8a303c2f0fca2f2bcf7/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375", size = 2064877, upload-time = 
"2025-11-04T13:41:09.827Z" }, + { url = "https://files.pythonhosted.org/packages/18/66/e9db17a9a763d72f03de903883c057b2592c09509ccfe468187f2a2eef29/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553", size = 2180680, upload-time = "2025-11-04T13:41:12.379Z" }, + { url = "https://files.pythonhosted.org/packages/d3/9e/3ce66cebb929f3ced22be85d4c2399b8e85b622db77dad36b73c5387f8f8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90", size = 2138960, upload-time = "2025-11-04T13:41:14.627Z" }, + { url = "https://files.pythonhosted.org/packages/a6/62/205a998f4327d2079326b01abee48e502ea739d174f0a89295c481a2272e/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07", size = 2339102, upload-time = "2025-11-04T13:41:16.868Z" }, + { url = "https://files.pythonhosted.org/packages/3c/0d/f05e79471e889d74d3d88f5bd20d0ed189ad94c2423d81ff8d0000aab4ff/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb", size = 2326039, upload-time = "2025-11-04T13:41:18.934Z" }, + { url = "https://files.pythonhosted.org/packages/ec/e1/e08a6208bb100da7e0c4b288eed624a703f4d129bde2da475721a80cab32/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23", size = 1995126, upload-time = "2025-11-04T13:41:21.418Z" }, + { url = "https://files.pythonhosted.org/packages/48/5d/56ba7b24e9557f99c9237e29f5c09913c81eeb2f3217e40e922353668092/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf", size = 2015489, upload-time = "2025-11-04T13:41:24.076Z" }, + { url = 
"https://files.pythonhosted.org/packages/4e/bb/f7a190991ec9e3e0ba22e4993d8755bbc4a32925c0b5b42775c03e8148f9/pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0", size = 1977288, upload-time = "2025-11-04T13:41:26.33Z" }, + { url = "https://files.pythonhosted.org/packages/92/ed/77542d0c51538e32e15afe7899d79efce4b81eee631d99850edc2f5e9349/pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a", size = 2120255, upload-time = "2025-11-04T13:41:28.569Z" }, + { url = "https://files.pythonhosted.org/packages/bb/3d/6913dde84d5be21e284439676168b28d8bbba5600d838b9dca99de0fad71/pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3", size = 1863760, upload-time = "2025-11-04T13:41:31.055Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f0/e5e6b99d4191da102f2b0eb9687aaa7f5bea5d9964071a84effc3e40f997/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c", size = 1878092, upload-time = "2025-11-04T13:41:33.21Z" }, + { url = "https://files.pythonhosted.org/packages/71/48/36fb760642d568925953bcc8116455513d6e34c4beaa37544118c36aba6d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612", size = 2053385, upload-time = "2025-11-04T13:41:35.508Z" }, + { url = "https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:37.732Z" }, + { url = 
"https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9", size = 2327585, upload-time = "2025-11-04T13:41:40Z" }, + { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = "2025-11-04T13:41:42.323Z" }, + { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:45.221Z" }, + { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:47.474Z" }, + { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:49.992Z" }, + { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" }, + { url = 
"https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, + { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, + { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" }, + { url = "https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" }, + { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" }, + { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" }, + { url = 
"https://files.pythonhosted.org/packages/2e/1b/687711069de7efa6af934e74f601e2a4307365e8fdc404703afc453eab26/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15489ba13d61f670dcc96772e733aad1a6f9c429cc27574c6cdaed82d0146ad", size = 2138905, upload-time = "2025-11-04T13:42:47.156Z" }, + { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495, upload-time = "2025-11-04T13:42:49.689Z" }, + { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388, upload-time = "2025-11-04T13:42:52.215Z" }, + { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879, upload-time = "2025-11-04T13:42:56.483Z" }, + { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, + { url = "https://files.pythonhosted.org/packages/e6/b0/1a2aa41e3b5a4ba11420aba2d091b2d17959c8d1519ece3627c371951e73/pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = 
"sha256:b5819cd790dbf0c5eb9f82c73c16b39a65dd6dd4d1439dcdea7816ec9adddab8", size = 2103351, upload-time = "2025-11-04T13:43:02.058Z" }, + { url = "https://files.pythonhosted.org/packages/a4/ee/31b1f0020baaf6d091c87900ae05c6aeae101fa4e188e1613c80e4f1ea31/pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5a4e67afbc95fa5c34cf27d9089bca7fcab4e51e57278d710320a70b956d1b9a", size = 1925363, upload-time = "2025-11-04T13:43:05.159Z" }, + { url = "https://files.pythonhosted.org/packages/e1/89/ab8e86208467e467a80deaca4e434adac37b10a9d134cd2f99b28a01e483/pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ece5c59f0ce7d001e017643d8d24da587ea1f74f6993467d85ae8a5ef9d4f42b", size = 2135615, upload-time = "2025-11-04T13:43:08.116Z" }, + { url = "https://files.pythonhosted.org/packages/99/0a/99a53d06dd0348b2008f2f30884b34719c323f16c3be4e6cc1203b74a91d/pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:16f80f7abe3351f8ea6858914ddc8c77e02578544a0ebc15b4c2e1a0e813b0b2", size = 2175369, upload-time = "2025-11-04T13:43:12.49Z" }, + { url = "https://files.pythonhosted.org/packages/6d/94/30ca3b73c6d485b9bb0bc66e611cff4a7138ff9736b7e66bcf0852151636/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:33cb885e759a705b426baada1fe68cbb0a2e68e34c5d0d0289a364cf01709093", size = 2144218, upload-time = "2025-11-04T13:43:15.431Z" }, + { url = "https://files.pythonhosted.org/packages/87/57/31b4f8e12680b739a91f472b5671294236b82586889ef764b5fbc6669238/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:c8d8b4eb992936023be7dee581270af5c6e0697a8559895f527f5b7105ecd36a", size = 2329951, upload-time = "2025-11-04T13:43:18.062Z" }, + { url = "https://files.pythonhosted.org/packages/7d/73/3c2c8edef77b8f7310e6fb012dbc4b8551386ed575b9eb6fb2506e28a7eb/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = 
"sha256:242a206cd0318f95cd21bdacff3fcc3aab23e79bba5cac3db5a841c9ef9c6963", size = 2318428, upload-time = "2025-11-04T13:43:20.679Z" }, + { url = "https://files.pythonhosted.org/packages/2f/02/8559b1f26ee0d502c74f9cca5c0d2fd97e967e083e006bbbb4e97f3a043a/pydantic_core-2.41.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d3a978c4f57a597908b7e697229d996d77a6d3c94901e9edee593adada95ce1a", size = 2147009, upload-time = "2025-11-04T13:43:23.286Z" }, + { url = "https://files.pythonhosted.org/packages/5f/9b/1b3f0e9f9305839d7e84912f9e8bfbd191ed1b1ef48083609f0dabde978c/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b2379fa7ed44ddecb5bfe4e48577d752db9fc10be00a6b7446e9663ba143de26", size = 2101980, upload-time = "2025-11-04T13:43:25.97Z" }, + { url = "https://files.pythonhosted.org/packages/a4/ed/d71fefcb4263df0da6a85b5d8a7508360f2f2e9b3bf5814be9c8bccdccc1/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:266fb4cbf5e3cbd0b53669a6d1b039c45e3ce651fd5442eff4d07c2cc8d66808", size = 1923865, upload-time = "2025-11-04T13:43:28.763Z" }, + { url = "https://files.pythonhosted.org/packages/ce/3a/626b38db460d675f873e4444b4bb030453bbe7b4ba55df821d026a0493c4/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58133647260ea01e4d0500089a8c4f07bd7aa6ce109682b1426394988d8aaacc", size = 2134256, upload-time = "2025-11-04T13:43:31.71Z" }, + { url = "https://files.pythonhosted.org/packages/83/d9/8412d7f06f616bbc053d30cb4e5f76786af3221462ad5eee1f202021eb4e/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:287dad91cfb551c363dc62899a80e9e14da1f0e2b6ebde82c806612ca2a13ef1", size = 2174762, upload-time = "2025-11-04T13:43:34.744Z" }, + { url = "https://files.pythonhosted.org/packages/55/4c/162d906b8e3ba3a99354e20faa1b49a85206c47de97a639510a0e673f5da/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = 
"sha256:03b77d184b9eb40240ae9fd676ca364ce1085f203e1b1256f8ab9984dca80a84", size = 2143141, upload-time = "2025-11-04T13:43:37.701Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f2/f11dd73284122713f5f89fc940f370d035fa8e1e078d446b3313955157fe/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:a668ce24de96165bb239160b3d854943128f4334822900534f2fe947930e5770", size = 2330317, upload-time = "2025-11-04T13:43:40.406Z" }, + { url = "https://files.pythonhosted.org/packages/88/9d/b06ca6acfe4abb296110fb1273a4d848a0bfb2ff65f3ee92127b3244e16b/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f14f8f046c14563f8eb3f45f499cc658ab8d10072961e07225e507adb700e93f", size = 2316992, upload-time = "2025-11-04T13:43:43.602Z" }, + { url = "https://files.pythonhosted.org/packages/36/c7/cfc8e811f061c841d7990b0201912c3556bfeb99cdcb7ed24adc8d6f8704/pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51", size = 2145302, upload-time = "2025-11-04T13:43:46.64Z" }, +] + +[[package]] +name = "pydantic-evals" +version = "1.73.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "logfire-api" }, + { name = "pydantic" }, + { name = "pydantic-ai-slim" }, + { name = "pyyaml" }, + { name = "rich" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/02/45/ce1f9b97c4838f940c98693bc1d6298f0e1396355998942b095ce17157fe/pydantic_evals-1.73.0.tar.gz", hash = "sha256:c1f38ad9c4f566bee6958c92f205b8200957b4baf3dd5239e2a4a06edd28e3dc", size = 56137, upload-time = "2026-03-27T03:49:50.861Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/01/4e/aefc34a68adc165ddec22c0632cb3076579c46751ac11acdf8cec6462891/pydantic_evals-1.73.0-py3-none-any.whl", hash = "sha256:0609210d4825cc8339b5cb649be38321450b46d6e87d72c1ffde73598741fd5a", size = 67143, upload-time = "2026-03-27T03:49:44.298Z" }, +] 
+ +[[package]] +name = "pydantic-graph" +version = "1.73.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "logfire-api" }, + { name = "pydantic" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1a/22/d479ea32e3c712c6711e41157fb975d81582e5171510e4c662f21a85e9fe/pydantic_graph-1.73.0.tar.gz", hash = "sha256:f0d3e4984af1d902cdda1ccd3fcd86949d45d3ed21559e781f7cf9eace2ed914", size = 58717, upload-time = "2026-03-27T03:49:51.967Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/b3/4cc0b1c543b8a0c1f9add7bdeb2e8cd583961a795664a1a74d1fc8200416/pydantic_graph-1.73.0-py3-none-any.whl", hash = "sha256:aaab8b1580885f5108401db0a7da58d6c7643e467eb626b8a1364b1030327de0", size = 72504, upload-time = "2026-03-27T03:49:45.668Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -1116,6 +1420,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ] +[[package]] +name = "typing-inspection" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = 
"2025-10-01T02:14:40.154Z" }, +] + [[package]] name = "urllib3" version = "2.5.0" @@ -1230,3 +1546,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/68/a1/dcb68430b1d00b698ae7a7e0194433bce4f07ded185f0ee5fb21e2a2e91e/websockets-15.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:cad21560da69f4ce7658ca2cb83138fb4cf695a2ba3e475e0559e05991aa8122", size = 176884, upload-time = "2025-03-05T20:03:27.934Z" }, { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload-time = "2025-03-05T20:03:39.41Z" }, ] + +[[package]] +name = "zipp" +version = "3.23.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, +] From 7aa3b49a0f076ad768706aef4e896bb390f5d1d9 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Mon, 30 Mar 2026 07:30:00 +0200 Subject: [PATCH 06/60] =?UTF-8?q?feat(evals):=20ShortCircuit=20=E2=80=94?= =?UTF-8?q?=20skip=20expensive=20evaluators=20on=20early=20fail?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit evaluators=[ not_empty, ShortCircuit([ contains_expected_facts(min_score=0.5), llm_judge(rubric="..."), # skipped if above fails ]), ] First Verdict=False stops the group. 
Evaluators outside run regardless. --- docs/evals.md | 18 ++++++++ protest/core/execution/test_executor.py | 13 +++--- protest/entities/events.py | 1 + protest/evals/__init__.py | 3 +- protest/evals/evaluator.py | 21 +++++++++ protest/evals/types.py | 11 +++-- protest/evals/wrapper.py | 32 ++++++++++++- protest/reporting/rich_reporter.py | 3 ++ tests/evals/test_e2e.py | 61 +++++++++++++++++++++++++ 9 files changed, 151 insertions(+), 12 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index 1ff3235..b8cd74b 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -185,6 +185,24 @@ EvalCase(inputs="easy lookup", evaluators=[keyword_check(keywords=["paris"], min EvalCase(inputs="hard causal", evaluators=[keyword_check(keywords=["paris"], min_recall=0.3)]), ``` +### ShortCircuit + +Skip expensive evaluators (LLM judges) when cheap ones already fail: + +```python +from protest.evals import ShortCircuit + +evaluators=[ + not_empty, # always runs + ShortCircuit([ + contains_expected_facts(min_score=0.3), # 0ms — if fail → stop + llm_judge(rubric="factual accuracy"), # 3s — skipped if above fails + ]), +] +``` + +`ShortCircuit` is a group of ordered evaluators. The first `Verdict=False` stops the group. Evaluators outside the `ShortCircuit` always run. 
+ ### Using Evaluators ```python diff --git a/protest/core/execution/test_executor.py b/protest/core/execution/test_executor.py index 8b475c6..2e7a9c4 100644 --- a/protest/core/execution/test_executor.py +++ b/protest/core/execution/test_executor.py @@ -361,9 +361,10 @@ async def _acquire_fixture_semaphores( def _build_eval_error(payload: EvalPayload) -> AssertionError: """Build a descriptive AssertionError from failed eval scores.""" - failed = [ - f"{name}={entry.value}" - for name, entry in payload.scores.items() - if not entry.passed - ] - return AssertionError(f"{', '.join(failed)}") + parts = [] + for name, entry in payload.scores.items(): + if entry.skipped: + parts.append(f"{name}=⊘") + elif not entry.passed: + parts.append(f"{name}={entry.value}") + return AssertionError(f"{', '.join(parts)}") diff --git a/protest/entities/events.py b/protest/entities/events.py index d76434c..afb8971 100644 --- a/protest/entities/events.py +++ b/protest/entities/events.py @@ -14,6 +14,7 @@ class EvalScoreEntry: value: float | bool | str passed: bool = True + skipped: bool = False @dataclass(frozen=True, slots=True) diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index 17b35c9..54f5ef6 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -1,6 +1,6 @@ """ProTest evals — native eval support.""" -from protest.evals.evaluator import EvalCase, EvalContext, Metric, Reason, Verdict, evaluator +from protest.evals.evaluator import EvalCase, EvalContext, Metric, Reason, ShortCircuit, Verdict, evaluator from protest.evals.session import EvalSession from protest.evals.types import ( EvalCaseResult, @@ -23,6 +23,7 @@ "ModelInfo", "Reason", "ScoreStats", + "ShortCircuit", "Verdict", "evaluator", ] diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 336df8d..cd8a615 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -80,6 +80,27 @@ def __repr__(self) -> str: return self.name or 
f"EvalCase({self.inputs!r})" +class ShortCircuit: + """Group evaluators with fail-fast behavior. + + The first Verdict=False stops the group. Evaluators outside + the ShortCircuit run regardless. + + Usage:: + + evaluators=[ + not_empty, + ShortCircuit([ + contains_expected_facts(min_score=0.5), + llm_judge(rubric="..."), # skipped if above fails + ]), + ] + """ + + def __init__(self, evaluators: list[Any]) -> None: + self.evaluators = evaluators + + class Metric: """Annotate a float/int field as a metric for stats aggregation.""" diff --git a/protest/evals/types.py b/protest/evals/types.py index 24082f1..ac61181 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -53,24 +53,27 @@ class EvalScore: name: str value: float | bool | str + skipped: bool = False @property def is_verdict(self) -> bool: - return isinstance(self.value, bool) + return not self.skipped and isinstance(self.value, bool) @property def is_metric(self) -> bool: - return isinstance(self.value, (int, float)) and not isinstance(self.value, bool) + return not self.skipped and isinstance(self.value, (int, float)) and not isinstance(self.value, bool) @property def is_reason(self) -> bool: - return isinstance(self.value, str) + return not self.skipped and isinstance(self.value, str) @property def passed(self) -> bool: + if self.skipped: + return True # skipped scores don't affect pass/fail if isinstance(self.value, bool): return self.value - return True # metrics and reasons always "pass" + return True @dataclass(frozen=True, slots=True) diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index c9087b6..0251f98 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -13,7 +13,7 @@ from typing import Any from protest.entities.events import EvalPayload, EvalScoreEntry -from protest.evals.evaluator import EvalContext, extract_scores_from_result +from protest.evals.evaluator import EvalContext, ShortCircuit, extract_scores_from_result from protest.evals.types 
import EvalScore @@ -65,6 +65,7 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: s.name: EvalScoreEntry( value=s.value, passed=s.passed, + skipped=s.skipped, ) for s in scores }, @@ -163,6 +164,10 @@ async def run_evaluators( scores: list[EvalScore] = [] for ev in evaluators: + if isinstance(ev, ShortCircuit): + scores.extend(await _run_short_circuit(ev.evaluators, ctx)) + continue + evaluator_name = getattr(ev, "__name__", type(ev).__name__) try: raw = ev(ctx) @@ -174,3 +179,28 @@ async def run_evaluators( raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc return scores + + +async def _run_short_circuit( + evaluators: list[Any], ctx: EvalContext[Any, Any], +) -> list[EvalScore]: + """Run evaluators in order, stop at first Verdict=False.""" + scores: list[EvalScore] = [] + for i, ev in enumerate(evaluators): + evaluator_name = getattr(ev, "__name__", type(ev).__name__) + try: + raw = ev(ctx) + result = await raw if asyncio.iscoroutine(raw) else raw + except Exception as exc: + from protest.exceptions import FixtureError + + raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc + extracted = extract_scores_from_result(result, evaluator_name) + scores.extend(extracted) + if any(s.is_verdict and not s.passed for s in extracted): + # Mark remaining evaluators as skipped + for skipped_ev in evaluators[i + 1 :]: + skipped_name = getattr(skipped_ev, "__name__", type(skipped_ev).__name__) + scores.append(EvalScore(name=skipped_name, value=False, skipped=True)) + break + return scores diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 8f263d9..414cb49 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -55,6 +55,9 @@ def _format_eval_scores_inline(result: TestResult) -> str: return "" parts = [] for name, entry in result.eval_payload.scores.items(): + if entry.skipped: + parts.append(f"{name}=⊘") + continue val = entry.value if isinstance(val, bool): 
parts.append(f"{name}={'✓' if val else '✗'}") diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 5fbb4e8..6e35762 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -866,6 +866,67 @@ def eval_echo(case: Annotated[dict, From(single_case)]) -> str: assert results[0].is_fixture_error is True +class TestShortCircuit: + """ShortCircuit: skip expensive evaluators when cheap ones fail.""" + + def test_short_circuit_skips_on_fail(self) -> None: + from protest.evals import ShortCircuit + + call_log: list[str] = [] + + @evaluator + def cheap(ctx: EvalContext) -> bool: + call_log.append("cheap") + return "hello" in ctx.output.lower() + + @evaluator + def expensive(ctx: EvalContext) -> bool: + call_log.append("expensive") + return True + + session = EvalSession() + + @session.eval(evaluators=[ShortCircuit([cheap, expensive])]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + # case_pass: cheap ✓ → expensive ✓ (both called) + # case_fail: cheap ✗ → expensive SKIPPED + assert call_log.count("cheap") == 2 + assert call_log.count("expensive") == 1 + + def test_short_circuit_all_pass(self) -> None: + from protest.evals import ShortCircuit + + call_log: list[str] = [] + + @evaluator + def check_a(ctx: EvalContext) -> bool: + call_log.append("a") + return True + + @evaluator + def check_b(ctx: EvalContext) -> bool: + call_log.append("b") + return True + + single = ForEach([{"inputs": "x", "expected": "x", "name": "c1"}], ids=lambda c: c["name"]) + session = EvalSession() + + @session.eval(evaluators=[ShortCircuit([check_a, check_b])]) + def eval_echo(case: Annotated[dict, From(single)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + result = runner.run() + + assert result.success is True + assert call_log == ["a", "b"] + + # --------------------------------------------------------------------------- # Results files 
per run # --------------------------------------------------------------------------- From 3ed68a4aab8764a88048b434c92c6679ac153012 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Mon, 30 Mar 2026 22:14:03 +0200 Subject: [PATCH 07/60] fix ci --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a402b70..75efa11 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -103,7 +103,7 @@ jobs: files: coverage.xml fail_ci_if_error: false -c docs: + docs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 From ad7a20714841cf1744b4a92bd2e3031e63748ea5 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Mon, 30 Mar 2026 07:45:00 +0200 Subject: [PATCH 08/60] =?UTF-8?q?chore:=20fix=20all=20lint=20=E2=80=94=20m?= =?UTF-8?q?ove=20imports=20to=20top-level,=20no=20lazy=20imports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- protest/__init__.py | 1 + protest/console.py | 14 ++-- protest/core/collector.py | 3 +- protest/core/execution/test_executor.py | 3 +- protest/core/runner.py | 14 ++-- protest/core/suite.py | 2 +- protest/di/container.py | 3 +- protest/di/hints.py | 13 +-- protest/di/validation.py | 3 +- protest/evals/__init__.py | 25 +++++- protest/evals/evaluator.py | 20 ++--- protest/evals/evaluators.py | 4 +- protest/evals/results_writer.py | 5 +- protest/evals/types.py | 6 +- protest/evals/wrapper.py | 13 ++- protest/reporting/ascii.py | 11 +-- protest/reporting/rich_reporter.py | 30 ++++--- tests/evals/test_e2e.py | 100 ++++++------------------ uv.lock | 2 +- 19 files changed, 115 insertions(+), 157 deletions(-) diff --git a/protest/__init__.py b/protest/__init__.py index 4509b37..97221b9 100644 --- a/protest/__init__.py +++ b/protest/__init__.py @@ -42,6 +42,7 @@ "__version__", "caplog", "collect_tests", + 
"console", "factory", "fixture", "list_tags", diff --git a/protest/console.py b/protest/console.py index 9270c16..29dd381 100644 --- a/protest/console.py +++ b/protest/console.py @@ -19,9 +19,13 @@ async def pipeline(): from __future__ import annotations +import contextlib import re import sys +from protest.events.types import Event +from protest.execution.capture import get_event_bus + def print(msg: str, *, raw: bool = False) -> None: """Print a message that bypasses test capture. @@ -33,22 +37,16 @@ def print(msg: str, *, raw: bool = False) -> None: msg: The message to print. Supports Rich markup unless raw=True. raw: If True, no markup processing — message passed as-is. """ - from protest.execution.capture import get_event_bus - bus = get_event_bus() if bus is None: _fallback_print(msg, raw) return - from protest.events.types import Event - # Call handlers directly (sync, bypasses async emit). # This ensures messages appear immediately, not after the test. - for handler_entry in bus._handlers.get(Event.USER_PRINT, []): - try: + for handler_entry in bus._handlers.get(Event.USER_PRINT, []): # type: ignore[union-attr] + with contextlib.suppress(Exception): handler_entry.func((msg, raw)) - except Exception: - pass def _fallback_print(msg: str, raw: bool) -> None: diff --git a/protest/core/collector.py b/protest/core/collector.py index 24356a8..d7c83db 100644 --- a/protest/core/collector.py +++ b/protest/core/collector.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Annotated, Any, get_args, get_origin from protest.di.decorators import get_fixture_marker, unwrap_fixture +from protest.di.hints import get_type_hints_compat from protest.di.markers import Use from protest.di.validation import _extract_from_params from protest.entities import FixtureCallable, SuitePath, TestItem, TestRegistration @@ -18,8 +19,6 @@ def _extract_use_fixtures(func: Callable[..., Any]) -> list[FixtureCallable]: """Extract fixtures referenced via Use() markers in function parameters.""" 
- from protest.di.hints import get_type_hints_compat - type_hints = get_type_hints_compat(func) fixtures: list[FixtureCallable] = [] diff --git a/protest/core/execution/test_executor.py b/protest/core/execution/test_executor.py index 2e7a9c4..3c065f2 100644 --- a/protest/core/execution/test_executor.py +++ b/protest/core/execution/test_executor.py @@ -12,6 +12,7 @@ from protest.core.collector import get_transitive_fixtures from protest.core.outcome import OutcomeBuilder, TestExecutionResult from protest.di.container import FixtureContainer +from protest.di.hints import get_type_hints_compat from protest.entities import ( FixtureCallable, TestItem, @@ -255,8 +256,6 @@ async def _resolve_test_kwargs( func_signature = signature(item.func) kwargs: dict[str, Any] = dict(item.case_kwargs) - from protest.di.hints import get_type_hints_compat - type_hints = get_type_hints_compat(item.func) for param_name, param in func_signature.parameters.items(): diff --git a/protest/core/runner.py b/protest/core/runner.py index 70669d0..4e58544 100644 --- a/protest/core/runner.py +++ b/protest/core/runner.py @@ -9,7 +9,7 @@ from protest.core.collector import Collector from protest.core.execution import ParallelExecutor, SuiteManager, TestExecutor from protest.core.outcome import OutcomeBuilder -from protest.core.session import ProTestSession +from protest.core.session import ProTestSession # noqa: TC001 — used at runtime from protest.core.tracker import SuiteTracker from protest.entities import ( RunResult, @@ -17,9 +17,12 @@ SessionSetupInfo, TestCounts, ) +from protest.evals.types import EvalCaseResult, EvalScore, EvalSuiteReport from protest.events.types import Event from protest.execution.capture import ( GlobalCapturePatch, + reset_event_bus, + set_event_bus, set_session_setup_capture, ) from protest.execution.context import cancellation_event @@ -27,7 +30,6 @@ if TYPE_CHECKING: from protest.entities.events import TestResult - from protest.evals.types import EvalCaseResult class 
TestRunner: @@ -77,7 +79,7 @@ def _collect_eval_result(self, result: TestResult) -> None: case_result = _build_eval_case_result(result) self._eval_results.setdefault(suite_name, []).append(case_result) - async def _main_loop(self) -> bool: + async def _main_loop(self) -> bool: # noqa: PLR0915 """The main async loop for running tests.""" session_start = time.perf_counter() @@ -100,8 +102,6 @@ async def _main_loop(self) -> bool: total_counts = TestCounts() # Inject cancellation event into context for teardown awareness - from protest.execution.capture import reset_event_bus, set_event_bus - cancel_token = cancellation_event.set( self._interrupt_handler.force_teardown_event ) @@ -190,8 +190,6 @@ async def _main_loop(self) -> bool: async def _emit_eval_suite_end(self, suite_path: Any) -> None: """Emit EVAL_SUITE_END if this suite_path corresponds to an eval suite.""" - from protest.evals.types import EvalSuiteReport - suite_name = ( suite_path.root_name if hasattr(suite_path, "root_name") @@ -210,8 +208,6 @@ async def _emit_eval_suite_end(self, suite_path: Any) -> None: def _build_eval_case_result(result: TestResult) -> EvalCaseResult: """Build EvalCaseResult from a TestResult with eval_payload.""" - from protest.evals.types import EvalCaseResult, EvalScore - payload = result.eval_payload assert payload is not None return EvalCaseResult( diff --git a/protest/core/suite.py b/protest/core/suite.py index dfb64c3..1a8da5d 100644 --- a/protest/core/suite.py +++ b/protest/core/suite.py @@ -21,6 +21,7 @@ normalize_skip, normalize_xfail, ) +from protest.evals.wrapper import make_eval_wrapper from protest.exceptions import ConcurrencyMismatchError, InvalidMaxConcurrencyError FuncT = TypeVar("FuncT", bound="Callable[..., object]") @@ -167,7 +168,6 @@ def eval( timeout: float | None = None, ) -> Callable[[FuncT], FuncT]: """Register a scored eval test on this suite.""" - from protest.evals.wrapper import make_eval_wrapper def decorator(func: FuncT) -> FuncT: wrapper = 
make_eval_wrapper( diff --git a/protest/di/container.py b/protest/di/container.py index 5c38571..3a85ae0 100644 --- a/protest/di/container.py +++ b/protest/di/container.py @@ -22,6 +22,7 @@ unwrap_fixture, ) from protest.di.factory import FixtureFactory +from protest.di.hints import get_type_hints_compat from protest.di.markers import Use from protest.di.proxy import FixtureErrorWrapper from protest.entities import ( @@ -780,8 +781,6 @@ def _analyze_and_store_dependencies( actual_func = unwrap_fixture(func) func_signature = signature(actual_func) - from protest.di.hints import get_type_hints_compat - type_hints = get_type_hints_compat(actual_func) dependencies: dict[str, FixtureCallable] = {} diff --git a/protest/di/hints.py b/protest/di/hints.py index ede4c12..bd6a89b 100644 --- a/protest/di/hints.py +++ b/protest/di/hints.py @@ -14,6 +14,7 @@ from __future__ import annotations +import contextlib import inspect import re from typing import Any, get_type_hints @@ -21,23 +22,17 @@ def get_type_hints_compat(func: Any) -> dict[str, Any]: """Resolve type hints with PEP 563 / TYPE_CHECKING fallbacks.""" - try: + with contextlib.suppress(Exception): return get_type_hints(func, include_extras=True) - except Exception: - pass # Build a namespace from the entire call stack (covers local fixtures). localns: dict[str, Any] = {} - try: + with contextlib.suppress(Exception): for frame_info in inspect.stack(): localns.update(frame_info.frame.f_locals) - except Exception: - pass - try: + with contextlib.suppress(Exception): return get_type_hints(func, localns=localns, include_extras=True) - except Exception: - pass # TYPE_CHECKING fallback: substitute Any for unresolvable names. 
return _get_type_hints_substituting_any(func, localns) diff --git a/protest/di/validation.py b/protest/di/validation.py index d716397..1026bca 100644 --- a/protest/di/validation.py +++ b/protest/di/validation.py @@ -5,6 +5,7 @@ from inspect import signature from typing import TYPE_CHECKING, Annotated, Any, get_args, get_origin +from protest.di.hints import get_type_hints_compat from protest.di.markers import ForEach, From from protest.exceptions import ParameterizedFixtureError from protest.utils import get_callable_name @@ -15,8 +16,6 @@ def _extract_from_params(func: Callable[..., Any]) -> dict[str, ForEach[Any]]: """Extract parameters annotated with From(source).""" - from protest.di.hints import get_type_hints_compat - type_hints = get_type_hints_compat(func) result: dict[str, ForEach[Any]] = {} diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index 54f5ef6..fdb5115 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -1,7 +1,14 @@ """ProTest evals — native eval support.""" -from protest.evals.evaluator import EvalCase, EvalContext, Metric, Reason, ShortCircuit, Verdict, evaluator -from protest.evals.session import EvalSession +from protest.evals.evaluator import ( + EvalCase, + EvalContext, + Metric, + Reason, + ShortCircuit, + Verdict, + evaluator, +) from protest.evals.types import ( EvalCaseResult, EvalScore, @@ -15,11 +22,11 @@ "EvalCase", "EvalCaseResult", "EvalContext", - "Metric", "EvalScore", "EvalSession", "EvalSuiteReport", "JudgeInfo", + "Metric", "ModelInfo", "Reason", "ScoreStats", @@ -27,3 +34,15 @@ "Verdict", "evaluator", ] + + +def __getattr__(name: str) -> object: + # EvalSession imports protest.core.session which imports reporters, + # and reporters import protest.evals.types — eagerly importing + # EvalSession here would create a circular import chain. 
+ if name == "EvalSession": + from protest.evals.session import EvalSession + + return EvalSession + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index cd8a615..61a8a72 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -38,18 +38,18 @@ async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: from dataclasses import dataclass, field from typing import Any, Generic, TypeVar -I = TypeVar("I") -O = TypeVar("O") +InputT = TypeVar("InputT") +OutputT = TypeVar("OutputT") @dataclass -class EvalContext(Generic[I, O]): +class EvalContext(Generic[InputT, OutputT]): """Context passed to evaluator functions.""" name: str - inputs: I - output: O - expected_output: O | None + inputs: InputT + output: OutputT + expected_output: OutputT | None metadata: Any duration: float @@ -138,15 +138,15 @@ def extract_scores_from_result(result: Any, evaluator_name: str) -> list[Any]: if ann is None or get_origin(ann) is not Annotated: continue for meta in get_args(ann)[1:]: - if isinstance(meta, type) and issubclass(meta, (Metric, Verdict, Reason)): + if isinstance(meta, type) and issubclass( + meta, (Metric, Verdict, Reason) + ): scores.append(EvalScore(name=f.name, value=getattr(result, f.name))) break return scores type_name = type(result).__name__ - raise TypeError( - f"Evaluator must return bool or dataclass, got {type_name}" - ) + raise TypeError(f"Evaluator must return bool or dataclass, got {type_name}") def evaluator(fn: Any) -> Any: diff --git a/protest/evals/evaluators.py b/protest/evals/evaluators.py index b9b1475..d2cd632 100644 --- a/protest/evals/evaluators.py +++ b/protest/evals/evaluators.py @@ -44,7 +44,9 @@ class WordOverlapResult: @evaluator -def contains_keywords(ctx: EvalContext, keywords: list[str], min_recall: float = 0.0) -> ContainsKeywordsResult: +def contains_keywords( + ctx: EvalContext, keywords: list[str], min_recall: 
float = 0.0 +) -> ContainsKeywordsResult: """Check that the output contains expected keywords (case-insensitive).""" output_lower = ctx.output.lower() found = sum(1 for kw in keywords if kw.lower() in output_lower) diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py index 0054e25..0c670a8 100644 --- a/protest/evals/results_writer.py +++ b/protest/evals/results_writer.py @@ -140,10 +140,7 @@ def _render_case(case: EvalCaseResult) -> str: def _format_score(score: EvalScore) -> str: - if score.is_metric: - icon = "·" - else: - icon = "✓" if score.passed else "✗" + icon = "·" if score.is_metric else ("✓" if score.passed else "✗") return f"- **{score.name}**: {score.value} {icon}" diff --git a/protest/evals/types.py b/protest/evals/types.py index ac61181..121264f 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -61,7 +61,11 @@ def is_verdict(self) -> bool: @property def is_metric(self) -> bool: - return not self.skipped and isinstance(self.value, (int, float)) and not isinstance(self.value, bool) + return ( + not self.skipped + and isinstance(self.value, (int, float)) + and not isinstance(self.value, bool) + ) @property def is_reason(self) -> bool: diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index 0251f98..537282b 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -13,7 +13,11 @@ from typing import Any from protest.entities.events import EvalPayload, EvalScoreEntry -from protest.evals.evaluator import EvalContext, ShortCircuit, extract_scores_from_result +from protest.evals.evaluator import ( + EvalContext, + ShortCircuit, + extract_scores_from_result, +) from protest.evals.types import EvalScore @@ -182,7 +186,8 @@ async def run_evaluators( async def _run_short_circuit( - evaluators: list[Any], ctx: EvalContext[Any, Any], + evaluators: list[Any], + ctx: EvalContext[Any, Any], ) -> list[EvalScore]: """Run evaluators in order, stop at first Verdict=False.""" scores: list[EvalScore] = 
[] @@ -200,7 +205,9 @@ async def _run_short_circuit( if any(s.is_verdict and not s.passed for s in extracted): # Mark remaining evaluators as skipped for skipped_ev in evaluators[i + 1 :]: - skipped_name = getattr(skipped_ev, "__name__", type(skipped_ev).__name__) + skipped_name = getattr( + skipped_ev, "__name__", type(skipped_ev).__name__ + ) scores.append(EvalScore(name=skipped_name, value=False, skipped=True)) break return scores diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index a52c509..ea4040d 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -1,9 +1,11 @@ +import sys import traceback from pathlib import Path from typing import Any from typing_extensions import Self +from protest.console import strip_markup from protest.entities import ( FixtureInfo, HandlerInfo, @@ -19,6 +21,7 @@ TestStartInfo, TestTeardownInfo, ) +from protest.evals.types import EvalSuiteReport from protest.plugin import PluginBase, PluginContext from protest.reporting.verbosity import Verbosity @@ -149,8 +152,6 @@ def on_test_teardown_start(self, info: TestTeardownInfo) -> None: @staticmethod def _print_bypass(msg: str) -> None: - import sys - stream = getattr(sys.stdout, "_original", sys.stdout) stream.write(msg + "\n") stream.flush() @@ -260,10 +261,6 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: print(f" {line}") def on_user_print(self, data: Any) -> None: - import sys - - from protest.console import strip_markup - msg, raw = data text = msg if raw else strip_markup(msg) stream = getattr(sys.stdout, "_original", sys.stdout) @@ -271,8 +268,6 @@ def on_user_print(self, data: Any) -> None: stream.flush() def on_eval_suite_end(self, report: Any) -> None: - from protest.evals.types import EvalSuiteReport - if not isinstance(report, EvalSuiteReport): return stats = report.all_score_stats() diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 414cb49..5e1e96b 100644 --- 
a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -1,9 +1,12 @@ +import logging +import sys import traceback from argparse import ArgumentParser from pathlib import Path from typing import Any from rich.console import Console # type: ignore[import-not-found] +from rich.table import Table # type: ignore[import-not-found] from typing_extensions import Self from protest.entities import ( @@ -21,6 +24,7 @@ TestStartInfo, TestTeardownInfo, ) +from protest.evals.types import EvalSuiteReport from protest.plugin import PluginBase, PluginContext from protest.reporting.verbosity import Verbosity @@ -151,8 +155,6 @@ def _maybe_show_logs(self, result: TestResult) -> None: """Show captured log records if --show-logs is active.""" if not self._show_logs or not result.log_records: return - import logging - min_level = getattr(logging, self._show_logs.upper(), logging.INFO) for record in result.log_records: if record.levelno >= min_level: @@ -170,10 +172,6 @@ def _maybe_show_logs(self, result: TestResult) -> None: def _print_bypass(self, message: str) -> None: """Print bypassing capture (for lifecycle messages emitted during tests).""" - import sys - - from rich.console import Console - stream = getattr(sys.stdout, "_original", sys.stdout) Console(file=stream, highlight=False).print(message) @@ -397,10 +395,6 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: self._print(f"[dim]{escaped_line}[/]") def on_user_print(self, data: Any) -> None: - import sys - - from rich.console import Console - msg, raw = data # Write to the real stdout, bypassing capture stream = getattr(sys.stdout, "_original", sys.stdout) @@ -411,10 +405,6 @@ def on_user_print(self, data: Any) -> None: c.print(f"[dim] │[/] {msg}") def on_eval_suite_end(self, report: Any) -> None: - from rich.table import Table - - from protest.evals.types import EvalSuiteReport - if not isinstance(report, EvalSuiteReport): return stats = report.all_score_stats() @@ 
-444,8 +434,16 @@ def on_eval_suite_end(self, report: Any) -> None: self._print( f" [cyan]Eval: {report.suite_name} ({report.total_count} cases)[/]" ) - rate_pct = report.pass_rate * 100 - color = "green" if rate_pct >= 100 else "yellow" if rate_pct >= 50 else "red" + full_pass = 100 + half_pass = 50 + rate_pct = report.pass_rate * full_pass + color = ( + "green" + if rate_pct >= full_pass + else "yellow" + if rate_pct >= half_pass + else "red" + ) self._print( f" [{color}]Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)[/]" ) diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 6e35762..9bdaead 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -13,13 +13,25 @@ from __future__ import annotations import json +import subprocess from dataclasses import dataclass -from pathlib import Path +from pathlib import Path # noqa: TC003 — used at runtime (pytest tmp_path) from typing import Annotated, Any -from protest import ForEach, From, ProTestSession +from protest import ForEach, From, ProTestSession, Use, fixture +from protest.api import run_session +from protest.core.collector import Collector from protest.core.runner import TestRunner -from protest.evals import EvalContext, EvalSession, Metric, ModelInfo, Verdict, evaluator +from protest.core.suite import ProTestSuite +from protest.evals import ( + EvalContext, + EvalSession, + Metric, + ModelInfo, + ShortCircuit, + Verdict, + evaluator, +) from protest.evals.evaluators import ( contains_expected, contains_keywords, @@ -31,6 +43,12 @@ not_empty, word_overlap, ) +from protest.evals.hashing import compute_case_hash, compute_eval_hash +from protest.evals.results_writer import EvalResultsWriter +from protest.evals.types import EvalSuiteReport # noqa: TC001 — used at runtime +from protest.filters.kind import KindFilterPlugin +from protest.history.storage import append_entry, clean_dirty +from protest.plugin import PluginBase, PluginContext # 
--------------------------------------------------------------------------- # Fixtures: deterministic evaluators + task @@ -159,8 +177,6 @@ class TestKindFiltering: """Suites have kind, filtering works.""" def test_test_suite_has_kind_test(self) -> None: - from protest.core.suite import ProTestSuite - suite = ProTestSuite("my_tests") assert suite.kind == "test" @@ -174,9 +190,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: assert any(s.kind == "eval" for s in session._suites) def test_kind_filter_keeps_only_matching(self) -> None: - from protest.core.suite import ProTestSuite - from protest.filters.kind import KindFilterPlugin - test_suite = ProTestSuite("tests") eval_suite = ProTestSuite("evals", kind="eval") @@ -193,8 +206,6 @@ def eval_one() -> None: session.add_suite(test_suite) session.add_suite(eval_suite) - from protest.core.collector import Collector - items = Collector().collect(session) assert len(items) == 2 @@ -206,8 +217,6 @@ def eval_one() -> None: def test_unified_session_runs_tests_only(self) -> None: """protest run behavior: only kind=test suites.""" - from protest.core.suite import ProTestSuite - session = ProTestSession() test_suite = ProTestSuite("unit") @@ -223,9 +232,6 @@ def test_a() -> None: def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) - from protest.api import run_session - from protest.plugin import PluginContext - ctx = PluginContext(args={"kind_filter": "test"}) run_session(session, ctx=ctx) @@ -233,8 +239,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: def test_unified_session_runs_evals_only(self) -> None: """protest eval behavior: only kind=eval suites.""" - from protest.core.suite import ProTestSuite - session = ProTestSession() test_suite = ProTestSuite("unit") @@ -250,9 +254,6 @@ def test_a() -> None: def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) - from protest.api import run_session - 
from protest.plugin import PluginContext - ctx = PluginContext(args={"kind_filter": "eval"}) run_session(session, ctx=ctx) @@ -272,9 +273,6 @@ class TestEvalOutput: """ def test_report_contains_score_stats(self) -> None: - from protest.evals.types import EvalSuiteReport - from protest.plugin import PluginBase - reports: list[EvalSuiteReport] = [] class ReportCapture(PluginBase): @@ -300,9 +298,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: assert any(s.name == "accuracy" for s in stats) def test_report_has_pass_count(self) -> None: - from protest.evals.types import EvalSuiteReport - from protest.plugin import PluginBase - reports: list[EvalSuiteReport] = [] class ReportCapture(PluginBase): @@ -327,8 +322,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: def test_failed_eval_has_error_with_score_details(self) -> None: """When an eval case fails, the error message includes score details.""" - from protest.plugin import PluginBase - errors: list[Any] = [] class ErrorCollector(PluginBase): @@ -345,8 +338,6 @@ def on_test_fail(self, result: Any) -> None: def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) - from protest.api import run_session - run_session(session) # case_fail has matches_expected=False @@ -362,8 +353,6 @@ class TestEvalPayloadFlow: """EvalPayload flows through the framework correctly.""" def test_test_result_has_eval_payload(self) -> None: - from protest.plugin import PluginBase - collected: list[Any] = [] class Collector(PluginBase): @@ -395,8 +384,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: def test_lifecycle_events_have_case_id_in_node_id(self) -> None: """setup_done/teardown_start events carry node_id with [case_id].""" - from protest.plugin import PluginBase - setup_ids: list[str] = [] teardown_ids: list[str] = [] @@ -427,8 +414,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: def 
test_evaluator_exception_is_error_not_fail(self) -> None: """An evaluator that raises is treated as error (infra), not test fail.""" - from protest.plugin import PluginBase - results: list[Any] = [] class Collector(PluginBase): @@ -463,8 +448,6 @@ def eval_echo(case: Annotated[dict, From(single_case)]) -> str: assert "LLM judge timeout" in str(results[0].error) def test_non_eval_test_has_no_payload(self) -> None: - from protest.plugin import PluginBase - collected: list[Any] = [] class Collector(PluginBase): @@ -497,8 +480,6 @@ class TestHistory: """JSONL history format and querying.""" def _run_eval(self, tmp_path: Path) -> None: - from protest.api import run_session - session = EvalSession(model=ModelInfo(name="test-model"), history_dir=tmp_path) @session.eval(evaluators=[fake_accuracy]) @@ -540,8 +521,6 @@ def test_history_entry_format(self, tmp_path: Path) -> None: assert "cases" in suite def test_history_test_run_has_null_evals(self, tmp_path: Path) -> None: - from protest.api import run_session - session = ProTestSession(history=True, history_dir=tmp_path) @session.test() @@ -561,8 +540,6 @@ def test_history_multiple_runs_append(self, tmp_path: Path) -> None: assert len(lines) == 2 def test_history_metadata_included(self, tmp_path: Path) -> None: - from protest.api import run_session - session = EvalSession( history_dir=tmp_path, metadata={"env": "test", "version": "1.0"}, @@ -589,13 +566,9 @@ class TestCleanDirty: def test_clean_dirty_removes_current_head_only(self, tmp_path: Path) -> None: # Entry with current HEAD + dirty - import subprocess - - from protest.history.storage import append_entry, clean_dirty - try: current_commit = subprocess.run( - ["git", "rev-parse", "HEAD"], + ["git", "rev-parse", "HEAD"], # noqa: S607 capture_output=True, text=True, timeout=5, @@ -634,8 +607,6 @@ class TestCaseHashing: def test_case_hash_stored_in_history(self, tmp_path: Path) -> None: """History entries include case_hash and eval_hash per case.""" - from protest.api 
import run_session - session = EvalSession(history_dir=tmp_path) @session.eval(evaluators=[fake_accuracy]) @@ -656,24 +627,18 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: def test_case_hash_changes_on_input_change(self) -> None: """Different inputs -> different case_hash.""" - from protest.evals.hashing import compute_case_hash - h1 = compute_case_hash("hello world", "expected") h2 = compute_case_hash("hello world modified", "expected") assert h1 != h2 def test_case_hash_stable_for_same_input(self) -> None: """Same inputs -> same case_hash (deterministic).""" - from protest.evals.hashing import compute_case_hash - h1 = compute_case_hash("hello world", "expected") h2 = compute_case_hash("hello world", "expected") assert h1 == h2 def test_eval_hash_changes_on_evaluator_change(self) -> None: """Different evaluators -> different eval_hash.""" - from protest.evals.hashing import compute_eval_hash - e1 = contains_keywords(keywords=["hello"]) e2 = contains_keywords(keywords=["hello", "world"]) h1 = compute_eval_hash([e1]) @@ -762,8 +727,6 @@ class TestScoringV2: def test_bool_evaluator_pass(self) -> None: """Evaluator returning True -> case passes.""" - from protest.plugin import PluginBase - results: list[Any] = [] class Collector(PluginBase): @@ -798,8 +761,6 @@ def eval_echo(case: Annotated[dict, From(single_case)]) -> str: def test_dataclass_without_bool_is_tracking_only(self) -> None: """Dataclass with only float fields -> tracking-only, always passes.""" - from protest.plugin import PluginBase - results: list[Any] = [] class Collector(PluginBase): @@ -833,8 +794,6 @@ def eval_echo(case: Annotated[dict, From(single_case)]) -> str: def test_float_return_raises_type_error(self) -> None: """Evaluator returning naked float -> TypeError (caught as fixture error).""" - from protest.plugin import PluginBase - results: list[Any] = [] class Collector(PluginBase): @@ -870,8 +829,6 @@ class TestShortCircuit: """ShortCircuit: skip expensive evaluators when 
cheap ones fail.""" def test_short_circuit_skips_on_fail(self) -> None: - from protest.evals import ShortCircuit - call_log: list[str] = [] @evaluator @@ -899,8 +856,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: assert call_log.count("expensive") == 1 def test_short_circuit_all_pass(self) -> None: - from protest.evals import ShortCircuit - call_log: list[str] = [] @evaluator @@ -913,7 +868,9 @@ def check_b(ctx: EvalContext) -> bool: call_log.append("b") return True - single = ForEach([{"inputs": "x", "expected": "x", "name": "c1"}], ids=lambda c: c["name"]) + single = ForEach( + [{"inputs": "x", "expected": "x", "name": "c1"}], ids=lambda c: c["name"] + ) session = EvalSession() @session.eval(evaluators=[ShortCircuit([check_a, check_b])]) @@ -936,8 +893,6 @@ class TestResultsFiles: """Per-case markdown files written to .protest/results/_/.""" def _run_eval(self, tmp_path: Path) -> Path: - from protest.evals.results_writer import EvalResultsWriter - results_dir = tmp_path / "results" session = EvalSession() writer = EvalResultsWriter(history_dir=tmp_path) @@ -994,8 +949,6 @@ class TestMultiDatasetHistory: """Multiple @session.eval calls produce distinct suites in history.""" def _run_multi(self, tmp_path: Path) -> dict[str, Any]: - from protest.api import run_session - pipeline_cases = ForEach( [ {"inputs": "hello", "expected": "hello", "name": "c1"}, @@ -1059,7 +1012,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: def test_task_with_session_fixture_is_injected(self) -> None: """Une fixture session-scoped est injectee dans task via Use().""" - from protest import Use, fixture @fixture() def prefix_service() -> str: @@ -1090,8 +1042,6 @@ async def eval_prefixed( def test_session_fixture_resolved_once_for_all_cases(self) -> None: """Une session fixture ne doit etre appelee qu'une fois meme avec N cas.""" - from protest import Use, fixture - call_count = 0 @fixture() diff --git a/uv.lock b/uv.lock index aa650bb..34a6ee8 
100644 --- a/uv.lock +++ b/uv.lock @@ -764,7 +764,7 @@ wheels = [ [[package]] name = "protest" -version = "0.1.1" +version = "0.1.2" source = { editable = "." } dependencies = [ { name = "typing-extensions" }, From 5f5e9a03cde8ef815d6379c54fd5536944b67991 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:09:40 +0200 Subject: [PATCH 09/60] =?UTF-8?q?feat(evals):=20Judge=20protocol=20?= =?UTF-8?q?=E2=80=94=20LLM-as-judge=20via=20inversion=20of=20dependency?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ProTest owns the interface, user plugs in their LLM library. - Judge protocol: `async judge(prompt, output_type) -> JudgeResponse[T]` - JudgeResponse wraps output with optional tokens/cost tracking - EvalContext.judge() unwraps for evaluators, accumulates usage stats - JudgeInfo auto-derived from instance for history - EvalPayload carries judge_call_count, tokens, cost per case - EvalSession(judge=MyJudge()) wires through to evaluators - suite.eval(judge=) for standalone usage - 19 new tests (protocol, ctx.judge, e2e, structured output, tokens) --- docs/evals.md | 119 ++++++++++++- protest/core/session.py | 2 + protest/core/suite.py | 2 + protest/entities/events.py | 4 + protest/evals/__init__.py | 4 + protest/evals/evaluator.py | 51 +++++- protest/evals/session.py | 12 +- protest/evals/types.py | 57 +++++- protest/evals/wrapper.py | 16 +- tests/evals/test_judge.py | 354 +++++++++++++++++++++++++++++++++++++ 10 files changed, 607 insertions(+), 14 deletions(-) create mode 100644 tests/evals/test_judge.py diff --git a/docs/evals.md b/docs/evals.md index b8cd74b..e13812b 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -162,6 +162,8 @@ The threshold (`min_recall`) is a parameter of the evaluator, not a framework co ### Async (LLM Judge) +Use `ctx.judge()` for structured LLM evaluation (requires `judge=` on `EvalSession`): + ```python @dataclass class 
JudgeResult: @@ -171,11 +173,15 @@ class JudgeResult: @evaluator async def llm_judge(ctx: EvalContext, rubric: str = "", min_score: float = 0.7) -> JudgeResult: - result = await judge_agent.run(f"Evaluate: {ctx.output}\nCriteria: {rubric}") - score = parse_score(result) - return JudgeResult(accuracy=score, accurate_enough=score >= min_score, reason=result.explanation) + return await ctx.judge( + f"Evaluate this response on a 0-1 scale.\n\n" + f"Response: {ctx.output}\nCriteria: {rubric}", + JudgeResult, + ) ``` +The judge handles structured output — no text parsing needed. See [Judge](#judge) for setup. + ### Per-Case Thresholds Different thresholds per case = different evaluator bindings: @@ -218,14 +224,16 @@ EvalCase(inputs="...", evaluators=[llm_judge(rubric="Check factual accuracy")]) ### EvalContext -| Field | Type | Description | -|-------|------|-------------| +| Field / Method | Type | Description | +|----------------|------|-------------| | `name` | `str` | Case name | | `inputs` | `I` | Case inputs | | `output` | `O` | Task return value | | `expected_output` | `O \| None` | From `EvalCase.expected` | | `metadata` | `Any` | From `EvalCase.metadata` | | `duration` | `float` | Task execution time (seconds) | +| `judge(prompt, type)` | `async` | Call the configured LLM judge (see [Judge](#judge)) | +| `judge_call_count` | `int` | Number of judge calls made | ### Built-in Evaluators @@ -270,6 +278,107 @@ async def pipeline_eval( session = EvalSession(model=ModelInfo(name="qwen-2.5")) ``` +## Judge + +A `Judge` is a protocol for LLM-as-judge evaluators. ProTest owns the interface — you plug in your LLM library. + +### The Protocol + +```python +class Judge(Protocol): + async def judge(self, prompt: str, output_type: type[T]) -> T: ... +``` + +Minimal contract: takes a prompt and a return type, returns a typed result. All configuration (model, temperature, system prompt, max_tokens) lives in your implementation's constructor, not in the protocol. 
+ +### Writing a Judge + +The `judge()` method returns a `JudgeResponse[T]` that wraps the output with optional usage stats: + +```python +from pydantic_ai import Agent +from protest.evals import JudgeResponse + +class PydanticAIJudge: + name = "gpt-4o-mini" # used in history + provider = "openai" # optional, used in history + + def __init__(self, model: str = "gpt-4o-mini", temperature: float = 0): + self.model = model + self.temperature = temperature + + async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: + agent = Agent(self.model, output_type=output_type) + result = await agent.run(prompt) + usage = result.usage() + return JudgeResponse( + output=result.output, + input_tokens=usage.request_tokens, + output_tokens=usage.response_tokens, + cost=usage.request_tokens * 0.15/1e6 + usage.response_tokens * 0.60/1e6, + ) +``` + +Tokens and cost are optional — omit them if your provider doesn't expose usage data: + +```python +return JudgeResponse(output=result.output) # tokens/cost = None, that's fine +``` + +### Configuring the Judge + +```python +session = EvalSession( + model=ModelInfo(name="qwen-2.5"), + judge=PydanticAIJudge(model="gpt-4o-mini", temperature=0), +) +``` + +`JudgeInfo` (name, provider) is derived automatically from the instance for history tracking. + +### Using the Judge in Evaluators + +Evaluators access the judge via `ctx.judge()`: + +```python +@dataclass +class JudgeResult: + accurate: Annotated[bool, Verdict] + reason: Annotated[str, Reason] = "" + +@evaluator +async def llm_rubric(ctx: EvalContext, rubric: str = "") -> JudgeResult: + return await ctx.judge( + f"Evaluate this response.\n\nResponse: {ctx.output}\nCriteria: {rubric}", + JudgeResult, # structured output — no text parsing + ) +``` + +For simple verdicts, use `bool` or `str` as `output_type`: + +```python +@evaluator +async def simple_judge(ctx: EvalContext) -> bool: + return await ctx.judge(f"Is this a valid answer? 
{ctx.output}", bool) +``` + +### No Judge Configured + +If an evaluator calls `ctx.judge()` and no judge was passed to `EvalSession`, a `RuntimeError` is raised. This is treated as an **infrastructure error** (not a test failure), same as a fixture crash. + +### Usage Tracking + +Each call to `ctx.judge()` is counted. Tokens and cost from `JudgeResponse` are accumulated per case and flow to `EvalPayload`: + +| Field | Description | +|-------|-------------| +| `judge_call_count` | Number of judge calls | +| `judge_input_tokens` | Total input tokens | +| `judge_output_tokens` | Total output tokens | +| `judge_cost` | Total cost (user-computed) | + +These are available in history, letting you track LLM usage across runs. + ## Evaluator Errors If an evaluator raises an exception (e.g. LLM judge timeout), the case is marked as **error** (not fail). The stack trace appears in the output. diff --git a/protest/core/session.py b/protest/core/session.py index 3224028..59962c5 100644 --- a/protest/core/session.py +++ b/protest/core/session.py @@ -86,6 +86,7 @@ def __init__( self._metadata: dict[str, Any] = dict(metadata) if metadata else {} self._eval_model: ModelInfo | None = None # set by EvalSession self._eval_judge: JudgeInfo | None = None # set by EvalSession + self._eval_judge_instance: Any = None # set by EvalSession async def resolve_autouse(self) -> None: """Resolve all session autouse fixtures at session start.""" @@ -241,6 +242,7 @@ def decorator(func: FuncT) -> FuncT: func, evaluators or [], expected_key, + judge=self._eval_judge_instance, ) suite.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) self.add_suite(suite) diff --git a/protest/core/suite.py b/protest/core/suite.py index 1a8da5d..262d908 100644 --- a/protest/core/suite.py +++ b/protest/core/suite.py @@ -166,6 +166,7 @@ def eval( expected_key: str = "expected", tags: list[str] | None = None, timeout: float | None = None, + judge: Any = None, ) -> Callable[[FuncT], FuncT]: """Register a scored eval 
test on this suite.""" @@ -174,6 +175,7 @@ def decorator(func: FuncT) -> FuncT: func, evaluators or [], expected_key, + judge=judge, ) self.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) return func diff --git a/protest/entities/events.py b/protest/entities/events.py index afb8971..33b43b2 100644 --- a/protest/entities/events.py +++ b/protest/entities/events.py @@ -30,6 +30,10 @@ class EvalPayload: scores: dict[str, EvalScoreEntry] = field(default_factory=dict) case_hash: str = "" eval_hash: str = "" + judge_call_count: int = 0 + judge_input_tokens: int = 0 + judge_output_tokens: int = 0 + judge_cost: float = 0.0 @dataclass(frozen=True, slots=True) diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index fdb5115..8e53005 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -13,7 +13,9 @@ EvalCaseResult, EvalScore, EvalSuiteReport, + Judge, JudgeInfo, + JudgeResponse, ModelInfo, ScoreStats, ) @@ -25,7 +27,9 @@ "EvalScore", "EvalSession", "EvalSuiteReport", + "Judge", "JudgeInfo", + "JudgeResponse", "Metric", "ModelInfo", "Reason", diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 61a8a72..701fe5c 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -36,10 +36,14 @@ async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: import functools import inspect from dataclasses import dataclass, field -from typing import Any, Generic, TypeVar +from typing import TYPE_CHECKING, Any, Generic, TypeVar + +if TYPE_CHECKING: + from protest.evals.types import Judge InputT = TypeVar("InputT") OutputT = TypeVar("OutputT") +T = TypeVar("T") @dataclass @@ -52,6 +56,51 @@ class EvalContext(Generic[InputT, OutputT]): expected_output: OutputT | None metadata: Any duration: float + _judge: Judge | None = field(default=None, repr=False) + _judge_call_count: int = field(default=0, repr=False, init=False) + _judge_input_tokens: int = field(default=0, repr=False, init=False) + 
_judge_output_tokens: int = field(default=0, repr=False, init=False) + _judge_cost: float = field(default=0.0, repr=False, init=False) + + async def judge(self, prompt: str, output_type: type[T]) -> T: + """Call the configured LLM judge and return the typed output. + + Tokens and cost from JudgeResponse are accumulated internally + and flow to EvalPayload for history/display. The evaluator + only sees the unwrapped output. + + Raises RuntimeError if no judge was configured on the session. + """ + if self._judge is None: + raise RuntimeError( + f"Evaluator for case '{self.name}' called ctx.judge() but no " + "judge is configured. Pass judge= to EvalSession()." + ) + self._judge_call_count += 1 + response = await self._judge.judge(prompt, output_type) + if response.input_tokens is not None: + self._judge_input_tokens += response.input_tokens + if response.output_tokens is not None: + self._judge_output_tokens += response.output_tokens + if response.cost is not None: + self._judge_cost += response.cost + return response.output + + @property + def judge_call_count(self) -> int: + return self._judge_call_count + + @property + def judge_input_tokens(self) -> int: + return self._judge_input_tokens + + @property + def judge_output_tokens(self) -> int: + return self._judge_output_tokens + + @property + def judge_cost(self) -> float: + return self._judge_cost @dataclass diff --git a/protest/evals/session.py b/protest/evals/session.py index 82bea35..81f22d9 100644 --- a/protest/evals/session.py +++ b/protest/evals/session.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: from pathlib import Path - from protest.evals.types import JudgeInfo, ModelInfo + from protest.evals.types import Judge, ModelInfo class EvalSession(ProTestSession): @@ -28,7 +28,7 @@ def __init__( self, *, model: ModelInfo | None = None, - judge: JudgeInfo | None = None, + judge: Judge | None = None, concurrency: int = 1, history: bool = True, history_dir: Path | None = None, @@ -41,4 +41,10 @@ def __init__( 
metadata=metadata, ) self._eval_model = model - self._eval_judge = judge + self._eval_judge_instance: Judge | None = judge + if judge is not None: + from protest.evals.types import JudgeInfo + + self._eval_judge = JudgeInfo.from_instance(judge) + else: + self._eval_judge = None diff --git a/protest/evals/types.py b/protest/evals/types.py index 121264f..e928c86 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -4,7 +4,55 @@ import statistics from dataclasses import dataclass, field -from typing import Any +from typing import Any, Generic, Protocol, TypeVar, runtime_checkable + +T = TypeVar("T") + + +@dataclass(frozen=True, slots=True) +class JudgeResponse(Generic[T]): + """Return type for Judge.judge() — wraps the output with optional usage stats. + + Evaluators never see this: ``ctx.judge()`` unwraps and returns ``output``. + ProTest accumulates tokens/cost for history and display. + + Usage:: + + return JudgeResponse( + output=result.output, + input_tokens=usage.request_tokens, + output_tokens=usage.response_tokens, + cost=0.003, + ) + + # Or minimal — tokens/cost are optional: + return JudgeResponse(output=result.output) + """ + + output: T + input_tokens: int | None = None + output_tokens: int | None = None + cost: float | None = None + + +@runtime_checkable +class Judge(Protocol): + """Protocol for LLM judge implementations. + + All configuration (model, temperature, system_prompt, max_tokens) + lives in the constructor of the implementation, NOT in this protocol. + + Usage:: + + class MyJudge: + async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: + result = await agent.run(prompt) + return JudgeResponse(output=result.output, input_tokens=100) + + session = EvalSession(judge=MyJudge()) + """ + + async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: ... @dataclass(frozen=True, slots=True) @@ -40,6 +88,13 @@ class JudgeInfo: evaluators: tuple[str, ...] 
= () extra: dict[str, Any] = field(default_factory=dict) + @classmethod + def from_instance(cls, judge: Judge) -> JudgeInfo: + """Extract metadata from a Judge instance (duck-typed).""" + name = getattr(judge, "name", None) or type(judge).__name__ + provider = getattr(judge, "provider", None) + return cls(name=str(name), provider=provider) + @dataclass(frozen=True, slots=True) class EvalScore: diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index 537282b..b94c217 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -25,6 +25,7 @@ def make_eval_wrapper( func: Any, evaluators: list[Any], expected_key: str, + judge: Any = None, ) -> Any: """Wrap a function to run evaluators on its return value.""" @@ -46,7 +47,7 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: per_case = _extract_per_case_evaluators(kwargs) all_evaluators.extend(per_case) - scores = await run_evaluators( + scores, eval_ctx = await run_evaluators( all_evaluators, case_name, inputs, @@ -54,6 +55,7 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: expected, metadata, task_duration, + judge=judge, ) from protest.evals.hashing import compute_case_hash, compute_eval_hash @@ -75,6 +77,10 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: }, case_hash=compute_case_hash(inputs, expected), eval_hash=compute_eval_hash(all_evaluators), + judge_call_count=eval_ctx.judge_call_count, + judge_input_tokens=eval_ctx.judge_input_tokens, + judge_output_tokens=eval_ctx.judge_output_tokens, + judge_cost=eval_ctx.judge_cost, ) return eval_wrapper @@ -155,8 +161,9 @@ async def run_evaluators( expected_output: Any, metadata: Any, duration: float, -) -> list[EvalScore]: - """Run evaluators and convert results to EvalScores.""" + judge: Any = None, +) -> tuple[list[EvalScore], EvalContext[Any, Any]]: + """Run evaluators and return (scores, ctx with judge stats).""" ctx = EvalContext( name=case_name, inputs=inputs, @@ -164,6 +171,7 @@ async def run_evaluators( 
expected_output=expected_output, metadata=metadata, duration=duration, + _judge=judge, ) scores: list[EvalScore] = [] @@ -182,7 +190,7 @@ async def run_evaluators( raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc - return scores + return scores, ctx async def _run_short_circuit( diff --git a/tests/evals/test_judge.py b/tests/evals/test_judge.py new file mode 100644 index 0000000..10106d9 --- /dev/null +++ b/tests/evals/test_judge.py @@ -0,0 +1,354 @@ +"""Tests for the Judge protocol and ctx.judge() integration.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Annotated, Any + +import pytest + +from protest import ForEach, From +from protest.core.runner import TestRunner +from protest.evals import ( + EvalContext, + EvalSession, + Judge, + JudgeResponse, + ModelInfo, + Verdict, + evaluator, +) +from protest.evals.types import JudgeInfo +from protest.plugin import PluginBase + + +# --------------------------------------------------------------------------- +# Fake judge for testing +# --------------------------------------------------------------------------- + + +class FakeJudge: + """Minimal Judge implementation for tests.""" + + name = "fake-judge" + provider = "test" + + async def judge(self, prompt: str, output_type: type) -> JudgeResponse: + if output_type is bool: + return JudgeResponse( + output="pass" in prompt.lower(), + input_tokens=10, + output_tokens=5, + cost=0.001, + ) + if output_type is str: + return JudgeResponse(output=f"judged: {prompt[:20]}") + # For dataclass types, try to construct with defaults + return JudgeResponse(output=output_type()) + + +class BareJudge: + """Judge without name/provider attrs — tests fallback.""" + + async def judge(self, prompt: str, output_type: type) -> JudgeResponse: + return JudgeResponse(output=True) + + +# --------------------------------------------------------------------------- +# Protocol compliance +# 
--------------------------------------------------------------------------- + + +class TestJudgeProtocol: + def test_fake_judge_satisfies_protocol(self) -> None: + assert isinstance(FakeJudge(), Judge) + + def test_bare_judge_satisfies_protocol(self) -> None: + assert isinstance(BareJudge(), Judge) + + def test_non_judge_rejected(self) -> None: + class NotAJudge: + def evaluate(self, prompt: str) -> str: + return "nope" + + assert not isinstance(NotAJudge(), Judge) + + +# --------------------------------------------------------------------------- +# JudgeInfo.from_instance +# --------------------------------------------------------------------------- + + +class TestJudgeInfoExtraction: + def test_from_instance_with_attrs(self) -> None: + info = JudgeInfo.from_instance(FakeJudge()) + assert info.name == "fake-judge" + assert info.provider == "test" + + def test_from_instance_fallback_to_class_name(self) -> None: + info = JudgeInfo.from_instance(BareJudge()) + assert info.name == "BareJudge" + assert info.provider is None + + +# --------------------------------------------------------------------------- +# EvalContext.judge() +# --------------------------------------------------------------------------- + + +class TestEvalContextJudge: + @pytest.mark.asyncio + async def test_judge_happy_path(self) -> None: + judge = FakeJudge() + ctx = EvalContext( + name="test_case", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + _judge=judge, + ) + result = await ctx.judge("pass this", bool) + assert result is True + + @pytest.mark.asyncio + async def test_judge_str_output(self) -> None: + judge = FakeJudge() + ctx = EvalContext( + name="test_case", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + _judge=judge, + ) + result = await ctx.judge("hello world", str) + assert result == "judged: hello world" + + @pytest.mark.asyncio + async def test_judge_raises_without_judge(self) -> None: + ctx = EvalContext( 
+ name="my_case", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + ) + with pytest.raises(RuntimeError, match="no judge is configured"): + await ctx.judge("test", bool) + + @pytest.mark.asyncio + async def test_judge_error_mentions_case_name(self) -> None: + ctx = EvalContext( + name="chatbot_eval", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + ) + with pytest.raises(RuntimeError, match="chatbot_eval"): + await ctx.judge("test", bool) + + @pytest.mark.asyncio + async def test_judge_call_count(self) -> None: + judge = FakeJudge() + ctx = EvalContext( + name="test_case", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + _judge=judge, + ) + assert ctx.judge_call_count == 0 + await ctx.judge("pass 1", bool) + assert ctx.judge_call_count == 1 + await ctx.judge("pass 2", bool) + await ctx.judge("pass 3", bool) + assert ctx.judge_call_count == 3 + + @pytest.mark.asyncio + async def test_judge_tokens_accumulated(self) -> None: + judge = FakeJudge() # returns input_tokens=10, output_tokens=5 for bool + ctx = EvalContext( + name="test_case", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + _judge=judge, + ) + await ctx.judge("pass 1", bool) + await ctx.judge("pass 2", bool) + assert ctx.judge_input_tokens == 20 + assert ctx.judge_output_tokens == 10 + + @pytest.mark.asyncio + async def test_judge_cost_accumulated(self) -> None: + judge = FakeJudge() # returns cost=0.001 for bool + ctx = EvalContext( + name="test_case", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + _judge=judge, + ) + await ctx.judge("pass 1", bool) + await ctx.judge("pass 2", bool) + assert ctx.judge_cost == pytest.approx(0.002) + + @pytest.mark.asyncio + async def test_judge_none_tokens_not_accumulated(self) -> None: + """JudgeResponse with tokens=None doesn't affect accumulation.""" + judge = FakeJudge() + ctx = 
EvalContext( + name="test_case", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + _judge=judge, + ) + await ctx.judge("hello", str) # FakeJudge returns no tokens for str + assert ctx.judge_input_tokens == 0 + assert ctx.judge_output_tokens == 0 + assert ctx.judge_cost == 0.0 + + +# --------------------------------------------------------------------------- +# E2E: EvalSession with judge +# --------------------------------------------------------------------------- + +single_case = ForEach( + [{"inputs": "hello", "expected": "hello", "name": "case_1"}], + ids=lambda c: c["name"], +) + + +class TestJudgeE2E: + def test_judge_available_in_evaluator(self) -> None: + """Full run: evaluator calls ctx.judge(), result is pass.""" + + @evaluator + async def judge_evaluator(ctx: EvalContext) -> bool: + return await ctx.judge("pass this", bool) + + session = EvalSession(judge=FakeJudge()) + + @session.eval(evaluators=[judge_evaluator]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return case["inputs"] + + runner = TestRunner(session) + result = runner.run() + assert result.success is True + + def test_no_judge_is_fixture_error(self) -> None: + """Evaluator calls ctx.judge() without judge configured → infra error.""" + + @evaluator + async def needs_judge(ctx: EvalContext) -> bool: + return await ctx.judge("test", bool) + + session = EvalSession() # no judge + + @session.eval(evaluators=[needs_judge]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return case["inputs"] + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_fail(self, result: Any) -> None: + results.append(result) + + session.register_plugin(Collector()) + runner = TestRunner(session) + result = runner.run() + assert result.success is False + assert len(results) == 1 + assert results[0].is_fixture_error is True + + def test_judge_call_count_in_payload(self) -> None: + """judge_call_count 
flows through to EvalPayload.""" + + @evaluator + async def double_judge(ctx: EvalContext) -> bool: + r1 = await ctx.judge("pass first", bool) + r2 = await ctx.judge("pass second", bool) + return r1 and r2 + + session = EvalSession(judge=FakeJudge()) + + @session.eval(evaluators=[double_judge]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return case["inputs"] + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_pass(self, result: Any) -> None: + results.append(result) + + session.register_plugin(Collector()) + runner = TestRunner(session) + runner.run() + assert len(results) == 1 + payload = results[0].eval_payload + assert payload is not None + assert payload.judge_call_count == 2 + assert payload.judge_input_tokens == 20 # 10 per call × 2 + assert payload.judge_output_tokens == 10 # 5 per call × 2 + assert payload.judge_cost == pytest.approx(0.002) # 0.001 per call × 2 + + def test_judge_info_derived_from_instance(self) -> None: + """EvalSession derives JudgeInfo from Judge instance.""" + session = EvalSession(judge=FakeJudge()) + assert session._eval_judge is not None + assert session._eval_judge.name == "fake-judge" + assert session._eval_judge.provider == "test" + + def test_no_judge_no_judge_info(self) -> None: + """EvalSession without judge has no JudgeInfo.""" + session = EvalSession() + assert session._eval_judge is None + + def test_judge_with_structured_output(self) -> None: + """Judge returns structured dataclass via output_type.""" + + @dataclass + class JudgeVerdict: + ok: Annotated[bool, Verdict] + + class StructuredJudge: + name = "structured" + + async def judge(self, prompt: str, output_type: type) -> JudgeResponse: + return JudgeResponse(output=output_type(ok=True)) + + @evaluator + async def struct_evaluator(ctx: EvalContext) -> JudgeVerdict: + return await ctx.judge("evaluate this", JudgeVerdict) + + session = EvalSession(judge=StructuredJudge()) + + 
@session.eval(evaluators=[struct_evaluator]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return case["inputs"] + + runner = TestRunner(session) + result = runner.run() + assert result.success is True From 015c451b6852e371c8cd557299b03d9f0f578ab7 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 31 Mar 2026 12:50:34 +0200 Subject: [PATCH 10/60] fix(reporters): show in/out token split in eval usage summary Task: 45.2k in / 27.1k out, $0.0142 Judge: 5 calls, 800 in / 400 out, $0.0030 --- protest/core/runner.py | 7 +++ protest/entities/events.py | 3 + protest/evals/__init__.py | 2 + protest/evals/types.py | 73 +++++++++++++++++++++++ protest/evals/wrapper.py | 21 ++++++- protest/reporting/ascii.py | 21 +++++++ protest/reporting/rich_reporter.py | 23 ++++++++ tests/evals/test_judge.py | 95 ++++++++++++++++++++++++++++++ 8 files changed, 243 insertions(+), 2 deletions(-) diff --git a/protest/core/runner.py b/protest/core/runner.py index 4e58544..124cb44 100644 --- a/protest/core/runner.py +++ b/protest/core/runner.py @@ -227,4 +227,11 @@ def _build_eval_case_result(result: TestResult) -> EvalCaseResult: expected_output=payload.expected_output, case_hash=payload.case_hash, eval_hash=payload.eval_hash, + task_input_tokens=payload.task_input_tokens, + task_output_tokens=payload.task_output_tokens, + task_cost=payload.task_cost, + judge_call_count=payload.judge_call_count, + judge_input_tokens=payload.judge_input_tokens, + judge_output_tokens=payload.judge_output_tokens, + judge_cost=payload.judge_cost, ) diff --git a/protest/entities/events.py b/protest/entities/events.py index 33b43b2..d67388d 100644 --- a/protest/entities/events.py +++ b/protest/entities/events.py @@ -30,6 +30,9 @@ class EvalPayload: scores: dict[str, EvalScoreEntry] = field(default_factory=dict) case_hash: str = "" eval_hash: str = "" + task_input_tokens: int = 0 + task_output_tokens: int = 0 + task_cost: float = 0.0 
judge_call_count: int = 0 judge_input_tokens: int = 0 judge_output_tokens: int = 0 diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index 8e53005..628a275 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -18,6 +18,7 @@ JudgeResponse, ModelInfo, ScoreStats, + TaskResult, ) __all__ = [ @@ -35,6 +36,7 @@ "Reason", "ScoreStats", "ShortCircuit", + "TaskResult", "Verdict", "evaluator", ] diff --git a/protest/evals/types.py b/protest/evals/types.py index e928c86..7c8e14c 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -9,6 +9,36 @@ T = TypeVar("T") +@dataclass(frozen=True, slots=True) +class TaskResult(Generic[T]): + """Optional wrapper for eval task return values with usage stats. + + Return this instead of a plain value to report LLM usage for the + system under test. ProTest unwraps it transparently — evaluators + see the plain output. + + Usage:: + + @session.eval(evaluators=[...]) + async def my_eval(case) -> TaskResult[str]: + result = await agent.run(case.inputs) + usage = result.usage() + return TaskResult( + output=result.output, + input_tokens=usage.request_tokens, + output_tokens=usage.response_tokens, + cost=0.003, + ) + + # Or just return str directly — TaskResult is opt-in. + """ + + output: T + input_tokens: int | None = None + output_tokens: int | None = None + cost: float | None = None + + @dataclass(frozen=True, slots=True) class JudgeResponse(Generic[T]): """Return type for Judge.judge() — wraps the output with optional usage stats. 
@@ -149,6 +179,13 @@ class EvalCaseResult: expected_output: Any = None case_hash: str = "" eval_hash: str = "" + task_input_tokens: int = 0 + task_output_tokens: int = 0 + task_cost: float = 0.0 + judge_call_count: int = 0 + judge_input_tokens: int = 0 + judge_output_tokens: int = 0 + judge_cost: float = 0.0 @property def numeric_scores(self) -> dict[str, float]: @@ -228,3 +265,39 @@ def score_stats(self, name: str) -> ScoreStats: def all_score_stats(self) -> list[ScoreStats]: return [self.score_stats(n) for n in sorted(self.score_names())] + + @property + def total_task_input_tokens(self) -> int: + return sum(c.task_input_tokens for c in self.cases) + + @property + def total_task_output_tokens(self) -> int: + return sum(c.task_output_tokens for c in self.cases) + + @property + def total_task_tokens(self) -> int: + return self.total_task_input_tokens + self.total_task_output_tokens + + @property + def total_task_cost(self) -> float: + return sum(c.task_cost for c in self.cases) + + @property + def total_judge_calls(self) -> int: + return sum(c.judge_call_count for c in self.cases) + + @property + def total_judge_input_tokens(self) -> int: + return sum(c.judge_input_tokens for c in self.cases) + + @property + def total_judge_output_tokens(self) -> int: + return sum(c.judge_output_tokens for c in self.cases) + + @property + def total_judge_tokens(self) -> int: + return self.total_judge_input_tokens + self.total_judge_output_tokens + + @property + def total_judge_cost(self) -> float: + return sum(c.judge_cost for c in self.cases) diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index b94c217..82b21ad 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -38,11 +38,25 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: start = time.perf_counter() if asyncio.iscoroutinefunction(func): - output = await func(**kwargs) + raw_output = await func(**kwargs) else: - output = func(**kwargs) + raw_output = func(**kwargs) task_duration = 
time.perf_counter() - start + # Unwrap TaskResult if returned + from protest.evals.types import TaskResult + + task_input_tokens = 0 + task_output_tokens = 0 + task_cost = 0.0 + if isinstance(raw_output, TaskResult): + output = raw_output.output + task_input_tokens = raw_output.input_tokens or 0 + task_output_tokens = raw_output.output_tokens or 0 + task_cost = raw_output.cost or 0.0 + else: + output = raw_output + all_evaluators = list(evaluators) per_case = _extract_per_case_evaluators(kwargs) all_evaluators.extend(per_case) @@ -77,6 +91,9 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: }, case_hash=compute_case_hash(inputs, expected), eval_hash=compute_eval_hash(all_evaluators), + task_input_tokens=task_input_tokens, + task_output_tokens=task_output_tokens, + task_cost=task_cost, judge_call_count=eval_ctx.judge_call_count, judge_input_tokens=eval_ctx.judge_input_tokens, judge_output_tokens=eval_ctx.judge_output_tokens, diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index ea4040d..1620789 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -63,6 +63,19 @@ def _format_duration(seconds: float) -> str: return f"{seconds:.2f}s" +def _format_tokens(tokens: int) -> str: + return f"{tokens / 1000:.1f}k" if tokens >= 1000 else str(tokens) + + +def _format_usage(input_tokens: int, output_tokens: int, cost: float) -> str: + parts: list[str] = [] + if input_tokens > 0 or output_tokens > 0: + parts.append(f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out") + if cost > 0: + parts.append(f"${cost:.4f}") + return ", ".join(parts) + + class AsciiReporter(PluginBase): """Plain ASCII reporter. No colors, no emojis. 
Works everywhere.""" @@ -285,6 +298,14 @@ def on_eval_suite_end(self, report: Any) -> None: print(" " + "─" * 60) rate_pct = report.pass_rate * 100 print(f" Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)") + if report.total_task_tokens > 0 or report.total_task_cost > 0: + print(f" Task: {_format_usage(report.total_task_input_tokens, report.total_task_output_tokens, report.total_task_cost)}") + if report.total_judge_calls > 0: + judge_parts = [f"{report.total_judge_calls} calls"] + usage = _format_usage(report.total_judge_input_tokens, report.total_judge_output_tokens, report.total_judge_cost) + if usage: + judge_parts.append(usage) + print(f" Judge: {', '.join(judge_parts)}") print() def on_session_complete(self, result: SessionResult) -> None: diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 5e1e96b..c699f71 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -53,6 +53,21 @@ def _format_duration(seconds: float) -> str: return f"{seconds:.2f}s" +def _format_tokens(tokens: int) -> str: + """Format token count: 1234 → '1.2k', 45 → '45'.""" + return f"{tokens / 1000:.1f}k" if tokens >= 1000 else str(tokens) + + +def _format_usage(input_tokens: int, output_tokens: int, cost: float) -> str: + """Format usage stats as 'Xk in / Yk out, $0.0042'.""" + parts: list[str] = [] + if input_tokens > 0 or output_tokens > 0: + parts.append(f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out") + if cost > 0: + parts.append(f"${cost:.4f}") + return ", ".join(parts) + + def _format_eval_scores_inline(result: TestResult) -> str: """Format eval scores for inline display (e.g. 
' bg_score=0.8 char_id=1.0').""" if not result.eval_payload: @@ -447,6 +462,14 @@ def on_eval_suite_end(self, report: Any) -> None: self._print( f" [{color}]Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)[/]" ) + if report.total_task_tokens > 0 or report.total_task_cost > 0: + self._print(f" [dim]Task: {_format_usage(report.total_task_input_tokens, report.total_task_output_tokens, report.total_task_cost)}[/]") + if report.total_judge_calls > 0: + judge_parts = [f"{report.total_judge_calls} calls"] + usage = _format_usage(report.total_judge_input_tokens, report.total_judge_output_tokens, report.total_judge_cost) + if usage: + judge_parts.append(usage) + self._print(f" [dim]Judge: {', '.join(judge_parts)}[/]") def on_session_complete(self, result: SessionResult) -> None: has_non_eval_failures = any(not r.is_eval for r in self._failed_results) diff --git a/tests/evals/test_judge.py b/tests/evals/test_judge.py index 10106d9..0a6006f 100644 --- a/tests/evals/test_judge.py +++ b/tests/evals/test_judge.py @@ -15,6 +15,7 @@ Judge, JudgeResponse, ModelInfo, + TaskResult, Verdict, evaluator, ) @@ -352,3 +353,97 @@ def eval_echo(case: Annotated[dict, From(single_case)]) -> str: runner = TestRunner(session) result = runner.run() assert result.success is True + + +# --------------------------------------------------------------------------- +# TaskResult: SUT usage tracking +# --------------------------------------------------------------------------- + + +class TestTaskResult: + def test_task_result_unwrapped_for_evaluators(self) -> None: + """TaskResult is unwrapped — evaluators see the plain output.""" + + @evaluator + def check_output(ctx: EvalContext) -> bool: + return ctx.output == "hello" # sees str, not TaskResult + + session = EvalSession() + + @session.eval(evaluators=[check_output]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> TaskResult[str]: + return TaskResult( + output=case["inputs"], + input_tokens=100, + output_tokens=50, + 
cost=0.01, + ) + + runner = TestRunner(session) + result = runner.run() + assert result.success is True + + def test_task_usage_in_payload(self) -> None: + """TaskResult tokens/cost flow through to EvalPayload.""" + + @evaluator + def always_pass(ctx: EvalContext) -> bool: + return True + + session = EvalSession() + + @session.eval(evaluators=[always_pass]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> TaskResult[str]: + return TaskResult( + output=case["inputs"], + input_tokens=200, + output_tokens=80, + cost=0.005, + ) + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_pass(self, result: Any) -> None: + results.append(result) + + session.register_plugin(Collector()) + runner = TestRunner(session) + runner.run() + assert len(results) == 1 + payload = results[0].eval_payload + assert payload is not None + assert payload.task_input_tokens == 200 + assert payload.task_output_tokens == 80 + assert payload.task_cost == pytest.approx(0.005) + + def test_plain_return_has_zero_task_usage(self) -> None: + """Plain return (no TaskResult) has zero task usage.""" + + @evaluator + def always_pass(ctx: EvalContext) -> bool: + return True + + session = EvalSession() + + @session.eval(evaluators=[always_pass]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return case["inputs"] + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_pass(self, result: Any) -> None: + results.append(result) + + session.register_plugin(Collector()) + runner = TestRunner(session) + runner.run() + payload = results[0].eval_payload + assert payload.task_input_tokens == 0 + assert payload.task_output_tokens == 0 + assert payload.task_cost == 0.0 From 8e748ce0bba111f323aedf41c17a46561c107fc8 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 31 Mar 2026 20:23:37 +0200 Subject: [PATCH 11/60] fix(history): exclude error-only runs 
from stats, propagate is_error flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixture crashes (errored >= total_cases) were counted in pass_rates, score_values, and flaky — polluting stats with noise. Now: - EvalCaseResult.is_error propagated from TestResult.is_fixture_error - History serializes errored count per suite + is_error per case - _aggregate_suites skips error-only runs from stats entirely - _track_cases skips error cases from score_values and flaky - Error runs still visible in `protest history --runs` Also: docs/evals.md updated for TaskResult section and Judge protocol fix. --- docs/evals.md | 37 +++++++- protest/cli/history.py | 9 +- protest/core/runner.py | 1 + protest/evals/history.py | 2 + protest/evals/types.py | 7 +- tests/test_history_stats.py | 164 ++++++++++++++++++++++++++++++++++++ 6 files changed, 216 insertions(+), 4 deletions(-) create mode 100644 tests/test_history_stats.py diff --git a/docs/evals.md b/docs/evals.md index e13812b..006c403 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -286,10 +286,10 @@ A `Judge` is a protocol for LLM-as-judge evaluators. ProTest owns the interface ```python class Judge(Protocol): - async def judge(self, prompt: str, output_type: type[T]) -> T: ... + async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: ... ``` -Minimal contract: takes a prompt and a return type, returns a typed result. All configuration (model, temperature, system prompt, max_tokens) lives in your implementation's constructor, not in the protocol. +Minimal contract: takes a prompt and a return type, returns a `JudgeResponse` wrapping the typed result with optional usage stats. All configuration (model, temperature, system prompt, max_tokens) lives in your implementation's constructor, not in the protocol. ### Writing a Judge @@ -379,6 +379,39 @@ Each call to `ctx.judge()` is counted. 
Tokens and cost from `JudgeResponse` are These are available in history, letting you track LLM usage across runs. +## TaskResult (SUT Usage Tracking) + +If your eval task calls an LLM, you can report usage by returning `TaskResult` instead of a plain value: + +```python +from protest.evals import TaskResult + +@session.eval(evaluators=[my_scorer]) +async def chatbot(case: Annotated[EvalCase, From(cases)]) -> TaskResult[str]: + result = await agent.run(case.inputs) + usage = result.usage() + return TaskResult( + output=result.output, + input_tokens=usage.request_tokens, + output_tokens=usage.response_tokens, + cost=usage.request_tokens * 0.10/1e6 + usage.response_tokens * 0.30/1e6, + ) +``` + +This is **opt-in** — returning a plain `str` still works. ProTest unwraps `TaskResult` transparently: evaluators see the plain output, usage stats flow to the reporter and history. + +## Usage Display + +When task or judge usage data is available, ProTest shows a summary after the eval stats: + +``` + Passed: 16/26 (61.5%) + Task: 45.2k in / 27.1k out, $0.0142 + Judge: 5 calls, 800 in / 400 out, $0.0030 +``` + +Lines only appear when there is data. No `TaskResult` = no Task line. No judge configured = no Judge line. + ## Evaluator Errors If an evaluator raises an exception (e.g. LLM judge timeout), the case is marked as **error** (not fail). The stack trace appears in the output. 
diff --git a/protest/cli/history.py b/protest/cli/history.py index f9eb7ac..33b230b 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -406,9 +406,13 @@ def _aggregate_suites(entries: list[dict[str, Any]]) -> dict[str, dict[str, Any] "score_values": {}, } s = suites[name] - s["n_runs"] += 1 + errored = data.get("errored", 0) total = data.get("total_cases", 0) passed = data.get("passed", 0) + # Skip error-only runs (fixture crashes) from stats + if errored and errored >= total: + continue + s["n_runs"] += 1 if total: s["pass_rates"].append(passed / total) _track_cases(s, data.get("cases", {})) @@ -427,6 +431,9 @@ def _track_cases(suite: dict[str, Any], cases: dict[str, Any]) -> None: for cn, cd in cases.items(): if not isinstance(cd, dict): continue + # Skip errored cases (fixture crashes) from stats + if cd.get("is_error"): + continue if cn not in suite["cases_seen"]: suite["cases_seen"][cn] = {"runs": 0, "fails": 0} suite["cases_seen"][cn]["runs"] += 1 diff --git a/protest/core/runner.py b/protest/core/runner.py index 124cb44..f6bab5b 100644 --- a/protest/core/runner.py +++ b/protest/core/runner.py @@ -234,4 +234,5 @@ def _build_eval_case_result(result: TestResult) -> EvalCaseResult: judge_input_tokens=payload.judge_input_tokens, judge_output_tokens=payload.judge_output_tokens, judge_cost=payload.judge_cost, + is_error=result.is_fixture_error, ) diff --git a/protest/evals/history.py b/protest/evals/history.py index f7f2544..5551736 100644 --- a/protest/evals/history.py +++ b/protest/evals/history.py @@ -100,6 +100,7 @@ def _build_entry( "total_cases": report.total_count, "passed": report.passed_count, "failed": report.failed_count, + "errored": report.errored_count, "pass_rate": round(report.pass_rate, 4), "duration": round(report.duration, 2), "cases": {c.case_name: _serialize_case(c) for c in report.cases}, @@ -138,6 +139,7 @@ def _build_entry( def _serialize_case(case: EvalCaseResult) -> dict[str, Any]: entry: dict[str, Any] = { "passed": 
case.passed, + "is_error": case.is_error, "duration": round(case.duration, 3), "scores": {s.name: s.value for s in case.scores if s.is_metric}, "case_hash": case.case_hash, diff --git a/protest/evals/types.py b/protest/evals/types.py index 7c8e14c..323f32a 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -186,6 +186,7 @@ class EvalCaseResult: judge_input_tokens: int = 0 judge_output_tokens: int = 0 judge_cost: float = 0.0 + is_error: bool = False @property def numeric_scores(self) -> dict[str, float]: @@ -241,7 +242,11 @@ def passed_count(self) -> int: @property def failed_count(self) -> int: - return sum(1 for c in self.cases if not c.passed) + return sum(1 for c in self.cases if not c.passed and not c.is_error) + + @property + def errored_count(self) -> int: + return sum(1 for c in self.cases if c.is_error) @property def total_count(self) -> int: diff --git a/tests/test_history_stats.py b/tests/test_history_stats.py new file mode 100644 index 0000000..cc99c17 --- /dev/null +++ b/tests/test_history_stats.py @@ -0,0 +1,164 @@ +"""Tests for history stats — error-only runs must be excluded from stats.""" + +from __future__ import annotations + +from protest.cli.history import _aggregate_suites, _rich_score_arrows + + +def _make_entry( + suite_name: str = "pipeline", + passed: int = 0, + total: int = 0, + errored: int = 0, + cases: dict | None = None, +) -> dict: + """Build a minimal history entry with one suite.""" + return { + "suites": { + suite_name: { + "kind": "eval", + "passed": passed, + "total_cases": total, + "errored": errored, + "cases": cases or {}, + } + } + } + + +def _case(passed: bool, score: float) -> dict: + return {"passed": passed, "scores": {"accuracy": score}} + + +def _error_case() -> dict: + return {"passed": False, "is_error": True, "scores": {}} + + +class TestErrorRunsExcludedFromStats: + """Error-only runs (fixture crashes) are excluded from stats.""" + + def test_error_runs_not_counted(self) -> None: + """Runs where 
errored >= total should not count in n_runs or pass_rates.""" + entries = [ + _make_entry(passed=29, total=39, cases={"a": _case(True, 0.8)}), + _make_entry(passed=0, total=1, errored=1, cases={"x": _error_case()}), + _make_entry(passed=0, total=1, errored=1, cases={"x": _error_case()}), + _make_entry(passed=28, total=39, cases={"a": _case(True, 0.7)}), + _make_entry(passed=0, total=1, errored=1, cases={"x": _error_case()}), + ] + + suites = _aggregate_suites(entries) + s = suites["pipeline"] + + # Only 2 real runs counted + assert s["n_runs"] == 2 + assert len(s["pass_rates"]) == 2 + # pass_rates reflect only real runs + assert s["pass_rates"][0] == 29 / 39 + assert s["pass_rates"][1] == 28 / 39 + + def test_error_cases_not_tracked(self) -> None: + """Cases with is_error=True should not appear in cases_seen or score_values.""" + entries = [ + _make_entry( + passed=1, + total=2, + errored=0, + cases={ + "real_case": _case(True, 0.9), + "errored_case": _error_case(), + }, + ), + ] + + suites = _aggregate_suites(entries) + s = suites["pipeline"] + assert "real_case" in s["cases_seen"] + assert "errored_case" not in s["cases_seen"] + assert len(s["score_values"]["accuracy"]) == 1 + + def test_error_cases_not_in_flaky(self) -> None: + """Error cases should never appear as flaky.""" + entries = [ + _make_entry(passed=1, total=1, cases={"a": _case(True, 0.9)}), + _make_entry( + passed=0, + total=1, + errored=1, + cases={"a": _error_case()}, + ), + ] + + suites = _aggregate_suites(entries) + s = suites["pipeline"] + # Only the real run is counted + assert s["n_runs"] == 1 + assert len(s["flaky"]) == 0 + + def test_all_error_runs_produce_empty_suite(self) -> None: + """If ALL runs are errors, suite has 0 runs and empty stats.""" + entries = [ + _make_entry(passed=0, total=1, errored=1, cases={"x": _error_case()}), + _make_entry(passed=0, total=1, errored=1, cases={"x": _error_case()}), + ] + + suites = _aggregate_suites(entries) + # Suite exists but has 0 real runs + 
assert suites["pipeline"]["n_runs"] == 0 + assert suites["pipeline"]["pass_rates"] == [] + + def test_mixed_real_and_error_runs(self) -> None: + """Real data pattern: mostly errors with a few real runs.""" + entries = [ + _make_entry(passed=0, total=1, errored=1), # error + _make_entry(passed=0, total=1, errored=1), # error + _make_entry(passed=29, total=39, cases={"a": _case(True, 0.7)}), # real + _make_entry(passed=0, total=1, errored=1), # error + _make_entry(passed=0, total=1, errored=1), # error + _make_entry(passed=28, total=39, cases={"a": _case(True, 0.8)}), # real + _make_entry(passed=0, total=1, errored=1), # error + _make_entry(passed=0, total=1, errored=1), # error + _make_entry(passed=0, total=1, errored=1), # error + _make_entry(passed=0, total=1, errored=1), # error + ] + + suites = _aggregate_suites(entries) + s = suites["pipeline"] + + assert s["n_runs"] == 2 # not 10 + assert len(s["pass_rates"]) == 2 + # Arrows reflect only the 2 real runs, not the 8 errors + arrows = _rich_score_arrows(s["score_values"]) + # accuracy went 0.7 → 0.8 → should show ↗ + assert "↗" in arrows + + +class TestScoreArrowsWithCleanData: + """Score arrows with only real runs (no errors to filter).""" + + def test_stable_scores_show_no_trend(self) -> None: + entries = [ + _make_entry(passed=2, total=2, cases={"a": _case(True, 0.8)}), + _make_entry(passed=2, total=2, cases={"a": _case(True, 0.8)}), + ] + suites = _aggregate_suites(entries) + arrows = _rich_score_arrows(suites["pipeline"]["score_values"]) + assert "→" in arrows + + def test_improving_scores_show_up(self) -> None: + entries = [ + _make_entry(passed=1, total=1, cases={"a": _case(True, 0.3)}), + _make_entry(passed=1, total=1, cases={"a": _case(True, 0.9)}), + ] + suites = _aggregate_suites(entries) + arrows = _rich_score_arrows(suites["pipeline"]["score_values"]) + assert "↗" in arrows + + def test_declining_scores_show_down(self) -> None: + entries = [ + _make_entry(passed=1, total=1, cases={"a": _case(True, 
0.9)}), + _make_entry(passed=1, total=1, cases={"a": _case(True, 0.3)}), + ] + suites = _aggregate_suites(entries) + arrows = _rich_score_arrows(suites["pipeline"]["score_values"]) + assert "↘" in arrows From 6149633b52ee94623dc36cd2dd63b1b2a9fa814c Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 31 Mar 2026 21:53:29 +0200 Subject: [PATCH 12/60] =?UTF-8?q?refactor:=20remove=20getattr=20abuse=20?= =?UTF-8?q?=E2=80=94=20proper=20typing=20and=20Protocol=20contracts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove defensive getattr in session.py where types are known - Type plugin setup(session: ProTestSession) instead of Any - Add name/provider to Judge Protocol — explicit contract - Delete ModelInfo.from_agent and JudgeInfo.from_instance — user wires - Fix lint: PLR2004 magic values, PLR0912 noqa, ambiguous unicode --- examples/yorkshire/app/chatbot.py | 2 +- protest/core/session.py | 8 +++---- protest/evals/history.py | 9 +++---- protest/evals/session.py | 2 +- protest/evals/types.py | 27 +++++---------------- protest/history/plugin.py | 11 +++++---- protest/reporting/ascii.py | 23 ++++++++++++++---- protest/reporting/rich_reporter.py | 23 ++++++++++++++---- tests/evals/test_judge.py | 38 +++++++++--------------------- 9 files changed, 72 insertions(+), 71 deletions(-) diff --git a/examples/yorkshire/app/chatbot.py b/examples/yorkshire/app/chatbot.py index dedc1e4..82ca519 100644 --- a/examples/yorkshire/app/chatbot.py +++ b/examples/yorkshire/app/chatbot.py @@ -25,7 +25,7 @@ } -def yorkshire_chatbot(question: str) -> str: +def yorkshire_chatbot(question: str) -> str: # noqa: PLR0912 """Fake chatbot that answers questions about Yorkshire Terriers. Simulates a RAG pipeline: keyword matching → fact retrieval → response generation. 
diff --git a/protest/core/session.py b/protest/core/session.py index 59962c5..7ea04f3 100644 --- a/protest/core/session.py +++ b/protest/core/session.py @@ -228,10 +228,10 @@ async def my_eval(case: Annotated[dict, From(cases)]) -> str: def decorator(func: FuncT) -> FuncT: suite_name = name or func.__name__ suite_meta: dict[str, Any] = {} - resolved_model = model or getattr(self, "_eval_model", None) + resolved_model = model or self._eval_model if resolved_model: suite_meta["model"] = resolved_model.name - suite_meta["provider"] = getattr(resolved_model, "provider", None) + suite_meta["provider"] = resolved_model.provider suite = ProTestSuite( name=suite_name, tags=list(tags or []), @@ -384,8 +384,8 @@ def _wire_eval_support(self) -> None: if self._eval_judge: judge_dict = { "name": self._eval_judge.name, - "provider": getattr(self._eval_judge, "provider", None), - "evaluators": list(getattr(self._eval_judge, "evaluators", ())), + "provider": self._eval_judge.provider, + "evaluators": list(self._eval_judge.evaluators), } history = EvalHistoryPlugin( diff --git a/protest/evals/history.py b/protest/evals/history.py index 5551736..b607566 100644 --- a/protest/evals/history.py +++ b/protest/evals/history.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from pathlib import Path + from protest.core.session import ProTestSession from protest.evals.types import EvalCaseResult, EvalSuiteReport, ModelInfo from protest.plugin import PluginContext @@ -47,12 +48,12 @@ def __init__( def activate(cls, ctx: PluginContext) -> EvalHistoryPlugin | None: return None # Wired explicitly by session - def setup(self, session: Any) -> None: + def setup(self, session: ProTestSession) -> None: """Collect per-suite metadata from session.""" self._suite_metadata = {} - for suite in getattr(session, "suites", []): - if getattr(suite, "kind", "test") == "eval": - self._suite_metadata[suite.name] = getattr(suite, "suite_metadata", {}) + for suite in session.suites: + if suite.kind == "eval": + 
self._suite_metadata[suite.name] = suite.suite_metadata def on_eval_suite_end(self, report: EvalSuiteReport) -> None: """Collect suite reports as they arrive.""" diff --git a/protest/evals/session.py b/protest/evals/session.py index 81f22d9..ddace3d 100644 --- a/protest/evals/session.py +++ b/protest/evals/session.py @@ -45,6 +45,6 @@ def __init__( if judge is not None: from protest.evals.types import JudgeInfo - self._eval_judge = JudgeInfo.from_instance(judge) + self._eval_judge = JudgeInfo(name=judge.name, provider=judge.provider) else: self._eval_judge = None diff --git a/protest/evals/types.py b/protest/evals/types.py index 323f32a..59d2721 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -75,6 +75,9 @@ class Judge(Protocol): Usage:: class MyJudge: + name = "my-judge" + provider = "openai" + async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: result = await agent.run(prompt) return JudgeResponse(output=result.output, input_tokens=100) @@ -82,6 +85,9 @@ async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: session = EvalSession(judge=MyJudge()) """ + name: str + provider: str | None + async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: ... @@ -94,20 +100,6 @@ class ModelInfo: temperature: float | None = None extra: dict[str, Any] = field(default_factory=dict) - @classmethod - def from_agent(cls, agent: Any) -> ModelInfo: - """Extract model info from a pydantic-ai Agent (duck-typed).""" - model = getattr(agent, "model", None) - if model is None: - msg = "Agent has no model configured" - raise ValueError(msg) - if isinstance(model, str): - return cls(name=model) - model_name = getattr(model, "model_name", None) - if callable(model_name): - return cls(name=str(model_name())) - return cls(name=str(getattr(model, "name", None) or model)) - @dataclass(frozen=True, slots=True) class JudgeInfo: @@ -118,13 +110,6 @@ class JudgeInfo: evaluators: tuple[str, ...] 
= () extra: dict[str, Any] = field(default_factory=dict) - @classmethod - def from_instance(cls, judge: Judge) -> JudgeInfo: - """Extract metadata from a Judge instance (duck-typed).""" - name = getattr(judge, "name", None) or type(judge).__name__ - provider = getattr(judge, "provider", None) - return cls(name=str(name), provider=provider) - @dataclass(frozen=True, slots=True) class EvalScore: diff --git a/protest/history/plugin.py b/protest/history/plugin.py index 4fe80f6..e216fe6 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from pathlib import Path + from protest.core.session import ProTestSession from protest.entities.events import SessionResult, TestResult from protest.plugin import PluginContext @@ -36,11 +37,11 @@ def __init__(self, history_dir: Path | None = None) -> None: def activate(cls, ctx: PluginContext) -> HistoryPlugin | None: return None # Wired explicitly by session - def setup(self, session: Any) -> None: - self._history_enabled = getattr(session, "history", False) - self._metadata = dict(getattr(session, "metadata", None) or {}) - for suite in getattr(session, "suites", []): - self._suite_kinds[suite.name] = getattr(suite, "kind", "test") + def setup(self, session: ProTestSession) -> None: + self._history_enabled = session.history + self._metadata = dict(session.metadata) + for suite in session.suites: + self._suite_kinds[suite.name] = suite.kind if not self._default_suite_name or self._default_suite_name == "tests": self._default_suite_name = suite.name diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index 1620789..9296ae6 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -63,14 +63,23 @@ def _format_duration(seconds: float) -> str: return f"{seconds:.2f}s" +_TOKEN_K_THRESHOLD = 1000 + + def _format_tokens(tokens: int) -> str: - return f"{tokens / 1000:.1f}k" if tokens >= 1000 else str(tokens) + return ( + f"{tokens / 
_TOKEN_K_THRESHOLD:.1f}k" + if tokens >= _TOKEN_K_THRESHOLD + else str(tokens) + ) def _format_usage(input_tokens: int, output_tokens: int, cost: float) -> str: parts: list[str] = [] if input_tokens > 0 or output_tokens > 0: - parts.append(f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out") + parts.append( + f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out" + ) if cost > 0: parts.append(f"${cost:.4f}") return ", ".join(parts) @@ -299,10 +308,16 @@ def on_eval_suite_end(self, report: Any) -> None: rate_pct = report.pass_rate * 100 print(f" Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)") if report.total_task_tokens > 0 or report.total_task_cost > 0: - print(f" Task: {_format_usage(report.total_task_input_tokens, report.total_task_output_tokens, report.total_task_cost)}") + print( + f" Task: {_format_usage(report.total_task_input_tokens, report.total_task_output_tokens, report.total_task_cost)}" + ) if report.total_judge_calls > 0: judge_parts = [f"{report.total_judge_calls} calls"] - usage = _format_usage(report.total_judge_input_tokens, report.total_judge_output_tokens, report.total_judge_cost) + usage = _format_usage( + report.total_judge_input_tokens, + report.total_judge_output_tokens, + report.total_judge_cost, + ) if usage: judge_parts.append(usage) print(f" Judge: {', '.join(judge_parts)}") diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index c699f71..981b03f 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -53,16 +53,25 @@ def _format_duration(seconds: float) -> str: return f"{seconds:.2f}s" +_TOKEN_K_THRESHOLD = 1000 + + def _format_tokens(tokens: int) -> str: """Format token count: 1234 → '1.2k', 45 → '45'.""" - return f"{tokens / 1000:.1f}k" if tokens >= 1000 else str(tokens) + return ( + f"{tokens / _TOKEN_K_THRESHOLD:.1f}k" + if tokens >= _TOKEN_K_THRESHOLD + else str(tokens) + ) def 
_format_usage(input_tokens: int, output_tokens: int, cost: float) -> str: """Format usage stats as 'Xk in / Yk out, $0.0042'.""" parts: list[str] = [] if input_tokens > 0 or output_tokens > 0: - parts.append(f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out") + parts.append( + f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out" + ) if cost > 0: parts.append(f"${cost:.4f}") return ", ".join(parts) @@ -463,10 +472,16 @@ def on_eval_suite_end(self, report: Any) -> None: f" [{color}]Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)[/]" ) if report.total_task_tokens > 0 or report.total_task_cost > 0: - self._print(f" [dim]Task: {_format_usage(report.total_task_input_tokens, report.total_task_output_tokens, report.total_task_cost)}[/]") + self._print( + f" [dim]Task: {_format_usage(report.total_task_input_tokens, report.total_task_output_tokens, report.total_task_cost)}[/]" + ) if report.total_judge_calls > 0: judge_parts = [f"{report.total_judge_calls} calls"] - usage = _format_usage(report.total_judge_input_tokens, report.total_judge_output_tokens, report.total_judge_cost) + usage = _format_usage( + report.total_judge_input_tokens, + report.total_judge_output_tokens, + report.total_judge_cost, + ) if usage: judge_parts.append(usage) self._print(f" [dim]Judge: {', '.join(judge_parts)}[/]") diff --git a/tests/evals/test_judge.py b/tests/evals/test_judge.py index 0a6006f..9e6fd11 100644 --- a/tests/evals/test_judge.py +++ b/tests/evals/test_judge.py @@ -14,15 +14,12 @@ EvalSession, Judge, JudgeResponse, - ModelInfo, TaskResult, Verdict, evaluator, ) -from protest.evals.types import JudgeInfo from protest.plugin import PluginBase - # --------------------------------------------------------------------------- # Fake judge for testing # --------------------------------------------------------------------------- @@ -31,8 +28,8 @@ class FakeJudge: """Minimal Judge implementation for tests.""" - name = 
"fake-judge" - provider = "test" + name: str = "fake-judge" + provider: str | None = "test" async def judge(self, prompt: str, output_type: type) -> JudgeResponse: if output_type is bool: @@ -49,7 +46,10 @@ async def judge(self, prompt: str, output_type: type) -> JudgeResponse: class BareJudge: - """Judge without name/provider attrs — tests fallback.""" + """Minimal Judge with required name/provider.""" + + name: str = "bare-judge" + provider: str | None = None async def judge(self, prompt: str, output_type: type) -> JudgeResponse: return JudgeResponse(output=True) @@ -75,23 +75,6 @@ def evaluate(self, prompt: str) -> str: assert not isinstance(NotAJudge(), Judge) -# --------------------------------------------------------------------------- -# JudgeInfo.from_instance -# --------------------------------------------------------------------------- - - -class TestJudgeInfoExtraction: - def test_from_instance_with_attrs(self) -> None: - info = JudgeInfo.from_instance(FakeJudge()) - assert info.name == "fake-judge" - assert info.provider == "test" - - def test_from_instance_fallback_to_class_name(self) -> None: - info = JudgeInfo.from_instance(BareJudge()) - assert info.name == "BareJudge" - assert info.provider is None - - # --------------------------------------------------------------------------- # EvalContext.judge() # --------------------------------------------------------------------------- @@ -311,9 +294,9 @@ def on_test_pass(self, result: Any) -> None: payload = results[0].eval_payload assert payload is not None assert payload.judge_call_count == 2 - assert payload.judge_input_tokens == 20 # 10 per call × 2 - assert payload.judge_output_tokens == 10 # 5 per call × 2 - assert payload.judge_cost == pytest.approx(0.002) # 0.001 per call × 2 + assert payload.judge_input_tokens == 20 # 10 per call x 2 + assert payload.judge_output_tokens == 10 # 5 per call x 2 + assert payload.judge_cost == pytest.approx(0.002) # 0.001 per call x 2 def 
test_judge_info_derived_from_instance(self) -> None: """EvalSession derives JudgeInfo from Judge instance.""" @@ -335,7 +318,8 @@ class JudgeVerdict: ok: Annotated[bool, Verdict] class StructuredJudge: - name = "structured" + name: str = "structured" + provider: str | None = None async def judge(self, prompt: str, output_type: type) -> JudgeResponse: return JudgeResponse(output=output_type(ok=True)) From c08125535139cbaff4e74dec27f0f6413c992a12 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 31 Mar 2026 23:30:54 +0200 Subject: [PATCH 13/60] fix(hashing): fail-hard canonicalization, evaluator_identity() protocol Replace fragile repr() fallback with explicit error on unknown types. Add evaluator_identity() as user-controlled escape hatch for custom evaluators. Introspect dataclass/partial/callable as fallback only. - Remove hasattr(obj, "model_dump") duck-typing (Pydantic leak) - Remove default=str silent fallback in json.dumps - Skip _prefixed dataclass fields (runtime internals, not config) - Add functools.partial support (qualname + bound kwargs) - Add ShortCircuit.evaluator_identity() - 33 tests covering all paths including fail-hard --- protest/evals/evaluator.py | 16 ++- protest/evals/hashing.py | 82 ++++++++++--- tests/evals/test_hashing.py | 239 ++++++++++++++++++++++++++++++++++-- 3 files changed, 308 insertions(+), 29 deletions(-) diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 701fe5c..7fdf6c7 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -48,7 +48,15 @@ async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: @dataclass class EvalContext(Generic[InputT, OutputT]): - """Context passed to evaluator functions.""" + """Context passed to evaluator functions. + + Dual role: read-only DTO (inputs, output, expected) + mutable accumulator + for judge call stats (tokens, cost, call count). 
One instance per case, + shared sequentially across evaluators, discarded after scoring. + + Note: judge stats accumulate via ctx.judge() side-effects. If evaluators + are ever parallelized within a case, the accumulators will need isolation. + """ name: str inputs: InputT @@ -149,6 +157,12 @@ class ShortCircuit: def __init__(self, evaluators: list[Any]) -> None: self.evaluators = evaluators + def evaluator_identity(self) -> dict[str, Any]: + """Identity is the ordered list of inner evaluators.""" + from protest.evals.hashing import _canonical + + return {"short_circuit": [_canonical(e) for e in self.evaluators]} + class Metric: """Annotate a float/int field as a metric for stats aggregation.""" diff --git a/protest/evals/hashing.py b/protest/evals/hashing.py index 0f0f5e9..5ebe725 100644 --- a/protest/evals/hashing.py +++ b/protest/evals/hashing.py @@ -1,8 +1,18 @@ -"""Content hashing for eval cases — detect when cases or scoring change.""" +"""Content hashing for eval cases — detect when cases or scoring change. + +Hashes capture identity + configuration, not implementation. A renamed +parameter changes the hash; a rewritten function body does not. This is +a deliberate trade-off: we detect config drift, not code drift. + +Custom evaluators can implement ``evaluator_identity()`` to control +exactly what gets hashed. Built-in types (dataclass, functools.partial, +plain callable) are introspected automatically as a fallback. 
+""" from __future__ import annotations import dataclasses +import functools import hashlib import json from typing import Any @@ -10,42 +20,80 @@ HASH_LENGTH = 12 +class CanonicalError(TypeError): + """Raised when an object cannot be converted to a canonical form.""" + + def compute_case_hash(inputs: Any, expected_output: Any) -> str: """Hash the case content (inputs + expected_output).""" data = {"inputs": _canonical(inputs), "expected": _canonical(expected_output)} return _hash(data) -def compute_eval_hash( - evaluators: list[Any], -) -> str: +def compute_eval_hash(evaluators: list[Any]) -> str: """Hash the scoring config (evaluators only).""" - data = { - "evaluators": [_canonical(e) for e in evaluators], - } + data = {"evaluators": [_canonical(e) for e in evaluators]} return _hash(data) def _hash(data: Any) -> str: - raw = json.dumps(data, sort_keys=True, default=str) + raw = json.dumps(data, sort_keys=True) return hashlib.sha256(raw.encode()).hexdigest()[:HASH_LENGTH] -def _canonical(obj: Any) -> Any: - """Convert an object to a canonical JSON-serializable form.""" +def _canonical(obj: Any) -> Any: # noqa: PLR0911 + """Convert an object to a canonical JSON-serializable form. + + Resolution order: + 1. Primitives, list, tuple, dict — native support + 2. ``evaluator_identity()`` — explicit, user-controlled + 3. Dataclass / functools.partial / callable — introspection fallback + 4. 
Anything else → CanonicalError + """ + # --- primitives & containers --- if obj is None or isinstance(obj, (bool, int, float, str)): return obj if isinstance(obj, (list, tuple)): return [_canonical(item) for item in obj] if isinstance(obj, dict): return {str(k): _canonical(v) for k, v in sorted(obj.items())} - # Pydantic models - if hasattr(obj, "model_dump"): - return _canonical(obj.model_dump(mode="json")) - # Dataclasses — iterate without deepcopy to support non-picklable fields + + # --- explicit identity (user-controlled) --- + if hasattr(obj, "evaluator_identity"): + return _canonical(obj.evaluator_identity()) + + # --- introspection fallback --- + + # Dataclasses — public fields only (skip _ prefixed runtime internals) if dataclasses.is_dataclass(obj) and not isinstance(obj, type): return { - f.name: _canonical(getattr(obj, f.name)) for f in dataclasses.fields(obj) + "__type__": type(obj).__qualname__, + **{ + f.name: _canonical(getattr(obj, f.name)) + for f in dataclasses.fields(obj) + if not f.name.startswith("_") + }, + } + # functools.partial — qualname + bound kwargs + if isinstance(obj, functools.partial): + return { + "fn": _fn_qualname(obj.func), + "args": _canonical(list(obj.args)) if obj.args else [], + "kwargs": _canonical(dict(obj.keywords)) if obj.keywords else {}, } - # Fallback - return repr(obj) + # Plain callable — qualname only + if callable(obj): + qualname = _fn_qualname(obj) + if qualname is not None: + return {"fn": qualname} + + raise CanonicalError( + f"Cannot canonicalize {type(obj).__name__!r}. " + f"Implement evaluator_identity() or use a supported type " + f"(primitives, list, dict, dataclass, callable)." 
+ ) + + +def _fn_qualname(fn: Any) -> str | None: + """Extract a stable qualified name from a callable.""" + return getattr(fn, "__qualname__", None) or getattr(fn, "__name__", None) diff --git a/tests/evals/test_hashing.py b/tests/evals/test_hashing.py index bc53e1f..26e5570 100644 --- a/tests/evals/test_hashing.py +++ b/tests/evals/test_hashing.py @@ -1,14 +1,22 @@ -"""Tests for protest.evals.hashing — including non-picklable dataclass fields.""" +"""Tests for protest.evals.hashing — fail-hard canonicalization.""" from __future__ import annotations import dataclasses +import functools import threading -from protest.evals.hashing import _canonical, compute_eval_hash +import pytest + +from protest.evals.hashing import ( + CanonicalError, + _canonical, + compute_case_hash, + compute_eval_hash, +) # --------------------------------------------------------------------------- -# _canonical — dataclass handling +# Fixtures — representative evaluator types # --------------------------------------------------------------------------- @@ -32,24 +40,218 @@ class LockHoldingEvaluator: _lock: threading.Lock = dataclasses.field(default_factory=threading.Lock) +def bare_function(ctx: object) -> bool: + return True + + +def parameterized_function(ctx: object, keywords: list[str]) -> bool: + return True + + +# --------------------------------------------------------------------------- +# _canonical — primitives & containers +# --------------------------------------------------------------------------- + + +class TestCanonicalPrimitives: + @pytest.mark.parametrize("value", [None, True, False, 42, 3.14, "hello"]) + def test_primitives_pass_through(self, value: object) -> None: + assert _canonical(value) is value + + def test_list(self) -> None: + assert _canonical([1, "a", [2]]) == [1, "a", [2]] + + def test_tuple_treated_as_list(self) -> None: + assert _canonical((1, 2)) == [1, 2] + + def test_dict_sorted_by_key(self) -> None: + assert _canonical({"b": 2, "a": 1}) == {"a": 1, "b": 
2} + + +# --------------------------------------------------------------------------- +# _canonical — dataclass handling +# --------------------------------------------------------------------------- + + class TestCanonicalDataclass: def test_simple_dataclass_is_serialized(self) -> None: ev = SimpleEvaluator(threshold=0.8) result = _canonical(ev) - assert result == {"threshold": 0.8, "name": "simple"} + assert result == { + "__type__": "SimpleEvaluator", + "threshold": 0.8, + "name": "simple", + } def test_nested_dataclass_is_serialized_recursively(self) -> None: ev = NestedEvaluator(inner=SimpleEvaluator(threshold=0.5), weight=2.0) result = _canonical(ev) - assert result == {"inner": {"threshold": 0.5, "name": "simple"}, "weight": 2.0} + assert result == { + "__type__": "NestedEvaluator", + "inner": { + "__type__": "SimpleEvaluator", + "threshold": 0.5, + "name": "simple", + }, + "weight": 2.0, + } + + def test_dataclass_with_lock_skips_private_fields(self) -> None: + """Regression: dataclasses.asdict() deepcopy fails on threading.Lock. - def test_dataclass_with_lock_does_not_crash(self) -> None: - """Regression: dataclasses.asdict() deepcopy fails on threading.Lock.""" + Private fields (_prefixed) are runtime internals, not config — excluded from hash. 
+ """ ev = LockHoldingEvaluator(name="llm_judge") - # Must not raise — lock falls back to repr() result = _canonical(ev) - assert result["name"] == "llm_judge" - assert "_lock" in result + assert result == {"__type__": "LockHoldingEvaluator", "name": "llm_judge"} + assert "_lock" not in result + + +# --------------------------------------------------------------------------- +# _canonical — callables (the real-world evaluator path) +# --------------------------------------------------------------------------- + + +class TestCanonicalCallable: + def test_bare_function(self) -> None: + result = _canonical(bare_function) + assert result == {"fn": "bare_function"} + + def test_partial_captures_qualname_and_kwargs(self) -> None: + bound = functools.partial(parameterized_function, keywords=["paris"]) + result = _canonical(bound) + assert result == { + "fn": "parameterized_function", + "args": [], + "kwargs": {"keywords": ["paris"]}, + } + + def test_partial_different_kwargs_different_canonical(self) -> None: + a = functools.partial(parameterized_function, keywords=["paris"]) + b = functools.partial(parameterized_function, keywords=["lyon"]) + assert _canonical(a) != _canonical(b) + + def test_partial_same_kwargs_same_canonical(self) -> None: + a = functools.partial(parameterized_function, keywords=["paris"]) + b = functools.partial(parameterized_function, keywords=["paris"]) + assert _canonical(a) == _canonical(b) + + +# --------------------------------------------------------------------------- +# _canonical — evaluator_identity (explicit, user-controlled) +# --------------------------------------------------------------------------- + + +class TestCanonicalEvaluatorIdentity: + def test_evaluator_identity_takes_precedence(self) -> None: + """evaluator_identity() is used over introspection when available.""" + + class CustomScorer: + def __init__(self, model: str, temperature: float): + self.model = model + self.temperature = temperature + self._client = object() # 
runtime state, not config + + def evaluator_identity(self) -> dict: + return {"model": self.model, "temperature": self.temperature} + + result = _canonical(CustomScorer(model="gpt-4", temperature=0.7)) + assert result == {"model": "gpt-4", "temperature": 0.7} + + def test_evaluator_identity_on_dataclass_overrides_introspection(self) -> None: + """evaluator_identity() wins even if the object is a dataclass.""" + + @dataclasses.dataclass + class VersionedEvaluator: + threshold: float + version: int = 1 + + def evaluator_identity(self) -> dict: + return {"v": self.version, "t": self.threshold} + + result = _canonical(VersionedEvaluator(threshold=0.8, version=2)) + assert result == {"v": 2, "t": 0.8} + + def test_evaluator_identity_different_config_different_hash(self) -> None: + class CustomScorer: + def __init__(self, model: str): + self.model = model + + def evaluator_identity(self) -> dict: + return {"model": self.model} + + h1 = compute_eval_hash([CustomScorer(model="gpt-4")]) + h2 = compute_eval_hash([CustomScorer(model="claude")]) + assert h1 != h2 + + def test_evaluator_identity_same_config_same_hash(self) -> None: + class CustomScorer: + def __init__(self, model: str): + self.model = model + + def evaluator_identity(self) -> dict: + return {"model": self.model} + + h1 = compute_eval_hash([CustomScorer(model="gpt-4")]) + h2 = compute_eval_hash([CustomScorer(model="gpt-4")]) + assert h1 == h2 + + +# --------------------------------------------------------------------------- +# _canonical — fail-hard on unknown types +# --------------------------------------------------------------------------- + + +class TestCanonicalFailHard: + def test_unknown_type_raises_canonical_error(self) -> None: + class Opaque: + pass + + with pytest.raises(CanonicalError, match="Opaque"): + _canonical(Opaque()) + + def test_non_callable_non_dataclass_raises(self) -> None: + with pytest.raises(CanonicalError): + _canonical(object()) + + def 
test_error_message_mentions_evaluator_identity(self) -> None: + class Opaque: + pass + + with pytest.raises(CanonicalError, match="evaluator_identity"): + _canonical(Opaque()) + + +# --------------------------------------------------------------------------- +# compute_case_hash +# --------------------------------------------------------------------------- + + +class TestComputeCaseHash: + def test_same_inputs_same_hash(self) -> None: + h1 = compute_case_hash("hello", "expected") + h2 = compute_case_hash("hello", "expected") + assert h1 == h2 + + def test_different_inputs_different_hash(self) -> None: + h1 = compute_case_hash("hello", "expected") + h2 = compute_case_hash("world", "expected") + assert h1 != h2 + + def test_none_expected_is_stable(self) -> None: + h1 = compute_case_hash("hello", None) + h2 = compute_case_hash("hello", None) + assert h1 == h2 + + def test_dict_inputs(self) -> None: + h1 = compute_case_hash({"q": "hello", "context": "world"}, "expected") + h2 = compute_case_hash({"context": "world", "q": "hello"}, "expected") + assert h1 == h2, "dict key order should not affect hash" + + +# --------------------------------------------------------------------------- +# compute_eval_hash +# --------------------------------------------------------------------------- class TestComputeEvalHash: @@ -67,6 +269,21 @@ def test_different_thresholds_produce_different_hashes(self) -> None: def test_evaluator_with_lock_does_not_crash(self) -> None: """Regression for non-picklable evaluator fields.""" ev = LockHoldingEvaluator(name="llm_judge") - # Should not raise TypeError about cannot pickle '_thread.lock' hash_val = compute_eval_hash([ev]) assert len(hash_val) == 12 + + def test_partial_evaluators_hash_stably(self) -> None: + ev = functools.partial(parameterized_function, keywords=["paris"]) + h1 = compute_eval_hash([ev]) + h2 = compute_eval_hash([ev]) + assert h1 == h2 + + def test_bare_function_evaluator(self) -> None: + h1 = compute_eval_hash([bare_function]) 
+ h2 = compute_eval_hash([bare_function]) + assert h1 == h2 + + def test_different_partial_kwargs_different_hash(self) -> None: + ev_a = functools.partial(parameterized_function, keywords=["paris"]) + ev_b = functools.partial(parameterized_function, keywords=["lyon"]) + assert compute_eval_hash([ev_a]) != compute_eval_hash([ev_b]) From d7fbba375821d8cbf8aa4bea10fd6ea795f44dfa Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 31 Mar 2026 23:50:32 +0200 Subject: [PATCH 14/60] refactor: replace kind string literals with SuiteKind StrEnum Type-safe suite kind across the codebase. StrEnum keeps JSON compat (SuiteKind.EVAL == "eval") so no migration needed. --- protest/core/session.py | 5 +++-- protest/core/suite.py | 5 +++-- protest/entities/__init__.py | 2 ++ protest/entities/core.py | 9 ++++++++- protest/evals/history.py | 3 ++- protest/filters/kind.py | 7 ++++--- protest/history/plugin.py | 3 ++- tests/evals/test_e2e.py | 5 +++-- 8 files changed, 27 insertions(+), 12 deletions(-) diff --git a/protest/core/session.py b/protest/core/session.py index 7ea04f3..910d032 100644 --- a/protest/core/session.py +++ b/protest/core/session.py @@ -22,6 +22,7 @@ FixtureScope, Retry, Skip, + SuiteKind, TestRegistration, Xfail, normalize_retry, @@ -235,7 +236,7 @@ def decorator(func: FuncT) -> FuncT: suite = ProTestSuite( name=suite_name, tags=list(tags or []), - kind="eval", + kind=SuiteKind.EVAL, metadata=suite_meta, ) wrapper = make_eval_wrapper( @@ -372,7 +373,7 @@ def activate_plugins(self, ctx: PluginContext) -> None: self.register_plugin(instance) # Auto-wire eval support if any suite has kind="eval" - if any(s.kind == "eval" for s in self._suites): + if any(s.kind == SuiteKind.EVAL for s in self._suites): self._wire_eval_support() def _wire_eval_support(self) -> None: diff --git a/protest/core/suite.py b/protest/core/suite.py index 262d908..99b4fa2 100644 --- a/protest/core/suite.py +++ b/protest/core/suite.py @@ -14,6 
+14,7 @@ FixtureRegistration, Retry, Skip, + SuiteKind, SuitePath, TestRegistration, Xfail, @@ -49,7 +50,7 @@ def __init__( # noqa: PLR0913 max_concurrency: int | None = None, tags: list[str] | None = None, description: str | None = None, - kind: str = "test", + kind: SuiteKind = SuiteKind.TEST, metadata: dict[str, Any] | None = None, ) -> None: if max_concurrency is not None and max_concurrency < 1: @@ -76,7 +77,7 @@ def description(self) -> str | None: return self._description @property - def kind(self) -> str: + def kind(self) -> SuiteKind: return self._kind @property diff --git a/protest/entities/__init__.py b/protest/entities/__init__.py index 30bd04e..3016ebb 100644 --- a/protest/entities/__init__.py +++ b/protest/entities/__init__.py @@ -4,6 +4,7 @@ FixtureMarker, FixtureRegistration, FixtureScope, + SuiteKind, TestItem, TestOutcome, TestRegistration, @@ -48,6 +49,7 @@ "SessionResult", "SessionSetupInfo", "Skip", + "SuiteKind", "SuitePath", "SuiteResult", "SuiteSetupInfo", diff --git a/protest/entities/core.py b/protest/entities/core.py index f5efa22..5a8c680 100644 --- a/protest/entities/core.py +++ b/protest/entities/core.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass, field -from enum import Enum +from enum import Enum, StrEnum from typing import TYPE_CHECKING, Any, TypeAlias if TYPE_CHECKING: @@ -20,6 +20,13 @@ FixtureCallable: TypeAlias = "Callable[..., Any]" +class SuiteKind(StrEnum): + """Kind of suite — determines behavior (eval wiring, history, reporting).""" + + TEST = "test" + EVAL = "eval" + + class FixtureScope(Enum): """Scope level for fixtures.""" diff --git a/protest/evals/history.py b/protest/evals/history.py index b607566..725c5b3 100644 --- a/protest/evals/history.py +++ b/protest/evals/history.py @@ -6,6 +6,7 @@ from datetime import datetime, timezone from typing import TYPE_CHECKING, Any +from protest.entities import SuiteKind from protest.history.collector import collect_env_info, 
collect_git_info from protest.history.storage import DEFAULT_HISTORY_DIR, HISTORY_FILE, append_entry from protest.plugin import PluginBase @@ -52,7 +53,7 @@ def setup(self, session: ProTestSession) -> None: """Collect per-suite metadata from session.""" self._suite_metadata = {} for suite in session.suites: - if suite.kind == "eval": + if suite.kind == SuiteKind.EVAL: self._suite_metadata[suite.name] = suite.suite_metadata def on_eval_suite_end(self, report: EvalSuiteReport) -> None: diff --git a/protest/filters/kind.py b/protest/filters/kind.py index 859e7dd..076684a 100644 --- a/protest/filters/kind.py +++ b/protest/filters/kind.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING +from protest.entities import SuiteKind from protest.plugin import PluginBase if TYPE_CHECKING: @@ -17,14 +18,14 @@ class KindFilterPlugin(PluginBase): name = "kind-filter" description = "Filter by suite kind" - def __init__(self, kind: str) -> None: + def __init__(self, kind: SuiteKind) -> None: self._kind = kind @classmethod def activate(cls, ctx: PluginContext) -> KindFilterPlugin | None: kind = ctx.get("kind_filter") if kind: - return cls(kind=kind) + return cls(kind=SuiteKind(kind)) return None def on_collection_finish(self, items: list[TestItem]) -> list[TestItem]: @@ -32,5 +33,5 @@ def on_collection_finish(self, items: list[TestItem]) -> list[TestItem]: def _matches(self, item: TestItem) -> bool: if item.suite is None: - return self._kind == "test" + return self._kind == SuiteKind.TEST return item.suite.kind == self._kind diff --git a/protest/history/plugin.py b/protest/history/plugin.py index e216fe6..c8a0f79 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -14,6 +14,7 @@ from pathlib import Path from protest.core.session import ProTestSession + from protest.entities import SuiteKind from protest.entities.events import SessionResult, TestResult from protest.plugin import PluginContext @@ -28,7 +29,7 @@ def __init__(self, history_dir: Path | None = None) 
-> None: self._history_dir = history_dir or DEFAULT_HISTORY_DIR self._history_file = self._history_dir / HISTORY_FILE self._suites: dict[str, dict[str, dict[str, Any]]] = {} - self._suite_kinds: dict[str, str] = {} + self._suite_kinds: dict[str, SuiteKind] = {} self._default_suite_name: str = "tests" self._history_enabled: bool = False self._metadata: dict[str, Any] = {} diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 9bdaead..72ef8ff 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -23,6 +23,7 @@ from protest.core.collector import Collector from protest.core.runner import TestRunner from protest.core.suite import ProTestSuite +from protest.entities import SuiteKind from protest.evals import ( EvalContext, EvalSession, @@ -191,7 +192,7 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: def test_kind_filter_keeps_only_matching(self) -> None: test_suite = ProTestSuite("tests") - eval_suite = ProTestSuite("evals", kind="eval") + eval_suite = ProTestSuite("evals", kind=SuiteKind.EVAL) session = ProTestSession() @@ -210,7 +211,7 @@ def eval_one() -> None: assert len(items) == 2 # Filter to eval only - plugin = KindFilterPlugin(kind="eval") + plugin = KindFilterPlugin(kind=SuiteKind.EVAL) filtered = plugin.on_collection_finish(items) assert len(filtered) == 1 assert filtered[0].suite.kind == "eval" From 905d3c8308ef38339efd7c522b5a782b1b3d7a52 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Wed, 1 Apr 2026 00:59:36 +0200 Subject: [PATCH 15/60] refactor: move lazy imports to top-level, remove PLC0415 per-file ignores MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 28 lazy imports in protest/, none resolving a real circular dependency. Moved all to top-level except justified cases (optional deps like rich, conditional wiring, and one true circular import in evals/__init__.py). 
Removed blanket PLC0415 per-file-ignores from pyproject.toml — remaining suppressions use inline noqa with justification. --- protest/api.py | 40 ++++++++++----------------------- protest/cli/history.py | 19 ++++++++-------- protest/cli/main.py | 26 +++++++++------------ protest/core/session.py | 15 +++++++------ protest/evals/__init__.py | 2 +- protest/evals/evaluator.py | 20 +++++++++++------ protest/evals/history.py | 16 +++++++------ protest/evals/results_writer.py | 5 +---- protest/evals/session.py | 3 +-- protest/evals/wrapper.py | 12 +++------- protest/history/collector.py | 2 +- pyproject.toml | 5 ----- 12 files changed, 68 insertions(+), 97 deletions(-) diff --git a/protest/api.py b/protest/api.py index a6c6f79..ce8c178 100644 --- a/protest/api.py +++ b/protest/api.py @@ -14,21 +14,27 @@ def test_example(): assert True success = run_session(session) - -Note: - This module uses lazy imports (PLC0415) to optimize startup time. - Users importing `from protest.api import run_session` shouldn't pay - the cost of loading the entire framework until they actually call it. 
""" from __future__ import annotations +import asyncio from typing import TYPE_CHECKING +from protest.core.collector import Collector +from protest.core.runner import TestRunner +from protest.core.suite import ( + ProTestSuite, # noqa: TC001 — used at runtime in list_tags +) +from protest.events.types import Event +from protest.filters.keyword import KeywordFilterPlugin +from protest.filters.suite import SuiteFilterPlugin +from protest.plugin import PluginBase, PluginContext +from protest.tags.plugin import TagFilterPlugin + if TYPE_CHECKING: from protest.core.session import ProTestSession from protest.entities import RunResult, TestItem - from protest.plugin import PluginContext def run_session( # noqa: PLR0913 - public API with many optional params @@ -69,10 +75,6 @@ def run_session( # noqa: PLR0913 - public API with many optional params Returns: RunResult with success status and interrupted flag. """ - from protest.core.runner import ( # noqa: PLC0415 - lazy import for startup perf - TestRunner, - ) - # Apply session-level settings from ctx or params if ctx is not None: if ctx.get("concurrency") is not None: @@ -91,10 +93,6 @@ def run_session( # noqa: PLR0913 - public API with many optional params # Build context from parameters if not provided if ctx is None: - from protest.plugin import ( # noqa: PLC0415 - lazy import for startup perf - PluginContext, - ) - ctx = PluginContext( args={ "last_failed": last_failed, @@ -136,16 +134,6 @@ def collect_tests( # noqa: PLR0913 - public API with many optional params Returns: List of collected TestItem objects. 
""" - # Lazy imports for startup performance - only load when function is called - import asyncio # noqa: PLC0415 - - from protest.core.collector import Collector # noqa: PLC0415 - from protest.events.types import Event # noqa: PLC0415 - from protest.filters.keyword import KeywordFilterPlugin # noqa: PLC0415 - from protest.filters.suite import SuiteFilterPlugin # noqa: PLC0415 - from protest.plugin import PluginBase, PluginContext # noqa: PLC0415 - from protest.tags.plugin import TagFilterPlugin # noqa: PLC0415 - # Build context from parameters if not provided if ctx is None: ctx = PluginContext( @@ -182,10 +170,6 @@ def list_tags(session: ProTestSession) -> set[str]: Returns: Set of all tag names declared on fixtures, suites, and tests. """ - from protest.core.suite import ( # noqa: PLC0415, TC001 - lazy import for startup perf - ProTestSuite, - ) - all_tags: set[str] = set() for fixture_reg in session.fixtures: diff --git a/protest/cli/history.py b/protest/cli/history.py index 33b230b..cb00787 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -4,8 +4,11 @@ import argparse import sys +from pathlib import Path from typing import Any +from protest.history.storage import clean_dirty, load_history + def handle_history_command(argv: list[str]) -> None: """Entry point for `protest history`.""" @@ -42,10 +45,6 @@ def handle_history_command(argv: list[str]) -> None: ) args = parser.parse_args(argv) - from pathlib import Path - - from protest.history.storage import clean_dirty, load_history - history_dir = Path(args.path) if args.path else None if args.clean_dirty: @@ -165,12 +164,12 @@ class _RichOutput(_Output): """Rich output with colors, tables, panels.""" def __init__(self) -> None: - from rich.console import Console + from rich.console import Console # noqa: PLC0415 — optional dep self.console = Console(highlight=False) def stats(self, entries: list[dict[str, Any]]) -> None: - from rich.table import Table + from rich.table import Table # noqa: 
PLC0415 — optional dep suites = _aggregate_suites(entries) if not suites: @@ -230,8 +229,8 @@ def runs(self, entries: list[dict[str, Any]]) -> None: self.console.print() def detail(self, entry: dict[str, Any]) -> None: - from rich.panel import Panel - from rich.text import Text + from rich.panel import Panel # noqa: PLC0415 — optional dep + from rich.text import Text # noqa: PLC0415 — optional dep kind = "EVAL" if entry.get("evals") else "TEST" git = entry.get("git") or {} @@ -284,8 +283,8 @@ def detail(self, entry: dict[str, Any]) -> None: ) def compare(self, current: dict[str, Any], previous: dict[str, Any]) -> None: - from rich.panel import Panel - from rich.text import Text + from rich.panel import Panel # noqa: PLC0415 — optional dep + from rich.text import Text # noqa: PLC0415 — optional dep cm = _get_display_model(current) pm = _get_display_model(previous) diff --git a/protest/cli/main.py b/protest/cli/main.py index 0ee6f2a..648fd26 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -4,10 +4,14 @@ import sys from typing import TYPE_CHECKING, Any +from protest.api import collect_tests, list_tags, run_session +from protest.core.session import ProTestSession +from protest.loader import LoadError, load_session, parse_target +from protest.plugin import PluginContext +from protest.reporting.verbosity import Verbosity + if TYPE_CHECKING: - from protest.core.session import ProTestSession from protest.entities import TestItem - from protest.plugin import PluginContext HELP_EPILOG = """ Examples: @@ -56,9 +60,6 @@ def _handle_tags_command() -> None: def _list_tags(target: str, app_dir: str, recursive: bool = False) -> None: """List all tags in a session.""" - from protest.api import collect_tests, list_tags - from protest.loader import LoadError, load_session - try: session = load_session(target, app_dir) except LoadError as exc: @@ -136,7 +137,7 @@ def _handle_live_command() -> None: ) args = parser.parse_args(sys.argv[2:]) - from protest.reporting.web 
import run_live_server + from protest.reporting.web import run_live_server # noqa: PLC0415 — optional dep run_live_server(port=args.port) @@ -234,15 +235,15 @@ def _create_run_parser() -> argparse.ArgumentParser: def _handle_history_command() -> None: """Handle 'protest history' subcommand.""" - from protest.cli.history import handle_history_command + from protest.cli.history import ( # noqa: PLC0415 — heavy module + handle_history_command, + ) handle_history_command(sys.argv[2:]) def _handle_run_command(kind_filter: str | None = None) -> None: """Handle 'protest run' / 'protest eval' with two-phase parsing.""" - from protest.loader import LoadError, load_session, parse_target - argv = sys.argv[2:] # Phase 1: Parse base args to get target @@ -251,8 +252,6 @@ def _handle_run_command(kind_filter: str | None = None) -> None: # If --help without target, show full help with all plugin options if ("--help" in remaining or "-h" in remaining) and not base_args.target: - from protest.core.session import ProTestSession - full_parser = _create_run_parser() for plugin_class in ProTestSession.default_plugin_classes(): plugin_class.add_cli_options(full_parser) @@ -282,9 +281,6 @@ def _handle_run_command(kind_filter: str | None = None) -> None: args = full_parser.parse_args(argv) # Phase 5: Build context - from protest.plugin import PluginContext - from protest.reporting.verbosity import Verbosity - effective_verbosity = Verbosity.QUIET if args.quiet else args.verbosity ctx_args: dict[str, Any] = { **vars(args), @@ -304,8 +300,6 @@ def run_tests( ctx: PluginContext, collect_only: bool = False, ) -> None: - from protest.api import collect_tests, run_session - if collect_only: items = collect_tests(session, ctx=ctx) print(f"Collected {len(items)} test(s):\n") diff --git a/protest/core/session.py b/protest/core/session.py index 910d032..efef4fb 100644 --- a/protest/core/session.py +++ b/protest/core/session.py @@ -8,13 +8,13 @@ from types import TracebackType from protest.compat 
import Self - from protest.core.suite import ProTestSuite from protest.entities import FixtureCallable from protest.evals.types import JudgeInfo, ModelInfo from protest.plugin import PluginBase, PluginContext from protest.cache.plugin import CachePlugin from protest.cache.storage import CacheStorage +from protest.core.suite import ProTestSuite from protest.di.container import FixtureContainer from protest.di.decorators import get_fixture_marker, unwrap_fixture from protest.entities import ( @@ -29,6 +29,9 @@ normalize_skip, normalize_xfail, ) +from protest.evals.history import EvalHistoryPlugin +from protest.evals.results_writer import EvalResultsWriter +from protest.evals.wrapper import make_eval_wrapper from protest.events.bus import EventBus from protest.events.types import Event from protest.exceptions import InvalidMaxConcurrencyError @@ -223,8 +226,6 @@ def eval( async def my_eval(case: Annotated[dict, From(cases)]) -> str: return await run(case["q"]) """ - from protest.core.suite import ProTestSuite - from protest.evals.wrapper import make_eval_wrapper def decorator(func: FuncT) -> FuncT: suite_name = name or func.__name__ @@ -331,7 +332,9 @@ def register_default_plugins(self) -> None: for plugin_class in self.default_plugin_classes(): self.use(plugin_class) if self._history: - from protest.history.plugin import HistoryPlugin + from protest.history.plugin import ( # noqa: PLC0415 — conditional + HistoryPlugin, + ) self.register_plugin(HistoryPlugin(history_dir=self._history_dir)) @@ -378,8 +381,6 @@ def activate_plugins(self, ctx: PluginContext) -> None: def _wire_eval_support(self) -> None: """Wire eval history + results writer plugins (no EvalPlugin).""" - from protest.evals.history import EvalHistoryPlugin - from protest.evals.results_writer import EvalResultsWriter judge_dict = None if self._eval_judge: @@ -451,7 +452,7 @@ async def __aexit__( exc_val: BaseException | None, exc_tb: TracebackType | None, ) -> bool: - import time + import time # noqa: 
PLC0415 — only needed in __aexit__ teardown_start = time.perf_counter() set_session_teardown_capture(True) diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index 628a275..d90b8f4 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -47,7 +47,7 @@ def __getattr__(name: str) -> object: # and reporters import protest.evals.types — eagerly importing # EvalSession here would create a circular import chain. if name == "EvalSession": - from protest.evals.session import EvalSession + from protest.evals.session import EvalSession # noqa: PLC0415 — circular import return EvalSession msg = f"module {__name__!r} has no attribute {name!r}" diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 7fdf6c7..5d7a9f8 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -36,7 +36,19 @@ async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: import functools import inspect from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Generic, TypeVar +from typing import ( + TYPE_CHECKING, + Annotated, + Any, + Generic, + TypeVar, + get_args, + get_origin, + get_type_hints, +) + +from protest.evals.hashing import _canonical +from protest.evals.types import EvalScore if TYPE_CHECKING: from protest.evals.types import Judge @@ -159,8 +171,6 @@ def __init__(self, evaluators: list[Any]) -> None: def evaluator_identity(self) -> dict[str, Any]: """Identity is the ordered list of inner evaluators.""" - from protest.evals.hashing import _canonical - return {"short_circuit": [_canonical(e) for e in self.evaluators]} @@ -186,10 +196,6 @@ def extract_scores_from_result(result: Any, evaluator_name: str) -> list[Any]: Raises: TypeError: If result is not bool or dataclass. 
""" - from typing import Annotated, get_args, get_origin, get_type_hints - - from protest.evals.types import EvalScore - if isinstance(result, bool): return [EvalScore(name=evaluator_name, value=result)] diff --git a/protest/evals/history.py b/protest/evals/history.py index 725c5b3..010ddb8 100644 --- a/protest/evals/history.py +++ b/protest/evals/history.py @@ -8,7 +8,13 @@ from protest.entities import SuiteKind from protest.history.collector import collect_env_info, collect_git_info -from protest.history.storage import DEFAULT_HISTORY_DIR, HISTORY_FILE, append_entry +from protest.history.storage import ( + DEFAULT_HISTORY_DIR, + HISTORY_FILE, + append_entry, + load_history, + load_previous_run, +) from protest.plugin import PluginBase if TYPE_CHECKING: @@ -75,8 +81,6 @@ def on_session_end(self, _result: Any) -> None: def load_entries(self, n: int | None = None) -> list[dict[str, Any]]: """Load entries from history file.""" - from protest.history.storage import load_history - return load_history(history_dir=self._history_dir, n=n, evals_only=True) @@ -156,8 +160,6 @@ def _serialize_case(case: EvalCaseResult) -> dict[str, Any]: return entry -def load_previous_run(history_dir: Any = None) -> dict[str, Any] | None: +def load_previous_eval_run(history_dir: Any = None) -> dict[str, Any] | None: """Load the most recent eval run from history.""" - from protest.history.storage import load_previous_run as _load - - return _load(history_dir=history_dir, evals_only=True) + return load_previous_run(history_dir=history_dir, evals_only=True) diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py index 0c670a8..e069bba 100644 --- a/protest/evals/results_writer.py +++ b/protest/evals/results_writer.py @@ -11,11 +11,11 @@ from pathlib import Path from typing import TYPE_CHECKING, Any +from protest.evals.types import EvalCaseResult, EvalScore, EvalSuiteReport from protest.plugin import PluginBase if TYPE_CHECKING: from protest.entities.events import 
TestResult - from protest.evals.types import EvalCaseResult, EvalScore from protest.plugin import PluginContext DEFAULT_RESULTS_DIR = Path(".protest") / "results" @@ -57,7 +57,6 @@ def _write_case_file(self, case_result: EvalCaseResult, suite_name: str) -> None def on_eval_suite_end(self, report: Any) -> None: """Print results dir path for the suite.""" - from protest.evals.types import EvalSuiteReport if not isinstance(report, EvalSuiteReport): return @@ -68,8 +67,6 @@ def on_eval_suite_end(self, report: Any) -> None: def _build_case_result(result: TestResult, passed: bool) -> EvalCaseResult: """Build EvalCaseResult from a TestResult with eval_payload.""" - from protest.evals.types import EvalCaseResult, EvalScore - payload = result.eval_payload assert payload is not None return EvalCaseResult( diff --git a/protest/evals/session.py b/protest/evals/session.py index ddace3d..09f0d5c 100644 --- a/protest/evals/session.py +++ b/protest/evals/session.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Any from protest.core.session import ProTestSession +from protest.evals.types import JudgeInfo if TYPE_CHECKING: from pathlib import Path @@ -43,8 +44,6 @@ def __init__( self._eval_model = model self._eval_judge_instance: Judge | None = judge if judge is not None: - from protest.evals.types import JudgeInfo - self._eval_judge = JudgeInfo(name=judge.name, provider=judge.provider) else: self._eval_judge = None diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index 82b21ad..9526acd 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -18,7 +18,9 @@ ShortCircuit, extract_scores_from_result, ) -from protest.evals.types import EvalScore +from protest.evals.hashing import compute_case_hash, compute_eval_hash +from protest.evals.types import EvalScore, TaskResult +from protest.exceptions import FixtureError def make_eval_wrapper( @@ -44,8 +46,6 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: task_duration = time.perf_counter() - start 
# Unwrap TaskResult if returned - from protest.evals.types import TaskResult - task_input_tokens = 0 task_output_tokens = 0 task_cost = 0.0 @@ -72,8 +72,6 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: judge=judge, ) - from protest.evals.hashing import compute_case_hash, compute_eval_hash - return EvalPayload( case_name=case_name, passed=all(s.passed for s in scores), @@ -203,8 +201,6 @@ async def run_evaluators( result = await raw if asyncio.iscoroutine(raw) else raw scores.extend(extract_scores_from_result(result, evaluator_name)) except Exception as exc: - from protest.exceptions import FixtureError - raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc return scores, ctx @@ -222,8 +218,6 @@ async def _run_short_circuit( raw = ev(ctx) result = await raw if asyncio.iscoroutine(raw) else raw except Exception as exc: - from protest.exceptions import FixtureError - raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc extracted = extract_scores_from_result(result, evaluator_name) scores.extend(extracted) diff --git a/protest/history/collector.py b/protest/history/collector.py index e81eefd..ee8bb1a 100644 --- a/protest/history/collector.py +++ b/protest/history/collector.py @@ -74,7 +74,7 @@ def _git(*args: str) -> str: def _get_pkg_version(name: str) -> str | None: try: - from importlib.metadata import version + from importlib.metadata import version # noqa: PLC0415 — inside try/except return version(name) except Exception: diff --git a/pyproject.toml b/pyproject.toml index 92373eb..c1da068 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,28 +96,23 @@ ignore = [ ] "protest/cli/**" = [ "T201", # print allowed in CLI - "PLC0415", # lazy imports for fast --help "PLR2004", # magic values for arg parsing ] "protest/core/session.py" = [ - "PLC0415", # lazy import for optional rich dependency "PLR0913", # many args is deliberate API design ] "protest/core/execution/test_executor.py" = [ "PLR0915", # _run_test is inherently complex 
(retry loop + eval capture) ] "protest/history/**" = [ - "PLC0415", # lazy imports "S603", # subprocess git calls are safe "PLR0913", # load_history has many filter params by design ] "protest/cli/history.py" = [ "T201", # print for CLI output - "PLC0415", # lazy imports ] "protest/evals/**" = [ "T201", # print for eval reporting - "PLC0415", # lazy imports for optional pydantic-evals dependency "PLR0913", # adapter functions have many params by design ] "protest/reporting/ascii.py" = [ From b703fa8f19afa7ab9e004de9c34f872d3b9e8deb Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Wed, 1 Apr 2026 01:29:20 +0200 Subject: [PATCH 16/60] fix: resolve all 32 mypy errors, type EvalContext generics properly - Type built-in evaluators as EvalContext[Any, str] (text evaluators) - not_empty typed EvalContext[Any, Any] (works on any output) - Fix mypy running outside venv (uv run mypy in justfile) - Add mypy config in pyproject.toml with rich stubs override - Fix no-any-return, arg-type, unused type-ignore across codebase - Remove stale type: ignore[import-not-found] on rich imports --- justfile | 2 +- protest/cli/history.py | 4 ++-- protest/console.py | 2 +- protest/core/outcome.py | 6 ++++-- protest/evals/evaluator.py | 2 +- protest/evals/evaluators.py | 20 ++++++++++---------- protest/evals/wrapper.py | 2 +- protest/history/storage.py | 2 +- protest/reporting/rich_reporter.py | 4 ++-- pyproject.toml | 7 +++++++ 10 files changed, 30 insertions(+), 21 deletions(-) diff --git a/justfile b/justfile index ddce526..9ddfe7b 100644 --- a/justfile +++ b/justfile @@ -7,7 +7,7 @@ @lint: ruff format . ruff check --fix . - mypy --strict protest + uv run mypy protest @fullcheck: ruff format --check . && ruff check . 
# lint diff --git a/protest/cli/history.py b/protest/cli/history.py index cb00787..e83216d 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -447,8 +447,8 @@ def _track_cases(suite: dict[str, Any], cases: dict[str, Any]) -> None: def _get_display_model(entry: dict[str, Any]) -> str: """Get display model: per-suite models if they differ, global otherwise.""" - suite_models = { - sd.get("model") + suite_models: set[str] = { + sd["model"] for sd in entry.get("suites", {}).values() if isinstance(sd, dict) and sd.get("model") } diff --git a/protest/console.py b/protest/console.py index 29dd381..9959165 100644 --- a/protest/console.py +++ b/protest/console.py @@ -44,7 +44,7 @@ def print(msg: str, *, raw: bool = False) -> None: # Call handlers directly (sync, bypasses async emit). # This ensures messages appear immediately, not after the test. - for handler_entry in bus._handlers.get(Event.USER_PRINT, []): # type: ignore[union-attr] + for handler_entry in bus._handlers.get(Event.USER_PRINT, []): # type: ignore[attr-defined] with contextlib.suppress(Exception): handler_entry.func((msg, raw)) diff --git a/protest/core/outcome.py b/protest/core/outcome.py index 0018812..2563d95 100644 --- a/protest/core/outcome.py +++ b/protest/core/outcome.py @@ -111,8 +111,10 @@ def _build_skip(self, er: TestExecutionResult) -> TestOutcome: def _build_pass(self, er: TestExecutionResult) -> TestOutcome: return TestOutcome( - TestResult(**self._base_kwargs(er)), TestCounts(passed=1), Event.TEST_PASS - ) # type: ignore[arg-type] + TestResult(**self._base_kwargs(er)), # type: ignore[arg-type] + TestCounts(passed=1), + Event.TEST_PASS, + ) def _build_xpass(self, er: TestExecutionResult) -> TestOutcome: kw = self._base_kwargs(er) diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 5d7a9f8..fac20ed 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -260,7 +260,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: def 
is_async_evaluator(fn: Any) -> bool: """Check if an evaluator (or partial thereof) is async.""" if hasattr(fn, "_is_async_evaluator"): - return fn._is_async_evaluator + return bool(fn._is_async_evaluator) if isinstance(fn, functools.partial): return asyncio.iscoroutinefunction(fn.func) return asyncio.iscoroutinefunction(fn) diff --git a/protest/evals/evaluators.py b/protest/evals/evaluators.py index d2cd632..ec7d9bd 100644 --- a/protest/evals/evaluators.py +++ b/protest/evals/evaluators.py @@ -10,7 +10,7 @@ import json as json_module import re from dataclasses import dataclass -from typing import Annotated +from typing import Annotated, Any from protest.evals.evaluator import EvalContext, Metric, Verdict, evaluator @@ -45,7 +45,7 @@ class WordOverlapResult: @evaluator def contains_keywords( - ctx: EvalContext, keywords: list[str], min_recall: float = 0.0 + ctx: EvalContext[Any, str], keywords: list[str], min_recall: float = 0.0 ) -> ContainsKeywordsResult: """Check that the output contains expected keywords (case-insensitive).""" output_lower = ctx.output.lower() @@ -59,7 +59,7 @@ def contains_keywords( @evaluator -def contains_expected(ctx: EvalContext, case_sensitive: bool = False) -> bool: +def contains_expected(ctx: EvalContext[Any, str], case_sensitive: bool = False) -> bool: """Check that the output contains expected_output as a substring.""" if ctx.expected_output is None: return True @@ -70,7 +70,7 @@ def contains_expected(ctx: EvalContext, case_sensitive: bool = False) -> bool: @evaluator def does_not_contain( - ctx: EvalContext, forbidden: list[str], case_sensitive: bool = False + ctx: EvalContext[Any, str], forbidden: list[str], case_sensitive: bool = False ) -> DoesNotContainResult: """Check that the output does not contain forbidden words.""" output = ctx.output if case_sensitive else ctx.output.lower() @@ -79,7 +79,7 @@ def does_not_contain( @evaluator -def not_empty(ctx: EvalContext) -> bool: +def not_empty(ctx: EvalContext[Any, Any]) -> bool: 
"""Check that the output is not empty or whitespace-only.""" if ctx.output is None: return False @@ -89,7 +89,7 @@ def not_empty(ctx: EvalContext) -> bool: @evaluator -def max_length(ctx: EvalContext, max_chars: int = 500) -> MaxLengthResult: +def max_length(ctx: EvalContext[Any, str], max_chars: int = 500) -> MaxLengthResult: """Check that the output doesn't exceed a character limit.""" length = len(ctx.output) return MaxLengthResult( @@ -99,20 +99,20 @@ def max_length(ctx: EvalContext, max_chars: int = 500) -> MaxLengthResult: @evaluator -def min_length(ctx: EvalContext, min_chars: int = 1) -> bool: +def min_length(ctx: EvalContext[Any, str], min_chars: int = 1) -> bool: """Check that the output meets a minimum length.""" return len(ctx.output) >= min_chars @evaluator -def matches_regex(ctx: EvalContext, pattern: str, flags: int = 0) -> bool: +def matches_regex(ctx: EvalContext[Any, str], pattern: str, flags: int = 0) -> bool: """Check that the output matches a regex pattern.""" return bool(re.search(pattern, ctx.output, flags)) @evaluator def json_valid( - ctx: EvalContext, required_keys: list[str] | None = None + ctx: EvalContext[Any, str], required_keys: list[str] | None = None ) -> JsonValidResult: """Check that the output is valid JSON, optionally with required keys.""" if required_keys is None: @@ -131,7 +131,7 @@ def json_valid( @evaluator -def word_overlap(ctx: EvalContext) -> WordOverlapResult: +def word_overlap(ctx: EvalContext[Any, str]) -> WordOverlapResult: """Compute word overlap between output and expected_output (tracking-only).""" if ctx.expected_output is None: return WordOverlapResult(overlap=1.0) diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index 9526acd..bc2569b 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -134,7 +134,7 @@ def _extract_case_name(kwargs: dict[str, Any], fallback: str) -> str: if _is_case_data(v): name = _get(v, "name") if name: - return name + return str(name) return fallback diff 
--git a/protest/history/storage.py b/protest/history/storage.py index 78d35b9..5dbe047 100644 --- a/protest/history/storage.py +++ b/protest/history/storage.py @@ -90,7 +90,7 @@ def load_previous_run( continue if evals_only and entry.get("evals") is None: continue - return entry + return dict(entry) return None diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 981b03f..506641d 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -5,8 +5,8 @@ from pathlib import Path from typing import Any -from rich.console import Console # type: ignore[import-not-found] -from rich.table import Table # type: ignore[import-not-found] +from rich.console import Console +from rich.table import Table from typing_extensions import Self from protest.entities import ( diff --git a/pyproject.toml b/pyproject.toml index c1da068..ff175b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -135,6 +135,13 @@ omit = [ "protest/compat.py", # Version-specific imports, impossible to cover without multi-version CI ] +[tool.mypy] +strict = true + +[[tool.mypy.overrides]] +module = "rich.*" +ignore_missing_imports = true + [tool.pytest.ini_options] testpaths = ["tests"] asyncio_mode = "strict" From 39bd555c777e11a7d799daa47c5e3c2be441dcfe Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Wed, 1 Apr 2026 01:34:55 +0200 Subject: [PATCH 17/60] refactor: remove dead duck-typed evaluator markers, add typed examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove is_async_evaluator(), _is_evaluator, _is_async_evaluator (written but never read — dead code with hasattr duck-typing) - Add yorkshire example evaluators showing EvalContext generics: [Any, str] for text, [str, float] for numeric, [str, bytes] for binary --- examples/yorkshire/evals/evaluators.py | 60 ++++++++++++++++++++++++++ protest/evals/evaluator.py | 13 ------ 2 
files changed, 60 insertions(+), 13 deletions(-) diff --git a/examples/yorkshire/evals/evaluators.py b/examples/yorkshire/evals/evaluators.py index b07153d..1008c22 100644 --- a/examples/yorkshire/evals/evaluators.py +++ b/examples/yorkshire/evals/evaluators.py @@ -2,4 +2,64 @@ Generic evaluators come from protest.evals.evaluators. Only project-specific ones live here. + +These also demonstrate how EvalContext generics document +what an evaluator expects as input/output types. """ + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Annotated, Any + +from protest.evals import EvalContext, Metric, Verdict, evaluator + +# --- Text evaluator: EvalContext[Any, str] --------------------------------- +# Most evaluators work on text output. The first type param (inputs) is Any +# because evaluators don't usually care about the input shape. + + +@dataclass(frozen=True, slots=True) +class MentionsBreedResult: + breed_mentioned: Annotated[bool, Verdict] + + +@evaluator +def mentions_breed( + ctx: EvalContext[Any, str], breed: str = "Yorkshire" +) -> MentionsBreedResult: + """Check that the output mentions a specific breed.""" + return MentionsBreedResult(breed_mentioned=breed.lower() in ctx.output.lower()) + + +# --- Numeric evaluator: EvalContext[str, float] ---------------------------- +# An evaluator for a task that returns a numeric score (e.g. a classifier +# confidence, a similarity metric). The output is a float, not a string. + + +@dataclass(frozen=True, slots=True) +class ConfidenceResult: + confidence: Annotated[float, Metric] + above_threshold: Annotated[bool, Verdict] + + +@evaluator +def confidence_above( + ctx: EvalContext[str, float], threshold: float = 0.8 +) -> ConfidenceResult: + """Check that a numeric output (e.g. 
classifier confidence) meets a threshold.""" + return ConfidenceResult( + confidence=ctx.output, + above_threshold=ctx.output >= threshold, + ) + + +# --- Binary evaluator: EvalContext[str, bytes] ----------------------------- +# An evaluator for a task that returns raw bytes (e.g. image generation, +# audio synthesis). The evaluator checks basic properties of the output. + + +@evaluator +def output_not_empty_bytes(ctx: EvalContext[str, bytes]) -> bool: + """Check that a binary output (e.g. generated image) is not empty.""" + return len(ctx.output) > 0 diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index fac20ed..6d0c980 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -31,7 +31,6 @@ async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: from __future__ import annotations -import asyncio import dataclasses import functools import inspect @@ -243,7 +242,6 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: if has_extra_params and kwargs: bound = functools.partial(fn, **kwargs) # Preserve async detection on the partial - bound._is_async_evaluator = asyncio.iscoroutinefunction(fn) # type: ignore[attr-defined] bound.__name__ = fn.__name__ # type: ignore[attr-defined] bound.__qualname__ = fn.__qualname__ # type: ignore[attr-defined] return bound @@ -252,15 +250,4 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: return fn return fn(*args, **kwargs) - wrapper._is_evaluator = True # type: ignore[attr-defined] - wrapper._is_async_evaluator = asyncio.iscoroutinefunction(fn) # type: ignore[attr-defined] return wrapper - - -def is_async_evaluator(fn: Any) -> bool: - """Check if an evaluator (or partial thereof) is async.""" - if hasattr(fn, "_is_async_evaluator"): - return bool(fn._is_async_evaluator) - if isinstance(fn, functools.partial): - return asyncio.iscoroutinefunction(fn.func) - return asyncio.iscoroutinefunction(fn) From 155db22114d855db5e4fae61eca797f7699379db Mon Sep 17 00:00:00 2001 From: Renaud Cepre 
<32103211+renaudcepre@users.noreply.github.com> Date: Wed, 1 Apr 2026 01:43:12 +0200 Subject: [PATCH 18/60] ci: update workflow to install dependencies and fix mypy invocation --- .github/workflows/ci.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 75efa11..84a54c7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -46,8 +46,11 @@ jobs: with: python-version: "3.12" + - name: Install dependencies + run: uv sync --all-extras + - name: Type check - run: uvx mypy --strict protest + run: uv run mypy protest test: needs: lint From 96d3632c03b6c05cf4fd03fd9ad493eed801f538 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Wed, 1 Apr 2026 01:47:56 +0200 Subject: [PATCH 19/60] refactor: remove redundant type ignores, update dependency management - Removed unnecessary `# type: ignore[import-not-found]` markers on imports. - Added `--group dev` flag to dependency sync in CI workflow. - Updated `uv.lock` to include new packages: `librt` and `mypy`. 
--- .github/workflows/ci.yml | 2 +- protest/reporting/factory.py | 2 +- protest/reporting/web.py | 8 +- pyproject.toml | 5 +- uv.lock | 160 ++++++++++++++++++++++++++++++++++- 5 files changed, 164 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 84a54c7..22a0944 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,7 +47,7 @@ jobs: python-version: "3.12" - name: Install dependencies - run: uv sync --all-extras + run: uv sync --all-extras --group dev - name: Type check run: uv run mypy protest diff --git a/protest/reporting/factory.py b/protest/reporting/factory.py index e3d405a..6d0fbf6 100644 --- a/protest/reporting/factory.py +++ b/protest/reporting/factory.py @@ -18,7 +18,7 @@ def get_reporter(force_no_color: bool = False) -> PluginBase: return AsciiReporter() try: - from rich.console import Console # type: ignore[import-not-found] + from rich.console import Console Console() except ImportError: diff --git a/protest/reporting/web.py b/protest/reporting/web.py index 2e47b5d..517de24 100644 --- a/protest/reporting/web.py +++ b/protest/reporting/web.py @@ -30,12 +30,12 @@ ) try: - from websockets.asyncio.server import ( # type: ignore[import-not-found] + from websockets.asyncio.server import ( serve as ws_serve, ) - from websockets.datastructures import Headers # type: ignore[import-not-found] - from websockets.http11 import Request, Response # type: ignore[import-not-found] - from websockets.sync.client import ( # type: ignore[import-not-found] + from websockets.datastructures import Headers + from websockets.http11 import Request, Response + from websockets.sync.client import ( connect as ws_connect, ) except ImportError as err: # pragma: no cover diff --git a/pyproject.toml b/pyproject.toml index ff175b4..0dbe858 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -138,10 +138,6 @@ omit = [ [tool.mypy] strict = true -[[tool.mypy.overrides]] -module = "rich.*" -ignore_missing_imports = 
true - [tool.pytest.ini_options] testpaths = ["tests"] asyncio_mode = "strict" @@ -174,6 +170,7 @@ include = ["protest*"] dev = [ "jsonschema>=4.0.0", "mkdocs-material>=9.7.0", + "mypy>=1.0", "pre-commit>=4.5.0", "pytest>=9.0.1", "pytest-asyncio>=1.3.0", diff --git a/uv.lock b/uv.lock index 34a6ee8..e4d7032 100644 --- a/uv.lock +++ b/uv.lock @@ -477,6 +477,91 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, ] +[[package]] +name = "librt" +version = "0.8.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/56/9c/b4b0c54d84da4a94b37bd44151e46d5e583c9534c7e02250b961b1b6d8a8/librt-0.8.1.tar.gz", hash = "sha256:be46a14693955b3bd96014ccbdb8339ee8c9346fbe11c1b78901b55125f14c73", size = 177471, upload-time = "2026-02-17T16:13:06.101Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/5f/63f5fa395c7a8a93558c0904ba8f1c8d1b997ca6a3de61bc7659970d66bf/librt-0.8.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:81fd938344fecb9373ba1b155968c8a329491d2ce38e7ddb76f30ffb938f12dc", size = 65697, upload-time = "2026-02-17T16:11:06.903Z" }, + { url = "https://files.pythonhosted.org/packages/ff/e0/0472cf37267b5920eff2f292ccfaede1886288ce35b7f3203d8de00abfe6/librt-0.8.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5db05697c82b3a2ec53f6e72b2ed373132b0c2e05135f0696784e97d7f5d48e7", size = 68376, upload-time = "2026-02-17T16:11:08.395Z" }, + { url = "https://files.pythonhosted.org/packages/c8/be/8bd1359fdcd27ab897cd5963294fa4a7c83b20a8564678e4fd12157e56a5/librt-0.8.1-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d56bc4011975f7460bea7b33e1ff425d2f1adf419935ff6707273c77f8a4ada6", size = 197084, 
upload-time = "2026-02-17T16:11:09.774Z" }, + { url = "https://files.pythonhosted.org/packages/e2/fe/163e33fdd091d0c2b102f8a60cc0a61fd730ad44e32617cd161e7cd67a01/librt-0.8.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5cdc0f588ff4b663ea96c26d2a230c525c6fc62b28314edaaaca8ed5af931ad0", size = 207337, upload-time = "2026-02-17T16:11:11.311Z" }, + { url = "https://files.pythonhosted.org/packages/01/99/f85130582f05dcf0c8902f3d629270231d2f4afdfc567f8305a952ac7f14/librt-0.8.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:97c2b54ff6717a7a563b72627990bec60d8029df17df423f0ed37d56a17a176b", size = 219980, upload-time = "2026-02-17T16:11:12.499Z" }, + { url = "https://files.pythonhosted.org/packages/6f/54/cb5e4d03659e043a26c74e08206412ac9a3742f0477d96f9761a55313b5f/librt-0.8.1-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8f1125e6bbf2f1657d9a2f3ccc4a2c9b0c8b176965bb565dd4d86be67eddb4b6", size = 212921, upload-time = "2026-02-17T16:11:14.484Z" }, + { url = "https://files.pythonhosted.org/packages/b1/81/a3a01e4240579c30f3487f6fed01eb4bc8ef0616da5b4ebac27ca19775f3/librt-0.8.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8f4bb453f408137d7581be309b2fbc6868a80e7ef60c88e689078ee3a296ae71", size = 221381, upload-time = "2026-02-17T16:11:17.459Z" }, + { url = "https://files.pythonhosted.org/packages/08/b0/fc2d54b4b1c6fb81e77288ff31ff25a2c1e62eaef4424a984f228839717b/librt-0.8.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:c336d61d2fe74a3195edc1646d53ff1cddd3a9600b09fa6ab75e5514ba4862a7", size = 216714, upload-time = "2026-02-17T16:11:19.197Z" }, + { url = "https://files.pythonhosted.org/packages/96/96/85daa73ffbd87e1fb287d7af6553ada66bf25a2a6b0de4764344a05469f6/librt-0.8.1-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:eb5656019db7c4deacf0c1a55a898c5bb8f989be904597fcb5232a2f4828fa05", size = 214777, upload-time = 
"2026-02-17T16:11:20.443Z" }, + { url = "https://files.pythonhosted.org/packages/12/9c/c3aa7a2360383f4bf4f04d98195f2739a579128720c603f4807f006a4225/librt-0.8.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c25d9e338d5bed46c1632f851babf3d13c78f49a225462017cf5e11e845c5891", size = 237398, upload-time = "2026-02-17T16:11:22.083Z" }, + { url = "https://files.pythonhosted.org/packages/61/19/d350ea89e5274665185dabc4bbb9c3536c3411f862881d316c8b8e00eb66/librt-0.8.1-cp310-cp310-win32.whl", hash = "sha256:aaab0e307e344cb28d800957ef3ec16605146ef0e59e059a60a176d19543d1b7", size = 54285, upload-time = "2026-02-17T16:11:23.27Z" }, + { url = "https://files.pythonhosted.org/packages/4f/d6/45d587d3d41c112e9543a0093d883eb57a24a03e41561c127818aa2a6bcc/librt-0.8.1-cp310-cp310-win_amd64.whl", hash = "sha256:56e04c14b696300d47b3bc5f1d10a00e86ae978886d0cee14e5714fafb5df5d2", size = 61352, upload-time = "2026-02-17T16:11:24.207Z" }, + { url = "https://files.pythonhosted.org/packages/1d/01/0e748af5e4fee180cf7cd12bd12b0513ad23b045dccb2a83191bde82d168/librt-0.8.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:681dc2451d6d846794a828c16c22dc452d924e9f700a485b7ecb887a30aad1fd", size = 65315, upload-time = "2026-02-17T16:11:25.152Z" }, + { url = "https://files.pythonhosted.org/packages/9d/4d/7184806efda571887c798d573ca4134c80ac8642dcdd32f12c31b939c595/librt-0.8.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3b4350b13cc0e6f5bec8fa7caf29a8fb8cdc051a3bae45cfbfd7ce64f009965", size = 68021, upload-time = "2026-02-17T16:11:26.129Z" }, + { url = "https://files.pythonhosted.org/packages/ae/88/c3c52d2a5d5101f28d3dc89298444626e7874aa904eed498464c2af17627/librt-0.8.1-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ac1e7817fd0ed3d14fd7c5df91daed84c48e4c2a11ee99c0547f9f62fdae13da", size = 194500, upload-time = "2026-02-17T16:11:27.177Z" }, + { url = 
"https://files.pythonhosted.org/packages/d6/5d/6fb0a25b6a8906e85b2c3b87bee1d6ed31510be7605b06772f9374ca5cb3/librt-0.8.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:747328be0c5b7075cde86a0e09d7a9196029800ba75a1689332348e998fb85c0", size = 205622, upload-time = "2026-02-17T16:11:28.242Z" }, + { url = "https://files.pythonhosted.org/packages/b2/a6/8006ae81227105476a45691f5831499e4d936b1c049b0c1feb17c11b02d1/librt-0.8.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f0af2bd2bc204fa27f3d6711d0f360e6b8c684a035206257a81673ab924aa11e", size = 218304, upload-time = "2026-02-17T16:11:29.344Z" }, + { url = "https://files.pythonhosted.org/packages/ee/19/60e07886ad16670aae57ef44dada41912c90906a6fe9f2b9abac21374748/librt-0.8.1-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d480de377f5b687b6b1bc0c0407426da556e2a757633cc7e4d2e1a057aa688f3", size = 211493, upload-time = "2026-02-17T16:11:30.445Z" }, + { url = "https://files.pythonhosted.org/packages/9c/cf/f666c89d0e861d05600438213feeb818c7514d3315bae3648b1fc145d2b6/librt-0.8.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d0ee06b5b5291f609ddb37b9750985b27bc567791bc87c76a569b3feed8481ac", size = 219129, upload-time = "2026-02-17T16:11:32.021Z" }, + { url = "https://files.pythonhosted.org/packages/8f/ef/f1bea01e40b4a879364c031476c82a0dc69ce068daad67ab96302fed2d45/librt-0.8.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9e2c6f77b9ad48ce5603b83b7da9ee3e36b3ab425353f695cba13200c5d96596", size = 213113, upload-time = "2026-02-17T16:11:33.192Z" }, + { url = "https://files.pythonhosted.org/packages/9b/80/cdab544370cc6bc1b72ea369525f547a59e6938ef6863a11ab3cd24759af/librt-0.8.1-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:439352ba9373f11cb8e1933da194dcc6206daf779ff8df0ed69c5e39113e6a99", size = 212269, upload-time = "2026-02-17T16:11:34.373Z" }, + { url = 
"https://files.pythonhosted.org/packages/9d/9c/48d6ed8dac595654f15eceab2035131c136d1ae9a1e3548e777bb6dbb95d/librt-0.8.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:82210adabbc331dbb65d7868b105185464ef13f56f7f76688565ad79f648b0fe", size = 234673, upload-time = "2026-02-17T16:11:36.063Z" }, + { url = "https://files.pythonhosted.org/packages/16/01/35b68b1db517f27a01be4467593292eb5315def8900afad29fabf56304ba/librt-0.8.1-cp311-cp311-win32.whl", hash = "sha256:52c224e14614b750c0a6d97368e16804a98c684657c7518752c356834fff83bb", size = 54597, upload-time = "2026-02-17T16:11:37.544Z" }, + { url = "https://files.pythonhosted.org/packages/71/02/796fe8f02822235966693f257bf2c79f40e11337337a657a8cfebba5febc/librt-0.8.1-cp311-cp311-win_amd64.whl", hash = "sha256:c00e5c884f528c9932d278d5c9cbbea38a6b81eb62c02e06ae53751a83a4d52b", size = 61733, upload-time = "2026-02-17T16:11:38.691Z" }, + { url = "https://files.pythonhosted.org/packages/28/ad/232e13d61f879a42a4e7117d65e4984bb28371a34bb6fb9ca54ec2c8f54e/librt-0.8.1-cp311-cp311-win_arm64.whl", hash = "sha256:f7cdf7f26c2286ffb02e46d7bac56c94655540b26347673bea15fa52a6af17e9", size = 52273, upload-time = "2026-02-17T16:11:40.308Z" }, + { url = "https://files.pythonhosted.org/packages/95/21/d39b0a87ac52fc98f621fb6f8060efb017a767ebbbac2f99fbcbc9ddc0d7/librt-0.8.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a28f2612ab566b17f3698b0da021ff9960610301607c9a5e8eaca62f5e1c350a", size = 66516, upload-time = "2026-02-17T16:11:41.604Z" }, + { url = "https://files.pythonhosted.org/packages/69/f1/46375e71441c43e8ae335905e069f1c54febee63a146278bcee8782c84fd/librt-0.8.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:60a78b694c9aee2a0f1aaeaa7d101cf713e92e8423a941d2897f4fa37908dab9", size = 68634, upload-time = "2026-02-17T16:11:43.268Z" }, + { url = 
"https://files.pythonhosted.org/packages/0a/33/c510de7f93bf1fa19e13423a606d8189a02624a800710f6e6a0a0f0784b3/librt-0.8.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:758509ea3f1eba2a57558e7e98f4659d0ea7670bff49673b0dde18a3c7e6c0eb", size = 198941, upload-time = "2026-02-17T16:11:44.28Z" }, + { url = "https://files.pythonhosted.org/packages/dd/36/e725903416409a533d92398e88ce665476f275081d0d7d42f9c4951999e5/librt-0.8.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:039b9f2c506bd0ab0f8725aa5ba339c6f0cd19d3b514b50d134789809c24285d", size = 209991, upload-time = "2026-02-17T16:11:45.462Z" }, + { url = "https://files.pythonhosted.org/packages/30/7a/8d908a152e1875c9f8eac96c97a480df425e657cdb47854b9efaa4998889/librt-0.8.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5bb54f1205a3a6ab41a6fd71dfcdcbd278670d3a90ca502a30d9da583105b6f7", size = 224476, upload-time = "2026-02-17T16:11:46.542Z" }, + { url = "https://files.pythonhosted.org/packages/a8/b8/a22c34f2c485b8903a06f3fe3315341fe6876ef3599792344669db98fcff/librt-0.8.1-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:05bd41cdee35b0c59c259f870f6da532a2c5ca57db95b5f23689fcb5c9e42440", size = 217518, upload-time = "2026-02-17T16:11:47.746Z" }, + { url = "https://files.pythonhosted.org/packages/79/6f/5c6fea00357e4f82ba44f81dbfb027921f1ab10e320d4a64e1c408d035d9/librt-0.8.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:adfab487facf03f0d0857b8710cf82d0704a309d8ffc33b03d9302b4c64e91a9", size = 225116, upload-time = "2026-02-17T16:11:49.298Z" }, + { url = "https://files.pythonhosted.org/packages/f2/a0/95ced4e7b1267fe1e2720a111685bcddf0e781f7e9e0ce59d751c44dcfe5/librt-0.8.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:153188fe98a72f206042be10a2c6026139852805215ed9539186312d50a8e972", size = 217751, upload-time = "2026-02-17T16:11:50.49Z" }, + 
{ url = "https://files.pythonhosted.org/packages/93/c2/0517281cb4d4101c27ab59472924e67f55e375bc46bedae94ac6dc6e1902/librt-0.8.1-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:dd3c41254ee98604b08bd5b3af5bf0a89740d4ee0711de95b65166bf44091921", size = 218378, upload-time = "2026-02-17T16:11:51.783Z" }, + { url = "https://files.pythonhosted.org/packages/43/e8/37b3ac108e8976888e559a7b227d0ceac03c384cfd3e7a1c2ee248dbae79/librt-0.8.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e0d138c7ae532908cbb342162b2611dbd4d90c941cd25ab82084aaf71d2c0bd0", size = 241199, upload-time = "2026-02-17T16:11:53.561Z" }, + { url = "https://files.pythonhosted.org/packages/4b/5b/35812d041c53967fedf551a39399271bbe4257e681236a2cf1a69c8e7fa1/librt-0.8.1-cp312-cp312-win32.whl", hash = "sha256:43353b943613c5d9c49a25aaffdba46f888ec354e71e3529a00cca3f04d66a7a", size = 54917, upload-time = "2026-02-17T16:11:54.758Z" }, + { url = "https://files.pythonhosted.org/packages/de/d1/fa5d5331b862b9775aaf2a100f5ef86854e5d4407f71bddf102f4421e034/librt-0.8.1-cp312-cp312-win_amd64.whl", hash = "sha256:ff8baf1f8d3f4b6b7257fcb75a501f2a5499d0dda57645baa09d4d0d34b19444", size = 62017, upload-time = "2026-02-17T16:11:55.748Z" }, + { url = "https://files.pythonhosted.org/packages/c7/7c/c614252f9acda59b01a66e2ddfd243ed1c7e1deab0293332dfbccf862808/librt-0.8.1-cp312-cp312-win_arm64.whl", hash = "sha256:0f2ae3725904f7377e11cc37722d5d401e8b3d5851fb9273d7f4fe04f6b3d37d", size = 52441, upload-time = "2026-02-17T16:11:56.801Z" }, + { url = "https://files.pythonhosted.org/packages/c5/3c/f614c8e4eaac7cbf2bbdf9528790b21d89e277ee20d57dc6e559c626105f/librt-0.8.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7e6bad1cd94f6764e1e21950542f818a09316645337fd5ab9a7acc45d99a8f35", size = 66529, upload-time = "2026-02-17T16:11:57.809Z" }, + { url = "https://files.pythonhosted.org/packages/ab/96/5836544a45100ae411eda07d29e3d99448e5258b6e9c8059deb92945f5c2/librt-0.8.1-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:cf450f498c30af55551ba4f66b9123b7185362ec8b625a773b3d39aa1a717583", size = 68669, upload-time = "2026-02-17T16:11:58.843Z" }, + { url = "https://files.pythonhosted.org/packages/06/53/f0b992b57af6d5531bf4677d75c44f095f2366a1741fb695ee462ae04b05/librt-0.8.1-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:eca45e982fa074090057132e30585a7e8674e9e885d402eae85633e9f449ce6c", size = 199279, upload-time = "2026-02-17T16:11:59.862Z" }, + { url = "https://files.pythonhosted.org/packages/f3/ad/4848cc16e268d14280d8168aee4f31cea92bbd2b79ce33d3e166f2b4e4fc/librt-0.8.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0c3811485fccfda840861905b8c70bba5ec094e02825598bb9d4ca3936857a04", size = 210288, upload-time = "2026-02-17T16:12:00.954Z" }, + { url = "https://files.pythonhosted.org/packages/52/05/27fdc2e95de26273d83b96742d8d3b7345f2ea2bdbd2405cc504644f2096/librt-0.8.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e4af413908f77294605e28cfd98063f54b2c790561383971d2f52d113d9c363", size = 224809, upload-time = "2026-02-17T16:12:02.108Z" }, + { url = "https://files.pythonhosted.org/packages/7a/d0/78200a45ba3240cb042bc597d6f2accba9193a2c57d0356268cbbe2d0925/librt-0.8.1-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5212a5bd7fae98dae95710032902edcd2ec4dc994e883294f75c857b83f9aba0", size = 218075, upload-time = "2026-02-17T16:12:03.631Z" }, + { url = "https://files.pythonhosted.org/packages/af/72/a210839fa74c90474897124c064ffca07f8d4b347b6574d309686aae7ca6/librt-0.8.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e692aa2d1d604e6ca12d35e51fdc36f4cda6345e28e36374579f7ef3611b3012", size = 225486, upload-time = "2026-02-17T16:12:04.725Z" }, + { url = 
"https://files.pythonhosted.org/packages/a3/c1/a03cc63722339ddbf087485f253493e2b013039f5b707e8e6016141130fa/librt-0.8.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4be2a5c926b9770c9e08e717f05737a269b9d0ebc5d2f0060f0fe3fe9ce47acb", size = 218219, upload-time = "2026-02-17T16:12:05.828Z" }, + { url = "https://files.pythonhosted.org/packages/58/f5/fff6108af0acf941c6f274a946aea0e484bd10cd2dc37610287ce49388c5/librt-0.8.1-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:fd1a720332ea335ceb544cf0a03f81df92abd4bb887679fd1e460976b0e6214b", size = 218750, upload-time = "2026-02-17T16:12:07.09Z" }, + { url = "https://files.pythonhosted.org/packages/71/67/5a387bfef30ec1e4b4f30562c8586566faf87e47d696768c19feb49e3646/librt-0.8.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2af9e01e0ef80d95ae3c720be101227edae5f2fe7e3dc63d8857fadfc5a1d", size = 241624, upload-time = "2026-02-17T16:12:08.43Z" }, + { url = "https://files.pythonhosted.org/packages/d4/be/24f8502db11d405232ac1162eb98069ca49c3306c1d75c6ccc61d9af8789/librt-0.8.1-cp313-cp313-win32.whl", hash = "sha256:086a32dbb71336627e78cc1d6ee305a68d038ef7d4c39aaff41ae8c9aa46e91a", size = 54969, upload-time = "2026-02-17T16:12:09.633Z" }, + { url = "https://files.pythonhosted.org/packages/5c/73/c9fdf6cb2a529c1a092ce769a12d88c8cca991194dfe641b6af12fa964d2/librt-0.8.1-cp313-cp313-win_amd64.whl", hash = "sha256:e11769a1dbda4da7b00a76cfffa67aa47cfa66921d2724539eee4b9ede780b79", size = 62000, upload-time = "2026-02-17T16:12:10.632Z" }, + { url = "https://files.pythonhosted.org/packages/d3/97/68f80ca3ac4924f250cdfa6e20142a803e5e50fca96ef5148c52ee8c10ea/librt-0.8.1-cp313-cp313-win_arm64.whl", hash = "sha256:924817ab3141aca17893386ee13261f1d100d1ef410d70afe4389f2359fea4f0", size = 52495, upload-time = "2026-02-17T16:12:11.633Z" }, + { url = "https://files.pythonhosted.org/packages/c9/6a/907ef6800f7bca71b525a05f1839b21f708c09043b1c6aa77b6b827b3996/librt-0.8.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = 
"sha256:6cfa7fe54fd4d1f47130017351a959fe5804bda7a0bc7e07a2cdbc3fdd28d34f", size = 66081, upload-time = "2026-02-17T16:12:12.766Z" }, + { url = "https://files.pythonhosted.org/packages/1b/18/25e991cd5640c9fb0f8d91b18797b29066b792f17bf8493da183bf5caabe/librt-0.8.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:228c2409c079f8c11fb2e5d7b277077f694cb93443eb760e00b3b83cb8b3176c", size = 68309, upload-time = "2026-02-17T16:12:13.756Z" }, + { url = "https://files.pythonhosted.org/packages/a4/36/46820d03f058cfb5a9de5940640ba03165ed8aded69e0733c417bb04df34/librt-0.8.1-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7aae78ab5e3206181780e56912d1b9bb9f90a7249ce12f0e8bf531d0462dd0fc", size = 196804, upload-time = "2026-02-17T16:12:14.818Z" }, + { url = "https://files.pythonhosted.org/packages/59/18/5dd0d3b87b8ff9c061849fbdb347758d1f724b9a82241aa908e0ec54ccd0/librt-0.8.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:172d57ec04346b047ca6af181e1ea4858086c80bdf455f61994c4aa6fc3f866c", size = 206907, upload-time = "2026-02-17T16:12:16.513Z" }, + { url = "https://files.pythonhosted.org/packages/d1/96/ef04902aad1424fd7299b62d1890e803e6ab4018c3044dca5922319c4b97/librt-0.8.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6b1977c4ea97ce5eb7755a78fae68d87e4102e4aaf54985e8b56806849cc06a3", size = 221217, upload-time = "2026-02-17T16:12:17.906Z" }, + { url = "https://files.pythonhosted.org/packages/6d/ff/7e01f2dda84a8f5d280637a2e5827210a8acca9a567a54507ef1c75b342d/librt-0.8.1-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:10c42e1f6fd06733ef65ae7bebce2872bcafd8d6e6b0a08fe0a05a23b044fb14", size = 214622, upload-time = "2026-02-17T16:12:19.108Z" }, + { url = "https://files.pythonhosted.org/packages/1e/8c/5b093d08a13946034fed57619742f790faf77058558b14ca36a6e331161e/librt-0.8.1-cp314-cp314-musllinux_1_2_aarch64.whl", 
hash = "sha256:4c8dfa264b9193c4ee19113c985c95f876fae5e51f731494fc4e0cf594990ba7", size = 221987, upload-time = "2026-02-17T16:12:20.331Z" }, + { url = "https://files.pythonhosted.org/packages/d3/cc/86b0b3b151d40920ad45a94ce0171dec1aebba8a9d72bb3fa00c73ab25dd/librt-0.8.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:01170b6729a438f0dedc4a26ed342e3dc4f02d1000b4b19f980e1877f0c297e6", size = 215132, upload-time = "2026-02-17T16:12:21.54Z" }, + { url = "https://files.pythonhosted.org/packages/fc/be/8588164a46edf1e69858d952654e216a9a91174688eeefb9efbb38a9c799/librt-0.8.1-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:7b02679a0d783bdae30d443025b94465d8c3dc512f32f5b5031f93f57ac32071", size = 215195, upload-time = "2026-02-17T16:12:23.073Z" }, + { url = "https://files.pythonhosted.org/packages/f5/f2/0b9279bea735c734d69344ecfe056c1ba211694a72df10f568745c899c76/librt-0.8.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:190b109bb69592a3401fe1ffdea41a2e73370ace2ffdc4a0e8e2b39cdea81b78", size = 237946, upload-time = "2026-02-17T16:12:24.275Z" }, + { url = "https://files.pythonhosted.org/packages/e9/cc/5f2a34fbc8aeb35314a3641f9956fa9051a947424652fad9882be7a97949/librt-0.8.1-cp314-cp314-win32.whl", hash = "sha256:e70a57ecf89a0f64c24e37f38d3fe217a58169d2fe6ed6d70554964042474023", size = 50689, upload-time = "2026-02-17T16:12:25.766Z" }, + { url = "https://files.pythonhosted.org/packages/a0/76/cd4d010ab2147339ca2b93e959c3686e964edc6de66ddacc935c325883d7/librt-0.8.1-cp314-cp314-win_amd64.whl", hash = "sha256:7e2f3edca35664499fbb36e4770650c4bd4a08abc1f4458eab9df4ec56389730", size = 57875, upload-time = "2026-02-17T16:12:27.465Z" }, + { url = "https://files.pythonhosted.org/packages/84/0f/2143cb3c3ca48bd3379dcd11817163ca50781927c4537345d608b5045998/librt-0.8.1-cp314-cp314-win_arm64.whl", hash = "sha256:0d2f82168e55ddefd27c01c654ce52379c0750ddc31ee86b4b266bcf4d65f2a3", size = 48058, upload-time = "2026-02-17T16:12:28.556Z" }, + { url = 
"https://files.pythonhosted.org/packages/d2/0e/9b23a87e37baf00311c3efe6b48d6b6c168c29902dfc3f04c338372fd7db/librt-0.8.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2c74a2da57a094bd48d03fa5d196da83d2815678385d2978657499063709abe1", size = 68313, upload-time = "2026-02-17T16:12:29.659Z" }, + { url = "https://files.pythonhosted.org/packages/db/9a/859c41e5a4f1c84200a7d2b92f586aa27133c8243b6cac9926f6e54d01b9/librt-0.8.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a355d99c4c0d8e5b770313b8b247411ed40949ca44e33e46a4789b9293a907ee", size = 70994, upload-time = "2026-02-17T16:12:31.516Z" }, + { url = "https://files.pythonhosted.org/packages/4c/28/10605366ee599ed34223ac2bf66404c6fb59399f47108215d16d5ad751a8/librt-0.8.1-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:2eb345e8b33fb748227409c9f1233d4df354d6e54091f0e8fc53acdb2ffedeb7", size = 220770, upload-time = "2026-02-17T16:12:33.294Z" }, + { url = "https://files.pythonhosted.org/packages/af/8d/16ed8fd452dafae9c48d17a6bc1ee3e818fd40ef718d149a8eff2c9f4ea2/librt-0.8.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9be2f15e53ce4e83cc08adc29b26fb5978db62ef2a366fbdf716c8a6c8901040", size = 235409, upload-time = "2026-02-17T16:12:35.443Z" }, + { url = "https://files.pythonhosted.org/packages/89/1b/7bdf3e49349c134b25db816e4a3db6b94a47ac69d7d46b1e682c2c4949be/librt-0.8.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:785ae29c1f5c6e7c2cde2c7c0e148147f4503da3abc5d44d482068da5322fd9e", size = 246473, upload-time = "2026-02-17T16:12:36.656Z" }, + { url = "https://files.pythonhosted.org/packages/4e/8a/91fab8e4fd2a24930a17188c7af5380eb27b203d72101c9cc000dbdfd95a/librt-0.8.1-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1d3a7da44baf692f0c6aeb5b2a09c5e6fc7a703bca9ffa337ddd2e2da53f7732", size = 238866, upload-time = "2026-02-17T16:12:37.849Z" }, 
+ { url = "https://files.pythonhosted.org/packages/b9/e0/c45a098843fc7c07e18a7f8a24ca8496aecbf7bdcd54980c6ca1aaa79a8e/librt-0.8.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5fc48998000cbc39ec0d5311312dda93ecf92b39aaf184c5e817d5d440b29624", size = 250248, upload-time = "2026-02-17T16:12:39.445Z" }, + { url = "https://files.pythonhosted.org/packages/82/30/07627de23036640c952cce0c1fe78972e77d7d2f8fd54fa5ef4554ff4a56/librt-0.8.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:e96baa6820280077a78244b2e06e416480ed859bbd8e5d641cf5742919d8beb4", size = 240629, upload-time = "2026-02-17T16:12:40.889Z" }, + { url = "https://files.pythonhosted.org/packages/fb/c1/55bfe1ee3542eba055616f9098eaf6eddb966efb0ca0f44eaa4aba327307/librt-0.8.1-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:31362dbfe297b23590530007062c32c6f6176f6099646bb2c95ab1b00a57c382", size = 239615, upload-time = "2026-02-17T16:12:42.446Z" }, + { url = "https://files.pythonhosted.org/packages/2b/39/191d3d28abc26c9099b19852e6c99f7f6d400b82fa5a4e80291bd3803e19/librt-0.8.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cc3656283d11540ab0ea01978378e73e10002145117055e03722417aeab30994", size = 263001, upload-time = "2026-02-17T16:12:43.627Z" }, + { url = "https://files.pythonhosted.org/packages/b9/eb/7697f60fbe7042ab4e88f4ee6af496b7f222fffb0a4e3593ef1f29f81652/librt-0.8.1-cp314-cp314t-win32.whl", hash = "sha256:738f08021b3142c2918c03692608baed43bc51144c29e35807682f8070ee2a3a", size = 51328, upload-time = "2026-02-17T16:12:45.148Z" }, + { url = "https://files.pythonhosted.org/packages/7c/72/34bf2eb7a15414a23e5e70ecb9440c1d3179f393d9349338a91e2781c0fb/librt-0.8.1-cp314-cp314t-win_amd64.whl", hash = "sha256:89815a22daf9c51884fb5dbe4f1ef65ee6a146e0b6a8df05f753e2e4a9359bf4", size = 58722, upload-time = "2026-02-17T16:12:46.85Z" }, + { url = "https://files.pythonhosted.org/packages/b2/c8/d148e041732d631fc76036f8b30fae4e77b027a1e95b7a84bb522481a940/librt-0.8.1-cp314-cp314t-win_arm64.whl", 
hash = "sha256:bf512a71a23504ed08103a13c941f763db13fb11177beb3d9244c98c29fb4a61", size = 48755, upload-time = "2026-02-17T16:12:47.943Z" }, +] + [[package]] name = "logfire-api" version = "4.31.0" @@ -679,6 +764,73 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5b/54/662a4743aa81d9582ee9339d4ffa3c8fd40a4965e033d77b9da9774d3960/mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31", size = 8728, upload-time = "2023-11-22T19:09:43.465Z" }, ] +[[package]] +name = "mypy" +version = "1.20.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "librt", marker = "platform_python_implementation != 'PyPy'" }, + { name = "mypy-extensions" }, + { name = "pathspec" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f8/5c/b0089fe7fef0a994ae5ee07029ced0526082c6cfaaa4c10d40a10e33b097/mypy-1.20.0.tar.gz", hash = "sha256:eb96c84efcc33f0b5e0e04beacf00129dd963b67226b01c00b9dfc8affb464c3", size = 3815028, upload-time = "2026-03-31T16:55:14.959Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/a2/a965c8c3fcd4fa8b84ba0d46606181b0d0a1d50f274c67877f3e9ed4882c/mypy-1.20.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d99f515f95fd03a90875fdb2cca12ff074aa04490db4d190905851bdf8a549a8", size = 14430138, upload-time = "2026-03-31T16:52:37.843Z" }, + { url = "https://files.pythonhosted.org/packages/53/6e/043477501deeb8eabbab7f1a2f6cac62cfb631806dc1d6862a04a7f5011b/mypy-1.20.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bd0212976dc57a5bfeede7c219e7cd66568a32c05c9129686dd487c059c1b88a", size = 13311282, upload-time = "2026-03-31T16:55:11.021Z" }, + { url = 
"https://files.pythonhosted.org/packages/65/aa/bd89b247b83128197a214f29f0632ff3c14f54d4cd70d144d157bd7d7d6e/mypy-1.20.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f8426d4d75d68714abc17a4292d922f6ba2cfb984b72c2278c437f6dae797865", size = 13750889, upload-time = "2026-03-31T16:52:02.909Z" }, + { url = "https://files.pythonhosted.org/packages/fa/9d/2860be7355c45247ccc0be1501c91176318964c2a137bd4743f58ce6200e/mypy-1.20.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02cca0761c75b42a20a2757ae58713276605eb29a08dd8a6e092aa347c4115ca", size = 14619788, upload-time = "2026-03-31T16:50:48.928Z" }, + { url = "https://files.pythonhosted.org/packages/75/7f/3ef3e360c91f3de120f205c8ce405e9caf9fc52ef14b65d37073e322c114/mypy-1.20.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b3a49064504be59e59da664c5e149edc1f26c67c4f8e8456f6ba6aba55033018", size = 14918849, upload-time = "2026-03-31T16:51:10.478Z" }, + { url = "https://files.pythonhosted.org/packages/ae/72/af970dfe167ef788df7c5e6109d2ed0229f164432ce828bc9741a4250e64/mypy-1.20.0-cp310-cp310-win_amd64.whl", hash = "sha256:ebea00201737ad4391142808ed16e875add5c17f676e0912b387739f84991e13", size = 10822007, upload-time = "2026-03-31T16:50:25.268Z" }, + { url = "https://files.pythonhosted.org/packages/93/94/ba9065c2ebe5421619aff684b793d953e438a8bfe31a320dd6d1e0706e81/mypy-1.20.0-cp310-cp310-win_arm64.whl", hash = "sha256:e80cf77847d0d3e6e3111b7b25db32a7f8762fd4b9a3a72ce53fe16a2863b281", size = 9756158, upload-time = "2026-03-31T16:48:36.213Z" }, + { url = "https://files.pythonhosted.org/packages/6e/1c/74cb1d9993236910286865679d1c616b136b2eae468493aa939431eda410/mypy-1.20.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4525e7010b1b38334516181c5b81e16180b8e149e6684cee5a727c78186b4e3b", size = 14343972, upload-time = "2026-03-31T16:49:04.887Z" }, + { url = 
"https://files.pythonhosted.org/packages/d5/0d/01399515eca280386e308cf57901e68d3a52af18691941b773b3380c1df8/mypy-1.20.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a17c5d0bdcca61ce24a35beb828a2d0d323d3fcf387d7512206888c900193367", size = 13225007, upload-time = "2026-03-31T16:50:08.151Z" }, + { url = "https://files.pythonhosted.org/packages/56/ac/b4ba5094fb2d7fe9d2037cd8d18bbe02bcf68fd22ab9ff013f55e57ba095/mypy-1.20.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f75ff57defcd0f1d6e006d721ccdec6c88d4f6a7816eb92f1c4890d979d9ee62", size = 13663752, upload-time = "2026-03-31T16:49:26.064Z" }, + { url = "https://files.pythonhosted.org/packages/db/a7/460678d3cf7da252d2288dad0c602294b6ec22a91932ec368cc11e44bb6e/mypy-1.20.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b503ab55a836136b619b5fc21c8803d810c5b87551af8600b72eecafb0059cb0", size = 14532265, upload-time = "2026-03-31T16:53:55.077Z" }, + { url = "https://files.pythonhosted.org/packages/a3/3e/051cca8166cf0438ae3ea80e0e7c030d7a8ab98dffc93f80a1aa3f23c1a2/mypy-1.20.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1973868d2adbb4584a3835780b27436f06d1dc606af5be09f187aaa25be1070f", size = 14768476, upload-time = "2026-03-31T16:50:34.587Z" }, + { url = "https://files.pythonhosted.org/packages/be/66/8e02ec184f852ed5c4abb805583305db475930854e09964b55e107cdcbc4/mypy-1.20.0-cp311-cp311-win_amd64.whl", hash = "sha256:2fcedb16d456106e545b2bfd7ef9d24e70b38ec252d2a629823a4d07ebcdb69e", size = 10818226, upload-time = "2026-03-31T16:53:15.624Z" }, + { url = "https://files.pythonhosted.org/packages/13/4b/383ad1924b28f41e4879a74151e7a5451123330d45652da359f9183bcd45/mypy-1.20.0-cp311-cp311-win_arm64.whl", hash = "sha256:379edf079ce44ac8d2805bcf9b3dd7340d4f97aad3a5e0ebabbf9d125b84b442", size = 9750091, upload-time = "2026-03-31T16:54:12.162Z" }, + { url = 
"https://files.pythonhosted.org/packages/be/dd/3afa29b58c2e57c79116ed55d700721c3c3b15955e2b6251dd165d377c0e/mypy-1.20.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:002b613ae19f4ac7d18b7e168ffe1cb9013b37c57f7411984abbd3b817b0a214", size = 14509525, upload-time = "2026-03-31T16:55:01.824Z" }, + { url = "https://files.pythonhosted.org/packages/54/eb/227b516ab8cad9f2a13c5e7a98d28cd6aa75e9c83e82776ae6c1c4c046c7/mypy-1.20.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a9336b5e6712f4adaf5afc3203a99a40b379049104349d747eb3e5a3aa23ac2e", size = 13326469, upload-time = "2026-03-31T16:51:41.23Z" }, + { url = "https://files.pythonhosted.org/packages/57/d4/1ddb799860c1b5ac6117ec307b965f65deeb47044395ff01ab793248a591/mypy-1.20.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f13b3e41bce9d257eded794c0f12878af3129d80aacd8a3ee0dee51f3a978651", size = 13705953, upload-time = "2026-03-31T16:48:55.69Z" }, + { url = "https://files.pythonhosted.org/packages/c5/b7/54a720f565a87b893182a2a393370289ae7149e4715859e10e1c05e49154/mypy-1.20.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9804c3ad27f78e54e58b32e7cb532d128b43dbfb9f3f9f06262b821a0f6bd3f5", size = 14710363, upload-time = "2026-03-31T16:53:26.948Z" }, + { url = "https://files.pythonhosted.org/packages/b2/2a/74810274848d061f8a8ea4ac23aaad43bd3d8c1882457999c2e568341c57/mypy-1.20.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:697f102c5c1d526bdd761a69f17c6070f9892eebcb94b1a5963d679288c09e78", size = 14947005, upload-time = "2026-03-31T16:50:17.591Z" }, + { url = "https://files.pythonhosted.org/packages/77/91/21b8ba75f958bcda75690951ce6fa6b7138b03471618959529d74b8544e2/mypy-1.20.0-cp312-cp312-win_amd64.whl", hash = "sha256:0ecd63f75fdd30327e4ad8b5704bd6d91fc6c1b2e029f8ee14705e1207212489", size = 10880616, upload-time = "2026-03-31T16:52:19.986Z" }, + { url = 
"https://files.pythonhosted.org/packages/8a/15/3d8198ef97c1ca03aea010cce4f1d4f3bc5d9849e8c0140111ca2ead9fdd/mypy-1.20.0-cp312-cp312-win_arm64.whl", hash = "sha256:f194db59657c58593a3c47c6dfd7bad4ef4ac12dbc94d01b3a95521f78177e33", size = 9813091, upload-time = "2026-03-31T16:53:44.385Z" }, + { url = "https://files.pythonhosted.org/packages/d6/a7/f64ea7bd592fa431cb597418b6dec4a47f7d0c36325fec7ac67bc8402b94/mypy-1.20.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b20c8b0fd5877abdf402e79a3af987053de07e6fb208c18df6659f708b535134", size = 14485344, upload-time = "2026-03-31T16:49:16.78Z" }, + { url = "https://files.pythonhosted.org/packages/bb/72/8927d84cfc90c6abea6e96663576e2e417589347eb538749a464c4c218a0/mypy-1.20.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:367e5c993ba34d5054d11937d0485ad6dfc60ba760fa326c01090fc256adf15c", size = 13327400, upload-time = "2026-03-31T16:53:08.02Z" }, + { url = "https://files.pythonhosted.org/packages/ab/4a/11ab99f9afa41aa350178d24a7d2da17043228ea10f6456523f64b5a6cf6/mypy-1.20.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f799d9db89fc00446f03281f84a221e50018fc40113a3ba9864b132895619ebe", size = 13706384, upload-time = "2026-03-31T16:52:28.577Z" }, + { url = "https://files.pythonhosted.org/packages/42/79/694ca73979cfb3535ebfe78733844cd5aff2e63304f59bf90585110d975a/mypy-1.20.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:555658c611099455b2da507582ea20d2043dfdfe7f5ad0add472b1c6238b433f", size = 14700378, upload-time = "2026-03-31T16:48:45.527Z" }, + { url = "https://files.pythonhosted.org/packages/84/24/a022ccab3a46e3d2cdf2e0e260648633640eb396c7e75d5a42818a8d3971/mypy-1.20.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:efe8d70949c3023698c3fca1e94527e7e790a361ab8116f90d11221421cd8726", size = 14932170, upload-time = "2026-03-31T16:49:36.038Z" }, + { url = 
"https://files.pythonhosted.org/packages/d8/9b/549228d88f574d04117e736f55958bd4908f980f9f5700a07aeb85df005b/mypy-1.20.0-cp313-cp313-win_amd64.whl", hash = "sha256:f49590891d2c2f8a9de15614e32e459a794bcba84693c2394291a2038bbaaa69", size = 10888526, upload-time = "2026-03-31T16:50:59.827Z" }, + { url = "https://files.pythonhosted.org/packages/91/17/15095c0e54a8bc04d22d4ff06b2139d5f142c2e87520b4e39010c4862771/mypy-1.20.0-cp313-cp313-win_arm64.whl", hash = "sha256:76a70bf840495729be47510856b978f1b0ec7d08f257ca38c9d932720bf6b43e", size = 9816456, upload-time = "2026-03-31T16:49:59.537Z" }, + { url = "https://files.pythonhosted.org/packages/4e/0e/6ca4a84cbed9e62384bc0b2974c90395ece5ed672393e553996501625fc5/mypy-1.20.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:0f42dfaab7ec1baff3b383ad7af562ab0de573c5f6edb44b2dab016082b89948", size = 14483331, upload-time = "2026-03-31T16:52:57.999Z" }, + { url = "https://files.pythonhosted.org/packages/7d/c5/5fe9d8a729dd9605064691816243ae6c49fde0bd28f6e5e17f6a24203c43/mypy-1.20.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:31b5dbb55293c1bd27c0fc813a0d2bb5ceef9d65ac5afa2e58f829dab7921fd5", size = 13342047, upload-time = "2026-03-31T16:54:21.555Z" }, + { url = "https://files.pythonhosted.org/packages/4c/33/e18bcfa338ca4e6b2771c85d4c5203e627d0c69d9de5c1a2cf2ba13320ba/mypy-1.20.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49d11c6f573a5a08f77fad13faff2139f6d0730ebed2cfa9b3d2702671dd7188", size = 13719585, upload-time = "2026-03-31T16:51:53.89Z" }, + { url = "https://files.pythonhosted.org/packages/6b/8d/93491ff7b79419edc7eabf95cb3b3f7490e2e574b2855c7c7e7394ff933f/mypy-1.20.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7d3243c406773185144527f83be0e0aefc7bf4601b0b2b956665608bf7c98a83", size = 14685075, upload-time = "2026-03-31T16:54:04.464Z" }, + { url = 
"https://files.pythonhosted.org/packages/b5/9d/d924b38a4923f8d164bf2b4ec98bf13beaf6e10a5348b4b137eadae40a6e/mypy-1.20.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:a79c1eba7ac4209f2d850f0edd0a2f8bba88cbfdfefe6fb76a19e9d4fe5e71a2", size = 14919141, upload-time = "2026-03-31T16:54:51.785Z" }, + { url = "https://files.pythonhosted.org/packages/59/98/1da9977016678c0b99d43afe52ed00bb3c1a0c4c995d3e6acca1a6ebb9b4/mypy-1.20.0-cp314-cp314-win_amd64.whl", hash = "sha256:00e047c74d3ec6e71a2eb88e9ea551a2edb90c21f993aefa9e0d2a898e0bb732", size = 11050925, upload-time = "2026-03-31T16:51:30.758Z" }, + { url = "https://files.pythonhosted.org/packages/5e/e3/ba0b7a3143e49a9c4f5967dde6ea4bf8e0b10ecbbcca69af84027160ee89/mypy-1.20.0-cp314-cp314-win_arm64.whl", hash = "sha256:931a7630bba591593dcf6e97224a21ff80fb357e7982628d25e3c618e7f598ef", size = 10001089, upload-time = "2026-03-31T16:49:43.632Z" }, + { url = "https://files.pythonhosted.org/packages/12/28/e617e67b3be9d213cda7277913269c874eb26472489f95d09d89765ce2d8/mypy-1.20.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:26c8b52627b6552f47ff11adb4e1509605f094e29815323e487fc0053ebe93d1", size = 15534710, upload-time = "2026-03-31T16:52:12.506Z" }, + { url = "https://files.pythonhosted.org/packages/6e/0c/3b5f2d3e45dc7169b811adce8451679d9430399d03b168f9b0489f43adaa/mypy-1.20.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:39362cdb4ba5f916e7976fccecaab1ba3a83e35f60fa68b64e9a70e221bb2436", size = 14393013, upload-time = "2026-03-31T16:54:41.186Z" }, + { url = "https://files.pythonhosted.org/packages/a3/49/edc8b0aa145cc09c1c74f7ce2858eead9329931dcbbb26e2ad40906daa4e/mypy-1.20.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:34506397dbf40c15dc567635d18a21d33827e9ab29014fb83d292a8f4f8953b6", size = 15047240, upload-time = "2026-03-31T16:54:31.955Z" }, + { url = 
"https://files.pythonhosted.org/packages/42/37/a946bb416e37a57fa752b3100fd5ede0e28df94f92366d1716555d47c454/mypy-1.20.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:555493c44a4f5a1b58d611a43333e71a9981c6dbe26270377b6f8174126a0526", size = 15858565, upload-time = "2026-03-31T16:53:36.997Z" }, + { url = "https://files.pythonhosted.org/packages/2f/99/7690b5b5b552db1bd4ff362e4c0eb3107b98d680835e65823fbe888c8b78/mypy-1.20.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:2721f0ce49cb74a38f00c50da67cb7d36317b5eda38877a49614dc018e91c787", size = 16087874, upload-time = "2026-03-31T16:52:48.313Z" }, + { url = "https://files.pythonhosted.org/packages/aa/76/53e893a498138066acd28192b77495c9357e5a58cc4be753182846b43315/mypy-1.20.0-cp314-cp314t-win_amd64.whl", hash = "sha256:47781555a7aa5fedcc2d16bcd72e0dc83eb272c10dd657f9fb3f9cc08e2e6abb", size = 12572380, upload-time = "2026-03-31T16:49:52.454Z" }, + { url = "https://files.pythonhosted.org/packages/76/9c/6dbdae21f01b7aacddc2c0bbf3c5557aa547827fdf271770fe1e521e7093/mypy-1.20.0-cp314-cp314t-win_arm64.whl", hash = "sha256:c70380fe5d64010f79fb863b9081c7004dd65225d2277333c219d93a10dad4dd", size = 10381174, upload-time = "2026-03-31T16:51:20.179Z" }, + { url = "https://files.pythonhosted.org/packages/21/66/4d734961ce167f0fd8380769b3b7c06dbdd6ff54c2190f3f2ecd22528158/mypy-1.20.0-py3-none-any.whl", hash = "sha256:a6e0641147cbfa7e4e94efdb95c2dab1aff8cfc159ded13e07f308ddccc8c48e", size = 2636365, upload-time = "2026-03-31T16:51:44.911Z" }, +] + +[[package]] +name = "mypy-extensions" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = "2025-04-22T14:54:24.164Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" }, +] + [[package]] name = "nodeenv" version = "1.9.1" @@ -721,11 +873,11 @@ wheels = [ [[package]] name = "pathspec" -version = "0.12.1" +version = "1.0.4" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" } +sdist = { url = "https://files.pythonhosted.org/packages/fa/36/e27608899f9b8d4dff0617b2d9ab17ca5608956ca44461ac14ac48b44015/pathspec-1.0.4.tar.gz", hash = "sha256:0210e2ae8a21a9137c0d470578cb0e595af87edaa6ebf12ff176f14a02e0e645", size = 131200, upload-time = "2026-01-27T03:59:46.938Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, + { url = "https://files.pythonhosted.org/packages/ef/3c/2c197d226f9ea224a9ab8d197933f9da0ae0aac5b6e0f884e2b8d9c8e9f7/pathspec-1.0.4-py3-none-any.whl", hash = "sha256:fb6ae2fd4e7c921a165808a552060e722767cfa526f99ca5156ed2ce45a5c723", size = 55206, upload-time = "2026-01-27T03:59:45.137Z" }, ] [[package]] @@ -785,6 +937,7 @@ web = [ dev = [ { name = "jsonschema" }, { name = "mkdocs-material" }, + { name = "mypy" }, { name = "pre-commit" }, { name = "pytest" }, { name = "pytest-asyncio" }, @@ -811,6 +964,7 @@ provides-extras = ["rich", "web", "evals"] dev = [ { name = "jsonschema", specifier = ">=4.0.0" }, { name = 
"mkdocs-material", specifier = ">=9.7.0" }, + { name = "mypy", specifier = ">=1.0" }, { name = "pre-commit", specifier = ">=4.5.0" }, { name = "pytest", specifier = ">=9.0.1" }, { name = "pytest-asyncio", specifier = ">=1.3.0" }, From 752ddbca9e0909e6ec011d9a4037f860003410da Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 3 Apr 2026 05:10:29 +0200 Subject: [PATCH 20/60] refactor(evals): replace `session.eval` with `EvalSuite` for cleaner API - Introduced `EvalSuite` class to encapsulate eval logic, replacing inline `session.eval()` definitions. - Removed duplicate `eval` methods in `ProTestSession` and `ProTestSuite`. - Updated tests and examples to leverage `EvalSuite`. --- examples/yorkshire/evals/session.py | 9 +- protest/core/session.py | 81 +-------------- protest/core/suite.py | 23 ----- protest/evals/__init__.py | 13 --- protest/evals/session.py | 57 ++++++++++- protest/evals/suite.py | 87 +++++++++++++++++ tests/evals/test_e2e.py | 146 ++++++++++++++++++++++------ tests/evals/test_judge.py | 31 ++++-- 8 files changed, 284 insertions(+), 163 deletions(-) create mode 100644 protest/evals/suite.py diff --git a/examples/yorkshire/evals/session.py b/examples/yorkshire/evals/session.py index 7779f66..f1800d8 100644 --- a/examples/yorkshire/evals/session.py +++ b/examples/yorkshire/evals/session.py @@ -16,14 +16,19 @@ yorkshire_cases, ) from protest import From -from protest.evals import EvalSession, ModelInfo +from protest.evals import ModelInfo +from protest.evals.session import EvalSession +from protest.evals.suite import EvalSuite session = EvalSession( model=ModelInfo(name="yorkshire-chatbot-v1", provider="local"), metadata={"version": "1.0", "type": "keyword-matching"}, ) +yorkshire_suite = EvalSuite("yorkshire_eval") +session.add_suite(yorkshire_suite) -@session.eval(evaluators=suite_evaluators) + +@yorkshire_suite.eval(evaluators=suite_evaluators) def yorkshire_eval(case: Annotated[dict, 
From(yorkshire_cases)]) -> str: return yorkshire_chatbot(case["inputs"]) diff --git a/protest/core/session.py b/protest/core/session.py index efef4fb..daafa74 100644 --- a/protest/core/session.py +++ b/protest/core/session.py @@ -8,13 +8,12 @@ from types import TracebackType from protest.compat import Self + from protest.core.suite import ProTestSuite from protest.entities import FixtureCallable - from protest.evals.types import JudgeInfo, ModelInfo from protest.plugin import PluginBase, PluginContext from protest.cache.plugin import CachePlugin from protest.cache.storage import CacheStorage -from protest.core.suite import ProTestSuite from protest.di.container import FixtureContainer from protest.di.decorators import get_fixture_marker, unwrap_fixture from protest.entities import ( @@ -22,16 +21,12 @@ FixtureScope, Retry, Skip, - SuiteKind, TestRegistration, Xfail, normalize_retry, normalize_skip, normalize_xfail, ) -from protest.evals.history import EvalHistoryPlugin -from protest.evals.results_writer import EvalResultsWriter -from protest.evals.wrapper import make_eval_wrapper from protest.events.bus import EventBus from protest.events.types import Event from protest.exceptions import InvalidMaxConcurrencyError @@ -88,9 +83,6 @@ def __init__( self._history = history self._history_dir = history_dir self._metadata: dict[str, Any] = dict(metadata) if metadata else {} - self._eval_model: ModelInfo | None = None # set by EvalSession - self._eval_judge: JudgeInfo | None = None # set by EvalSession - self._eval_judge_instance: Any = None # set by EvalSession async def resolve_autouse(self) -> None: """Resolve all session autouse fixtures at session start.""" @@ -207,51 +199,6 @@ def decorator(func: FuncT) -> FuncT: return decorator - def eval( - self, - evaluators: list[Any] | None = None, - expected_key: str = "expected", - tags: list[str] | None = None, - timeout: float | None = None, - name: str | None = None, - model: Any = None, - ) -> Callable[[FuncT], FuncT]: - 
"""Register a scored eval test. - - Creates an implicit eval suite named after the function. - The decorated function's return value is passed to evaluators. - Use with ForEach/From for parametrization:: - - @session.eval(evaluators=[my_scorer], model=ModelInfo(name="qwen")) - async def my_eval(case: Annotated[dict, From(cases)]) -> str: - return await run(case["q"]) - """ - - def decorator(func: FuncT) -> FuncT: - suite_name = name or func.__name__ - suite_meta: dict[str, Any] = {} - resolved_model = model or self._eval_model - if resolved_model: - suite_meta["model"] = resolved_model.name - suite_meta["provider"] = resolved_model.provider - suite = ProTestSuite( - name=suite_name, - tags=list(tags or []), - kind=SuiteKind.EVAL, - metadata=suite_meta, - ) - wrapper = make_eval_wrapper( - func, - evaluators or [], - expected_key, - judge=self._eval_judge_instance, - ) - suite.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) - self.add_suite(suite) - return func - - return decorator - def add_suite(self, suite: ProTestSuite) -> None: """Add a suite to this session.""" suite._attach_to_session(self) @@ -375,32 +322,6 @@ def activate_plugins(self, ctx: PluginContext) -> None: if instance is not None: self.register_plugin(instance) - # Auto-wire eval support if any suite has kind="eval" - if any(s.kind == SuiteKind.EVAL for s in self._suites): - self._wire_eval_support() - - def _wire_eval_support(self) -> None: - """Wire eval history + results writer plugins (no EvalPlugin).""" - - judge_dict = None - if self._eval_judge: - judge_dict = { - "name": self._eval_judge.name, - "provider": self._eval_judge.provider, - "evaluators": list(self._eval_judge.evaluators), - } - - history = EvalHistoryPlugin( - history_dir=self._history_dir, - model=self._eval_model, - judge=judge_dict, - metadata=self._metadata, - ) - self.register_plugin(history) - - writer = EvalResultsWriter(history_dir=self._history_dir) - self.register_plugin(writer) - async def __aenter__(self) -> 
Self: self._register_fixtures() await self._resolver.__aenter__() diff --git a/protest/core/suite.py b/protest/core/suite.py index 99b4fa2..b73e9f0 100644 --- a/protest/core/suite.py +++ b/protest/core/suite.py @@ -22,7 +22,6 @@ normalize_skip, normalize_xfail, ) -from protest.evals.wrapper import make_eval_wrapper from protest.exceptions import ConcurrencyMismatchError, InvalidMaxConcurrencyError FuncT = TypeVar("FuncT", bound="Callable[..., object]") @@ -161,28 +160,6 @@ def decorator(func: FuncT) -> FuncT: return decorator - def eval( - self, - evaluators: list[Any] | None = None, - expected_key: str = "expected", - tags: list[str] | None = None, - timeout: float | None = None, - judge: Any = None, - ) -> Callable[[FuncT], FuncT]: - """Register a scored eval test on this suite.""" - - def decorator(func: FuncT) -> FuncT: - wrapper = make_eval_wrapper( - func, - evaluators or [], - expected_key, - judge=judge, - ) - self.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) - return func - - return decorator - def add_suite(self, suite: ProTestSuite) -> None: """Add a child suite. Child can access parent's fixtures.""" parent_effective = self.effective_max_concurrency diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index d90b8f4..8584eff 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -26,7 +26,6 @@ "EvalCaseResult", "EvalContext", "EvalScore", - "EvalSession", "EvalSuiteReport", "Judge", "JudgeInfo", @@ -40,15 +39,3 @@ "Verdict", "evaluator", ] - - -def __getattr__(name: str) -> object: - # EvalSession imports protest.core.session which imports reporters, - # and reporters import protest.evals.types — eagerly importing - # EvalSession here would create a circular import chain. 
- if name == "EvalSession": - from protest.evals.session import EvalSession # noqa: PLC0415 — circular import - - return EvalSession - msg = f"module {__name__!r} has no attribute {name!r}" - raise AttributeError(msg) diff --git a/protest/evals/session.py b/protest/evals/session.py index 09f0d5c..9ed7459 100644 --- a/protest/evals/session.py +++ b/protest/evals/session.py @@ -5,12 +5,18 @@ from typing import TYPE_CHECKING, Any from protest.core.session import ProTestSession +from protest.entities import SuiteKind +from protest.evals.history import EvalHistoryPlugin +from protest.evals.results_writer import EvalResultsWriter +from protest.evals.suite import EvalSuite from protest.evals.types import JudgeInfo if TYPE_CHECKING: from pathlib import Path + from protest.core.suite import ProTestSuite from protest.evals.types import Judge, ModelInfo + from protest.plugin import PluginContext class EvalSession(ProTestSession): @@ -20,7 +26,10 @@ class EvalSession(ProTestSession): session = EvalSession(model=ModelInfo(name="qwen-2.5")) - @session.eval(evaluators=[contains_facts]) + chatbot = EvalSuite("chatbot") + session.add_suite(chatbot) + + @chatbot.eval(evaluators=[contains_facts]) async def chatbot(case: Annotated[dict, From(cases)]) -> str: return await ask(case["q"]) """ @@ -43,7 +52,45 @@ def __init__( ) self._eval_model = model self._eval_judge_instance: Judge | None = judge - if judge is not None: - self._eval_judge = JudgeInfo(name=judge.name, provider=judge.provider) - else: - self._eval_judge = None + self._eval_judge: JudgeInfo | None = ( + JudgeInfo(name=judge.name, provider=judge.provider) + if judge is not None + else None + ) + + def add_suite(self, suite: ProTestSuite) -> None: + """Add a suite, propagating session-level model/judge as defaults.""" + if isinstance(suite, EvalSuite): + if suite.judge is None and self._eval_judge_instance is not None: + suite._judge = self._eval_judge_instance + if self._eval_model and "model" not in suite.suite_metadata: 
+ suite._metadata["model"] = self._eval_model.name + suite._metadata["provider"] = self._eval_model.provider + super().add_suite(suite) + + def activate_plugins(self, ctx: PluginContext) -> None: + """Activate plugins, then wire eval support if needed.""" + super().activate_plugins(ctx) + if any(s.kind == SuiteKind.EVAL for s in self._suites): + self._wire_eval_support() + + def _wire_eval_support(self) -> None: + """Wire eval history + results writer plugins.""" + judge_dict = None + if self._eval_judge: + judge_dict = { + "name": self._eval_judge.name, + "provider": self._eval_judge.provider, + "evaluators": list(self._eval_judge.evaluators), + } + + history = EvalHistoryPlugin( + history_dir=self._history_dir, + model=self._eval_model, + judge=judge_dict, + metadata=self._metadata, + ) + self.register_plugin(history) + + writer = EvalResultsWriter(history_dir=self._history_dir) + self.register_plugin(writer) diff --git a/protest/evals/suite.py b/protest/evals/suite.py new file mode 100644 index 0000000..279aec7 --- /dev/null +++ b/protest/evals/suite.py @@ -0,0 +1,87 @@ +"""EvalSuite — suite dédiée aux evals.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, TypeVar + +from protest.core.suite import ProTestSuite +from protest.entities import SuiteKind +from protest.evals.wrapper import make_eval_wrapper + +if TYPE_CHECKING: + from collections.abc import Callable + + from protest.evals.types import Judge, ModelInfo + +FuncT = TypeVar("FuncT", bound="Callable[..., object]") + + +class EvalSuite(ProTestSuite): + """Suite dédiée aux evals. 
+ + Usage:: + + chatbot = EvalSuite("chatbot") + session.add_suite(chatbot) + + @chatbot.eval(evaluators=[contains_facts]) + async def chatbot(case: Annotated[dict, From(cases)]) -> str: + return await ask(case["q"]) + """ + + def __init__( + self, + name: str, + *, + model: ModelInfo | None = None, + judge: Judge | None = None, + tags: list[str] | None = None, + max_concurrency: int | None = None, + description: str | None = None, + metadata: dict[str, Any] | None = None, + ) -> None: + suite_meta: dict[str, Any] = dict(metadata) if metadata else {} + if model is not None: + suite_meta["model"] = model.name + suite_meta["provider"] = model.provider + super().__init__( + name=name, + kind=SuiteKind.EVAL, + tags=tags, + max_concurrency=max_concurrency, + description=description, + metadata=suite_meta, + ) + self._judge: Judge | None = judge + self._model = model + + @property + def judge(self) -> Judge | None: + return self._judge + + @property + def model(self) -> ModelInfo | None: + return self._model + + def eval( + self, + evaluators: list[Any] | None = None, + expected_key: str = "expected", + tags: list[str] | None = None, + timeout: float | None = None, + judge: Any = None, + ) -> Callable[[FuncT], FuncT]: + """Register a scored eval test on this suite.""" + + def decorator(func: FuncT) -> FuncT: + resolved_judge = judge or self._judge + wrapper = make_eval_wrapper( + func, + evaluators or [], + expected_key, + judge=resolved_judge, + ) + self.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) + return func + + return decorator diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 72ef8ff..2ecc09b 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -1,7 +1,7 @@ """End-to-end tests for ProTest evals integration. These tests define the PUBLIC API contract. 
They test what the user sees: -- Session setup (EvalSession, @session.eval with ForEach/From) +- Session setup (EvalSession, EvalSuite + @suite.eval with ForEach/From) - CLI behavior (protest run vs protest eval) - Output format (scores table, trends, failure messages) - History (JSONL format, stats, significance, clean-dirty) @@ -26,7 +26,6 @@ from protest.entities import SuiteKind from protest.evals import ( EvalContext, - EvalSession, Metric, ModelInfo, ShortCircuit, @@ -46,6 +45,8 @@ ) from protest.evals.hashing import compute_case_hash, compute_eval_hash from protest.evals.results_writer import EvalResultsWriter +from protest.evals.session import EvalSession +from protest.evals.suite import EvalSuite from protest.evals.types import EvalSuiteReport # noqa: TC001 — used at runtime from protest.filters.kind import KindFilterPlugin from protest.history.storage import append_entry, clean_dirty @@ -103,12 +104,15 @@ async def async_echo_task(text: str) -> str: class TestEvalSession: - """EvalSession setup: constructor with model=, @session.eval.""" + """EvalSession setup: constructor with model=, EvalSuite + @suite.eval.""" def test_add_eval_creates_eval_kind(self) -> None: session = EvalSession() - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -129,7 +133,10 @@ def test_eval_with_bool_verdict(self) -> None: """Evaluator with bool field: case_fail has matches_expected=False -> fail.""" session = EvalSession() - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -142,7 +149,10 @@ def eval_echo(case: Annotated[dict, 
From(basic_cases)]) -> str: def test_async_task_works(self) -> None: session = EvalSession() - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) async def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return await async_echo_task(case["inputs"]) @@ -160,7 +170,10 @@ def test_async_evaluator_does_not_crash(self) -> None: session = EvalSession() - @session.eval(evaluators=[async_fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[async_fake_accuracy]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return echo_task(case["inputs"]) @@ -184,7 +197,10 @@ def test_test_suite_has_kind_test(self) -> None: def test_eval_suite_has_kind_eval(self) -> None: session = EvalSession() - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -229,7 +245,10 @@ def test_a() -> None: session.add_suite(test_suite) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -251,7 +270,10 @@ def test_a() -> None: session.add_suite(test_suite) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -286,7 +308,10 @@ def on_eval_suite_end(self, report: Any) -> None: session = EvalSession() 
session.register_plugin(ReportCapture()) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -311,7 +336,10 @@ def on_eval_suite_end(self, report: Any) -> None: session = EvalSession() session.register_plugin(ReportCapture()) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -335,7 +363,10 @@ def on_test_fail(self, result: Any) -> None: session = EvalSession() session.register_plugin(ErrorCollector()) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -368,7 +399,10 @@ def on_test_fail(self, result: Any) -> None: session = EvalSession() session.register_plugin(Collector()) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -400,7 +434,10 @@ def on_test_teardown_start(self, info: Any) -> None: session = EvalSession() session.register_plugin(LifecycleCollector()) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -437,7 +474,10 @@ def 
crashing_evaluator(ctx: EvalContext) -> bool: session = EvalSession() session.register_plugin(Collector()) - @session.eval(evaluators=[crashing_evaluator]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[crashing_evaluator]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return echo_task(case["inputs"]) @@ -483,7 +523,10 @@ class TestHistory: def _run_eval(self, tmp_path: Path) -> None: session = EvalSession(model=ModelInfo(name="test-model"), history_dir=tmp_path) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -546,7 +589,10 @@ def test_history_metadata_included(self, tmp_path: Path) -> None: metadata={"env": "test", "version": "1.0"}, ) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -610,7 +656,10 @@ def test_case_hash_stored_in_history(self, tmp_path: Path) -> None: """History entries include case_hash and eval_hash per case.""" session = EvalSession(history_dir=tmp_path) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -749,7 +798,10 @@ def on_test_fail(self, result: Any) -> None: session = EvalSession() session.register_plugin(Collector()) - @session.eval(evaluators=[not_empty]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + 
@eval_echo_suite.eval(evaluators=[not_empty]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return echo_task(case["inputs"]) @@ -783,7 +835,10 @@ def on_test_fail(self, result: Any) -> None: session = EvalSession() session.register_plugin(Collector()) - @session.eval(evaluators=[word_overlap]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[word_overlap]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return echo_task(case["inputs"]) @@ -815,7 +870,10 @@ def bad_evaluator(ctx: EvalContext) -> float: session = EvalSession() session.register_plugin(Collector()) - @session.eval(evaluators=[bad_evaluator]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[bad_evaluator]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return echo_task(case["inputs"]) @@ -844,7 +902,10 @@ def expensive(ctx: EvalContext) -> bool: session = EvalSession() - @session.eval(evaluators=[ShortCircuit([cheap, expensive])]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[ShortCircuit([cheap, expensive])]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -874,7 +935,10 @@ def check_b(ctx: EvalContext) -> bool: ) session = EvalSession() - @session.eval(evaluators=[ShortCircuit([check_a, check_b])]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[ShortCircuit([check_a, check_b])]) def eval_echo(case: Annotated[dict, From(single)]) -> str: return echo_task(case["inputs"]) @@ -899,7 +963,10 @@ def _run_eval(self, tmp_path: Path) -> Path: writer = EvalResultsWriter(history_dir=tmp_path) session.register_plugin(writer) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + 
session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -947,7 +1014,7 @@ def test_case_file_contains_inputs(self, tmp_path: Path) -> None: class TestMultiDatasetHistory: - """Multiple @session.eval calls produce distinct suites in history.""" + """Multiple EvalSuite + @suite.eval calls produce distinct suites in history.""" def _run_multi(self, tmp_path: Path) -> dict[str, Any]: pipeline_cases = ForEach( @@ -966,11 +1033,17 @@ def _run_multi(self, tmp_path: Path) -> dict[str, Any]: session = EvalSession(history_dir=tmp_path) - @session.eval(evaluators=[fake_accuracy]) + pipeline_suite = EvalSuite("pipeline") + session.add_suite(pipeline_suite) + + @pipeline_suite.eval(evaluators=[fake_accuracy]) def pipeline(case: Annotated[dict, From(pipeline_cases)]) -> str: return echo_task(case["inputs"]) - @session.eval(evaluators=[fake_accuracy]) + ingest_suite = EvalSuite("ingest") + session.add_suite(ingest_suite) + + @ingest_suite.eval(evaluators=[fake_accuracy]) def ingest(case: Annotated[dict, From(ingest_cases)]) -> str: return echo_task(case["inputs"]) @@ -996,14 +1069,17 @@ def test_each_suite_has_its_own_cases(self, tmp_path: Path) -> None: class TestEvalTaskFixtures: - """@session.eval() peut utiliser des fixtures protest via Use().""" + """EvalSuite + @suite.eval() peut utiliser des fixtures protest via Use().""" def test_task_without_fixtures_still_works(self) -> None: # basic_cases has one match (case_pass) and one mismatch (case_fail) # fake_accuracy returns matches_expected=False for case_fail -> fail session = EvalSession() - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -1028,7 +1104,10 @@ def 
prefix_service() -> str: session = EvalSession() session.bind(prefix_service) - @session.eval(evaluators=[fake_accuracy]) + eval_prefixed_suite = EvalSuite("eval_prefixed") + session.add_suite(eval_prefixed_suite) + + @eval_prefixed_suite.eval(evaluators=[fake_accuracy]) async def eval_prefixed( case: Annotated[dict, From(single_case)], svc: Annotated[str, Use(prefix_service)], @@ -1063,7 +1142,10 @@ def expensive_resource() -> str: session = EvalSession() session.bind(expensive_resource) - @session.eval(evaluators=[fake_accuracy]) + eval_resource_suite = EvalSuite("eval_resource") + session.add_suite(eval_resource_suite) + + @eval_resource_suite.eval(evaluators=[fake_accuracy]) async def eval_resource( case: Annotated[dict, From(multi_cases)], res: Annotated[str, Use(expensive_resource)], diff --git a/tests/evals/test_judge.py b/tests/evals/test_judge.py index 9e6fd11..7eece34 100644 --- a/tests/evals/test_judge.py +++ b/tests/evals/test_judge.py @@ -11,13 +11,14 @@ from protest.core.runner import TestRunner from protest.evals import ( EvalContext, - EvalSession, Judge, JudgeResponse, TaskResult, Verdict, evaluator, ) +from protest.evals.session import EvalSession +from protest.evals.suite import EvalSuite from protest.plugin import PluginBase # --------------------------------------------------------------------------- @@ -227,8 +228,10 @@ async def judge_evaluator(ctx: EvalContext) -> bool: return await ctx.judge("pass this", bool) session = EvalSession(judge=FakeJudge()) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) - @session.eval(evaluators=[judge_evaluator]) + @eval_echo_suite.eval(evaluators=[judge_evaluator]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return case["inputs"] @@ -244,8 +247,10 @@ async def needs_judge(ctx: EvalContext) -> bool: return await ctx.judge("test", bool) session = EvalSession() # no judge + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) - 
@session.eval(evaluators=[needs_judge]) + @eval_echo_suite.eval(evaluators=[needs_judge]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return case["inputs"] @@ -274,8 +279,10 @@ async def double_judge(ctx: EvalContext) -> bool: return r1 and r2 session = EvalSession(judge=FakeJudge()) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) - @session.eval(evaluators=[double_judge]) + @eval_echo_suite.eval(evaluators=[double_judge]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return case["inputs"] @@ -329,8 +336,10 @@ async def struct_evaluator(ctx: EvalContext) -> JudgeVerdict: return await ctx.judge("evaluate this", JudgeVerdict) session = EvalSession(judge=StructuredJudge()) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) - @session.eval(evaluators=[struct_evaluator]) + @eval_echo_suite.eval(evaluators=[struct_evaluator]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return case["inputs"] @@ -353,8 +362,10 @@ def check_output(ctx: EvalContext) -> bool: return ctx.output == "hello" # sees str, not TaskResult session = EvalSession() + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) - @session.eval(evaluators=[check_output]) + @eval_echo_suite.eval(evaluators=[check_output]) def eval_echo(case: Annotated[dict, From(single_case)]) -> TaskResult[str]: return TaskResult( output=case["inputs"], @@ -375,8 +386,10 @@ def always_pass(ctx: EvalContext) -> bool: return True session = EvalSession() + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) - @session.eval(evaluators=[always_pass]) + @eval_echo_suite.eval(evaluators=[always_pass]) def eval_echo(case: Annotated[dict, From(single_case)]) -> TaskResult[str]: return TaskResult( output=case["inputs"], @@ -411,8 +424,10 @@ def always_pass(ctx: EvalContext) -> bool: return True session = EvalSession() + eval_echo_suite = EvalSuite("eval_echo") + 
session.add_suite(eval_echo_suite) - @session.eval(evaluators=[always_pass]) + @eval_echo_suite.eval(evaluators=[always_pass]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return case["inputs"] From 62a12a304b3367397008a7920344895a2df40f1a Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 3 Apr 2026 06:55:13 +0200 Subject: [PATCH 21/60] refactor(evals): replace `dict` with `EvalCase` for eval cases, update APIs and tests - Standardized eval cases by replacing untyped `dict` with `EvalCase` objects across codebase. - Updated evaluator helpers to work exclusively with `EvalCase` instances. - Refactored `make_eval_wrapper` to remove unused `expected_key` argument. - Updated tests and examples to adopt `EvalCase` usage for improved type safety and code clarity. --- docs/evals.md | 60 +++++++++++--- protest/evals/evaluator.py | 2 +- protest/evals/session.py | 4 +- protest/evals/suite.py | 6 +- protest/evals/types.py | 4 +- protest/evals/wrapper.py | 74 +++++++---------- tests/evals/test_e2e.py | 157 +++++++++++++++++++------------------ 7 files changed, 164 insertions(+), 143 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index 006c403..cca70cc 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -15,8 +15,10 @@ ProTest evals use the same infrastructure as tests: fixtures, DI, parallelism, t from typing import Annotated from protest import ForEach, From -from protest.evals import EvalCase, EvalSession, ModelInfo, evaluator +from protest.evals import EvalCase, ModelInfo, evaluator from protest.evals.evaluators import contains_keywords +from protest.evals.session import EvalSession +from protest.evals.suite import EvalSuite cases = ForEach([ EvalCase(inputs="Who is Marie?", expected="Marie, Resistance", name="lookup"), @@ -25,7 +27,10 @@ cases = ForEach([ session = EvalSession(model=ModelInfo(name="gpt-4o-mini")) -@session.eval(evaluators=[contains_keywords(keywords=["Marie"])]) +chatbot_suite = 
EvalSuite("chatbot") +session.add_suite(chatbot_suite) + +@chatbot_suite.eval(evaluators=[contains_keywords(keywords=["Marie"])]) async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: return await my_agent(case.inputs) ``` @@ -36,7 +41,7 @@ protest eval evals.session:session ## How It Works -`@session.eval()` wraps a function to run evaluators on its return value: +`@suite.eval()` wraps a function to run evaluators on its return value: 1. Your function receives case data via `ForEach`/`From` (same as parameterized tests) 2. It returns the output (string, object, anything) @@ -48,18 +53,40 @@ The rest of the pipeline — fixtures, DI, parallelism, reporters — works iden ## EvalSession -`EvalSession` is a session configured for evals. History is enabled by default. +`EvalSession` is a session configured for evals. History is enabled by default. Model and judge set on the session are propagated as defaults to `EvalSuite` instances added via `session.add_suite()`. ```python -from protest.evals import EvalSession, ModelInfo +from protest.evals import ModelInfo +from protest.evals.session import EvalSession session = EvalSession( - model=ModelInfo(name="gpt-4o-mini"), # tracked in history + model=ModelInfo(name="gpt-4o-mini"), # propagated to suites, tracked in history concurrency=4, # parallel eval cases metadata={"version": "1.0"}, # stored in history ) ``` +## EvalSuite + +`EvalSuite` groups eval cases. It's the eval equivalent of `ProTestSuite` — it forces `kind=EVAL` and carries model/judge configuration. 
+ +```python +from protest.evals.suite import EvalSuite + +chatbot_suite = EvalSuite("chatbot") +session.add_suite(chatbot_suite) # model/judge propagated from session + +@chatbot_suite.eval(evaluators=[my_scorer]) +async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: + return await my_agent(case.inputs) +``` + +Per-suite model override: + +```python +chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="mistral-7b")) +``` + ## EvalCase Typed dataclass for eval case data. Provides IDE autocompletion instead of untyped dicts. @@ -262,7 +289,10 @@ async def pipeline(): session.bind(pipeline) -@session.eval(evaluators=[my_scorer]) +pipeline_suite = EvalSuite("pipeline") +session.add_suite(pipeline_suite) + +@pipeline_suite.eval(evaluators=[my_scorer]) async def pipeline_eval( case: Annotated[EvalCase, From(cases)], driver: Annotated[AsyncDriver, Use(pipeline)], @@ -386,7 +416,7 @@ If your eval task calls an LLM, you can report usage by returning `TaskResult` i ```python from protest.evals import TaskResult -@session.eval(evaluators=[my_scorer]) +@chatbot_suite.eval(evaluators=[my_scorer]) async def chatbot(case: Annotated[EvalCase, From(cases)]) -> TaskResult[str]: result = await agent.run(case.inputs) usage = result.usage() @@ -424,18 +454,24 @@ If two evaluators return dataclasses with the same field name (e.g. both have `a ## Multi-Model Sessions -Track which model produced each eval suite's results: +Track which model produced each eval suite's results. 
Each `EvalSuite` can have its own model: ```python pipeline_model = ModelInfo(name="qwen-2.5") chat_model = ModelInfo(name="mistral-7b") -session = EvalSession(model=pipeline_model) +session = EvalSession(model=pipeline_model) # default model + +pipeline_suite = EvalSuite("pipeline") # inherits pipeline_model from session +chatbot_suite = EvalSuite("chatbot", model=chat_model) # override + +session.add_suite(pipeline_suite) +session.add_suite(chatbot_suite) -@session.eval(evaluators=[...], name="pipeline", model=pipeline_model) +@pipeline_suite.eval(evaluators=[...]) async def pipeline_eval(case, driver) -> str: ... -@session.eval(evaluators=[...], name="chatbot", model=chat_model) +@chatbot_suite.eval(evaluators=[...]) async def chatbot_eval(case, deps) -> str: ... ``` diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 6d0c980..07dc2f2 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -133,7 +133,7 @@ class EvalCase: EvalCase(inputs="Who is Pierre?", expected="Pierre, arrest"), ]) - @session.eval(evaluators=[contains_facts]) + @suite.eval(evaluators=[contains_facts]) def my_eval(case: Annotated[EvalCase, From(cases)]) -> str: return ask(case.inputs) """ diff --git a/protest/evals/session.py b/protest/evals/session.py index 9ed7459..c85527a 100644 --- a/protest/evals/session.py +++ b/protest/evals/session.py @@ -30,8 +30,8 @@ class EvalSession(ProTestSession): session.add_suite(chatbot) @chatbot.eval(evaluators=[contains_facts]) - async def chatbot(case: Annotated[dict, From(cases)]) -> str: - return await ask(case["q"]) + async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: + return await ask(case.inputs) """ def __init__( diff --git a/protest/evals/suite.py b/protest/evals/suite.py index 279aec7..f0aba7e 100644 --- a/protest/evals/suite.py +++ b/protest/evals/suite.py @@ -25,8 +25,8 @@ class EvalSuite(ProTestSuite): session.add_suite(chatbot) @chatbot.eval(evaluators=[contains_facts]) - async def 
chatbot(case: Annotated[dict, From(cases)]) -> str: - return await ask(case["q"]) + async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: + return await ask(case.inputs) """ def __init__( @@ -66,7 +66,6 @@ def model(self) -> ModelInfo | None: def eval( self, evaluators: list[Any] | None = None, - expected_key: str = "expected", tags: list[str] | None = None, timeout: float | None = None, judge: Any = None, @@ -78,7 +77,6 @@ def decorator(func: FuncT) -> FuncT: wrapper = make_eval_wrapper( func, evaluators or [], - expected_key, judge=resolved_judge, ) self.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) diff --git a/protest/evals/types.py b/protest/evals/types.py index 59d2721..08543c6 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -19,8 +19,8 @@ class TaskResult(Generic[T]): Usage:: - @session.eval(evaluators=[...]) - async def my_eval(case) -> TaskResult[str]: + @suite.eval(evaluators=[...]) + async def my_eval(case: EvalCase) -> TaskResult[str]: result = await agent.run(case.inputs) usage = result.usage() return TaskResult( diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index bc2569b..e9161f1 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -14,6 +14,7 @@ from protest.entities.events import EvalPayload, EvalScoreEntry from protest.evals.evaluator import ( + EvalCase, EvalContext, ShortCircuit, extract_scores_from_result, @@ -26,14 +27,13 @@ def make_eval_wrapper( func: Any, evaluators: list[Any], - expected_key: str, judge: Any = None, ) -> Any: """Wrap a function to run evaluators on its return value.""" @functools.wraps(func) async def eval_wrapper(**kwargs: Any) -> EvalPayload: - expected = _extract_expected(kwargs, expected_key) + expected = _extract_expected(kwargs) case_name = _extract_case_name(kwargs, func.__name__) inputs = _extract_inputs(kwargs) metadata = _extract_metadata(kwargs) @@ -102,65 +102,51 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: # 
--------------------------------------------------------------------------- -# Extract helpers — pull data from case_kwargs (dict or dataclass) +# Extract helpers — pull EvalCase from kwargs # --------------------------------------------------------------------------- -def _get(obj: Any, key: str, default: Any = None) -> Any: - """Get a value from a dict or dataclass by key/attr name.""" - if isinstance(obj, dict): - return obj.get(key, default) - return getattr(obj, key, default) - - -def _is_case_data(v: Any) -> bool: - """Check if a value looks like case data (dict or has 'expected'/'q'/'inputs').""" - if isinstance(v, dict): - return True - return hasattr(v, "expected") or hasattr(v, "q") or hasattr(v, "inputs") - - -def _extract_expected(kwargs: dict[str, Any], key: str) -> Any: +def _find_case(kwargs: dict[str, Any]) -> EvalCase | None: + """Find the EvalCase instance in kwargs.""" for v in kwargs.values(): - if _is_case_data(v): - val = _get(v, key) - if val is not None: - return val + if isinstance(v, EvalCase): + return v return None +def _extract_expected(kwargs: dict[str, Any]) -> Any: + case = _find_case(kwargs) + if case is None: + return None + return case.expected + + def _extract_case_name(kwargs: dict[str, Any], fallback: str) -> str: - for v in kwargs.values(): - if _is_case_data(v): - name = _get(v, "name") - if name: - return str(name) - return fallback + case = _find_case(kwargs) + if case is None or not case.name: + return fallback + return case.name def _extract_inputs(kwargs: dict[str, Any]) -> Any: - for v in kwargs.values(): - if _is_case_data(v): - return _get(v, "inputs") or _get(v, "q") or _get(v, "input") - return None + case = _find_case(kwargs) + if case is None: + return None + return case.inputs def _extract_metadata(kwargs: dict[str, Any]) -> Any: - for v in kwargs.values(): - if _is_case_data(v): - val = _get(v, "metadata") - if val is not None: - return val - return None + case = _find_case(kwargs) + if case is None: + return 
None + return case.metadata or None def _extract_per_case_evaluators(kwargs: dict[str, Any]) -> list[Any]: - for v in kwargs.values(): - if _is_case_data(v): - evs = _get(v, "evaluators") - if evs: - return list(evs) - return [] + case = _find_case(kwargs) + if case is None or not case.evaluators: + return [] + return list(case.evaluators) # --------------------------------------------------------------------------- diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 2ecc09b..fc35686 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -25,6 +25,7 @@ from protest.core.suite import ProTestSuite from protest.entities import SuiteKind from protest.evals import ( + EvalCase, EvalContext, Metric, ModelInfo, @@ -91,10 +92,10 @@ async def async_echo_task(text: str) -> str: basic_cases = ForEach( [ - {"inputs": "hello world", "expected": "hello", "name": "case_pass"}, - {"inputs": "xyz", "expected": "notfound", "name": "case_fail"}, + EvalCase(inputs="hello world", expected="hello", name="case_pass"), + EvalCase(inputs="xyz", expected="notfound", name="case_fail"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) @@ -113,8 +114,8 @@ def test_add_eval_creates_eval_kind(self) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) # The session should have a suite with kind=eval assert len(session._suites) > 0 @@ -137,8 +138,8 @@ def test_eval_with_bool_verdict(self) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) result = 
runner.run() @@ -153,8 +154,8 @@ def test_async_task_works(self) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - async def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return await async_echo_task(case["inputs"]) + async def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return await async_echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -163,9 +164,9 @@ def test_async_evaluator_does_not_crash(self) -> None: """Regression: async evaluator called via evaluate_sync raised 'event loop already running'.""" single_case = ForEach( [ - {"inputs": "hello world", "expected": "hello", "name": "c1"}, + EvalCase(inputs="hello world", expected="hello", name="c1"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) session = EvalSession() @@ -174,8 +175,8 @@ def test_async_evaluator_does_not_crash(self) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[async_fake_accuracy]) - def eval_echo(case: Annotated[dict, From(single_case)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(single_case)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) result = runner.run() @@ -201,8 +202,8 @@ def test_eval_suite_has_kind_eval(self) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) assert any(s.kind == "eval" for s in session._suites) @@ -249,8 +250,8 @@ def test_a() -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return 
echo_task(case.inputs) ctx = PluginContext(args={"kind_filter": "test"}) run_session(session, ctx=ctx) @@ -274,8 +275,8 @@ def test_a() -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) ctx = PluginContext(args={"kind_filter": "eval"}) run_session(session, ctx=ctx) @@ -312,8 +313,8 @@ def on_eval_suite_end(self, report: Any) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -340,8 +341,8 @@ def on_eval_suite_end(self, report: Any) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -367,8 +368,8 @@ def on_test_fail(self, result: Any) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) run_session(session) @@ -403,8 +404,8 @@ def on_test_fail(self, result: Any) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) 
-> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -438,8 +439,8 @@ def on_test_teardown_start(self, info: Any) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -466,9 +467,9 @@ def crashing_evaluator(ctx: EvalContext) -> bool: single_case = ForEach( [ - {"inputs": "hello", "expected": "hello", "name": "c1"}, + EvalCase(inputs="hello", expected="hello", name="c1"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) session = EvalSession() @@ -478,8 +479,8 @@ def crashing_evaluator(ctx: EvalContext) -> bool: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[crashing_evaluator]) - def eval_echo(case: Annotated[dict, From(single_case)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(single_case)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -527,8 +528,8 @@ def _run_eval(self, tmp_path: Path) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) run_session(session) @@ -593,8 +594,8 @@ def test_history_metadata_included(self, tmp_path: Path) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) run_session(session) @@ -660,8 +661,8 @@ def 
test_case_hash_stored_in_history(self, tmp_path: Path) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) run_session(session) @@ -790,9 +791,9 @@ def on_test_fail(self, result: Any) -> None: single_case = ForEach( [ - {"inputs": "hello world", "expected": "hello", "name": "c1"}, + EvalCase(inputs="hello world", expected="hello", name="c1"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) session = EvalSession() @@ -802,8 +803,8 @@ def on_test_fail(self, result: Any) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[not_empty]) - def eval_echo(case: Annotated[dict, From(single_case)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(single_case)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) result = runner.run() @@ -827,9 +828,9 @@ def on_test_fail(self, result: Any) -> None: single_case = ForEach( [ - {"inputs": "foo", "expected": "bar baz", "name": "c1"}, + EvalCase(inputs="foo", expected="bar baz", name="c1"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) session = EvalSession() @@ -839,8 +840,8 @@ def on_test_fail(self, result: Any) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[word_overlap]) - def eval_echo(case: Annotated[dict, From(single_case)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(single_case)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) result = runner.run() @@ -863,8 +864,8 @@ def bad_evaluator(ctx: EvalContext) -> float: return 0.5 single_case = ForEach( - [{"inputs": "hello", "expected": "hello", "name": "c1"}], - ids=lambda c: c["name"], + [EvalCase(inputs="hello", expected="hello", 
name="c1")], + ids=lambda c: c.name, ) session = EvalSession() @@ -874,8 +875,8 @@ def bad_evaluator(ctx: EvalContext) -> float: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[bad_evaluator]) - def eval_echo(case: Annotated[dict, From(single_case)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(single_case)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -906,8 +907,8 @@ def expensive(ctx: EvalContext) -> bool: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[ShortCircuit([cheap, expensive])]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -931,7 +932,7 @@ def check_b(ctx: EvalContext) -> bool: return True single = ForEach( - [{"inputs": "x", "expected": "x", "name": "c1"}], ids=lambda c: c["name"] + [EvalCase(inputs="x", expected="x", name="c1")], ids=lambda c: c.name ) session = EvalSession() @@ -939,8 +940,8 @@ def check_b(ctx: EvalContext) -> bool: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[ShortCircuit([check_a, check_b])]) - def eval_echo(case: Annotated[dict, From(single)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(single)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) result = runner.run() @@ -967,8 +968,8 @@ def _run_eval(self, tmp_path: Path) -> Path: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -1019,16 +1020,16 @@ class TestMultiDatasetHistory: def 
_run_multi(self, tmp_path: Path) -> dict[str, Any]: pipeline_cases = ForEach( [ - {"inputs": "hello", "expected": "hello", "name": "c1"}, + EvalCase(inputs="hello", expected="hello", name="c1"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) ingest_cases = ForEach( [ - {"inputs": "world", "expected": "world", "name": "c2"}, + EvalCase(inputs="world", expected="world", name="c2"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) session = EvalSession(history_dir=tmp_path) @@ -1037,15 +1038,15 @@ def _run_multi(self, tmp_path: Path) -> dict[str, Any]: session.add_suite(pipeline_suite) @pipeline_suite.eval(evaluators=[fake_accuracy]) - def pipeline(case: Annotated[dict, From(pipeline_cases)]) -> str: - return echo_task(case["inputs"]) + def pipeline(case: Annotated[EvalCase, From(pipeline_cases)]) -> str: + return echo_task(case.inputs) ingest_suite = EvalSuite("ingest") session.add_suite(ingest_suite) @ingest_suite.eval(evaluators=[fake_accuracy]) - def ingest(case: Annotated[dict, From(ingest_cases)]) -> str: - return echo_task(case["inputs"]) + def ingest(case: Annotated[EvalCase, From(ingest_cases)]) -> str: + return echo_task(case.inputs) run_session(session) @@ -1080,8 +1081,8 @@ def test_task_without_fixtures_still_works(self) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) result = runner.run() @@ -1096,9 +1097,9 @@ def prefix_service() -> str: single_case = ForEach( [ - {"inputs": "hello", "expected": "PREFIX:hello", "name": "c1"}, + EvalCase(inputs="hello", expected="PREFIX:hello", name="c1"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) session = EvalSession() @@ -1109,10 +1110,10 @@ def prefix_service() -> str: 
@eval_prefixed_suite.eval(evaluators=[fake_accuracy]) async def eval_prefixed( - case: Annotated[dict, From(single_case)], + case: Annotated[EvalCase, From(single_case)], svc: Annotated[str, Use(prefix_service)], ) -> str: - return f"{svc}:{case['inputs']}" + return f"{svc}:{case.inputs}" runner = TestRunner(session) result = runner.run() @@ -1132,11 +1133,11 @@ def expensive_resource() -> str: multi_cases = ForEach( [ - {"inputs": "a", "expected": "resource:a", "name": "c1"}, - {"inputs": "b", "expected": "resource:b", "name": "c2"}, - {"inputs": "c", "expected": "resource:c", "name": "c3"}, + EvalCase(inputs="a", expected="resource:a", name="c1"), + EvalCase(inputs="b", expected="resource:b", name="c2"), + EvalCase(inputs="c", expected="resource:c", name="c3"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) session = EvalSession() @@ -1147,10 +1148,10 @@ def expensive_resource() -> str: @eval_resource_suite.eval(evaluators=[fake_accuracy]) async def eval_resource( - case: Annotated[dict, From(multi_cases)], + case: Annotated[EvalCase, From(multi_cases)], res: Annotated[str, Use(expensive_resource)], ) -> str: - return f"{res}:{case['inputs']}" + return f"{res}:{case.inputs}" runner = TestRunner(session) runner.run() From d3f542cae0ef78e29e5a5b05d175339bf8609b09 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 4 Apr 2026 07:48:20 +0200 Subject: [PATCH 22/60] refactor(evals): enhance docstrings for EvalSuite and EvalSession with detailed functionality descriptions --- protest/evals/session.py | 4 ++-- protest/evals/suite.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/protest/evals/session.py b/protest/evals/session.py index c85527a..419e9f6 100644 --- a/protest/evals/session.py +++ b/protest/evals/session.py @@ -1,4 +1,4 @@ -"""EvalSession — session dédiée aux evals.""" +"""EvalSession — eval-dedicated session with history and default propagation.""" from __future__ import 
annotations @@ -20,7 +20,7 @@ class EvalSession(ProTestSession): - """Session dédiée aux evals. + """Eval-dedicated session with history enabled by default. Usage:: diff --git a/protest/evals/suite.py b/protest/evals/suite.py index f0aba7e..905010c 100644 --- a/protest/evals/suite.py +++ b/protest/evals/suite.py @@ -1,4 +1,4 @@ -"""EvalSuite — suite dédiée aux evals.""" +"""EvalSuite — eval-dedicated suite with judge and model support.""" from __future__ import annotations @@ -17,7 +17,7 @@ class EvalSuite(ProTestSuite): - """Suite dédiée aux evals. + """Eval-dedicated suite that forces kind=EVAL and carries judge/model config. Usage:: From 6b3c203a9a823a3a82dcd13b030d4e14fc38279c Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 4 Apr 2026 15:03:35 +0200 Subject: [PATCH 23/60] feat(reporting): add eval suite and case payloads to web reporting - Added support for emitting an `EVAL_SUITE_END` event with detailed suite-level metrics and score statistics. - Extended `SUITE_END` payloads to include evaluation-related details when processing eval-specific results. 
--- protest/reporting/web.py | 53 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/protest/reporting/web.py b/protest/reporting/web.py index 517de24..5eb7119 100644 --- a/protest/reporting/web.py +++ b/protest/reporting/web.py @@ -28,6 +28,7 @@ TestStartInfo, TestTeardownInfo, ) + from protest.evals.types import EvalSuiteReport try: from websockets.asyncio.server import ( @@ -245,6 +246,33 @@ def on_fixture_teardown_done(self, info: FixtureInfo) -> None: {"name": info.name, "scope": info.scope, "duration": info.duration}, ) + def on_eval_suite_end(self, report: EvalSuiteReport) -> None: + self._send( + "EVAL_SUITE_END", + { + "suiteName": report.suite_name, + "totalCount": report.total_count, + "passedCount": report.passed_count, + "failedCount": report.failed_count, + "passRate": report.pass_rate, + "duration": report.duration, + "scoreStats": [ + { + "name": s.name, + "mean": s.mean, + "median": s.median, + "p5": s.p5, + "p95": s.p95, + } + for s in report.all_score_stats() + ], + "taskTokens": report.total_task_tokens, + "taskCost": report.total_task_cost, + "judgeTokens": report.total_judge_tokens, + "judgeCost": report.total_judge_cost, + }, + ) + def on_suite_end(self, result: SuiteResult) -> None: self._send( "SUITE_END", @@ -276,4 +304,29 @@ def _result_payload( if include_error and result.error: payload["message"] = str(result.error) payload["traceback"] = _format_traceback(result.error) + if result.is_eval and result.eval_payload: + ep = result.eval_payload + payload["evalPayload"] = { + "caseName": ep.case_name, + "passed": ep.passed, + "inputs": ep.inputs, + "output": ep.output, + "expected": ep.expected_output, + "scores": { + name: { + "value": entry.value, + "passed": entry.passed, + "skipped": entry.skipped, + } + for name, entry in ep.scores.items() + }, + "taskDuration": ep.task_duration, + "taskInputTokens": ep.task_input_tokens, + "taskOutputTokens": ep.task_output_tokens, + "taskCost": ep.task_cost, + 
"judgeCallCount": ep.judge_call_count, + "judgeInputTokens": ep.judge_input_tokens, + "judgeOutputTokens": ep.judge_output_tokens, + "judgeCost": ep.judge_cost, + } return payload From 924615f089e9cc5f2d85c87e5b52d0a5f97b0bcc Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 14 Apr 2026 06:34:00 +0200 Subject: [PATCH 24/60] refactor(evals): remove EvalSession, merge history plugins, always-on architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Delete EvalSession — ProTestSession is the only session - Merge HistoryPlugin + EvalHistoryPlugin into single always-on plugin - EvalResultsWriter now always-on (no-op without evals) - Model/judge live entirely on EvalSuite, no session propagation - history=True by default on ProTestSession - Remove apply_defaults, _wire_eval_support, add_suite override --- docs/evals.md | 53 +++----- examples/yorkshire/evals/session.py | 17 +-- protest/core/session.py | 12 +- protest/evals/evaluator.py | 2 +- protest/evals/history.py | 165 ------------------------ protest/evals/results_writer.py | 4 +- protest/evals/session.py | 96 -------------- protest/evals/types.py | 2 +- protest/history/plugin.py | 193 +++++++++++++++++++++++----- tests/evals/test_e2e.py | 67 +++++----- tests/evals/test_judge.py | 45 ++++--- 11 files changed, 249 insertions(+), 407 deletions(-) delete mode 100644 protest/evals/history.py delete mode 100644 protest/evals/session.py diff --git a/docs/evals.md b/docs/evals.md index cca70cc..387267f 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -14,10 +14,9 @@ ProTest evals use the same infrastructure as tests: fixtures, DI, parallelism, t # evals/session.py from typing import Annotated -from protest import ForEach, From +from protest import ForEach, From, ProTestSession from protest.evals import EvalCase, ModelInfo, evaluator from protest.evals.evaluators import contains_keywords -from protest.evals.session import 
EvalSession from protest.evals.suite import EvalSuite cases = ForEach([ @@ -25,9 +24,9 @@ cases = ForEach([ EvalCase(inputs="What is 2+2?", expected="4", name="math"), ]) -session = EvalSession(model=ModelInfo(name="gpt-4o-mini")) +session = ProTestSession() -chatbot_suite = EvalSuite("chatbot") +chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="gpt-4o-mini")) session.add_suite(chatbot_suite) @chatbot_suite.eval(evaluators=[contains_keywords(keywords=["Marie"])]) @@ -51,42 +50,22 @@ protest eval evals.session:session The rest of the pipeline — fixtures, DI, parallelism, reporters — works identically to tests. -## EvalSession - -`EvalSession` is a session configured for evals. History is enabled by default. Model and judge set on the session are propagated as defaults to `EvalSuite` instances added via `session.add_suite()`. - -```python -from protest.evals import ModelInfo -from protest.evals.session import EvalSession - -session = EvalSession( - model=ModelInfo(name="gpt-4o-mini"), # propagated to suites, tracked in history - concurrency=4, # parallel eval cases - metadata={"version": "1.0"}, # stored in history -) -``` - ## EvalSuite -`EvalSuite` groups eval cases. It's the eval equivalent of `ProTestSuite` — it forces `kind=EVAL` and carries model/judge configuration. +`EvalSuite` groups eval cases. It's the eval equivalent of `ProTestSuite` — it forces `kind=EVAL` and carries model/judge configuration. Model and judge are suite-level config: each suite declares which model produced its results and which judge scores them. 
```python from protest.evals.suite import EvalSuite +from protest.evals import ModelInfo -chatbot_suite = EvalSuite("chatbot") -session.add_suite(chatbot_suite) # model/judge propagated from session +chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="gpt-4o-mini")) +session.add_suite(chatbot_suite) @chatbot_suite.eval(evaluators=[my_scorer]) async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: return await my_agent(case.inputs) ``` -Per-suite model override: - -```python -chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="mistral-7b")) -``` - ## EvalCase Typed dataclass for eval case data. Provides IDE autocompletion instead of untyped dicts. @@ -189,7 +168,7 @@ The threshold (`min_recall`) is a parameter of the evaluator, not a framework co ### Async (LLM Judge) -Use `ctx.judge()` for structured LLM evaluation (requires `judge=` on `EvalSession`): +Use `ctx.judge()` for structured LLM evaluation (requires `judge=` on `EvalSuite`): ```python @dataclass @@ -305,7 +284,7 @@ async def pipeline_eval( `ModelInfo` is a **label for history tracking** — it does not configure or route to any model. It records which model produced the results so you can compare runs. ```python -session = EvalSession(model=ModelInfo(name="qwen-2.5")) +suite = EvalSuite("pipeline", model=ModelInfo(name="qwen-2.5")) ``` ## Judge @@ -358,7 +337,8 @@ return JudgeResponse(output=result.output) # tokens/cost = None, that's fine ### Configuring the Judge ```python -session = EvalSession( +suite = EvalSuite( + "pipeline", model=ModelInfo(name="qwen-2.5"), judge=PydanticAIJudge(model="gpt-4o-mini", temperature=0), ) @@ -394,7 +374,7 @@ async def simple_judge(ctx: EvalContext) -> bool: ### No Judge Configured -If an evaluator calls `ctx.judge()` and no judge was passed to `EvalSession`, a `RuntimeError` is raised. This is treated as an **infrastructure error** (not a test failure), same as a fixture crash. 
+If an evaluator calls `ctx.judge()` and no judge was passed to `EvalSuite`, a `RuntimeError` is raised. This is treated as an **infrastructure error** (not a test failure), same as a fixture crash. ### Usage Tracking @@ -457,13 +437,10 @@ If two evaluators return dataclasses with the same field name (e.g. both have `a Track which model produced each eval suite's results. Each `EvalSuite` can have its own model: ```python -pipeline_model = ModelInfo(name="qwen-2.5") -chat_model = ModelInfo(name="mistral-7b") +session = ProTestSession() -session = EvalSession(model=pipeline_model) # default model - -pipeline_suite = EvalSuite("pipeline") # inherits pipeline_model from session -chatbot_suite = EvalSuite("chatbot", model=chat_model) # override +pipeline_suite = EvalSuite("pipeline", model=ModelInfo(name="qwen-2.5")) +chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="mistral-7b")) session.add_suite(pipeline_suite) session.add_suite(chatbot_suite) diff --git a/examples/yorkshire/evals/session.py b/examples/yorkshire/evals/session.py index f1800d8..f03d733 100644 --- a/examples/yorkshire/evals/session.py +++ b/examples/yorkshire/evals/session.py @@ -15,20 +15,21 @@ suite_evaluators, yorkshire_cases, ) -from protest import From -from protest.evals import ModelInfo -from protest.evals.session import EvalSession +from protest import From, ProTestSession +from protest.evals import EvalCase, ModelInfo from protest.evals.suite import EvalSuite -session = EvalSession( - model=ModelInfo(name="yorkshire-chatbot-v1", provider="local"), +session = ProTestSession( metadata={"version": "1.0", "type": "keyword-matching"}, ) -yorkshire_suite = EvalSuite("yorkshire_eval") +yorkshire_suite = EvalSuite( + "yorkshire_eval", + model=ModelInfo(name="yorkshire-chatbot-v1", provider="local"), +) session.add_suite(yorkshire_suite) @yorkshire_suite.eval(evaluators=suite_evaluators) -def yorkshire_eval(case: Annotated[dict, From(yorkshire_cases)]) -> str: - return 
yorkshire_chatbot(case["inputs"]) +def yorkshire_eval(case: Annotated[EvalCase, From(yorkshire_cases)]) -> str: + return yorkshire_chatbot(case.inputs) diff --git a/protest/core/session.py b/protest/core/session.py index daafa74..4b3d008 100644 --- a/protest/core/session.py +++ b/protest/core/session.py @@ -27,6 +27,7 @@ normalize_skip, normalize_xfail, ) +from protest.evals.results_writer import EvalResultsWriter from protest.events.bus import EventBus from protest.events.types import Event from protest.exceptions import InvalidMaxConcurrencyError @@ -34,6 +35,7 @@ from protest.filters.keyword import KeywordFilterPlugin from protest.filters.kind import KindFilterPlugin from protest.filters.suite import SuiteFilterPlugin +from protest.history.plugin import HistoryPlugin from protest.reporting.ascii import AsciiReporter from protest.reporting.ctrf import CTRFReporter from protest.reporting.log_file import LogFilePlugin @@ -59,7 +61,7 @@ class ProTestSession: def __init__( self, concurrency: int = 1, - history: bool = False, + history: bool = True, history_dir: Path | None = None, metadata: dict[str, Any] | None = None, ) -> None: @@ -268,6 +270,8 @@ def default_plugin_classes() -> list[type[PluginBase]]: SuiteFilterPlugin, KeywordFilterPlugin, KindFilterPlugin, + HistoryPlugin, + EvalResultsWriter, RichReporter, AsciiReporter, CTRFReporter, @@ -278,12 +282,6 @@ def register_default_plugins(self) -> None: """Register all standard ProTest plugins for CLI discovery.""" for plugin_class in self.default_plugin_classes(): self.use(plugin_class) - if self._history: - from protest.history.plugin import ( # noqa: PLC0415 — conditional - HistoryPlugin, - ) - - self.register_plugin(HistoryPlugin(history_dir=self._history_dir)) @property def plugin_classes(self) -> list[type[PluginBase]]: diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 07dc2f2..6baab51 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -93,7 +93,7 @@ async def 
judge(self, prompt: str, output_type: type[T]) -> T: if self._judge is None: raise RuntimeError( f"Evaluator for case '{self.name}' called ctx.judge() but no " - "judge is configured. Pass judge= to EvalSession()." + "judge is configured. Pass judge= to EvalSuite()." ) self._judge_call_count += 1 response = await self._judge.judge(prompt, output_type) diff --git a/protest/evals/history.py b/protest/evals/history.py deleted file mode 100644 index 010ddb8..0000000 --- a/protest/evals/history.py +++ /dev/null @@ -1,165 +0,0 @@ -"""EvalHistoryPlugin — persists eval run results as JSONL with model/scores.""" - -from __future__ import annotations - -import uuid -from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any - -from protest.entities import SuiteKind -from protest.history.collector import collect_env_info, collect_git_info -from protest.history.storage import ( - DEFAULT_HISTORY_DIR, - HISTORY_FILE, - append_entry, - load_history, - load_previous_run, -) -from protest.plugin import PluginBase - -if TYPE_CHECKING: - from pathlib import Path - - from protest.core.session import ProTestSession - from protest.evals.types import EvalCaseResult, EvalSuiteReport, ModelInfo - from protest.plugin import PluginContext - - -class EvalHistoryPlugin(PluginBase): - """Persists eval results to JSONL with model/judge/scores metadata. - - Listens to EVAL_SUITE_END events (emitted by the core runner). 
- """ - - name = "eval-history" - description = "Eval history tracking" - - def __init__( - self, - *, - history_dir: Path | None = None, - model: ModelInfo | None = None, - judge: dict[str, Any] | None = None, - metadata: dict[str, Any] | None = None, - ) -> None: - self._history_dir = history_dir or DEFAULT_HISTORY_DIR - self._history_file = self._history_dir / HISTORY_FILE - self._model = model - self._judge = judge - self._metadata = dict(metadata) if metadata else {} - self._reports: dict[str, EvalSuiteReport] = {} - - _suite_metadata: dict[str, dict[str, Any]] - - @classmethod - def activate(cls, ctx: PluginContext) -> EvalHistoryPlugin | None: - return None # Wired explicitly by session - - def setup(self, session: ProTestSession) -> None: - """Collect per-suite metadata from session.""" - self._suite_metadata = {} - for suite in session.suites: - if suite.kind == SuiteKind.EVAL: - self._suite_metadata[suite.name] = suite.suite_metadata - - def on_eval_suite_end(self, report: EvalSuiteReport) -> None: - """Collect suite reports as they arrive.""" - self._reports[report.suite_name] = report - - def on_session_end(self, _result: Any) -> None: - """Write all collected reports to history.""" - if not self._reports: - return - entry = _build_entry( - self._reports, - self._model, - self._judge, - self._metadata, - self._suite_metadata, - ) - append_entry(self._history_file, entry) - - def load_entries(self, n: int | None = None) -> list[dict[str, Any]]: - """Load entries from history file.""" - return load_history(history_dir=self._history_dir, n=n, evals_only=True) - - -def _build_entry( - reports: dict[str, EvalSuiteReport], - model: ModelInfo | None, - judge: dict[str, Any] | None, - metadata: dict[str, Any] | None = None, - all_suite_metadata: dict[str, dict[str, Any]] | None = None, -) -> dict[str, Any]: - """Build a complete history entry covering all suites in the session.""" - suites_data: dict[str, Any] = {} - all_score_stats: list[Any] = [] - - for 
suite_name, report in reports.items(): - sm = (all_suite_metadata or {}).get(suite_name, {}) - suite_model = sm.get("model") or (model.name if model else None) - suite_provider = sm.get("provider") or (model.provider if model else None) - suites_data[suite_name] = { - "kind": "eval", - "model": suite_model, - "provider": suite_provider, - "total_cases": report.total_count, - "passed": report.passed_count, - "failed": report.failed_count, - "errored": report.errored_count, - "pass_rate": round(report.pass_rate, 4), - "duration": round(report.duration, 2), - "cases": {c.case_name: _serialize_case(c) for c in report.cases}, - } - all_score_stats.extend(report.all_score_stats()) - - scores_summary = { - s.name: { - "mean": round(s.mean, 4), - "median": round(s.median, 4), - "p5": round(s.p5, 4), - "p95": round(s.p95, 4), - "min": round(s.min, 4), - "max": round(s.max, 4), - "count": s.count, - } - for s in all_score_stats - } - - return { - "run_id": str(uuid.uuid4()), - "timestamp": datetime.now(tz=timezone.utc).isoformat(), - "git": collect_git_info(), - "environment": collect_env_info(), - "metadata": dict(metadata) if metadata else {}, - "evals": { - "model": model.name if model else None, - "provider": model.provider if model else None, - "judge": judge, - "scores_summary": scores_summary, - }, - "suites": suites_data, - } - - -def _serialize_case(case: EvalCaseResult) -> dict[str, Any]: - entry: dict[str, Any] = { - "passed": case.passed, - "is_error": case.is_error, - "duration": round(case.duration, 3), - "scores": {s.name: s.value for s in case.scores if s.is_metric}, - "case_hash": case.case_hash, - "eval_hash": case.eval_hash, - } - labels = {s.name: s.value for s in case.scores if isinstance(s.value, str)} - if labels: - entry["labels"] = labels - assertions = {s.name: s.value for s in case.scores if isinstance(s.value, bool)} - if assertions: - entry["assertions"] = assertions - return entry - - -def load_previous_eval_run(history_dir: Any = None) -> 
dict[str, Any] | None: - """Load the most recent eval run from history.""" - return load_previous_run(history_dir=history_dir, evals_only=True) diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py index e069bba..67ca569 100644 --- a/protest/evals/results_writer.py +++ b/protest/evals/results_writer.py @@ -34,8 +34,8 @@ def __init__(self, history_dir: Path | None = None) -> None: self._run_dirs: dict[str, Path] = {} @classmethod - def activate(cls, ctx: PluginContext) -> EvalResultsWriter | None: - return None # Wired explicitly by session + def activate(cls, ctx: PluginContext) -> EvalResultsWriter: + return cls(history_dir=ctx.get("history_dir")) def on_test_pass(self, result: TestResult) -> None: self._maybe_write(result, passed=True) diff --git a/protest/evals/session.py b/protest/evals/session.py deleted file mode 100644 index 419e9f6..0000000 --- a/protest/evals/session.py +++ /dev/null @@ -1,96 +0,0 @@ -"""EvalSession — eval-dedicated session with history and default propagation.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -from protest.core.session import ProTestSession -from protest.entities import SuiteKind -from protest.evals.history import EvalHistoryPlugin -from protest.evals.results_writer import EvalResultsWriter -from protest.evals.suite import EvalSuite -from protest.evals.types import JudgeInfo - -if TYPE_CHECKING: - from pathlib import Path - - from protest.core.suite import ProTestSuite - from protest.evals.types import Judge, ModelInfo - from protest.plugin import PluginContext - - -class EvalSession(ProTestSession): - """Eval-dedicated session with history enabled by default. 
- - Usage:: - - session = EvalSession(model=ModelInfo(name="qwen-2.5")) - - chatbot = EvalSuite("chatbot") - session.add_suite(chatbot) - - @chatbot.eval(evaluators=[contains_facts]) - async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: - return await ask(case.inputs) - """ - - def __init__( - self, - *, - model: ModelInfo | None = None, - judge: Judge | None = None, - concurrency: int = 1, - history: bool = True, - history_dir: Path | None = None, - metadata: dict[str, Any] | None = None, - ) -> None: - super().__init__( - concurrency=concurrency, - history=history, - history_dir=history_dir, - metadata=metadata, - ) - self._eval_model = model - self._eval_judge_instance: Judge | None = judge - self._eval_judge: JudgeInfo | None = ( - JudgeInfo(name=judge.name, provider=judge.provider) - if judge is not None - else None - ) - - def add_suite(self, suite: ProTestSuite) -> None: - """Add a suite, propagating session-level model/judge as defaults.""" - if isinstance(suite, EvalSuite): - if suite.judge is None and self._eval_judge_instance is not None: - suite._judge = self._eval_judge_instance - if self._eval_model and "model" not in suite.suite_metadata: - suite._metadata["model"] = self._eval_model.name - suite._metadata["provider"] = self._eval_model.provider - super().add_suite(suite) - - def activate_plugins(self, ctx: PluginContext) -> None: - """Activate plugins, then wire eval support if needed.""" - super().activate_plugins(ctx) - if any(s.kind == SuiteKind.EVAL for s in self._suites): - self._wire_eval_support() - - def _wire_eval_support(self) -> None: - """Wire eval history + results writer plugins.""" - judge_dict = None - if self._eval_judge: - judge_dict = { - "name": self._eval_judge.name, - "provider": self._eval_judge.provider, - "evaluators": list(self._eval_judge.evaluators), - } - - history = EvalHistoryPlugin( - history_dir=self._history_dir, - model=self._eval_model, - judge=judge_dict, - metadata=self._metadata, - ) - 
self.register_plugin(history) - - writer = EvalResultsWriter(history_dir=self._history_dir) - self.register_plugin(writer) diff --git a/protest/evals/types.py b/protest/evals/types.py index 08543c6..7a2c19a 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -82,7 +82,7 @@ async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: result = await agent.run(prompt) return JudgeResponse(output=result.output, input_tokens=100) - session = EvalSession(judge=MyJudge()) + suite = EvalSuite("chatbot", judge=MyJudge()) """ name: str diff --git a/protest/history/plugin.py b/protest/history/plugin.py index c8a0f79..930ca61 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -1,4 +1,4 @@ -"""HistoryPlugin — persists test run results as JSONL.""" +"""HistoryPlugin — persists test and eval run results as JSONL.""" from __future__ import annotations @@ -6,66 +6,124 @@ from datetime import datetime, timezone from typing import TYPE_CHECKING, Any +from protest.entities import SuiteKind +from protest.evals.suite import EvalSuite from protest.history.collector import collect_env_info, collect_git_info -from protest.history.storage import DEFAULT_HISTORY_DIR, HISTORY_FILE, append_entry +from protest.history.storage import ( + DEFAULT_HISTORY_DIR, + HISTORY_FILE, + append_entry, + load_previous_run, +) from protest.plugin import PluginBase if TYPE_CHECKING: from pathlib import Path from protest.core.session import ProTestSession - from protest.entities import SuiteKind - from protest.entities.events import SessionResult, TestResult + from protest.entities.events import TestResult + from protest.evals.types import EvalCaseResult, EvalSuiteReport from protest.plugin import PluginContext class HistoryPlugin(PluginBase): - """Persists test results to JSONL for run-over-run tracking.""" + """Persists test and eval results to JSONL for run-over-run tracking. + + Always-on plugin. 
When history is disabled on the session, all handlers + are no-ops. Handles both test results (on_test_pass/fail) and eval + results (on_eval_suite_end). + """ name = "history" - description = "Test history tracking" + description = "Run history tracking" def __init__(self, history_dir: Path | None = None) -> None: self._history_dir = history_dir or DEFAULT_HISTORY_DIR self._history_file = self._history_dir / HISTORY_FILE - self._suites: dict[str, dict[str, dict[str, Any]]] = {} + # Test data + self._test_suites: dict[str, dict[str, dict[str, Any]]] = {} self._suite_kinds: dict[str, SuiteKind] = {} self._default_suite_name: str = "tests" - self._history_enabled: bool = False + # Eval data + self._eval_reports: dict[str, EvalSuiteReport] = {} + self._eval_suite_metadata: dict[str, dict[str, Any]] = {} + self._eval_judge_info: dict[str, dict[str, Any]] = {} + # Session state + self._enabled: bool = False self._metadata: dict[str, Any] = {} @classmethod def activate(cls, ctx: PluginContext) -> HistoryPlugin | None: - return None # Wired explicitly by session + if ctx.get("no_history", False): + return None + return cls(history_dir=ctx.get("history_dir")) def setup(self, session: ProTestSession) -> None: - self._history_enabled = session.history + self._enabled = session.history self._metadata = dict(session.metadata) + if session.history_dir: + self._history_dir = session.history_dir + self._history_file = self._history_dir / HISTORY_FILE for suite in session.suites: self._suite_kinds[suite.name] = suite.kind - if not self._default_suite_name or self._default_suite_name == "tests": + if suite.kind == SuiteKind.EVAL: + self._eval_suite_metadata[suite.name] = suite.suite_metadata + if isinstance(suite, EvalSuite) and suite.judge is not None: + self._eval_judge_info[suite.name] = { + "name": suite.judge.name, + "provider": getattr(suite.judge, "provider", None), + } + elif not self._default_suite_name or self._default_suite_name == "tests": self._default_suite_name = 
suite.name + # -- Test event handlers -------------------------------------------------- + def on_test_pass(self, result: TestResult) -> None: - if result.is_eval: + if not self._enabled or result.is_eval: return - self._record(result, passed=True) + self._record_test(result, passed=True) def on_test_fail(self, result: TestResult) -> None: - if result.is_eval: + if not self._enabled or result.is_eval: return - self._record(result, passed=False) + self._record_test(result, passed=False) + + def _record_test(self, result: TestResult, *, passed: bool) -> None: + suite_name = ( + result.suite_path.root_name + if result.suite_path + else self._default_suite_name + ) + if suite_name not in self._test_suites: + self._test_suites[suite_name] = {} + self._test_suites[suite_name][result.name] = { + "passed": passed, + "duration": round(result.duration, 3), + } + + # -- Eval event handlers -------------------------------------------------- - def on_session_end(self, _result: SessionResult) -> None: - if not self._history_enabled or not self._suites: + def on_eval_suite_end(self, report: EvalSuiteReport) -> None: + if not self._enabled: + return + self._eval_reports[report.suite_name] = report + + # -- Session end: write combined entry ------------------------------------ + + def on_session_end(self, result: Any) -> None: + if not self._enabled: + return + if not self._test_suites and not self._eval_reports: return suites_data: dict[str, Any] = {} - for suite_name, cases in self._suites.items(): + + # Test suites + for suite_name, cases in self._test_suites.items(): total = len(cases) passed = sum(1 for c in cases.values() if c["passed"]) suites_data[suite_name] = { - "kind": self._suite_kinds.get(suite_name, "test"), + "kind": str(self._suite_kinds.get(suite_name, "test")), "total_cases": total, "passed": passed, "failed": total - passed, @@ -74,27 +132,98 @@ def on_session_end(self, _result: SessionResult) -> None: "cases": cases, } + # Eval suites + all_score_stats: 
list[Any] = [] + for suite_name, report in self._eval_reports.items(): + sm = self._eval_suite_metadata.get(suite_name, {}) + suites_data[suite_name] = { + "kind": "eval", + "model": sm.get("model"), + "provider": sm.get("provider"), + "total_cases": report.total_count, + "passed": report.passed_count, + "failed": report.failed_count, + "errored": report.errored_count, + "pass_rate": round(report.pass_rate, 4), + "duration": round(report.duration, 2), + "cases": {c.case_name: _serialize_eval_case(c) for c in report.cases}, + } + all_score_stats.extend(report.all_score_stats()) + + # Build evals summary (non-null only if we have eval data) + evals_summary = None + if self._eval_reports: + # Derive top-level model from first eval suite (or None if mixed) + models = { + sm.get("model") + for sm in self._eval_suite_metadata.values() + if sm.get("model") + } + top_model = models.pop() if len(models) == 1 else None + providers = { + sm.get("provider") + for sm in self._eval_suite_metadata.values() + if sm.get("provider") + } + top_provider = providers.pop() if len(providers) == 1 else None + + # Aggregate judge info (first one found, or None) + judge_dict = None + if self._eval_judge_info: + first_judge = next(iter(self._eval_judge_info.values())) + judge_dict = first_judge + + scores_summary = { + s.name: { + "mean": round(s.mean, 4), + "median": round(s.median, 4), + "p5": round(s.p5, 4), + "p95": round(s.p95, 4), + "min": round(s.min, 4), + "max": round(s.max, 4), + "count": s.count, + } + for s in all_score_stats + } + + evals_summary = { + "model": top_model, + "provider": top_provider, + "judge": judge_dict, + "scores_summary": scores_summary, + } + entry: dict[str, Any] = { "run_id": str(uuid.uuid4()), "timestamp": datetime.now(tz=timezone.utc).isoformat(), "git": collect_git_info(), "environment": collect_env_info(), "metadata": self._metadata, - "evals": None, + "evals": evals_summary, "suites": suites_data, } append_entry(self._history_file, entry) - def 
_record(self, result: TestResult, *, passed: bool) -> None: - suite_name = self._get_suite_name(result) - if suite_name not in self._suites: - self._suites[suite_name] = {} - self._suites[suite_name][result.name] = { - "passed": passed, - "duration": round(result.duration, 3), - } - def _get_suite_name(self, result: TestResult) -> str: - if result.suite_path: - return result.suite_path.root_name - return self._default_suite_name +def _serialize_eval_case(case: EvalCaseResult) -> dict[str, Any]: + """Serialize an eval case result for JSONL storage.""" + entry: dict[str, Any] = { + "passed": case.passed, + "is_error": case.is_error, + "duration": round(case.duration, 3), + "scores": {s.name: s.value for s in case.scores if s.is_metric}, + "case_hash": case.case_hash, + "eval_hash": case.eval_hash, + } + labels = {s.name: s.value for s in case.scores if isinstance(s.value, str)} + if labels: + entry["labels"] = labels + assertions = {s.name: s.value for s in case.scores if isinstance(s.value, bool)} + if assertions: + entry["assertions"] = assertions + return entry + + +def load_previous_eval_run(history_dir: Any = None) -> dict[str, Any] | None: + """Load the most recent eval run from history.""" + return load_previous_run(history_dir=history_dir, evals_only=True) diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index fc35686..f1fc5d1 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -1,7 +1,7 @@ """End-to-end tests for ProTest evals integration. These tests define the PUBLIC API contract. 
They test what the user sees: -- Session setup (EvalSession, EvalSuite + @suite.eval with ForEach/From) +- Session setup (ProTestSession, EvalSuite + @suite.eval with ForEach/From) - CLI behavior (protest run vs protest eval) - Output format (scores table, trends, failure messages) - History (JSONL format, stats, significance, clean-dirty) @@ -46,7 +46,6 @@ ) from protest.evals.hashing import compute_case_hash, compute_eval_hash from protest.evals.results_writer import EvalResultsWriter -from protest.evals.session import EvalSession from protest.evals.suite import EvalSuite from protest.evals.types import EvalSuiteReport # noqa: TC001 — used at runtime from protest.filters.kind import KindFilterPlugin @@ -104,11 +103,11 @@ async def async_echo_task(text: str) -> str: # --------------------------------------------------------------------------- -class TestEvalSession: - """EvalSession setup: constructor with model=, EvalSuite + @suite.eval.""" +class TestEvalSetup: + """Eval setup: ProTestSession + EvalSuite with model=, @suite.eval.""" def test_add_eval_creates_eval_kind(self) -> None: - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -121,18 +120,18 @@ def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: assert len(session._suites) > 0 assert any(s.kind == "eval" for s in session._suites) - def test_model_set_via_constructor(self) -> None: - session = EvalSession(model=ModelInfo(name="test-model")) - assert session._eval_model is not None - assert session._eval_model.name == "test-model" + def test_model_set_via_suite(self) -> None: + suite = EvalSuite("eval_echo", model=ModelInfo(name="test-model")) + assert suite._model is not None + assert suite._model.name == "test-model" def test_metadata_on_constructor(self) -> None: - session = EvalSession(metadata={"env": "test"}) + session = ProTestSession(metadata={"env": "test"}) assert session.metadata["env"] == "test" 
def test_eval_with_bool_verdict(self) -> None: """Evaluator with bool field: case_fail has matches_expected=False -> fail.""" - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -148,7 +147,7 @@ def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: assert result.success is False def test_async_task_works(self) -> None: - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -169,7 +168,7 @@ def test_async_evaluator_does_not_crash(self) -> None: ids=lambda c: c.name, ) - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -196,7 +195,7 @@ def test_test_suite_has_kind_test(self) -> None: assert suite.kind == "test" def test_eval_suite_has_kind_eval(self) -> None: - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -306,7 +305,7 @@ class ReportCapture(PluginBase): def on_eval_suite_end(self, report: Any) -> None: reports.append(report) - session = EvalSession() + session = ProTestSession() session.register_plugin(ReportCapture()) eval_echo_suite = EvalSuite("eval_echo") @@ -334,7 +333,7 @@ class ReportCapture(PluginBase): def on_eval_suite_end(self, report: Any) -> None: reports.append(report) - session = EvalSession() + session = ProTestSession() session.register_plugin(ReportCapture()) eval_echo_suite = EvalSuite("eval_echo") @@ -361,7 +360,7 @@ def on_test_fail(self, result: Any) -> None: if result.error: errors.append(str(result.error)) - session = EvalSession() + session = ProTestSession() session.register_plugin(ErrorCollector()) eval_echo_suite = EvalSuite("eval_echo") @@ -397,7 +396,7 @@ def on_test_pass(self, result: Any) -> None: def on_test_fail(self, result: Any) -> None: collected.append(result) - session = EvalSession() + 
session = ProTestSession() session.register_plugin(Collector()) eval_echo_suite = EvalSuite("eval_echo") @@ -432,7 +431,7 @@ def on_test_setup_done(self, info: Any) -> None: def on_test_teardown_start(self, info: Any) -> None: teardown_ids.append(info.node_id) - session = EvalSession() + session = ProTestSession() session.register_plugin(LifecycleCollector()) eval_echo_suite = EvalSuite("eval_echo") @@ -472,7 +471,7 @@ def crashing_evaluator(ctx: EvalContext) -> bool: ids=lambda c: c.name, ) - session = EvalSession() + session = ProTestSession() session.register_plugin(Collector()) eval_echo_suite = EvalSuite("eval_echo") @@ -522,9 +521,9 @@ class TestHistory: """JSONL history format and querying.""" def _run_eval(self, tmp_path: Path) -> None: - session = EvalSession(model=ModelInfo(name="test-model"), history_dir=tmp_path) + session = ProTestSession(history_dir=tmp_path) - eval_echo_suite = EvalSuite("eval_echo") + eval_echo_suite = EvalSuite("eval_echo", model=ModelInfo(name="test-model")) session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) @@ -585,7 +584,7 @@ def test_history_multiple_runs_append(self, tmp_path: Path) -> None: assert len(lines) == 2 def test_history_metadata_included(self, tmp_path: Path) -> None: - session = EvalSession( + session = ProTestSession( history_dir=tmp_path, metadata={"env": "test", "version": "1.0"}, ) @@ -655,7 +654,7 @@ class TestCaseHashing: def test_case_hash_stored_in_history(self, tmp_path: Path) -> None: """History entries include case_hash and eval_hash per case.""" - session = EvalSession(history_dir=tmp_path) + session = ProTestSession(history_dir=tmp_path) eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -796,7 +795,7 @@ def on_test_fail(self, result: Any) -> None: ids=lambda c: c.name, ) - session = EvalSession() + session = ProTestSession() session.register_plugin(Collector()) eval_echo_suite = EvalSuite("eval_echo") @@ -833,7 +832,7 @@ def 
on_test_fail(self, result: Any) -> None: ids=lambda c: c.name, ) - session = EvalSession() + session = ProTestSession() session.register_plugin(Collector()) eval_echo_suite = EvalSuite("eval_echo") @@ -868,7 +867,7 @@ def bad_evaluator(ctx: EvalContext) -> float: ids=lambda c: c.name, ) - session = EvalSession() + session = ProTestSession() session.register_plugin(Collector()) eval_echo_suite = EvalSuite("eval_echo") @@ -901,7 +900,7 @@ def expensive(ctx: EvalContext) -> bool: call_log.append("expensive") return True - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -934,7 +933,7 @@ def check_b(ctx: EvalContext) -> bool: single = ForEach( [EvalCase(inputs="x", expected="x", name="c1")], ids=lambda c: c.name ) - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -960,7 +959,7 @@ class TestResultsFiles: def _run_eval(self, tmp_path: Path) -> Path: results_dir = tmp_path / "results" - session = EvalSession() + session = ProTestSession() writer = EvalResultsWriter(history_dir=tmp_path) session.register_plugin(writer) @@ -1032,7 +1031,7 @@ def _run_multi(self, tmp_path: Path) -> dict[str, Any]: ids=lambda c: c.name, ) - session = EvalSession(history_dir=tmp_path) + session = ProTestSession(history_dir=tmp_path) pipeline_suite = EvalSuite("pipeline") session.add_suite(pipeline_suite) @@ -1075,7 +1074,7 @@ class TestEvalTaskFixtures: def test_task_without_fixtures_still_works(self) -> None: # basic_cases has one match (case_pass) and one mismatch (case_fail) # fake_accuracy returns matches_expected=False for case_fail -> fail - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -1102,7 +1101,7 @@ def prefix_service() -> str: ids=lambda c: c.name, ) - session = EvalSession() + session = ProTestSession() session.bind(prefix_service) 
eval_prefixed_suite = EvalSuite("eval_prefixed") @@ -1140,7 +1139,7 @@ def expensive_resource() -> str: ids=lambda c: c.name, ) - session = EvalSession() + session = ProTestSession() session.bind(expensive_resource) eval_resource_suite = EvalSuite("eval_resource") diff --git a/tests/evals/test_judge.py b/tests/evals/test_judge.py index 7eece34..e711bdb 100644 --- a/tests/evals/test_judge.py +++ b/tests/evals/test_judge.py @@ -7,7 +7,7 @@ import pytest -from protest import ForEach, From +from protest import ForEach, From, ProTestSession from protest.core.runner import TestRunner from protest.evals import ( EvalContext, @@ -17,7 +17,6 @@ Verdict, evaluator, ) -from protest.evals.session import EvalSession from protest.evals.suite import EvalSuite from protest.plugin import PluginBase @@ -210,7 +209,7 @@ async def test_judge_none_tokens_not_accumulated(self) -> None: # --------------------------------------------------------------------------- -# E2E: EvalSession with judge +# E2E: ProTestSession with judge on EvalSuite # --------------------------------------------------------------------------- single_case = ForEach( @@ -227,8 +226,8 @@ def test_judge_available_in_evaluator(self) -> None: async def judge_evaluator(ctx: EvalContext) -> bool: return await ctx.judge("pass this", bool) - session = EvalSession(judge=FakeJudge()) - eval_echo_suite = EvalSuite("eval_echo") + session = ProTestSession() + eval_echo_suite = EvalSuite("eval_echo", judge=FakeJudge()) session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[judge_evaluator]) @@ -246,8 +245,8 @@ def test_no_judge_is_fixture_error(self) -> None: async def needs_judge(ctx: EvalContext) -> bool: return await ctx.judge("test", bool) - session = EvalSession() # no judge - eval_echo_suite = EvalSuite("eval_echo") + session = ProTestSession() + eval_echo_suite = EvalSuite("eval_echo") # no judge session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[needs_judge]) @@ -278,8 +277,8 @@ async def 
double_judge(ctx: EvalContext) -> bool: r2 = await ctx.judge("pass second", bool) return r1 and r2 - session = EvalSession(judge=FakeJudge()) - eval_echo_suite = EvalSuite("eval_echo") + session = ProTestSession() + eval_echo_suite = EvalSuite("eval_echo", judge=FakeJudge()) session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[double_judge]) @@ -305,17 +304,17 @@ def on_test_pass(self, result: Any) -> None: assert payload.judge_output_tokens == 10 # 5 per call x 2 assert payload.judge_cost == pytest.approx(0.002) # 0.001 per call x 2 - def test_judge_info_derived_from_instance(self) -> None: - """EvalSession derives JudgeInfo from Judge instance.""" - session = EvalSession(judge=FakeJudge()) - assert session._eval_judge is not None - assert session._eval_judge.name == "fake-judge" - assert session._eval_judge.provider == "test" + def test_judge_info_derived_from_suite(self) -> None: + """EvalSuite derives JudgeInfo from Judge instance.""" + suite = EvalSuite("eval_echo", judge=FakeJudge()) + assert suite._judge is not None + assert suite._judge.name == "fake-judge" + assert suite._judge.provider == "test" def test_no_judge_no_judge_info(self) -> None: - """EvalSession without judge has no JudgeInfo.""" - session = EvalSession() - assert session._eval_judge is None + """EvalSuite without judge has no JudgeInfo.""" + suite = EvalSuite("eval_echo") + assert suite._judge is None def test_judge_with_structured_output(self) -> None: """Judge returns structured dataclass via output_type.""" @@ -335,8 +334,8 @@ async def judge(self, prompt: str, output_type: type) -> JudgeResponse: async def struct_evaluator(ctx: EvalContext) -> JudgeVerdict: return await ctx.judge("evaluate this", JudgeVerdict) - session = EvalSession(judge=StructuredJudge()) - eval_echo_suite = EvalSuite("eval_echo") + session = ProTestSession() + eval_echo_suite = EvalSuite("eval_echo", judge=StructuredJudge()) session.add_suite(eval_echo_suite) 
@eval_echo_suite.eval(evaluators=[struct_evaluator]) @@ -361,7 +360,7 @@ def test_task_result_unwrapped_for_evaluators(self) -> None: def check_output(ctx: EvalContext) -> bool: return ctx.output == "hello" # sees str, not TaskResult - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -385,7 +384,7 @@ def test_task_usage_in_payload(self) -> None: def always_pass(ctx: EvalContext) -> bool: return True - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -423,7 +422,7 @@ def test_plain_return_has_zero_task_usage(self) -> None: def always_pass(ctx: EvalContext) -> bool: return True - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) From 9c5830230ecd1bcdaad4ec0c116fff7457305320 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 14 Apr 2026 20:33:31 +0200 Subject: [PATCH 25/60] refactor(evals): replace evaluator function wrapper with `Evaluator` class - Introduced `Evaluator --- protest/evals/__init__.py | 2 + protest/evals/evaluator.py | 75 ++++++++++++++++++++++---------------- protest/evals/hashing.py | 16 ++++++-- protest/evals/wrapper.py | 11 ++++-- protest/history/plugin.py | 2 +- 5 files changed, 67 insertions(+), 39 deletions(-) diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index 8584eff..c985114 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -3,6 +3,7 @@ from protest.evals.evaluator import ( EvalCase, EvalContext, + Evaluator, Metric, Reason, ShortCircuit, @@ -27,6 +28,7 @@ "EvalContext", "EvalScore", "EvalSuiteReport", + "Evaluator", "Judge", "JudgeInfo", "JudgeResponse", diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 6baab51..569ce83 100644 --- a/protest/evals/evaluator.py +++ 
b/protest/evals/evaluator.py @@ -32,8 +32,6 @@ async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: from __future__ import annotations import dataclasses -import functools -import inspect from dataclasses import dataclass, field from typing import ( TYPE_CHECKING, @@ -50,6 +48,8 @@ async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: from protest.evals.types import EvalScore if TYPE_CHECKING: + from collections.abc import Callable + from protest.evals.types import Judge InputT = TypeVar("InputT") @@ -217,37 +217,50 @@ def extract_scores_from_result(result: Any, evaluator_name: str) -> list[Any]: raise TypeError(f"Evaluator must return bool or dataclass, got {type_name}") -def evaluator(fn: Any) -> Any: - """Decorator that turns a function into a protest evaluator. - - The decorated function can be called two ways: +class Evaluator: + """A configured evaluator — callable with identity for hashing. - 1. ``evaluator_fn(ctx)`` — evaluate directly - 2. ``evaluator_fn(keyword=value, ...)`` — returns a bound evaluator (partial) + Created by the ``@evaluator`` decorator. Supports two calling modes: - This is just ``functools.partial`` with nicer ergonomics: when the first - positional argument is an ``EvalContext``, the function evaluates. Otherwise, - all arguments are bound and the result is a new callable expecting only ``ctx``. + 1. ``ev(ctx)`` — evaluate directly (first arg is EvalContext) + 2. 
``ev(keyword=value, ...)`` — bind params, return a new Evaluator """ - sig = inspect.signature(fn) - params = list(sig.parameters.values()) - has_extra_params = len(params) > 1 - @functools.wraps(fn) - def wrapper(*args: Any, **kwargs: Any) -> Any: - # Direct call: first positional arg is an EvalContext + __slots__ = ("_fn", "_kwargs", "_name", "_qualname") + + def __init__( + self, fn: Callable[..., Any], kwargs: dict[str, Any] | None = None + ) -> None: + self._fn = fn + self._kwargs = kwargs or {} + self._name = fn.__name__ + self._qualname = fn.__qualname__ + + @property + def name(self) -> str: + return self._name + + def __call__(self, *args: Any, **kwargs: Any) -> Any: if args and isinstance(args[0], EvalContext): - return fn(*args, **kwargs) - # Bind params → return partial - if has_extra_params and kwargs: - bound = functools.partial(fn, **kwargs) - # Preserve async detection on the partial - bound.__name__ = fn.__name__ # type: ignore[attr-defined] - bound.__qualname__ = fn.__qualname__ # type: ignore[attr-defined] - return bound - # No args at all — if no extra params, this IS the evaluator - if not has_extra_params and not args and not kwargs: - return fn - return fn(*args, **kwargs) - - return wrapper + merged = {**self._kwargs, **kwargs} + return self._fn(*args, **merged) + if kwargs: + return Evaluator(self._fn, {**self._kwargs, **kwargs}) + return self + + def evaluator_identity(self) -> dict[str, Any]: + identity: dict[str, Any] = {"fn": self._qualname} + if self._kwargs: + identity["kwargs"] = self._kwargs + return identity + + def __repr__(self) -> str: + if self._kwargs: + kw = ", ".join(f"{k}={v!r}" for k, v in self._kwargs.items()) + return f"Evaluator({self._name}({kw}))" + return f"Evaluator({self._name})" + + +def evaluator(fn: Callable[..., Any]) -> Evaluator: + """Turn a function into a ProTest evaluator.""" + return Evaluator(fn) diff --git a/protest/evals/hashing.py b/protest/evals/hashing.py index 5ebe725..5b3114a 100644 --- 
a/protest/evals/hashing.py +++ b/protest/evals/hashing.py @@ -15,7 +15,7 @@ import functools import hashlib import json -from typing import Any +from typing import Any, Protocol, runtime_checkable HASH_LENGTH = 12 @@ -24,6 +24,13 @@ class CanonicalError(TypeError): """Raised when an object cannot be converted to a canonical form.""" +@runtime_checkable +class HasEvaluatorIdentity(Protocol): + """Protocol for objects that provide explicit hashing identity.""" + + def evaluator_identity(self) -> dict[str, Any]: ... + + def compute_case_hash(inputs: Any, expected_output: Any) -> str: """Hash the case content (inputs + expected_output).""" data = {"inputs": _canonical(inputs), "expected": _canonical(expected_output)} @@ -56,10 +63,13 @@ def _canonical(obj: Any) -> Any: # noqa: PLR0911 if isinstance(obj, (list, tuple)): return [_canonical(item) for item in obj] if isinstance(obj, dict): - return {str(k): _canonical(v) for k, v in sorted(obj.items())} + return { + str(k): _canonical(v) + for k, v in sorted(obj.items(), key=lambda item: str(item[0])) + } # --- explicit identity (user-controlled) --- - if hasattr(obj, "evaluator_identity"): + if isinstance(obj, HasEvaluatorIdentity): return _canonical(obj.evaluator_identity()) # --- introspection fallback --- diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index e9161f1..d4278c0 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -16,6 +16,7 @@ from protest.evals.evaluator import ( EvalCase, EvalContext, + Evaluator, ShortCircuit, extract_scores_from_result, ) @@ -181,7 +182,7 @@ async def run_evaluators( scores.extend(await _run_short_circuit(ev.evaluators, ctx)) continue - evaluator_name = getattr(ev, "__name__", type(ev).__name__) + evaluator_name = ev.name if isinstance(ev, Evaluator) else type(ev).__name__ try: raw = ev(ctx) result = await raw if asyncio.iscoroutine(raw) else raw @@ -199,7 +200,7 @@ async def _run_short_circuit( """Run evaluators in order, stop at first 
Verdict=False.""" scores: list[EvalScore] = [] for i, ev in enumerate(evaluators): - evaluator_name = getattr(ev, "__name__", type(ev).__name__) + evaluator_name = ev.name if isinstance(ev, Evaluator) else type(ev).__name__ try: raw = ev(ctx) result = await raw if asyncio.iscoroutine(raw) else raw @@ -210,8 +211,10 @@ async def _run_short_circuit( if any(s.is_verdict and not s.passed for s in extracted): # Mark remaining evaluators as skipped for skipped_ev in evaluators[i + 1 :]: - skipped_name = getattr( - skipped_ev, "__name__", type(skipped_ev).__name__ + skipped_name = ( + skipped_ev.name + if isinstance(skipped_ev, Evaluator) + else type(skipped_ev).__name__ ) scores.append(EvalScore(name=skipped_name, value=False, skipped=True)) break diff --git a/protest/history/plugin.py b/protest/history/plugin.py index 930ca61..eac653c 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -71,7 +71,7 @@ def setup(self, session: ProTestSession) -> None: if isinstance(suite, EvalSuite) and suite.judge is not None: self._eval_judge_info[suite.name] = { "name": suite.judge.name, - "provider": getattr(suite.judge, "provider", None), + "provider": suite.judge.provider, } elif not self._default_suite_name or self._default_suite_name == "tests": self._default_suite_name = suite.name From 6f6d16a79077ad56fb41056a7c983040cd197c2e Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 14 Apr 2026 21:05:51 +0200 Subject: [PATCH 26/60] refactor(examples): replace dict-based eval cases with `EvalCase` objects in Yorkshire example dataset --- examples/yorkshire/evals/dataset.py | 169 ++++++++++++++-------------- 1 file changed, 85 insertions(+), 84 deletions(-) diff --git a/examples/yorkshire/evals/dataset.py b/examples/yorkshire/evals/dataset.py index 7153ab6..e7ad926 100644 --- a/examples/yorkshire/evals/dataset.py +++ b/examples/yorkshire/evals/dataset.py @@ -3,6 +3,7 @@ from __future__ import annotations from 
protest import ForEach +from protest.evals import EvalCase from protest.evals.evaluators import ( contains_keywords, does_not_contain, @@ -13,109 +14,109 @@ yorkshire_cases = ForEach( [ # --- Factual recall --- - { - "name": "weight_question", - "inputs": "How much does a Yorkshire Terrier weigh?", - "expected": "2-3 kg", - "metadata": {"tags": ["factual", "size"]}, - "evaluators": [ + EvalCase( + name="weight_question", + inputs="How much does a Yorkshire Terrier weigh?", + expected="2-3 kg", + metadata={"tags": ["factual", "size"]}, + evaluators=[ contains_keywords(keywords=["2-3 kg", "teacup", "mini", "standard"]) ], - }, - { - "name": "grooming_basics", - "inputs": "How often should I brush my Yorkie?", - "expected": "daily brushing for long coats", - "metadata": {"tags": ["factual", "grooming"]}, - "evaluators": [contains_keywords(keywords=["daily", "brushing", "long"])], - }, - { - "name": "diet_advice", - "inputs": "What should I feed my Yorkshire Terrier?", - "expected": "small breed formula, 2-3 meals", - "metadata": {"tags": ["factual", "diet"]}, - "evaluators": [ + ), + EvalCase( + name="grooming_basics", + inputs="How often should I brush my Yorkie?", + expected="daily brushing for long coats", + metadata={"tags": ["factual", "grooming"]}, + evaluators=[contains_keywords(keywords=["daily", "brushing", "long"])], + ), + EvalCase( + name="diet_advice", + inputs="What should I feed my Yorkshire Terrier?", + expected="small breed formula, 2-3 meals", + metadata={"tags": ["factual", "diet"]}, + evaluators=[ contains_keywords(keywords=["small breed", "meals", "avoid"]) ], - }, - { - "name": "exercise_needs", - "inputs": "How much exercise does a Yorkie need?", - "expected": "30 minutes daily", - "metadata": {"tags": ["factual", "exercise"]}, - "evaluators": [contains_keywords(keywords=["30 minutes", "walk"])], - }, + ), + EvalCase( + name="exercise_needs", + inputs="How much exercise does a Yorkie need?", + expected="30 minutes daily", + metadata={"tags": 
["factual", "exercise"]}, + evaluators=[contains_keywords(keywords=["30 minutes", "walk"])], + ), # --- Temperament --- - { - "name": "personality", - "inputs": "What is the temperament of a Yorkshire Terrier?", - "expected": "bold, confident, affectionate", - "metadata": {"tags": ["factual", "temperament"]}, - "evaluators": [ + EvalCase( + name="personality", + inputs="What is the temperament of a Yorkshire Terrier?", + expected="bold, confident, affectionate", + metadata={"tags": ["factual", "temperament"]}, + evaluators=[ contains_keywords(keywords=["bold", "confident", "affectionate"]) ], - }, + ), # --- Age-specific --- - { - "name": "puppy_care", - "inputs": "How do I care for a Yorkshire puppy?", - "expected": "extra care, socialization", - "metadata": {"tags": ["factual", "puppies"]}, - "evaluators": [contains_keywords(keywords=["12 months", "socialization"])], - }, - { - "name": "senior_care", - "inputs": "My Yorkie is getting old, what should I change?", - "expected": "adjust exercise, more vet visits", - "metadata": {"tags": ["factual", "seniors"]}, - "evaluators": [contains_keywords(keywords=["senior", "exercise", "vet"])], - }, + EvalCase( + name="puppy_care", + inputs="How do I care for a Yorkshire puppy?", + expected="extra care, socialization", + metadata={"tags": ["factual", "puppies"]}, + evaluators=[contains_keywords(keywords=["12 months", "socialization"])], + ), + EvalCase( + name="senior_care", + inputs="My Yorkie is getting old, what should I change?", + expected="adjust exercise, more vet visits", + metadata={"tags": ["factual", "seniors"]}, + evaluators=[contains_keywords(keywords=["senior", "exercise", "vet"])], + ), # --- Hallucination checks --- - { - "name": "no_cat_advice", - "inputs": "Tell me about Yorkshire Terrier health", - "expected": "dental problems, patellar luxation", - "metadata": {"tags": ["safety"]}, - "evaluators": [ + EvalCase( + name="no_cat_advice", + inputs="Tell me about Yorkshire Terrier health", + expected="dental 
problems, patellar luxation", + metadata={"tags": ["safety"]}, + evaluators=[ does_not_contain(forbidden=["cat", "feline", "persian"]), contains_keywords(keywords=["dental", "health"]), ], - }, - { - "name": "no_made_up_breeds", - "inputs": "What jobs can a Yorkie do?", - "expected": "therapy dogs, companions", - "metadata": {"tags": ["safety"]}, - "evaluators": [ + ), + EvalCase( + name="no_made_up_breeds", + inputs="What jobs can a Yorkie do?", + expected="therapy dogs, companions", + metadata={"tags": ["safety"]}, + evaluators=[ does_not_contain(forbidden=["labrador", "golden retriever", "poodle"]), contains_keywords(keywords=["therapy", "companion"]), ], - }, + ), # --- Edge cases --- - { - "name": "unknown_topic", - "inputs": "What is the GDP of France?", - "expected": "I'm not sure", - "metadata": {"tags": ["edge_case"]}, - "evaluators": [contains_keywords(keywords=["not sure", "specialize"])], - }, - { - "name": "empty_question", - "inputs": "", - "expected": "I'm not sure", - "metadata": {"tags": ["edge_case"]}, - "evaluators": [contains_keywords(keywords=["not sure"])], - }, + EvalCase( + name="unknown_topic", + inputs="What is the GDP of France?", + expected="I'm not sure", + metadata={"tags": ["edge_case"]}, + evaluators=[contains_keywords(keywords=["not sure", "specialize"])], + ), + EvalCase( + name="empty_question", + inputs="", + expected="I'm not sure", + metadata={"tags": ["edge_case"]}, + evaluators=[contains_keywords(keywords=["not sure"])], + ), # --- Known weak spot (chatbot doesn't know about training treats) --- - { - "name": "training_treats", - "inputs": "What treats are best for training a Yorkie?", - "expected": "small soft treats, positive reinforcement", - "metadata": {"tags": ["factual", "training"]}, - "evaluators": [ + EvalCase( + name="training_treats", + inputs="What treats are best for training a Yorkie?", + expected="small soft treats, positive reinforcement", + metadata={"tags": ["factual", "training"]}, + evaluators=[ 
contains_keywords(keywords=["treats", "small", "soft", "reward"]) ], - }, + ), ] ) From 1d42252ffa3d3e20a2c59f72168b18223d763baa Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 14 Apr 2026 21:53:44 +0200 Subject: [PATCH 27/60] docs(evals): clarify that `EvalCase` must replace plain dicts --- docs/evals.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/evals.md b/docs/evals.md index 387267f..96e51d7 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -68,7 +68,7 @@ async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: ## EvalCase -Typed dataclass for eval case data. Provides IDE autocompletion instead of untyped dicts. +Typed dataclass for eval case data. All eval cases **must** use `EvalCase` — plain dicts are not supported. ```python from protest.evals import EvalCase From 9a4ce43eb4f9fb672b5138f854474b5b269c6d14 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 24 Apr 2026 07:37:24 +0200 Subject: [PATCH 28/60] refactor(reporting): centralize shared formatting logic and add CLI options for output/log visibility - Moved duplicate formatting helpers (`format_duration`, `format_usage`) to a new `format` module for reuse. - Added `--show-output` and `--show-logs` CLI options to enhance reporting flexibility. - Updated tests to verify symmetry between Rich and ASCII reporters for structural, CLI, and behavioral consistency. 
--- examples/yorkshire/evals/dataset.py | 4 +- protest/cli/main.py | 15 + protest/reporting/ascii.py | 115 ++++--- protest/reporting/format.py | 39 +++ protest/reporting/rich_reporter.py | 56 +--- tests/reporting/test_reporter_symmetry.py | 369 ++++++++++++++++++++++ tests/reporting/test_rich_reporter.py | 8 +- 7 files changed, 511 insertions(+), 95 deletions(-) create mode 100644 protest/reporting/format.py create mode 100644 tests/reporting/test_reporter_symmetry.py diff --git a/examples/yorkshire/evals/dataset.py b/examples/yorkshire/evals/dataset.py index e7ad926..89b3362 100644 --- a/examples/yorkshire/evals/dataset.py +++ b/examples/yorkshire/evals/dataset.py @@ -35,9 +35,7 @@ inputs="What should I feed my Yorkshire Terrier?", expected="small breed formula, 2-3 meals", metadata={"tags": ["factual", "diet"]}, - evaluators=[ - contains_keywords(keywords=["small breed", "meals", "avoid"]) - ], + evaluators=[contains_keywords(keywords=["small breed", "meals", "avoid"])], ), EvalCase( name="exercise_needs", diff --git a/protest/cli/main.py b/protest/cli/main.py index 648fd26..9c0b324 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -230,6 +230,21 @@ def _create_run_parser() -> argparse.ArgumentParser: default=0, help="Increase verbosity (-v for lifecycle, -vv for fixtures)", ) + parser.add_argument( + "--show-output", + dest="show_output", + action="store_true", + help="Show eval inputs/output/expected per case", + ) + parser.add_argument( + "--show-logs", + dest="show_logs", + nargs="?", + const="INFO", + default=None, + metavar="LEVEL", + help="Show captured log records (default: INFO+)", + ) return parser diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index 9296ae6..64470b8 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -1,3 +1,4 @@ +import logging import sys import traceback from pathlib import Path @@ -23,6 +24,12 @@ ) from protest.evals.types import EvalSuiteReport from protest.plugin import 
PluginBase, PluginContext +from protest.reporting.format import ( + format_duration as _format_duration, +) +from protest.reporting.format import ( + format_usage as _format_usage, +) from protest.reporting.verbosity import Verbosity _MIN_NODE_ID_PARTS = 2 @@ -51,38 +58,23 @@ def _format_test_name(result: TestResult, include_suite: bool = False) -> str: return name -MIN_DURATION_THRESHOLD = 0.001 - - -def _format_duration(seconds: float) -> str: - """Format duration: ms for fast, s for slow.""" - if seconds < MIN_DURATION_THRESHOLD: - return "<1ms" - if seconds < 1: - return f"{seconds * 1000:.0f}ms" - return f"{seconds:.2f}s" - - -_TOKEN_K_THRESHOLD = 1000 - - -def _format_tokens(tokens: int) -> str: - return ( - f"{tokens / _TOKEN_K_THRESHOLD:.1f}k" - if tokens >= _TOKEN_K_THRESHOLD - else str(tokens) - ) - - -def _format_usage(input_tokens: int, output_tokens: int, cost: float) -> str: +def _format_eval_scores_inline(result: TestResult) -> str: + """Format eval scores for inline display — ASCII version (no glyphs).""" + if not result.eval_payload: + return "" parts: list[str] = [] - if input_tokens > 0 or output_tokens > 0: - parts.append( - f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out" - ) - if cost > 0: - parts.append(f"${cost:.4f}") - return ", ".join(parts) + for name, entry in result.eval_payload.scores.items(): + if entry.skipped: + parts.append(f"{name}=skip") + continue + val = entry.value + if isinstance(val, bool): + parts.append(f"{name}={'pass' if val else 'fail'}") + elif isinstance(val, float): + parts.append(f"{name}={val:.2f}") + else: + parts.append(f"{name}={val}") + return f" {' '.join(parts)}" if parts else "" class AsciiReporter(PluginBase): @@ -91,8 +83,15 @@ class AsciiReporter(PluginBase): name = "ascii-reporter" description = "Plain ASCII reporter" - def __init__(self, verbosity: int = 0) -> None: + def __init__( + self, + verbosity: int = 0, + show_logs: str | None = None, + show_output: bool = False, + ) -> 
None: self._verbosity = verbosity + self._show_logs = show_logs + self._show_output = show_output self._is_parallel = False self._failed_results: list[TestResult] = [] self._error_results: list[TestResult] = [] @@ -100,9 +99,36 @@ def __init__(self, verbosity: int = 0) -> None: @classmethod def activate(cls, ctx: PluginContext) -> Self | None: if ctx.get("no_color", False): - return cls(verbosity=ctx.get("verbosity", 0)) + return cls( + verbosity=ctx.get("verbosity", 0), + show_logs=ctx.get("show_logs"), + show_output=ctx.get("show_output", False), + ) return None + def _print_eval_detail(self, result: TestResult) -> None: + """Print eval inputs/output/expected (enabled by --show-output or on failure).""" + p = result.eval_payload + if not p: + return + if p.inputs is not None: + print(f" | inputs: {str(p.inputs)[:200]}") + if p.output is not None: + print(f" | output: {str(p.output)[:200]}") + if p.expected_output is not None: + print(f" | expected: {str(p.expected_output)[:200]}") + + def _maybe_show_logs(self, result: TestResult) -> None: + """Show captured log records if --show-logs is active.""" + if not self._show_logs or not result.log_records: + return + min_level = getattr(logging, self._show_logs.upper(), logging.INFO) + for record in result.log_records: + if record.levelno >= min_level: + print( + f" LOG [{record.levelname}] {record.name}: {record.getMessage()}" + ) + def on_collection_finish(self, items: list[TestItem]) -> list[TestItem]: self._is_parallel = len(items) > 1 return items @@ -193,7 +219,11 @@ def on_test_pass(self, result: TestResult) -> None: retry_suffix = "" if result.max_attempts > 1: retry_suffix = f" [attempt {result.attempt}/{result.max_attempts}]" - print(f" OK {name} ({duration}){retry_suffix}") + scores_str = _format_eval_scores_inline(result) if result.is_eval else "" + print(f" OK {name} ({duration}){scores_str}{retry_suffix}") + if self._show_output and result.is_eval: + self._print_eval_detail(result) + 
self._maybe_show_logs(result) def on_test_fail(self, result: TestResult) -> None: name = _format_test_name(result, include_suite=self._is_parallel) @@ -216,6 +246,9 @@ def on_test_fail(self, result: TestResult) -> None: if result.output: for line in result.output.rstrip().splitlines(): print(f" | {line}") + if result.is_eval: + self._print_eval_detail(result) + self._maybe_show_logs(result) def on_test_skip(self, result: TestResult) -> None: if self._verbosity >= Verbosity.NORMAL: @@ -257,14 +290,16 @@ def _format_traceback(self, error: Exception) -> str: return "".join(lines) def _print_failure_summary(self) -> None: - if self._failed_results: + non_eval_failures = [r for r in self._failed_results if not r.is_eval] + if non_eval_failures: print("\n=== FAILURES ===") - for result in self._failed_results: + for result in non_eval_failures: self._print_failure_detail(result, is_error=False) - if self._error_results: + non_eval_errors = [r for r in self._error_results if not r.is_eval] + if non_eval_errors: print("\n=== ERRORS ===") - for result in self._error_results: + for result in non_eval_errors: self._print_failure_detail(result, is_error=True) def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: @@ -324,7 +359,9 @@ def on_eval_suite_end(self, report: Any) -> None: print() def on_session_complete(self, result: SessionResult) -> None: - if self._failed_results or self._error_results: + has_non_eval_failures = any(not r.is_eval for r in self._failed_results) + has_non_eval_errors = any(not r.is_eval for r in self._error_results) + if has_non_eval_failures or has_non_eval_errors: self._print_failure_summary() total = ( diff --git a/protest/reporting/format.py b/protest/reporting/format.py new file mode 100644 index 0000000..6e23151 --- /dev/null +++ b/protest/reporting/format.py @@ -0,0 +1,39 @@ +"""Shared formatting helpers used by both Rich and Ascii reporters. 
+ +Only formats that are *truly identical* between the two reporters live here. +Visual rendering (glyphs vs ASCII words, colors) stays in each reporter. +""" + +from __future__ import annotations + +MIN_DURATION_THRESHOLD = 0.001 +_TOKEN_K_THRESHOLD = 1000 + + +def format_duration(seconds: float) -> str: + if seconds < MIN_DURATION_THRESHOLD: + return "<1ms" + if seconds < 1: + return f"{seconds * 1000:.0f}ms" + return f"{seconds:.2f}s" + + +def format_tokens(tokens: int) -> str: + """Format token count: 1234 → '1.2k', 45 → '45'.""" + return ( + f"{tokens / _TOKEN_K_THRESHOLD:.1f}k" + if tokens >= _TOKEN_K_THRESHOLD + else str(tokens) + ) + + +def format_usage(input_tokens: int, output_tokens: int, cost: float) -> str: + """Format usage stats as 'Xk in / Yk out, $0.0042'.""" + parts: list[str] = [] + if input_tokens > 0 or output_tokens > 0: + parts.append( + f"{format_tokens(input_tokens)} in / {format_tokens(output_tokens)} out" + ) + if cost > 0: + parts.append(f"${cost:.4f}") + return ", ".join(parts) diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 506641d..5794457 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -26,6 +26,12 @@ ) from protest.evals.types import EvalSuiteReport from protest.plugin import PluginBase, PluginContext +from protest.reporting.format import ( + format_duration as _format_duration, +) +from protest.reporting.format import ( + format_usage as _format_usage, +) from protest.reporting.verbosity import Verbosity @@ -42,41 +48,6 @@ def _format_test_name(result: TestResult) -> str: return label.replace("[", "\\[") -MIN_DURATION_THRESHOLD = 0.001 - - -def _format_duration(seconds: float) -> str: - if seconds < MIN_DURATION_THRESHOLD: - return "<1ms" - if seconds < 1: - return f"{seconds * 1000:.0f}ms" - return f"{seconds:.2f}s" - - -_TOKEN_K_THRESHOLD = 1000 - - -def _format_tokens(tokens: int) -> str: - """Format token count: 1234 → '1.2k', 45 → '45'.""" - 
return ( - f"{tokens / _TOKEN_K_THRESHOLD:.1f}k" - if tokens >= _TOKEN_K_THRESHOLD - else str(tokens) - ) - - -def _format_usage(input_tokens: int, output_tokens: int, cost: float) -> str: - """Format usage stats as 'Xk in / Yk out, $0.0042'.""" - parts: list[str] = [] - if input_tokens > 0 or output_tokens > 0: - parts.append( - f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out" - ) - if cost > 0: - parts.append(f"${cost:.4f}") - return ", ".join(parts) - - def _format_eval_scores_inline(result: TestResult) -> str: """Format eval scores for inline display (e.g. ' bg_score=0.8 char_id=1.0').""" if not result.eval_payload: @@ -131,21 +102,6 @@ def add_cli_options(cls, parser: ArgumentParser) -> None: action="store_true", help="Disable colors (plain ASCII output)", ) - group.add_argument( - "--show-logs", - dest="show_logs", - nargs="?", - const="INFO", - default=None, - metavar="LEVEL", - help="Show captured log records (default: INFO+)", - ) - group.add_argument( - "--show-output", - dest="show_output", - action="store_true", - help="Show eval inputs/output/expected per case", - ) @classmethod def activate(cls, ctx: PluginContext) -> Self | None: diff --git a/tests/reporting/test_reporter_symmetry.py b/tests/reporting/test_reporter_symmetry.py new file mode 100644 index 0000000..4eec50c --- /dev/null +++ b/tests/reporting/test_reporter_symmetry.py @@ -0,0 +1,369 @@ +"""Symmetry tests between RichReporter and AsciiReporter. + +Goal: catch divergences between the two reporters before they ship as silent +asymmetries. A user who swaps `--no-color` should get the same *semantic* +output (same fields, same filters) — only the visual style differs. + +Three axes are enforced: + +1. Structural — both reporters expose the same public hooks (`on_*` handlers). +2. CLI — both reporters react to the same shared flags (`--show-output`, + `--show-logs`). Reporter-specific flags (`--no-color`) are allowed. +3. 
Behavioral — parametrized scenarios drive the same input through both + reporters and assert the same *semantic* markers appear + (score names for eval pass, eval detail on fail, summary omits + eval failures, etc.). +""" + +from __future__ import annotations + +import argparse +import inspect +import logging +from typing import Any + +import pytest + +from protest.entities import ( + EvalPayload, + EvalScoreEntry, + SessionResult, + TestResult, +) +from protest.plugin import PluginBase, PluginContext +from protest.reporting.ascii import AsciiReporter +from protest.reporting.rich_reporter import RichReporter + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +REPORTER_CLASSES = [RichReporter, AsciiReporter] + +# CLI flags that are handled by the shared run-parser (not by either reporter's +# add_cli_options). Both reporters must still read them via their activate(). +_SHARED_CLI_FLAGS = {"show_output", "show_logs"} + + +def _public_handlers(cls: type[PluginBase]) -> set[str]: + """Return the set of `on_*` handlers defined or overridden on cls. + + Only include methods that are *actually overridden* (not inherited from + PluginBase as no-ops). That's what makes the reporter visible to the bus. + """ + handlers: set[str] = set() + for name, member in inspect.getmembers(cls, predicate=inspect.isfunction): + if not name.startswith("on_"): + continue + # Skip no-op base implementations that a subclass didn't override. 
+ base_member = getattr(PluginBase, name, None) + if base_member is not None and member is base_member: + continue + handlers.add(name) + return handlers + + +def _cli_flag_dests(cls: type[PluginBase]) -> set[str]: + """Return the argparse `dest` names registered by cls.add_cli_options.""" + parser = argparse.ArgumentParser() + cls.add_cli_options(parser) + dests: set[str] = set() + for action in parser._actions: + if action.dest and action.dest != "help": + dests.add(action.dest) + return dests + + +def _make_reporter(cls: type[PluginBase], **kwargs: Any) -> PluginBase: + """Activate a reporter via its own activate() path to exercise wiring.""" + ctx_args = {"no_color": cls is AsciiReporter, "verbosity": 1, **kwargs} + ctx = PluginContext(args=ctx_args) + instance = cls.activate(ctx) + assert instance is not None, f"{cls.__name__}.activate returned None" + return instance + + +def _capture_output(capsys: pytest.CaptureFixture[str]) -> str: + """Capture everything captured so far on stdout+stderr. + + Rich writes via `self.console` (stdout by default), Ascii uses `print`. + Capsys grabs both uniformly. 
+ """ + captured = capsys.readouterr() + return captured.out + captured.err + + +@pytest.fixture +def eval_result_pass() -> TestResult: + """A passing eval TestResult with two scores (one bool, one float).""" + return TestResult( + name="case_alpha", + node_id="mod::chatbot::case_alpha", + duration=0.05, + is_eval=True, + eval_payload=EvalPayload( + case_name="case_alpha", + passed=True, + task_duration=0.05, + inputs="hello", + output="world", + expected_output="world", + scores={ + "contains_world": EvalScoreEntry(value=True, passed=True), + "similarity": EvalScoreEntry(value=0.92, passed=True), + }, + ), + ) + + +@pytest.fixture +def eval_result_fail() -> TestResult: + """A failing eval TestResult (one score fails).""" + return TestResult( + name="case_beta", + node_id="mod::chatbot::case_beta", + duration=0.04, + error=AssertionError("score contains_hi failed"), + is_eval=True, + eval_payload=EvalPayload( + case_name="case_beta", + passed=False, + task_duration=0.04, + inputs="goodbye", + output="farewell", + expected_output="hi", + scores={ + "contains_hi": EvalScoreEntry(value=False, passed=False), + }, + ), + ) + + +@pytest.fixture +def plain_failing_test() -> TestResult: + return TestResult( + name="test_plain_fail", + node_id="mod::test_plain_fail", + duration=0.01, + error=AssertionError("plain failure"), + ) + + +# --------------------------------------------------------------------------- +# 1. Structural symmetry +# --------------------------------------------------------------------------- + + +class TestStructuralSymmetry: + """Ensure the two reporters expose the same public handler surface.""" + + def test_reporters_override_same_handlers(self) -> None: + """Both reporters override the same set of on_* methods. + + If one reporter starts overriding a hook that the other ignores, an + event will be invisible in the "other" reporter — that's the bug we + want to catch at test time, not in production. 
+ """ + rich_handlers = _public_handlers(RichReporter) + ascii_handlers = _public_handlers(AsciiReporter) + + only_in_rich = rich_handlers - ascii_handlers + only_in_ascii = ascii_handlers - rich_handlers + assert not only_in_rich, ( + f"Rich handles events that Ascii doesn't: {sorted(only_in_rich)}" + ) + assert not only_in_ascii, ( + f"Ascii handles events that Rich doesn't: {sorted(only_in_ascii)}" + ) + + +# --------------------------------------------------------------------------- +# 2. CLI symmetry +# --------------------------------------------------------------------------- + + +class TestCliSymmetry: + """Ensure the two reporters consume the same shared flags. + + Reporter-specific flags are allowed (e.g. `--no-color` makes sense only on + the Ascii side) — they're expected to appear in either one but not both. + The rule is: anything in `_SHARED_CLI_FLAGS` must be *activatable* on both + reporters (read from PluginContext via activate()). + """ + + @pytest.mark.parametrize( + "flag,value,attr", + [ + pytest.param("show_output", True, "_show_output", id="show_output"), + pytest.param("show_logs", "INFO", "_show_logs", id="show_logs"), + ], + ) + def test_shared_flags_reach_both_reporters( + self, flag: str, value: Any, attr: str + ) -> None: + """Given a shared run-parser flag, both reporters pick it up via activate().""" + for cls in REPORTER_CLASSES: + reporter = _make_reporter(cls, **{flag: value}) + assert getattr(reporter, attr) == value, ( + f"{cls.__name__} didn't wire flag '{flag}' into attr '{attr}'" + ) + + def test_reporters_dont_redeclare_shared_flags(self) -> None: + """Shared flags live on the run-parser, not on reporter add_cli_options. + + If either reporter redeclares them via add_cli_options, argparse will + raise at runtime when both get wired (cli/main.py iterates plugin + classes and calls add_cli_options on each). 
+ """ + for cls in REPORTER_CLASSES: + dests = _cli_flag_dests(cls) + redeclared = dests & _SHARED_CLI_FLAGS + assert not redeclared, ( + f"{cls.__name__}.add_cli_options redeclares shared flags: " + f"{sorted(redeclared)} — move them to cli._create_run_parser" + ) + + +# --------------------------------------------------------------------------- +# 3. Behavioral symmetry +# --------------------------------------------------------------------------- + + +class TestBehavioralSymmetry: + """Drive the same events through both reporters; assert same semantics. + + We deliberately avoid asserting on *exact* characters: the visual prefix + differs (`✓` vs `OK`, colors vs plain). What must be identical is which + pieces of information are rendered. + """ + + @pytest.mark.parametrize("reporter_cls", REPORTER_CLASSES) + def test_eval_pass_shows_score_names_inline( + self, + reporter_cls: type[PluginBase], + eval_result_pass: TestResult, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Given a passing eval, both reporters surface each score's name inline.""" + reporter = _make_reporter(reporter_cls, verbosity=1) + reporter.on_test_pass(eval_result_pass) + output = _capture_output(capsys) + assert "contains_world" in output, ( + f"{reporter_cls.__name__}: missing score name" + ) + assert "similarity" in output, f"{reporter_cls.__name__}: missing float score" + + @pytest.mark.parametrize("reporter_cls", REPORTER_CLASSES) + def test_eval_fail_shows_detail_inline( + self, + reporter_cls: type[PluginBase], + eval_result_fail: TestResult, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Given a failing eval, both reporters dump inputs/output/expected. + + This must happen regardless of --show-output — the user can't debug + a failed assertion without seeing what the task actually produced. 
+ """ + reporter = _make_reporter(reporter_cls) + reporter.on_test_fail(eval_result_fail) + output = _capture_output(capsys) + assert "goodbye" in output, f"{reporter_cls.__name__}: missing inputs" + assert "farewell" in output, f"{reporter_cls.__name__}: missing output" + assert "hi" in output, f"{reporter_cls.__name__}: missing expected" + + @pytest.mark.parametrize("reporter_cls", REPORTER_CLASSES) + def test_show_output_true_prints_eval_detail_on_pass( + self, + reporter_cls: type[PluginBase], + eval_result_pass: TestResult, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Given --show-output, both reporters print eval detail even on pass.""" + reporter = _make_reporter(reporter_cls, verbosity=1, show_output=True) + reporter.on_test_pass(eval_result_pass) + output = _capture_output(capsys) + assert "hello" in output, f"{reporter_cls.__name__}: missing inputs on pass" + assert "world" in output, f"{reporter_cls.__name__}: missing output on pass" + + @pytest.mark.parametrize("reporter_cls", REPORTER_CLASSES) + def test_show_output_false_omits_eval_detail_on_pass( + self, + reporter_cls: type[PluginBase], + eval_result_pass: TestResult, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Given default --show-output, eval detail is hidden on pass.""" + reporter = _make_reporter(reporter_cls, verbosity=1, show_output=False) + reporter.on_test_pass(eval_result_pass) + output = _capture_output(capsys) + # "hello" and "world" appear in the score name ("contains_world"); + # assert on a unique substring that only appears if the detail block runs. 
+ assert "inputs:" not in output, ( + f"{reporter_cls.__name__}: leaked eval detail without --show-output" + ) + + @pytest.mark.parametrize("reporter_cls", REPORTER_CLASSES) + def test_failure_summary_omits_eval_failures( + self, + reporter_cls: type[PluginBase], + eval_result_fail: TestResult, + plain_failing_test: TestResult, + capsys: pytest.CaptureFixture[str], + ) -> None: + """End-of-session summary must not re-list eval failures. + + Eval cases are already displayed inline via on_test_fail. Re-listing + them in the summary duplicates noise — the pattern agreed on is + "non_eval_failures only". + """ + reporter = _make_reporter(reporter_cls) + reporter.on_test_fail(eval_result_fail) + reporter.on_test_fail(plain_failing_test) + capsys.readouterr() # drop inline fail output + + reporter.on_session_complete( + SessionResult(passed=0, failed=2, errors=0, duration=1.0) + ) + summary = _capture_output(capsys) + + assert "test_plain_fail" in summary, ( + f"{reporter_cls.__name__}: summary lost the plain failure" + ) + # The eval case name should NOT appear in the failure-summary block. + # It may appear in the inline tally above; we only captured summary here. 
+ assert "case_beta" not in summary, ( + f"{reporter_cls.__name__}: summary re-lists eval failure (should be inline only)" + ) + + @pytest.mark.parametrize("reporter_cls", REPORTER_CLASSES) + def test_show_logs_prints_captured_records( + self, + reporter_cls: type[PluginBase], + capsys: pytest.CaptureFixture[str], + ) -> None: + """Given --show-logs INFO, both reporters emit the captured log records.""" + record = logging.LogRecord( + name="mylib.module", + level=logging.INFO, + pathname="x.py", + lineno=1, + msg="captured thing", + args=(), + exc_info=None, + ) + result = TestResult( + name="test_foo", + node_id="mod::test_foo", + duration=0.01, + log_records=(record,), + ) + reporter = _make_reporter(reporter_cls, verbosity=1, show_logs="INFO") + reporter.on_test_pass(result) + output = _capture_output(capsys) + assert "captured thing" in output, ( + f"{reporter_cls.__name__}: --show-logs didn't render the record" + ) + assert "mylib.module" in output, ( + f"{reporter_cls.__name__}: --show-logs didn't render the logger name" + ) diff --git a/tests/reporting/test_rich_reporter.py b/tests/reporting/test_rich_reporter.py index 1452579..4585338 100644 --- a/tests/reporting/test_rich_reporter.py +++ b/tests/reporting/test_rich_reporter.py @@ -15,11 +15,13 @@ TestRetryInfo, ) from protest.events.types import Event -from protest.reporting.rich_reporter import ( +from protest.reporting.format import ( MIN_DURATION_THRESHOLD, - RichReporter, - _format_duration, ) +from protest.reporting.format import ( + format_duration as _format_duration, +) +from protest.reporting.rich_reporter import RichReporter class TestFormatDuration: From ef0c1765fb80a9777531f7ba657f69549329ffaa Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 24 Apr 2026 07:53:38 +0200 Subject: [PATCH 29/60] refactor(reporting, examples, core): add `_safe_repr` for JSON-safe string handling and extend eval support - Introduced `_safe_repr` utility to safely 
truncate and render arbitrary objects for JSON serialization in web reporting. - Updated Yorkshire example to showcase `EvalSuite` API for cleaner and type-safe eval case definitions. - Added `KindFilterPlugin` for improved filtering capabilities in core session logic. - Enhanced eval case serialization to exclude skipped scores, improving `history --compare` accuracy. --- examples/yorkshire/session.py | 26 ++++++++++++++++---------- protest/api.py | 2 ++ protest/history/plugin.py | 23 +++++++++++++++++++---- protest/reporting/web.py | 22 +++++++++++++++++++--- 4 files changed, 56 insertions(+), 17 deletions(-) diff --git a/examples/yorkshire/session.py b/examples/yorkshire/session.py index 7b8c3c3..b723cb9 100644 --- a/examples/yorkshire/session.py +++ b/examples/yorkshire/session.py @@ -5,14 +5,16 @@ Run only tests: protest run examples.yorkshire.session:session - (protest run filters to kind=test by default) + (protest run filters to kind=test) Run only evals: protest eval examples.yorkshire.session:session """ +from typing import Annotated + from examples.yorkshire.app.chatbot import yorkshire_chatbot -from examples.yorkshire.evals.dataset import dataset +from examples.yorkshire.evals.dataset import suite_evaluators, yorkshire_cases from examples.yorkshire.tests.fixtures import ( configure_kennel_logging, kennel, @@ -26,8 +28,9 @@ from examples.yorkshire.tests.suites.rate_limited import rate_limited_suite from examples.yorkshire.tests.suites.seniors.suite import seniors_suite from examples.yorkshire.tests.suites.showcase.suite import showcase_suite -from protest import ProTestSession -from protest.evals import ModelInfo +from protest import From, ProTestSession +from protest.evals import EvalCase, ModelInfo +from protest.evals.suite import EvalSuite session = ProTestSession(concurrency=4, history=True) session.use(BarkPlugin) @@ -35,7 +38,6 @@ session.bind(kennel) session.bind(yorkshire) -# Tests session.add_suite(puppies_suite) session.add_suite(adults_suite) 
session.add_suite(seniors_suite) @@ -44,9 +46,13 @@ session.add_suite(rate_limited_suite) session.add_suite(custom_factory_suite) -# Evals -session.configure_evals(model=ModelInfo(name="yorkshire-chatbot-v1", provider="local")) -session.register_dataset( - dataset, - task=yorkshire_chatbot, +yorkshire_suite = EvalSuite( + "yorkshire_eval", + model=ModelInfo(name="yorkshire-chatbot-v1", provider="local"), ) +session.add_suite(yorkshire_suite) + + +@yorkshire_suite.eval(evaluators=suite_evaluators) +def yorkshire_eval(case: Annotated[EvalCase, From(yorkshire_cases)]) -> str: + return yorkshire_chatbot(case.inputs) diff --git a/protest/api.py b/protest/api.py index ce8c178..7b1e169 100644 --- a/protest/api.py +++ b/protest/api.py @@ -28,6 +28,7 @@ def test_example(): ) from protest.events.types import Event from protest.filters.keyword import KeywordFilterPlugin +from protest.filters.kind import KindFilterPlugin from protest.filters.suite import SuiteFilterPlugin from protest.plugin import PluginBase, PluginContext from protest.tags.plugin import TagFilterPlugin @@ -150,6 +151,7 @@ def collect_tests( # noqa: PLR0913 - public API with many optional params TagFilterPlugin, SuiteFilterPlugin, KeywordFilterPlugin, + KindFilterPlugin, ] for plugin_class in filter_plugins: instance = plugin_class.activate(ctx) diff --git a/protest/history/plugin.py b/protest/history/plugin.py index eac653c..92b7942 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -206,19 +206,34 @@ def on_session_end(self, result: Any) -> None: def _serialize_eval_case(case: EvalCaseResult) -> dict[str, Any]: - """Serialize an eval case result for JSONL storage.""" + """Serialize an eval case result for JSONL storage. + + Skipped scores are excluded: a ShortCircuit skip produces + `EvalScore(value=False, skipped=True)` — serializing it as an assertion + would look like a real failure in `history --compare` diffs. 
+ """ entry: dict[str, Any] = { "passed": case.passed, "is_error": case.is_error, "duration": round(case.duration, 3), - "scores": {s.name: s.value for s in case.scores if s.is_metric}, + "scores": { + s.name: s.value for s in case.scores if s.is_metric and not s.skipped + }, "case_hash": case.case_hash, "eval_hash": case.eval_hash, } - labels = {s.name: s.value for s in case.scores if isinstance(s.value, str)} + labels = { + s.name: s.value + for s in case.scores + if isinstance(s.value, str) and not s.skipped + } if labels: entry["labels"] = labels - assertions = {s.name: s.value for s in case.scores if isinstance(s.value, bool)} + assertions = { + s.name: s.value + for s in case.scores + if isinstance(s.value, bool) and not s.skipped + } if assertions: entry["assertions"] = assertions return entry diff --git a/protest/reporting/web.py b/protest/reporting/web.py index 5eb7119..0b6f915 100644 --- a/protest/reporting/web.py +++ b/protest/reporting/web.py @@ -51,6 +51,22 @@ _broadcast_clients: set[Any] = set() +_REPR_LIMIT = 2048 + + +def _safe_repr(value: Any) -> str | None: + """Render an arbitrary value as a JSON-safe string, capped at _REPR_LIMIT.""" + if value is None: + return None + try: + text = str(value) + except Exception as exc: + text = f"" + if len(text) > _REPR_LIMIT: + text = text[:_REPR_LIMIT] + f"... 
" + return text + + def _format_traceback(error: Exception) -> str: lines = traceback.format_exception(type(error), error, error.__traceback__) return "".join(lines) @@ -309,9 +325,9 @@ def _result_payload( payload["evalPayload"] = { "caseName": ep.case_name, "passed": ep.passed, - "inputs": ep.inputs, - "output": ep.output, - "expected": ep.expected_output, + "inputs": _safe_repr(ep.inputs), + "output": _safe_repr(ep.output), + "expected": _safe_repr(ep.expected_output), "scores": { name: { "value": entry.value, From 67c4887c22c0394395ce6642f0c4646db0ae726a Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 24 Apr 2026 19:28:08 +0200 Subject: [PATCH 30/60] tests: add coverage for `EvalCase` invariants, `history --compare` logic, and CLI argument parsing mutex validations --- docs/evals.md | 6 +- protest/cli/history.py | 22 ++++-- protest/evals/evaluator.py | 19 +++-- protest/evals/wrapper.py | 2 +- tests/evals/test_evalcase.py | 36 ++++++++++ tests/test_history_changes.py | 128 ++++++++++++++++++++++++++++++++++ tests/test_history_cli.py | 112 +++++++++++++++++++++++++++++ 7 files changed, 312 insertions(+), 13 deletions(-) create mode 100644 tests/evals/test_evalcase.py create mode 100644 tests/test_history_changes.py create mode 100644 tests/test_history_cli.py diff --git a/docs/evals.md b/docs/evals.md index 96e51d7..562bcde 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -193,8 +193,8 @@ The judge handles structured output — no text parsing needed. 
See [Judge](#jud Different thresholds per case = different evaluator bindings: ```python -EvalCase(inputs="easy lookup", evaluators=[keyword_check(keywords=["paris"], min_recall=0.9)]), -EvalCase(inputs="hard causal", evaluators=[keyword_check(keywords=["paris"], min_recall=0.3)]), +EvalCase(name="easy_lookup", inputs="easy lookup", evaluators=[keyword_check(keywords=["paris"], min_recall=0.9)]), +EvalCase(name="hard_causal", inputs="hard causal", evaluators=[keyword_check(keywords=["paris"], min_recall=0.3)]), ``` ### ShortCircuit @@ -225,7 +225,7 @@ evaluators=[not_empty] evaluators=[keyword_check(keywords=["python", "async"], min_recall=0.75)] # Per-case evaluators (added to suite-level) -EvalCase(inputs="...", evaluators=[llm_judge(rubric="Check factual accuracy")]) +EvalCase(name="factual_accuracy_case", inputs="...", evaluators=[llm_judge(rubric="Check factual accuracy")]) ``` ### EvalContext diff --git a/protest/cli/history.py b/protest/cli/history.py index e83216d..01198d8 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -20,8 +20,12 @@ def handle_history_command(argv: list[str]) -> None: ) parser.add_argument("--model", type=str, default=None, help="Filter by model name") parser.add_argument("--suite", type=str, default=None, help="Filter by suite name") - parser.add_argument("--runs", action="store_true", help="Show run-by-run list") - parser.add_argument( + + action_group = parser.add_mutually_exclusive_group() + action_group.add_argument( + "--runs", action="store_true", help="Show run-by-run list" + ) + action_group.add_argument( "--show", nargs="?", const=0, @@ -30,11 +34,13 @@ def handle_history_command(argv: list[str]) -> None: metavar="N", help="Detailed panel for Nth most recent run (0=latest)", ) - parser.add_argument( + action_group.add_argument( "--compare", action="store_true", help="Compare 2 most recent runs" ) - parser.add_argument("--evals", action="store_true", help="Eval runs only") - parser.add_argument("--tests", 
action="store_true", help="Test runs only") + + kind_group = parser.add_mutually_exclusive_group() + kind_group.add_argument("--evals", action="store_true", help="Eval runs only") + kind_group.add_argument("--tests", action="store_true", help="Test runs only") parser.add_argument( "--clean-dirty", action="store_true", @@ -313,6 +319,7 @@ def compare(self, current: dict[str, Any], previous: dict[str, Any]) -> None: ("regressed", "Regressions", "red", "-"), ("modified", "Modified", "yellow", "⟳"), ("new", "New", "cyan", "*"), + ("deleted", "Deleted", "red", "✗"), ] has_any = False for key, label, color, marker in labels: @@ -485,6 +492,7 @@ def _classify_changes( "regressed": [], "modified": [], "new": [], + "deleted": [], } for name, curr in curr_cases.items(): prev = prev_cases.get(name) @@ -498,6 +506,9 @@ def _classify_changes( result["fixed"].append(name) elif not curr.get("passed") and prev.get("passed"): result["regressed"].append(name) + for name in prev_cases: + if name not in curr_cases: + result["deleted"].append(name) return result @@ -507,6 +518,7 @@ def _print_changes(changes: dict[str, list[str]]) -> None: "regressed": ("Regressions", "-"), "modified": ("Modified", "⟳"), "new": ("New", "*"), + "deleted": ("Deleted", "✗"), } has_any = False for key, (label, marker) in labels.items(): diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 569ce83..242bb64 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -126,11 +126,15 @@ def judge_cost(self) -> float: class EvalCase: """Typed container for eval case data in ForEach. + `name` is required: it identifies the case across history, reporting, and + file-based output. Two cases sharing a name collide silently in those + downstream consumers. 
+ Usage:: cases = ForEach([ - EvalCase(inputs="Who is Marie?", expected="Marie, Resistance", name="lookup"), - EvalCase(inputs="Who is Pierre?", expected="Pierre, arrest"), + EvalCase(inputs="Who is Marie?", name="marie_lookup", expected="Marie, Resistance"), + EvalCase(inputs="Who is Pierre?", name="pierre_lookup", expected="Pierre, arrest"), ]) @suite.eval(evaluators=[contains_facts]) @@ -139,13 +143,20 @@ def my_eval(case: Annotated[EvalCase, From(cases)]) -> str: """ inputs: Any + name: str expected: Any = None - name: str = "" evaluators: list[Any] = field(default_factory=list) metadata: dict[str, Any] = field(default_factory=dict) + def __post_init__(self) -> None: + if not self.name: + raise ValueError( + "EvalCase.name must be a non-empty string " + "(used for history tracking and case identity)." + ) + def __repr__(self) -> str: - return self.name or f"EvalCase({self.inputs!r})" + return self.name class ShortCircuit: diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index d4278c0..82b25a8 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -124,7 +124,7 @@ def _extract_expected(kwargs: dict[str, Any]) -> Any: def _extract_case_name(kwargs: dict[str, Any], fallback: str) -> str: case = _find_case(kwargs) - if case is None or not case.name: + if case is None: return fallback return case.name diff --git a/tests/evals/test_evalcase.py b/tests/evals/test_evalcase.py new file mode 100644 index 0000000..12435f6 --- /dev/null +++ b/tests/evals/test_evalcase.py @@ -0,0 +1,36 @@ +"""Tests for `EvalCase` construction invariants.""" + +from __future__ import annotations + +import pytest + +from protest.evals import EvalCase + + +class TestEvalCaseRequiresName: + """`name` is required and must be non-empty.""" + + def test_name_required_as_kwarg(self) -> None: + case = EvalCase(inputs="x", name="my_case") + assert case.name == "my_case" + + def test_missing_name_raises(self) -> None: + with pytest.raises(TypeError): + 
EvalCase(inputs="x") # type: ignore[call-arg] + + def test_empty_name_raises(self) -> None: + with pytest.raises(ValueError, match="non-empty"): + EvalCase(inputs="x", name="") + + def test_name_is_second_positional(self) -> None: + case = EvalCase("input_val", "case_name") + assert case.inputs == "input_val" + assert case.name == "case_name" + + +class TestEvalCaseRepr: + """`__repr__` returns the name (no fallback anymore since name is required).""" + + def test_repr_is_name(self) -> None: + case = EvalCase(inputs="x", name="readable_name") + assert repr(case) == "readable_name" diff --git a/tests/test_history_changes.py b/tests/test_history_changes.py new file mode 100644 index 0000000..a24698e --- /dev/null +++ b/tests/test_history_changes.py @@ -0,0 +1,128 @@ +"""Tests for `_classify_changes` — diffing logic for `protest history --compare`. + +Each case entry is a minimal dict mirroring what `_all_cases(entry)` returns +from a history JSONL record: at least `passed`, optionally `case_hash` and +`eval_hash`. 
+""" + +from __future__ import annotations + +from protest.cli.history import _classify_changes + + +def _case( + *, + passed: bool = True, + case_hash: str | None = None, + eval_hash: str | None = None, +) -> dict: + entry: dict = {"passed": passed} + if case_hash is not None: + entry["case_hash"] = case_hash + if eval_hash is not None: + entry["eval_hash"] = eval_hash + return entry + + +class TestClassifyChangesDeleted: + """Cases present in `prev` but absent from `curr` land in `deleted`.""" + + def test_single_deletion(self) -> None: + prev = {"case_a": _case(passed=True), "case_b": _case(passed=True)} + curr = {"case_a": _case(passed=True)} + changes = _classify_changes(curr, prev) + assert changes["deleted"] == ["case_b"] + assert changes["new"] == [] + assert changes["fixed"] == [] + assert changes["regressed"] == [] + assert changes["modified"] == [] + + def test_multiple_deletions_preserve_prev_order(self) -> None: + prev = { + "alpha": _case(), + "beta": _case(), + "gamma": _case(), + "delta": _case(), + } + curr = {"alpha": _case()} + changes = _classify_changes(curr, prev) + assert changes["deleted"] == ["beta", "gamma", "delta"] + + def test_deletion_coexists_with_other_changes(self) -> None: + prev = { + "to_delete": _case(passed=True), + "to_fix": _case(passed=False), + "stable": _case(passed=True), + } + curr = { + "to_fix": _case(passed=True), + "stable": _case(passed=True), + "brand_new": _case(passed=True), + } + changes = _classify_changes(curr, prev) + assert changes["deleted"] == ["to_delete"] + assert changes["fixed"] == ["to_fix"] + assert changes["new"] == ["brand_new"] + + def test_all_cases_deleted(self) -> None: + prev = {"a": _case(), "b": _case()} + curr: dict = {} + changes = _classify_changes(curr, prev) + assert changes["deleted"] == ["a", "b"] + assert changes["new"] == [] + + def test_no_deletions(self) -> None: + prev = {"a": _case()} + curr = {"a": _case(), "b": _case()} + changes = _classify_changes(curr, prev) + assert 
changes["deleted"] == [] + assert changes["new"] == ["b"] + + +class TestClassifyChangesExistingCategories: + """Existing categories keep working after adding `deleted`.""" + + def test_new_case(self) -> None: + changes = _classify_changes({"a": _case()}, {}) + assert changes["new"] == ["a"] + + def test_fixed_case(self) -> None: + prev = {"a": _case(passed=False)} + curr = {"a": _case(passed=True)} + assert _classify_changes(curr, prev)["fixed"] == ["a"] + + def test_regressed_case(self) -> None: + prev = {"a": _case(passed=True)} + curr = {"a": _case(passed=False)} + assert _classify_changes(curr, prev)["regressed"] == ["a"] + + def test_modified_case_hash(self) -> None: + prev = {"a": _case(case_hash="h1")} + curr = {"a": _case(case_hash="h2")} + assert _classify_changes(curr, prev)["modified"] == ["a (case modified)"] + + def test_modified_eval_hash(self) -> None: + prev = {"a": _case(eval_hash="h1")} + curr = {"a": _case(eval_hash="h2")} + assert _classify_changes(curr, prev)["modified"] == ["a (scoring modified)"] + + def test_no_changes(self) -> None: + prev = {"a": _case(passed=True)} + curr = {"a": _case(passed=True)} + changes = _classify_changes(curr, prev) + assert all(not v for v in changes.values()) + + +class TestClassifyChangesResultShape: + """Result dict always has the five expected keys.""" + + def test_empty_inputs_still_yield_five_buckets(self) -> None: + changes = _classify_changes({}, {}) + assert set(changes.keys()) == { + "fixed", + "regressed", + "modified", + "new", + "deleted", + } + assert all(v == [] for v in changes.values()) diff --git a/tests/test_history_cli.py b/tests/test_history_cli.py new file mode 100644 index 0000000..70e81b2 --- /dev/null +++ b/tests/test_history_cli.py @@ -0,0 +1,112 @@ +"""Tests for `protest history` CLI argument parsing. 
+ +Covers mutually-exclusive flag groups: +- Action: `--runs` / `--show` / `--compare` +- Kind: `--evals` / `--tests` + +`handle_history_command(argv)` triggers `SystemExit(2)` from argparse when a +mutex is violated. Tests assert both the exit code and the stderr message +mentioning the conflicting flag. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +from protest.cli.history import handle_history_command + +if TYPE_CHECKING: + from pathlib import Path + + +class TestActionMutex: + """`--runs`, `--show`, `--compare` cannot be combined.""" + + @pytest.mark.parametrize( + ("argv", "expected_flag"), + [ + (["--runs", "--compare"], "--compare"), + (["--compare", "--runs"], "--runs"), + (["--runs", "--show", "0"], "--show"), + (["--show", "0", "--runs"], "--runs"), + (["--show", "1", "--compare"], "--compare"), + (["--compare", "--show", "1"], "--show"), + ], + ) + def test_mutex_violation_exits_with_error( + self, + argv: list[str], + expected_flag: str, + capsys: pytest.CaptureFixture[str], + ) -> None: + with pytest.raises(SystemExit) as exc_info: + handle_history_command(argv) + assert exc_info.value.code == 2 + stderr = capsys.readouterr().err + assert "not allowed with argument" in stderr + assert expected_flag in stderr + + +class TestKindMutex: + """`--evals` and `--tests` cannot be combined.""" + + @pytest.mark.parametrize( + "argv", + [ + ["--evals", "--tests"], + ["--tests", "--evals"], + ], + ) + def test_mutex_violation_exits_with_error( + self, + argv: list[str], + capsys: pytest.CaptureFixture[str], + ) -> None: + with pytest.raises(SystemExit) as exc_info: + handle_history_command(argv) + assert exc_info.value.code == 2 + stderr = capsys.readouterr().err + assert "not allowed with argument" in stderr + + +class TestMutexIndependence: + """Flags from different groups can be combined freely.""" + + @pytest.mark.parametrize( + "action_flags", + [ + ["--runs"], + ["--compare"], + ["--show", "0"], + ], + ) + 
@pytest.mark.parametrize("kind_flag", ["--evals", "--tests"]) + def test_cross_group_combinations_parse_cleanly( + self, + action_flags: list[str], + kind_flag: str, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], + ) -> None: + argv = [*action_flags, kind_flag, "--path", str(tmp_path)] + with pytest.raises(SystemExit) as exc_info: + handle_history_command(argv) + assert exc_info.value.code == 0 + captured = capsys.readouterr() + assert "not allowed with argument" not in captured.err + + +class TestHelpShowsMutex: + """`--help` output surfaces both mutex groups in usage line.""" + + def test_help_output_shows_action_and_kind_groups( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + with pytest.raises(SystemExit) as exc_info: + handle_history_command(["--help"]) + assert exc_info.value.code == 0 + stdout = capsys.readouterr().out + assert "[--runs | --show [N] | --compare]" in stdout + assert "[--evals | --tests]" in stdout From 909ac7219eea73c4674b75b6ae5bb45024b326ad Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 24 Apr 2026 22:01:44 +0200 Subject: [PATCH 31/60] tests(evals): add tests for `EvalCaseResult.from_test_result` and refactor writer construction - Added comprehensive tests for `EvalCaseResult.from_test_result` to validate field mappings and defensive checks. - Refactored writer logic to use `EvalCaseResult.from_test_result`, simplifying redundant helper methods. 
--- protest/core/runner.py | 36 +------ protest/evals/results_writer.py | 32 +------ protest/evals/types.py | 43 ++++++++- tests/evals/test_eval_case_result.py | 135 +++++++++++++++++++++++++++ 4 files changed, 183 insertions(+), 63 deletions(-) create mode 100644 tests/evals/test_eval_case_result.py diff --git a/protest/core/runner.py b/protest/core/runner.py index f6bab5b..d25fb47 100644 --- a/protest/core/runner.py +++ b/protest/core/runner.py @@ -17,7 +17,7 @@ SessionSetupInfo, TestCounts, ) -from protest.evals.types import EvalCaseResult, EvalScore, EvalSuiteReport +from protest.evals.types import EvalCaseResult, EvalSuiteReport from protest.events.types import Event from protest.execution.capture import ( GlobalCapturePatch, @@ -76,7 +76,7 @@ def _collect_eval_result(self, result: TestResult) -> None: if not result.is_eval or result.eval_payload is None: return suite_name = result.suite_path.root_name if result.suite_path else "evals" - case_result = _build_eval_case_result(result) + case_result = EvalCaseResult.from_test_result(result) self._eval_results.setdefault(suite_name, []).append(case_result) async def _main_loop(self) -> bool: # noqa: PLR0915 @@ -204,35 +204,3 @@ async def _emit_eval_suite_end(self, suite_path: Any) -> None: duration=sum(c.duration for c in eval_cases), ) await self._session.events.emit(Event.EVAL_SUITE_END, report) - - -def _build_eval_case_result(result: TestResult) -> EvalCaseResult: - """Build EvalCaseResult from a TestResult with eval_payload.""" - payload = result.eval_payload - assert payload is not None - return EvalCaseResult( - case_name=payload.case_name or "", - node_id=result.node_id, - scores=tuple( - EvalScore( - name=name, - value=entry.value, - ) - for name, entry in payload.scores.items() - ), - duration=payload.task_duration, - passed=not (result.error is not None or not payload.passed), - inputs=payload.inputs, - output=payload.output, - expected_output=payload.expected_output, - case_hash=payload.case_hash, - 
eval_hash=payload.eval_hash, - task_input_tokens=payload.task_input_tokens, - task_output_tokens=payload.task_output_tokens, - task_cost=payload.task_cost, - judge_call_count=payload.judge_call_count, - judge_input_tokens=payload.judge_input_tokens, - judge_output_tokens=payload.judge_output_tokens, - judge_cost=payload.judge_cost, - is_error=result.is_fixture_error, - ) diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py index 67ca569..db64f0e 100644 --- a/protest/evals/results_writer.py +++ b/protest/evals/results_writer.py @@ -38,16 +38,16 @@ def activate(cls, ctx: PluginContext) -> EvalResultsWriter: return cls(history_dir=ctx.get("history_dir")) def on_test_pass(self, result: TestResult) -> None: - self._maybe_write(result, passed=True) + self._maybe_write(result) def on_test_fail(self, result: TestResult) -> None: - self._maybe_write(result, passed=False) + self._maybe_write(result) - def _maybe_write(self, result: TestResult, *, passed: bool) -> None: + def _maybe_write(self, result: TestResult) -> None: if not result.is_eval or result.eval_payload is None: return suite_name = result.suite_path.root_name if result.suite_path else "evals" - case_result = _build_case_result(result, passed) + case_result = EvalCaseResult.from_test_result(result) self._write_case_file(case_result, suite_name) def _write_case_file(self, case_result: EvalCaseResult, suite_name: str) -> None: @@ -65,30 +65,6 @@ def on_eval_suite_end(self, report: Any) -> None: print(f" Results: {run_dir}") -def _build_case_result(result: TestResult, passed: bool) -> EvalCaseResult: - """Build EvalCaseResult from a TestResult with eval_payload.""" - payload = result.eval_payload - assert payload is not None - return EvalCaseResult( - case_name=payload.case_name or "", - node_id=result.node_id, - scores=tuple( - EvalScore( - name=name, - value=entry.value, - ) - for name, entry in payload.scores.items() - ), - duration=payload.task_duration, - passed=passed, - 
inputs=payload.inputs, - output=payload.output, - expected_output=payload.expected_output, - case_hash=payload.case_hash, - eval_hash=payload.eval_hash, - ) - - # --------------------------------------------------------------------------- # File writing helpers # --------------------------------------------------------------------------- diff --git a/protest/evals/types.py b/protest/evals/types.py index 7a2c19a..e78e33c 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -4,7 +4,10 @@ import statistics from dataclasses import dataclass, field -from typing import Any, Generic, Protocol, TypeVar, runtime_checkable +from typing import TYPE_CHECKING, Any, Generic, Protocol, TypeVar, runtime_checkable + +if TYPE_CHECKING: + from protest.entities.events import TestResult T = TypeVar("T") @@ -173,6 +176,44 @@ class EvalCaseResult: judge_cost: float = 0.0 is_error: bool = False + @classmethod + def from_test_result(cls, result: TestResult) -> EvalCaseResult: + """Build from a `TestResult` carrying an `eval_payload`. + + `passed` is derived from `result.error` and `payload.passed`, so both + the runner (post-execution) and the results writer (pass/fail hooks) + agree on the same computation. 
+ """ + payload = result.eval_payload + if payload is None: + raise ValueError( + f"Cannot build EvalCaseResult from TestResult without " + f"eval_payload (node_id={result.node_id})" + ) + return cls( + case_name=payload.case_name or "", + node_id=result.node_id, + scores=tuple( + EvalScore(name=name, value=entry.value) + for name, entry in payload.scores.items() + ), + duration=payload.task_duration, + passed=result.error is None and payload.passed, + inputs=payload.inputs, + output=payload.output, + expected_output=payload.expected_output, + case_hash=payload.case_hash, + eval_hash=payload.eval_hash, + task_input_tokens=payload.task_input_tokens, + task_output_tokens=payload.task_output_tokens, + task_cost=payload.task_cost, + judge_call_count=payload.judge_call_count, + judge_input_tokens=payload.judge_input_tokens, + judge_output_tokens=payload.judge_output_tokens, + judge_cost=payload.judge_cost, + is_error=result.is_fixture_error, + ) + @property def numeric_scores(self) -> dict[str, float]: return {s.name: float(s.value) for s in self.scores if s.is_metric} diff --git a/tests/evals/test_eval_case_result.py b/tests/evals/test_eval_case_result.py new file mode 100644 index 0000000..06471eb --- /dev/null +++ b/tests/evals/test_eval_case_result.py @@ -0,0 +1,135 @@ +"""Tests for `EvalCaseResult.from_test_result`. + +This classmethod is the single constructor used by both the runner (post- +execution) and the results writer (pass/fail hooks). The test below pins the +full field mapping so that future additions to `EvalPayload` or `TestResult` +either update the classmethod or break the test. 
+""" + +from __future__ import annotations + +import pytest + +from protest.entities.events import EvalPayload, EvalScoreEntry, TestResult +from protest.evals.types import EvalCaseResult + + +def _make_payload(**overrides: object) -> EvalPayload: + defaults: dict[str, object] = { + "case_name": "case_one", + "passed": True, + "task_duration": 0.123, + "inputs": "in", + "output": "out", + "expected_output": "expected", + "scores": {"accuracy": EvalScoreEntry(value=0.9, passed=True)}, + "case_hash": "ch", + "eval_hash": "eh", + "task_input_tokens": 100, + "task_output_tokens": 200, + "task_cost": 0.01, + "judge_call_count": 1, + "judge_input_tokens": 50, + "judge_output_tokens": 30, + "judge_cost": 0.005, + } + defaults.update(overrides) + return EvalPayload(**defaults) # type: ignore[arg-type] + + +def _make_result( + *, + error: Exception | None = None, + is_fixture_error: bool = False, + payload: EvalPayload | None = None, + node_id: str = "suite::case_one", +) -> TestResult: + return TestResult( + name="case_one", + node_id=node_id, + error=error, + is_fixture_error=is_fixture_error, + is_eval=True, + eval_payload=payload or _make_payload(), + ) + + +class TestFromTestResultHappyPath: + """Full field mapping: all payload + result fields land in the result.""" + + def test_all_fields_copied(self) -> None: + result = _make_result() + case = EvalCaseResult.from_test_result(result) + assert case.case_name == "case_one" + assert case.node_id == "suite::case_one" + assert case.duration == pytest.approx(0.123) + assert case.passed is True + assert case.inputs == "in" + assert case.output == "out" + assert case.expected_output == "expected" + assert case.case_hash == "ch" + assert case.eval_hash == "eh" + assert case.is_error is False + + def test_scores_converted_from_entries(self) -> None: + case = EvalCaseResult.from_test_result(_make_result()) + assert len(case.scores) == 1 + assert case.scores[0].name == "accuracy" + assert case.scores[0].value == 0.9 + + def 
test_task_usage_copied(self) -> None: + """Regression: writer used to drop these fields silently.""" + case = EvalCaseResult.from_test_result(_make_result()) + assert case.task_input_tokens == 100 + assert case.task_output_tokens == 200 + assert case.task_cost == pytest.approx(0.01) + + def test_judge_usage_copied(self) -> None: + """Regression: writer used to drop these fields silently.""" + case = EvalCaseResult.from_test_result(_make_result()) + assert case.judge_call_count == 1 + assert case.judge_input_tokens == 50 + assert case.judge_output_tokens == 30 + assert case.judge_cost == pytest.approx(0.005) + + +class TestFromTestResultPassedDerivation: + """`passed` is derived, not passed in — the writer no longer gets it wrong.""" + + def test_passed_when_no_error_and_payload_passed(self) -> None: + result = _make_result(payload=_make_payload(passed=True)) + assert EvalCaseResult.from_test_result(result).passed is True + + def test_failed_when_payload_not_passed(self) -> None: + result = _make_result(payload=_make_payload(passed=False)) + assert EvalCaseResult.from_test_result(result).passed is False + + def test_failed_when_error_present(self) -> None: + result = _make_result( + error=RuntimeError("boom"), + payload=_make_payload(passed=True), + ) + assert EvalCaseResult.from_test_result(result).passed is False + + def test_is_error_reflects_fixture_error(self) -> None: + result = _make_result( + error=RuntimeError("fx"), + is_fixture_error=True, + ) + case = EvalCaseResult.from_test_result(result) + assert case.is_error is True + assert case.passed is False + + +class TestFromTestResultErrors: + """Defensive: classmethod refuses a TestResult without eval_payload.""" + + def test_missing_payload_raises(self) -> None: + result = TestResult( + name="n", + node_id="x", + is_eval=False, + eval_payload=None, + ) + with pytest.raises(ValueError, match="eval_payload"): + EvalCaseResult.from_test_result(result) From fee2bf6a332d95d1af6155ad9fdc75dbea5f62b1 Mon Sep 17 
00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 24 Apr 2026 23:14:46 +0200 Subject: [PATCH 32/60] tests(evals): add tests for `EvalCase.metadata['tags']` wiring and enhance tag propagation logic - Added tests to verify that `EvalCase.metadata['tags']` are merged into `TestItem.tags`. - Updated `Collector` to propagate tags from `EvalCase.metadata` into `TestItem` during collection. - Verified end-to-end integration with `TagFilterPlugin` for tag-based filtering functionality. --- protest/core/collector.py | 10 ++- tests/evals/test_evalcase_tags_wiring.py | 96 ++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 tests/evals/test_evalcase_tags_wiring.py diff --git a/protest/core/collector.py b/protest/core/collector.py index d7c83db..72743e1 100644 --- a/protest/core/collector.py +++ b/protest/core/collector.py @@ -9,6 +9,7 @@ from protest.di.markers import Use from protest.di.validation import _extract_from_params from protest.entities import FixtureCallable, SuitePath, TestItem, TestRegistration +from protest.evals.evaluator import EvalCase if TYPE_CHECKING: from collections.abc import Callable @@ -176,11 +177,18 @@ def _expand_registration( sources[index].get_id(value) for index, value in enumerate(combination) ] + item_tags = tags.copy() + for value in combination: + if isinstance(value, EvalCase): + case_tags = value.metadata.get("tags") + if case_tags: + item_tags.update(case_tags) + items.append( TestItem( func=test_reg.func, suite=suite, - tags=tags.copy(), + tags=item_tags, case_kwargs=case_kwargs, case_ids=case_ids, skip=test_reg.skip, diff --git a/tests/evals/test_evalcase_tags_wiring.py b/tests/evals/test_evalcase_tags_wiring.py new file mode 100644 index 0000000..dbf9649 --- /dev/null +++ b/tests/evals/test_evalcase_tags_wiring.py @@ -0,0 +1,96 @@ +"""Tests for `EvalCase.metadata['tags']` → `TestItem.tags` wiring. 
+ +Verifies that tags declared on an `EvalCase` via `metadata={'tags': [...]}` +are merged into the resulting `TestItem.tags` set, so that the +`TagFilterPlugin` (which filters on `TestItem.tags`) can honor them. + +Eval functions are defined at module level to avoid `get_type_hints()` +resolution issues that occur with nested function definitions. +""" + +from __future__ import annotations + +from typing import Annotated + +from protest import ForEach, From, ProTestSession +from protest.core.collector import Collector +from protest.evals import EvalCase +from protest.evals.suite import EvalSuite +from protest.tags.plugin import TagFilterPlugin + +# Module-level case sources so `get_type_hints()` can resolve Annotated args. +_single_tagged = [EvalCase(inputs="x", name="c1", metadata={"tags": ["safety"]})] +_multi_tagged = [ + EvalCase(inputs="x", name="c1", metadata={"tags": ["safety", "factual"]}) +] +_mixed_cases = [ + EvalCase(inputs="x", name="c1", metadata={"tags": ["safety"]}), + EvalCase(inputs="y", name="c2", metadata={"tags": ["factual"]}), + EvalCase(inputs="z", name="c3"), +] +_no_tags_metadata = [ + EvalCase(inputs="x", name="c1", metadata={"other": "value"}), +] +_filter_cases = [ + EvalCase(inputs="a", name="c_safety", metadata={"tags": ["safety"]}), + EvalCase(inputs="b", name="c_factual", metadata={"tags": ["factual"]}), +] + + +def _collect(cases: list[EvalCase]) -> list: + """Build a session with a parametrized eval over `cases` and collect items.""" + session = ProTestSession() + suite = EvalSuite("evals") + + source = ForEach(cases) + + @suite.eval() + def my_eval(case: Annotated[EvalCase, From(source)]) -> str: + return str(case.inputs) + + _ = my_eval # silence unused-var diagnostics; decorator registers it + session.add_suite(suite) + return Collector().collect(session) + + +class TestCaseTagsMergedIntoItemTags: + def test_single_case_tag_becomes_item_tag(self) -> None: + items = _collect(_single_tagged) + assert len(items) == 1 + assert 
"safety" in items[0].tags + + def test_multiple_case_tags(self) -> None: + items = _collect(_multi_tagged) + assert items[0].tags >= {"safety", "factual"} + + def test_cases_get_distinct_tags(self) -> None: + items = _collect(_mixed_cases) + assert len(items) == 3 + by_name = {item.case_ids[0]: item for item in items} + assert "safety" in by_name["c1"].tags + assert "factual" not in by_name["c1"].tags + assert "factual" in by_name["c2"].tags + assert "safety" not in by_name["c2"].tags + assert by_name["c3"].tags == set() + + def test_case_without_tags_metadata_ok(self) -> None: + items = _collect(_no_tags_metadata) + assert items[0].tags == set() + + +class TestTagFilterHonorsCaseTags: + """End-to-end: `TagFilterPlugin` filters items based on case tags.""" + + def test_include_tag_keeps_matching_cases(self) -> None: + items = _collect(_filter_cases) + plugin = TagFilterPlugin(include_tags={"safety"}) + filtered = plugin.on_collection_finish(items) + assert len(filtered) == 1 + assert filtered[0].case_ids == ["c_safety"] + + def test_exclude_tag_drops_matching_cases(self) -> None: + items = _collect(_filter_cases) + plugin = TagFilterPlugin(exclude_tags={"safety"}) + filtered = plugin.on_collection_finish(items) + assert len(filtered) == 1 + assert filtered[0].case_ids == ["c_factual"] From 46c54d377f598f843d49d21f746e27699391c415 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 24 Apr 2026 23:50:18 +0200 Subject: [PATCH 33/60] tests(history): add concurrency tests for `append_entry` and implement cross-platform file locking - Added tests to ensure `append_entry` supports concurrent writes without line corruption. - Implemented cross-platform file locking: `fcntl.flock` on POSIX and `msvcrt.locking` on Windows using a sibling `.lock` file. - Ensured single-writer and concurrency invariants for parseable JSON lines in history files. 
--- protest/history/storage.py | 63 +++++++++++-- tests/history/__init__.py | 0 .../history/test_append_entry_concurrency.py | 90 +++++++++++++++++++ 3 files changed, 146 insertions(+), 7 deletions(-) create mode 100644 tests/history/__init__.py create mode 100644 tests/history/test_append_entry_concurrency.py diff --git a/protest/history/storage.py b/protest/history/storage.py index 5dbe047..829b65e 100644 --- a/protest/history/storage.py +++ b/protest/history/storage.py @@ -2,10 +2,57 @@ from __future__ import annotations +import contextlib import json import subprocess +import sys from pathlib import Path -from typing import Any +from typing import IO, TYPE_CHECKING, Any + +if TYPE_CHECKING: + from collections.abc import Iterator + +if sys.platform == "win32": + import msvcrt + + @contextlib.contextmanager + def _exclusive_file_lock(f: IO[Any]) -> Iterator[None]: + """Hold an exclusive advisory lock on `f` for the block's duration. + + Windows `msvcrt.locking` cannot lock regions beyond EOF, so we lock + a sibling `.lock` file that we ensure always has 1 byte. All + writers cooperate on this sibling, so concurrent appends to the + main file are serialized. + """ + lock_path = Path(f"{f.name}.lock") + with open(lock_path, "a+b") as lf: + lf.seek(0, 2) + if lf.tell() == 0: + lf.write(b"\0") + lf.flush() + lf.seek(0) + msvcrt.locking(lf.fileno(), msvcrt.LK_LOCK, 1) + try: + yield + finally: + lf.seek(0) + msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1) +else: + import fcntl + + @contextlib.contextmanager + def _exclusive_file_lock(f: IO[Any]) -> Iterator[None]: + """Hold an exclusive advisory lock on `f` for the block's duration. + + POSIX `fcntl.flock` locks the file descriptor directly; cross-process + callers opening the same path will block until the lock is released. 
+ """ + fcntl.flock(f.fileno(), fcntl.LOCK_EX) + try: + yield + finally: + fcntl.flock(f.fileno(), fcntl.LOCK_UN) + DEFAULT_HISTORY_DIR = Path(".protest") HISTORY_FILE = "history.jsonl" @@ -64,14 +111,16 @@ def _has_suite_kind(entry: dict[str, Any], kind: str) -> bool: def append_entry(path: Path, entry: dict[str, Any]) -> None: """Append a single JSON entry to a JSONL file. - Note: no file locking — concurrent writes from separate processes - could corrupt the file. In practice, protest runs are single-process - (async workers share the same process). If concurrent CI jobs write - to the same history file, consider using separate history_dir per job. + Serializes concurrent writes from separate processes sharing the same + history file (e.g. a CI matrix) via an exclusive advisory lock: + `fcntl.flock` on POSIX, `msvcrt.locking` on a sibling `.lock` + file on Windows. """ path.parent.mkdir(parents=True, exist_ok=True) - with open(path, "a") as f: - f.write(json.dumps(entry, default=str) + "\n") + line = json.dumps(entry, default=str) + "\n" + with open(path, "a") as f, _exclusive_file_lock(f): + f.write(line) + f.flush() def load_previous_run( diff --git a/tests/history/__init__.py b/tests/history/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/history/test_append_entry_concurrency.py b/tests/history/test_append_entry_concurrency.py new file mode 100644 index 0000000..5bd3d79 --- /dev/null +++ b/tests/history/test_append_entry_concurrency.py @@ -0,0 +1,90 @@ +"""Tests for `append_entry` — concurrent writer safety. + +Covers the basic invariant (one entry = one parseable line) and the +multiprocess-concurrency case: N workers append concurrently to the same +file; every line must be parseable JSON. Without locking, interleaved +writes larger than `PIPE_BUF` would corrupt lines and the test would fail. 
+""" + +from __future__ import annotations + +import json +import multiprocessing as mp +from pathlib import Path + +from protest.history.storage import append_entry + + +def _worker_append(args: tuple[str, int, int]) -> None: + """Child-process entry: append `count` entries, each padded to ~5 KB. + + The padding pushes the write past PIPE_BUF (4 KB) so that without a + lock the POSIX O_APPEND atomicity guarantee no longer applies. + """ + path_str, worker_id, count = args + path = Path(path_str) + padding = "x" * 5000 + for i in range(count): + append_entry(path, {"worker": worker_id, "i": i, "pad": padding}) + + +class TestAppendEntryBasic: + """Single-writer invariants.""" + + def test_creates_parent_dir(self, tmp_path: Path) -> None: + target = tmp_path / "nested" / "history.jsonl" + append_entry(target, {"k": "v"}) + assert target.exists() + assert target.parent.is_dir() + + def test_appends_one_line_per_call(self, tmp_path: Path) -> None: + path = tmp_path / "history.jsonl" + append_entry(path, {"a": 1}) + append_entry(path, {"b": 2}) + lines = path.read_text().splitlines() + assert len(lines) == 2 + assert json.loads(lines[0]) == {"a": 1} + assert json.loads(lines[1]) == {"b": 2} + + def test_default_str_serializes_non_json_types(self, tmp_path: Path) -> None: + """`json.dumps(..., default=str)` handles non-serializable values.""" + path = tmp_path / "history.jsonl" + + class Marker: + def __str__(self) -> str: + return "marker-str" + + append_entry(path, {"obj": Marker()}) + (line,) = path.read_text().splitlines() + assert json.loads(line) == {"obj": "marker-str"} + + +class TestAppendEntryConcurrency: + """Multi-process concurrent appends produce N parseable lines.""" + + def test_concurrent_writers_do_not_interleave(self, tmp_path: Path) -> None: + path = tmp_path / "history.jsonl" + workers = 8 + per_worker = 5 + total = workers * per_worker + + ctx = mp.get_context("spawn") + with ctx.Pool(workers) as pool: + pool.map( + _worker_append, + [(str(path), 
wid, per_worker) for wid in range(workers)], + ) + + lines = path.read_text().splitlines() + assert len(lines) == total, ( + f"expected {total} lines, got {len(lines)} — some writes were lost" + ) + + counts_per_worker: dict[int, int] = {} + for raw in lines: + entry = json.loads(raw) # raises JSONDecodeError on interleaved bytes + counts_per_worker[entry["worker"]] = ( + counts_per_worker.get(entry["worker"], 0) + 1 + ) + + assert counts_per_worker == dict.fromkeys(range(workers), per_worker) From f2909b2220fb45c3082b94e397f3443461b8f7d2 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 00:02:41 +0200 Subject: [PATCH 34/60] tests(history): add isolation tests for `DEFAULT_HISTORY_DIR` and override behaviors - Added regression tests to ensure `_isolate_protest_history` fixture correctly overrides `DEFAULT_HISTORY_DIR` with a per-test temp directory. - Verified that `HistoryPlugin` respects explicit `history_dir` values while defaulting to the overridden directory. - Updated `conftest.py` with autouse fixture to prevent test pollution of real `.protest/history.jsonl`. 
--- protest/history/plugin.py | 4 +- tests/conftest.py | 17 ++++++++ tests/history/test_history_dir_isolation.py | 45 +++++++++++++++++++++ 3 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 tests/history/test_history_dir_isolation.py diff --git a/protest/history/plugin.py b/protest/history/plugin.py index 92b7942..bf19f0a 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -8,9 +8,9 @@ from protest.entities import SuiteKind from protest.evals.suite import EvalSuite +from protest.history import storage from protest.history.collector import collect_env_info, collect_git_info from protest.history.storage import ( - DEFAULT_HISTORY_DIR, HISTORY_FILE, append_entry, load_previous_run, @@ -38,7 +38,7 @@ class HistoryPlugin(PluginBase): description = "Run history tracking" def __init__(self, history_dir: Path | None = None) -> None: - self._history_dir = history_dir or DEFAULT_HISTORY_DIR + self._history_dir = history_dir or storage.DEFAULT_HISTORY_DIR self._history_file = self._history_dir / HISTORY_FILE # Test data self._test_suites: dict[str, dict[str, dict[str, Any]]] = {} diff --git a/tests/conftest.py b/tests/conftest.py index 5e14ed2..a40d851 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,11 +13,28 @@ TestItem, TestResult, ) +from protest.history import storage as history_storage from protest.plugin import PluginBase from tests.factories.test_items import make_test_item if TYPE_CHECKING: from collections.abc import Callable + from pathlib import Path + + +@pytest.fixture(autouse=True) +def _isolate_protest_history(tmp_path: "Path", monkeypatch: pytest.MonkeyPatch) -> None: + """Redirect `DEFAULT_HISTORY_DIR` to a per-test temp dir. + + Tests that forget to pass `history_dir=tmp_path` would otherwise write + into the repo's real `.protest/history.jsonl`. 
The monkeypatch targets + the single source of truth (`storage.DEFAULT_HISTORY_DIR`) — all + consumers access it via the module so the override is seen everywhere. + + Tests that pass an explicit `history_dir` still use that value, because + the plugin does `history_dir or storage.DEFAULT_HISTORY_DIR`. + """ + monkeypatch.setattr(history_storage, "DEFAULT_HISTORY_DIR", tmp_path / ".protest") @pytest.fixture diff --git a/tests/history/test_history_dir_isolation.py b/tests/history/test_history_dir_isolation.py new file mode 100644 index 0000000..26946ac --- /dev/null +++ b/tests/history/test_history_dir_isolation.py @@ -0,0 +1,45 @@ +"""Regression tests for B2: tests must not pollute the repo's history file. + +The autouse `_isolate_protest_history` fixture in `tests/conftest.py` +monkeypatches `storage.DEFAULT_HISTORY_DIR` to a per-test temp directory. +These tests assert that both the storage functions and the HistoryPlugin +pick up the override — any regression in the plumbing would let runs leak +into `.protest/history.jsonl` in the real project cwd. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from protest.history import storage +from protest.history.plugin import HistoryPlugin + +if TYPE_CHECKING: + from pathlib import Path + + +class TestDefaultHistoryDirOverride: + """The autouse fixture redirects the module-level constant.""" + + def test_storage_default_points_to_tmp(self, tmp_path: Path) -> None: + assert tmp_path / ".protest" == storage.DEFAULT_HISTORY_DIR + + def test_append_entry_uses_override(self, tmp_path: Path) -> None: + target = storage.DEFAULT_HISTORY_DIR / storage.HISTORY_FILE + storage.append_entry(target, {"k": "v"}) + assert target.exists() + assert target.is_relative_to(tmp_path) + + def test_plugin_default_dir_follows_override(self, tmp_path: Path) -> None: + plugin = HistoryPlugin() + assert plugin._history_dir == tmp_path / ".protest" + assert plugin._history_file.is_relative_to(tmp_path) + + +class TestExplicitHistoryDirWins: + """Explicit `history_dir=` still takes precedence over the override.""" + + def test_plugin_honors_explicit_dir(self, tmp_path: Path) -> None: + explicit = tmp_path / "custom" + plugin = HistoryPlugin(history_dir=explicit) + assert plugin._history_dir == explicit From acdacfdb42ea1dabddf36558fc5c74e1f1cc9779 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 00:18:50 +0200 Subject: [PATCH 35/60] tests(execution): add tests for `real_stdout` / `real_stderr` and replace sys stream duck-typing - Added unit tests for `real_stdout` and `real_stderr` to ensure proper unwrapping of `TaskAwareStream` and correct fallback to original streams. - Replaced `getattr(sys.stdout, "_original", ...)` duck-typing with typed accessors across multiple modules for better maintainability and robustness. - Updated console, reporters, and fallback print logic to utilize the new accessors, ensuring consistent bypass of per-test capture layers. 
--- protest/console.py | 6 ++-- protest/execution/capture.py | 19 ++++++++++++ protest/reporting/ascii.py | 6 ++-- protest/reporting/rich_reporter.py | 6 ++-- tests/execution/test_real_streams.py | 46 ++++++++++++++++++++++++++++ 5 files changed, 73 insertions(+), 10 deletions(-) create mode 100644 tests/execution/test_real_streams.py diff --git a/protest/console.py b/protest/console.py index 9959165..30ee49b 100644 --- a/protest/console.py +++ b/protest/console.py @@ -21,10 +21,9 @@ async def pipeline(): import contextlib import re -import sys from protest.events.types import Event -from protest.execution.capture import get_event_bus +from protest.execution.capture import get_event_bus, real_stderr def print(msg: str, *, raw: bool = False) -> None: @@ -52,8 +51,7 @@ def print(msg: str, *, raw: bool = False) -> None: def _fallback_print(msg: str, raw: bool) -> None: """Fallback when no event bus — write to real stderr (bypassing capture).""" text = msg if raw else strip_markup(msg) - # sys.stderr may be wrapped by TaskAwareStream — get the original - stream = getattr(sys.stderr, "_original", sys.stderr) + stream = real_stderr() stream.write(text + "\n") stream.flush() diff --git a/protest/execution/capture.py b/protest/execution/capture.py index 2e258a7..c5f54c0 100644 --- a/protest/execution/capture.py +++ b/protest/execution/capture.py @@ -148,6 +148,25 @@ def __getattr__(self, name: str) -> object: return getattr(self._original, name) +def real_stdout() -> TextIO: + """Return the real process stdout, bypassing any active capture wrapper. + + When a run is under capture, `sys.stdout` is a `TaskAwareStream` routing + writes into per-test buffers; reporters need to bypass that buffering to + write their own output (progress, summary) directly to the terminal. 
+ """ + if isinstance(sys.stdout, TaskAwareStream): + return sys.stdout._original + return sys.stdout + + +def real_stderr() -> TextIO: + """Return the real process stderr, bypassing any active capture wrapper.""" + if isinstance(sys.stderr, TaskAwareStream): + return sys.stderr._original + return sys.stderr + + class TaskAwareLogHandler(logging.Handler): def emit(self, record: LogRecord) -> None: records = _log_records.get() diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index 64470b8..018bedf 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -1,5 +1,4 @@ import logging -import sys import traceback from pathlib import Path from typing import Any @@ -23,6 +22,7 @@ TestTeardownInfo, ) from protest.evals.types import EvalSuiteReport +from protest.execution.capture import real_stdout from protest.plugin import PluginBase, PluginContext from protest.reporting.format import ( format_duration as _format_duration, @@ -200,7 +200,7 @@ def on_test_teardown_start(self, info: TestTeardownInfo) -> None: @staticmethod def _print_bypass(msg: str) -> None: - stream = getattr(sys.stdout, "_original", sys.stdout) + stream = real_stdout() stream.write(msg + "\n") stream.flush() @@ -320,7 +320,7 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: def on_user_print(self, data: Any) -> None: msg, raw = data text = msg if raw else strip_markup(msg) - stream = getattr(sys.stdout, "_original", sys.stdout) + stream = real_stdout() stream.write(f" | {text}\n") stream.flush() diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 5794457..bf93406 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -1,5 +1,4 @@ import logging -import sys import traceback from argparse import ArgumentParser from pathlib import Path @@ -25,6 +24,7 @@ TestTeardownInfo, ) from protest.evals.types import EvalSuiteReport +from protest.execution.capture import 
real_stdout from protest.plugin import PluginBase, PluginContext from protest.reporting.format import ( format_duration as _format_duration, @@ -152,7 +152,7 @@ def _maybe_show_logs(self, result: TestResult) -> None: def _print_bypass(self, message: str) -> None: """Print bypassing capture (for lifecycle messages emitted during tests).""" - stream = getattr(sys.stdout, "_original", sys.stdout) + stream = real_stdout() Console(file=stream, highlight=False).print(message) def on_collection_finish(self, items: list[TestItem]) -> list[TestItem]: @@ -377,7 +377,7 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: def on_user_print(self, data: Any) -> None: msg, raw = data # Write to the real stdout, bypassing capture - stream = getattr(sys.stdout, "_original", sys.stdout) + stream = real_stdout() c = Console(file=stream, highlight=False) if raw: c.print(msg, markup=False) diff --git a/tests/execution/test_real_streams.py b/tests/execution/test_real_streams.py new file mode 100644 index 0000000..c7b53de --- /dev/null +++ b/tests/execution/test_real_streams.py @@ -0,0 +1,46 @@ +"""Tests for `real_stdout()` / `real_stderr()`. + +These accessors replace the previous `getattr(sys.stdout, "_original", ...)` +duck-typing. They give reporters a typed way to bypass the per-test capture +wrapper, so renaming or removing the private attribute won't silently break +reporter output. 
+""" + +from __future__ import annotations + +import io +import sys + +from protest.execution.capture import ( + TaskAwareStream, + real_stderr, + real_stdout, +) + + +class TestRealStdoutUnwrapsTaskAwareStream: + def test_returns_stdout_when_not_wrapped(self) -> None: + assert real_stdout() is sys.stdout + + def test_unwraps_wrapped_stream(self) -> None: + buffer = io.StringIO() + wrapper = TaskAwareStream(buffer) + sys.stdout = wrapper # type: ignore[assignment] + try: + assert real_stdout() is buffer + finally: + sys.stdout = sys.__stdout__ + + +class TestRealStderrUnwrapsTaskAwareStream: + def test_returns_stderr_when_not_wrapped(self) -> None: + assert real_stderr() is sys.stderr + + def test_unwraps_wrapped_stream(self) -> None: + buffer = io.StringIO() + wrapper = TaskAwareStream(buffer) + sys.stderr = wrapper # type: ignore[assignment] + try: + assert real_stderr() is buffer + finally: + sys.stderr = sys.__stderr__ From 715857eaddc68c315d5ddbd3efc7ec02613c70a9 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 01:07:03 +0200 Subject: [PATCH 36/60] refactor(console, capture): improve type annotations and clarify event bus usage - Added `EventBus` type annotations for `_event_bus_ref` and related methods to improve clarity and type safety. - Updated comments in `console.print` to explain the necessity of private access to `bus._handlers` and its rationale. - Added `TYPE_CHECKING` imports to minimize runtime overhead while maintaining forward references. --- protest/console.py | 10 +++++++--- protest/execution/capture.py | 19 ++++++++++++------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/protest/console.py b/protest/console.py index 30ee49b..2d31607 100644 --- a/protest/console.py +++ b/protest/console.py @@ -41,9 +41,13 @@ def print(msg: str, *, raw: bool = False) -> None: _fallback_print(msg, raw) return - # Call handlers directly (sync, bypasses async emit). 
- # This ensures messages appear immediately, not after the test. - for handler_entry in bus._handlers.get(Event.USER_PRINT, []): # type: ignore[attr-defined] + # Intentional private access to `bus._handlers`: we need sync dispatch + # so messages appear immediately (not after the test). An earlier public + # `EventBus.emit_sync` was removed (commit e14ffd5) because its signal- + # handler use case was async-signal-unsafe, and we don't want to offer + # that API to users. Kept private here — the framework itself is the + # only caller, and console.print is never invoked from a signal handler. + for handler_entry in bus._handlers.get(Event.USER_PRINT, []): with contextlib.suppress(Exception): handler_entry.func((msg, raw)) diff --git a/protest/execution/capture.py b/protest/execution/capture.py index c5f54c0..584dbf3 100644 --- a/protest/execution/capture.py +++ b/protest/execution/capture.py @@ -1,14 +1,19 @@ +from __future__ import annotations + import io import logging import sys -from collections.abc import Callable from contextlib import suppress from contextvars import ContextVar, Token from dataclasses import dataclass from logging import LogRecord -from typing import TextIO +from typing import TYPE_CHECKING, TextIO + +if TYPE_CHECKING: + from collections.abc import Callable -from protest.compat import Self + from protest.compat import Self + from protest.events.bus import EventBus _capture_buffer: ContextVar[io.StringIO | None] = ContextVar( "capture_buffer", default=None @@ -19,7 +24,7 @@ ) _current_node_id: ContextVar[str | None] = ContextVar("current_node_id", default=None) -_event_bus_ref: ContextVar[object | None] = ContextVar("event_bus_ref", default=None) +_event_bus_ref: ContextVar[EventBus | None] = ContextVar("event_bus_ref", default=None) @dataclass(slots=True) @@ -101,17 +106,17 @@ def get_session_teardown_output() -> str: return _session_teardown.buffer.getvalue() if _session_teardown.buffer else "" -def set_event_bus(bus: object) -> 
Token[object | None]: +def set_event_bus(bus: EventBus) -> Token[EventBus | None]: """Set event bus reference for console.print() access.""" return _event_bus_ref.set(bus) -def reset_event_bus(token: Token[object | None]) -> None: +def reset_event_bus(token: Token[EventBus | None]) -> None: """Reset event bus reference.""" _event_bus_ref.reset(token) -def get_event_bus() -> object | None: +def get_event_bus() -> EventBus | None: """Get current event bus (for console.print).""" return _event_bus_ref.get() From 594bb547c0435a2a7f7964bdbde2c603bff2ab44 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 09:11:11 +0200 Subject: [PATCH 37/60] feat(history): version JSONL entries via `schema_version` with skip+warn on future versions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `SCHEMA_VERSION = 1` constant in `storage`; `HistoryPlugin` stamps it on every new entry. - Readers (`load_history`, `load_previous_run`) skip entries whose `schema_version` exceeds the current value, with a one-time warning per version (deduplicated via a module-level set). - Legacy entries (no `schema_version` key) treated as version 0 and read normally — zero migration needed. - Add `tests/history/test_schema_version.py` covering writes, future-version skipping, warn-once behavior, and legacy compat. 
--- protest/history/plugin.py | 2 + protest/history/storage.py | 34 +++++++++ tests/history/test_schema_version.py | 109 +++++++++++++++++++++++++++ 3 files changed, 145 insertions(+) create mode 100644 tests/history/test_schema_version.py diff --git a/protest/history/plugin.py b/protest/history/plugin.py index bf19f0a..ca738a3 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -12,6 +12,7 @@ from protest.history.collector import collect_env_info, collect_git_info from protest.history.storage import ( HISTORY_FILE, + SCHEMA_VERSION, append_entry, load_previous_run, ) @@ -194,6 +195,7 @@ def on_session_end(self, result: Any) -> None: } entry: dict[str, Any] = { + "schema_version": SCHEMA_VERSION, "run_id": str(uuid.uuid4()), "timestamp": datetime.now(tz=timezone.utc).isoformat(), "git": collect_git_info(), diff --git a/protest/history/storage.py b/protest/history/storage.py index 829b65e..8f89fa4 100644 --- a/protest/history/storage.py +++ b/protest/history/storage.py @@ -6,6 +6,7 @@ import json import subprocess import sys +import warnings from pathlib import Path from typing import IO, TYPE_CHECKING, Any @@ -57,6 +58,35 @@ def _exclusive_file_lock(f: IO[Any]) -> Iterator[None]: DEFAULT_HISTORY_DIR = Path(".protest") HISTORY_FILE = "history.jsonl" +# JSONL entry schema version. Bump when the on-disk shape changes in a way +# that older readers can't transparently handle (new required fields, +# restructured nesting). Entries written before this was introduced have no +# `schema_version` key and are treated as version 0 (legacy — best-effort). +SCHEMA_VERSION = 1 + +_warned_future_versions: set[int] = set() + + +def _is_future_schema(entry: dict[str, Any]) -> bool: + """Return True if the entry was written by a newer protest version. + + Entries with `schema_version > SCHEMA_VERSION` are skipped by readers, + with a one-time warning per version (avoids N warnings for N such + entries). 
+ """ + version = entry.get("schema_version", 0) + if not isinstance(version, int) or version <= SCHEMA_VERSION: + return False + if version not in _warned_future_versions: + _warned_future_versions.add(version) + warnings.warn( + f"history.jsonl contains entries with schema_version={version}, " + f"but this protest supports up to {SCHEMA_VERSION}. " + f"Those entries will be skipped. Upgrade protest to read them.", + stacklevel=3, + ) + return True + def load_history( history_dir: Path | None = None, @@ -77,6 +107,8 @@ def load_history( entry = json.loads(line) except json.JSONDecodeError: continue + if _is_future_schema(entry): + continue if evals_only and not _has_suite_kind(entry, "eval"): continue if tests_only and not _has_suite_kind(entry, "test"): @@ -137,6 +169,8 @@ def load_previous_run( entry = json.loads(line) except json.JSONDecodeError: continue + if _is_future_schema(entry): + continue if evals_only and entry.get("evals") is None: continue return dict(entry) diff --git a/tests/history/test_schema_version.py b/tests/history/test_schema_version.py new file mode 100644 index 0000000..b4a0724 --- /dev/null +++ b/tests/history/test_schema_version.py @@ -0,0 +1,109 @@ +"""Tests for `schema_version` on history JSONL entries. + +The plugin stamps every new entry with `schema_version`. Readers skip +entries with a future version (written by a newer protest) and warn once +per version. + +Legacy entries (no `schema_version` key at all — written before this was +introduced) are treated as version 0 and read without warning. 
+""" + +from __future__ import annotations + +import json +import warnings +from typing import TYPE_CHECKING + +from protest.history import storage +from protest.history.plugin import HistoryPlugin + +if TYPE_CHECKING: + from pathlib import Path + + +def _write_jsonl(path: Path, entries: list[dict]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("\n".join(json.dumps(e) for e in entries) + "\n") + + +class TestSchemaVersionWrites: + def test_append_entry_writes_schema_version_via_plugin(self) -> None: + """HistoryPlugin stamps `schema_version` on every new entry.""" + plugin = HistoryPlugin() + assert storage.SCHEMA_VERSION >= 1 + + entry_with_version = {"schema_version": storage.SCHEMA_VERSION, "k": "v"} + storage.append_entry(plugin._history_file, entry_with_version) + loaded = json.loads(plugin._history_file.read_text().splitlines()[0]) + assert loaded["schema_version"] == storage.SCHEMA_VERSION + + +class TestFutureVersionSkipped: + def test_future_version_is_skipped_by_load_history(self, tmp_path: Path) -> None: + path = tmp_path / ".protest" / storage.HISTORY_FILE + _write_jsonl( + path, + [ + {"schema_version": storage.SCHEMA_VERSION, "run_id": "current"}, + {"schema_version": storage.SCHEMA_VERSION + 10, "run_id": "future"}, + ], + ) + storage._warned_future_versions.clear() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + entries = storage.load_history(history_dir=tmp_path / ".protest") + run_ids = [e["run_id"] for e in entries] + assert run_ids == ["current"] + + def test_future_version_is_skipped_by_load_previous_run( + self, tmp_path: Path + ) -> None: + path = tmp_path / ".protest" / storage.HISTORY_FILE + _write_jsonl( + path, + [ + {"schema_version": storage.SCHEMA_VERSION, "run_id": "older"}, + {"schema_version": storage.SCHEMA_VERSION + 1, "run_id": "newer"}, + ], + ) + storage._warned_future_versions.clear() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + entry = 
storage.load_previous_run(history_dir=tmp_path / ".protest") + assert entry is not None + assert entry["run_id"] == "older" + + def test_warning_raised_once_per_future_version(self, tmp_path: Path) -> None: + path = tmp_path / ".protest" / storage.HISTORY_FILE + future = storage.SCHEMA_VERSION + 42 + _write_jsonl( + path, + [{"schema_version": future, "run_id": str(i)} for i in range(5)], + ) + storage._warned_future_versions.clear() + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + storage.load_history(history_dir=tmp_path / ".protest") + future_warnings = [ + w for w in caught if f"schema_version={future}" in str(w.message) + ] + assert len(future_warnings) == 1 + + +class TestLegacyEntriesStillReadable: + """Pre-schema_version entries have no key — treat as legacy (version 0).""" + + def test_entry_without_schema_version_is_read(self, tmp_path: Path) -> None: + path = tmp_path / ".protest" / storage.HISTORY_FILE + _write_jsonl(path, [{"run_id": "legacy", "suites": {}}]) + storage._warned_future_versions.clear() + entries = storage.load_history(history_dir=tmp_path / ".protest") + assert len(entries) == 1 + assert entries[0]["run_id"] == "legacy" + + def test_entry_with_version_zero_is_read(self, tmp_path: Path) -> None: + path = tmp_path / ".protest" / storage.HISTORY_FILE + _write_jsonl(path, [{"schema_version": 0, "run_id": "v0"}]) + storage._warned_future_versions.clear() + entries = storage.load_history(history_dir=tmp_path / ".protest") + assert len(entries) == 1 From 4276e5d123ebdbef5abf0d4f87eedb2e14b68bc3 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 09:19:11 +0200 Subject: [PATCH 38/60] fix(evals): use `statistics.quantiles` for true p5/p95 in `ScoreStats` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces naive `int(n * 0.05)` index lookup that collapsed p5/p95 to min/max for small samples (the 
typical eval case: n=10 returned sv[0]/sv[9]). Now uses `statistics.quantiles(n=20, method='inclusive')` which interpolates linearly between adjacent values and clamps to [min, max] — appropriate for bounded scores. - Single-value case (n=1) falls back to that value (percentiles undefined). - Empty case unchanged: zeroed stats. - `_MIN_VALUES_FOR_PERCENTILES = 2` constant gates the quantiles call. - Add `tests/evals/test_score_stats.py` covering empty / n=1 / n=2 / n=10 (the regression case) / n=100 / sort-independence. --- protest/evals/types.py | 18 ++++++- tests/evals/test_score_stats.py | 92 +++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 tests/evals/test_score_stats.py diff --git a/protest/evals/types.py b/protest/evals/types.py index e78e33c..141047b 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -223,6 +223,9 @@ def failed_scores(self) -> tuple[EvalScore, ...]: return tuple(s for s in self.scores if not s.passed) +_MIN_VALUES_FOR_PERCENTILES = 2 # statistics.quantiles requires at least 2 inputs + + @dataclass(frozen=True, slots=True) class ScoreStats: """Aggregated statistics for a named score across cases.""" @@ -242,12 +245,23 @@ def from_values(cls, name: str, values: list[float]) -> ScoreStats: return cls(name=name, mean=0, median=0, p5=0, p95=0, min=0, max=0, count=0) sv = sorted(values) n = len(sv) + if n >= _MIN_VALUES_FOR_PERCENTILES: + # `quantiles(n=20, method='inclusive')` returns 19 cutpoints that + # split the data into 20 equal groups. Index 0 = 5%, index 18 = 95%. + # Inclusive method interpolates linearly between adjacent values + # and clamps to [min, max] — appropriate for bounded scores. + cuts = statistics.quantiles(sv, n=20, method="inclusive") + p5_value = cuts[0] + p95_value = cuts[18] + else: + # Single value: percentiles are undefined; fall back to that value. 
+ p5_value = p95_value = sv[0] return cls( name=name, mean=statistics.mean(sv), median=statistics.median(sv), - p5=sv[max(0, int(n * 0.05))], - p95=sv[min(n - 1, int(n * 0.95))], + p5=p5_value, + p95=p95_value, min=sv[0], max=sv[-1], count=n, diff --git a/tests/evals/test_score_stats.py b/tests/evals/test_score_stats.py new file mode 100644 index 0000000..7a0eb90 --- /dev/null +++ b/tests/evals/test_score_stats.py @@ -0,0 +1,92 @@ +"""Tests for `ScoreStats.from_values` — percentile correctness. + +Pre-M11, p5/p95 used `int(n * 0.05)` index lookup, which collapses to +min/max for small samples (the typical eval case). Post-M11 uses +`statistics.quantiles(method='inclusive')` for true linear-interpolated +percentiles. These tests pin the new behavior. +""" + +from __future__ import annotations + +import pytest + +from protest.evals.types import ScoreStats + + +class TestEmptyAndSingleValue: + def test_empty_returns_zeroed_stats(self) -> None: + stats = ScoreStats.from_values("acc", []) + assert stats.count == 0 + assert stats.mean == 0 + assert stats.p5 == 0 + assert stats.p95 == 0 + assert stats.min == 0 + assert stats.max == 0 + + def test_single_value_collapses_percentiles(self) -> None: + """One value → percentiles undefined; fall back to that value.""" + stats = ScoreStats.from_values("acc", [0.42]) + assert stats.count == 1 + assert stats.mean == pytest.approx(0.42) + assert stats.median == pytest.approx(0.42) + assert stats.p5 == pytest.approx(0.42) + assert stats.p95 == pytest.approx(0.42) + assert stats.min == pytest.approx(0.42) + assert stats.max == pytest.approx(0.42) + + +class TestPercentilesNotCollapsedForSmallSamples: + """Regression: with n=10 the old impl returned min/max for p5/p95.""" + + def test_n_equals_10_p5_is_above_min(self) -> None: + values = [float(i) for i in range(10)] # 0..9 + stats = ScoreStats.from_values("acc", values) + # Inclusive method interpolates: p5 of [0..9] is 0.45, p95 is 8.55 + assert stats.min == 0 + assert stats.p5 > 
stats.min + assert stats.p5 == pytest.approx(0.45, abs=0.01) + + def test_n_equals_10_p95_is_below_max(self) -> None: + values = [float(i) for i in range(10)] + stats = ScoreStats.from_values("acc", values) + assert stats.max == 9 + assert stats.p95 < stats.max + assert stats.p95 == pytest.approx(8.55, abs=0.01) + + def test_n_equals_2_interpolates(self) -> None: + """Inclusive percentiles work even for n=2 (interpolation).""" + stats = ScoreStats.from_values("acc", [0.0, 1.0]) + assert stats.p5 == pytest.approx(0.05, abs=0.01) + assert stats.p95 == pytest.approx(0.95, abs=0.01) + + +class TestPercentilesAccurateForLargeSamples: + def test_n_equals_100_uniform_distribution(self) -> None: + """For uniform 0..99, p5 ≈ 5 and p95 ≈ 95 (inclusive method).""" + values = [float(i) for i in range(100)] + stats = ScoreStats.from_values("acc", values) + assert stats.p5 == pytest.approx(4.95, abs=0.1) + assert stats.p95 == pytest.approx(94.05, abs=0.1) + + def test_unsorted_input_is_sorted_internally(self) -> None: + """from_values must not depend on input order.""" + ordered = ScoreStats.from_values("a", [0.1, 0.2, 0.3, 0.4, 0.5]) + shuffled = ScoreStats.from_values("a", [0.3, 0.5, 0.1, 0.4, 0.2]) + assert ordered.p5 == pytest.approx(shuffled.p5) + assert ordered.p95 == pytest.approx(shuffled.p95) + assert ordered.median == pytest.approx(shuffled.median) + + +class TestBasicStatsStillCorrect: + """Mean/median/min/max/count are unchanged.""" + + def test_mean_and_median(self) -> None: + stats = ScoreStats.from_values("acc", [1.0, 2.0, 3.0, 4.0, 5.0]) + assert stats.mean == pytest.approx(3.0) + assert stats.median == pytest.approx(3.0) + + def test_min_max_count(self) -> None: + stats = ScoreStats.from_values("acc", [0.2, 0.7, 0.1, 0.9, 0.5]) + assert stats.min == pytest.approx(0.1) + assert stats.max == pytest.approx(0.9) + assert stats.count == 5 From a7f29ccc7f523d41b796fd1905230b18a8f61aba Mon Sep 17 00:00:00 2001 From: Renaud Cepre 
<32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 10:50:12 +0200 Subject: [PATCH 39/60] chore: address review minors (m2, m3, m4, m6, m7, m10, m11) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - m2: replace `lambda` with `functools.partial` in CLI command dispatch (`protest/cli/main.py`). - m3: route `EvalResultsWriter` "Results: ..." line through `console.print` instead of builtin `print`, so it bypasses test capture consistently. - m4: `Evaluator.__call__` now always returns a fresh clone in the re-binding path; removes the surprising `f is f()` identity. - m6: replace `"tests"` sentinel for `_default_suite_name` with `None`, fall back to the literal `"tests"` only when no test suite registered. A user-defined suite literally named `"tests"` no longer collides with the default-detection heuristic. - m7: add a Contents section (TOC) to `docs/evals.md` for raw-file navigability (mkdocs already auto-generates a sidebar TOC). - m10: clarify `FakeJudge.judge` comment — caller must use a dataclass with all-default fields. - m11: type `EvalSuite.eval(judge=)` as `Judge | None` (was `Any`) and document the per-eval override behavior in the docstring. Verified intentional / already-resolved: m1 (`console.print` shadow is the API), m5 (deduplicated via M3), m8 (deferred — needs PEP 696), m9 (`_canonical` resolution order is documented), m12 (`SuiteKind` is a `StrEnum`, no mismatch between str/enum comparisons). 
--- docs/evals.md | 21 +++++++++++++++++++++ protest/cli/main.py | 5 +++-- protest/evals/evaluator.py | 8 +++++--- protest/evals/results_writer.py | 3 ++- protest/evals/suite.py | 13 +++++++++++-- protest/history/plugin.py | 8 +++++--- tests/evals/test_judge.py | 3 ++- 7 files changed, 49 insertions(+), 12 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index 562bcde..e778b22 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -2,6 +2,27 @@ Evaluate LLM outputs with scored metrics and historical tracking. +## Contents + +- [What is an Eval?](#what-is-an-eval) +- [Quick Start](#quick-start) +- [How It Works](#how-it-works) +- [EvalSuite](#evalsuite) +- [EvalCase](#evalcase) +- [Evaluators](#evaluators) +- [Fixtures](#fixtures) +- [ModelInfo](#modelinfo) +- [Judge](#judge) +- [TaskResult (SUT Usage Tracking)](#taskresult-sut-usage-tracking) +- [Usage Display](#usage-display) +- [Evaluator Errors](#evaluator-errors) +- [Name Collisions](#name-collisions) +- [Multi-Model Sessions](#multi-model-sessions) +- [CLI](#cli) +- [Output](#output) +- [History](#history) +- [Progress Output](#progress-output) + ## What is an Eval? A test produces **pass/fail**. An eval produces **scores** — numeric values (0.0–1.0) that measure output quality. Scores are aggregated across cases, tracked over time, and compared between runs. 
diff --git a/protest/cli/main.py b/protest/cli/main.py index 9c0b324..574825f 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -1,6 +1,7 @@ from __future__ import annotations import argparse +import functools import sys from typing import TYPE_CHECKING, Any @@ -106,8 +107,8 @@ def main() -> None: commands: dict[str, Any] = { "tags": _handle_tags_command, - "run": lambda: _handle_run_command(kind_filter="test"), - "eval": lambda: _handle_run_command(kind_filter="eval"), + "run": functools.partial(_handle_run_command, kind_filter="test"), + "eval": functools.partial(_handle_run_command, kind_filter="eval"), "history": _handle_history_command, "live": _handle_live_command, } diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 242bb64..80881f9 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -255,9 +255,11 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: if args and isinstance(args[0], EvalContext): merged = {**self._kwargs, **kwargs} return self._fn(*args, **merged) - if kwargs: - return Evaluator(self._fn, {**self._kwargs, **kwargs}) - return self + # Re-binding form (no EvalContext): always returns a fresh clone. + # Returning `self` for the no-kwargs case used to make `f is f()` + # accidentally true, which surprised users expecting `()` to behave + # like an evaluator constructor. 
+ return Evaluator(self._fn, {**self._kwargs, **kwargs}) def evaluator_identity(self) -> dict[str, Any]: identity: dict[str, Any] = {"fn": self._qualname} diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py index db64f0e..b611d6b 100644 --- a/protest/evals/results_writer.py +++ b/protest/evals/results_writer.py @@ -11,6 +11,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any +from protest import console from protest.evals.types import EvalCaseResult, EvalScore, EvalSuiteReport from protest.plugin import PluginBase @@ -62,7 +63,7 @@ def on_eval_suite_end(self, report: Any) -> None: return run_dir = self._run_dirs.get(report.suite_name) if run_dir: - print(f" Results: {run_dir}") + console.print(f" Results: {run_dir}") # --------------------------------------------------------------------------- diff --git a/protest/evals/suite.py b/protest/evals/suite.py index 905010c..c4af124 100644 --- a/protest/evals/suite.py +++ b/protest/evals/suite.py @@ -68,9 +68,18 @@ def eval( evaluators: list[Any] | None = None, tags: list[str] | None = None, timeout: float | None = None, - judge: Any = None, + judge: Judge | None = None, ) -> Callable[[FuncT], FuncT]: - """Register a scored eval test on this suite.""" + """Register a scored eval test on this suite. + + Args: + evaluators: Per-eval evaluators, appended to suite-level ones. + tags: Tags forwarded to the underlying `@suite.test`. + timeout: Per-eval timeout in seconds. + judge: Override the suite-level judge for this eval only. + Useful when one eval needs a stronger model than the rest + of the suite. Falls back to `self.judge` when omitted. 
+ """ def decorator(func: FuncT) -> FuncT: resolved_judge = judge or self._judge diff --git a/protest/history/plugin.py b/protest/history/plugin.py index ca738a3..8827db5 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -44,7 +44,9 @@ def __init__(self, history_dir: Path | None = None) -> None: # Test data self._test_suites: dict[str, dict[str, dict[str, Any]]] = {} self._suite_kinds: dict[str, SuiteKind] = {} - self._default_suite_name: str = "tests" + # Bucket name for tests without a suite_path; resolved during setup + # to the first non-eval suite name, or kept as the literal fallback. + self._default_suite_name: str | None = None # Eval data self._eval_reports: dict[str, EvalSuiteReport] = {} self._eval_suite_metadata: dict[str, dict[str, Any]] = {} @@ -74,7 +76,7 @@ def setup(self, session: ProTestSession) -> None: "name": suite.judge.name, "provider": suite.judge.provider, } - elif not self._default_suite_name or self._default_suite_name == "tests": + elif self._default_suite_name is None: self._default_suite_name = suite.name # -- Test event handlers -------------------------------------------------- @@ -93,7 +95,7 @@ def _record_test(self, result: TestResult, *, passed: bool) -> None: suite_name = ( result.suite_path.root_name if result.suite_path - else self._default_suite_name + else (self._default_suite_name or "tests") ) if suite_name not in self._test_suites: self._test_suites[suite_name] = {} diff --git a/tests/evals/test_judge.py b/tests/evals/test_judge.py index e711bdb..a27ea41 100644 --- a/tests/evals/test_judge.py +++ b/tests/evals/test_judge.py @@ -41,7 +41,8 @@ async def judge(self, prompt: str, output_type: type) -> JudgeResponse: ) if output_type is str: return JudgeResponse(output=f"judged: {prompt[:20]}") - # For dataclass types, try to construct with defaults + # Dataclass fallback: caller must use a dataclass whose fields all + # have defaults — no real LLM call to derive values from. 
return JudgeResponse(output=output_type()) From 18078d495c879ed6f10dce7b16e910982e9bb38e Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:02:04 +0200 Subject: [PATCH 40/60] ci: ensure matrix Python version consistency and add verification step - Set `UV_PYTHON` to enforce the selected Python version in the matrix. - Add a verification step to confirm the expected Python version is used. --- .github/workflows/ci.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 22a0944..dc7b06c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -73,6 +73,11 @@ jobs: - os: windows-latest python-version: "3.12" runs-on: ${{ matrix.os }} + env: + # Force uv to honor the matrix Python version. Without this, uv picks + # the newest interpreter satisfying `requires-python` (often the system + # 3.12), making the matrix cosmetic. + UV_PYTHON: ${{ matrix.python-version }} steps: - uses: actions/checkout@v6 @@ -90,6 +95,9 @@ jobs: - name: Install dependencies run: uv sync --dev + - name: Verify Python version + run: uv run python -c "import sys; v = '${{ matrix.python-version }}'; assert sys.version.startswith(v), f'expected {v}, got {sys.version}'" + - name: Run tests if: matrix.os != 'ubuntu-latest' || matrix.python-version != '3.12' run: uv run pytest -vv From 6b9cc8370aa0c67223e6fc9b19863f6d8987277a Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:19:10 +0200 Subject: [PATCH 41/60] refactor: replace `StrEnum` with `str, Enum` for Python 3.10 compatibility - Updated `SuiteKind` to inherit from `str` and `Enum` instead of `StrEnum`, ensuring compatibility with Python 3.10. - Adjusted `SuiteKind.__str__` method for consistent behavior. - Modified history plugin to handle `Enum.value` directly while maintaining default behavior. 
- Moved `Self` import to `protest.compat` for streamlined typing support. --- protest/entities/core.py | 12 +++++++++--- protest/history/plugin.py | 3 ++- protest/plugin.py | 3 ++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/protest/entities/core.py b/protest/entities/core.py index 5a8c680..d8b157b 100644 --- a/protest/entities/core.py +++ b/protest/entities/core.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass, field -from enum import Enum, StrEnum +from enum import Enum from typing import TYPE_CHECKING, Any, TypeAlias if TYPE_CHECKING: @@ -20,12 +20,18 @@ FixtureCallable: TypeAlias = "Callable[..., Any]" -class SuiteKind(StrEnum): - """Kind of suite — determines behavior (eval wiring, history, reporting).""" +class SuiteKind(str, Enum): + """Kind of suite — determines behavior (eval wiring, history, reporting). + + Inherits from `str` (not `StrEnum`) for Python 3.10 compatibility. + """ TEST = "test" EVAL = "eval" + def __str__(self) -> str: + return self.value + class FixtureScope(Enum): """Scope level for fixtures.""" diff --git a/protest/history/plugin.py b/protest/history/plugin.py index 8827db5..00c5e8b 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -125,8 +125,9 @@ def on_session_end(self, result: Any) -> None: for suite_name, cases in self._test_suites.items(): total = len(cases) passed = sum(1 for c in cases.values() if c["passed"]) + kind = self._suite_kinds.get(suite_name) suites_data[suite_name] = { - "kind": str(self._suite_kinds.get(suite_name, "test")), + "kind": kind.value if kind is not None else "test", "total_cases": total, "passed": passed, "failed": total - passed, diff --git a/protest/plugin.py b/protest/plugin.py index 9589fff..895d7a5 100644 --- a/protest/plugin.py +++ b/protest/plugin.py @@ -1,12 +1,13 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Self +from typing import 
TYPE_CHECKING, Any if TYPE_CHECKING: from argparse import ArgumentParser from collections.abc import Awaitable + from protest.compat import Self from protest.core.session import ProTestSession from protest.entities import ( FixtureInfo, From ef5a65b287ba214b5df5a8042c5dc05cf681a564 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:25:34 +0200 Subject: [PATCH 42/60] chore: remove `pydantic-evals` dependency and related code - Dropped `pydantic-evals` from dependencies and `pyproject.toml` `evals` extra. - Removed references to `pydantic-evals` in code and version reporting. - Cleaned up `uv.lock` and related metadata. --- protest/history/collector.py | 1 - pyproject.toml | 3 - uv.lock | 327 +---------------------------------- 3 files changed, 1 insertion(+), 330 deletions(-) diff --git a/protest/history/collector.py b/protest/history/collector.py index ee8bb1a..7aa8659 100644 --- a/protest/history/collector.py +++ b/protest/history/collector.py @@ -31,7 +31,6 @@ def collect_env_info() -> dict[str, Any]: return { "python_version": platform.python_version(), "protest_version": _get_pkg_version("protest"), - "pydantic_evals_version": _get_pkg_version("pydantic-evals"), "hostname": platform.node(), "os": sys.platform, "ci": ci_provider is not None, diff --git a/pyproject.toml b/pyproject.toml index 0dbe858..0cb8974 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,9 +49,6 @@ rich = [ web = [ "websockets>=12.0", ] -evals = [ - "pydantic-evals>=0.1", -] [tool.ruff] diff --git a/uv.lock b/uv.lock index e4d7032..7594a42 100644 --- a/uv.lock +++ b/uv.lock @@ -2,29 +2,6 @@ version = 1 revision = 3 requires-python = ">=3.10" -[[package]] -name = "annotated-types" -version = "0.7.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = 
"sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, -] - -[[package]] -name = "anyio" -version = "4.13.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, - { name = "idna" }, - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/19/14/2c5dd9f512b66549ae92767a9c7b330ae88e1932ca57876909410251fe13/anyio-4.13.0.tar.gz", hash = "sha256:334b70e641fd2221c1505b3890c69882fe4a2df910cba14d97019b90b24439dc", size = 231622, upload-time = "2026-03-24T12:59:09.671Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353, upload-time = "2026-03-24T12:59:08.246Z" }, -] - [[package]] name = "attrs" version = "25.4.0" @@ -328,19 +305,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/91/7216b27286936c16f5b4d0c530087e4a54eead683e6b0b73dd0c64844af6/filelock-3.20.0-py3-none-any.whl", hash = "sha256:339b4732ffda5cd79b13f4e2711a31b0365ce445d95d243bb996273d072546a2", size = 16054, upload-time = "2025-10-08T18:03:48.35Z" }, ] -[[package]] -name = "genai-prices" -version = "0.0.56" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "httpx" }, - { name = "pydantic" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/44/6b/94b3018a672c7775edfb485f0fed8f6068fba75e49b067e8a1ac5eb96764/genai_prices-0.0.56.tar.gz", hash = "sha256:ac24b16a84d0ab97539bfa48dfa4649689de8e3ce71c12ebacef29efb1998045", size = 65872, upload-time = "2026-03-20T20:33:00.732Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a3/f6/8ef7e4c286deb2709d11ca96a5237caae3ef4876ab3c48095856cfd2df30/genai_prices-0.0.56-py3-none-any.whl", hash = "sha256:dbe86be8f3f556bed1b72209ed36851fec8b01793b3b220f42921a4e7da945f6", size = 68966, upload-time = "2026-03-20T20:33:02.555Z" }, -] - [[package]] name = "ghp-import" version = "2.1.0" @@ -353,52 +317,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f7/ec/67fbef5d497f86283db54c22eec6f6140243aae73265799baaaa19cd17fb/ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619", size = 11034, upload-time = "2022-05-02T15:47:14.552Z" }, ] -[[package]] -name = "griffelib" -version = "2.0.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9d/82/74f4a3310cdabfbb10da554c3a672847f1ed33c6f61dd472681ce7f1fe67/griffelib-2.0.2.tar.gz", hash = "sha256:3cf20b3bc470e83763ffbf236e0076b1211bac1bc67de13daf494640f2de707e", size = 166461, upload-time = "2026-03-27T11:34:51.091Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/11/8c/c9138d881c79aa0ea9ed83cbd58d5ca75624378b38cee225dcf5c42cc91f/griffelib-2.0.2-py3-none-any.whl", hash = "sha256:925c857658fb1ba40c0772c37acbc2ab650bd794d9c1b9726922e36ea4117ea1", size = 142357, upload-time = "2026-03-27T11:34:46.275Z" }, -] - -[[package]] -name = "h11" -version = "0.16.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size 
= 101250, upload-time = "2025-04-24T03:35:25.427Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, -] - -[[package]] -name = "httpcore" -version = "1.0.9" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "h11" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, -] - -[[package]] -name = "httpx" -version = "0.28.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "anyio" }, - { name = "certifi" }, - { name = "httpcore" }, - { name = "idna" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, -] - [[package]] name = "identify" version = "2.6.15" @@ -417,18 +335,6 @@ 
wheels = [ { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, ] -[[package]] -name = "importlib-metadata" -version = "8.7.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "zipp" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, -] - [[package]] name = "iniconfig" version = "2.1.0" @@ -562,15 +468,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/c8/d148e041732d631fc76036f8b30fae4e77b027a1e95b7a84bb522481a940/librt-0.8.1-cp314-cp314t-win_arm64.whl", hash = "sha256:bf512a71a23504ed08103a13c941f763db13fb11177beb3d9244c98c29fb4a61", size = 48755, upload-time = "2026-02-17T16:12:47.943Z" }, ] -[[package]] -name = "logfire-api" -version = "4.31.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/08/a2/8d5a3c1c282d5f2bd9f5e9ddd5288d1414a53301ce389af9016b6d82bd50/logfire_api-4.31.0.tar.gz", hash = "sha256:fc4b01257ebd4ce297ad374ed201eb1a9213b999f6ae6df45cfca5bd0ef378f8", size = 77838, upload-time = "2026-03-27T19:00:47.545Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/26/27/9372b7492b3e146908d520f8599909311cd930175801ad219171fafc6f3e/logfire_api-4.31.0-py3-none-any.whl", hash = "sha256:3c1f502fd4eb8ef0996427a5cf275fd8f327f38600650a1f53071a8171c812db", size = 123402, upload-time = "2026-03-27T19:00:44.952Z" }, -] - [[package]] name = "markdown" version = "3.10" @@ -840,19 +737,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, ] -[[package]] -name = "opentelemetry-api" -version = "1.40.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "importlib-metadata" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/2c/1d/4049a9e8698361cc1a1aa03a6c59e4fa4c71e0c0f94a30f988a6876a2ae6/opentelemetry_api-1.40.0.tar.gz", hash = "sha256:159be641c0b04d11e9ecd576906462773eb97ae1b657730f0ecf64d32071569f", size = 70851, upload-time = "2026-03-04T14:17:21.555Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5f/bf/93795954016c522008da367da292adceed71cca6ee1717e1d64c83089099/opentelemetry_api-1.40.0-py3-none-any.whl", hash = "sha256:82dd69331ae74b06f6a874704be0cfaa49a1650e1537d4a813b86ecef7d0ecf9", size = 68676, upload-time = "2026-03-04T14:17:01.24Z" }, -] - [[package]] name = "packaging" version = "24.2" @@ -923,9 +807,6 @@ dependencies = [ ] [package.optional-dependencies] -evals = [ - { name = "pydantic-evals" }, -] rich = [ { name = "rich" }, ] @@ -953,12 +834,11 @@ docs = [ [package.metadata] requires-dist = [ - { name = "pydantic-evals", marker = "extra == 'evals'", specifier = ">=0.1" }, { name = "rich", marker = "extra == 'rich'", specifier = ">=13.0" }, { name = "typing-extensions", specifier = ">=4.15.0" }, { name = "websockets", marker = "extra == 
'web'", specifier = ">=12.0" }, ] -provides-extras = ["rich", "web", "evals"] +provides-extras = ["rich", "web"] [package.metadata.requires-dev] dev = [ @@ -978,190 +858,6 @@ docs = [ { name = "mkdocs-material", specifier = ">=9.7.0" }, ] -[[package]] -name = "pydantic" -version = "2.12.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "annotated-types" }, - { name = "pydantic-core" }, - { name = "typing-extensions" }, - { name = "typing-inspection" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, -] - -[[package]] -name = "pydantic-ai-slim" -version = "1.73.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, - { name = "genai-prices" }, - { name = "griffelib" }, - { name = "httpx" }, - { name = "opentelemetry-api" }, - { name = "pydantic" }, - { name = "pydantic-graph" }, - { name = "typing-inspection" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/6a/1b/a5e18c7c721a3cfce5b17f86cb99e4142fcb70f38ea6d2b8963c2df445e1/pydantic_ai_slim-1.73.0.tar.gz", hash = "sha256:758d5bedb4b4f484c433672639bfc87af216a38453b1539ae10928a9ca62ff62", size = 497208, upload-time = "2026-03-27T03:49:49.459Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/3b/6aa1874cd0ccbc83c17c8eb308834bf004c8d4344c27cd8048851d4b284d/pydantic_ai_slim-1.73.0-py3-none-any.whl", hash = 
"sha256:f7176ce6c78539e1070d7e22549186862c2f6e6ea8b05b3aaad8a1942ba1ff4f", size = 638701, upload-time = "2026-03-27T03:49:42.804Z" }, -] - -[[package]] -name = "pydantic-core" -version = "2.41.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c6/90/32c9941e728d564b411d574d8ee0cf09b12ec978cb22b294995bae5549a5/pydantic_core-2.41.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:77b63866ca88d804225eaa4af3e664c5faf3568cea95360d21f4725ab6e07146", size = 2107298, upload-time = "2025-11-04T13:39:04.116Z" }, - { url = "https://files.pythonhosted.org/packages/fb/a8/61c96a77fe28993d9a6fb0f4127e05430a267b235a124545d79fea46dd65/pydantic_core-2.41.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dfa8a0c812ac681395907e71e1274819dec685fec28273a28905df579ef137e2", size = 1901475, upload-time = "2025-11-04T13:39:06.055Z" }, - { url = "https://files.pythonhosted.org/packages/5d/b6/338abf60225acc18cdc08b4faef592d0310923d19a87fba1faf05af5346e/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5921a4d3ca3aee735d9fd163808f5e8dd6c6972101e4adbda9a4667908849b97", size = 1918815, upload-time = "2025-11-04T13:39:10.41Z" }, - { url = "https://files.pythonhosted.org/packages/d1/1c/2ed0433e682983d8e8cba9c8d8ef274d4791ec6a6f24c58935b90e780e0a/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25c479382d26a2a41b7ebea1043564a937db462816ea07afa8a44c0866d52f9", size = 2065567, upload-time = "2025-11-04T13:39:12.244Z" }, - { url = 
"https://files.pythonhosted.org/packages/b3/24/cf84974ee7d6eae06b9e63289b7b8f6549d416b5c199ca2d7ce13bbcf619/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f547144f2966e1e16ae626d8ce72b4cfa0caedc7fa28052001c94fb2fcaa1c52", size = 2230442, upload-time = "2025-11-04T13:39:13.962Z" }, - { url = "https://files.pythonhosted.org/packages/fd/21/4e287865504b3edc0136c89c9c09431be326168b1eb7841911cbc877a995/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f52298fbd394f9ed112d56f3d11aabd0d5bd27beb3084cc3d8ad069483b8941", size = 2350956, upload-time = "2025-11-04T13:39:15.889Z" }, - { url = "https://files.pythonhosted.org/packages/a8/76/7727ef2ffa4b62fcab916686a68a0426b9b790139720e1934e8ba797e238/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:100baa204bb412b74fe285fb0f3a385256dad1d1879f0a5cb1499ed2e83d132a", size = 2068253, upload-time = "2025-11-04T13:39:17.403Z" }, - { url = "https://files.pythonhosted.org/packages/d5/8c/a4abfc79604bcb4c748e18975c44f94f756f08fb04218d5cb87eb0d3a63e/pydantic_core-2.41.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:05a2c8852530ad2812cb7914dc61a1125dc4e06252ee98e5638a12da6cc6fb6c", size = 2177050, upload-time = "2025-11-04T13:39:19.351Z" }, - { url = "https://files.pythonhosted.org/packages/67/b1/de2e9a9a79b480f9cb0b6e8b6ba4c50b18d4e89852426364c66aa82bb7b3/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:29452c56df2ed968d18d7e21f4ab0ac55e71dc59524872f6fc57dcf4a3249ed2", size = 2147178, upload-time = "2025-11-04T13:39:21Z" }, - { url = "https://files.pythonhosted.org/packages/16/c1/dfb33f837a47b20417500efaa0378adc6635b3c79e8369ff7a03c494b4ac/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:d5160812ea7a8a2ffbe233d8da666880cad0cbaf5d4de74ae15c313213d62556", size = 2341833, upload-time = "2025-11-04T13:39:22.606Z" }, - { url = 
"https://files.pythonhosted.org/packages/47/36/00f398642a0f4b815a9a558c4f1dca1b4020a7d49562807d7bc9ff279a6c/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:df3959765b553b9440adfd3c795617c352154e497a4eaf3752555cfb5da8fc49", size = 2321156, upload-time = "2025-11-04T13:39:25.843Z" }, - { url = "https://files.pythonhosted.org/packages/7e/70/cad3acd89fde2010807354d978725ae111ddf6d0ea46d1ea1775b5c1bd0c/pydantic_core-2.41.5-cp310-cp310-win32.whl", hash = "sha256:1f8d33a7f4d5a7889e60dc39856d76d09333d8a6ed0f5f1190635cbec70ec4ba", size = 1989378, upload-time = "2025-11-04T13:39:27.92Z" }, - { url = "https://files.pythonhosted.org/packages/76/92/d338652464c6c367e5608e4488201702cd1cbb0f33f7b6a85a60fe5f3720/pydantic_core-2.41.5-cp310-cp310-win_amd64.whl", hash = "sha256:62de39db01b8d593e45871af2af9e497295db8d73b085f6bfd0b18c83c70a8f9", size = 2013622, upload-time = "2025-11-04T13:39:29.848Z" }, - { url = "https://files.pythonhosted.org/packages/e8/72/74a989dd9f2084b3d9530b0915fdda64ac48831c30dbf7c72a41a5232db8/pydantic_core-2.41.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a3a52f6156e73e7ccb0f8cced536adccb7042be67cb45f9562e12b319c119da6", size = 2105873, upload-time = "2025-11-04T13:39:31.373Z" }, - { url = "https://files.pythonhosted.org/packages/12/44/37e403fd9455708b3b942949e1d7febc02167662bf1a7da5b78ee1ea2842/pydantic_core-2.41.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f3bf998340c6d4b0c9a2f02d6a400e51f123b59565d74dc60d252ce888c260b", size = 1899826, upload-time = "2025-11-04T13:39:32.897Z" }, - { url = "https://files.pythonhosted.org/packages/33/7f/1d5cab3ccf44c1935a359d51a8a2a9e1a654b744b5e7f80d41b88d501eec/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:378bec5c66998815d224c9ca994f1e14c0c21cb95d2f52b6021cc0b2a58f2a5a", size = 1917869, upload-time = "2025-11-04T13:39:34.469Z" }, - { url = 
"https://files.pythonhosted.org/packages/6e/6a/30d94a9674a7fe4f4744052ed6c5e083424510be1e93da5bc47569d11810/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e7b576130c69225432866fe2f4a469a85a54ade141d96fd396dffcf607b558f8", size = 2063890, upload-time = "2025-11-04T13:39:36.053Z" }, - { url = "https://files.pythonhosted.org/packages/50/be/76e5d46203fcb2750e542f32e6c371ffa9b8ad17364cf94bb0818dbfb50c/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6cb58b9c66f7e4179a2d5e0f849c48eff5c1fca560994d6eb6543abf955a149e", size = 2229740, upload-time = "2025-11-04T13:39:37.753Z" }, - { url = "https://files.pythonhosted.org/packages/d3/ee/fed784df0144793489f87db310a6bbf8118d7b630ed07aa180d6067e653a/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88942d3a3dff3afc8288c21e565e476fc278902ae4d6d134f1eeda118cc830b1", size = 2350021, upload-time = "2025-11-04T13:39:40.94Z" }, - { url = "https://files.pythonhosted.org/packages/c8/be/8fed28dd0a180dca19e72c233cbf58efa36df055e5b9d90d64fd1740b828/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31d95a179f8d64d90f6831d71fa93290893a33148d890ba15de25642c5d075b", size = 2066378, upload-time = "2025-11-04T13:39:42.523Z" }, - { url = "https://files.pythonhosted.org/packages/b0/3b/698cf8ae1d536a010e05121b4958b1257f0b5522085e335360e53a6b1c8b/pydantic_core-2.41.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c1df3d34aced70add6f867a8cf413e299177e0c22660cc767218373d0779487b", size = 2175761, upload-time = "2025-11-04T13:39:44.553Z" }, - { url = "https://files.pythonhosted.org/packages/b8/ba/15d537423939553116dea94ce02f9c31be0fa9d0b806d427e0308ec17145/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4009935984bd36bd2c774e13f9a09563ce8de4abaa7226f5108262fa3e637284", size = 2146303, upload-time = 
"2025-11-04T13:39:46.238Z" }, - { url = "https://files.pythonhosted.org/packages/58/7f/0de669bf37d206723795f9c90c82966726a2ab06c336deba4735b55af431/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:34a64bc3441dc1213096a20fe27e8e128bd3ff89921706e83c0b1ac971276594", size = 2340355, upload-time = "2025-11-04T13:39:48.002Z" }, - { url = "https://files.pythonhosted.org/packages/e5/de/e7482c435b83d7e3c3ee5ee4451f6e8973cff0eb6007d2872ce6383f6398/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c9e19dd6e28fdcaa5a1de679aec4141f691023916427ef9bae8584f9c2fb3b0e", size = 2319875, upload-time = "2025-11-04T13:39:49.705Z" }, - { url = "https://files.pythonhosted.org/packages/fe/e6/8c9e81bb6dd7560e33b9053351c29f30c8194b72f2d6932888581f503482/pydantic_core-2.41.5-cp311-cp311-win32.whl", hash = "sha256:2c010c6ded393148374c0f6f0bf89d206bf3217f201faa0635dcd56bd1520f6b", size = 1987549, upload-time = "2025-11-04T13:39:51.842Z" }, - { url = "https://files.pythonhosted.org/packages/11/66/f14d1d978ea94d1bc21fc98fcf570f9542fe55bfcc40269d4e1a21c19bf7/pydantic_core-2.41.5-cp311-cp311-win_amd64.whl", hash = "sha256:76ee27c6e9c7f16f47db7a94157112a2f3a00e958bc626e2f4ee8bec5c328fbe", size = 2011305, upload-time = "2025-11-04T13:39:53.485Z" }, - { url = "https://files.pythonhosted.org/packages/56/d8/0e271434e8efd03186c5386671328154ee349ff0354d83c74f5caaf096ed/pydantic_core-2.41.5-cp311-cp311-win_arm64.whl", hash = "sha256:4bc36bbc0b7584de96561184ad7f012478987882ebf9f9c389b23f432ea3d90f", size = 1972902, upload-time = "2025-11-04T13:39:56.488Z" }, - { url = "https://files.pythonhosted.org/packages/5f/5d/5f6c63eebb5afee93bcaae4ce9a898f3373ca23df3ccaef086d0233a35a7/pydantic_core-2.41.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7", size = 2110990, upload-time = "2025-11-04T13:39:58.079Z" }, - { url = 
"https://files.pythonhosted.org/packages/aa/32/9c2e8ccb57c01111e0fd091f236c7b371c1bccea0fa85247ac55b1e2b6b6/pydantic_core-2.41.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0", size = 1896003, upload-time = "2025-11-04T13:39:59.956Z" }, - { url = "https://files.pythonhosted.org/packages/68/b8/a01b53cb0e59139fbc9e4fda3e9724ede8de279097179be4ff31f1abb65a/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69", size = 1919200, upload-time = "2025-11-04T13:40:02.241Z" }, - { url = "https://files.pythonhosted.org/packages/38/de/8c36b5198a29bdaade07b5985e80a233a5ac27137846f3bc2d3b40a47360/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75", size = 2052578, upload-time = "2025-11-04T13:40:04.401Z" }, - { url = "https://files.pythonhosted.org/packages/00/b5/0e8e4b5b081eac6cb3dbb7e60a65907549a1ce035a724368c330112adfdd/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05", size = 2208504, upload-time = "2025-11-04T13:40:06.072Z" }, - { url = "https://files.pythonhosted.org/packages/77/56/87a61aad59c7c5b9dc8caad5a41a5545cba3810c3e828708b3d7404f6cef/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc", size = 2335816, upload-time = "2025-11-04T13:40:07.835Z" }, - { url = "https://files.pythonhosted.org/packages/0d/76/941cc9f73529988688a665a5c0ecff1112b3d95ab48f81db5f7606f522d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c", size = 2075366, upload-time = 
"2025-11-04T13:40:09.804Z" }, - { url = "https://files.pythonhosted.org/packages/d3/43/ebef01f69baa07a482844faaa0a591bad1ef129253ffd0cdaa9d8a7f72d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5", size = 2171698, upload-time = "2025-11-04T13:40:12.004Z" }, - { url = "https://files.pythonhosted.org/packages/b1/87/41f3202e4193e3bacfc2c065fab7706ebe81af46a83d3e27605029c1f5a6/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c", size = 2132603, upload-time = "2025-11-04T13:40:13.868Z" }, - { url = "https://files.pythonhosted.org/packages/49/7d/4c00df99cb12070b6bccdef4a195255e6020a550d572768d92cc54dba91a/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294", size = 2329591, upload-time = "2025-11-04T13:40:15.672Z" }, - { url = "https://files.pythonhosted.org/packages/cc/6a/ebf4b1d65d458f3cda6a7335d141305dfa19bdc61140a884d165a8a1bbc7/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1", size = 2319068, upload-time = "2025-11-04T13:40:17.532Z" }, - { url = "https://files.pythonhosted.org/packages/49/3b/774f2b5cd4192d5ab75870ce4381fd89cf218af999515baf07e7206753f0/pydantic_core-2.41.5-cp312-cp312-win32.whl", hash = "sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d", size = 1985908, upload-time = "2025-11-04T13:40:19.309Z" }, - { url = "https://files.pythonhosted.org/packages/86/45/00173a033c801cacf67c190fef088789394feaf88a98a7035b0e40d53dc9/pydantic_core-2.41.5-cp312-cp312-win_amd64.whl", hash = "sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815", size = 2020145, upload-time = "2025-11-04T13:40:21.548Z" }, - { url = 
"https://files.pythonhosted.org/packages/f9/22/91fbc821fa6d261b376a3f73809f907cec5ca6025642c463d3488aad22fb/pydantic_core-2.41.5-cp312-cp312-win_arm64.whl", hash = "sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3", size = 1976179, upload-time = "2025-11-04T13:40:23.393Z" }, - { url = "https://files.pythonhosted.org/packages/87/06/8806241ff1f70d9939f9af039c6c35f2360cf16e93c2ca76f184e76b1564/pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9", size = 2120403, upload-time = "2025-11-04T13:40:25.248Z" }, - { url = "https://files.pythonhosted.org/packages/94/02/abfa0e0bda67faa65fef1c84971c7e45928e108fe24333c81f3bfe35d5f5/pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34", size = 1896206, upload-time = "2025-11-04T13:40:27.099Z" }, - { url = "https://files.pythonhosted.org/packages/15/df/a4c740c0943e93e6500f9eb23f4ca7ec9bf71b19e608ae5b579678c8d02f/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0", size = 1919307, upload-time = "2025-11-04T13:40:29.806Z" }, - { url = "https://files.pythonhosted.org/packages/9a/e3/6324802931ae1d123528988e0e86587c2072ac2e5394b4bc2bc34b61ff6e/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33", size = 2063258, upload-time = "2025-11-04T13:40:33.544Z" }, - { url = "https://files.pythonhosted.org/packages/c9/d4/2230d7151d4957dd79c3044ea26346c148c98fbf0ee6ebd41056f2d62ab5/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e", size = 2214917, upload-time = "2025-11-04T13:40:35.479Z" }, - { url = 
"https://files.pythonhosted.org/packages/e6/9f/eaac5df17a3672fef0081b6c1bb0b82b33ee89aa5cec0d7b05f52fd4a1fa/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2", size = 2332186, upload-time = "2025-11-04T13:40:37.436Z" }, - { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" }, - { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, upload-time = "2025-11-04T13:40:42.809Z" }, - { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = "2025-11-04T13:40:44.752Z" }, - { url = "https://files.pythonhosted.org/packages/3a/8d/34318ef985c45196e004bc46c6eab2eda437e744c124ef0dbe1ff2c9d06b/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" }, - { url = "https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" }, - { url = 
"https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" }, - { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" }, - { url = "https://files.pythonhosted.org/packages/73/7d/f2f9db34af103bea3e09735bb40b021788a5e834c81eedb541991badf8f5/pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd", size = 1981005, upload-time = "2025-11-04T13:40:54.734Z" }, - { url = "https://files.pythonhosted.org/packages/ea/28/46b7c5c9635ae96ea0fbb779e271a38129df2550f763937659ee6c5dbc65/pydantic_core-2.41.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a", size = 2119622, upload-time = "2025-11-04T13:40:56.68Z" }, - { url = "https://files.pythonhosted.org/packages/74/1a/145646e5687e8d9a1e8d09acb278c8535ebe9e972e1f162ed338a622f193/pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14", size = 1891725, upload-time = "2025-11-04T13:40:58.807Z" }, - { url = "https://files.pythonhosted.org/packages/23/04/e89c29e267b8060b40dca97bfc64a19b2a3cf99018167ea1677d96368273/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1", size = 1915040, upload-time = "2025-11-04T13:41:00.853Z" }, - { url = 
"https://files.pythonhosted.org/packages/84/a3/15a82ac7bd97992a82257f777b3583d3e84bdb06ba6858f745daa2ec8a85/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66", size = 2063691, upload-time = "2025-11-04T13:41:03.504Z" }, - { url = "https://files.pythonhosted.org/packages/74/9b/0046701313c6ef08c0c1cf0e028c67c770a4e1275ca73131563c5f2a310a/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869", size = 2213897, upload-time = "2025-11-04T13:41:05.804Z" }, - { url = "https://files.pythonhosted.org/packages/8a/cd/6bac76ecd1b27e75a95ca3a9a559c643b3afcd2dd62086d4b7a32a18b169/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" }, - { url = "https://files.pythonhosted.org/packages/4c/d2/ef2074dc020dd6e109611a8be4449b98cd25e1b9b8a303c2f0fca2f2bcf7/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375", size = 2064877, upload-time = "2025-11-04T13:41:09.827Z" }, - { url = "https://files.pythonhosted.org/packages/18/66/e9db17a9a763d72f03de903883c057b2592c09509ccfe468187f2a2eef29/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553", size = 2180680, upload-time = "2025-11-04T13:41:12.379Z" }, - { url = "https://files.pythonhosted.org/packages/d3/9e/3ce66cebb929f3ced22be85d4c2399b8e85b622db77dad36b73c5387f8f8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90", size = 2138960, upload-time = 
"2025-11-04T13:41:14.627Z" }, - { url = "https://files.pythonhosted.org/packages/a6/62/205a998f4327d2079326b01abee48e502ea739d174f0a89295c481a2272e/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07", size = 2339102, upload-time = "2025-11-04T13:41:16.868Z" }, - { url = "https://files.pythonhosted.org/packages/3c/0d/f05e79471e889d74d3d88f5bd20d0ed189ad94c2423d81ff8d0000aab4ff/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb", size = 2326039, upload-time = "2025-11-04T13:41:18.934Z" }, - { url = "https://files.pythonhosted.org/packages/ec/e1/e08a6208bb100da7e0c4b288eed624a703f4d129bde2da475721a80cab32/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23", size = 1995126, upload-time = "2025-11-04T13:41:21.418Z" }, - { url = "https://files.pythonhosted.org/packages/48/5d/56ba7b24e9557f99c9237e29f5c09913c81eeb2f3217e40e922353668092/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf", size = 2015489, upload-time = "2025-11-04T13:41:24.076Z" }, - { url = "https://files.pythonhosted.org/packages/4e/bb/f7a190991ec9e3e0ba22e4993d8755bbc4a32925c0b5b42775c03e8148f9/pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0", size = 1977288, upload-time = "2025-11-04T13:41:26.33Z" }, - { url = "https://files.pythonhosted.org/packages/92/ed/77542d0c51538e32e15afe7899d79efce4b81eee631d99850edc2f5e9349/pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a", size = 2120255, upload-time = "2025-11-04T13:41:28.569Z" }, - { url = 
"https://files.pythonhosted.org/packages/bb/3d/6913dde84d5be21e284439676168b28d8bbba5600d838b9dca99de0fad71/pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3", size = 1863760, upload-time = "2025-11-04T13:41:31.055Z" }, - { url = "https://files.pythonhosted.org/packages/5a/f0/e5e6b99d4191da102f2b0eb9687aaa7f5bea5d9964071a84effc3e40f997/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c", size = 1878092, upload-time = "2025-11-04T13:41:33.21Z" }, - { url = "https://files.pythonhosted.org/packages/71/48/36fb760642d568925953bcc8116455513d6e34c4beaa37544118c36aba6d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612", size = 2053385, upload-time = "2025-11-04T13:41:35.508Z" }, - { url = "https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:37.732Z" }, - { url = "https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9", size = 2327585, upload-time = "2025-11-04T13:41:40Z" }, - { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = 
"2025-11-04T13:41:42.323Z" }, - { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:45.221Z" }, - { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:47.474Z" }, - { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:49.992Z" }, - { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" }, - { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, - { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, - { url = 
"https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" }, - { url = "https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" }, - { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" }, - { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" }, - { url = "https://files.pythonhosted.org/packages/2e/1b/687711069de7efa6af934e74f601e2a4307365e8fdc404703afc453eab26/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15489ba13d61f670dcc96772e733aad1a6f9c429cc27574c6cdaed82d0146ad", size = 2138905, upload-time = "2025-11-04T13:42:47.156Z" }, - { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 
2110495, upload-time = "2025-11-04T13:42:49.689Z" }, - { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388, upload-time = "2025-11-04T13:42:52.215Z" }, - { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879, upload-time = "2025-11-04T13:42:56.483Z" }, - { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, - { url = "https://files.pythonhosted.org/packages/e6/b0/1a2aa41e3b5a4ba11420aba2d091b2d17959c8d1519ece3627c371951e73/pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b5819cd790dbf0c5eb9f82c73c16b39a65dd6dd4d1439dcdea7816ec9adddab8", size = 2103351, upload-time = "2025-11-04T13:43:02.058Z" }, - { url = "https://files.pythonhosted.org/packages/a4/ee/31b1f0020baaf6d091c87900ae05c6aeae101fa4e188e1613c80e4f1ea31/pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5a4e67afbc95fa5c34cf27d9089bca7fcab4e51e57278d710320a70b956d1b9a", size = 1925363, upload-time = "2025-11-04T13:43:05.159Z" }, - { url = "https://files.pythonhosted.org/packages/e1/89/ab8e86208467e467a80deaca4e434adac37b10a9d134cd2f99b28a01e483/pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ece5c59f0ce7d001e017643d8d24da587ea1f74f6993467d85ae8a5ef9d4f42b", size = 2135615, upload-time = "2025-11-04T13:43:08.116Z" }, - { url = "https://files.pythonhosted.org/packages/99/0a/99a53d06dd0348b2008f2f30884b34719c323f16c3be4e6cc1203b74a91d/pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:16f80f7abe3351f8ea6858914ddc8c77e02578544a0ebc15b4c2e1a0e813b0b2", size = 2175369, upload-time = "2025-11-04T13:43:12.49Z" }, - { url = "https://files.pythonhosted.org/packages/6d/94/30ca3b73c6d485b9bb0bc66e611cff4a7138ff9736b7e66bcf0852151636/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:33cb885e759a705b426baada1fe68cbb0a2e68e34c5d0d0289a364cf01709093", size = 2144218, upload-time = "2025-11-04T13:43:15.431Z" }, - { url = "https://files.pythonhosted.org/packages/87/57/31b4f8e12680b739a91f472b5671294236b82586889ef764b5fbc6669238/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:c8d8b4eb992936023be7dee581270af5c6e0697a8559895f527f5b7105ecd36a", size = 2329951, upload-time = "2025-11-04T13:43:18.062Z" }, - { url = "https://files.pythonhosted.org/packages/7d/73/3c2c8edef77b8f7310e6fb012dbc4b8551386ed575b9eb6fb2506e28a7eb/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:242a206cd0318f95cd21bdacff3fcc3aab23e79bba5cac3db5a841c9ef9c6963", size = 2318428, upload-time = "2025-11-04T13:43:20.679Z" }, - { url = "https://files.pythonhosted.org/packages/2f/02/8559b1f26ee0d502c74f9cca5c0d2fd97e967e083e006bbbb4e97f3a043a/pydantic_core-2.41.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d3a978c4f57a597908b7e697229d996d77a6d3c94901e9edee593adada95ce1a", size = 2147009, upload-time = "2025-11-04T13:43:23.286Z" }, - { url = "https://files.pythonhosted.org/packages/5f/9b/1b3f0e9f9305839d7e84912f9e8bfbd191ed1b1ef48083609f0dabde978c/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = 
"sha256:b2379fa7ed44ddecb5bfe4e48577d752db9fc10be00a6b7446e9663ba143de26", size = 2101980, upload-time = "2025-11-04T13:43:25.97Z" }, - { url = "https://files.pythonhosted.org/packages/a4/ed/d71fefcb4263df0da6a85b5d8a7508360f2f2e9b3bf5814be9c8bccdccc1/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:266fb4cbf5e3cbd0b53669a6d1b039c45e3ce651fd5442eff4d07c2cc8d66808", size = 1923865, upload-time = "2025-11-04T13:43:28.763Z" }, - { url = "https://files.pythonhosted.org/packages/ce/3a/626b38db460d675f873e4444b4bb030453bbe7b4ba55df821d026a0493c4/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58133647260ea01e4d0500089a8c4f07bd7aa6ce109682b1426394988d8aaacc", size = 2134256, upload-time = "2025-11-04T13:43:31.71Z" }, - { url = "https://files.pythonhosted.org/packages/83/d9/8412d7f06f616bbc053d30cb4e5f76786af3221462ad5eee1f202021eb4e/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:287dad91cfb551c363dc62899a80e9e14da1f0e2b6ebde82c806612ca2a13ef1", size = 2174762, upload-time = "2025-11-04T13:43:34.744Z" }, - { url = "https://files.pythonhosted.org/packages/55/4c/162d906b8e3ba3a99354e20faa1b49a85206c47de97a639510a0e673f5da/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:03b77d184b9eb40240ae9fd676ca364ce1085f203e1b1256f8ab9984dca80a84", size = 2143141, upload-time = "2025-11-04T13:43:37.701Z" }, - { url = "https://files.pythonhosted.org/packages/1f/f2/f11dd73284122713f5f89fc940f370d035fa8e1e078d446b3313955157fe/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:a668ce24de96165bb239160b3d854943128f4334822900534f2fe947930e5770", size = 2330317, upload-time = "2025-11-04T13:43:40.406Z" }, - { url = "https://files.pythonhosted.org/packages/88/9d/b06ca6acfe4abb296110fb1273a4d848a0bfb2ff65f3ee92127b3244e16b/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = 
"sha256:f14f8f046c14563f8eb3f45f499cc658ab8d10072961e07225e507adb700e93f", size = 2316992, upload-time = "2025-11-04T13:43:43.602Z" }, - { url = "https://files.pythonhosted.org/packages/36/c7/cfc8e811f061c841d7990b0201912c3556bfeb99cdcb7ed24adc8d6f8704/pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51", size = 2145302, upload-time = "2025-11-04T13:43:46.64Z" }, -] - -[[package]] -name = "pydantic-evals" -version = "1.73.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "anyio" }, - { name = "logfire-api" }, - { name = "pydantic" }, - { name = "pydantic-ai-slim" }, - { name = "pyyaml" }, - { name = "rich" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/02/45/ce1f9b97c4838f940c98693bc1d6298f0e1396355998942b095ce17157fe/pydantic_evals-1.73.0.tar.gz", hash = "sha256:c1f38ad9c4f566bee6958c92f205b8200957b4baf3dd5239e2a4a06edd28e3dc", size = 56137, upload-time = "2026-03-27T03:49:50.861Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/01/4e/aefc34a68adc165ddec22c0632cb3076579c46751ac11acdf8cec6462891/pydantic_evals-1.73.0-py3-none-any.whl", hash = "sha256:0609210d4825cc8339b5cb649be38321450b46d6e87d72c1ffde73598741fd5a", size = 67143, upload-time = "2026-03-27T03:49:44.298Z" }, -] - -[[package]] -name = "pydantic-graph" -version = "1.73.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "httpx" }, - { name = "logfire-api" }, - { name = "pydantic" }, - { name = "typing-inspection" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1a/22/d479ea32e3c712c6711e41157fb975d81582e5171510e4c662f21a85e9fe/pydantic_graph-1.73.0.tar.gz", hash = "sha256:f0d3e4984af1d902cdda1ccd3fcd86949d45d3ed21559e781f7cf9eace2ed914", size = 58717, upload-time = "2026-03-27T03:49:51.967Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/08/b3/4cc0b1c543b8a0c1f9add7bdeb2e8cd583961a795664a1a74d1fc8200416/pydantic_graph-1.73.0-py3-none-any.whl", hash = "sha256:aaab8b1580885f5108401db0a7da58d6c7643e467eb626b8a1364b1030327de0", size = 72504, upload-time = "2026-03-27T03:49:45.668Z" }, -] - [[package]] name = "pygments" version = "2.19.2" @@ -1574,18 +1270,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ] -[[package]] -name = "typing-inspection" -version = "0.4.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, -] - [[package]] name = "urllib3" version = "2.5.0" @@ -1700,12 +1384,3 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/68/a1/dcb68430b1d00b698ae7a7e0194433bce4f07ded185f0ee5fb21e2a2e91e/websockets-15.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:cad21560da69f4ce7658ca2cb83138fb4cf695a2ba3e475e0559e05991aa8122", size = 176884, upload-time = "2025-03-05T20:03:27.934Z" }, { url = 
"https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload-time = "2025-03-05T20:03:39.41Z" }, ] - -[[package]] -name = "zipp" -version = "3.23.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, -] From 72a8457546084569deb4d221b9df26503cc59dd8 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:33:49 +0200 Subject: [PATCH 43/60] tests(history): ensure `--runs` displays newest entries first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added `TestRunsOrderRecentFirst` to validate that `--runs` follows the git log convention, showing the most recent entries first. - Updated CLI logic to reverse storage order (oldest → newest) for display consistency. - Adjusted index formatting and numbering in both plain and rich output modes to reflect the newest-first display. 
--- protest/cli/history.py | 11 +++++---- tests/test_history_cli.py | 48 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/protest/cli/history.py b/protest/cli/history.py index 01198d8..88c94b8 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -115,11 +115,13 @@ def stats(self, entries: list[dict[str, Any]]) -> None: print() def runs(self, entries: list[dict[str, Any]]) -> None: - for i, e in enumerate(entries): + # Display most-recent first (git log convention). `entries` arrives + # sorted oldest→newest from storage, so we reverse for display. + for i, e in enumerate(reversed(entries)): p, t, r = _entry_stats(e) git = (e.get("git") or {}).get("commit_short", "?") ts = e.get("timestamp", "?")[:16] - print(f"\n #{len(entries) - i:<3} {ts} {p}/{t} ({r * 100:.0f}%) {git}") + print(f"\n #{i + 1:<3} {ts} {p}/{t} ({r * 100:.0f}%) {git}") for sn, sd in e.get("suites", {}).items(): if not isinstance(sd, dict): continue @@ -212,13 +214,14 @@ def stats(self, entries: list[dict[str, Any]]) -> None: def runs(self, entries: list[dict[str, Any]]) -> None: self.console.print() - for i, e in enumerate(entries): + # Display most-recent first (git log convention). 
+ for i, e in enumerate(reversed(entries)): p, t, r = _entry_stats(e) git = (e.get("git") or {}).get("commit_short", "?") ts = e.get("timestamp", "?")[:16] rate_color = "green" if r >= 0.8 else "yellow" if r >= 0.5 else "red" self.console.print( - f" [dim]#{len(entries) - i:<3}[/] {ts} " + f" [dim]#{i + 1:<3}[/] {ts} " f"[{rate_color}]{p}/{t} ({r * 100:.0f}%)[/] [dim]{git}[/]" ) for sn, sd in e.get("suites", {}).items(): diff --git a/tests/test_history_cli.py b/tests/test_history_cli.py index 70e81b2..b19e5cb 100644 --- a/tests/test_history_cli.py +++ b/tests/test_history_cli.py @@ -16,6 +16,7 @@ import pytest from protest.cli.history import handle_history_command +from protest.history.storage import HISTORY_FILE, append_entry if TYPE_CHECKING: from pathlib import Path @@ -110,3 +111,50 @@ def test_help_output_shows_action_and_kind_groups( stdout = capsys.readouterr().out assert "[--runs | --show [N] | --compare]" in stdout assert "[--evals | --tests]" in stdout + + +class TestRunsOrderRecentFirst: + """`--runs` lists most-recent run first (git log convention). + + Storage returns entries oldest→newest; the CLI must reverse for display + so #1 maps to the newest run, matching `git stash list` / `git log`. + """ + + def _seed(self, tmp_path: Path, commits: list[tuple[str, str]]) -> None: + path = tmp_path / HISTORY_FILE + for ts, commit in commits: + append_entry( + path, + { + "schema_version": 1, + "run_id": commit, + "timestamp": ts, + "git": {"commit_short": commit}, + "suites": {}, + }, + ) + + def test_runs_displays_newest_first( + self, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], + ) -> None: + # Seed in chronological order — storage preserves write order. + self._seed( + tmp_path, + [ + ("2026-04-25T10:00:00", "old1234"), + ("2026-04-25T11:00:00", "mid5678"), + ("2026-04-25T12:00:00", "newabcd"), + ], + ) + handle_history_command(["--runs", "--path", str(tmp_path)]) + stdout = capsys.readouterr().out + # #1 is newest, #3 is oldest. 
+ assert stdout.index("#1") < stdout.index("#2") < stdout.index("#3") + assert ( + stdout.index("newabcd") < stdout.index("mid5678") < stdout.index("old1234") + ) + # And #1 lines up with the newest commit, not the oldest. + newest_line = next(line for line in stdout.splitlines() if "#1" in line) + assert "newabcd" in newest_line From 8b64322d7973b8c2fa8c1b85f6938381e970344c Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:38:24 +0200 Subject: [PATCH 44/60] refactor(evals): replace `keyword_check` with `contains_keywords` and update evaluator logic --- docs/evals.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index e778b22..324e0b2 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -226,10 +226,10 @@ Skip expensive evaluators (LLM judges) when cheap ones already fail: from protest.evals import ShortCircuit evaluators=[ - not_empty, # always runs + not_empty, # always runs ShortCircuit([ - contains_expected_facts(min_score=0.3), # 0ms — if fail → stop - llm_judge(rubric="factual accuracy"), # 3s — skipped if above fails + contains_keywords(keywords=["paris"], min_recall=0.5), # 0ms — if fail → stop + llm_judge(rubric="factual accuracy"), # 3s — skipped if above fails ]), ] ``` @@ -243,7 +243,7 @@ evaluators=[ evaluators=[not_empty] # With params → call to bind -evaluators=[keyword_check(keywords=["python", "async"], min_recall=0.75)] +evaluators=[contains_keywords(keywords=["python", "async"], min_recall=0.75)] # Per-case evaluators (added to suite-level) EvalCase(name="factual_accuracy_case", inputs="...", evaluators=[llm_judge(rubric="Check factual accuracy")]) From bf27f4ce702e6bb731442dc6d5808494954d640f Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:44:25 +0200 Subject: [PATCH 45/60] tests(evals): add stricter `contains_keywords` threshold tests and improve 
evaluator logic documentation - Introduced tests for `min_recall` edge cases, including exact threshold passing, discontinuity fixes, and below-threshold failures. - Updated `contains_keywords` evaluator to simplify `all_keywords_present` logic and ensure consistent behavior across thresholds. - Adjusted default `min_recall` to `1.0` in docs and implementation for stricter compliance. --- docs/evals.md | 2 +- protest/evals/evaluators.py | 12 +++++++++--- tests/evals/test_e2e.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index 324e0b2..84b25ca 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -266,7 +266,7 @@ EvalCase(name="factual_accuracy_case", inputs="...", evaluators=[llm_judge(rubri | Evaluator | Params | Returns | |-----------|--------|---------| -| `contains_keywords` | `keywords, min_recall=0.0` | `keyword_recall: float`, `all_keywords_present: bool` | +| `contains_keywords` | `keywords, min_recall=1.0` | `keyword_recall: float`, `all_keywords_present: bool` | | `contains_expected` | `case_sensitive=False` | `bool` | | `does_not_contain` | `forbidden` | `no_forbidden_words: bool` | | `not_empty` | — | `bool` | diff --git a/protest/evals/evaluators.py b/protest/evals/evaluators.py index ec7d9bd..dcac9b4 100644 --- a/protest/evals/evaluators.py +++ b/protest/evals/evaluators.py @@ -45,16 +45,22 @@ class WordOverlapResult: @evaluator def contains_keywords( - ctx: EvalContext[Any, str], keywords: list[str], min_recall: float = 0.0 + ctx: EvalContext[Any, str], keywords: list[str], min_recall: float = 1.0 ) -> ContainsKeywordsResult: - """Check that the output contains expected keywords (case-insensitive).""" + """Check that the output contains expected keywords (case-insensitive). + + `min_recall` is the minimum fraction of keywords that must appear for + the verdict to pass. 
Default `1.0` requires all keywords to be present; + set to `0.5` for "at least half", `0.0` to ignore the verdict and only + track the metric. + """ output_lower = ctx.output.lower() found = sum(1 for kw in keywords if kw.lower() in output_lower) total = len(keywords) recall = found / total if total else 1.0 return ContainsKeywordsResult( keyword_recall=recall, - all_keywords_present=recall >= min_recall if min_recall > 0 else found == total, + all_keywords_present=recall >= min_recall, ) diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index f1fc5d1..921cc39 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -721,6 +721,39 @@ def test_contains_keywords(self) -> None: assert result.keyword_recall == 1.0 assert result.all_keywords_present is True + def test_contains_keywords_default_requires_all(self) -> None: + """Default `min_recall=1.0` means strict: missing one → verdict False.""" + e = contains_keywords(keywords=["hello", "world"]) + result = e(self._make_ctx("Only hello here")) + assert result.keyword_recall == 0.5 + assert result.all_keywords_present is False + + def test_contains_keywords_threshold_continuity_at_zero(self) -> None: + """Regression: `min_recall=0.0` must always pass (no discontinuity at 0). + + Earlier behavior: `min_recall=0.0` flipped to strict mode (all required), + while `min_recall=0.0001` was permissive — surprising at the boundary. + Now `recall >= min_recall` applies uniformly. 
+ """ + e = contains_keywords(keywords=["alpha", "beta"], min_recall=0.0) + result = e(self._make_ctx("nothing matches")) + assert result.keyword_recall == 0.0 + assert result.all_keywords_present is True + + def test_contains_keywords_threshold_at_exact_value(self) -> None: + """Verdict passes when recall equals the threshold exactly.""" + e = contains_keywords(keywords=["alpha", "beta"], min_recall=0.5) + result = e(self._make_ctx("only alpha here")) + assert result.keyword_recall == 0.5 + assert result.all_keywords_present is True + + def test_contains_keywords_threshold_just_below(self) -> None: + """Verdict fails when recall is below the threshold.""" + e = contains_keywords(keywords=["alpha", "beta", "gamma"], min_recall=0.5) + result = e(self._make_ctx("only alpha")) + assert abs(result.keyword_recall - 1 / 3) < 1e-9 + assert result.all_keywords_present is False + def test_contains_expected(self) -> None: e = contains_expected assert e(self._make_ctx("Hello World", "world")) is True From bfa9d14106728b5bd78cc2420e8ce2cd3f113c2d Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:51:41 +0200 Subject: [PATCH 46/60] tests(evals): add `not_empty` tests for Sized containers and clarify evaluator behavior - Added tests to ensure `not_empty` correctly handles empty and non-empty lists, dicts, and sets. - Updated `not_empty` docstring and logic to explicitly check `Sized` objects using `len()`. 
--- protest/evals/evaluators.py | 11 ++++++++++- tests/evals/test_e2e.py | 38 +++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/protest/evals/evaluators.py b/protest/evals/evaluators.py index dcac9b4..8866961 100644 --- a/protest/evals/evaluators.py +++ b/protest/evals/evaluators.py @@ -9,6 +9,7 @@ import json as json_module import re +from collections.abc import Sized from dataclasses import dataclass from typing import Annotated, Any @@ -86,11 +87,19 @@ def does_not_contain( @evaluator def not_empty(ctx: EvalContext[Any, Any]) -> bool: - """Check that the output is not empty or whitespace-only.""" + """Check that the output is not empty. + + - `None` -> False. + - `str`: False if empty or whitespace-only. + - Sized (list, dict, set, tuple, ...): False if `len() == 0`. + - Other (int, float, dataclass, custom objects): True. + """ if ctx.output is None: return False if isinstance(ctx.output, str): return len(ctx.output.strip()) > 0 + if isinstance(ctx.output, Sized): + return len(ctx.output) > 0 return True diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 921cc39..5e86c18 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -769,6 +769,44 @@ def test_not_empty(self) -> None: assert not_empty(self._make_ctx("")) is False assert not_empty(self._make_ctx(" ")) is False + def test_not_empty_handles_sized_containers(self) -> None: + """Sized containers: empty -> False, non-empty -> True. + + Earlier behavior fell through to `return True` for any non-string, + so `not_empty([])` reported True — misleading for tasks that return + lists/dicts (e.g. tool calls, retrieved chunks). + """ + # Helper accepts Any at runtime; type hint is just a default. 
+ ctx_empty_list: Any = self._make_ctx("") + ctx_empty_list.output = [] + assert not_empty(ctx_empty_list) is False + + ctx_nonempty_list: Any = self._make_ctx("") + ctx_nonempty_list.output = [1, 2] + assert not_empty(ctx_nonempty_list) is True + + ctx_empty_dict: Any = self._make_ctx("") + ctx_empty_dict.output = {} + assert not_empty(ctx_empty_dict) is False + + ctx_nonempty_dict: Any = self._make_ctx("") + ctx_nonempty_dict.output = {"a": 1} + assert not_empty(ctx_nonempty_dict) is True + + ctx_empty_set: Any = self._make_ctx("") + ctx_empty_set.output = set() + assert not_empty(ctx_empty_set) is False + + def test_not_empty_unsized_objects_still_pass(self) -> None: + """Non-Sized values (int, float, dataclass): always True (kept as-is).""" + ctx_int: Any = self._make_ctx("") + ctx_int.output = 42 + assert not_empty(ctx_int) is True + + ctx_zero: Any = self._make_ctx("") + ctx_zero.output = 0 # 0 is not None, not Sized — still passes. + assert not_empty(ctx_zero) is True + def test_max_length(self) -> None: e = max_length(max_chars=5) result = e(self._make_ctx("hi")) From e54f17912ee10fc1f00b9be0bedf3134178913a9 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 15:05:51 +0200 Subject: [PATCH 47/60] tests(evals): add precision tests for sub-millisecond durations and adaptive formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added tests to ensure `_serialize_eval_case` preserves 10 µs precision, preventing sub-ms durations from collapsing to `0.0`. - Introduced `_format_case_duration` tests for adaptive time unit rendering across microseconds, milliseconds, and seconds. - Updated markdown renderer to use `_format_case_duration` for task durations. - Increased duration serialization precision from 3 to 5 decimals in history plugin. 
--- protest/evals/results_writer.py | 26 +++++++-- protest/history/plugin.py | 9 +++- tests/evals/test_duration_precision.py | 75 ++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 7 deletions(-) create mode 100644 tests/evals/test_duration_precision.py diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py index b611d6b..983ac8b 100644 --- a/protest/evals/results_writer.py +++ b/protest/evals/results_writer.py @@ -90,11 +90,7 @@ def _write_case_file(case: EvalCaseResult, run_dir: Path) -> None: def _render_case(case: EvalCaseResult) -> str: status = "PASS ✓" if case.passed else "FAIL ✗" - duration = ( - f"{case.duration * 1000:.0f}ms" - if case.duration < 1 - else f"{case.duration:.2f}s" - ) + duration = _format_case_duration(case.duration) lines: list[str] = [ f"# {case.case_name} — {status} ({duration})", "", @@ -113,6 +109,26 @@ def _render_case(case: EvalCaseResult) -> str: return "\n".join(lines) +_ONE_MILLISECOND = 0.001 +_TEN_MILLISECONDS = 0.01 +_ONE_SECOND = 1.0 + + +def _format_case_duration(seconds: float) -> str: + """Format SUT duration with adaptive units. + + Sub-ms tasks (deterministic stubs, fast classifiers) used to render as + `0ms` because the renderer rounded to the nearest millisecond. 
+ """ + if seconds < _ONE_MILLISECOND: + return f"{seconds * 1_000_000:.0f}µs" + if seconds < _TEN_MILLISECONDS: + return f"{seconds * 1000:.2f}ms" + if seconds < _ONE_SECOND: + return f"{seconds * 1000:.0f}ms" + return f"{seconds:.2f}s" + + def _format_score(score: EvalScore) -> str: icon = "·" if score.is_metric else ("✓" if score.passed else "✗") return f"- **{score.name}**: {score.value} {icon}" diff --git a/protest/history/plugin.py b/protest/history/plugin.py index 00c5e8b..e662e14 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -101,7 +101,7 @@ def _record_test(self, result: TestResult, *, passed: bool) -> None: self._test_suites[suite_name] = {} self._test_suites[suite_name][result.name] = { "passed": passed, - "duration": round(result.duration, 3), + "duration": round(result.duration, 5), } # -- Eval event handlers -------------------------------------------------- @@ -216,11 +216,16 @@ def _serialize_eval_case(case: EvalCaseResult) -> dict[str, Any]: Skipped scores are excluded: a ShortCircuit skip produces `EvalScore(value=False, skipped=True)` — serializing it as an assertion would look like a real failure in `history --compare` diffs. + + `case.duration` here is `EvalPayload.task_duration` (SUT-only timing, + set by the eval wrapper), not the full TestResult duration shown by live + reporters. Persisted at 10 µs precision so sub-ms SUTs don't all hash + down to 0.0 across runs. """ entry: dict[str, Any] = { "passed": case.passed, "is_error": case.is_error, - "duration": round(case.duration, 3), + "duration": round(case.duration, 5), "scores": { s.name: s.value for s in case.scores if s.is_metric and not s.skipped }, diff --git a/tests/evals/test_duration_precision.py b/tests/evals/test_duration_precision.py new file mode 100644 index 0000000..fdd47bf --- /dev/null +++ b/tests/evals/test_duration_precision.py @@ -0,0 +1,75 @@ +"""Tests for C3 — sub-millisecond duration handling. 
+ +The eval pipeline persists `EvalPayload.task_duration` (SUT-only timing). +For deterministic stubs / fast classifiers, that value is sub-millisecond +and the previous serializer (`round(_, 3)`) collapsed everything to `0.0`, +making run-over-run comparisons useless. The markdown renderer had the +matching bug — it printed `0ms` for any sub-ms task. +""" + +from __future__ import annotations + +from protest.evals.results_writer import _format_case_duration, _render_case +from protest.evals.types import EvalCaseResult +from protest.history.plugin import _serialize_eval_case + + +def _make_case(duration: float) -> EvalCaseResult: + return EvalCaseResult( + case_name="case", + node_id="suite::case", + scores=(), + duration=duration, + passed=True, + inputs="in", + output="out", + expected_output=None, + case_hash="h", + eval_hash="e", + is_error=False, + ) + + +class TestSerializerPrecision: + """`_serialize_eval_case` keeps 5-decimal precision (10 µs).""" + + def test_sub_millisecond_is_not_collapsed_to_zero(self) -> None: + case = _make_case(2.07e-05) # 20.7 µs + entry = _serialize_eval_case(case) + # Previously: 0.0 (round to 3 decimals) + # Now: 2e-05 (round to 5 decimals — 10 µs precision) + assert entry["duration"] > 0 + assert entry["duration"] == 2e-05 + + def test_distinct_sub_ms_values_remain_distinguishable(self) -> None: + e1 = _serialize_eval_case(_make_case(1.0e-05)) # 10 µs + e2 = _serialize_eval_case(_make_case(5.0e-05)) # 50 µs + assert e1["duration"] != e2["duration"] + + def test_millisecond_values_unchanged(self) -> None: + # >1ms: 5-decimal rounding produces the same numbers as 3-decimal. 
+ entry = _serialize_eval_case(_make_case(0.123)) + assert entry["duration"] == 0.123 + + +class TestMarkdownDurationFormat: + """`_format_case_duration` adapts unit to magnitude.""" + + def test_microseconds_for_sub_millisecond(self) -> None: + assert _format_case_duration(2.07e-05) == "21µs" + + def test_two_decimals_in_low_milliseconds(self) -> None: + # 2.5 ms — keep one fractional digit so 1ms vs 2ms is visible. + assert _format_case_duration(0.0025) == "2.50ms" + + def test_integer_milliseconds_in_mid_range(self) -> None: + assert _format_case_duration(0.135) == "135ms" + + def test_seconds_for_one_or_more(self) -> None: + assert _format_case_duration(2.5) == "2.50s" + + def test_renders_microseconds_in_case_header(self) -> None: + case = _make_case(2.07e-05) + rendered = _render_case(case) + # Header contains the duration; previously read "(0ms)". + assert "21µs" in rendered.splitlines()[0] From 1779d4a389e8071859aa828601fadb9edfc4984b Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 15:13:09 +0200 Subject: [PATCH 48/60] tests(console): add payload, prefix handling, and reporter tests - Added tests for 3-tuple payload behavior in `console.print` with flags `raw` and `prefix`. - Verified ASCII and Rich reporters correctly render messages with/without test prefixes and markup. - Updated `console.print` to support a new `prefix` parameter for suite-level outputs (e.g., "Results: ..."). - Adjusted `on_user_print` implementations across reporters to handle the `prefix` flag correctly. 
--- protest/console.py | 10 ++- protest/evals/results_writer.py | 2 +- protest/reporting/ascii.py | 5 +- protest/reporting/rich_reporter.py | 6 +- tests/test_console_print.py | 106 +++++++++++++++++++++++++++++ 5 files changed, 122 insertions(+), 7 deletions(-) create mode 100644 tests/test_console_print.py diff --git a/protest/console.py b/protest/console.py index 2d31607..56d2df6 100644 --- a/protest/console.py +++ b/protest/console.py @@ -13,6 +13,9 @@ async def pipeline(): # Raw mode — no markup processing console.print("debug: raw bytes here", raw=True) + # Section mode — no per-test prefix (use for suite/session-level lines) + console.print(f" Results: {run_dir}", prefix=False) + Messages go through the event bus → reporters display them inline. If no event bus is available (outside a protest session), falls back to stderr. """ @@ -26,7 +29,7 @@ async def pipeline(): from protest.execution.capture import get_event_bus, real_stderr -def print(msg: str, *, raw: bool = False) -> None: +def print(msg: str, *, raw: bool = False, prefix: bool = True) -> None: """Print a message that bypasses test capture. Goes through the event bus so reporters display it at the right place. @@ -35,6 +38,9 @@ def print(msg: str, *, raw: bool = False) -> None: Args: msg: The message to print. Supports Rich markup unless raw=True. raw: If True, no markup processing — message passed as-is. + prefix: If False, omit the per-test indent/bar prefix. Use for + suite-level or session-level lines (e.g. "Results: ") that + visually belong outside any single case's output block. """ bus = get_event_bus() if bus is None: @@ -49,7 +55,7 @@ def print(msg: str, *, raw: bool = False) -> None: # only caller, and console.print is never invoked from a signal handler. 
for handler_entry in bus._handlers.get(Event.USER_PRINT, []): with contextlib.suppress(Exception): - handler_entry.func((msg, raw)) + handler_entry.func((msg, raw, prefix)) def _fallback_print(msg: str, raw: bool) -> None: diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py index 983ac8b..71c3725 100644 --- a/protest/evals/results_writer.py +++ b/protest/evals/results_writer.py @@ -63,7 +63,7 @@ def on_eval_suite_end(self, report: Any) -> None: return run_dir = self._run_dirs.get(report.suite_name) if run_dir: - console.print(f" Results: {run_dir}") + console.print(f" Results: {run_dir}", prefix=False) # --------------------------------------------------------------------------- diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index 018bedf..2233a1c 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -318,10 +318,11 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: print(f" {line}") def on_user_print(self, data: Any) -> None: - msg, raw = data + msg, raw, prefix = data text = msg if raw else strip_markup(msg) stream = real_stdout() - stream.write(f" | {text}\n") + line = f" | {text}\n" if prefix and not raw else f"{text}\n" + stream.write(line) stream.flush() def on_eval_suite_end(self, report: Any) -> None: diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index bf93406..159f7bb 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -375,14 +375,16 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: self._print(f"[dim]{escaped_line}[/]") def on_user_print(self, data: Any) -> None: - msg, raw = data + msg, raw, prefix = data # Write to the real stdout, bypassing capture stream = real_stdout() c = Console(file=stream, highlight=False) if raw: c.print(msg, markup=False) - else: + elif prefix: c.print(f"[dim] │[/] {msg}") + else: + c.print(msg) def 
on_eval_suite_end(self, report: Any) -> None: if not isinstance(report, EvalSuiteReport): diff --git a/tests/test_console_print.py b/tests/test_console_print.py new file mode 100644 index 0000000..956d2e2 --- /dev/null +++ b/tests/test_console_print.py @@ -0,0 +1,106 @@ +"""Tests for `protest.console.print` — payload shape and reporter formatting. + +`console.print(msg, raw=False, prefix=True)` builds a 3-tuple payload +`(msg, raw, prefix)` dispatched on USER_PRINT. Each reporter unpacks the +three flags and renders accordingly: + +- default (prefix=True, raw=False): per-test bar prefix + markup +- raw=True: no prefix, no markup (debug bytes) +- prefix=False: no prefix, markup still active (suite-level lines) + +The third mode is what unblocks `EvalResultsWriter.on_eval_suite_end` so +`Results: ` doesn't visually attach to the previous case's output. +""" + +from __future__ import annotations + +import io +from unittest.mock import MagicMock + +import pytest + +from protest import console +from protest.events.types import Event +from protest.reporting.ascii import AsciiReporter +from protest.reporting.rich_reporter import RichReporter + + +@pytest.fixture +def stdout_buffer(monkeypatch: pytest.MonkeyPatch) -> io.StringIO: + buf = io.StringIO() + # `real_stdout()` is what reporters write to; patch at both reporter modules. 
+ monkeypatch.setattr("protest.reporting.ascii.real_stdout", lambda: buf) + monkeypatch.setattr("protest.reporting.rich_reporter.real_stdout", lambda: buf) + return buf + + +class TestAsciiReporterUserPrint: + """ASCII reporter handles the 3-tuple payload.""" + + def test_default_adds_bar_prefix(self, stdout_buffer: io.StringIO) -> None: + reporter = AsciiReporter() + reporter.on_user_print(("hello", False, True)) + assert stdout_buffer.getvalue() == " | hello\n" + + def test_raw_mode_no_prefix_no_markup(self, stdout_buffer: io.StringIO) -> None: + reporter = AsciiReporter() + reporter.on_user_print(("[bold]raw[/]", True, True)) + # raw bypasses both markup-strip and prefix + assert stdout_buffer.getvalue() == "[bold]raw[/]\n" + + def test_prefix_false_no_bar(self, stdout_buffer: io.StringIO) -> None: + reporter = AsciiReporter() + reporter.on_user_print(("Results: /tmp/foo", False, False)) + # No bar — visually a section line, not attached to a case. + assert stdout_buffer.getvalue() == "Results: /tmp/foo\n" + + +class TestRichReporterUserPrint: + """Rich reporter handles the 3-tuple payload.""" + + def _make_reporter(self) -> RichReporter: + # RichReporter pulls deps from the bus; we only exercise on_user_print. 
+ return RichReporter.__new__(RichReporter) + + def test_default_adds_bar_prefix(self, stdout_buffer: io.StringIO) -> None: + reporter = self._make_reporter() + reporter.on_user_print(("hello", False, True)) + assert "│" in stdout_buffer.getvalue() + assert "hello" in stdout_buffer.getvalue() + + def test_prefix_false_no_bar(self, stdout_buffer: io.StringIO) -> None: + reporter = self._make_reporter() + reporter.on_user_print(("Results: /tmp/foo", False, False)) + out = stdout_buffer.getvalue() + assert "│" not in out + assert "Results: /tmp/foo" in out + + +class TestConsolePrintPayload: + """`console.print` builds the payload and dispatches to handlers.""" + + def _captured_bus(self, monkeypatch: pytest.MonkeyPatch) -> list[tuple]: + captured: list[tuple] = [] + bus = MagicMock() + handler = MagicMock() + handler.func = lambda payload: captured.append(payload) + bus._handlers = {Event.USER_PRINT: [handler]} + monkeypatch.setattr("protest.console.get_event_bus", lambda: bus) + return captured + + def test_default_payload_carries_prefix_true( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + captured = self._captured_bus(monkeypatch) + console.print("hi") + assert captured == [("hi", False, True)] + + def test_prefix_false_propagates(self, monkeypatch: pytest.MonkeyPatch) -> None: + captured = self._captured_bus(monkeypatch) + console.print("section line", prefix=False) + assert captured == [("section line", False, False)] + + def test_raw_propagates(self, monkeypatch: pytest.MonkeyPatch) -> None: + captured = self._captured_bus(monkeypatch) + console.print("[raw]", raw=True) + assert captured == [("[raw]", True, True)] From 0f25a1b74a1487347b39b5a0cb660458537d132b Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 15:19:39 +0200 Subject: [PATCH 49/60] tests(console): surface handler errors and add fallback handling tests - Updated `console.print` to log handler exceptions to stderr, ensuring 
visibility for users. - Added tests for error logging, loop continuation despite stderr failures, and successful handler behavior. --- protest/console.py | 11 ++++++- tests/test_console_print.py | 65 +++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/protest/console.py b/protest/console.py index 56d2df6..f200d9d 100644 --- a/protest/console.py +++ b/protest/console.py @@ -54,8 +54,17 @@ def print(msg: str, *, raw: bool = False, prefix: bool = True) -> None: # that API to users. Kept private here — the framework itself is the # only caller, and console.print is never invoked from a signal handler. for handler_entry in bus._handlers.get(Event.USER_PRINT, []): - with contextlib.suppress(Exception): + try: handler_entry.func((msg, raw, prefix)) + except Exception as exc: + # Surface handler failures (typically: malformed Rich markup) on + # real stderr so users don't conclude `console.print` is silently + # broken. Wrapped in suppress() to guarantee the loop continues + # even if the fallback write itself raises. + with contextlib.suppress(Exception): + stream = real_stderr() + stream.write(f"console.print: handler raised {exc!r}\n") + stream.flush() def _fallback_print(msg: str, raw: bool) -> None: diff --git a/tests/test_console_print.py b/tests/test_console_print.py index 956d2e2..6bac47e 100644 --- a/tests/test_console_print.py +++ b/tests/test_console_print.py @@ -104,3 +104,68 @@ def test_raw_propagates(self, monkeypatch: pytest.MonkeyPatch) -> None: captured = self._captured_bus(monkeypatch) console.print("[raw]", raw=True) assert captured == [("[raw]", True, True)] + + +class TestConsolePrintHandlerErrors: + """Handler failures must surface on stderr instead of disappearing. + + Earlier behavior: `contextlib.suppress(Exception)` swallowed any handler + raise. A reporter bug (e.g. malformed Rich markup) made `console.print` + silently no-op — users assumed the call did nothing. 
+ """ + + def _bus_with_failing_handler( + self, monkeypatch: pytest.MonkeyPatch, exc: Exception + ) -> None: + bus = MagicMock() + handler = MagicMock() + + def boom(_payload: tuple) -> None: + raise exc + + handler.func = boom + bus._handlers = {Event.USER_PRINT: [handler]} + monkeypatch.setattr("protest.console.get_event_bus", lambda: bus) + + def test_handler_exception_is_surfaced_on_stderr( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + stderr = io.StringIO() + monkeypatch.setattr("protest.console.real_stderr", lambda: stderr) + self._bus_with_failing_handler(monkeypatch, RuntimeError("boom")) + + console.print("anything") + + out = stderr.getvalue() + assert "console.print: handler raised" in out + assert "RuntimeError" in out + assert "boom" in out + + def test_loop_continues_when_real_stderr_itself_fails( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Defense in depth: if logging the error also fails, no cascade.""" + + def raising_stderr() -> object: + raise OSError("stderr broken") + + monkeypatch.setattr("protest.console.real_stderr", raising_stderr) + self._bus_with_failing_handler(monkeypatch, RuntimeError("boom")) + + # Must not raise — the outer suppress() is the last line of defense. 
+ console.print("anything") + + def test_successful_handler_does_not_touch_stderr( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + stderr = io.StringIO() + monkeypatch.setattr("protest.console.real_stderr", lambda: stderr) + + bus = MagicMock() + handler = MagicMock() + handler.func = lambda _payload: None # no-op, no raise + bus._handlers = {Event.USER_PRINT: [handler]} + monkeypatch.setattr("protest.console.get_event_bus", lambda: bus) + + console.print("ok") + assert stderr.getvalue() == "" From 7a78560134d2aca9882714aff35688712156a6ac Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 15:28:47 +0200 Subject: [PATCH 50/60] tests(history): ensure clean_dirty concurrency preserves all appends - Added tests to validate `clean_dirty` concurrency handling, ensuring no appends are silently dropped due to interleaved truncate operations. - Updated `clean_dirty` logic to use `_exclusive_file_lock` to serialize file read and write operations. - Adjusted test suite to cover concurrent `append_entry` and `clean_dirty` interactions, verifying all entries remain intact. --- protest/history/storage.py | 43 ++++---- protest/reporting/rich_reporter.py | 15 +-- .../history/test_append_entry_concurrency.py | 98 ++++++++++++++++++- 3 files changed, 132 insertions(+), 24 deletions(-) diff --git a/protest/history/storage.py b/protest/history/storage.py index 8f89fa4..3797335 100644 --- a/protest/history/storage.py +++ b/protest/history/storage.py @@ -181,6 +181,10 @@ def clean_dirty(history_dir: Path | None = None) -> int: """Remove entries where git.dirty=True AND git.commit matches current HEAD. Returns the number of entries removed. + + The read+write happens under `_exclusive_file_lock` so a concurrent + `append_entry` cannot land between our read and our truncate (which + would silently drop the new entry). 
""" path = (history_dir or DEFAULT_HISTORY_DIR) / HISTORY_FILE if not path.exists(): @@ -197,22 +201,27 @@ def clean_dirty(history_dir: Path | None = None) -> int: except (FileNotFoundError, subprocess.CalledProcessError): return 0 - lines = path.read_text().strip().splitlines() - kept: list[str] = [] - removed = 0 + with open(path, "r+") as f, _exclusive_file_lock(f): + f.seek(0) + lines = f.read().strip().splitlines() + kept: list[str] = [] + removed = 0 - for line in lines: - try: - entry = json.loads(line) - except json.JSONDecodeError: - kept.append(line) - continue - git = entry.get("git") or {} - if git.get("dirty") and git.get("commit") == current_commit: - removed += 1 - else: - kept.append(line) - - if removed: - path.write_text("\n".join(kept) + "\n" if kept else "") + for line in lines: + try: + entry = json.loads(line) + except json.JSONDecodeError: + kept.append(line) + continue + git = entry.get("git") or {} + if git.get("dirty") and git.get("commit") == current_commit: + removed += 1 + else: + kept.append(line) + + if removed: + f.seek(0) + f.truncate() + if kept: + f.write("\n".join(kept) + "\n") return removed diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 159f7bb..e30584c 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -34,6 +34,11 @@ ) from protest.reporting.verbosity import Verbosity +# Per-run pass-rate thresholds for the eval suite color cue. +# Strict default — green only if every case passes; yellow above half. 
+_PERFECT_PASS_RATE = 1.0 +_PARTIAL_PASS_RATE = 0.5 + def _short_label(name: str, node_id: str) -> str: """name + [case_id] from node_id.""" @@ -416,18 +421,16 @@ def on_eval_suite_end(self, report: Any) -> None: self._print( f" [cyan]Eval: {report.suite_name} ({report.total_count} cases)[/]" ) - full_pass = 100 - half_pass = 50 - rate_pct = report.pass_rate * full_pass + rate = report.pass_rate color = ( "green" - if rate_pct >= full_pass + if rate >= _PERFECT_PASS_RATE else "yellow" - if rate_pct >= half_pass + if rate >= _PARTIAL_PASS_RATE else "red" ) self._print( - f" [{color}]Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)[/]" + f" [{color}]Passed: {report.passed_count}/{report.total_count} ({rate * 100:.1f}%)[/]" ) if report.total_task_tokens > 0 or report.total_task_cost > 0: self._print( diff --git a/tests/history/test_append_entry_concurrency.py b/tests/history/test_append_entry_concurrency.py index 5bd3d79..ab82739 100644 --- a/tests/history/test_append_entry_concurrency.py +++ b/tests/history/test_append_entry_concurrency.py @@ -4,15 +4,19 @@ multiprocess-concurrency case: N workers append concurrently to the same file; every line must be parseable JSON. Without locking, interleaved writes larger than `PIPE_BUF` would corrupt lines and the test would fail. + +Also covers `clean_dirty` concurrency: a concurrent `append_entry` while +`clean_dirty` is running must not be silently dropped by the truncate. 
""" from __future__ import annotations import json import multiprocessing as mp +import subprocess from pathlib import Path -from protest.history.storage import append_entry +from protest.history.storage import append_entry, clean_dirty def _worker_append(args: tuple[str, int, int]) -> None: @@ -28,6 +32,30 @@ def _worker_append(args: tuple[str, int, int]) -> None: append_entry(path, {"worker": worker_id, "i": i, "pad": padding}) +def _worker_append_innocent(args: tuple[str, int, int]) -> None: + """Append entries on an unrelated commit — `clean_dirty` must not touch them.""" + path_str, worker_id, count = args + path = Path(path_str) + for i in range(count): + append_entry( + path, + { + "worker": worker_id, + "i": i, + "git": {"commit": "innocent_commit", "dirty": False}, + "suites": {}, + }, + ) + + +def _worker_clean_dirty(args: tuple[str, int]) -> None: + """Repeatedly run clean_dirty while another worker appends.""" + path_str, count = args + history_dir = Path(path_str).parent + for _ in range(count): + clean_dirty(history_dir=history_dir) + + class TestAppendEntryBasic: """Single-writer invariants.""" @@ -88,3 +116,71 @@ def test_concurrent_writers_do_not_interleave(self, tmp_path: Path) -> None: ) assert counts_per_worker == dict.fromkeys(range(workers), per_worker) + + +class TestCleanDirtyConcurrency: + """`clean_dirty` and `append_entry` must serialize via the same lock. + + The dangerous race: clean_dirty does (read → compute kept → truncate → + rewrite). Without a lock, an `append_entry` landing between the read + and the truncate is silently overwritten — the new entry disappears. + Here we run both in parallel and check the conserved quantity: every + appended "innocent" entry (different commit) must survive. + """ + + def test_concurrent_append_not_dropped_by_clean_dirty(self, tmp_path: Path) -> None: + # Skip outside a git repo — clean_dirty depends on `git rev-parse HEAD`. 
+ try: + subprocess.run( + ["git", "rev-parse", "HEAD"], # noqa: S607 + capture_output=True, + text=True, + timeout=5, + check=True, + ) + except (FileNotFoundError, subprocess.CalledProcessError): + return + + path = tmp_path / "history.jsonl" + # Pre-populate with one no-op entry so the file exists for clean_dirty. + append_entry( + path, + { + "worker": -1, + "git": {"commit": "preexisting", "dirty": False}, + "suites": {}, + }, + ) + + per_worker = 30 + ctx = mp.get_context("spawn") + with ctx.Pool(2) as pool: + pool.starmap( + _dispatch_worker, + [ + ("append", str(path), 0, per_worker), + ("clean", str(path), 0, per_worker), + ], + ) + + lines = path.read_text().splitlines() + # Every line still parses (no torn writes). + innocent_count = 0 + for raw in lines: + entry = json.loads(raw) + if entry.get("git", {}).get("commit") == "innocent_commit": + innocent_count += 1 + # All `per_worker` innocent appends survived — none silently + # discarded by an interleaved clean_dirty truncate. + assert innocent_count == per_worker, ( + f"expected {per_worker} innocent entries, got {innocent_count} — " + "concurrent clean_dirty dropped some appends" + ) + + +def _dispatch_worker(kind: str, path_str: str, worker_id: int, count: int) -> None: + """Top-level dispatcher so spawn() can pickle the call.""" + if kind == "append": + _worker_append_innocent((path_str, worker_id, count)) + else: + _worker_clean_dirty((path_str, count)) From 2289485808c82dd986ea8008724cb5ffcd099699 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 15:34:11 +0200 Subject: [PATCH 51/60] docs(evals): add details on native LLM support and evaluator enhancements - Expanded documentation to introduce native LLM evals, including pass/fail and numeric scoring with JSONL history. - Clarified `EvalCase` benefits, tags usage, and the `metadata` dict structure. 
- Updated evaluator execution order, including `ShortCircuit` behavior and gating logic. - Improved `ModelInfo` explanation for history tracking and clarified its passive role in model configuration. - Added CLI examples for tags, history comparison, and evaluation workflows. --- README.md | 19 +++++++++++++++++++ docs/evals.md | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 32af39d..41b04c2 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,24 @@ CODES = ForEach([200, 201]) def test_status(code: Annotated[int, From(CODES)]): ... ``` +### Native LLM Evals + +Score model outputs alongside your tests — same fixtures, same parallelism, same `protest` CLI. Cases get pass/fail + numeric metrics, persisted to JSONL for run-over-run comparison. + +```python +@chatbot_suite.eval(evaluators=[contains_keywords(keywords=["paris"])]) +async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: + return await my_agent(case.inputs) +``` + +```bash +protest eval evals.session:session +protest history --runs # recent runs +protest history --compare # current vs previous +``` + +See [Evals docs](https://renaudcepre.github.io/protest/evals/) for evaluators, judges, history tracking. + --- ## Quick Start @@ -120,6 +138,7 @@ protest run module:session --ctrf-output r.json # CTRF report for CI/CD - **Plugin system** - Custom reporters, filters - **Last-failed mode** - Re-run only failed tests with `--lf` - **CTRF reports** - Standardized JSON for CI/CD integration +- **Native LLM evals** - Scored cases, JSONL history, `protest eval` (see [evals docs](https://renaudcepre.github.io/protest/evals/)) ## Why Not pytest? 
diff --git a/docs/evals.md b/docs/evals.md index 84b25ca..1b01534 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -106,7 +106,29 @@ cases = ForEach([ | `expected` | `Any` | Expected output (passed to evaluators as `ctx.expected_output`) | | `name` | `str` | Case identifier (used in test IDs and history) | | `evaluators` | `list` | Per-case evaluators (added to suite-level ones) | -| `metadata` | `dict` | Arbitrary metadata | +| `metadata` | `dict` | Arbitrary metadata (special key: `"tags"` — see below) | + +### Why `EvalCase` and not a dict? + +The runtime reads case data via attribute access (`case.expected`, `case.metadata`, `case.evaluators`), not by string key. A plain dict would compile fine but blow up at runtime, and you'd lose the IDE refactor/Ctrl+Click affordances. Making `EvalCase` a typed dataclass surfaces typos at import time and keeps the contract one obvious place — same trade-off as `Annotated[T, Use(fn)]` over pytest's name-based fixture lookup. + +### Tags via `metadata={"tags": [...]}` + +Per-case tags piggyback on the `metadata` dict under the reserved key `"tags"`. They flow through the test collector and become first-class on the resulting `TestItem`, so `protest eval --tag slow` works out of the box. + +```python +EvalCase( + inputs="Long doc to summarize…", + expected="…", + name="long_doc_case", + metadata={"tags": ["slow", "summarization"]}, +) +``` + +```bash +protest eval evals.session:session --tag slow +protest eval evals.session:session --no-tag slow +``` ## Evaluators @@ -236,6 +258,25 @@ evaluators=[ `ShortCircuit` is a group of ordered evaluators. The first `Verdict=False` stops the group. Evaluators outside the `ShortCircuit` always run. 
+Execution order — `evaluators=[a, ShortCircuit([b, c]), d]`: + +``` +a ← always runs +├─ pass → continue +└─ fail → continue (a is outside the group, doesn't gate b/c) + +[ShortCircuit group ──────────────────────────────────┐ + b ← always runs (first in group) │ + ├─ pass → c │ + └─ fail → c skipped (Verdict=False stops group) │ + c ← runs only if b passed │ +└─────────────────────────────────────────────────────┘ + +d ← always runs (outside the group) +``` + +The list `evaluators=[…]` is sequential at the top level; a `ShortCircuit` is just a sub-group that may stop early. Use it to gate expensive evaluators (LLM judges) behind cheap ones (keyword/regex checks). + ### Using Evaluators ```python @@ -302,7 +343,11 @@ async def pipeline_eval( ## ModelInfo -`ModelInfo` is a **label for history tracking** — it does not configure or route to any model. It records which model produced the results so you can compare runs. +!!! warning "ModelInfo does NOT configure a model" + + Despite the name, `ModelInfo` is a **passive label** for history tracking. It does not route requests, set a temperature, pick a provider, or otherwise touch any LLM. The actual model wiring happens inside *your* task function (or the agent / SDK it calls). `ModelInfo` exists solely so `protest history` can attribute results to a specific model and compare runs side-by-side. + +`ModelInfo` records which model produced the results so you can compare runs. 
```python suite = EvalSuite("pipeline", model=ModelInfo(name="qwen-2.5")) From 2f0bfcbc8c8d8274a81d96c325a2aa637dba9f4e Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sun, 26 Apr 2026 10:48:17 +0200 Subject: [PATCH 52/60] refactor(evals): migrate `tags` from `metadata` to first-class `EvalCase` field and update tests --- docs/evals.md | 7 ++++--- examples/yorkshire/evals/dataset.py | 24 +++++++++++------------ protest/core/collector.py | 6 ++---- protest/evals/evaluator.py | 1 + tests/evals/test_evalcase_tags_wiring.py | 25 ++++++++++++------------ 5 files changed, 31 insertions(+), 32 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index 1b01534..fbff5bc 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -112,16 +112,17 @@ cases = ForEach([ The runtime reads case data via attribute access (`case.expected`, `case.metadata`, `case.evaluators`), not by string key. A plain dict would compile fine but blow up at runtime, and you'd lose the IDE refactor/Ctrl+Click affordances. Making `EvalCase` a typed dataclass surfaces typos at import time and keeps the contract one obvious place — same trade-off as `Annotated[T, Use(fn)]` over pytest's name-based fixture lookup. -### Tags via `metadata={"tags": [...]}` +### Per-case `tags` -Per-case tags piggyback on the `metadata` dict under the reserved key `"tags"`. They flow through the test collector and become first-class on the resulting `TestItem`, so `protest eval --tag slow` works out of the box. +`EvalCase.tags` is a first-class field. Tags flow through the test collector and become first-class on the resulting `TestItem`, so `protest eval --tag slow` works out of the box. Use `metadata` for any other free-form annotation the framework should ignore. 
```python EvalCase( inputs="Long doc to summarize…", expected="…", name="long_doc_case", - metadata={"tags": ["slow", "summarization"]}, + tags=["slow", "summarization"], + metadata={"source_dataset": "v3"}, # opaque to the framework ) ``` diff --git a/examples/yorkshire/evals/dataset.py b/examples/yorkshire/evals/dataset.py index 89b3362..423ad76 100644 --- a/examples/yorkshire/evals/dataset.py +++ b/examples/yorkshire/evals/dataset.py @@ -18,7 +18,7 @@ name="weight_question", inputs="How much does a Yorkshire Terrier weigh?", expected="2-3 kg", - metadata={"tags": ["factual", "size"]}, + tags=["factual", "size"], evaluators=[ contains_keywords(keywords=["2-3 kg", "teacup", "mini", "standard"]) ], @@ -27,21 +27,21 @@ name="grooming_basics", inputs="How often should I brush my Yorkie?", expected="daily brushing for long coats", - metadata={"tags": ["factual", "grooming"]}, + tags=["factual", "grooming"], evaluators=[contains_keywords(keywords=["daily", "brushing", "long"])], ), EvalCase( name="diet_advice", inputs="What should I feed my Yorkshire Terrier?", expected="small breed formula, 2-3 meals", - metadata={"tags": ["factual", "diet"]}, + tags=["factual", "diet"], evaluators=[contains_keywords(keywords=["small breed", "meals", "avoid"])], ), EvalCase( name="exercise_needs", inputs="How much exercise does a Yorkie need?", expected="30 minutes daily", - metadata={"tags": ["factual", "exercise"]}, + tags=["factual", "exercise"], evaluators=[contains_keywords(keywords=["30 minutes", "walk"])], ), # --- Temperament --- @@ -49,7 +49,7 @@ name="personality", inputs="What is the temperament of a Yorkshire Terrier?", expected="bold, confident, affectionate", - metadata={"tags": ["factual", "temperament"]}, + tags=["factual", "temperament"], evaluators=[ contains_keywords(keywords=["bold", "confident", "affectionate"]) ], @@ -59,14 +59,14 @@ name="puppy_care", inputs="How do I care for a Yorkshire puppy?", expected="extra care, socialization", - metadata={"tags": 
["factual", "puppies"]}, + tags=["factual", "puppies"], evaluators=[contains_keywords(keywords=["12 months", "socialization"])], ), EvalCase( name="senior_care", inputs="My Yorkie is getting old, what should I change?", expected="adjust exercise, more vet visits", - metadata={"tags": ["factual", "seniors"]}, + tags=["factual", "seniors"], evaluators=[contains_keywords(keywords=["senior", "exercise", "vet"])], ), # --- Hallucination checks --- @@ -74,7 +74,7 @@ name="no_cat_advice", inputs="Tell me about Yorkshire Terrier health", expected="dental problems, patellar luxation", - metadata={"tags": ["safety"]}, + tags=["safety"], evaluators=[ does_not_contain(forbidden=["cat", "feline", "persian"]), contains_keywords(keywords=["dental", "health"]), @@ -84,7 +84,7 @@ name="no_made_up_breeds", inputs="What jobs can a Yorkie do?", expected="therapy dogs, companions", - metadata={"tags": ["safety"]}, + tags=["safety"], evaluators=[ does_not_contain(forbidden=["labrador", "golden retriever", "poodle"]), contains_keywords(keywords=["therapy", "companion"]), @@ -95,14 +95,14 @@ name="unknown_topic", inputs="What is the GDP of France?", expected="I'm not sure", - metadata={"tags": ["edge_case"]}, + tags=["edge_case"], evaluators=[contains_keywords(keywords=["not sure", "specialize"])], ), EvalCase( name="empty_question", inputs="", expected="I'm not sure", - metadata={"tags": ["edge_case"]}, + tags=["edge_case"], evaluators=[contains_keywords(keywords=["not sure"])], ), # --- Known weak spot (chatbot doesn't know about training treats) --- @@ -110,7 +110,7 @@ name="training_treats", inputs="What treats are best for training a Yorkie?", expected="small soft treats, positive reinforcement", - metadata={"tags": ["factual", "training"]}, + tags=["factual", "training"], evaluators=[ contains_keywords(keywords=["treats", "small", "soft", "reward"]) ], diff --git a/protest/core/collector.py b/protest/core/collector.py index 72743e1..565cb71 100644 --- a/protest/core/collector.py +++ 
b/protest/core/collector.py @@ -179,10 +179,8 @@ def _expand_registration( item_tags = tags.copy() for value in combination: - if isinstance(value, EvalCase): - case_tags = value.metadata.get("tags") - if case_tags: - item_tags.update(case_tags) + if isinstance(value, EvalCase) and value.tags: + item_tags.update(value.tags) items.append( TestItem( diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 80881f9..b493967 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -146,6 +146,7 @@ def my_eval(case: Annotated[EvalCase, From(cases)]) -> str: name: str expected: Any = None evaluators: list[Any] = field(default_factory=list) + tags: list[str] = field(default_factory=list) metadata: dict[str, Any] = field(default_factory=dict) def __post_init__(self) -> None: diff --git a/tests/evals/test_evalcase_tags_wiring.py b/tests/evals/test_evalcase_tags_wiring.py index dbf9649..05ff8ca 100644 --- a/tests/evals/test_evalcase_tags_wiring.py +++ b/tests/evals/test_evalcase_tags_wiring.py @@ -1,8 +1,8 @@ -"""Tests for `EvalCase.metadata['tags']` → `TestItem.tags` wiring. +"""Tests for `EvalCase.tags` → `TestItem.tags` wiring. -Verifies that tags declared on an `EvalCase` via `metadata={'tags': [...]}` -are merged into the resulting `TestItem.tags` set, so that the -`TagFilterPlugin` (which filters on `TestItem.tags`) can honor them. +Verifies that tags declared on an `EvalCase` via `tags=[...]` are merged +into the resulting `TestItem.tags` set, so that the `TagFilterPlugin` +(which filters on `TestItem.tags`) can honor them. Eval functions are defined at module level to avoid `get_type_hints()` resolution issues that occur with nested function definitions. @@ -19,21 +19,19 @@ from protest.tags.plugin import TagFilterPlugin # Module-level case sources so `get_type_hints()` can resolve Annotated args. 
-_single_tagged = [EvalCase(inputs="x", name="c1", metadata={"tags": ["safety"]})] -_multi_tagged = [ - EvalCase(inputs="x", name="c1", metadata={"tags": ["safety", "factual"]}) -] +_single_tagged = [EvalCase(inputs="x", name="c1", tags=["safety"])] +_multi_tagged = [EvalCase(inputs="x", name="c1", tags=["safety", "factual"])] _mixed_cases = [ - EvalCase(inputs="x", name="c1", metadata={"tags": ["safety"]}), - EvalCase(inputs="y", name="c2", metadata={"tags": ["factual"]}), + EvalCase(inputs="x", name="c1", tags=["safety"]), + EvalCase(inputs="y", name="c2", tags=["factual"]), EvalCase(inputs="z", name="c3"), ] _no_tags_metadata = [ EvalCase(inputs="x", name="c1", metadata={"other": "value"}), ] _filter_cases = [ - EvalCase(inputs="a", name="c_safety", metadata={"tags": ["safety"]}), - EvalCase(inputs="b", name="c_factual", metadata={"tags": ["factual"]}), + EvalCase(inputs="a", name="c_safety", tags=["safety"]), + EvalCase(inputs="b", name="c_factual", tags=["factual"]), ] @@ -73,7 +71,8 @@ def test_cases_get_distinct_tags(self) -> None: assert "safety" not in by_name["c2"].tags assert by_name["c3"].tags == set() - def test_case_without_tags_metadata_ok(self) -> None: + def test_case_with_metadata_only_has_no_tags(self) -> None: + """`metadata` is user-free: no key (including 'tags') is interpreted.""" items = _collect(_no_tags_metadata) assert items[0].tags == set() From fa5a7ee44fe50086c71d524d1d8187d9bc46107e Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sun, 26 Apr 2026 22:27:50 +0200 Subject: [PATCH 53/60] tests(evals): add validation for multiple `EvalCase` params and CLI flag exclusion - Added decorator-time validation to ensure eval functions declare only one `EvalCase` parameter, raising clear errors on conflicts. - Introduced tests for multiple `EvalCase` parameter rejection, covering both base and subclass scenarios. 
- Updated CLI parser to exclude eval-only flags (e.g., `--show-output`) from `protest run`, with tests verifying proper error handling and help content omissions. - Enhanced DI type hint resolution to handle `TYPE_CHECKING` imports and enclosing-local references. --- docs/core-concepts/dependency-injection.md | 18 ++++ protest/cli/main.py | 36 ++++--- protest/di/hints.py | 85 ++++++++++++++--- protest/evals/types.py | 10 +- protest/evals/wrapper.py | 39 +++++++- protest/exceptions.py | 20 ++++ tests/cli/test_run_command.py | 31 ++++++ tests/evals/test_multiple_evalcase_params.py | 99 ++++++++++++++++++++ 8 files changed, 310 insertions(+), 28 deletions(-) create mode 100644 tests/evals/test_multiple_evalcase_params.py diff --git a/docs/core-concepts/dependency-injection.md b/docs/core-concepts/dependency-injection.md index 2aba9c2..3e3d3e3 100644 --- a/docs/core-concepts/dependency-injection.md +++ b/docs/core-concepts/dependency-injection.md @@ -24,6 +24,24 @@ async def test_query(db: Annotated[Database, Use(database)]): The `Use` marker takes a **function reference**, not a string. This makes dependencies explicit and enables IDE navigation. +### `Type` is a hint, not a runtime check + +In `Annotated[Type, Use(fixture)]`, `Type` is a **type hint for your IDE and static checkers** — ProTest does not validate at runtime that `fixture()` actually returns a `Type`. This matches FastAPI's behavior with `Annotated[Type, Depends(fn)]`: the type is taken on faith, not enforced. + +```python +@fixture() +def returns_str() -> str: + return "hello" + +@session.test() +def test_mismatch(value: Annotated[int, Use(returns_str)]): + # `value` is actually a `str` at runtime — ProTest will not warn. + # The mismatch surfaces only when `value` is used as an `int`. + ... +``` + +In practice this is rarely a problem: keep your fixture return types and your call-site annotations aligned, and rely on `mypy`/`pyright` for the static check on the fixture itself. 
+ ## Why Function References? Using function references instead of string names has benefits: diff --git a/protest/cli/main.py b/protest/cli/main.py index 574825f..16e76ee 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -175,11 +175,19 @@ def _create_base_parser() -> argparse.ArgumentParser: return parser -def _create_run_parser() -> argparse.ArgumentParser: - """Base parser with core run options. Plugin options added dynamically.""" +def _create_run_parser( + *, + include_eval_options: bool = False, +) -> argparse.ArgumentParser: + """Base parser with core run options. Plugin options added dynamically. + + `include_eval_options=True` adds eval-only flags (e.g. ``--show-output``). + Set when building the parser for ``protest eval``; left False for + ``protest run`` so the eval-only flags don't pollute the test help/parsing. + """ parser = argparse.ArgumentParser( - prog="protest run", - description="Run tests", + prog="protest eval" if include_eval_options else "protest run", + description="Run evals" if include_eval_options else "Run tests", ) parser.add_argument( "target", @@ -231,12 +239,6 @@ def _create_run_parser() -> argparse.ArgumentParser: default=0, help="Increase verbosity (-v for lifecycle, -vv for fixtures)", ) - parser.add_argument( - "--show-output", - dest="show_output", - action="store_true", - help="Show eval inputs/output/expected per case", - ) parser.add_argument( "--show-logs", dest="show_logs", @@ -246,6 +248,13 @@ def _create_run_parser() -> argparse.ArgumentParser: metavar="LEVEL", help="Show captured log records (default: INFO+)", ) + if include_eval_options: + parser.add_argument( + "--show-output", + dest="show_output", + action="store_true", + help="Show eval inputs/output/expected per case", + ) return parser @@ -261,6 +270,7 @@ def _handle_history_command() -> None: def _handle_run_command(kind_filter: str | None = None) -> None: """Handle 'protest run' / 'protest eval' with two-phase parsing.""" argv = sys.argv[2:] + 
include_eval_options = kind_filter == "eval" # Phase 1: Parse base args to get target base_parser = _create_base_parser() @@ -268,14 +278,14 @@ def _handle_run_command(kind_filter: str | None = None) -> None: # If --help without target, show full help with all plugin options if ("--help" in remaining or "-h" in remaining) and not base_args.target: - full_parser = _create_run_parser() + full_parser = _create_run_parser(include_eval_options=include_eval_options) for plugin_class in ProTestSession.default_plugin_classes(): plugin_class.add_cli_options(full_parser) full_parser.parse_args(["--help"]) return if not base_args.target: - _create_run_parser().print_help() + _create_run_parser(include_eval_options=include_eval_options).print_help() sys.exit(1) # Phase 2: Load session and register default plugins @@ -289,7 +299,7 @@ def _handle_run_command(kind_filter: str | None = None) -> None: session.register_default_plugins() # Phase 3: Build full parser with plugin options - full_parser = _create_run_parser() + full_parser = _create_run_parser(include_eval_options=include_eval_options) for plugin_class in session.plugin_classes: plugin_class.add_cli_options(full_parser) diff --git a/protest/di/hints.py b/protest/di/hints.py index bd6a89b..0af61bc 100644 --- a/protest/di/hints.py +++ b/protest/di/hints.py @@ -1,15 +1,60 @@ """Type hints resolution with PEP 563 / TYPE_CHECKING compatibility. -Shared by the core DI system and evals runner. Handles two failure modes: +Shared by the core DI system and evals runner. ``get_type_hints()`` alone +fails in two scenarios commonly encountered in ProTest user code; this +module wraps it with a cascade of fallbacks. -1. Local fixtures — ``from __future__ import annotations`` stringifies - annotations; names defined in local scopes aren't in ``func.__globals__``. - Fix: collect locals from the call stack. 
+------------------------------------------------------------------------ +Failure mode 1 — names defined in a local scope (PEP 563 stringification) +------------------------------------------------------------------------ -2. TYPE_CHECKING-only types — e.g. ``AsyncDriver`` imported only under - ``if TYPE_CHECKING:``. Fix: substitute ``Any`` for each unresolvable - name. The type itself is irrelevant for DI; only the ``Use(...)`` - marker inside ``Annotated[...]`` matters. +With ``from __future__ import annotations``, all annotations are stored +as strings. ``get_type_hints()`` resolves them via ``eval()`` inside +``func.__globals__`` only. Names defined in the scope of an enclosing +function are NOT in ``__globals__``, so resolution raises ``NameError``. + +The most common form of this in ProTest is a parametrized eval defined +inside a helper, where the case source is a local variable:: + + def _build_suite(cases): + source = ForEach(cases) # local to _build_suite + + @suite.eval() + def my_eval(case: Annotated[EvalCase, From(source)]) -> str: + # ^^^^^^ refers to `source`, + # which is local to _build_suite + return str(case.inputs) + +When ``get_type_hints(my_eval)`` evaluates ``"Annotated[EvalCase, From(source)]"`` +inside ``my_eval.__globals__``, ``source`` is undefined → ``NameError``. + +Fix: walk the call stack with ``inspect.stack()`` and merge every frame's +``f_locals`` into a ``localns`` dict that we pass to ``get_type_hints()`` +on retry. This is registration-time only (decorator evaluation), never +in a hot path, so the cost of ``inspect.stack()`` is acceptable. + +Trade-off: ``localns`` ends up containing every local from every frame +on the stack. Name collisions silently resolve to the most recently +seen binding. In practice no collision has been observed in this project, +because annotations only reference DI markers (``Use``/``From``) plus +small, distinctively-named locals. 
+ +------------------------------------------------- +Failure mode 2 — TYPE_CHECKING-only imported types +------------------------------------------------- + +Types imported under ``if TYPE_CHECKING:`` are absent at runtime, so +``get_type_hints()`` raises ``NameError`` regardless of ``localns``:: + + if TYPE_CHECKING: + from heavy_module import HeavyType + + @factory() + def make() -> HeavyType: ... + +Fix: substitute ``Any`` for each unresolvable name and retry. The exact +type is irrelevant for DI dispatch — only the ``Use(...)``/``From(...)`` +marker inside ``Annotated[...]`` is consulted at injection time. """ from __future__ import annotations @@ -21,11 +66,19 @@ def get_type_hints_compat(func: Any) -> dict[str, Any]: - """Resolve type hints with PEP 563 / TYPE_CHECKING fallbacks.""" + """Resolve type hints with PEP 563 / TYPE_CHECKING fallbacks. + + See module docstring for the failure modes this function exists to + handle. Cascade: (1) plain call, (2) retry with stack-collected + ``localns``, (3) retry while substituting ``Any`` for unresolvable + names. All fallbacks run at registration time only. + """ with contextlib.suppress(Exception): return get_type_hints(func, include_extras=True) - # Build a namespace from the entire call stack (covers local fixtures). + # Build a namespace from the entire call stack so that locals from + # an enclosing helper (e.g. `source = ForEach(...)`) become visible + # to `get_type_hints`'s eval. See module docstring, failure mode 1. localns: dict[str, Any] = {} with contextlib.suppress(Exception): for frame_info in inspect.stack(): @@ -34,7 +87,8 @@ def get_type_hints_compat(func: Any) -> dict[str, Any]: with contextlib.suppress(Exception): return get_type_hints(func, localns=localns, include_extras=True) - # TYPE_CHECKING fallback: substitute Any for unresolvable names. + # Last resort for TYPE_CHECKING-only types. See module docstring, + # failure mode 2. 
return _get_type_hints_substituting_any(func, localns) @@ -42,7 +96,14 @@ def _get_type_hints_substituting_any( func: Any, localns: dict[str, Any], ) -> dict[str, Any]: - """Retry get_type_hints, replacing each NameError'd name with Any.""" + """Retry ``get_type_hints``, replacing each NameError'd name with ``Any``. + + Used as a last-resort fallback when a referenced type is unresolvable + at runtime (typically a TYPE_CHECKING-only import). The substituted + ``Any`` is only used as a placeholder so resolution can complete; the + DI system reads the ``Use(...)``/``From(...)`` marker out of the + ``Annotated[...]``, not the underlying type. + """ localns = dict(localns) for _ in range(20): try: diff --git a/protest/evals/types.py b/protest/evals/types.py index 141047b..0c628ab 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -309,7 +309,15 @@ def score_stats(self, name: str) -> ScoreStats: return ScoreStats.from_values(name, values) def all_score_stats(self) -> list[ScoreStats]: - return [self.score_stats(n) for n in sorted(self.score_names())] + # Single pass groups values by score name, avoiding O(n_cases x n_names) + # of calling score_stats(n) per name. score_stats(name) is preserved as + # a public single-name accessor. 
+ by_name: dict[str, list[float]] = {} + for c in self.cases: + for s in c.scores: + if s.is_metric: + by_name.setdefault(s.name, []).append(float(s.value)) + return [ScoreStats.from_values(n, by_name[n]) for n in sorted(by_name)] @property def total_task_input_tokens(self) -> int: diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index 82b25a8..f6c074f 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -10,8 +10,9 @@ import asyncio import functools import time -from typing import Any +from typing import Annotated, Any, get_args, get_origin +from protest.di.hints import get_type_hints_compat from protest.entities.events import EvalPayload, EvalScoreEntry from protest.evals.evaluator import ( EvalCase, @@ -22,7 +23,7 @@ ) from protest.evals.hashing import compute_case_hash, compute_eval_hash from protest.evals.types import EvalScore, TaskResult -from protest.exceptions import FixtureError +from protest.exceptions import FixtureError, MultipleEvalCaseParamsError def make_eval_wrapper( @@ -32,6 +33,8 @@ def make_eval_wrapper( ) -> Any: """Wrap a function to run evaluators on its return value.""" + _validate_single_evalcase_param(func) + @functools.wraps(func) async def eval_wrapper(**kwargs: Any) -> EvalPayload: expected = _extract_expected(kwargs) @@ -102,6 +105,38 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: return eval_wrapper +# --------------------------------------------------------------------------- +# Registration-time validation +# --------------------------------------------------------------------------- + + +def _validate_single_evalcase_param(func: Any) -> None: + """Raise MultipleEvalCaseParamsError if `func` has > 1 EvalCase parameter. + + Runs at decorator time. The runtime contract (`_find_case`) silently + picks the first EvalCase in kwargs, which would drop the second one's + name/expected/inputs/metadata/per-case evaluators downstream. 
We catch + that here so the failure is loud and pinpoints the offending eval. + + Subclasses of EvalCase count: the runtime uses isinstance(_, EvalCase), + so any subclass would trigger the same silent drop. + """ + hints = get_type_hints_compat(func) + offending: list[str] = [] + for param_name, annotation in hints.items(): + if param_name == "return": + continue + underlying = ( + get_args(annotation)[0] + if get_origin(annotation) is Annotated + else annotation + ) + if isinstance(underlying, type) and issubclass(underlying, EvalCase): + offending.append(param_name) + if len(offending) > 1: + raise MultipleEvalCaseParamsError(func.__name__, offending) + + # --------------------------------------------------------------------------- # Extract helpers — pull EvalCase from kwargs # --------------------------------------------------------------------------- diff --git a/protest/exceptions.py b/protest/exceptions.py index 8176c6f..42b5716 100644 --- a/protest/exceptions.py +++ b/protest/exceptions.py @@ -93,3 +93,23 @@ def __init__(self, value: int): f"max_concurrency must be >= 1, got {value}. " f"Use None for unlimited concurrency." ) + + +class MultipleEvalCaseParamsError(ProTestError): + """Raised when an eval function declares more than one EvalCase parameter. + + Only one EvalCase per eval is supported: it determines the case identity + (name, expected, inputs, metadata, per-case evaluators) used by the + runner, history, and reporters. Additional EvalCase parameters would be + silently ignored downstream. + """ + + def __init__(self, func_name: str, param_names: list[str]): + params = ", ".join(param_names) + super().__init__( + f"Eval '{func_name}' declares multiple EvalCase parameters: {params}. " + f"Only one EvalCase parameter is supported per eval — it is used " + f"for case identity (name), expected output, inputs, metadata, " + f"and per-case evaluators. Merge the cases into a single EvalCase, " + f"or split into separate evals." 
+ ) diff --git a/tests/cli/test_run_command.py b/tests/cli/test_run_command.py index a56174d..878bd19 100644 --- a/tests/cli/test_run_command.py +++ b/tests/cli/test_run_command.py @@ -244,3 +244,34 @@ def test_suite_keyword_and_tag(self, run_protest: Callable[..., CLIResult]) -> N result.assert_success() expected_count = 1 assert f"{expected_count}/{expected_count} passed" in result.stdout + + +class TestRunRejectsEvalOnlyFlags: + """`--show-output` is eval-only and must not be accepted by `protest run`. + + The CLI parser is split: `protest run` builds a parser without eval-only + flags, so passing `--show-output` should raise an argparse error rather + than silently no-op (the previous behavior was a UX papercut: the flag + appeared in `protest run --help` but did nothing for non-eval tests). + """ + + def test_run_rejects_show_output( + self, run_protest: Callable[..., CLIResult] + ) -> None: + result = run_protest("run", "simple_session:session", "--show-output") + assert result.exit_code != 0, ( + f"Expected non-zero exit for `protest run --show-output`, " + f"got {result.exit_code}\nstdout: {result.stdout}\nstderr: {result.stderr}" + ) + assert "show-output" in result.stderr, ( + f"Expected argparse error mentioning 'show-output' in stderr, " + f"got: {result.stderr}" + ) + + def test_run_help_omits_show_output( + self, run_protest: Callable[..., CLIResult] + ) -> None: + result = run_protest("run", "--help") + assert "--show-output" not in result.stdout, ( + f"Expected --show-output absent from `protest run --help`:\n{result.stdout}" + ) diff --git a/tests/evals/test_multiple_evalcase_params.py b/tests/evals/test_multiple_evalcase_params.py new file mode 100644 index 0000000..3880811 --- /dev/null +++ b/tests/evals/test_multiple_evalcase_params.py @@ -0,0 +1,99 @@ +"""Tests for `_validate_single_evalcase_param` — D1 registration-time check. + +The runtime contract (`_find_case`) picks the first `EvalCase` in kwargs and +silently drops any others. 
The wrapper detects > 1 EvalCase param at +registration and raises a clear error pointing at the offending parameters. +""" + +from __future__ import annotations + +from typing import Annotated + +import pytest + +from protest import ForEach, From, ProTestSession +from protest.evals import EvalCase +from protest.evals.suite import EvalSuite +from protest.exceptions import MultipleEvalCaseParamsError + +# Module-level case sources so `get_type_hints()` can resolve Annotated args. +_cases_a = ForEach([EvalCase(inputs="a", name="a1")]) +_cases_b = ForEach([EvalCase(inputs="b", name="b1")]) + + +class _MyCase(EvalCase): + """Subclass to verify the check covers user-defined EvalCase types.""" + + +_subclass_cases = ForEach([_MyCase(inputs="x", name="x1")]) + + +class TestSingleEvalCaseParamAccepted: + def test_one_evalcase_param_via_annotated_from(self) -> None: + session = ProTestSession() + suite = EvalSuite("evals") + + @suite.eval() + def good(case: Annotated[EvalCase, From(_cases_a)]) -> str: + return str(case.inputs) + + _ = good + session.add_suite(suite) # no raise + + def test_zero_evalcase_param_accepted(self) -> None: + """Evals without parametrization (or without EvalCase) are valid.""" + session = ProTestSession() + suite = EvalSuite("evals") + + @suite.eval() + def no_case() -> str: + return "static" + + _ = no_case + session.add_suite(suite) # no raise + + def test_subclass_param_accepted_when_alone(self) -> None: + session = ProTestSession() + suite = EvalSuite("evals") + + @suite.eval() + def good(case: Annotated[_MyCase, From(_subclass_cases)]) -> str: + return str(case.inputs) + + _ = good + session.add_suite(suite) + + +class TestMultipleEvalCaseParamRejected: + def test_two_evalcase_params_raise(self) -> None: + suite = EvalSuite("evals") + + with pytest.raises(MultipleEvalCaseParamsError) as excinfo: + + @suite.eval() + def bad( + case_a: Annotated[EvalCase, From(_cases_a)], + case_b: Annotated[EvalCase, From(_cases_b)], + ) -> str: + return 
f"{case_a.inputs}+{case_b.inputs}" + + msg = str(excinfo.value) + assert "bad" in msg + assert "case_a" in msg + assert "case_b" in msg + + def test_subclass_counts_as_evalcase(self) -> None: + """A param typed `_MyCase` (subclass) collides with a `EvalCase` param.""" + suite = EvalSuite("evals") + + with pytest.raises(MultipleEvalCaseParamsError) as excinfo: + + @suite.eval() + def bad( + case_a: Annotated[EvalCase, From(_cases_a)], + case_b: Annotated[_MyCase, From(_subclass_cases)], + ) -> str: + return str(case_a.inputs) + str(case_b.inputs) + + assert "case_a" in str(excinfo.value) + assert "case_b" in str(excinfo.value) From 53d4813ba8c46e1974a1cb1c78bf6f97c0856e70 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sun, 26 Apr 2026 22:59:04 +0200 Subject: [PATCH 54/60] fix(executor): raise builtin TimeoutError to match Python 3.10 semantics asyncio.TimeoutError and builtins.TimeoutError were distinct classes before Python 3.11. Reporters and tests check isinstance against the builtin, so on 3.10 the previous `raise asyncio.TimeoutError(...)` made those checks fail. On 3.11+ both names alias the builtin, so this is a no-op. Fixes 6 timeout/retry tests on the 3.10 CI matrix. --- protest/core/execution/test_executor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/protest/core/execution/test_executor.py b/protest/core/execution/test_executor.py index 3c065f2..10921b9 100644 --- a/protest/core/execution/test_executor.py +++ b/protest/core/execution/test_executor.py @@ -181,7 +181,11 @@ async def _run_test( # noqa: PLR0912 - complex test execution flow, refactoring timeout=item.timeout, ) except asyncio.TimeoutError: - raise asyncio.TimeoutError( + # Raise the builtin TimeoutError, not asyncio.TimeoutError. + # On Python 3.11+ they are aliases, but on 3.10 they are + # distinct classes and reporters/tests check isinstance + # against the builtin. 
+ raise TimeoutError( f"Test exceeded timeout of {item.timeout}s" ) from None else: From 45643805bf40bb8f2017945460c64c0070db1246 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Mon, 27 Apr 2026 00:27:49 +0200 Subject: [PATCH 55/60] fix(evals): tier-1 polish from naive-agent feedback Agent test (Claude Code in isolated dir, public docs only) surfaced several rough edges. This batch addresses the ones blocking a clean re-run signal: - ScoreNameCollisionError: dataclass evaluators with overlapping field names previously overwrote each other silently in the per-case scores dict (and the history file). Now raises at runtime with the case name and duplicate names; doc rewritten to remove the false auto-prefix promise. - ModelInfo -> ModelLabel: rename clarifies it is a passive history label, not a runtime model config (the doc warning becomes obsolete and is replaced by a plain description). - rich made truly optional: lazy-imported inside RichReporter methods so `import protest` works without rich; AsciiReporter.activate() takes over when rich is missing. Verified in a venv with no extras. - EvalSuite re-exported from protest.evals so users only need one import path for the eval API. - Top-level `protest --help` epilog now includes eval/history/live examples (was 9 run + 1 tags, none for eval/history/live). - cli.md gets full `protest eval` and `protest history` sections, including --compare's case-modified vs scoring-modified semantics. 
--- docs/cli.md | 163 +++++++++++++++++++++++ docs/evals.md | 55 +++++--- examples/yorkshire/evals/session.py | 4 +- examples/yorkshire/session.py | 4 +- protest/cli/main.py | 6 + protest/evals/__init__.py | 6 +- protest/evals/suite.py | 6 +- protest/evals/types.py | 2 +- protest/evals/wrapper.py | 18 ++- protest/exceptions.py | 26 ++++ protest/reporting/ascii.py | 6 +- protest/reporting/rich_reporter.py | 23 +++- tests/evals/test_e2e.py | 6 +- tests/evals/test_score_name_collision.py | 143 ++++++++++++++++++++ 14 files changed, 435 insertions(+), 33 deletions(-) create mode 100644 tests/evals/test_score_name_collision.py diff --git a/docs/cli.md b/docs/cli.md index 7495ae5..7d2b299 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -13,6 +13,8 @@ protest [options] | Command | Description | |---------|-------------| | `run` | Run tests | +| `eval` | Run evaluations | +| `history` | Browse run history (tests and evals) | | `live` | Start live reporter server | | `tags list` | List tags in a session | @@ -276,6 +278,167 @@ protest run tests:session --- +## protest eval + +Run evaluations from a session. + +`protest eval` is the eval-suite counterpart of `protest run`. It shares +the same target format, filters, capture flags and reporting options as +`run`; the differences are listed below. + +### Syntax + +```bash +protest eval [options] +``` + +### Options + +`protest eval` accepts every option from `protest run` (see above: +`-n/--concurrency`, `--collect-only`, `-x/--exitfirst`, `-s/--no-capture`, +`-q/--quiet`, `-v/--verbose`, `--show-logs`, `-t/--tag`, `--no-tag`, +`-k/--keyword`, `--lf`, `--cache-clear`, `--no-color`, `--ctrf-output`, +`--no-log-file`, `--app-dir`), plus one eval-only flag: + +| Option | Description | Default | +|--------|-------------|---------| +| `--show-output` | Print `inputs` / `output` / `expected` for **every** case (failed cases always print these). 
| off |
+
+### Examples
+
+```bash
+# Run all evals in a session
+protest eval evals.session:session
+
+# One specific suite
+protest eval evals.session:session::helpdesk_struct
+
+# One ticket by name
+protest eval evals.session:session -k T001
+
+# All cases tagged "cat:hardware"
+protest eval evals.session:session --tag cat:hardware
+
+# Re-run only the cases that failed last time
+protest eval evals.session:session --lf
+
+# Show the input/output of every case (not just failures)
+protest eval evals.session:session --show-output
+```
+
+### Output
+
+Each case prints one line:
+
+```
+✓ classify_ticket_struct[T011] (2ms) category_is_allowed=✓ summary_keyword_recall=1.00 …
+```
+
+After every suite, an aggregate-stats table summarizes the `Metric`
+fields across cases (mean / p50 / p5 / p95). `Verdict` and `Reason`
+fields don't appear in this table — only numeric `Metric` fields do.
+
+Per-case markdown artifacts are written to
+`.protest/results/<suite>_<model>/<case>.md`, with the full
+input, output, expected, and per-evaluator scores.
+
+---
+
+## protest history
+
+Browse persisted run history (tests and evals).
+
+Every run appends one entry to `.protest/history.jsonl`; `protest history`
+queries that file with various views.
+
+### Syntax
+
+```bash
+protest history [view] [filters]
+```
+
+Exactly one view is shown at a time. The view defaults to a per-suite
+trend table when no flag is given.
+
+### View flags (mutually exclusive)
+
+| Flag | Description |
+|------|-------------|
+| _(none)_ | Per-suite trend table: pass-rate trend + score arrows |
+| `--runs` | Run-by-run pass rates, most recent first |
+| `--show [N]` | Detailed panel for the Nth most recent run (`0` = latest, default) |
+| `--compare` | Compare the two most recent runs of the same model |
+
+### Filters (apply to all views)
+
+| Flag | Description | Default |
+|------|-------------|---------|
+| `--tail N`, `-n N` | Limit to the N most recent entries | 10 |
+| `--evals` | Show eval runs only | _all kinds_ |
+| `--tests` | Show test runs only | _all kinds_ |
+| `--model NAME` | Filter by `ModelLabel.name` | _all_ |
+| `--suite NAME` | Filter by suite name | _all_ |
+| `--clean-dirty` | Remove entries from runs made on a dirty working tree | off |
+| `--path DIR` | Use a custom history directory | `.protest/` |
+
+### Reading `--compare`
+
+`--compare` reports five kinds of change between the two most recent
+runs of the same model:
+
+| Marker | Label | Meaning |
+|--------|-------|---------|
+| `+` | Fixed | Case was failing in the previous run, passes now |
+| `-` | Regressions | Case was passing in the previous run, fails now |
+| `⟳` | Modified | Case is recognizable (same name) but its content changed |
+| `*` | New | Case did not exist in the previous run |
+| `✗` | Deleted | Case existed in the previous run, gone now |
+
+The `Modified` line tells you **what** changed by suffixing the case
+name:
+
+- `T001 (case modified)` — `inputs` or `expected` changed (`case_hash`
+  diff)
+- `T001 (scoring modified)` — only the evaluator configuration changed
+  (`eval_hash` diff). Inputs and expected output are intact; you've
+  edited an evaluator or its parameters.
+ +### Examples + +```bash +# Per-suite trend across last 10 runs (default view) +protest history --evals + +# Run-by-run breakdown of the last 5 eval runs +protest history --evals --runs --tail 5 + +# Detailed panel for the most recent run +protest history --evals --show + +# Detailed panel for the run before that (1 = next-most-recent) +protest history --evals --show 1 + +# Compare the two most recent runs +protest history --evals --compare + +# Filter to one model across all views +protest history --evals --model qwen-2.5 + +# Drop runs made on a dirty working tree before any view +protest history --evals --clean-dirty +``` + +### Notes + +- When the project is not a git repo, the per-run commit / dirty + columns display `?`. `--clean-dirty` is a no-op in that case. +- `--evals` and `--tests` are mutually exclusive; omit both to see + every kind. +- Per-case detail (input, output, expected, evaluator scores) lives in + `.protest/results/`, not in the history file. + +--- + ## protest live Start a persistent live reporter server for real-time test visualization. diff --git a/docs/evals.md b/docs/evals.md index fbff5bc..6bf899f 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -11,7 +11,7 @@ Evaluate LLM outputs with scored metrics and historical tracking. 
 - [EvalCase](#evalcase)
 - [Evaluators](#evaluators)
 - [Fixtures](#fixtures)
-- [ModelInfo](#modelinfo)
+- [ModelLabel](#modellabel)
 - [Judge](#judge)
 - [TaskResult (SUT Usage Tracking)](#taskresult-sut-usage-tracking)
 - [Usage Display](#usage-display)
@@ -36,7 +36,7 @@ ProTest evals use the same infrastructure as tests: fixtures, DI, parallelism, t
 from typing import Annotated
 
 from protest import ForEach, From, ProTestSession
-from protest.evals import EvalCase, ModelInfo, evaluator
+from protest.evals import EvalCase, ModelLabel, evaluator
 from protest.evals.evaluators import contains_keywords
 from protest.evals.suite import EvalSuite
 
@@ -47,7 +47,7 @@ cases = ForEach([
 
 session = ProTestSession()
 
-chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="gpt-4o-mini"))
+chatbot_suite = EvalSuite("chatbot", model=ModelLabel(name="gpt-4o-mini"))
 session.add_suite(chatbot_suite)
 
 @chatbot_suite.eval(evaluators=[contains_keywords(keywords=["Marie"])])
@@ -77,9 +77,9 @@ The rest of the pipeline — fixtures, DI, parallelism, reporters — works iden
 
 ```python
 from protest.evals.suite import EvalSuite
-from protest.evals import ModelInfo
+from protest.evals import ModelLabel
 
-chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="gpt-4o-mini"))
+chatbot_suite = EvalSuite("chatbot", model=ModelLabel(name="gpt-4o-mini"))
 session.add_suite(chatbot_suite)
 
 @chatbot_suite.eval(evaluators=[my_scorer])
@@ -342,16 +342,12 @@ async def pipeline_eval(
     return await query(driver, case.inputs)
 ```
 
-## ModelInfo
+## ModelLabel
 
-!!! warning "ModelInfo does NOT configure a model"
-
-    Despite the name, `ModelInfo` is a **passive label** for history tracking. It does not route requests, set a temperature, pick a provider, or otherwise touch any LLM. The actual model wiring happens inside *your* task function (or the agent / SDK it calls). `ModelInfo` exists solely so `protest history` can attribute results to a specific model and compare runs side-by-side.
- -`ModelInfo` records which model produced the results so you can compare runs. +`ModelLabel` is a **passive label** that ProTest stores in the history alongside each run, so you can attribute results to a specific model and compare runs side-by-side. It does not route requests, set a temperature, pick a provider, or otherwise touch any LLM — the actual model wiring happens inside *your* task function (or the agent / SDK it calls). ```python -suite = EvalSuite("pipeline", model=ModelInfo(name="qwen-2.5")) +suite = EvalSuite("pipeline", model=ModelLabel(name="qwen-2.5")) ``` ## Judge @@ -406,7 +402,7 @@ return JudgeResponse(output=result.output) # tokens/cost = None, that's fine ```python suite = EvalSuite( "pipeline", - model=ModelInfo(name="qwen-2.5"), + model=ModelLabel(name="qwen-2.5"), judge=PydanticAIJudge(model="gpt-4o-mini", temperature=0), ) ``` @@ -497,7 +493,34 @@ If an evaluator raises an exception (e.g. LLM judge timeout), the case is marked ## Name Collisions -If two evaluators return dataclasses with the same field name (e.g. both have `accuracy`), the runner prefixes with the evaluator name when it detects a conflict: `llm_judge.accuracy`, `fact_check.accuracy`. +Each `Verdict` / `Metric` / `Reason` field name from a dataclass evaluator +becomes a key in the per-case score dict (and in the history file). **Names +must be unique across all evaluators that run on the same case.** + +If two evaluators emit a score under the same name (e.g. both have a +`detail` field), ProTest raises `ScoreNameCollisionError` at runtime so the +collision is loud instead of silently overwriting the duplicate. 
Rename the +colliding field — typically by prefixing with the evaluator's concept: + +```python +@dataclass +class SummaryShape: + summary_well_formed: Annotated[bool, Verdict] + summary_detail: Annotated[str, Reason] = "" # not just "detail" + +@dataclass +class CategoryMatch: + category_matches: Annotated[bool, Verdict] + category_match_detail: Annotated[str, Reason] = "" # not just "detail" +``` + +Why no auto-prefix? An evaluator's score name is what users grep for in +history, scripts, and the markdown artifacts. Auto-prefixing would mean the +same evaluator's `accuracy` field changes name (`fact_check.accuracy` vs +plain `accuracy`) depending on which other evaluators are wired in alongside +it — silently breaking downstream consumers when a new evaluator is added. +Failing loud and asking you to pick a stable, unique name keeps the score +identifiers stable across configurations. ## Multi-Model Sessions @@ -506,8 +529,8 @@ Track which model produced each eval suite's results. Each `EvalSuite` can have ```python session = ProTestSession() -pipeline_suite = EvalSuite("pipeline", model=ModelInfo(name="qwen-2.5")) -chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="mistral-7b")) +pipeline_suite = EvalSuite("pipeline", model=ModelLabel(name="qwen-2.5")) +chatbot_suite = EvalSuite("chatbot", model=ModelLabel(name="mistral-7b")) session.add_suite(pipeline_suite) session.add_suite(chatbot_suite) diff --git a/examples/yorkshire/evals/session.py b/examples/yorkshire/evals/session.py index f03d733..06d9b3f 100644 --- a/examples/yorkshire/evals/session.py +++ b/examples/yorkshire/evals/session.py @@ -16,7 +16,7 @@ yorkshire_cases, ) from protest import From, ProTestSession -from protest.evals import EvalCase, ModelInfo +from protest.evals import EvalCase, ModelLabel from protest.evals.suite import EvalSuite session = ProTestSession( @@ -25,7 +25,7 @@ yorkshire_suite = EvalSuite( "yorkshire_eval", - model=ModelInfo(name="yorkshire-chatbot-v1", provider="local"), 
+ model=ModelLabel(name="yorkshire-chatbot-v1", provider="local"), ) session.add_suite(yorkshire_suite) diff --git a/examples/yorkshire/session.py b/examples/yorkshire/session.py index b723cb9..c4ffeb0 100644 --- a/examples/yorkshire/session.py +++ b/examples/yorkshire/session.py @@ -29,7 +29,7 @@ from examples.yorkshire.tests.suites.seniors.suite import seniors_suite from examples.yorkshire.tests.suites.showcase.suite import showcase_suite from protest import From, ProTestSession -from protest.evals import EvalCase, ModelInfo +from protest.evals import EvalCase, ModelLabel from protest.evals.suite import EvalSuite session = ProTestSession(concurrency=4, history=True) @@ -48,7 +48,7 @@ yorkshire_suite = EvalSuite( "yorkshire_eval", - model=ModelInfo(name="yorkshire-chatbot-v1", provider="local"), + model=ModelLabel(name="yorkshire-chatbot-v1", provider="local"), ) session.add_suite(yorkshire_suite) diff --git a/protest/cli/main.py b/protest/cli/main.py index 16e76ee..8bb2fe8 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -24,6 +24,12 @@ protest run demo:session --collect-only List tests without running protest run demo:session --tag slow Run tests with 'slow' tag protest run demo:session -s Disable capture (show print output) + protest eval demo:session Run all evaluations + protest eval demo:session --show-output Show inputs/output/expected per case + protest history --evals Show eval suite trends + protest history --evals --tail 5 Show last 5 entries + protest history --evals --compare Compare 2 most recent runs + protest live Start live reporter server protest tags list demo:session List all available tags """ diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index c985114..9882d7f 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -10,6 +10,7 @@ Verdict, evaluator, ) +from protest.evals.suite import EvalSuite from protest.evals.types import ( EvalCaseResult, EvalScore, @@ -17,7 +18,7 @@ Judge, JudgeInfo, 
JudgeResponse, - ModelInfo, + ModelLabel, ScoreStats, TaskResult, ) @@ -27,13 +28,14 @@ "EvalCaseResult", "EvalContext", "EvalScore", + "EvalSuite", "EvalSuiteReport", "Evaluator", "Judge", "JudgeInfo", "JudgeResponse", "Metric", - "ModelInfo", + "ModelLabel", "Reason", "ScoreStats", "ShortCircuit", diff --git a/protest/evals/suite.py b/protest/evals/suite.py index c4af124..4971e17 100644 --- a/protest/evals/suite.py +++ b/protest/evals/suite.py @@ -11,7 +11,7 @@ if TYPE_CHECKING: from collections.abc import Callable - from protest.evals.types import Judge, ModelInfo + from protest.evals.types import Judge, ModelLabel FuncT = TypeVar("FuncT", bound="Callable[..., object]") @@ -33,7 +33,7 @@ def __init__( self, name: str, *, - model: ModelInfo | None = None, + model: ModelLabel | None = None, judge: Judge | None = None, tags: list[str] | None = None, max_concurrency: int | None = None, @@ -60,7 +60,7 @@ def judge(self) -> Judge | None: return self._judge @property - def model(self) -> ModelInfo | None: + def model(self) -> ModelLabel | None: return self._model def eval( diff --git a/protest/evals/types.py b/protest/evals/types.py index 0c628ab..1d19474 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -95,7 +95,7 @@ async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: .. 
@dataclass(frozen=True, slots=True) -class ModelInfo: +class ModelLabel: """Metadata about the model being evaluated.""" name: str diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index f6c074f..9601a7c 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -23,7 +23,11 @@ ) from protest.evals.hashing import compute_case_hash, compute_eval_hash from protest.evals.types import EvalScore, TaskResult -from protest.exceptions import FixtureError, MultipleEvalCaseParamsError +from protest.exceptions import ( + FixtureError, + MultipleEvalCaseParamsError, + ScoreNameCollisionError, +) def make_eval_wrapper( @@ -76,6 +80,18 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: judge=judge, ) + # Detect score-name collisions across evaluators. EvalPayload.scores + # is a dict keyed by name; duplicates would silently overwrite each + # other downstream. Fail loud so the user can rename the field. + seen: set[str] = set() + duplicates: list[str] = [] + for s in scores: + if s.name in seen and s.name not in duplicates: + duplicates.append(s.name) + seen.add(s.name) + if duplicates: + raise ScoreNameCollisionError(case_name, duplicates) + return EvalPayload( case_name=case_name, passed=all(s.passed for s in scores), diff --git a/protest/exceptions.py b/protest/exceptions.py index 42b5716..3cff676 100644 --- a/protest/exceptions.py +++ b/protest/exceptions.py @@ -113,3 +113,29 @@ def __init__(self, func_name: str, param_names: list[str]): f"and per-case evaluators. Merge the cases into a single EvalCase, " f"or split into separate evals." ) + + +class ScoreNameCollisionError(ProTestError): + """Raised when two evaluators in the same eval emit scores with the same name. + + Each `EvalScore.name` (from a dataclass `Verdict`/`Metric`/`Reason` field + or from the evaluator's name when it returns `bool`) becomes a key in + `EvalPayload.scores` (a dict). 
If two evaluators emit the same name, + one would silently overwrite the other in the per-case report and history, + which is a real source of misleading data. + + Fix by renaming the colliding fields so each Verdict/Metric/Reason has a + unique name within the suite (e.g. prefix with the evaluator's concept: + `summary_detail` instead of just `detail`). + """ + + def __init__(self, case_name: str, duplicates: list[str]): + dup_str = ", ".join(repr(d) for d in sorted(duplicates)) + super().__init__( + f"Score-name collision in eval '{case_name}': {dup_str}. " + f"Two or more evaluators emit a score under the same name. " + f"Rename the colliding dataclass Verdict/Metric/Reason field(s) " + f"so each name is unique within the suite — otherwise the " + f"duplicate scores would silently overwrite each other in the " + f"per-case report and the history file." + ) diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index 2233a1c..446c083 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -98,7 +98,11 @@ def __init__( @classmethod def activate(cls, ctx: PluginContext) -> Self | None: - if ctx.get("no_color", False): + # Activate when --no-color was passed, OR when `rich` is not + # installed (RichReporter would otherwise leave the run silent). 
+ import importlib.util # noqa: PLC0415 — std lib, kept local for clarity + + if ctx.get("no_color", False) or importlib.util.find_spec("rich") is None: return cls( verbosity=ctx.get("verbosity", 0), show_logs=ctx.get("show_logs"), diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index e30584c..22622c6 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -1,11 +1,10 @@ +import importlib.util import logging import traceback from argparse import ArgumentParser from pathlib import Path from typing import Any -from rich.console import Console -from rich.table import Table from typing_extensions import Self from protest.entities import ( @@ -34,6 +33,15 @@ ) from protest.reporting.verbosity import Verbosity + +# `rich` is an optional dependency. All `from rich...` imports below are +# done lazily inside methods so that `import protest` works without it; +# `RichReporter.activate()` returns None when rich is missing, and +# `AsciiReporter` takes over via its own activate() check. +def _rich_available() -> bool: + return importlib.util.find_spec("rich") is not None + + # Per-run pass-rate thresholds for the eval suite color cue. # Strict default — green only if every case passes; yellow above half. _PERFECT_PASS_RATE = 1.0 @@ -84,6 +92,8 @@ def __init__( show_logs: str | None = None, show_output: bool = False, ) -> None: + from rich.console import Console # noqa: PLC0415 — optional dep, lazy + self.console = Console(highlight=False) self._verbosity = verbosity self._show_logs = show_logs @@ -112,6 +122,9 @@ def add_cli_options(cls, parser: ArgumentParser) -> None: def activate(cls, ctx: PluginContext) -> Self | None: if ctx.get("no_color", False): return None + if not _rich_available(): + # `rich` is an optional dependency; AsciiReporter takes over. 
+ return None return cls( verbosity=ctx.get("verbosity", 0), show_logs=ctx.get("show_logs"), @@ -157,6 +170,8 @@ def _maybe_show_logs(self, result: TestResult) -> None: def _print_bypass(self, message: str) -> None: """Print bypassing capture (for lifecycle messages emitted during tests).""" + from rich.console import Console # noqa: PLC0415 — optional dep, lazy + stream = real_stdout() Console(file=stream, highlight=False).print(message) @@ -380,6 +395,8 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: self._print(f"[dim]{escaped_line}[/]") def on_user_print(self, data: Any) -> None: + from rich.console import Console # noqa: PLC0415 — optional dep, lazy + msg, raw, prefix = data # Write to the real stdout, bypassing capture stream = real_stdout() @@ -394,6 +411,8 @@ def on_user_print(self, data: Any) -> None: def on_eval_suite_end(self, report: Any) -> None: if not isinstance(report, EvalSuiteReport): return + from rich.table import Table # noqa: PLC0415 — optional dep, lazy + stats = report.all_score_stats() self._print("") if stats: diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 5e86c18..7daf058 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -28,7 +28,7 @@ EvalCase, EvalContext, Metric, - ModelInfo, + ModelLabel, ShortCircuit, Verdict, evaluator, @@ -121,7 +121,7 @@ def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: assert any(s.kind == "eval" for s in session._suites) def test_model_set_via_suite(self) -> None: - suite = EvalSuite("eval_echo", model=ModelInfo(name="test-model")) + suite = EvalSuite("eval_echo", model=ModelLabel(name="test-model")) assert suite._model is not None assert suite._model.name == "test-model" @@ -523,7 +523,7 @@ class TestHistory: def _run_eval(self, tmp_path: Path) -> None: session = ProTestSession(history_dir=tmp_path) - eval_echo_suite = EvalSuite("eval_echo", model=ModelInfo(name="test-model")) + eval_echo_suite = 
EvalSuite("eval_echo", model=ModelLabel(name="test-model")) session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) diff --git a/tests/evals/test_score_name_collision.py b/tests/evals/test_score_name_collision.py new file mode 100644 index 0000000..f5d4c48 --- /dev/null +++ b/tests/evals/test_score_name_collision.py @@ -0,0 +1,143 @@ +"""Tests for `ScoreNameCollisionError` — fail-loud on duplicate score names. + +Two evaluators emitting a score under the same name (e.g. both have a +``detail`` field on their dataclass return) would silently overwrite each +other in ``EvalPayload.scores`` (a dict). The wrapper detects the +collision at runtime and raises a clear error pointing at the duplicate +name(s) so the user can rename the colliding field. +""" + +from __future__ import annotations + +import asyncio +from dataclasses import dataclass +from typing import Annotated + +import pytest + +from protest import ForEach, From, ProTestSession +from protest.evals import ( + EvalCase, + EvalContext, + EvalSuite, + Reason, + Verdict, + evaluator, +) +from protest.evals.wrapper import make_eval_wrapper +from protest.exceptions import ScoreNameCollisionError + +_cases = ForEach([EvalCase(inputs="x", name="c1")]) + + +@dataclass +class _ShapeA: + matches: Annotated[bool, Verdict] + detail: Annotated[str, Reason] = "" + + +@dataclass +class _ShapeB: + other_check: Annotated[bool, Verdict] + detail: Annotated[str, Reason] = "" # collides with _ShapeA.detail + + +@evaluator +def _shape_a(ctx: EvalContext) -> _ShapeA: + return _ShapeA(matches=True, detail="from A") + + +@evaluator +def _shape_b(ctx: EvalContext) -> _ShapeB: + return _ShapeB(other_check=True, detail="from B") + + +@evaluator +def _bool_one(ctx: EvalContext) -> bool: + return True + + +@dataclass +class _ShapeWithBoolOneField: + _bool_one: Annotated[bool, Verdict] # collides with _bool_one evaluator's name + + +@evaluator +def _shape_collides_with_bool(ctx: EvalContext) -> 
_ShapeWithBoolOneField: + return _ShapeWithBoolOneField(_bool_one=True) + + +@dataclass +class _ShapeUniqueA: + matches_a: Annotated[bool, Verdict] + detail_a: Annotated[str, Reason] = "" + + +@dataclass +class _ShapeUniqueB: + matches_b: Annotated[bool, Verdict] + detail_b: Annotated[str, Reason] = "" + + +@evaluator +def _shape_unique_a(ctx: EvalContext) -> _ShapeUniqueA: + return _ShapeUniqueA(matches_a=True, detail_a="A") + + +@evaluator +def _shape_unique_b(ctx: EvalContext) -> _ShapeUniqueB: + return _ShapeUniqueB(matches_b=True, detail_b="B") + + +def _invoke(evaluators: list, case: EvalCase) -> None: + """Invoke the eval wrapper directly so collision exceptions propagate.""" + + def task(case: EvalCase) -> str: + return str(case.inputs) + + wrapped = make_eval_wrapper(task, evaluators) + asyncio.run(wrapped(case=case)) + + +class TestCollisionRaises: + def test_two_dataclasses_share_field_name(self) -> None: + with pytest.raises(ScoreNameCollisionError) as excinfo: + _invoke([_shape_a, _shape_b], EvalCase(inputs="x", name="c1")) + msg = str(excinfo.value) + assert "'detail'" in msg + assert "c1" in msg + + def test_bool_evaluator_name_collides_with_dataclass_field(self) -> None: + with pytest.raises(ScoreNameCollisionError) as excinfo: + _invoke( + [_bool_one, _shape_collides_with_bool], + EvalCase(inputs="x", name="c2"), + ) + msg = str(excinfo.value) + assert "_bool_one" in msg + assert "c2" in msg + + +class TestNoCollisionPasses: + def test_unique_names_pass(self) -> None: + # Should not raise. 
+ _invoke( + [_shape_unique_a, _shape_unique_b], + EvalCase(inputs="x", name="c1"), + ) + + def test_session_with_unique_names_runs_clean(self) -> None: + """Smoke check: running through the full session path also succeeds.""" + from protest.api import run_session # noqa: PLC0415 — heavy import + + session = ProTestSession() + suite = EvalSuite("evals") + + @suite.eval(evaluators=[_shape_unique_a, _shape_unique_b]) + def ok(case: Annotated[EvalCase, From(_cases)]) -> str: + return str(case.inputs) + + _ = ok + session.add_suite(suite) + result = run_session(session) + assert result.success From 3d1fe488e3bd457e3eb704ad7233edb89c326545 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Mon, 27 Apr 2026 09:00:44 +0200 Subject: [PATCH 56/60] fix(history,cli,docs): tier-2 polish from naive-agent v2 feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent v2 confirmed the tier-1 fixes landed cleanly and surfaced a new bucket of frictions concentrated on `protest history`. This batch addresses them. CLI refactor: - `protest history` is now sub-command based (`list`, `runs`, `show`, `compare`, `clean`) instead of mutually-exclusive flags. `list` remains the implicit default so `protest history --tail 5` still works without typing the sub-command. The previous flag-as-mode form (`--runs`, `--show`, `--compare`, `--clean-dirty`) is removed. - `protest history clean` is dry-run by default. `--apply` actually modifies the file. Eliminates the "destructive without warning" footgun. - `--model` and `--suite` filter at the suite level: a run with several suites under different models keeps the entry, with non- matching suites pruned out of the displayed view. The previous run-level filter would surprise users by dropping the whole run. - `--tail N` now narrows the entries before aggregation, so the `list` (trend) view actually scopes to the requested window. 
- Added `--short` for `protest eval`: hide passing scores per case to keep the output readable on suites with many evaluators. Docs: - `cli.md` rewritten for the new sub-command layout, with explicit examples for each sub-command and a note on suite-level filtering. - `evals.md` gets a callout on writing custom evaluators when the eval task returns a non-string output (dict / dataclass / pydantic), and a tip clarifying that "first run successful" doesn't mean every case passes — evals are expected to surface failing cases. - `evals.md` quick-start now imports `EvalSuite` from `protest.evals` (single canonical path). - `installation.md` adds an IDE / type-checker setup section (Pyright/Pylance/mypy + uv). Storage: - `is_dirty_entry()` and `count_dirty_entries()` extracted as helpers so the dry-run path can compute counts without touching the file. The remaining cross-suite/cross-model `compare` ask is tracked in #101. --- docs/cli.md | 62 +++++++----- docs/evals.md | 39 ++++++- docs/getting-started/installation.md | 26 +++++ protest/cli/history.py | 145 ++++++++++++++++++--------- protest/cli/main.py | 6 ++ protest/history/storage.py | 112 +++++++++++++++++---- protest/reporting/ascii.py | 19 +++- protest/reporting/rich_reporter.py | 19 +++- tests/test_history_cli.py | 133 +++++++++++++----------- 9 files changed, 401 insertions(+), 160 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 7d2b299..910701d 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -349,38 +349,43 @@ input, output, expected, and per-evaluator scores. Browse persisted run history (tests and evals). Every run appends one entry to `.protest/history.jsonl`; `protest history` -queries that file with various views. +queries that file via sub-commands. ### Syntax ```bash -protest history [view] [filters] +protest history [filters] ``` -Exactly one view is shown at a time. The view defaults to a per-suite -trend table when no flag is given. 
+If no sub-command is given, `list` runs by default — so +`protest history --tail 5` is equivalent to +`protest history list --tail 5`. -### View flags (mutually exclusive) +### Sub-commands -| Flag | Description | -|------|-------------| -| _(none)_ | Per-suite trend table: pass-rate trend + score arrows | -| `--runs` | Run-by-run pass rates, most recent first | -| `--show [N]` | Detailed panel for the Nth most recent run (`0` = latest, default) | -| `--compare` | Compare the two most recent runs of the same model | +| Sub-command | Description | +|-------------|-------------| +| `list` | Per-suite trend table: pass-rate trend + score arrows. **Default** when no sub-command is given. | +| `runs` | Run-by-run pass rates, most recent first. | +| `show [N]` | Detailed panel for the Nth most recent run (`N=0` = latest, the default). | +| `compare` | Compare the two most recent runs of the same model. | +| `clean` | Remove entries from runs made on a dirty working tree. **Dry-run by default** — pass `--apply` to actually modify the file. | -### Filters (apply to all views) +### Filters (shared by every sub-command) | Flag | Description | Default | |------|-------------|---------| | `--tail N`, `-n N` | Limit to the N most recent entries | 10 | | `--evals` | Show eval runs only | _all kinds_ | | `--tests` | Show test runs only | _all kinds_ | -| `--model NAME` | Filter by `ModelLabel.name` | _all_ | -| `--suite NAME` | Filter by suite name | _all_ | -| `--clean-dirty` | Remove entries from runs made on a dirty working tree | off | +| `--model NAME` | Keep only suites whose `ModelLabel.name` matches | _all_ | +| `--suite NAME` | Keep only the suite with this name | _all_ | | `--path DIR` | Use a custom history directory | `.protest/` | +`--model` and `--suite` filter at the **suite level**: a run that +contains *several* suites with different models keeps the entry alive, +with non-matching suites pruned out of the displayed view. 
+ ### Reading `--compare` `--compare` reports four kinds of change between the two most recent @@ -406,32 +411,35 @@ name: ### Examples ```bash -# Per-suite trend across last 10 runs (default view) +# Per-suite trend across last 10 eval runs (default sub-command: list) protest history --evals # Run-by-run breakdown of the last 5 eval runs -protest history --evals --runs --tail 5 +protest history runs --evals --tail 5 -# Detailed panel for the most recent run -protest history --evals --show +# Detailed panel for the most recent eval run +protest history show --evals # Detailed panel for the run before that (1 = next-most-recent) -protest history --evals --show 1 +protest history show 1 --evals + +# Compare the two most recent runs of the same model +protest history compare --evals -# Compare the two most recent runs -protest history --evals --compare +# Filter to one model — only suites with this model are shown +protest history list --evals --model qwen-2.5 -# Filter to one model across all views -protest history --evals --model qwen-2.5 +# Preview which entries `clean` would remove (no file changes) +protest history clean --evals -# Drop runs made on a dirty working tree before any view -protest history --evals --clean-dirty +# Actually remove dirty entries +protest history clean --apply ``` ### Notes - When the project is not a git repo, the per-run commit / dirty - columns display `?`. `--clean-dirty` is a no-op in that case. + columns display `?`. `clean` is a no-op in that case. - `--evals` and `--tests` are mutually exclusive; omit both to see every kind. - Per-case detail (input, output, expected, evaluator scores) lives in diff --git a/docs/evals.md b/docs/evals.md index 6bf899f..f1bb40e 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -29,6 +29,19 @@ A test produces **pass/fail**. An eval produces **scores** — numeric values (0 ProTest evals use the same infrastructure as tests: fixtures, DI, parallelism, tags. 
An eval is a test that returns a value, scored by evaluators. +!!! tip "First-run expectations: don't expect 100% green" + + Unlike tests, evals are **expected to have failing cases** — that's + the signal you're measuring. `protest eval` still exits 1 when any + case fails a `Verdict` (so CI surfaces regressions), but the + failures are not bugs, they're data points. The aggregate-stats + table and `protest history` are designed for this — you watch the + metrics drift over time, and use `protest history compare` to flag actual + regressions between runs. If you want a CI gate that only fails on + infrastructure errors (fixture / evaluator crashes) and not on + case-level scoring, run `protest eval || true` followed by + `protest history compare` to assert no regression. + ## Quick Start ```python @@ -38,7 +51,7 @@ from typing import Annotated from protest import ForEach, From, ProTestSession from protest.evals import EvalCase, ModelLabel, evaluator from protest.evals.evaluators import contains_keywords -from protest.evals.suite import EvalSuite +from protest.evals import EvalSuite cases = ForEach([ EvalCase(inputs="Who is Marie?", expected="Marie, Resistance", name="lookup"), @@ -76,7 +89,7 @@ The rest of the pipeline — fixtures, DI, parallelism, reporters — works iden `EvalSuite` groups eval cases. It's the eval equivalent of `ProTestSuite` — it forces `kind=EVAL` and carries model/judge configuration. Model and judge are suite-level config: each suite declares which model produced its results and which judge scores them. ```python -from protest.evals.suite import EvalSuite +from protest.evals import EvalSuite from protest.evals import ModelLabel chatbot_suite = EvalSuite("chatbot", model=ModelLabel(name="gpt-4o-mini")) @@ -135,6 +148,28 @@ protest eval evals.session:session --no-tag slow An evaluator is a function decorated with `@evaluator` that receives an `EvalContext` and returns a verdict. +!!!
info "If your eval task returns a non-string output" + + The built-in evaluators (`contains_keywords`, `not_empty`, `max_length`, + `matches_regex`, `json_valid`, `word_overlap`) assume `ctx.output` is a + string and call methods like `.lower()` on it. They drop in cleanly for + summarization, chatbot replies, single-string completions, etc. + + For a structured output (`dict`, `dataclass`, `pydantic.BaseModel`, list + of objects, …), the path is to write **custom evaluators** that + pick the field they care about. A typical pattern: + + ```python + @evaluator + def category_matches_expected(ctx: EvalContext) -> CategoryMatch: + expected = (ctx.expected_output or {}).get("category") + actual = ctx.output.get("category") + return CategoryMatch(category_matches=(expected == actual), ...) + ``` + + See *Structured Evaluator* below and *EvalContext* for the data + you can read off `ctx`. + ### Return Types Evaluators return `bool` (simple verdict) or a `dataclass` (structured result). In dataclasses, annotate fields to tell the framework what each one is: diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index e885d05..05f5cd6 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -31,3 +31,29 @@ ProTest automatically uses [Rich](https://rich.readthedocs.io/) for better termi ```bash uv add rich ``` + +## IDE / type checker setup + +ProTest ships a `py.typed` marker, so Pyright, mypy and Pylance pick up +its type hints once it is installed in the project's virtual env. + +If your editor reports `Import "protest" could not be resolved`, point +your type checker at the right interpreter: + +- **VS Code / Pylance**: open the command palette → *Python: Select + Interpreter* → choose `.venv/bin/python` (the one `uv` created). 
+- **Pyright (CLI/standalone)**: add a `pyrightconfig.json` next to your + `pyproject.toml`: + + ```json + { + "venvPath": ".", + "venv": ".venv" + } + ``` + +- **mypy**: run via `uv run mypy ...` so it inherits the same + interpreter, or set `python_executable` in `mypy.ini`. + +Once configured, no extra stub package or plugin is needed — protest +exposes its own types directly. diff --git a/protest/cli/history.py b/protest/cli/history.py index 88c94b8..50cf34d 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -7,59 +7,84 @@ from pathlib import Path from typing import Any -from protest.history.storage import clean_dirty, load_history +from protest.history.storage import clean_dirty, count_dirty_entries, load_history -def handle_history_command(argv: list[str]) -> None: - """Entry point for `protest history`.""" - parser = argparse.ArgumentParser( - prog="protest history", description="Browse run history" +def _make_common_parser() -> argparse.ArgumentParser: + """Filters shared by every `protest history` sub-command.""" + common = argparse.ArgumentParser(add_help=False) + common.add_argument( + "--tail", + "-n", + type=int, + default=10, + help="Limit to the N most recent entries (default: 10)", ) - parser.add_argument( - "--tail", "-n", type=int, default=10, help="Number of entries (default: 10)" + common.add_argument("--model", type=str, default=None, help="Filter by model name") + common.add_argument("--suite", type=str, default=None, help="Filter by suite name") + kind_group = common.add_mutually_exclusive_group() + kind_group.add_argument("--evals", action="store_true", help="Eval runs only") + kind_group.add_argument("--tests", action="store_true", help="Test runs only") + common.add_argument( + "--path", + type=str, + default=None, + help="History directory (default: .protest/)", ) - parser.add_argument("--model", type=str, default=None, help="Filter by model name") - parser.add_argument("--suite", type=str, default=None, help="Filter by 
suite name") + return common + + +def handle_history_command(argv: list[str]) -> None: + """Entry point for `protest history`. + + Sub-commands: - action_group = parser.add_mutually_exclusive_group() - action_group.add_argument( - "--runs", action="store_true", help="Show run-by-run list" + - ``list`` (default): per-suite trend table. + - ``runs``: run-by-run pass rates, most recent first. + - ``show [N]``: detailed panel for the Nth most recent run (0=latest). + - ``compare``: compare the two most recent runs. + - ``clean``: remove entries from runs made on a dirty working tree + (dry-run by default; pass ``--apply`` to actually modify the file). + """ + parser = argparse.ArgumentParser( + prog="protest history", + description="Browse run history", ) - action_group.add_argument( - "--show", - nargs="?", - const=0, + sub = parser.add_subparsers(dest="action") + common = _make_common_parser() + + sub.add_parser("list", parents=[common], help="Per-suite trend (default)") + sub.add_parser("runs", parents=[common], help="Run-by-run breakdown") + show_p = sub.add_parser("show", parents=[common], help="Detailed panel for one run") + show_p.add_argument( + "nth", type=int, - default=None, - metavar="N", - help="Detailed panel for Nth most recent run (0=latest)", - ) - action_group.add_argument( - "--compare", action="store_true", help="Compare 2 most recent runs" + nargs="?", + default=0, + help="Nth most recent run (0=latest, default: 0)", ) - - kind_group = parser.add_mutually_exclusive_group() - kind_group.add_argument("--evals", action="store_true", help="Eval runs only") - kind_group.add_argument("--tests", action="store_true", help="Test runs only") - parser.add_argument( - "--clean-dirty", + sub.add_parser("compare", parents=[common], help="Compare 2 most recent runs") + clean_p = sub.add_parser("clean", parents=[common], help="Remove dirty entries") + clean_p.add_argument( + "--apply", action="store_true", - help="Remove runs with uncommitted changes on current 
commit.", - ) - parser.add_argument( - "--path", type=str, default=None, help="History directory (default: .protest/)" + help="Actually modify the history file (default: dry-run, no changes).", ) + # Default to `list` when no sub-command is given (so users can still + # write `protest history --tail 5` without typing `list`). + # `--help` / `-h` go to the parent so users see the sub-command list, + # not list-specific options. + if not argv: + argv = ["list"] + elif argv[0].startswith("-") and argv[0] not in ("--help", "-h"): + argv = ["list", *argv] args = parser.parse_args(argv) + history_dir = Path(args.path) if args.path else None - if args.clean_dirty: - removed = clean_dirty(history_dir=history_dir) - print( - f"Removed {removed} dirty entries." - if removed - else "No dirty entries to clean." - ) + if args.action == "clean": + _run_clean(history_dir=history_dir, apply=args.apply) sys.exit(0) entries = load_history( @@ -73,21 +98,47 @@ def handle_history_command(argv: list[str]) -> None: print("No history found.") sys.exit(0) + # Apply --tail to entries before any aggregation so the trend view + # actually narrows to the requested window (otherwise the per-suite + # trend would still cover the full file even with --tail). + entries = entries[-args.tail :] + _dispatch_view(args.action, getattr(args, "nth", 0), entries) + + +def _run_clean(history_dir: Path | None, *, apply: bool) -> None: + if apply: + removed = clean_dirty(history_dir=history_dir) + print( + f"Removed {removed} dirty entries." + if removed + else "No dirty entries to clean." + ) + return + count = count_dirty_entries(history_dir=history_dir) + if count: + print( + f"Would remove {count} dirty entries. " + f"Re-run with --apply to actually modify the history file." 
+ ) + else: + print("No dirty entries to clean.") + + +def _dispatch_view(action: str, nth: int, entries: list[dict[str, Any]]) -> None: out = _get_output() - if args.compare: + if action == "compare": if len(entries) < 2: print("Need at least 2 runs to compare.") sys.exit(1) out.compare(entries[-1], entries[-2]) - elif args.show is not None: - idx = args.show - if idx >= len(entries): + elif action == "show": + if nth >= len(entries): print(f"Only {len(entries)} entries available.") sys.exit(1) - out.detail(entries[-(idx + 1)]) - elif args.runs: - out.runs(entries[-args.tail :]) - else: + out.detail(entries[-(nth + 1)]) + elif action == "runs": + out.runs(entries) + else: # "list" (default) out.stats(entries) diff --git a/protest/cli/main.py b/protest/cli/main.py index 8bb2fe8..4aaab5f 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -261,6 +261,12 @@ def _create_run_parser( action="store_true", help="Show eval inputs/output/expected per case", ) + parser.add_argument( + "--short", + dest="short", + action="store_true", + help="Compact eval output: only print scores that failed per case", + ) return parser diff --git a/protest/history/storage.py b/protest/history/storage.py index 3797335..7903649 100644 --- a/protest/history/storage.py +++ b/protest/history/storage.py @@ -107,17 +107,15 @@ def load_history( entry = json.loads(line) except json.JSONDecodeError: continue - if _is_future_schema(entry): - continue - if evals_only and not _has_suite_kind(entry, "eval"): - continue - if tests_only and not _has_suite_kind(entry, "test"): - continue - if model and (entry.get("evals") or {}).get("model") != model: - continue - if suite and suite not in entry.get("suites", {}): - continue - entries.append(entry) + filtered = _apply_entry_filters( + entry, + evals_only=evals_only, + tests_only=tests_only, + model=model, + suite=suite, + ) + if filtered is not None: + entries.append(filtered) entries.sort(key=lambda e: e.get("timestamp", "")) if n is not None: @@ 
-125,6 +123,43 @@ def load_history( return entries +def _apply_entry_filters( + entry: dict[str, Any], + *, + evals_only: bool, + tests_only: bool, + model: str | None, + suite: str | None, +) -> dict[str, Any] | None: + """Apply CLI filters to a single history entry. + + Returns the (possibly suite-pruned) entry to keep, or None to drop it. + `--model` / `--suite` operate at the suite level: any suite in the run + that matches keeps the entry alive, with non-matching suites pruned out. + """ + if _is_future_schema(entry): + return None + if evals_only and not _has_suite_kind(entry, "eval"): + return None + if tests_only and not _has_suite_kind(entry, "test"): + return None + if model is None and suite is None: + return entry + + kept_suites: dict[str, Any] = {} + for sname, sdata in entry.get("suites", {}).items(): + if not isinstance(sdata, dict): + continue + if model is not None and sdata.get("model") != model: + continue + if suite is not None and sname != suite: + continue + kept_suites[sname] = sdata + if not kept_suites: + return None + return {**entry, "suites": kept_suites} + + def _has_suite_kind(entry: dict[str, Any], kind: str) -> bool: """Check if entry has at least one suite with the given kind.""" suites = entry.get("suites", {}) @@ -177,6 +212,47 @@ def load_previous_run( return None +def _current_git_head() -> str | None: + """Return the current HEAD short SHA, or None when not in a git repo.""" + try: + return subprocess.run( + ["git", "rev-parse", "HEAD"], # noqa: S607 + capture_output=True, + text=True, + timeout=5, + check=True, + ).stdout.strip() + except (FileNotFoundError, subprocess.CalledProcessError): + return None + + +def is_dirty_entry(entry: dict[str, Any], current_commit: str | None) -> bool: + """Return True if `entry` was produced on a dirty working tree at HEAD.""" + if not current_commit: + return False + git = entry.get("git") or {} + return bool(git.get("dirty")) and git.get("commit") == current_commit + + +def 
count_dirty_entries(history_dir: Path | None = None) -> int: + """Count entries `clean_dirty()` would remove (without touching the file).""" + path = (history_dir or DEFAULT_HISTORY_DIR) / HISTORY_FILE + if not path.exists(): + return 0 + current_commit = _current_git_head() + if not current_commit: + return 0 + count = 0 + for line in path.read_text().strip().splitlines(): + try: + entry = json.loads(line) + except json.JSONDecodeError: + continue + if is_dirty_entry(entry, current_commit): + count += 1 + return count + + def clean_dirty(history_dir: Path | None = None) -> int: """Remove entries where git.dirty=True AND git.commit matches current HEAD. @@ -190,15 +266,8 @@ def clean_dirty(history_dir: Path | None = None) -> int: if not path.exists(): return 0 - try: - current_commit = subprocess.run( - ["git", "rev-parse", "HEAD"], # noqa: S607 - capture_output=True, - text=True, - timeout=5, - check=True, - ).stdout.strip() - except (FileNotFoundError, subprocess.CalledProcessError): + current_commit = _current_git_head() + if not current_commit: return 0 with open(path, "r+") as f, _exclusive_file_lock(f): @@ -213,8 +282,7 @@ def clean_dirty(history_dir: Path | None = None) -> int: except json.JSONDecodeError: kept.append(line) continue - git = entry.get("git") or {} - if git.get("dirty") and git.get("commit") == current_commit: + if is_dirty_entry(entry, current_commit): removed += 1 else: kept.append(line) diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index 446c083..a7dfdea 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -58,8 +58,12 @@ def _format_test_name(result: TestResult, include_suite: bool = False) -> str: return name -def _format_eval_scores_inline(result: TestResult) -> str: - """Format eval scores for inline display — ASCII version (no glyphs).""" +def _format_eval_scores_inline(result: TestResult, short: bool = False) -> str: + """Format eval scores for inline display — ASCII version (no glyphs). 
+ + When `short=True`, only failing/skipped scores are shown — passing scores + are hidden to keep the output readable on large suites. + """ if not result.eval_payload: return "" parts: list[str] = [] @@ -67,6 +71,8 @@ def _format_eval_scores_inline(result: TestResult) -> str: if entry.skipped: parts.append(f"{name}=skip") continue + if short and entry.passed: + continue val = entry.value if isinstance(val, bool): parts.append(f"{name}={'pass' if val else 'fail'}") @@ -88,10 +94,12 @@ def __init__( verbosity: int = 0, show_logs: str | None = None, show_output: bool = False, + short: bool = False, ) -> None: self._verbosity = verbosity self._show_logs = show_logs self._show_output = show_output + self._short = short self._is_parallel = False self._failed_results: list[TestResult] = [] self._error_results: list[TestResult] = [] @@ -107,6 +115,7 @@ def activate(cls, ctx: PluginContext) -> Self | None: verbosity=ctx.get("verbosity", 0), show_logs=ctx.get("show_logs"), show_output=ctx.get("show_output", False), + short=ctx.get("short", False), ) return None @@ -223,7 +232,11 @@ def on_test_pass(self, result: TestResult) -> None: retry_suffix = "" if result.max_attempts > 1: retry_suffix = f" [attempt {result.attempt}/{result.max_attempts}]" - scores_str = _format_eval_scores_inline(result) if result.is_eval else "" + scores_str = ( + _format_eval_scores_inline(result, short=self._short) + if result.is_eval + else "" + ) print(f" OK {name} ({duration}){scores_str}{retry_suffix}") if self._show_output and result.is_eval: self._print_eval_detail(result) diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 22622c6..57ab433 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -61,8 +61,12 @@ def _format_test_name(result: TestResult) -> str: return label.replace("[", "\\[") -def _format_eval_scores_inline(result: TestResult) -> str: - """Format eval scores for inline display (e.g. 
' bg_score=0.8 char_id=1.0').""" +def _format_eval_scores_inline(result: TestResult, short: bool = False) -> str: + """Format eval scores for inline display (e.g. ' bg_score=0.8 char_id=1.0'). + + When `short=True`, only failing/skipped scores are shown — passing scores + are hidden to keep the output readable on large suites. + """ if not result.eval_payload: return "" parts = [] @@ -70,6 +74,8 @@ def _format_eval_scores_inline(result: TestResult) -> str: if entry.skipped: parts.append(f"{name}=⊘") continue + if short and entry.passed: + continue val = entry.value if isinstance(val, bool): parts.append(f"{name}={'✓' if val else '✗'}") @@ -91,6 +97,7 @@ def __init__( verbosity: int = 0, show_logs: str | None = None, show_output: bool = False, + short: bool = False, ) -> None: from rich.console import Console # noqa: PLC0415 — optional dep, lazy @@ -98,6 +105,7 @@ def __init__( self._verbosity = verbosity self._show_logs = show_logs self._show_output = show_output + self._short = short self._failed_results: list[TestResult] = [] self._error_results: list[TestResult] = [] @@ -129,6 +137,7 @@ def activate(cls, ctx: PluginContext) -> Self | None: verbosity=ctx.get("verbosity", 0), show_logs=ctx.get("show_logs"), show_output=ctx.get("show_output", False), + short=ctx.get("short", False), ) def _print(self, message: str) -> None: @@ -265,7 +274,11 @@ def on_test_pass(self, result: TestResult) -> None: retry_suffix = ( f" [dim]\\[attempt {result.attempt}/{result.max_attempts}][/]" ) - scores_str = _format_eval_scores_inline(result) if result.is_eval else "" + scores_str = ( + _format_eval_scores_inline(result, short=self._short) + if result.is_eval + else "" + ) self._print( f" [green]✓[/] {name} [dim]({duration})[/]{scores_str}{retry_suffix}" ) diff --git a/tests/test_history_cli.py b/tests/test_history_cli.py index b19e5cb..e5f6654 100644 --- a/tests/test_history_cli.py +++ b/tests/test_history_cli.py @@ -1,12 +1,12 @@ """Tests for `protest history` CLI argument 
parsing. -Covers mutually-exclusive flag groups: -- Action: `--runs` / `--show` / `--compare` -- Kind: `--evals` / `--tests` +The CLI uses sub-commands (`list`, `runs`, `show`, `compare`, `clean`). +`list` is the implicit default when no sub-command is given. Each sub-command +shares a common filter parser (`--tail`, `--model`, `--suite`, `--evals`/ +`--tests`, `--path`); `--evals` and `--tests` remain mutually exclusive. -`handle_history_command(argv)` triggers `SystemExit(2)` from argparse when a -mutex is violated. Tests assert both the exit code and the stderr message -mentioning the conflicting flag. +`handle_history_command(argv)` triggers `SystemExit(2)` from argparse on a +parsing error, and `SystemExit(0)` on a clean (possibly empty-history) run. """ from __future__ import annotations @@ -22,24 +22,19 @@ from pathlib import Path -class TestActionMutex: - """`--runs`, `--show`, `--compare` cannot be combined.""" +class TestKindMutex: + """`--evals` and `--tests` cannot be combined within a sub-command.""" @pytest.mark.parametrize( - ("argv", "expected_flag"), + "argv", [ - (["--runs", "--compare"], "--compare"), - (["--compare", "--runs"], "--runs"), - (["--runs", "--show", "0"], "--show"), - (["--show", "0", "--runs"], "--runs"), - (["--show", "1", "--compare"], "--compare"), - (["--compare", "--show", "1"], "--show"), + ["list", "--evals", "--tests"], + ["runs", "--tests", "--evals"], ], ) def test_mutex_violation_exits_with_error( self, argv: list[str], - expected_flag: str, capsys: pytest.CaptureFixture[str], ) -> None: with pytest.raises(SystemExit) as exc_info: @@ -47,74 +42,76 @@ def test_mutex_violation_exits_with_error( assert exc_info.value.code == 2 stderr = capsys.readouterr().err assert "not allowed with argument" in stderr - assert expected_flag in stderr -class TestKindMutex: - """`--evals` and `--tests` cannot be combined.""" +class TestSubcommandsAccepted: + """Each sub-command parses cleanly with shared filters.""" 
@pytest.mark.parametrize( "argv", [ - ["--evals", "--tests"], - ["--tests", "--evals"], + ["list"], + ["runs"], + ["show"], + ["show", "0"], + ["compare"], + ["clean"], + ["list", "--evals"], + ["list", "--tests"], + ["runs", "--tail", "5"], + ["show", "1", "--model", "gpt-4"], + ["compare", "--suite", "my_suite"], ], ) - def test_mutex_violation_exits_with_error( + def test_subcommand_parses_with_empty_history( self, argv: list[str], + tmp_path: Path, capsys: pytest.CaptureFixture[str], ) -> None: + full_argv = [*argv, "--path", str(tmp_path)] with pytest.raises(SystemExit) as exc_info: - handle_history_command(argv) - assert exc_info.value.code == 2 - stderr = capsys.readouterr().err - assert "not allowed with argument" in stderr + handle_history_command(full_argv) + # Empty history exits 0 with "No history found." (or similar). + assert exc_info.value.code == 0 + captured = capsys.readouterr() + assert "not allowed with argument" not in captured.err -class TestMutexIndependence: - """Flags from different groups can be combined freely.""" +class TestImplicitListDefault: + """`protest history` with no sub-command falls back to `list`.""" - @pytest.mark.parametrize( - "action_flags", - [ - ["--runs"], - ["--compare"], - ["--show", "0"], - ], - ) - @pytest.mark.parametrize("kind_flag", ["--evals", "--tests"]) - def test_cross_group_combinations_parse_cleanly( - self, - action_flags: list[str], - kind_flag: str, - tmp_path: Path, - capsys: pytest.CaptureFixture[str], + def test_no_subcommand_runs_list( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] ) -> None: - argv = [*action_flags, kind_flag, "--path", str(tmp_path)] with pytest.raises(SystemExit) as exc_info: - handle_history_command(argv) + handle_history_command(["--path", str(tmp_path)]) assert exc_info.value.code == 0 - captured = capsys.readouterr() - assert "not allowed with argument" not in captured.err + def test_no_subcommand_with_only_filter_runs_list( + self, tmp_path: Path, capsys: 
pytest.CaptureFixture[str] + ) -> None: + # `protest history --tail 5 --path X` should be parsed as the + # implicit `list --tail 5 --path X`, not as a parser error. + with pytest.raises(SystemExit) as exc_info: + handle_history_command(["--tail", "5", "--path", str(tmp_path)]) + assert exc_info.value.code == 0 -class TestHelpShowsMutex: - """`--help` output surfaces both mutex groups in usage line.""" - def test_help_output_shows_action_and_kind_groups( - self, capsys: pytest.CaptureFixture[str] - ) -> None: +class TestHelpOutput: + """`--help` lists the sub-commands.""" + + def test_help_lists_subcommands(self, capsys: pytest.CaptureFixture[str]) -> None: with pytest.raises(SystemExit) as exc_info: handle_history_command(["--help"]) assert exc_info.value.code == 0 stdout = capsys.readouterr().out - assert "[--runs | --show [N] | --compare]" in stdout - assert "[--evals | --tests]" in stdout + for cmd in ("list", "runs", "show", "compare", "clean"): + assert cmd in stdout class TestRunsOrderRecentFirst: - """`--runs` lists most-recent run first (git log convention). + """`runs` lists most-recent run first (git log convention). Storage returns entries oldest→newest; the CLI must reverse for display so #1 maps to the newest run, matching `git stash list` / `git log`. @@ -148,7 +145,7 @@ def test_runs_displays_newest_first( ("2026-04-25T12:00:00", "newabcd"), ], ) - handle_history_command(["--runs", "--path", str(tmp_path)]) + handle_history_command(["runs", "--path", str(tmp_path)]) stdout = capsys.readouterr().out # #1 is newest, #3 is oldest. assert stdout.index("#1") < stdout.index("#2") < stdout.index("#3") @@ -158,3 +155,27 @@ def test_runs_displays_newest_first( # And #1 lines up with the newest commit, not the oldest. 
newest_line = next(line for line in stdout.splitlines() if "#1" in line)
         assert "newabcd" in newest_line
+
+
+class TestCleanDryRun:
+    """`clean` is dry-run by default; `--apply` to actually modify the file."""
+
+    def test_clean_default_is_dry_run(
+        self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+    ) -> None:
+        # Empty history is the simplest case — both modes should report
+        # "No dirty entries to clean." without touching anything.
+        with pytest.raises(SystemExit) as exc_info:
+            handle_history_command(["clean", "--path", str(tmp_path)])
+        assert exc_info.value.code == 0
+        out = capsys.readouterr().out
+        assert "No dirty entries to clean." in out
+
+    def test_clean_apply_flag_accepted(
+        self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+    ) -> None:
+        with pytest.raises(SystemExit) as exc_info:
+            handle_history_command(["clean", "--apply", "--path", str(tmp_path)])
+        assert exc_info.value.code == 0
+        out = capsys.readouterr().out
+        assert "No dirty entries to clean." in out

From db671a6ca0ecd8d9e1131185678f0d4f862137f4 Mon Sep 17 00:00:00 2001
From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com>
Date: Tue, 28 Apr 2026 07:12:37 +0200
Subject: [PATCH 57/60] fix(history): refuse cross-model compare to avoid
 phantom regressions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`protest history compare` previously flattened cases across all suites
in the two most recent runs. When the runs contained suites under
different ModelLabels (e.g. rules_v1 + rules_v2 in a multi-model
session), a case-id present under both models would surface as
"regressed" or "fixed" depending on which suite the diff happened to
scan first.

Reported by the v3 naive-agent test: 5 strictly-identical runs produced
fake "Regressions: T010, T016" because T010 passed under v2 and failed
under v1 — the diff conflated the two contexts.

Fix: detect distinct ModelLabel.names across the two compared entries and refuse to run when more than one is present, asking the user to disambiguate via --model NAME or --suite NAME (which already suite-prune entries at load time, leaving a single-model comparison). Two new tests cover the rejection and the --model-disambiguated success path. Top-level `protest --help` epilog and the test-bed MISSION.md also get a small refresh to use the new sub-command syntax (`protest history compare/runs/clean`) rather than the now-removed flag-as-mode form. --- protest/cli/history.py | 25 +++++++++++++++ protest/cli/main.py | 5 +-- tests/test_history_cli.py | 66 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 2 deletions(-) diff --git a/protest/cli/history.py b/protest/cli/history.py index 50cf34d..563dd9c 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -130,6 +130,19 @@ def _dispatch_view(action: str, nth: int, entries: list[dict[str, Any]]) -> None if len(entries) < 2: print("Need at least 2 runs to compare.") sys.exit(1) + # Refuse to compare across multiple models silently. When two runs + # contain suites with several distinct model labels (e.g. rules_v1 + # and rules_v2 in the same multi-model session), the case-name diff + # would conflate the two contexts and emit phantom regressions. + # Force the user to disambiguate via --model NAME or --suite NAME. + models = _models_in_entries([entries[-1], entries[-2]]) + if len(models) > 1: + print( + "Cannot compare runs that contain multiple models: " + f"{sorted(models)}. Pass --model NAME to compare runs of " + "the same model, or --suite NAME to focus on one suite." 
+ ) + sys.exit(1) out.compare(entries[-1], entries[-2]) elif action == "show": if nth >= len(entries): @@ -142,6 +155,18 @@ def _dispatch_view(action: str, nth: int, entries: list[dict[str, Any]]) -> None out.stats(entries) +def _models_in_entries(entries: list[dict[str, Any]]) -> set[str]: + """Collect distinct, non-empty model labels across the given entries.""" + models: set[str] = set() + for entry in entries: + for sdata in entry.get("suites", {}).values(): + if isinstance(sdata, dict): + model = sdata.get("model") + if model: + models.add(model) + return models + + # --------------------------------------------------------------------------- # Output abstraction — Rich if available, plain text fallback # --------------------------------------------------------------------------- diff --git a/protest/cli/main.py b/protest/cli/main.py index 4aaab5f..2fcc5b1 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -27,8 +27,9 @@ protest eval demo:session Run all evaluations protest eval demo:session --show-output Show inputs/output/expected per case protest history --evals Show eval suite trends - protest history --evals --tail 5 Show last 5 entries - protest history --evals --compare Compare 2 most recent runs + protest history runs --evals Run-by-run breakdown + protest history compare --evals Compare 2 most recent runs + protest history clean Preview removable dirty entries protest live Start live reporter server protest tags list demo:session List all available tags """ diff --git a/tests/test_history_cli.py b/tests/test_history_cli.py index e5f6654..d8b9f3c 100644 --- a/tests/test_history_cli.py +++ b/tests/test_history_cli.py @@ -157,6 +157,72 @@ def test_runs_displays_newest_first( assert "newabcd" in newest_line +class TestCompareRefusesMixedModels: + """`compare` must not silently diff across models — would cause false regressions. + + When the two most recent runs each contain suites with several distinct + `ModelLabel.name`s (e.g. 
`rules_v1` + `rules_v2` in a multi-model
+    session), flattening the cases by name conflates contexts: a case-id that
+    passes under one model and fails under the other shows up as a phantom
+    regression. The CLI rejects this and asks the user to disambiguate via
+    `--model NAME` or `--suite NAME`.
+    """
+
+    def _seed_two_model_run(self, tmp_path: Path, run_id: str, ts: str) -> None:
+        path = tmp_path / HISTORY_FILE
+        append_entry(
+            path,
+            {
+                "schema_version": 1,
+                "run_id": run_id,
+                "timestamp": ts,
+                "git": {"commit_short": run_id},
+                "suites": {
+                    "helpdesk_v1": {
+                        "kind": "eval",
+                        "model": "rules_v1",
+                        "passed": 9,
+                        "total_cases": 18,
+                        "cases": {"T010": {"passed": False, "case_hash": "h1"}},
+                    },
+                    "helpdesk_v2": {
+                        "kind": "eval",
+                        "model": "rules_v2",
+                        "passed": 11,
+                        "total_cases": 18,
+                        "cases": {"T010": {"passed": True, "case_hash": "h1"}},
+                    },
+                },
+            },
+        )
+
+    def test_compare_rejects_mixed_models_without_filter(
+        self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+    ) -> None:
+        self._seed_two_model_run(tmp_path, "aaa1111", "2026-04-27T10:00:00")
+        self._seed_two_model_run(tmp_path, "bbb2222", "2026-04-27T11:00:00")
+        with pytest.raises(SystemExit) as exc_info:
+            handle_history_command(["compare", "--evals", "--path", str(tmp_path)])
+        assert exc_info.value.code == 1
+        out = capsys.readouterr().out
+        assert "multiple models" in out
+        assert "rules_v1" in out and "rules_v2" in out
+        assert "--model" in out
+
+    def test_compare_with_model_filter_succeeds(
+        self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+    ) -> None:
+        self._seed_two_model_run(tmp_path, "aaa1111", "2026-04-27T10:00:00")
+        self._seed_two_model_run(tmp_path, "bbb2222", "2026-04-27T11:00:00")
+        # `--model rules_v1` prunes helpdesk_v2 out of each entry, leaving
+        # a single-model comparison that should succeed (no false regression).
+ handle_history_command( + ["compare", "--evals", "--model", "rules_v1", "--path", str(tmp_path)] + ) + out = capsys.readouterr().out + assert "multiple models" not in out + + class TestCleanDryRun: """`clean` is dry-run by default; `--apply` to actually modify the file.""" From 37d5c09882f39a41ccd7b87fff54582715454ae7 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 28 Apr 2026 12:46:59 +0200 Subject: [PATCH 58/60] refactor(evals): split Evaluator __call__/run, require @evaluator at registration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The single Evaluator.__call__ that switched on isinstance(args[0], EvalContext) forced an Any-typed signature and produced the surprising f is f() identity for the no-kwargs case. Split into __call__(**kwargs) for rebinding and run(ctx) for execution: each method is monomorphic and pyright can read it without overloads. Plain callables are no longer accepted in evaluators=[...]. validate_evaluators runs at registration boundaries (make_eval_wrapper, EvalCase, ShortCircuit) and raises a clear TypeError pointing at @evaluator. The executor then operates on a uniform Evaluator | ShortCircuit Union — the only remaining isinstance is the narrowing on that real disjoint Union. 
--- docs/evals.md | 2 +- protest/evals/evaluator.py | 90 +++++++++++++++++++----- protest/evals/suite.py | 10 ++- protest/evals/wrapper.py | 36 +++++----- tests/evals/test_e2e.py | 60 ++++++++-------- tests/evals/test_evaluator_validation.py | 57 +++++++++++++++ 6 files changed, 185 insertions(+), 70 deletions(-) create mode 100644 tests/evals/test_evaluator_validation.py diff --git a/docs/evals.md b/docs/evals.md index f1bb40e..2831f8f 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -146,7 +146,7 @@ protest eval evals.session:session --no-tag slow ## Evaluators -An evaluator is a function decorated with `@evaluator` that receives an `EvalContext` and returns a verdict. +An evaluator is a function decorated with `@evaluator` that receives an `EvalContext` and returns a verdict. The decorator is mandatory: passing a plain function in `evaluators=[...]` raises `TypeError` at registration. The wrapping is what gives the evaluator its identity (used for hashing, history, reporting) and a typed `run(ctx)` method — there's no implicit conversion. !!! info "If your eval task returns a non-string output" diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index b493967..8f3927d 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -1,22 +1,29 @@ -"""Evaluator primitives — functions, not classes. +"""Evaluator primitives. -An evaluator is a callable that receives an EvalContext and returns a score. -The @evaluator decorator adds partial-application ergonomics: +An evaluator is a function decorated with ``@evaluator`` that receives an +``EvalContext`` and returns a verdict. 
The decorator wraps the function in an +``Evaluator`` instance that carries identity (for hashing/history) and exposes +two distinct entry points: + +- ``ev(keyword=value, ...)`` — bind params, return a new ``Evaluator`` +- ``ev.run(ctx)`` — execute against an ``EvalContext`` (called by the framework) + +Plain callables are not accepted in ``evaluators=[...]``; use ``@evaluator``:: @evaluator def contains_keywords(ctx: EvalContext, keywords: list[str]) -> ContainsKeywordsResult: found = sum(1 for k in keywords if k.lower() in ctx.output.lower()) return ContainsKeywordsResult(keyword_recall=found / len(keywords), ...) - # Bind params → returns a callable(ctx) via functools.partial + # Bind params → returns a fresh Evaluator with kwargs frozen in. evaluators=[contains_keywords(keywords=["paris", "france"])] - # No params → use directly + # No params → use the bare Evaluator directly. @evaluator def not_empty(ctx: EvalContext) -> bool: return bool(ctx.output.strip()) -Async evaluators are supported: +Async evaluators are supported:: @evaluator async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: @@ -155,6 +162,7 @@ def __post_init__(self) -> None: "EvalCase.name must be a non-empty string " "(used for history tracking and case identity)." ) + validate_evaluators(self.evaluators) def __repr__(self) -> str: return self.name @@ -177,7 +185,8 @@ class ShortCircuit: ] """ - def __init__(self, evaluators: list[Any]) -> None: + def __init__(self, evaluators: list[Evaluator]) -> None: + validate_evaluators(evaluators, _inside_short_circuit=True) self.evaluators = evaluators def evaluator_identity(self) -> dict[str, Any]: @@ -185,6 +194,40 @@ def evaluator_identity(self) -> dict[str, Any]: return {"short_circuit": [_canonical(e) for e in self.evaluators]} +def validate_evaluators( + items: list[Any], *, _inside_short_circuit: bool = False +) -> None: + """Reject anything that isn't a registered Evaluator (or ShortCircuit). 
+ + ``@evaluator`` is the only sanctioned path to producing an evaluator. Plain + callables used to be accepted, which forced a runtime ``isinstance`` dispatch + in the executor and made the evaluators list type effectively ``list[Any]``. + Failing loud at registration moves the error to the boundary and lets + downstream code work on a uniform ``Evaluator | ShortCircuit`` Union. + """ + for item in items: + if isinstance(item, Evaluator): + continue + if isinstance(item, ShortCircuit) and not _inside_short_circuit: + continue + if _inside_short_circuit and isinstance(item, ShortCircuit): + raise TypeError( + "ShortCircuit cannot nest another ShortCircuit; " + "flatten the inner evaluators into the outer group." + ) + if callable(item): + raise TypeError( + f"{item!r} is a plain callable, not an Evaluator. " + "Wrap it with @evaluator (from protest.evals) so it carries " + "identity, hashing, and a typed run() method." + ) + raise TypeError( + f"Expected Evaluator or ShortCircuit, got {type(item).__name__}. " + "Only objects produced by @evaluator (or ShortCircuit groups) " + "are accepted in evaluators=[...]." + ) + + class Metric: """Annotate a float/int field as a metric for stats aggregation.""" @@ -232,10 +275,14 @@ def extract_scores_from_result(result: Any, evaluator_name: str) -> list[Any]: class Evaluator: """A configured evaluator — callable with identity for hashing. - Created by the ``@evaluator`` decorator. Supports two calling modes: + Created by the ``@evaluator`` decorator. Two distinct entry points: + + - ``ev(keyword=value, ...)`` — bind params, return a new Evaluator + - ``ev.run(ctx)`` — execute against an EvalContext - 1. ``ev(ctx)`` — evaluate directly (first arg is EvalContext) - 2. 
``ev(keyword=value, ...)`` — bind params, return a new Evaluator + Splitting these avoids the "callable that does two things based on the + type of arg[0]" anti-pattern: each method has a single, monomorphic + signature that type checkers can read without overload gymnastics. """ __slots__ = ("_fn", "_kwargs", "_name", "_qualname") @@ -252,16 +299,15 @@ def __init__( def name(self) -> str: return self._name - def __call__(self, *args: Any, **kwargs: Any) -> Any: - if args and isinstance(args[0], EvalContext): - merged = {**self._kwargs, **kwargs} - return self._fn(*args, **merged) - # Re-binding form (no EvalContext): always returns a fresh clone. - # Returning `self` for the no-kwargs case used to make `f is f()` - # accidentally true, which surprised users expecting `()` to behave - # like an evaluator constructor. + def __call__(self, **kwargs: Any) -> Evaluator: + # Re-binding form: always returns a fresh clone. Returning `self` + # for the no-kwargs case used to make `f is f()` accidentally true, + # which surprised users expecting `()` to behave like a constructor. return Evaluator(self._fn, {**self._kwargs, **kwargs}) + def run(self, ctx: EvalContext[Any, Any], /) -> Any: + return self._fn(ctx, **self._kwargs) + def evaluator_identity(self) -> dict[str, Any]: identity: dict[str, Any] = {"fn": self._qualname} if self._kwargs: @@ -276,5 +322,11 @@ def __repr__(self) -> str: def evaluator(fn: Callable[..., Any]) -> Evaluator: - """Turn a function into a ProTest evaluator.""" + """Turn a function into a ProTest evaluator. + + The decorator is the only sanctioned way to produce an object that + ``evaluators=[...]`` will accept. Plain callables are rejected at + registration so the executor can rely on a uniform Union type instead + of dispatching at runtime. 
+ """ return Evaluator(fn) diff --git a/protest/evals/suite.py b/protest/evals/suite.py index 4971e17..67e277c 100644 --- a/protest/evals/suite.py +++ b/protest/evals/suite.py @@ -9,8 +9,9 @@ from protest.evals.wrapper import make_eval_wrapper if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Sequence + from protest.evals.evaluator import Evaluator, ShortCircuit from protest.evals.types import Judge, ModelLabel FuncT = TypeVar("FuncT", bound="Callable[..., object]") @@ -65,7 +66,7 @@ def model(self) -> ModelLabel | None: def eval( self, - evaluators: list[Any] | None = None, + evaluators: Sequence[Evaluator | ShortCircuit] | None = None, tags: list[str] | None = None, timeout: float | None = None, judge: Judge | None = None, @@ -83,9 +84,12 @@ def eval( def decorator(func: FuncT) -> FuncT: resolved_judge = judge or self._judge + evals_list: list[Evaluator | ShortCircuit] = ( + list(evaluators) if evaluators else [] + ) wrapper = make_eval_wrapper( func, - evaluators or [], + evals_list, judge=resolved_judge, ) self.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index 9601a7c..3f07cc3 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -20,6 +20,7 @@ Evaluator, ShortCircuit, extract_scores_from_result, + validate_evaluators, ) from protest.evals.hashing import compute_case_hash, compute_eval_hash from protest.evals.types import EvalScore, TaskResult @@ -32,12 +33,13 @@ def make_eval_wrapper( func: Any, - evaluators: list[Any], + evaluators: list[Evaluator | ShortCircuit], judge: Any = None, ) -> Any: """Wrap a function to run evaluators on its return value.""" _validate_single_evalcase_param(func) + validate_evaluators(evaluators) @functools.wraps(func) async def eval_wrapper(**kwargs: Any) -> EvalPayload: @@ -207,7 +209,7 @@ def _extract_per_case_evaluators(kwargs: dict[str, Any]) -> list[Any]: async def run_evaluators( - 
evaluators: list[Any], + evaluators: list[Evaluator | ShortCircuit], case_name: str, inputs: Any, output: Any, @@ -216,7 +218,12 @@ async def run_evaluators( duration: float, judge: Any = None, ) -> tuple[list[EvalScore], EvalContext[Any, Any]]: - """Run evaluators and return (scores, ctx with judge stats).""" + """Run evaluators and return (scores, ctx with judge stats). + + Callers must have validated the list (Evaluator | ShortCircuit only) at the + boundary; the loop below trusts the Union and uses isinstance solely to + narrow it — the only legitimate isinstance kept in this module. + """ ctx = EvalContext( name=case_name, inputs=inputs, @@ -233,40 +240,35 @@ async def run_evaluators( scores.extend(await _run_short_circuit(ev.evaluators, ctx)) continue - evaluator_name = ev.name if isinstance(ev, Evaluator) else type(ev).__name__ try: - raw = ev(ctx) + raw = ev.run(ctx) result = await raw if asyncio.iscoroutine(raw) else raw - scores.extend(extract_scores_from_result(result, evaluator_name)) + scores.extend(extract_scores_from_result(result, ev.name)) except Exception as exc: - raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc + raise FixtureError(f"evaluator '{ev.name}'", exc) from exc return scores, ctx async def _run_short_circuit( - evaluators: list[Any], + evaluators: list[Evaluator], ctx: EvalContext[Any, Any], ) -> list[EvalScore]: """Run evaluators in order, stop at first Verdict=False.""" scores: list[EvalScore] = [] for i, ev in enumerate(evaluators): - evaluator_name = ev.name if isinstance(ev, Evaluator) else type(ev).__name__ try: - raw = ev(ctx) + raw = ev.run(ctx) result = await raw if asyncio.iscoroutine(raw) else raw except Exception as exc: - raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc - extracted = extract_scores_from_result(result, evaluator_name) + raise FixtureError(f"evaluator '{ev.name}'", exc) from exc + extracted = extract_scores_from_result(result, ev.name) scores.extend(extracted) if 
any(s.is_verdict and not s.passed for s in extracted): # Mark remaining evaluators as skipped for skipped_ev in evaluators[i + 1 :]: - skipped_name = ( - skipped_ev.name - if isinstance(skipped_ev, Evaluator) - else type(skipped_ev).__name__ + scores.append( + EvalScore(name=skipped_ev.name, value=False, skipped=True) ) - scores.append(EvalScore(name=skipped_name, value=False, skipped=True)) break return scores diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 7daf058..75def3c 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -717,14 +717,14 @@ def _make_ctx(self, output: str, expected: str | None = None) -> EvalContext: def test_contains_keywords(self) -> None: e = contains_keywords(keywords=["hello", "world"]) - result = e(self._make_ctx("Hello World")) + result = e.run(self._make_ctx("Hello World")) assert result.keyword_recall == 1.0 assert result.all_keywords_present is True def test_contains_keywords_default_requires_all(self) -> None: """Default `min_recall=1.0` means strict: missing one → verdict False.""" e = contains_keywords(keywords=["hello", "world"]) - result = e(self._make_ctx("Only hello here")) + result = e.run(self._make_ctx("Only hello here")) assert result.keyword_recall == 0.5 assert result.all_keywords_present is False @@ -736,38 +736,38 @@ def test_contains_keywords_threshold_continuity_at_zero(self) -> None: Now `recall >= min_recall` applies uniformly. 
""" e = contains_keywords(keywords=["alpha", "beta"], min_recall=0.0) - result = e(self._make_ctx("nothing matches")) + result = e.run(self._make_ctx("nothing matches")) assert result.keyword_recall == 0.0 assert result.all_keywords_present is True def test_contains_keywords_threshold_at_exact_value(self) -> None: """Verdict passes when recall equals the threshold exactly.""" e = contains_keywords(keywords=["alpha", "beta"], min_recall=0.5) - result = e(self._make_ctx("only alpha here")) + result = e.run(self._make_ctx("only alpha here")) assert result.keyword_recall == 0.5 assert result.all_keywords_present is True def test_contains_keywords_threshold_just_below(self) -> None: """Verdict fails when recall is below the threshold.""" e = contains_keywords(keywords=["alpha", "beta", "gamma"], min_recall=0.5) - result = e(self._make_ctx("only alpha")) + result = e.run(self._make_ctx("only alpha")) assert abs(result.keyword_recall - 1 / 3) < 1e-9 assert result.all_keywords_present is False def test_contains_expected(self) -> None: e = contains_expected - assert e(self._make_ctx("Hello World", "world")) is True - assert e(self._make_ctx("Hello", "world")) is False + assert e.run(self._make_ctx("Hello World", "world")) is True + assert e.run(self._make_ctx("Hello", "world")) is False def test_does_not_contain(self) -> None: e = does_not_contain(forbidden=["cat", "dog"]) - assert e(self._make_ctx("Yorkshire")).no_forbidden_words is True - assert e(self._make_ctx("I like cats")).no_forbidden_words is False + assert e.run(self._make_ctx("Yorkshire")).no_forbidden_words is True + assert e.run(self._make_ctx("I like cats")).no_forbidden_words is False def test_not_empty(self) -> None: - assert not_empty(self._make_ctx("hello")) is True - assert not_empty(self._make_ctx("")) is False - assert not_empty(self._make_ctx(" ")) is False + assert not_empty.run(self._make_ctx("hello")) is True + assert not_empty.run(self._make_ctx("")) is False + assert not_empty.run(self._make_ctx(" 
")) is False def test_not_empty_handles_sized_containers(self) -> None: """Sized containers: empty -> False, non-empty -> True. @@ -779,63 +779,63 @@ def test_not_empty_handles_sized_containers(self) -> None: # Helper accepts Any at runtime; type hint is just a default. ctx_empty_list: Any = self._make_ctx("") ctx_empty_list.output = [] - assert not_empty(ctx_empty_list) is False + assert not_empty.run(ctx_empty_list) is False ctx_nonempty_list: Any = self._make_ctx("") ctx_nonempty_list.output = [1, 2] - assert not_empty(ctx_nonempty_list) is True + assert not_empty.run(ctx_nonempty_list) is True ctx_empty_dict: Any = self._make_ctx("") ctx_empty_dict.output = {} - assert not_empty(ctx_empty_dict) is False + assert not_empty.run(ctx_empty_dict) is False ctx_nonempty_dict: Any = self._make_ctx("") ctx_nonempty_dict.output = {"a": 1} - assert not_empty(ctx_nonempty_dict) is True + assert not_empty.run(ctx_nonempty_dict) is True ctx_empty_set: Any = self._make_ctx("") ctx_empty_set.output = set() - assert not_empty(ctx_empty_set) is False + assert not_empty.run(ctx_empty_set) is False def test_not_empty_unsized_objects_still_pass(self) -> None: """Non-Sized values (int, float, dataclass): always True (kept as-is).""" ctx_int: Any = self._make_ctx("") ctx_int.output = 42 - assert not_empty(ctx_int) is True + assert not_empty.run(ctx_int) is True ctx_zero: Any = self._make_ctx("") ctx_zero.output = 0 # 0 is not None, not Sized — still passes. 
- assert not_empty(ctx_zero) is True + assert not_empty.run(ctx_zero) is True def test_max_length(self) -> None: e = max_length(max_chars=5) - result = e(self._make_ctx("hi")) + result = e.run(self._make_ctx("hi")) assert result.within_limit is True - result = e(self._make_ctx("this is too long")) + result = e.run(self._make_ctx("this is too long")) assert result.within_limit is False def test_min_length(self) -> None: - assert min_length(min_chars=3)(self._make_ctx("hello")) is True - assert min_length(min_chars=10)(self._make_ctx("hi")) is False + assert min_length(min_chars=3).run(self._make_ctx("hello")) is True + assert min_length(min_chars=10).run(self._make_ctx("hi")) is False def test_matches_regex(self) -> None: e = matches_regex(pattern=r"\d{3}-\d{4}") - assert e(self._make_ctx("Call 555-1234")) is True - assert e(self._make_ctx("no numbers")) is False + assert e.run(self._make_ctx("Call 555-1234")) is True + assert e.run(self._make_ctx("no numbers")) is False def test_json_valid(self) -> None: e = json_valid(required_keys=["name"]) - result = e(self._make_ctx('{"name": "Rex"}')) + result = e.run(self._make_ctx('{"name": "Rex"}')) assert result.valid_json is True assert result.has_required_keys is True - result = e(self._make_ctx("not json")) + result = e.run(self._make_ctx("not json")) assert result.valid_json is False def test_word_overlap(self) -> None: e = word_overlap - assert e(self._make_ctx("hello world", "hello world")).overlap == 1.0 - assert e(self._make_ctx("hello there", "hello world")).overlap == 0.5 - assert e(self._make_ctx("foo", "hello world")).overlap == 0.0 + assert e.run(self._make_ctx("hello world", "hello world")).overlap == 1.0 + assert e.run(self._make_ctx("hello there", "hello world")).overlap == 0.5 + assert e.run(self._make_ctx("foo", "hello world")).overlap == 0.0 # --------------------------------------------------------------------------- diff --git a/tests/evals/test_evaluator_validation.py 
b/tests/evals/test_evaluator_validation.py new file mode 100644 index 0000000..584a988 --- /dev/null +++ b/tests/evals/test_evaluator_validation.py @@ -0,0 +1,57 @@ +"""Validation that evaluators=[...] only accepts @evaluator-wrapped objects. + +Plain callables and arbitrary values used to be silently accepted, forcing a +runtime ``isinstance`` dispatch in the executor. Validating at the boundary +turns the failure into a clear TypeError at registration time and lets the +downstream code work on a uniform ``Evaluator | ShortCircuit`` Union. +""" + +from __future__ import annotations + +import pytest + +from protest.evals.evaluator import ( + EvalCase, + EvalContext, + ShortCircuit, + evaluator, + validate_evaluators, +) + + +@evaluator +def _ok(ctx: EvalContext) -> bool: + return True + + +def _plain_callable(ctx: EvalContext) -> bool: + return True + + +class TestValidateEvaluators: + def test_accepts_evaluator(self) -> None: + validate_evaluators([_ok]) + + def test_accepts_short_circuit(self) -> None: + validate_evaluators([ShortCircuit([_ok])]) + + def test_rejects_plain_callable(self) -> None: + with pytest.raises(TypeError, match="@evaluator"): + validate_evaluators([_plain_callable]) + + def test_rejects_non_callable(self) -> None: + with pytest.raises(TypeError, match="Expected Evaluator or ShortCircuit"): + validate_evaluators(["not_an_evaluator"]) # type: ignore[list-item] + + def test_rejects_nested_short_circuit(self) -> None: + with pytest.raises(TypeError, match="cannot nest"): + ShortCircuit([ShortCircuit([_ok])]) # type: ignore[list-item] + + +class TestEvalCaseValidates: + def test_evalcase_rejects_plain_callable(self) -> None: + with pytest.raises(TypeError, match="@evaluator"): + EvalCase(inputs="x", name="c", evaluators=[_plain_callable]) + + def test_evalcase_accepts_evaluator(self) -> None: + EvalCase(inputs="x", name="c", evaluators=[_ok]) From 8e388ca37ac8eccb466f38cc3898a961bdd39749 Mon Sep 17 00:00:00 2001 From: Renaud Cepre 
<32103211+renaudcepre@users.noreply.github.com> Date: Tue, 28 Apr 2026 22:47:20 +0200 Subject: [PATCH 59/60] fix(evals,history): polish from naive-agent v4 feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - evals.md: EvalCase field table listed `tags` as a special metadata key while the example below used `tags=[...]` as a kwarg and the dataclass declares it first-class. Split into separate `tags` / `metadata` rows. - evals.md: history compare example now shows `--model NAME` with the rationale, so users hit the constraint at read time instead of via the runtime "multiple models" rejection. - history.py: Run Detail panel title now carries a "(+ pass · - fail)" legend; the +/- markers were unlabeled and required inference. --- docs/evals.md | 9 +++++++-- protest/cli/history.py | 6 +++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index 2831f8f..4e22920 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -119,7 +119,8 @@ cases = ForEach([ | `expected` | `Any` | Expected output (passed to evaluators as `ctx.expected_output`) | | `name` | `str` | Case identifier (used in test IDs and history) | | `evaluators` | `list` | Per-case evaluators (added to suite-level ones) | -| `metadata` | `dict` | Arbitrary metadata (special key: `"tags"` — see below) | +| `tags` | `list[str]` | First-class tags — flow to `protest eval --tag …` (see below) | +| `metadata` | `dict` | Arbitrary metadata, opaque to the framework | ### Why `EvalCase` and not a dict? @@ -663,7 +664,11 @@ protest history --evals --runs protest history --evals --show # Compare last two runs (fixed/regressed/new) -protest history --evals --compare +# Requires --model NAME if your history mixes multiple model labels +# (e.g. one suite per rules version) — comparing across labels is rejected +# to avoid phantom regressions where a case "fails" only because the two +# runs being diffed used different models. 
+protest history --evals --compare --model rules_v1 ``` ### Integrity Hashes diff --git a/protest/cli/history.py b/protest/cli/history.py index 563dd9c..19b6a97 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -364,7 +364,11 @@ def detail(self, entry: dict[str, Any]) -> None: self.console.print() self.console.print( - Panel(lines, title="[bold]Run Detail[/]", border_style="cyan") + Panel( + lines, + title="[bold]Run Detail[/] [dim]([green]+[/] pass · [red]-[/] fail)[/]", + border_style="cyan", + ) ) def compare(self, current: dict[str, Any], previous: dict[str, Any]) -> None: From 99d512f55da6b362c00c2e96f2b0340d6ce2ad18 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 28 Apr 2026 23:08:02 +0200 Subject: [PATCH 60/60] refactor(examples): rename yorkshire dataset.py to cases.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vestige from the pydantic-evals era — there is no Dataset concept in the native eval API. The file holds EvalCase instances, so cases.py matches the vocabulary used by EvalSuite, EvalCase, and the --last-failed CLI flag. 
--- examples/yorkshire/evals/{dataset.py => cases.py} | 2 +- examples/yorkshire/evals/session.py | 2 +- examples/yorkshire/session.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename examples/yorkshire/evals/{dataset.py => cases.py} (98%) diff --git a/examples/yorkshire/evals/dataset.py b/examples/yorkshire/evals/cases.py similarity index 98% rename from examples/yorkshire/evals/dataset.py rename to examples/yorkshire/evals/cases.py index 423ad76..f50eae9 100644 --- a/examples/yorkshire/evals/dataset.py +++ b/examples/yorkshire/evals/cases.py @@ -1,4 +1,4 @@ -"""Dataset for the Yorkshire chatbot evals.""" +"""Eval cases for the Yorkshire chatbot.""" from __future__ import annotations diff --git a/examples/yorkshire/evals/session.py b/examples/yorkshire/evals/session.py index 06d9b3f..e23f1d7 100644 --- a/examples/yorkshire/evals/session.py +++ b/examples/yorkshire/evals/session.py @@ -11,7 +11,7 @@ from typing import Annotated from examples.yorkshire.app.chatbot import yorkshire_chatbot -from examples.yorkshire.evals.dataset import ( +from examples.yorkshire.evals.cases import ( suite_evaluators, yorkshire_cases, ) diff --git a/examples/yorkshire/session.py b/examples/yorkshire/session.py index c4ffeb0..f1347b7 100644 --- a/examples/yorkshire/session.py +++ b/examples/yorkshire/session.py @@ -14,7 +14,7 @@ from typing import Annotated from examples.yorkshire.app.chatbot import yorkshire_chatbot -from examples.yorkshire.evals.dataset import suite_evaluators, yorkshire_cases +from examples.yorkshire.evals.cases import suite_evaluators, yorkshire_cases from examples.yorkshire.tests.fixtures import ( configure_kennel_logging, kennel,