From e54e194d11cfddbffff47608820866a25f1689a3 Mon Sep 17 00:00:00 2001
From: krisztianfekete
Date: Mon, 30 Mar 2026 18:43:28 +0200
Subject: [PATCH 1/2] add per eval time metrics to CLI

---
 src/agentevals/output.py |  25 +++++++--
 src/agentevals/runner.py |   6 +++
 tests/test_output.py     | 109 +++++++++++++++++++++++++++++++++++++++
 tests/test_runner.py     |   4 ++
 4 files changed, 140 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_output.py

diff --git a/src/agentevals/output.py b/src/agentevals/output.py
index bae7e3a..99898d0 100644
--- a/src/agentevals/output.py
+++ b/src/agentevals/output.py
@@ -8,6 +8,20 @@ from .runner import MetricResult, RunResult
 
 
 
+def _format_duration(ms: float | None) -> str:
+    if ms is None:
+        return ""
+    ms = round(ms)
+    if ms < 1000:
+        return f"{ms}ms"
+    seconds = ms / 1000
+    if seconds < 60:
+        return f"{seconds:.1f}s"
+    minutes = int(seconds // 60)
+    remaining = seconds - minutes * 60
+    return f"{minutes}m {remaining:.0f}s"
+
+
 def format_results(run_result: RunResult, fmt: str = "table") -> str:
     if fmt == "json":
         return _format_json(run_result)
@@ -56,6 +70,7 @@ def _format_table(run_result: RunResult) -> str:
                 score_str,
                 mr.eval_status,
                 per_inv,
+                _format_duration(mr.duration_ms),
                 error_str,
             ]
         )
@@ -63,7 +78,7 @@ def _format_table(run_result: RunResult) -> str:
     if rows:
         table = tabulate(
             rows,
-            headers=["", "Metric", "Score", "Status", "Per-Invocation", "Error"],
+            headers=["", "Metric", "Score", "Status", "Per-Invocation", "Time", "Error"],
             tablefmt="simple",
         )
         lines.append(table)
@@ -131,6 +146,7 @@ def _format_json(run_result: RunResult) -> str:
                 "score": mr.score,
                 "eval_status": mr.eval_status,
                 "per_invocation_scores": mr.per_invocation_scores,
+                "duration_ms": mr.duration_ms,
                 "error": mr.error,
             }
             if mr.details:
@@ -159,12 +175,13 @@ def _format_summary(run_result: RunResult) -> str:
         lines.append(f"Trace {tr.trace_id} ({tr.num_invocations} invocations):")
         for mr in tr.metric_results:
             icon = _status_icon(mr.eval_status)
+            duration_suffix = f" [{_format_duration(mr.duration_ms)}]" if mr.duration_ms is not None else ""
             if mr.error:
-                lines.append(f" {icon} {mr.metric_name}: ERROR - {mr.error}")
+                lines.append(f" {icon} {mr.metric_name}: ERROR - {mr.error}{duration_suffix}")
             elif mr.score is not None:
-                lines.append(f" {icon} {mr.metric_name}: {mr.score:.4f} ({mr.eval_status})")
+                lines.append(f" {icon} {mr.metric_name}: {mr.score:.4f} ({mr.eval_status}){duration_suffix}")
             else:
-                lines.append(f" {icon} {mr.metric_name}: N/A ({mr.eval_status})")
+                lines.append(f" {icon} {mr.metric_name}: N/A ({mr.eval_status}){duration_suffix}")
         lines.append("")
     return "\n".join(lines)
 
diff --git a/src/agentevals/runner.py b/src/agentevals/runner.py
index 2ac7d8d..392082c 100644
--- a/src/agentevals/runner.py
+++ b/src/agentevals/runner.py
@@ -5,6 +5,7 @@
 import asyncio
 import json
 import logging
+import time
 from collections.abc import Awaitable, Callable
 from typing import Any
 
@@ -39,6 +40,7 @@ class MetricResult(BaseModel):
     per_invocation_scores: list[float | None] = Field(default_factory=list)
     error: str | None = None
     details: dict[str, Any] | None = None
+    duration_ms: float | None = None
 
 
 class TraceResult(BaseModel):
@@ -234,6 +236,7 @@ async def _eval_builtin_with_semaphore(metric_name: str) -> MetricResult:
         async with eval_semaphore:
             if progress_callback:
                 await progress_callback(f"Running {metric_name}...")
+            t0 = time.monotonic()
             result = await evaluate_builtin_metric(
                 metric_name=metric_name,
                 actual_invocations=actual_invocations,
@@ -241,6 +244,7 @@ async def _eval_builtin_with_semaphore(metric_name: str) -> MetricResult:
                 judge_model=judge_model,
                 threshold=threshold,
             )
+            result.duration_ms = (time.monotonic() - t0) * 1000
             return await _append_result(result)
 
     async def _eval_custom_with_semaphore(evaluator_def: CustomEvaluatorDef) -> MetricResult:
@@ -249,11 +253,13 @@ async def _eval_custom_with_semaphore(evaluator_def: CustomEvaluatorDef) -> Metr
             await progress_callback(f"Running {evaluator_def.name}...")
 
             from .custom_evaluators import evaluate_custom_evaluator
+            t0 = time.monotonic()
             result = await evaluate_custom_evaluator(
                 evaluator_def=evaluator_def,
                 actual_invocations=actual_invocations,
                 expected_invocations=expected_invocations,
             )
+            result.duration_ms = (time.monotonic() - t0) * 1000
             return await _append_result(result)
 
     tasks = [_eval_builtin_with_semaphore(m) for m in metrics]
diff --git a/tests/test_output.py b/tests/test_output.py
new file mode 100644
index 0000000..b50a080
--- /dev/null
+++ b/tests/test_output.py
@@ -0,0 +1,109 @@
+import json
+
+from agentevals.output import _format_duration, format_results
+from agentevals.runner import MetricResult, RunResult, TraceResult
+
+
+class TestFormatDuration:
+    def test_none(self):
+        assert _format_duration(None) == ""
+
+    def test_milliseconds(self):
+        assert _format_duration(42.3) == "42ms"
+
+    def test_zero(self):
+        assert _format_duration(0.0) == "0ms"
+
+    def test_seconds(self):
+        assert _format_duration(1500.0) == "1.5s"
+
+    def test_exact_one_second(self):
+        assert _format_duration(1000.0) == "1.0s"
+
+    def test_minutes(self):
+        assert _format_duration(125000.0) == "2m 5s"
+
+    def test_just_under_one_second(self):
+        assert _format_duration(999.4) == "999ms"
+
+    def test_rounding_boundary(self):
+        assert _format_duration(999.5) == "1.0s"
+
+
+class TestTableFormatTiming:
+    def _make_result(self, duration_ms: float | None = None) -> RunResult:
+        mr = MetricResult(
+            metric_name="test_metric",
+            score=0.95,
+            eval_status="PASSED",
+            per_invocation_scores=[0.95],
+            duration_ms=duration_ms,
+        )
+        tr = TraceResult(
+            trace_id="abc123",
+            num_invocations=1,
+            metric_results=[mr],
+        )
+        return RunResult(trace_results=[tr])
+
+    def test_time_column_in_table(self):
+        output = format_results(self._make_result(duration_ms=1234.5), fmt="table")
+        assert "Time" in output
+        assert "1.2s" in output
+
+    def test_time_column_milliseconds(self):
+        output = format_results(self._make_result(duration_ms=42.0), fmt="table")
+        assert "42ms" in output
+
+    def test_time_column_none(self):
+        output = format_results(self._make_result(duration_ms=None), fmt="table")
+        assert "Time" in output
+
+
+class TestJsonFormatTiming:
+    def test_duration_ms_in_json(self):
+        mr = MetricResult(
+            metric_name="test_metric",
+            score=0.95,
+            eval_status="PASSED",
+            duration_ms=1234.5,
+        )
+        tr = TraceResult(
+            trace_id="abc123",
+            num_invocations=1,
+            metric_results=[mr],
+        )
+        result = RunResult(trace_results=[tr])
+        output = format_results(result, fmt="json")
+        data = json.loads(output)
+        assert data["traces"][0]["metrics"][0]["duration_ms"] == 1234.5
+
+    def test_duration_ms_null_in_json(self):
+        mr = MetricResult(metric_name="test_metric", score=0.5, eval_status="PASSED")
+        tr = TraceResult(trace_id="abc123", num_invocations=1, metric_results=[mr])
+        result = RunResult(trace_results=[tr])
+        output = format_results(result, fmt="json")
+        data = json.loads(output)
+        assert data["traces"][0]["metrics"][0]["duration_ms"] is None
+
+
+class TestSummaryFormatTiming:
+    def test_duration_in_summary(self):
+        mr = MetricResult(
+            metric_name="test_metric",
+            score=0.95,
+            eval_status="PASSED",
+            duration_ms=820.0,
+        )
+        tr = TraceResult(trace_id="abc123", num_invocations=1, metric_results=[mr])
+        result = RunResult(trace_results=[tr])
+        output = format_results(result, fmt="summary")
+        assert "[820ms]" in output
+
+    def test_no_duration_no_brackets(self):
+        mr = MetricResult(metric_name="test_metric", score=0.5, eval_status="PASSED")
+        tr = TraceResult(trace_id="abc123", num_invocations=1, metric_results=[mr])
+        result = RunResult(trace_results=[tr])
+        output = format_results(result, fmt="summary")
+        metric_line = [line for line in output.splitlines() if "test_metric" in line][0]
+        assert metric_line.rstrip().endswith("(PASSED)")
diff --git a/tests/test_runner.py b/tests/test_runner.py
index 2bf621e..ce57950 100644
--- a/tests/test_runner.py
+++ b/tests/test_runner.py
@@ -39,6 +39,8 @@ def test_trajectory_eval_pass(self):
         assert mr.score == 1.0
         assert mr.eval_status == "PASSED"
         assert mr.error is None
+        assert mr.duration_ms is not None
+        assert mr.duration_ms >= 0
 
     def test_missing_eval_set_error(self):
         """Trajectory metric without eval set should report a clear error."""
@@ -51,6 +53,8 @@ def test_missing_eval_set_error(self):
         mr = result.trace_results[0].metric_results[0]
         assert mr.error is not None
         assert "requires expected invocations" in mr.error
+        assert mr.duration_ms is not None
+        assert mr.duration_ms >= 0
 
     def test_bad_trace_file(self):
         config = EvalRunConfig(

From e4852d3763fee0247c37df89f59501a8442965cd Mon Sep 17 00:00:00 2001
From: krisztianfekete
Date: Mon, 30 Mar 2026 18:55:57 +0200
Subject: [PATCH 2/2] fix review comments

---
 src/agentevals/output.py | 6 +++---
 tests/test_output.py     | 3 +++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/agentevals/output.py b/src/agentevals/output.py
index 99898d0..0e24583 100644
--- a/src/agentevals/output.py
+++ b/src/agentevals/output.py
@@ -17,9 +17,9 @@ def _format_duration(ms: float | None) -> str:
     seconds = ms / 1000
     if seconds < 60:
         return f"{seconds:.1f}s"
-    minutes = int(seconds // 60)
-    remaining = seconds - minutes * 60
-    return f"{minutes}m {remaining:.0f}s"
+    total_secs = round(seconds)
+    minutes, secs = divmod(total_secs, 60)
+    return f"{minutes}m {secs}s"
 
 
 def format_results(run_result: RunResult, fmt: str = "table") -> str:
diff --git a/tests/test_output.py b/tests/test_output.py
index b50a080..b620416 100644
--- a/tests/test_output.py
+++ b/tests/test_output.py
@@ -29,6 +29,9 @@ def test_just_under_one_second(self):
     def test_rounding_boundary(self):
         assert _format_duration(999.5) == "1.0s"
 
+    def test_minutes_no_60s(self):
+        assert _format_duration(119500) == "2m 0s"
+
 
 class TestTableFormatTiming:
     def _make_result(self, duration_ms: float | None = None) -> RunResult:
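
For reviewers, here is a minimal standalone sketch of the duration formatting these two patches converge on. The helper below mirrors _format_duration from src/agentevals/output.py as it stands after PATCH 2/2; it is an illustration only, not the library code, and the expected strings simply restate the assertions in tests/test_output.py.

# Illustration only: behaves like _format_duration after PATCH 2/2.
def format_duration(ms: float | None) -> str:
    if ms is None:
        return ""
    ms = round(ms)
    if ms < 1000:
        return f"{ms}ms"            # sub-second values stay in milliseconds
    seconds = ms / 1000
    if seconds < 60:
        return f"{seconds:.1f}s"    # one decimal place between 1s and 60s
    minutes, secs = divmod(round(seconds), 60)   # divmod keeps the seconds part in 0..59
    return f"{minutes}m {secs}s"

# Expected output, matching the test assertions:
#   None     -> ""       42.3    -> "42ms"    999.4  -> "999ms"
#   999.5    -> "1.0s"   1500.0  -> "1.5s"    119500 -> "2m 0s"
#   125000.0 -> "2m 5s"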