25 changes: 21 additions & 4 deletions src/agentevals/output.py
@@ -8,6 +8,20 @@
from .runner import MetricResult, RunResult


def _format_duration(ms: float | None) -> str:
if ms is None:
return ""
ms = round(ms)
if ms < 1000:
return f"{ms}ms"
seconds = ms / 1000
if seconds < 60:
return f"{seconds:.1f}s"
total_secs = round(seconds)
minutes, secs = divmod(total_secs, 60)
return f"{minutes}m {secs}s"


def format_results(run_result: RunResult, fmt: str = "table") -> str:
if fmt == "json":
return _format_json(run_result)
@@ -56,14 +70,15 @@ def _format_table(run_result: RunResult) -> str:
score_str,
mr.eval_status,
per_inv,
_format_duration(mr.duration_ms),
error_str,
]
)

if rows:
table = tabulate(
rows,
headers=["", "Metric", "Score", "Status", "Per-Invocation", "Error"],
headers=["", "Metric", "Score", "Status", "Per-Invocation", "Time", "Error"],
tablefmt="simple",
)
lines.append(table)
@@ -131,6 +146,7 @@ def _format_json(run_result: RunResult) -> str:
"score": mr.score,
"eval_status": mr.eval_status,
"per_invocation_scores": mr.per_invocation_scores,
"duration_ms": mr.duration_ms,
"error": mr.error,
}
if mr.details:
@@ -159,12 +175,13 @@ def _format_summary(run_result: RunResult) -> str:
lines.append(f"Trace {tr.trace_id} ({tr.num_invocations} invocations):")
for mr in tr.metric_results:
icon = _status_icon(mr.eval_status)
duration_suffix = f" [{_format_duration(mr.duration_ms)}]" if mr.duration_ms is not None else ""
if mr.error:
lines.append(f" {icon} {mr.metric_name}: ERROR - {mr.error}")
lines.append(f" {icon} {mr.metric_name}: ERROR - {mr.error}{duration_suffix}")
elif mr.score is not None:
lines.append(f" {icon} {mr.metric_name}: {mr.score:.4f} ({mr.eval_status})")
lines.append(f" {icon} {mr.metric_name}: {mr.score:.4f} ({mr.eval_status}){duration_suffix}")
else:
lines.append(f" {icon} {mr.metric_name}: N/A ({mr.eval_status})")
lines.append(f" {icon} {mr.metric_name}: N/A ({mr.eval_status}){duration_suffix}")
lines.append("")

return "\n".join(lines)
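Note: the tiers of the new _format_duration helper, as pinned down by the tests added below — sub-second values render as integer milliseconds, values under a minute as one-decimal seconds, and anything longer as minutes plus seconds. A quick sanity sketch (importing the private helper the same way the new tests do):

from agentevals.output import _format_duration

assert _format_duration(None) == ""           # no timing recorded
assert _format_duration(42.3) == "42ms"       # under 1s: integer milliseconds
assert _format_duration(999.5) == "1.0s"      # round-half-to-even sends 999.5 to 1000
assert _format_duration(1500.0) == "1.5s"     # under 60s: one-decimal seconds
assert _format_duration(125000.0) == "2m 5s"  # 60s and up: minutes plus seconds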
6 changes: 6 additions & 0 deletions src/agentevals/runner.py
@@ -5,6 +5,7 @@
import asyncio
import json
import logging
import time
from collections.abc import Awaitable, Callable
from typing import Any

@@ -39,6 +40,7 @@ class MetricResult(BaseModel):
per_invocation_scores: list[float | None] = Field(default_factory=list)
error: str | None = None
details: dict[str, Any] | None = None
duration_ms: float | None = None


class TraceResult(BaseModel):
@@ -234,13 +236,15 @@ async def _eval_builtin_with_semaphore(metric_name: str) -> MetricResult:
async with eval_semaphore:
if progress_callback:
await progress_callback(f"Running {metric_name}...")
t0 = time.monotonic()
result = await evaluate_builtin_metric(
metric_name=metric_name,
actual_invocations=actual_invocations,
expected_invocations=expected_invocations,
judge_model=judge_model,
threshold=threshold,
)
result.duration_ms = (time.monotonic() - t0) * 1000
return await _append_result(result)

@@ -249,11 +253,13 @@ async def _eval_custom_with_semaphore(evaluator_def: CustomEvaluatorDef) -> MetricResult:
await progress_callback(f"Running {evaluator_def.name}...")
from .custom_evaluators import evaluate_custom_evaluator

t0 = time.monotonic()
result = await evaluate_custom_evaluator(
evaluator_def=evaluator_def,
actual_invocations=actual_invocations,
expected_invocations=expected_invocations,
)
result.duration_ms = (time.monotonic() - t0) * 1000
return await _append_result(result)

tasks = [_eval_builtin_with_semaphore(m) for m in metrics]
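Note: t0 is taken only after eval_semaphore is acquired, so time spent queued behind other evaluators is excluded from duration_ms, and time.monotonic() is used rather than time.time() because a monotonic clock cannot jump on wall-clock adjustments. A minimal standalone sketch of the same pattern (the fake_eval coroutine is a hypothetical stand-in for the evaluator calls):

import asyncio
import time

async def fake_eval() -> float:
    # Stand-in for evaluate_builtin_metric / evaluate_custom_evaluator.
    await asyncio.sleep(0.05)
    return 1.0

async def main() -> None:
    sem = asyncio.Semaphore(1)
    async with sem:
        t0 = time.monotonic()  # start only after the semaphore is held
        score = await fake_eval()
        duration_ms = (time.monotonic() - t0) * 1000
    print(f"score={score} in {duration_ms:.0f}ms")  # roughly 50ms

asyncio.run(main())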
112 changes: 112 additions & 0 deletions tests/test_output.py
@@ -0,0 +1,112 @@
import json

from agentevals.output import _format_duration, format_results
from agentevals.runner import MetricResult, RunResult, TraceResult


class TestFormatDuration:
def test_none(self):
assert _format_duration(None) == ""

def test_milliseconds(self):
assert _format_duration(42.3) == "42ms"

def test_zero(self):
assert _format_duration(0.0) == "0ms"

def test_seconds(self):
assert _format_duration(1500.0) == "1.5s"

def test_exact_one_second(self):
assert _format_duration(1000.0) == "1.0s"

def test_minutes(self):
assert _format_duration(125000.0) == "2m 5s"

def test_just_under_one_second(self):
assert _format_duration(999.4) == "999ms"

def test_rounding_boundary(self):
assert _format_duration(999.5) == "1.0s"

def test_minutes_no_60s(self):
assert _format_duration(119500) == "2m 0s"


class TestTableFormatTiming:
def _make_result(self, duration_ms: float | None = None) -> RunResult:
mr = MetricResult(
metric_name="test_metric",
score=0.95,
eval_status="PASSED",
per_invocation_scores=[0.95],
duration_ms=duration_ms,
)
tr = TraceResult(
trace_id="abc123",
num_invocations=1,
metric_results=[mr],
)
return RunResult(trace_results=[tr])

def test_time_column_in_table(self):
output = format_results(self._make_result(duration_ms=1234.5), fmt="table")
assert "Time" in output
assert "1.2s" in output

def test_time_column_milliseconds(self):
output = format_results(self._make_result(duration_ms=42.0), fmt="table")
assert "42ms" in output

def test_time_column_none(self):
output = format_results(self._make_result(duration_ms=None), fmt="table")
assert "Time" in output


class TestJsonFormatTiming:
def test_duration_ms_in_json(self):
mr = MetricResult(
metric_name="test_metric",
score=0.95,
eval_status="PASSED",
duration_ms=1234.5,
)
tr = TraceResult(
trace_id="abc123",
num_invocations=1,
metric_results=[mr],
)
result = RunResult(trace_results=[tr])
output = format_results(result, fmt="json")
data = json.loads(output)
assert data["traces"][0]["metrics"][0]["duration_ms"] == 1234.5

def test_duration_ms_null_in_json(self):
mr = MetricResult(metric_name="test_metric", score=0.5, eval_status="PASSED")
tr = TraceResult(trace_id="abc123", num_invocations=1, metric_results=[mr])
result = RunResult(trace_results=[tr])
output = format_results(result, fmt="json")
data = json.loads(output)
assert data["traces"][0]["metrics"][0]["duration_ms"] is None


class TestSummaryFormatTiming:
def test_duration_in_summary(self):
mr = MetricResult(
metric_name="test_metric",
score=0.95,
eval_status="PASSED",
duration_ms=820.0,
)
tr = TraceResult(trace_id="abc123", num_invocations=1, metric_results=[mr])
result = RunResult(trace_results=[tr])
output = format_results(result, fmt="summary")
assert "[820ms]" in output

def test_no_duration_no_brackets(self):
mr = MetricResult(metric_name="test_metric", score=0.5, eval_status="PASSED")
tr = TraceResult(trace_id="abc123", num_invocations=1, metric_results=[mr])
result = RunResult(trace_results=[tr])
output = format_results(result, fmt="summary")
metric_line = [line for line in output.splitlines() if "test_metric" in line][0]
assert metric_line.rstrip().endswith("(PASSED)")
4 changes: 4 additions & 0 deletions tests/test_runner.py
@@ -39,6 +39,8 @@ def test_trajectory_eval_pass(self):
assert mr.score == 1.0
assert mr.eval_status == "PASSED"
assert mr.error is None
assert mr.duration_ms is not None
assert mr.duration_ms >= 0

def test_missing_eval_set_error(self):
"""Trajectory metric without eval set should report a clear error."""
@@ -51,6 +53,8 @@ def test_missing_eval_set_error(self):
mr = result.trace_results[0].metric_results[0]
assert mr.error is not None
assert "requires expected invocations" in mr.error
assert mr.duration_ms is not None
assert mr.duration_ms >= 0

def test_bad_trace_file(self):
config = EvalRunConfig(