51 changes: 51 additions & 0 deletions evaluators/time_efficiency/README.md
@@ -0,0 +1,51 @@
# time_efficiency

Scores how quickly an agent resolved a task relative to a time budget. Catches agents that produce correct answers but take too long for production use.

## How it works

Reads latency percentiles from the trace's `performance_metrics`. Scores against a time budget in seconds:

```
score = clamp(1.0 - actual_seconds / max_duration_s, 0, 1)
```

You choose which percentile (`p50`, `p95`, `p99`) and which latency category (`overall`, `llm_calls`, `tool_executions`) to score against. For example, scoring against `p95` of `llm_calls` catches slow LLM responses specifically.
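For reference, the evaluator reads the nested agentevals layout, with millisecond percentiles under `latency`, and falls back to a flat `duration_s` (seconds) or `duration_ms` (milliseconds) field. A minimal sketch of the nested shape, with illustrative values:

```json
{
  "latency": {
    "overall": {"p50": 2100, "p95": 4164, "p99": 5010},
    "llm_calls": {"p50": 1800, "p95": 3900, "p99": 4700}
  }
}
```

With `max_duration_s: 60` and a p95 overall latency of 4164 ms (4.164 s), the score works out to `1.0 - 4.164 / 60 ≈ 0.93`.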

This is a **trace-level** metric. Returns `NOT_EVALUATED` when no latency data is available or when an invalid percentile/source is configured.

## Config

| Option | Type | Default | Description |
|---|---|---|---|
| `max_duration_s` | float | 120 | Time budget in seconds |
| `latency_percentile` | str | `"p50"` | Percentile to score: `"p50"`, `"p95"`, `"p99"` |
| `latency_source` | str | `"overall"` | Latency category: `"overall"`, `"llm_calls"`, `"tool_executions"` |

## Example

```yaml
evaluators:
- name: time_efficiency
type: remote
source: github
ref: evaluators/time_efficiency/time_efficiency.py
threshold: 0.5
config:
max_duration_s: 60
latency_percentile: p95
latency_source: overall
```

## Output details

```json
{
"duration_s": 4.164,
"max_duration_s": 60,
"utilization": "6.9%",
"source": "latency.overall.p95"
}
```
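
When the nested latency block is present, the details also carry a `latency_breakdown_s` map with the chosen percentile, in seconds, for each category that reports it. Illustrative shape:

```json
{
  "latency_breakdown_s": {
    "overall": 4.164,
    "llm_calls": 3.9,
    "tool_executions": 0.82
  }
}
```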

Requires `agentevals-evaluator-sdk >= 0.1.1`.
6 changes: 6 additions & 0 deletions evaluators/time_efficiency/evaluator.yaml
@@ -0,0 +1,6 @@
name: time_efficiency
description: Scores how quickly the agent resolved a task relative to a time budget
language: python
entrypoint: time_efficiency.py
tags: [performance, time, latency, efficiency, budget]
author: agentevals-dev
131 changes: 131 additions & 0 deletions evaluators/time_efficiency/time_efficiency.py
@@ -0,0 +1,131 @@
"""Community evaluator: time_efficiency

Scores resolution time relative to a budget. Reads latency from
the trace's performance_metrics.

Score = clamp(1.0 - actual_seconds / max_duration_s, 0, 1)

Returns NOT_EVALUATED when no latency data is available.

Config options:
max_duration_s (float): Time budget in seconds (default: 120)
latency_percentile (str): Which percentile to score against:
"p50" (default), "p95", "p99"
latency_source (str): Latency category:
"overall" (default), "llm_calls", "tool_executions"
"""

from __future__ import annotations

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator

_VALID_PERCENTILES = ("p50", "p95", "p99")
_VALID_SOURCES = ("overall", "llm_calls", "tool_executions")


def _extract_duration_s(perf: dict, percentile: str, source: str) -> tuple[float | None, str]:
"""Extract duration in seconds from a performance_metrics dict.

Returns (duration_seconds, description_of_source).

Supports:
nested (agentevals): latency.<source>.<percentile> in milliseconds
flat: duration_s (seconds) or duration_ms (milliseconds)
"""
latency_block = perf.get("latency")
if isinstance(latency_block, dict):
source_block = latency_block.get(source)
if isinstance(source_block, dict):
ms_value = source_block.get(percentile)
if ms_value is not None:
return float(ms_value) / 1000.0, f"latency.{source}.{percentile}"

duration_s = perf.get("duration_s")
if duration_s is not None:
return float(duration_s), "duration_s"

duration_ms = perf.get("duration_ms")
if duration_ms is not None:
return float(duration_ms) / 1000.0, "duration_ms"

return None, "no latency data found"


def _get_perf(input: EvalInput) -> dict | None:
"""Return the first non-None performance_metrics from any invocation."""
for inv in input.invocations:
if isinstance(inv.performance_metrics, dict):
return inv.performance_metrics
return None


@evaluator
def time_efficiency(input: EvalInput) -> EvalResult:
max_duration = input.config.get("max_duration_s", 120.0)
percentile = input.config.get("latency_percentile", "p50")
source = input.config.get("latency_source", "overall")
n = len(input.invocations)

if percentile not in _VALID_PERCENTILES:
return EvalResult(
score=0.0,
status=EvalStatus.NOT_EVALUATED,
per_invocation_scores=[None] * n,
details={"reason": f"invalid latency_percentile '{percentile}', must be one of {_VALID_PERCENTILES}"},
)
if source not in _VALID_SOURCES:
return EvalResult(
score=0.0,
status=EvalStatus.NOT_EVALUATED,
per_invocation_scores=[None] * n,
details={"reason": f"invalid latency_source '{source}', must be one of {_VALID_SOURCES}"},
)

perf = _get_perf(input)
if perf is None:
return EvalResult(
score=0.0,
status=EvalStatus.NOT_EVALUATED,
per_invocation_scores=[None] * n,
details={"reason": "no performance_metrics available"},
)

duration_s, source_desc = _extract_duration_s(perf, percentile, source)
if duration_s is None:
return EvalResult(
score=0.0,
status=EvalStatus.NOT_EVALUATED,
per_invocation_scores=[None] * n,
details={"reason": source_desc},
)

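    # Linear decay from 1.0 at zero elapsed time to 0.0 at or over the budget;
    # a non-positive budget disables the check and yields a perfect score.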
score = max(0.0, min(1.0, 1.0 - duration_s / max_duration)) if max_duration > 0 else 1.0

breakdown = {}
latency_block = perf.get("latency")
if isinstance(latency_block, dict):
for src in _VALID_SOURCES:
src_block = latency_block.get(src)
if isinstance(src_block, dict):
val = src_block.get(percentile)
if val is not None:
breakdown[src] = round(float(val) / 1000.0, 3)

details: dict = {
"duration_s": round(duration_s, 3),
"max_duration_s": max_duration,
"utilization": f"{duration_s / max_duration * 100:.1f}%" if max_duration > 0 else "n/a",
"source": source_desc,
}
if breakdown:
details["latency_breakdown_s"] = breakdown

return EvalResult(
score=score,
per_invocation_scores=[None] * n,
details=details,
)


if __name__ == "__main__":
time_efficiency.run()
57 changes: 57 additions & 0 deletions evaluators/token_efficiency/README.md
@@ -0,0 +1,57 @@
# token_efficiency

Scores how efficiently an agent used tokens relative to a budget. Useful for catching runaway token consumption — real benchmarks show 8x variation across agent solutions for the same task.

## How it works

Reads token counts from the trace's `performance_metrics`. Scores input and output tokens separately against their budgets, returns the worst of the two:

```
input_score = clamp(1.0 - input_tokens / max_input_tokens, 0, 1)
output_score = clamp(1.0 - output_tokens / max_output_tokens, 0, 1)
score = min(input_score, output_score)
```

A score of 1.0 means zero tokens used; 0.0 means at or over budget. With a threshold of 0.3, the agent must keep both input and output usage at or below roughly 70% of their respective budgets to pass.

This is a **trace-level** metric — per-invocation scores are not applicable (token counts come from the full trace).

Returns `NOT_EVALUATED` when no token data is available in the trace.
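
For reference, the evaluator reads the nested agentevals token layout and falls back to flat fields (`input_tokens`/`output_tokens`, or `prompt_tokens`/`completion_tokens`). A sketch of the nested shape, with illustrative values:

```json
{
  "tokens": {"total_prompt": 75000, "total_output": 10000}
}
```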

## Config

| Option | Type | Default | Description |
|---|---|---|---|
| `max_input_tokens` | int | 150000 | Input (prompt) token budget |
| `max_output_tokens` | int | 50000 | Output (completion) token budget |

## Example

```yaml
evaluators:
- name: token_efficiency
type: remote
source: github
ref: evaluators/token_efficiency/token_efficiency.py
threshold: 0.3
config:
max_input_tokens: 100000
max_output_tokens: 30000
```

## Output details

```json
{
"input_tokens": 75000,
"output_tokens": 10000,
"max_input_tokens": 100000,
"max_output_tokens": 30000,
"input_utilization": "75.0%",
"output_utilization": "33.3%",
"input_score": 0.25,
"output_score": 0.6667
}
```

Requires `agentevals-evaluator-sdk >= 0.1.1`.
6 changes: 6 additions & 0 deletions evaluators/token_efficiency/evaluator.yaml
@@ -0,0 +1,6 @@
name: token_efficiency
description: Scores how efficiently the agent used tokens relative to a budget
language: python
entrypoint: token_efficiency.py
tags: [performance, tokens, efficiency, budget]
author: agentevals-dev
108 changes: 108 additions & 0 deletions evaluators/token_efficiency/token_efficiency.py
@@ -0,0 +1,108 @@
"""Community evaluator: token_efficiency

Scores token usage relative to a budget. Reads token counts from
the trace's performance_metrics.

Score formula (per dimension):
input_score = clamp(1.0 - total_prompt / max_input_tokens, 0, 1)
output_score = clamp(1.0 - total_output / max_output_tokens, 0, 1)
score = min(input_score, output_score)

Returns NOT_EVALUATED when no token data is available.

Config options:
max_input_tokens (int): Input token budget (default: 150000)
max_output_tokens (int): Output token budget (default: 50000)
"""

from __future__ import annotations

from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator


def _extract_tokens(perf: dict) -> dict[str, int] | None:
"""Extract token counts from a performance_metrics dict.

Supports two layouts:
nested (agentevals default): {"tokens": {"total_prompt": N, "total_output": N}}
flat (custom harness): {"input_tokens": N, "output_tokens": N}
"""
tokens_block = perf.get("tokens")
if isinstance(tokens_block, dict):
total_prompt = tokens_block.get("total_prompt")
total_output = tokens_block.get("total_output")
if total_prompt is not None or total_output is not None:
return {
"input_tokens": int(total_prompt) if total_prompt is not None else 0,
"output_tokens": int(total_output) if total_output is not None else 0,
}

input_t = perf.get("input_tokens") if perf.get("input_tokens") is not None else perf.get("prompt_tokens")
output_t = perf.get("output_tokens") if perf.get("output_tokens") is not None else perf.get("completion_tokens")

if input_t is not None or output_t is not None:
return {
"input_tokens": int(input_t) if input_t is not None else 0,
"output_tokens": int(output_t) if output_t is not None else 0,
}

return None


def _get_perf(input: EvalInput) -> dict | None:
"""Return the first non-None performance_metrics from any invocation."""
for inv in input.invocations:
if isinstance(inv.performance_metrics, dict):
return inv.performance_metrics
return None


@evaluator
def token_efficiency(input: EvalInput) -> EvalResult:
max_input = input.config.get("max_input_tokens", 150_000)
max_output = input.config.get("max_output_tokens", 50_000)
n = len(input.invocations)

perf = _get_perf(input)
if perf is None:
return EvalResult(
score=0.0,
status=EvalStatus.NOT_EVALUATED,
per_invocation_scores=[None] * n,
details={"reason": "no performance_metrics available"},
)

tokens = _extract_tokens(perf)
if tokens is None:
return EvalResult(
score=0.0,
status=EvalStatus.NOT_EVALUATED,
per_invocation_scores=[None] * n,
details={"reason": "no token data in performance_metrics"},
)

input_tokens = tokens["input_tokens"]
output_tokens = tokens["output_tokens"]

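    # Linear decay toward each budget, clamped to [0, 1]; a non-positive budget
    # disables that dimension. The final score is the stricter (lower) of the two.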
input_score = max(0.0, min(1.0, 1.0 - input_tokens / max_input)) if max_input > 0 else 1.0
output_score = max(0.0, min(1.0, 1.0 - output_tokens / max_output)) if max_output > 0 else 1.0
score = min(input_score, output_score)

return EvalResult(
score=score,
per_invocation_scores=[None] * n,
details={
"input_tokens": input_tokens,
"output_tokens": output_tokens,
"max_input_tokens": max_input,
"max_output_tokens": max_output,
"input_utilization": f"{input_tokens / max_input * 100:.1f}%" if max_input > 0 else "n/a",
"output_utilization": f"{output_tokens / max_output * 100:.1f}%" if max_output > 0 else "n/a",
"input_score": round(input_score, 4),
"output_score": round(output_score, 4),
},
)


if __name__ == "__main__":
token_efficiency.run()