diff --git a/evaluators/time_efficiency/README.md b/evaluators/time_efficiency/README.md
new file mode 100644
index 0000000..d18f2ec
--- /dev/null
+++ b/evaluators/time_efficiency/README.md
@@ -0,0 +1,51 @@
+# time_efficiency
+
+Scores how quickly an agent resolved a task relative to a time budget. Catches agents that produce correct answers but take too long for production use.
+
+## How it works
+
+Reads latency percentiles from the trace's `performance_metrics`. Scores against a time budget in seconds:
+
+```
+score = clamp(1.0 - actual_seconds / max_duration_s, 0, 1)
+```
+
+You choose which percentile (`p50`, `p95`, `p99`) and which latency category (`overall`, `llm_calls`, `tool_executions`) to score against. For example, scoring against `p95` of `llm_calls` catches slow LLM responses specifically.
+
+This is a **trace-level** metric. Returns `NOT_EVALUATED` when no latency data is available or when an invalid percentile/source is configured.
+
+## Config
+
+| Option | Type | Default | Description |
+|---|---|---|---|
+| `max_duration_s` | float | 120 | Time budget in seconds |
+| `latency_percentile` | str | `"p50"` | Percentile to score: `"p50"`, `"p95"`, `"p99"` |
+| `latency_source` | str | `"overall"` | Latency category: `"overall"`, `"llm_calls"`, `"tool_executions"` |
+
+## Example
+
+```yaml
+evaluators:
+  - name: time_efficiency
+    type: remote
+    source: github
+    ref: evaluators/time_efficiency/time_efficiency.py
+    threshold: 0.5
+    config:
+      max_duration_s: 60
+      latency_percentile: p95
+      latency_source: overall
+```
+
+## Output details
+
+```json
+{
+  "duration_s": 4.164,
+  "max_duration_s": 60,
+  "utilization": "6.9%",
+  "source": "latency.overall.p95"
+}
+```
+
+Requires `agentevals-evaluator-sdk >= 0.1.1`.
diff --git a/evaluators/time_efficiency/evaluator.yaml b/evaluators/time_efficiency/evaluator.yaml
new file mode 100644
index 0000000..58c9863
--- /dev/null
+++ b/evaluators/time_efficiency/evaluator.yaml
@@ -0,0 +1,6 @@
+name: time_efficiency
+description: Scores how quickly the agent resolved a task relative to a time budget
+language: python
+entrypoint: time_efficiency.py
+tags: [performance, time, latency, efficiency, budget]
+author: agentevals-dev
diff --git a/evaluators/time_efficiency/time_efficiency.py b/evaluators/time_efficiency/time_efficiency.py
new file mode 100644
index 0000000..7700300
--- /dev/null
+++ b/evaluators/time_efficiency/time_efficiency.py
@@ -0,0 +1,131 @@
+"""Community evaluator: time_efficiency
+
+Scores resolution time relative to a budget. Reads latency from
+the trace's performance_metrics.
+
+Score = clamp(1.0 - actual_seconds / max_duration_s, 0, 1)
+
+Returns NOT_EVALUATED when no latency data is available.
+
+Config options:
+    max_duration_s (float): Time budget in seconds (default: 120)
+    latency_percentile (str): Which percentile to score against:
+        "p50" (default), "p95", "p99"
+    latency_source (str): Latency category:
+        "overall" (default), "llm_calls", "tool_executions"
+"""
+
+from __future__ import annotations
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
+
+_VALID_PERCENTILES = ("p50", "p95", "p99")
+_VALID_SOURCES = ("overall", "llm_calls", "tool_executions")
+
+
+def _extract_duration_s(perf: dict, percentile: str, source: str) -> tuple[float | None, str]:
+    """Extract duration in seconds from a performance_metrics dict.
+
+    Returns (duration_seconds, description_of_source).
+
+    Supports:
+        nested (agentevals): latency.<source>.<percentile> in milliseconds
+        flat: duration_s (seconds) or duration_ms (milliseconds)
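+
+    Example (values mirror the README output example; the nested value
+    is in milliseconds, the flat duration_s in seconds):
+        {"latency": {"overall": {"p95": 4164.0}}}
+        {"duration_s": 4.164}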
+    """
+    latency_block = perf.get("latency")
+    if isinstance(latency_block, dict):
+        source_block = latency_block.get(source)
+        if isinstance(source_block, dict):
+            ms_value = source_block.get(percentile)
+            if ms_value is not None:
+                return float(ms_value) / 1000.0, f"latency.{source}.{percentile}"
+
+    duration_s = perf.get("duration_s")
+    if duration_s is not None:
+        return float(duration_s), "duration_s"
+
+    duration_ms = perf.get("duration_ms")
+    if duration_ms is not None:
+        return float(duration_ms) / 1000.0, "duration_ms"
+
+    return None, "no latency data found"
+
+
+def _get_perf(input: EvalInput) -> dict | None:
+    """Return the first non-None performance_metrics from any invocation."""
+    for inv in input.invocations:
+        if isinstance(inv.performance_metrics, dict):
+            return inv.performance_metrics
+    return None
+
+
+@evaluator
+def time_efficiency(input: EvalInput) -> EvalResult:
+    max_duration = input.config.get("max_duration_s", 120.0)
+    percentile = input.config.get("latency_percentile", "p50")
+    source = input.config.get("latency_source", "overall")
+    n = len(input.invocations)
+
+    if percentile not in _VALID_PERCENTILES:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": f"invalid latency_percentile '{percentile}', must be one of {_VALID_PERCENTILES}"},
+        )
+    if source not in _VALID_SOURCES:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": f"invalid latency_source '{source}', must be one of {_VALID_SOURCES}"},
+        )
+
+    perf = _get_perf(input)
+    if perf is None:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "no performance_metrics available"},
+        )
+
+    duration_s, source_desc = _extract_duration_s(perf, percentile, source)
+    if duration_s is None:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": source_desc},
+        )
+
+    score = max(0.0, min(1.0, 1.0 - duration_s / max_duration)) if max_duration > 0 else 1.0
+
+    breakdown = {}
+    latency_block = perf.get("latency")
+    if isinstance(latency_block, dict):
+        for src in _VALID_SOURCES:
+            src_block = latency_block.get(src)
+            if isinstance(src_block, dict):
+                val = src_block.get(percentile)
+                if val is not None:
+                    breakdown[src] = round(float(val) / 1000.0, 3)
+
+    details: dict = {
+        "duration_s": round(duration_s, 3),
+        "max_duration_s": max_duration,
+        "utilization": f"{duration_s / max_duration * 100:.1f}%" if max_duration > 0 else "n/a",
+        "source": source_desc,
+    }
+    if breakdown:
+        details["latency_breakdown_s"] = breakdown
+
+    return EvalResult(
+        score=score,
+        per_invocation_scores=[None] * n,
+        details=details,
+    )
+
+
+if __name__ == "__main__":
+    time_efficiency.run()
diff --git a/evaluators/token_efficiency/README.md b/evaluators/token_efficiency/README.md
new file mode 100644
index 0000000..89a2c43
--- /dev/null
+++ b/evaluators/token_efficiency/README.md
@@ -0,0 +1,57 @@
+# token_efficiency
+
+Scores how efficiently an agent used tokens relative to a budget. Useful for catching runaway token consumption — benchmarks have shown up to 8x variation in token use across agent solutions to the same task.
+
+## How it works
+
+Reads token counts from the trace's `performance_metrics`.
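+
+It understands both the nested agentevals layout and a flat custom-harness layout; for illustration, here are both forms carrying the same counts as the output example below:
+
+```
+{"tokens": {"total_prompt": 75000, "total_output": 10000}}   # nested (agentevals default)
+{"input_tokens": 75000, "output_tokens": 10000}              # flat (custom harness)
+```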
+
+Scores input and output tokens separately against their budgets and returns the lower of the two:
+
+```
+input_score = clamp(1.0 - input_tokens / max_input_tokens, 0, 1)
+output_score = clamp(1.0 - output_tokens / max_output_tokens, 0, 1)
+score = min(input_score, output_score)
+```
+
+A score of 1.0 means zero tokens used; 0.0 means at or over budget. With a threshold of 0.3, the agent must use no more than 70% of each budget to pass.
+
+This is a **trace-level** metric — per-invocation scores are not applicable (token counts come from the full trace).
+
+Returns `NOT_EVALUATED` when no token data is available in the trace.
+
+## Config
+
+| Option | Type | Default | Description |
+|---|---|---|---|
+| `max_input_tokens` | int | 150000 | Input (prompt) token budget |
+| `max_output_tokens` | int | 50000 | Output (completion) token budget |
+
+## Example
+
+```yaml
+evaluators:
+  - name: token_efficiency
+    type: remote
+    source: github
+    ref: evaluators/token_efficiency/token_efficiency.py
+    threshold: 0.3
+    config:
+      max_input_tokens: 100000
+      max_output_tokens: 30000
+```
+
+## Output details
+
+```json
+{
+  "input_tokens": 75000,
+  "output_tokens": 10000,
+  "max_input_tokens": 100000,
+  "max_output_tokens": 30000,
+  "input_utilization": "75.0%",
+  "output_utilization": "33.3%",
+  "input_score": 0.25,
+  "output_score": 0.6667
+}
+```
+
+Requires `agentevals-evaluator-sdk >= 0.1.1`.
diff --git a/evaluators/token_efficiency/evaluator.yaml b/evaluators/token_efficiency/evaluator.yaml
new file mode 100644
index 0000000..864cc38
--- /dev/null
+++ b/evaluators/token_efficiency/evaluator.yaml
@@ -0,0 +1,6 @@
+name: token_efficiency
+description: Scores how efficiently the agent used tokens relative to a budget
+language: python
+entrypoint: token_efficiency.py
+tags: [performance, tokens, efficiency, budget]
+author: agentevals-dev
diff --git a/evaluators/token_efficiency/token_efficiency.py b/evaluators/token_efficiency/token_efficiency.py
new file mode 100644
index 0000000..5bff15f
--- /dev/null
+++ b/evaluators/token_efficiency/token_efficiency.py
@@ -0,0 +1,108 @@
+"""Community evaluator: token_efficiency
+
+Scores token usage relative to a budget. Reads token counts from
+the trace's performance_metrics.
+
+Score formula (per dimension):
+    input_score = clamp(1.0 - total_prompt / max_input_tokens, 0, 1)
+    output_score = clamp(1.0 - total_output / max_output_tokens, 0, 1)
+    score = min(input_score, output_score)
+
+Returns NOT_EVALUATED when no token data is available.
+
+Config options:
+    max_input_tokens (int): Input token budget (default: 150000)
+    max_output_tokens (int): Output token budget (default: 50000)
+"""
+
+from __future__ import annotations
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
+
+
+def _extract_tokens(perf: dict) -> dict[str, int] | None:
+    """Extract token counts from a performance_metrics dict.
+
+    Supports two layouts:
+        nested (agentevals default): {"tokens": {"total_prompt": N, "total_output": N}}
+        flat (custom harness): {"input_tokens": N, "output_tokens": N}
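+
+    The flat layout also accepts prompt_tokens / completion_tokens as
+    synonyms. Example mapping (counts mirror the README output example):
+        {"tokens": {"total_prompt": 75000, "total_output": 10000}}
+        -> {"input_tokens": 75000, "output_tokens": 10000}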
+    """
+    tokens_block = perf.get("tokens")
+    if isinstance(tokens_block, dict):
+        total_prompt = tokens_block.get("total_prompt")
+        total_output = tokens_block.get("total_output")
+        if total_prompt is not None or total_output is not None:
+            return {
+                "input_tokens": int(total_prompt) if total_prompt is not None else 0,
+                "output_tokens": int(total_output) if total_output is not None else 0,
+            }
+
+    input_t = perf.get("input_tokens") if perf.get("input_tokens") is not None else perf.get("prompt_tokens")
+    output_t = perf.get("output_tokens") if perf.get("output_tokens") is not None else perf.get("completion_tokens")
+
+    if input_t is not None or output_t is not None:
+        return {
+            "input_tokens": int(input_t) if input_t is not None else 0,
+            "output_tokens": int(output_t) if output_t is not None else 0,
+        }
+
+    return None
+
+
+def _get_perf(input: EvalInput) -> dict | None:
+    """Return the first non-None performance_metrics from any invocation."""
+    for inv in input.invocations:
+        if isinstance(inv.performance_metrics, dict):
+            return inv.performance_metrics
+    return None
+
+
+@evaluator
+def token_efficiency(input: EvalInput) -> EvalResult:
+    max_input = input.config.get("max_input_tokens", 150_000)
+    max_output = input.config.get("max_output_tokens", 50_000)
+    n = len(input.invocations)
+
+    perf = _get_perf(input)
+    if perf is None:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "no performance_metrics available"},
+        )
+
+    tokens = _extract_tokens(perf)
+    if tokens is None:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "no token data in performance_metrics"},
+        )
+
+    input_tokens = tokens["input_tokens"]
+    output_tokens = tokens["output_tokens"]
+
+    input_score = max(0.0, min(1.0, 1.0 - input_tokens / max_input)) if max_input > 0 else 1.0
+    output_score = max(0.0, min(1.0, 1.0 - output_tokens / max_output)) if max_output > 0 else 1.0
+    score = min(input_score, output_score)
+
+    return EvalResult(
+        score=score,
+        per_invocation_scores=[None] * n,
+        details={
+            "input_tokens": input_tokens,
+            "output_tokens": output_tokens,
+            "max_input_tokens": max_input,
+            "max_output_tokens": max_output,
+            "input_utilization": f"{input_tokens / max_input * 100:.1f}%" if max_input > 0 else "n/a",
+            "output_utilization": f"{output_tokens / max_output * 100:.1f}%" if max_output > 0 else "n/a",
+            "input_score": round(input_score, 4),
+            "output_score": round(output_score, 4),
+        },
+    )
+
+
+if __name__ == "__main__":
+    token_efficiency.run()
diff --git a/evaluators/tool_efficiency/README.md b/evaluators/tool_efficiency/README.md
new file mode 100644
index 0000000..192f32e
--- /dev/null
+++ b/evaluators/tool_efficiency/README.md
@@ -0,0 +1,72 @@
+# tool_efficiency
+
+Scores whether an agent used tools effectively. Catches waste: duplicate calls (same tool + same args), error responses, and exceeding a tool call budget. Also enforces a minimum number of tool calls when tools are required.
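+
+For example, the second call below would count as a duplicate of the first, since it has the same name and the same arguments (the tool name and args are hypothetical):
+
+```
+{"name": "search_docs", "args": {"query": "reset password"}}
+{"name": "search_docs", "args": {"query": "reset password"}}
+```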
+ +## How it works + +Scores each invocation independently based on three factors multiplied together: + +``` +useful_calls = total - duplicates - errors +efficiency = useful_calls / total +budget_factor = clamp(1.0 - max(0, total - max_tool_calls) / max_tool_calls, 0, 1) +min_factor = total / min_tool_calls (when total < min_tool_calls) +score = clamp(efficiency * budget_factor * min_factor, 0, 1) +``` + +- **efficiency**: What fraction of calls were useful (not duplicates, not errors). +- **budget_factor**: Penalizes exceeding `max_tool_calls`. At 2x budget, score goes to 0. +- **min_factor**: Penalizes falling below `min_tool_calls`. 1 call when 2 are required = 0.5x. + +If an invocation has zero tool calls: score is 0.0 when `min_tool_calls > 0`, otherwise 1.0. + +The overall score is the mean across invocations. + +**Duplicate detection**: Two calls are duplicates if they have the same tool name and identical arguments (JSON-serialized, sorted keys). + +**Error detection**: Checks `ToolResponseData.status` for `"error"`, `"failed"`, or `"failure"`. + +## Config + +| Option | Type | Default | Description | +|---|---|---|---| +| `max_tool_calls` | int | 15 | Budget; calls beyond this are penalized | +| `min_tool_calls` | int | 0 | Minimum required; 0 means tools are optional | +| `penalize_duplicates` | bool | true | Count duplicate calls as waste | +| `penalize_errors` | bool | true | Count error responses as waste | + +## Example + +```yaml +evaluators: + - name: tool_efficiency + type: remote + source: github + ref: evaluators/tool_efficiency/tool_efficiency.py + threshold: 0.5 + config: + max_tool_calls: 10 + min_tool_calls: 1 + penalize_duplicates: true + penalize_errors: true +``` + +## Output details + +```json +{ + "per_invocation": [ + { + "invocation_id": "inv-001", + "score": 0.3429, + "total_calls": 7, + "useful_calls": 4, + "duplicate_calls": 2, + "error_responses": 1, + "budget_factor": 0.6 + } + ] +} +``` + +Requires `agentevals-evaluator-sdk >= 0.1.1`. diff --git a/evaluators/tool_efficiency/evaluator.yaml b/evaluators/tool_efficiency/evaluator.yaml new file mode 100644 index 0000000..872f702 --- /dev/null +++ b/evaluators/tool_efficiency/evaluator.yaml @@ -0,0 +1,6 @@ +name: tool_efficiency +description: Scores whether the agent used tools effectively — penalizes waste, duplicates, and errors +language: python +entrypoint: tool_efficiency.py +tags: [performance, tools, efficiency, budget] +author: agentevals-dev diff --git a/evaluators/tool_efficiency/tool_efficiency.py b/evaluators/tool_efficiency/tool_efficiency.py new file mode 100644 index 0000000..d3bcd66 --- /dev/null +++ b/evaluators/tool_efficiency/tool_efficiency.py @@ -0,0 +1,119 @@ +"""Community evaluator: tool_efficiency + +Scores tool usage effectiveness per invocation. Penalizes duplicate calls +(same tool name + args), error responses, and budget overruns. + +Score per invocation: + useful_calls = total - duplicates - errors + efficiency = useful_calls / total + budget_factor = clamp(1.0 - max(0, total - max_tool_calls) / max_tool_calls, 0, 1) + min_factor = total / min_tool_calls (when total < min_tool_calls, else 1.0) + score = clamp(efficiency * budget_factor * min_factor, 0, 1) + +If total == 0: score = 0.0 when min_tool_calls > 0, else 1.0. 
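+
+    Worked example (mirrors the README output example): 7 calls with
+    2 duplicates and 1 error give useful = 4 and efficiency = 4/7;
+    with max_tool_calls = 5 the budget_factor is 0.6, so the score is
+    4/7 * 0.6 ~= 0.3429.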
+ +Config options: + max_tool_calls (int): Budget; calls beyond this are penalized (default: 15) + min_tool_calls (int): Minimum required; 0 means optional (default: 0) + penalize_duplicates (bool): Count duplicate calls as waste (default: true) + penalize_errors (bool): Count error responses as waste (default: true) +""" + +from __future__ import annotations + +import json + +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + + +def _call_signature(call) -> str: + """Deterministic signature for deduplication: 'tool_name::sorted_args_json'.""" + try: + args_str = json.dumps(call.args, sort_keys=True, default=str) + except (TypeError, ValueError): + args_str = str(call.args) + return f"{call.name}::{args_str}" + + +def _is_error_response(response) -> bool: + """Check if a tool response indicates an error via its status field.""" + status = (response.status or "").lower() + return status in ("error", "failed", "failure") + + +@evaluator +def tool_efficiency(input: EvalInput) -> EvalResult: + max_tool_calls = input.config.get("max_tool_calls", 15) + min_tool_calls = input.config.get("min_tool_calls", 0) + penalize_duplicates = input.config.get("penalize_duplicates", True) + penalize_errors = input.config.get("penalize_errors", True) + + scores: list[float] = [] + inv_details: list[dict] = [] + + for inv in input.invocations: + tool_calls = inv.intermediate_steps.tool_calls + tool_responses = inv.intermediate_steps.tool_responses + total = len(tool_calls) + + if total == 0: + score = 0.0 if min_tool_calls > 0 else 1.0 + reason = f"no tool calls (min required: {min_tool_calls})" if min_tool_calls > 0 else "no tool calls (tools optional)" + scores.append(score) + inv_details.append({"invocation_id": inv.invocation_id, "score": score, "reason": reason}) + continue + + dupes = 0 + if penalize_duplicates: + seen: dict[str, int] = {} + for call in tool_calls: + sig = _call_signature(call) + seen[sig] = seen.get(sig, 0) + 1 + dupes = sum(count - 1 for count in seen.values() if count > 1) + + errors = 0 + if penalize_errors: + errors = sum(1 for r in tool_responses if _is_error_response(r)) + + useful = max(0, total - dupes - errors) + efficiency = useful / total + + budget_factor = 1.0 + if max_tool_calls > 0 and total > max_tool_calls: + budget_factor = max(0.0, 1.0 - (total - max_tool_calls) / max_tool_calls) + + min_factor = 1.0 + if min_tool_calls > 0 and total < min_tool_calls: + min_factor = total / min_tool_calls + + score = max(0.0, min(1.0, efficiency * budget_factor * min_factor)) + scores.append(score) + + detail: dict = { + "invocation_id": inv.invocation_id, + "score": round(score, 4), + "total_calls": total, + "useful_calls": useful, + } + if dupes: + detail["duplicate_calls"] = dupes + if errors: + detail["error_responses"] = errors + if budget_factor < 1.0: + detail["budget_factor"] = round(budget_factor, 4) + if min_factor < 1.0: + detail["min_factor"] = round(min_factor, 4) + detail["min_tool_calls"] = min_tool_calls + inv_details.append(detail) + + overall = sum(scores) / len(scores) if scores else 0.0 + + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={"per_invocation": inv_details}, + ) + + +if __name__ == "__main__": + tool_efficiency.run()