From 08b0905516c5b04fb03bdd554728844f63796b06 Mon Sep 17 00:00:00 2001 From: Henrik Rexed Date: Mon, 30 Mar 2026 11:13:24 +0000 Subject: [PATCH 1/5] feat: add token_efficiency, tool_efficiency, time_efficiency evaluators (issue #6) Three built-in performance evaluators that score agents from trace data: - token_efficiency: scores token usage vs budget (weighted input/output) - tool_efficiency: penalizes duplicate calls, errors, and budget overruns - time_efficiency: scores resolution time vs budget All evaluators: - Follow the @evaluator SDK pattern (stdin/stdout JSON protocol) - Handle missing performance_metrics gracefully (neutral scores) - Support per-invocation scoring - Pass validate_evaluator.py smoke tests Closes agentevals-dev/evaluators#6 --- .gitignore | 2 +- .../__pycache__/bertscore.cpython-314.pyc | Bin 5775 -> 0 bytes evaluators/time_efficiency/evaluator.yaml | 6 + evaluators/time_efficiency/time_efficiency.py | 68 +++++++++++ evaluators/token_efficiency/evaluator.yaml | 6 + .../token_efficiency/token_efficiency.py | 104 ++++++++++++++++ evaluators/tool_efficiency/evaluator.yaml | 6 + evaluators/tool_efficiency/tool_efficiency.py | 114 ++++++++++++++++++ 8 files changed, 305 insertions(+), 1 deletion(-) delete mode 100644 evaluators/bertscore/__pycache__/bertscore.cpython-314.pyc create mode 100644 evaluators/time_efficiency/evaluator.yaml create mode 100644 evaluators/time_efficiency/time_efficiency.py create mode 100644 evaluators/token_efficiency/evaluator.yaml create mode 100644 evaluators/token_efficiency/token_efficiency.py create mode 100644 evaluators/tool_efficiency/evaluator.yaml create mode 100644 evaluators/tool_efficiency/tool_efficiency.py diff --git a/.gitignore b/.gitignore index 0cafc1c..8067d78 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -.venv/ \ No newline at end of file +.venv/evaluators/__pycache__ diff --git a/evaluators/bertscore/__pycache__/bertscore.cpython-314.pyc b/evaluators/bertscore/__pycache__/bertscore.cpython-314.pyc deleted file mode 100644 index 551cc69f9ea7d27c5e9dc862e507ab4d519eda31..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5775 zcmb^#TWs6b^-`25krE~O6~~DkQ?}zov0~MM+a^xZ)=BKxc~xaxxoaE~j6~UNBuXWz zJdA#*K(X2FIy>l!de}ZZU=Q!lz<#p74cLHwT2VY{gn=1rfv$fxk}g0$ww+6fvg{`9 z3hW5FJiI*jo^#JVuX}sEEQ0{D*ZwTG_z?OGzgR$)5#rHr0LdX9>1Y!1#3(Uh({1e- zw~dlI2~Tp=KH|_FBa}{={q|Akh)Z{k&^kTB=*);)cef+n(TsSi872L?hj#+*GI930 zo2TK8!EgN)-n#d?k9Qw`1EF{WLape!HxjVK+J0KpFt@GK{0BYaunnOzL;Tp)SXz}h zO-hN1F2^`cPRU78mG!%vbW2QTMLn(dGR)a@DwEYEjoTh_IHO82S(DRBhyzSaCPUnC zUy!>Y>9-|G;UqCO&B@BGbWGHN=#U0fS|+V%5+f>cPK4Klq)JLm;&kbb&Sf=Onc`w; z1;1s*BqybANbxvyY8)OAO9KH&S{2X~OG(I60}RJWcQR5;m*U)EO;>{h9520{l~pO; z3)l`edKzX*WkguCI3>!8ruTBA2`-(Ibr_D9qGwe_QRS4V-sLoNf3V1DMN;$uuA{2=ju6*T);wS^c?XOHrM#|*Qxet* zhqYPJ4}PRluAK2BW;Bffmd7RhT2I8Fbb3Vg#5m z{$s!{2+l@43IUQsQ8Z~g{vMb<^tGbbh-rc+cpFdhcHY5LypwnFG|z+`oha!36098= zc5GS(3FTey0`phP-V-Qsq~IXik4P?hhX&- z>}*QZH7sX`aV0#RPD$a!P5q{-hhw0G@J&_LW^`Fp64Fgcm%`QMX|*WUwz=EkD%P91 zYj^~qqFqT4)F$Au5B}N!G&%GHF~q?w+>{u0lZ5AnuXWP_&rJ$>nl_ynb0M~6lg1c> zsQN88V2@xlis6-8-~IMjm< zRl25te=Cw9>bAiM{CndWqfU}-NG5{R%nO!nmZK1kH06!vlA@+_({elx!At|glxn@# zux0eIAZd8Z#DZ>`gJw80q8YOcMptn(z@hJoS`YGp>D%v2US)CP#SLT}uu2*G3~)$ib&P0O8{GgLr#4~&pD-oq!gkbDJD}IJm1}_H z(3Tf@n-}!ubN9PG?SIHditN=qdv$f}%>sKZH@r@J=iAx~ddq_Nb&J^OA zPHWrgw!Aw`f#Z5q?DPg5gME?Xum>uh&!DThqpT%tY-Lqpb20U$`Y~1q&5S=T$MkDv zkWD1hqJF)+AA|}y6P3Nz)S&5HW`pfXy95pPu>Hg<#Fp8XeD#|oe%%1Xff4XBf7_~ z+r8~+-V&k9bJn&|94v_!E1nD7Di}gf^Wpz_R%*wrwX&L(RILOE1<-gjg6(}5OcIl} 
z{)#--GvkapBiJK$!K4jVQPJQe3Sq;qKm=RCF0dj|1lzzau)bwn&)@`!=Ksuz9-qTW z&4NjE{HKuE+Ho|Sv_)N^8mo!YwQoDeg|H`9Aj4d?@wN=ZlTjCMkJ7v&%7jVqDd)_^ zN;y|k%~gwv?P3mx!r?x%Y!SJt10Joq9+weiwHNXq8JAOESZoctIu%ZtxGpqoiM}uY z0}y1)n54D=v0+mbgGoS)#352l8nlA5XcDRfgS?SSL55bLLdm2xcu$FU40}ov6~iv; zQYz?HyI>0SAZ~VrNf(wBM?u30l9~t!HKQ8Dv`28$1C1H7aPWlaiJ=@p>jhkAL7gj9 zGq8FbsLgQjZ$NVBu^;){7w@jTUhtjGjjVgy7Uh+pg7-}B{O7*L`8#uWioTA#ucPP- z<$a;$H|`%>^@R$)moRC={Pf&((HqEn14VC7-rKW${(k+cx2NDel{^2KLT>+j@crPz zNbx{8e;~Xs(NS-}XU_-g#4X1z2*|IqJ(AidE zy$dZLUSEu^v7JkI*AAUnV+ZDJFwLxO!_&OjvNW=sSc%_1v+90%)&6ohfMFN20u>HX zW^m_BvlovoM^TBzl$~k!FD%9sVY}j)b;I0>t2+h;m{7`^&IVI${m5NZ>jCur}#gl0)IlJdK+7fh>n> zhF4IYiLyLXI}`7Q9`}wO8hYs3p0ItJ-^JSq(dYn*(w2~M*ah$(9C%HL(iU}%PgVPB zlNjv0E#uwxr>GYanF}nGijuq^Cp5n9#ftwzglRBb)*<(biWi4dkf^1VnY}4lvo09$ zk+^yohO*pYp1KSQSKn~@Y|2QguzhhMm|vmpj!U{ICpClFz6qGA`^FL3pi<`H*i19q zS;w%)vT@NMW0|ah47O%^Ic|7&atxB{OCY=Y3T_6WfsUQ&E4as?%4d6ngreUdlagY% zwUm5@YS~oKrw#zFp28i3qMpGoXK{m#yDUOA zfx`~q%MQ)(R8%df>D!t?!n(|Sk2ll2D{!W7Nh%Z;8jeeq3|`LTRwk#n6Zo4zGX#I_ z_Yl)^XamypJ9F<8eeHQ)`w~;|_2foMPTvD(^SZ~k;3)b7d4Hhb>6j(feJ!Q>ruBxl zVna{9p{LY&XsLU-@7G64O>L{~CsrE=9@7p#vxyuuv*|_k&7ZX1Yc19V@^yiw(CTw% z3w7sm7fNjX{KVWuk!{PfZHv>(_5yn}H(YYEt4+rqI**m=nr5lb{S6BvAF;X7&oS}) z6Tcez?S)@nD0ZF9cbzPDoqo`D`e)sLe7?Yr!XeGy2I>BR|LF2t1^Cb!mSE2>asoYS>PXFk_yBFTOw6U)hIJO6Jmtg5dANSD5&61_Y=3--a zzOlR57+P%%EqBb)>vf$=qxXZK4j1aK&Qc|pcb=VN7ZM*U4_yaKuEwHkf8Mo!!{5Ht zPz)T)2ac8e`xm227uVXJEBW^>b}#K++jpeoZ(K+$j(wcmq#dmHU!;>}wrJ!JJlTgr z$9^1J=>Fj5FU~GeKfm}_@1YefcYfX7RPr{J4s@03S{7eeI<>NI_4MUW-&yrfJa*bW z)F!e!C^M@YEij{3epuNBa1Ur}%i^VL4{hdQg$2=kd+r-%BB!TRn*_nrAOUcgu=<>UlfB5PaiR}AKrvC+r2;0a2 diff --git a/evaluators/time_efficiency/evaluator.yaml b/evaluators/time_efficiency/evaluator.yaml new file mode 100644 index 0000000..cdd2dc8 --- /dev/null +++ b/evaluators/time_efficiency/evaluator.yaml @@ -0,0 +1,6 @@ +name: time_efficiency +description: Scores how quickly the agent resolved relative to a time budget +language: python +entrypoint: time_efficiency.py +tags: [performance, time, latency, efficiency, budget] +author: henrikrexed diff --git a/evaluators/time_efficiency/time_efficiency.py b/evaluators/time_efficiency/time_efficiency.py new file mode 100644 index 0000000..2bdbc40 --- /dev/null +++ b/evaluators/time_efficiency/time_efficiency.py @@ -0,0 +1,68 @@ +"""Community evaluator: time_efficiency + +Scores how quickly the agent resolved relative to a time budget. +Uses performance_metrics.duration_s from trace data when available. 
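A minimal sketch of the resulting linear score, assuming the default 120 s budget (illustrative values, not taken from trace data):

    score = clamp(1.0 - duration_s / max_duration_s, 0.0, 1.0)
    30 s   -> 0.75
    120 s  -> 0.00 (budget exhausted)
    150 s  -> 0.00 (clamped)
    no duration data -> neutral 0.5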
+ +Config options: + max_duration_s (float): Time budget in seconds (default: 120) +""" + +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + + +def _extract_duration(inv) -> float | None: + """Extract duration_s from an invocation's performance_metrics.""" + perf = getattr(inv, "performance_metrics", None) + if perf is None and hasattr(inv, "__getitem__"): + try: + perf = inv["performance_metrics"] + except (KeyError, TypeError): + perf = None + + if isinstance(perf, dict): + duration = perf.get("duration_s") or perf.get("duration") + if duration is not None: + return float(duration) + + return None + + +@evaluator +def time_efficiency(input: EvalInput) -> EvalResult: + max_duration = input.config.get("max_duration_s", 120.0) + + scores: list[float] = [] + details_items: list[str] = [] + has_data = False + + for inv in input.invocations: + duration = _extract_duration(inv) + + if duration is None: + # No timing data — assign neutral score + scores.append(0.5) + details_items.append(f"{inv.invocation_id}: no duration data available") + continue + + has_data = True + score = max(0.0, min(1.0, 1.0 - (duration / max_duration))) + scores.append(score) + details_items.append( + f"{inv.invocation_id}: {duration:.1f}s / {max_duration:.1f}s budget (score: {score:.2f})" + ) + + overall = sum(scores) / len(scores) if scores else 0.0 + + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={ + "time_details": details_items, + "has_trace_data": has_data, + "max_duration_s": max_duration, + }, + ) + + +if __name__ == "__main__": + time_efficiency.run() diff --git a/evaluators/token_efficiency/evaluator.yaml b/evaluators/token_efficiency/evaluator.yaml new file mode 100644 index 0000000..e6d23c6 --- /dev/null +++ b/evaluators/token_efficiency/evaluator.yaml @@ -0,0 +1,6 @@ +name: token_efficiency +description: Scores how efficiently the agent used tokens relative to a budget +language: python +entrypoint: token_efficiency.py +tags: [performance, tokens, efficiency, budget] +author: henrikrexed diff --git a/evaluators/token_efficiency/token_efficiency.py b/evaluators/token_efficiency/token_efficiency.py new file mode 100644 index 0000000..b15fb24 --- /dev/null +++ b/evaluators/token_efficiency/token_efficiency.py @@ -0,0 +1,104 @@ +"""Community evaluator: token_efficiency + +Scores how efficiently the agent used tokens relative to a budget. +Uses performance_metrics from trace data when available, falls back to +counting tool calls as a rough proxy. 
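A minimal sketch of the weighted scoring and the tool-call fallback, assuming the defaults (200000-token budget, 0.7/0.3 weights, ~5000 tokens per call):

    weighted = input_tokens * 0.7 + output_tokens * 0.3
    100000 in, 20000 out          -> weighted 76000 -> score 1 - 76000/200000 = 0.62
    no token data, 4 tool calls   -> estimated 20000 tokens -> score 0.90
    no token data, no tool calls  -> neutral 0.5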
+ +Config options: + max_tokens (int): Token budget (default: 200000) + weight_input (float): Weight for input tokens in scoring (default: 0.7) + weight_output (float): Weight for output tokens in scoring (default: 0.3) +""" + +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + + +def _extract_tokens(inv) -> dict | None: + """Extract token counts from an invocation's performance_metrics or metadata.""" + # Check performance_metrics (primary source from OTel trace data) + perf = getattr(inv, "performance_metrics", None) + if perf is None and hasattr(inv, "__getitem__"): + try: + perf = inv["performance_metrics"] + except (KeyError, TypeError): + perf = None + + if isinstance(perf, dict): + input_t = perf.get("input_tokens") or perf.get("prompt_tokens") + output_t = perf.get("output_tokens") or perf.get("completion_tokens") + if input_t is not None or output_t is not None: + return { + "input_tokens": int(input_t or 0), + "output_tokens": int(output_t or 0), + } + + # Check performance_budget on invocation (eval_set integration) + budget = getattr(inv, "performance_budget", None) + if budget is None and hasattr(inv, "__getitem__"): + try: + budget = inv["performance_budget"] + except (KeyError, TypeError): + pass + + # No token data available + return None + + +@evaluator +def token_efficiency(input: EvalInput) -> EvalResult: + max_tokens = input.config.get("max_tokens", 200000) + weight_input = input.config.get("weight_input", 0.7) + weight_output = input.config.get("weight_output", 0.3) + + scores: list[float] = [] + details_items: list[str] = [] + has_data = False + + for inv in input.invocations: + tokens = _extract_tokens(inv) + + if tokens is None: + # No token data — score based on tool call count as rough proxy + # More tool calls ≈ more tokens used + tool_count = len(inv.intermediate_steps.tool_calls) if inv.intermediate_steps else 0 + if tool_count == 0: + scores.append(0.5) # No data, neutral score + details_items.append(f"{inv.invocation_id}: no token data available") + else: + # Rough heuristic: assume ~5000 tokens per tool call + estimated = tool_count * 5000 + score = max(0.0, min(1.0, 1.0 - (estimated / max_tokens))) + scores.append(score) + details_items.append( + f"{inv.invocation_id}: estimated ~{estimated} tokens from {tool_count} tool calls" + ) + continue + + has_data = True + input_t = tokens["input_tokens"] + output_t = tokens["output_tokens"] + weighted_total = (input_t * weight_input) + (output_t * weight_output) + weighted_budget = max_tokens * 1.0 # Budget applies to weighted total + + score = max(0.0, min(1.0, 1.0 - (weighted_total / weighted_budget))) + scores.append(score) + details_items.append( + f"{inv.invocation_id}: {input_t} input + {output_t} output = " + f"{input_t + output_t} total (weighted: {weighted_total:.0f}/{weighted_budget:.0f})" + ) + + overall = sum(scores) / len(scores) if scores else 0.0 + + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={ + "token_details": details_items, + "has_trace_data": has_data, + "max_tokens": max_tokens, + }, + ) + + +if __name__ == "__main__": + token_efficiency.run() diff --git a/evaluators/tool_efficiency/evaluator.yaml b/evaluators/tool_efficiency/evaluator.yaml new file mode 100644 index 0000000..0092f17 --- /dev/null +++ b/evaluators/tool_efficiency/evaluator.yaml @@ -0,0 +1,6 @@ +name: tool_efficiency +description: Scores whether the agent used tools effectively — penalizes waste, duplicates, and errors +language: python +entrypoint: tool_efficiency.py 
+tags: [performance, tools, efficiency, budget] +author: henrikrexed diff --git a/evaluators/tool_efficiency/tool_efficiency.py b/evaluators/tool_efficiency/tool_efficiency.py new file mode 100644 index 0000000..c56f84d --- /dev/null +++ b/evaluators/tool_efficiency/tool_efficiency.py @@ -0,0 +1,114 @@ +"""Community evaluator: tool_efficiency + +Scores whether the agent used tools effectively. Penalizes duplicate calls +(same tool + same args), error calls, and budget overruns. + +Config options: + max_tool_calls (int): Tool call budget (default: 15) + penalize_duplicates (bool): Penalize repeated identical calls (default: true) + penalize_errors (bool): Penalize failed tool calls (default: true) +""" + +import json +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + + +def _call_signature(call) -> str: + """Create a hashable signature for a tool call (name + sorted args).""" + name = call.get("name", "") if isinstance(call, dict) else getattr(call, "name", "") + args = call.get("args", {}) if isinstance(call, dict) else getattr(call, "args", {}) + try: + args_str = json.dumps(args, sort_keys=True, default=str) + except (TypeError, ValueError): + args_str = str(args) + return f"{name}::{args_str}" + + +def _is_error_response(response) -> bool: + """Check if a tool response indicates an error.""" + output = response.get("output", "") if isinstance(response, dict) else getattr(response, "output", "") + output_str = str(output).lower() + # Check common error indicators + if any(marker in output_str for marker in ["error", "failed", "exception", "traceback"]): + return True + status = response.get("status", "") if isinstance(response, dict) else getattr(response, "status", "") + if str(status).lower() in ("error", "failed"): + return True + return False + + +@evaluator +def tool_efficiency(input: EvalInput) -> EvalResult: + max_tool_calls = input.config.get("max_tool_calls", 15) + penalize_duplicates = input.config.get("penalize_duplicates", True) + penalize_errors = input.config.get("penalize_errors", True) + + scores: list[float] = [] + details_items: list[str] = [] + + for inv in input.invocations: + tool_calls = inv.intermediate_steps.tool_calls if inv.intermediate_steps else [] + tool_responses = ( + inv.intermediate_steps.tool_responses if inv.intermediate_steps else [] + ) + total = len(tool_calls) + + if total == 0: + scores.append(1.0) # No tools needed = perfectly efficient + details_items.append(f"{inv.invocation_id}: no tool calls (score: 1.0)") + continue + + # Count duplicates + seen_signatures: dict[str, int] = {} + duplicate_count = 0 + for call in tool_calls: + sig = _call_signature(call) + seen_signatures[sig] = seen_signatures.get(sig, 0) + 1 + + if penalize_duplicates: + duplicate_count = sum(count - 1 for count in seen_signatures.values() if count > 1) + + # Count errors + error_count = 0 + if penalize_errors and tool_responses: + for resp in tool_responses: + if _is_error_response(resp): + error_count += 1 + + # Calculate useful calls + wasted = duplicate_count + error_count + useful = max(0, total - wasted) + + # Efficiency ratio: useful / total + efficiency_ratio = useful / total if total > 0 else 1.0 + + # Budget penalty: how much over budget + budget_overrun = max(0, total - max_tool_calls) / max_tool_calls + budget_factor = max(0.0, 1.0 - budget_overrun) + + score = max(0.0, min(1.0, efficiency_ratio * budget_factor)) + scores.append(score) + + parts = [f"total={total}", f"useful={useful}"] + if duplicate_count > 0: + 
parts.append(f"duplicates={duplicate_count}") + if error_count > 0: + parts.append(f"errors={error_count}") + if total > max_tool_calls: + parts.append(f"over_budget={total - max_tool_calls}") + details_items.append(f"{inv.invocation_id}: {', '.join(parts)} (score: {score:.2f})") + + overall = sum(scores) / len(scores) if scores else 0.0 + + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={ + "tool_details": details_items, + "max_tool_calls": max_tool_calls, + }, + ) + + +if __name__ == "__main__": + tool_efficiency.run() From 8371bc0041da14ffae8c4972f544c37d23801e93 Mon Sep 17 00:00:00 2001 From: Henrik Rexed Date: Mon, 30 Mar 2026 17:02:18 +0200 Subject: [PATCH 2/5] =?UTF-8?q?fix:=20address=20review=20=E2=80=94=20use?= =?UTF-8?q?=20EvalStatus.NOT=5FEVALUATED,=20remove=20dead=20code,=20trim?= =?UTF-8?q?=20comments?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix .gitignore: separate .venv/ and __pycache__/ on own lines - token_efficiency: remove unused performance_budget block, return NOT_EVALUATED when no token data available - time_efficiency: return NOT_EVALUATED when no duration data - All evaluators: trim verbose comments, cleaner code - All pass validate_evaluator.py --- .gitignore | 3 +- evaluators/time_efficiency/time_efficiency.py | 43 ++++------ .../token_efficiency/token_efficiency.py | 78 +++++------------ evaluators/tool_efficiency/tool_efficiency.py | 83 +++++-------------- 4 files changed, 62 insertions(+), 145 deletions(-) diff --git a/.gitignore b/.gitignore index 8067d78..a230a78 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -.venv/evaluators/__pycache__ +.venv/ +__pycache__/ diff --git a/evaluators/time_efficiency/time_efficiency.py b/evaluators/time_efficiency/time_efficiency.py index 2bdbc40..f425d36 100644 --- a/evaluators/time_efficiency/time_efficiency.py +++ b/evaluators/time_efficiency/time_efficiency.py @@ -1,17 +1,15 @@ """Community evaluator: time_efficiency -Scores how quickly the agent resolved relative to a time budget. -Uses performance_metrics.duration_s from trace data when available. +Scores resolution time relative to a budget. Extracts duration_s from +performance_metrics when available, otherwise returns NOT_EVALUATED. 
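A minimal sketch of the revised behavior, assuming a 120 s budget and two invocations:

    durations [45.0, None]  -> per-invocation scores [0.625, 0.0], overall 0.3125
    durations [None, None]  -> EvalResult(score=0.0, status=EvalStatus.NOT_EVALUATED)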
-Config options: - max_duration_s (float): Time budget in seconds (default: 120) +Config: max_duration_s (float, default 120) """ -from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator def _extract_duration(inv) -> float | None: - """Extract duration_s from an invocation's performance_metrics.""" perf = getattr(inv, "performance_metrics", None) if perf is None and hasattr(inv, "__getitem__"): try: @@ -20,10 +18,9 @@ def _extract_duration(inv) -> float | None: perf = None if isinstance(perf, dict): - duration = perf.get("duration_s") or perf.get("duration") - if duration is not None: - return float(duration) - + d = perf.get("duration_s") or perf.get("duration") + if d is not None: + return float(d) return None @@ -37,31 +34,25 @@ def time_efficiency(input: EvalInput) -> EvalResult: for inv in input.invocations: duration = _extract_duration(inv) - if duration is None: - # No timing data — assign neutral score - scores.append(0.5) - details_items.append(f"{inv.invocation_id}: no duration data available") + scores.append(0.0) + details_items.append(f"{inv.invocation_id}: no duration data") continue has_data = True score = max(0.0, min(1.0, 1.0 - (duration / max_duration))) scores.append(score) - details_items.append( - f"{inv.invocation_id}: {duration:.1f}s / {max_duration:.1f}s budget (score: {score:.2f})" + details_items.append(f"{inv.invocation_id}: {duration:.1f}s / {max_duration:.1f}s") + + if not has_data: + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + details={"reason": "no duration data in any invocation"}, ) overall = sum(scores) / len(scores) if scores else 0.0 - - return EvalResult( - score=overall, - per_invocation_scores=scores, - details={ - "time_details": details_items, - "has_trace_data": has_data, - "max_duration_s": max_duration, - }, - ) + return EvalResult(score=overall, per_invocation_scores=scores, details={"time_details": details_items}) if __name__ == "__main__": diff --git a/evaluators/token_efficiency/token_efficiency.py b/evaluators/token_efficiency/token_efficiency.py index b15fb24..ccc6bc6 100644 --- a/evaluators/token_efficiency/token_efficiency.py +++ b/evaluators/token_efficiency/token_efficiency.py @@ -1,21 +1,16 @@ """Community evaluator: token_efficiency -Scores how efficiently the agent used tokens relative to a budget. -Uses performance_metrics from trace data when available, falls back to -counting tool calls as a rough proxy. - -Config options: - max_tokens (int): Token budget (default: 200000) - weight_input (float): Weight for input tokens in scoring (default: 0.7) - weight_output (float): Weight for output tokens in scoring (default: 0.3) +Scores token usage relative to a budget. Extracts input/output tokens from +performance_metrics when available, otherwise returns NOT_EVALUATED. 
+ +Config: max_tokens (int, default 200000), weight_input (float, default 0.7), + weight_output (float, default 0.3) """ -from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator def _extract_tokens(inv) -> dict | None: - """Extract token counts from an invocation's performance_metrics or metadata.""" - # Check performance_metrics (primary source from OTel trace data) perf = getattr(inv, "performance_metrics", None) if perf is None and hasattr(inv, "__getitem__"): try: @@ -27,20 +22,8 @@ def _extract_tokens(inv) -> dict | None: input_t = perf.get("input_tokens") or perf.get("prompt_tokens") output_t = perf.get("output_tokens") or perf.get("completion_tokens") if input_t is not None or output_t is not None: - return { - "input_tokens": int(input_t or 0), - "output_tokens": int(output_t or 0), - } - - # Check performance_budget on invocation (eval_set integration) - budget = getattr(inv, "performance_budget", None) - if budget is None and hasattr(inv, "__getitem__"): - try: - budget = inv["performance_budget"] - except (KeyError, TypeError): - pass + return {"input_tokens": int(input_t or 0), "output_tokens": int(output_t or 0)} - # No token data available return None @@ -56,48 +39,29 @@ def token_efficiency(input: EvalInput) -> EvalResult: for inv in input.invocations: tokens = _extract_tokens(inv) - if tokens is None: - # No token data — score based on tool call count as rough proxy - # More tool calls ≈ more tokens used - tool_count = len(inv.intermediate_steps.tool_calls) if inv.intermediate_steps else 0 - if tool_count == 0: - scores.append(0.5) # No data, neutral score - details_items.append(f"{inv.invocation_id}: no token data available") - else: - # Rough heuristic: assume ~5000 tokens per tool call - estimated = tool_count * 5000 - score = max(0.0, min(1.0, 1.0 - (estimated / max_tokens))) - scores.append(score) - details_items.append( - f"{inv.invocation_id}: estimated ~{estimated} tokens from {tool_count} tool calls" - ) + scores.append(0.0) + details_items.append(f"{inv.invocation_id}: no token data") continue has_data = True - input_t = tokens["input_tokens"] - output_t = tokens["output_tokens"] - weighted_total = (input_t * weight_input) + (output_t * weight_output) - weighted_budget = max_tokens * 1.0 # Budget applies to weighted total - - score = max(0.0, min(1.0, 1.0 - (weighted_total / weighted_budget))) + weighted = (tokens["input_tokens"] * weight_input) + (tokens["output_tokens"] * weight_output) + score = max(0.0, min(1.0, 1.0 - (weighted / max_tokens))) scores.append(score) details_items.append( - f"{inv.invocation_id}: {input_t} input + {output_t} output = " - f"{input_t + output_t} total (weighted: {weighted_total:.0f}/{weighted_budget:.0f})" + f"{inv.invocation_id}: {tokens['input_tokens']}in + {tokens['output_tokens']}out " + f"(weighted {weighted:.0f}/{max_tokens})" ) - overall = sum(scores) / len(scores) if scores else 0.0 + if not has_data: + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + details={"reason": "no token data in any invocation"}, + ) - return EvalResult( - score=overall, - per_invocation_scores=scores, - details={ - "token_details": details_items, - "has_trace_data": has_data, - "max_tokens": max_tokens, - }, - ) + overall = sum(scores) / len(scores) if scores else 0.0 + return EvalResult(score=overall, per_invocation_scores=scores, details={"token_details": details_items}) if __name__ == "__main__": diff --git 
a/evaluators/tool_efficiency/tool_efficiency.py b/evaluators/tool_efficiency/tool_efficiency.py index c56f84d..e54c091 100644 --- a/evaluators/tool_efficiency/tool_efficiency.py +++ b/evaluators/tool_efficiency/tool_efficiency.py @@ -1,20 +1,17 @@ """Community evaluator: tool_efficiency -Scores whether the agent used tools effectively. Penalizes duplicate calls -(same tool + same args), error calls, and budget overruns. +Scores tool usage effectiveness. Penalizes duplicate calls (same tool + args), +error responses, and budget overruns. -Config options: - max_tool_calls (int): Tool call budget (default: 15) - penalize_duplicates (bool): Penalize repeated identical calls (default: true) - penalize_errors (bool): Penalize failed tool calls (default: true) +Config: max_tool_calls (int, default 15), penalize_duplicates (bool, default true), + penalize_errors (bool, default true) """ import json -from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator def _call_signature(call) -> str: - """Create a hashable signature for a tool call (name + sorted args).""" name = call.get("name", "") if isinstance(call, dict) else getattr(call, "name", "") args = call.get("args", {}) if isinstance(call, dict) else getattr(call, "args", {}) try: @@ -25,16 +22,11 @@ def _call_signature(call) -> str: def _is_error_response(response) -> bool: - """Check if a tool response indicates an error.""" output = response.get("output", "") if isinstance(response, dict) else getattr(response, "output", "") - output_str = str(output).lower() - # Check common error indicators - if any(marker in output_str for marker in ["error", "failed", "exception", "traceback"]): + if any(m in str(output).lower() for m in ["error", "failed", "exception", "traceback"]): return True status = response.get("status", "") if isinstance(response, dict) else getattr(response, "status", "") - if str(status).lower() in ("error", "failed"): - return True - return False + return str(status).lower() in ("error", "failed") @evaluator @@ -48,66 +40,35 @@ def tool_efficiency(input: EvalInput) -> EvalResult: for inv in input.invocations: tool_calls = inv.intermediate_steps.tool_calls if inv.intermediate_steps else [] - tool_responses = ( - inv.intermediate_steps.tool_responses if inv.intermediate_steps else [] - ) + tool_responses = inv.intermediate_steps.tool_responses if inv.intermediate_steps else [] total = len(tool_calls) if total == 0: - scores.append(1.0) # No tools needed = perfectly efficient - details_items.append(f"{inv.invocation_id}: no tool calls (score: 1.0)") + scores.append(1.0) + details_items.append(f"{inv.invocation_id}: no tool calls") continue - # Count duplicates - seen_signatures: dict[str, int] = {} - duplicate_count = 0 + seen: dict[str, int] = {} for call in tool_calls: sig = _call_signature(call) - seen_signatures[sig] = seen_signatures.get(sig, 0) + 1 - - if penalize_duplicates: - duplicate_count = sum(count - 1 for count in seen_signatures.values() if count > 1) - - # Count errors - error_count = 0 - if penalize_errors and tool_responses: - for resp in tool_responses: - if _is_error_response(resp): - error_count += 1 + seen[sig] = seen.get(sig, 0) + 1 - # Calculate useful calls - wasted = duplicate_count + error_count - useful = max(0, total - wasted) + dupes = sum(c - 1 for c in seen.values() if c > 1) if penalize_duplicates else 0 + errors = sum(1 for r in tool_responses if _is_error_response(r)) if penalize_errors else 0 + useful 
= max(0, total - dupes - errors) - # Efficiency ratio: useful / total - efficiency_ratio = useful / total if total > 0 else 1.0 - - # Budget penalty: how much over budget - budget_overrun = max(0, total - max_tool_calls) / max_tool_calls - budget_factor = max(0.0, 1.0 - budget_overrun) - - score = max(0.0, min(1.0, efficiency_ratio * budget_factor)) + efficiency = useful / total + budget_factor = max(0.0, 1.0 - max(0, total - max_tool_calls) / max_tool_calls) + score = max(0.0, min(1.0, efficiency * budget_factor)) scores.append(score) parts = [f"total={total}", f"useful={useful}"] - if duplicate_count > 0: - parts.append(f"duplicates={duplicate_count}") - if error_count > 0: - parts.append(f"errors={error_count}") - if total > max_tool_calls: - parts.append(f"over_budget={total - max_tool_calls}") - details_items.append(f"{inv.invocation_id}: {', '.join(parts)} (score: {score:.2f})") + if dupes: parts.append(f"dupes={dupes}") + if errors: parts.append(f"errors={errors}") + details_items.append(f"{inv.invocation_id}: {', '.join(parts)}") overall = sum(scores) / len(scores) if scores else 0.0 - - return EvalResult( - score=overall, - per_invocation_scores=scores, - details={ - "tool_details": details_items, - "max_tool_calls": max_tool_calls, - }, - ) + return EvalResult(score=overall, per_invocation_scores=scores, details={"tool_details": details_items}) if __name__ == "__main__": From 4e9899dc6b5f1a3a9f912dde8b72b920150f1f99 Mon Sep 17 00:00:00 2001 From: Henrik Rexed Date: Mon, 30 Mar 2026 18:18:50 +0200 Subject: [PATCH 3/5] fix: remove naive text heuristic from error detection, use status field only _is_error_response now only checks the structured status field (error/failed/failure) instead of scanning output text for substrings like "error" which would false-positive on legitimate outputs. 
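A minimal sketch of what the stricter check accepts and rejects, assuming dict-shaped tool responses:

    _is_error_response({"status": "error"})                             -> True
    _is_error_response({"status": "FAILED"})                            -> True
    _is_error_response({"output": "error: not found", "status": "ok"})  -> False (output text ignored)
    _is_error_response({"output": "Traceback ...", "status": ""})       -> False (unreported failures are missed)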
--- evaluators/tool_efficiency/tool_efficiency.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/evaluators/tool_efficiency/tool_efficiency.py b/evaluators/tool_efficiency/tool_efficiency.py index e54c091..b785f3d 100644 --- a/evaluators/tool_efficiency/tool_efficiency.py +++ b/evaluators/tool_efficiency/tool_efficiency.py @@ -8,7 +8,7 @@ """ import json -from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator def _call_signature(call) -> str: @@ -22,11 +22,9 @@ def _call_signature(call) -> str: def _is_error_response(response) -> bool: - output = response.get("output", "") if isinstance(response, dict) else getattr(response, "output", "") - if any(m in str(output).lower() for m in ["error", "failed", "exception", "traceback"]): - return True + """Check if a tool response indicates an error via its status field.""" status = response.get("status", "") if isinstance(response, dict) else getattr(response, "status", "") - return str(status).lower() in ("error", "failed") + return str(status).lower() in ("error", "failed", "failure") @evaluator From beadc5c3f0e02408f6d20714a5d932094ea1608b Mon Sep 17 00:00:00 2001 From: Henrik Rexed Date: Mon, 30 Mar 2026 18:24:35 +0200 Subject: [PATCH 4/5] =?UTF-8?q?fix:=20address=20review=20=E2=80=94=20add?= =?UTF-8?q?=20min=5Ftool=5Fcalls=20config,=20move=20signature=20counting?= =?UTF-8?q?=20into=20conditional?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add min_tool_calls config (default 0): when >0, zero tool calls scores 0.0 instead of 1.0 (zero calls often means hallucinated answer) - Move signature computation inside penalize_duplicates conditional to avoid unnecessary work when duplicate detection is disabled - Complements tool_coverage for strict tool-usage requirements --- evaluators/tool_efficiency/tool_efficiency.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/evaluators/tool_efficiency/tool_efficiency.py b/evaluators/tool_efficiency/tool_efficiency.py index b785f3d..d03e400 100644 --- a/evaluators/tool_efficiency/tool_efficiency.py +++ b/evaluators/tool_efficiency/tool_efficiency.py @@ -3,8 +3,8 @@ Scores tool usage effectiveness. Penalizes duplicate calls (same tool + args), error responses, and budget overruns. 
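A minimal sketch of the combined scoring with the defaults (max_tool_calls=15) and the new min_tool_calls option:

    10 calls, 2 duplicates, 1 error -> useful 7/10 -> efficiency 0.70, within budget -> score 0.70
    20 distinct, successful calls   -> efficiency 1.0, overrun (20-15)/15 -> budget factor 0.67 -> score 0.67
    0 calls, min_tool_calls = 0     -> score 1.0 (tools optional)
    0 calls, min_tool_calls = 1     -> score 0.0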
-Config: max_tool_calls (int, default 15), penalize_duplicates (bool, default true), - penalize_errors (bool, default true) +Config: max_tool_calls (int, default 15), min_tool_calls (int, default 0), + penalize_duplicates (bool, default true), penalize_errors (bool, default true) """ import json @@ -30,6 +30,7 @@ def _is_error_response(response) -> bool: @evaluator def tool_efficiency(input: EvalInput) -> EvalResult: max_tool_calls = input.config.get("max_tool_calls", 15) + min_tool_calls = input.config.get("min_tool_calls", 0) penalize_duplicates = input.config.get("penalize_duplicates", True) penalize_errors = input.config.get("penalize_errors", True) @@ -42,16 +43,22 @@ def tool_efficiency(input: EvalInput) -> EvalResult: total = len(tool_calls) if total == 0: - scores.append(1.0) - details_items.append(f"{inv.invocation_id}: no tool calls") + if min_tool_calls > 0: + scores.append(0.0) + details_items.append(f"{inv.invocation_id}: no tool calls (min required: {min_tool_calls})") + else: + scores.append(1.0) + details_items.append(f"{inv.invocation_id}: no tool calls (tools optional)") continue - seen: dict[str, int] = {} - for call in tool_calls: - sig = _call_signature(call) - seen[sig] = seen.get(sig, 0) + 1 + dupes = 0 + if penalize_duplicates: + seen: dict[str, int] = {} + for call in tool_calls: + sig = _call_signature(call) + seen[sig] = seen.get(sig, 0) + 1 + dupes = sum(c - 1 for c in seen.values() if c > 1) - dupes = sum(c - 1 for c in seen.values() if c > 1) if penalize_duplicates else 0 errors = sum(1 for r in tool_responses if _is_error_response(r)) if penalize_errors else 0 useful = max(0, total - dupes - errors) From c5d16842054cbf9ac298072038f815483bcb367a Mon Sep 17 00:00:00 2001 From: henrikrexed Date: Tue, 7 Apr 2026 18:27:56 +0200 Subject: [PATCH 5/5] fix: use SDK performance_metrics field directly, split token budgets - Remove getattr/dict-access fallbacks in token_efficiency and time_efficiency now that SDK 0.1.1 has performance_metrics as a proper field on InvocationData - Replace weighted max_tokens with separate max_input_tokens and max_output_tokens config (score = min of both), per review feedback - All three evaluators tested e2e with SDK 0.1.1 Co-Authored-By: Paperclip --- evaluators/time_efficiency/time_efficiency.py | 18 ++++----- .../token_efficiency/token_efficiency.py | 37 ++++++++----------- 2 files changed, 23 insertions(+), 32 deletions(-) diff --git a/evaluators/time_efficiency/time_efficiency.py b/evaluators/time_efficiency/time_efficiency.py index f425d36..f5069e1 100644 --- a/evaluators/time_efficiency/time_efficiency.py +++ b/evaluators/time_efficiency/time_efficiency.py @@ -10,17 +10,13 @@ def _extract_duration(inv) -> float | None: - perf = getattr(inv, "performance_metrics", None) - if perf is None and hasattr(inv, "__getitem__"): - try: - perf = inv["performance_metrics"] - except (KeyError, TypeError): - perf = None - - if isinstance(perf, dict): - d = perf.get("duration_s") or perf.get("duration") - if d is not None: - return float(d) + perf = inv.performance_metrics + if not isinstance(perf, dict): + return None + + d = perf.get("duration_s") or perf.get("duration") + if d is not None: + return float(d) return None diff --git a/evaluators/token_efficiency/token_efficiency.py b/evaluators/token_efficiency/token_efficiency.py index ccc6bc6..cd12975 100644 --- a/evaluators/token_efficiency/token_efficiency.py +++ b/evaluators/token_efficiency/token_efficiency.py @@ -3,35 +3,29 @@ Scores token usage relative to a budget. 
Extracts input/output tokens from performance_metrics when available, otherwise returns NOT_EVALUATED. -Config: max_tokens (int, default 200000), weight_input (float, default 0.7), - weight_output (float, default 0.3) +Config: max_input_tokens (int, default 150000), max_output_tokens (int, default 50000) """ from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator def _extract_tokens(inv) -> dict | None: - perf = getattr(inv, "performance_metrics", None) - if perf is None and hasattr(inv, "__getitem__"): - try: - perf = inv["performance_metrics"] - except (KeyError, TypeError): - perf = None - - if isinstance(perf, dict): - input_t = perf.get("input_tokens") or perf.get("prompt_tokens") - output_t = perf.get("output_tokens") or perf.get("completion_tokens") - if input_t is not None or output_t is not None: - return {"input_tokens": int(input_t or 0), "output_tokens": int(output_t or 0)} + perf = inv.performance_metrics + if not isinstance(perf, dict): + return None + + input_t = perf.get("input_tokens") or perf.get("prompt_tokens") + output_t = perf.get("output_tokens") or perf.get("completion_tokens") + if input_t is not None or output_t is not None: + return {"input_tokens": int(input_t or 0), "output_tokens": int(output_t or 0)} return None @evaluator def token_efficiency(input: EvalInput) -> EvalResult: - max_tokens = input.config.get("max_tokens", 200000) - weight_input = input.config.get("weight_input", 0.7) - weight_output = input.config.get("weight_output", 0.3) + max_input = input.config.get("max_input_tokens", 150000) + max_output = input.config.get("max_output_tokens", 50000) scores: list[float] = [] details_items: list[str] = [] @@ -45,12 +39,13 @@ def token_efficiency(input: EvalInput) -> EvalResult: continue has_data = True - weighted = (tokens["input_tokens"] * weight_input) + (tokens["output_tokens"] * weight_output) - score = max(0.0, min(1.0, 1.0 - (weighted / max_tokens))) + input_score = max(0.0, min(1.0, 1.0 - (tokens["input_tokens"] / max_input))) if max_input > 0 else 1.0 + output_score = max(0.0, min(1.0, 1.0 - (tokens["output_tokens"] / max_output))) if max_output > 0 else 1.0 + score = min(input_score, output_score) scores.append(score) details_items.append( - f"{inv.invocation_id}: {tokens['input_tokens']}in + {tokens['output_tokens']}out " - f"(weighted {weighted:.0f}/{max_tokens})" + f"{inv.invocation_id}: {tokens['input_tokens']}in/{max_input} + " + f"{tokens['output_tokens']}out/{max_output} -> {score:.2f}" ) if not has_data:
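# A minimal sketch of the split-budget scoring with the new defaults
# (max_input_tokens=150000, max_output_tokens=50000):
#   120000 in -> input_score 0.20; 10000 out -> output_score 0.80; score = min = 0.20
# Exceeding either budget on its own is enough to pull the score to 0.0.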