From 08b0905516c5b04fb03bdd554728844f63796b06 Mon Sep 17 00:00:00 2001 From: Henrik Rexed Date: Mon, 30 Mar 2026 11:13:24 +0000 Subject: [PATCH 1/5] feat: add token_efficiency, tool_efficiency, time_efficiency evaluators (issue #6) Three built-in performance evaluators that score agents from trace data: - token_efficiency: scores token usage vs budget (weighted input/output) - tool_efficiency: penalizes duplicate calls, errors, and budget overruns - time_efficiency: scores resolution time vs budget All evaluators: - Follow the @evaluator SDK pattern (stdin/stdout JSON protocol) - Handle missing performance_metrics gracefully (neutral scores) - Support per-invocation scoring - Pass validate_evaluator.py smoke tests Closes agentevals-dev/evaluators#6 --- .gitignore | 2 +- .../__pycache__/bertscore.cpython-314.pyc | Bin 5775 -> 0 bytes evaluators/time_efficiency/evaluator.yaml | 6 + evaluators/time_efficiency/time_efficiency.py | 68 +++++++++++ evaluators/token_efficiency/evaluator.yaml | 6 + .../token_efficiency/token_efficiency.py | 104 ++++++++++++++++ evaluators/tool_efficiency/evaluator.yaml | 6 + evaluators/tool_efficiency/tool_efficiency.py | 114 ++++++++++++++++++ 8 files changed, 305 insertions(+), 1 deletion(-) delete mode 100644 evaluators/bertscore/__pycache__/bertscore.cpython-314.pyc create mode 100644 evaluators/time_efficiency/evaluator.yaml create mode 100644 evaluators/time_efficiency/time_efficiency.py create mode 100644 evaluators/token_efficiency/evaluator.yaml create mode 100644 evaluators/token_efficiency/token_efficiency.py create mode 100644 evaluators/tool_efficiency/evaluator.yaml create mode 100644 evaluators/tool_efficiency/tool_efficiency.py diff --git a/.gitignore b/.gitignore index 0cafc1c..8067d78 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -.venv/ \ No newline at end of file +.venv/evaluators/__pycache__ diff --git a/evaluators/bertscore/__pycache__/bertscore.cpython-314.pyc b/evaluators/bertscore/__pycache__/bertscore.cpython-314.pyc deleted file mode 100644 index 551cc69f9ea7d27c5e9dc862e507ab4d519eda31..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5775 zcmb^#TWs6b^-`25krE~O6~~DkQ?}zov0~MM+a^xZ)=BKxc~xaxxoaE~j6~UNBuXWz zJdA#*K(X2FIy>l!de}ZZU=Q!lz<#p74cLHwT2VY{gn=1rfv$fxk}g0$ww+6fvg{`9 z3hW5FJiI*jo^#JVuX}sEEQ0{D*ZwTG_z?OGzgR$)5#rHr0LdX9>1Y!1#3(Uh({1e- zw~dlI2~Tp=KH|_FBa}{={q|Akh)Z{k&^kTB=*);)cef+n(TsSi872L?hj#+*GI930 zo2TK8!EgN)-n#d?k9Qw`1EF{WLape!HxjVK+J0KpFt@GK{0BYaunnOzL;Tp)SXz}h zO-hN1F2^`cPRU78mG!%vbW2QTMLn(dGR)a@DwEYEjoTh_IHO82S(DRBhyzSaCPUnC zUy!>Y>9-|G;UqCO&B@BGbWGHN=#U0fS|+V%5+f>cPK4Klq)JLm;&kbb&Sf=Onc`w; z1;1s*BqybANbxvyY8)OAO9KH&S{2X~OG(I60}RJWcQR5;m*U)EO;>{h9520{l~pO; z3)l`edKzX*WkguCI3>!8ruTBA2`-(Ibr_D9qGwe_QRS4V-sLoNf3V1DMN;$uuA{2=ju6*T);wS^c?XOHrM#|*Qxet* zhqYPJ4}PRluAK2BW;Bffmd7RhT2I8Fbb3Vg#5m z{$s!{2+l@43IUQsQ8Z~g{vMb<^tGbbh-rc+cpFdhcHY5LypwnFG|z+`oha!36098= zc5GS(3FTey0`phP-V-Qsq~IXik4P?hhX&- z>}*QZH7sX`aV0#RPD$a!P5q{-hhw0G@J&_LW^`Fp64Fgcm%`QMX|*WUwz=EkD%P91 zYj^~qqFqT4)F$Au5B}N!G&%GHF~q?w+>{u0lZ5AnuXWP_&rJ$>nl_ynb0M~6lg1c> zsQN88V2@xlis6-8-~IMjm< zRl25te=Cw9>bAiM{CndWqfU}-NG5{R%nO!nmZK1kH06!vlA@+_({elx!At|glxn@# zux0eIAZd8Z#DZ>`gJw80q8YOcMptn(z@hJoS`YGp>D%v2US)CP#SLT}uu2*G3~)$ib&P0O8{GgLr#4~&pD-oq!gkbDJD}IJm1}_H z(3Tf@n-}!ubN9PG?SIHditN=qdv$f}%>sKZH@r@J=iAx~ddq_Nb&J^OA zPHWrgw!Aw`f#Z5q?DPg5gME?Xum>uh&!DThqpT%tY-Lqpb20U$`Y~1q&5S=T$MkDv zkWD1hqJF)+AA|}y6P3Nz)S&5HW`pfXy95pPu>Hg<#Fp8XeD#|oe%%1Xff4XBf7_~ z+r8~+-V&k9bJn&|94v_!E1nD7Di}gf^Wpz_R%*wrwX&L(RILOE1<-gjg6(}5OcIl} 
z{)#--GvkapBiJK$!K4jVQPJQe3Sq;qKm=RCF0dj|1lzzau)bwn&)@`!=Ksuz9-qTW z&4NjE{HKuE+Ho|Sv_)N^8mo!YwQoDeg|H`9Aj4d?@wN=ZlTjCMkJ7v&%7jVqDd)_^ zN;y|k%~gwv?P3mx!r?x%Y!SJt10Joq9+weiwHNXq8JAOESZoctIu%ZtxGpqoiM}uY z0}y1)n54D=v0+mbgGoS)#352l8nlA5XcDRfgS?SSL55bLLdm2xcu$FU40}ov6~iv; zQYz?HyI>0SAZ~VrNf(wBM?u30l9~t!HKQ8Dv`28$1C1H7aPWlaiJ=@p>jhkAL7gj9 zGq8FbsLgQjZ$NVBu^;){7w@jTUhtjGjjVgy7Uh+pg7-}B{O7*L`8#uWioTA#ucPP- z<$a;$H|`%>^@R$)moRC={Pf&((HqEn14VC7-rKW${(k+cx2NDel{^2KLT>+j@crPz zNbx{8e;~Xs(NS-}XU_-g#4X1z2*|IqJ(AidE zy$dZLUSEu^v7JkI*AAUnV+ZDJFwLxO!_&OjvNW=sSc%_1v+90%)&6ohfMFN20u>HX zW^m_BvlovoM^TBzl$~k!FD%9sVY}j)b;I0>t2+h;m{7`^&IVI${m5NZ>jCur}#gl0)IlJdK+7fh>n> zhF4IYiLyLXI}`7Q9`}wO8hYs3p0ItJ-^JSq(dYn*(w2~M*ah$(9C%HL(iU}%PgVPB zlNjv0E#uwxr>GYanF}nGijuq^Cp5n9#ftwzglRBb)*<(biWi4dkf^1VnY}4lvo09$ zk+^yohO*pYp1KSQSKn~@Y|2QguzhhMm|vmpj!U{ICpClFz6qGA`^FL3pi<`H*i19q zS;w%)vT@NMW0|ah47O%^Ic|7&atxB{OCY=Y3T_6WfsUQ&E4as?%4d6ngreUdlagY% zwUm5@YS~oKrw#zFp28i3qMpGoXK{m#yDUOA zfx`~q%MQ)(R8%df>D!t?!n(|Sk2ll2D{!W7Nh%Z;8jeeq3|`LTRwk#n6Zo4zGX#I_ z_Yl)^XamypJ9F<8eeHQ)`w~;|_2foMPTvD(^SZ~k;3)b7d4Hhb>6j(feJ!Q>ruBxl zVna{9p{LY&XsLU-@7G64O>L{~CsrE=9@7p#vxyuuv*|_k&7ZX1Yc19V@^yiw(CTw% z3w7sm7fNjX{KVWuk!{PfZHv>(_5yn}H(YYEt4+rqI**m=nr5lb{S6BvAF;X7&oS}) z6Tcez?S)@nD0ZF9cbzPDoqo`D`e)sLe7?Yr!XeGy2I>BR|LF2t1^Cb!mSE2>asoYS>PXFk_yBFTOw6U)hIJO6Jmtg5dANSD5&61_Y=3--a zzOlR57+P%%EqBb)>vf$=qxXZK4j1aK&Qc|pcb=VN7ZM*U4_yaKuEwHkf8Mo!!{5Ht zPz)T)2ac8e`xm227uVXJEBW^>b}#K++jpeoZ(K+$j(wcmq#dmHU!;>}wrJ!JJlTgr z$9^1J=>Fj5FU~GeKfm}_@1YefcYfX7RPr{J4s@03S{7eeI<>NI_4MUW-&yrfJa*bW z)F!e!C^M@YEij{3epuNBa1Ur}%i^VL4{hdQg$2=kd+r-%BB!TRn*_nrAOUcgu=<>UlfB5PaiR}AKrvC+r2;0a2 diff --git a/evaluators/time_efficiency/evaluator.yaml b/evaluators/time_efficiency/evaluator.yaml new file mode 100644 index 0000000..cdd2dc8 --- /dev/null +++ b/evaluators/time_efficiency/evaluator.yaml @@ -0,0 +1,6 @@ +name: time_efficiency +description: Scores how quickly the agent resolved relative to a time budget +language: python +entrypoint: time_efficiency.py +tags: [performance, time, latency, efficiency, budget] +author: henrikrexed diff --git a/evaluators/time_efficiency/time_efficiency.py b/evaluators/time_efficiency/time_efficiency.py new file mode 100644 index 0000000..2bdbc40 --- /dev/null +++ b/evaluators/time_efficiency/time_efficiency.py @@ -0,0 +1,68 @@ +"""Community evaluator: time_efficiency + +Scores how quickly the agent resolved relative to a time budget. +Uses performance_metrics.duration_s from trace data when available. 
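A minimal sketch of the resulting linear score, assuming the default 120 s budget (illustrative values, not taken from trace data):

    score = clamp(1.0 - duration_s / max_duration_s, 0.0, 1.0)
    30 s   -> 0.75
    120 s  -> 0.00 (budget exhausted)
    150 s  -> 0.00 (clamped)
    no duration data -> neutral 0.5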
+ +Config options: + max_duration_s (float): Time budget in seconds (default: 120) +""" + +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + + +def _extract_duration(inv) -> float | None: + """Extract duration_s from an invocation's performance_metrics.""" + perf = getattr(inv, "performance_metrics", None) + if perf is None and hasattr(inv, "__getitem__"): + try: + perf = inv["performance_metrics"] + except (KeyError, TypeError): + perf = None + + if isinstance(perf, dict): + duration = perf.get("duration_s") or perf.get("duration") + if duration is not None: + return float(duration) + + return None + + +@evaluator +def time_efficiency(input: EvalInput) -> EvalResult: + max_duration = input.config.get("max_duration_s", 120.0) + + scores: list[float] = [] + details_items: list[str] = [] + has_data = False + + for inv in input.invocations: + duration = _extract_duration(inv) + + if duration is None: + # No timing data — assign neutral score + scores.append(0.5) + details_items.append(f"{inv.invocation_id}: no duration data available") + continue + + has_data = True + score = max(0.0, min(1.0, 1.0 - (duration / max_duration))) + scores.append(score) + details_items.append( + f"{inv.invocation_id}: {duration:.1f}s / {max_duration:.1f}s budget (score: {score:.2f})" + ) + + overall = sum(scores) / len(scores) if scores else 0.0 + + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={ + "time_details": details_items, + "has_trace_data": has_data, + "max_duration_s": max_duration, + }, + ) + + +if __name__ == "__main__": + time_efficiency.run() diff --git a/evaluators/token_efficiency/evaluator.yaml b/evaluators/token_efficiency/evaluator.yaml new file mode 100644 index 0000000..e6d23c6 --- /dev/null +++ b/evaluators/token_efficiency/evaluator.yaml @@ -0,0 +1,6 @@ +name: token_efficiency +description: Scores how efficiently the agent used tokens relative to a budget +language: python +entrypoint: token_efficiency.py +tags: [performance, tokens, efficiency, budget] +author: henrikrexed diff --git a/evaluators/token_efficiency/token_efficiency.py b/evaluators/token_efficiency/token_efficiency.py new file mode 100644 index 0000000..b15fb24 --- /dev/null +++ b/evaluators/token_efficiency/token_efficiency.py @@ -0,0 +1,104 @@ +"""Community evaluator: token_efficiency + +Scores how efficiently the agent used tokens relative to a budget. +Uses performance_metrics from trace data when available, falls back to +counting tool calls as a rough proxy. 
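A minimal sketch of the weighted scoring and the tool-call fallback, assuming the defaults (200000-token budget, 0.7/0.3 weights, ~5000 tokens per call):

    weighted = input_tokens * 0.7 + output_tokens * 0.3
    100000 in, 20000 out          -> weighted 76000 -> score 1 - 76000/200000 = 0.62
    no token data, 4 tool calls   -> estimated 20000 tokens -> score 0.90
    no token data, no tool calls  -> neutral 0.5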
+ +Config options: + max_tokens (int): Token budget (default: 200000) + weight_input (float): Weight for input tokens in scoring (default: 0.7) + weight_output (float): Weight for output tokens in scoring (default: 0.3) +""" + +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + + +def _extract_tokens(inv) -> dict | None: + """Extract token counts from an invocation's performance_metrics or metadata.""" + # Check performance_metrics (primary source from OTel trace data) + perf = getattr(inv, "performance_metrics", None) + if perf is None and hasattr(inv, "__getitem__"): + try: + perf = inv["performance_metrics"] + except (KeyError, TypeError): + perf = None + + if isinstance(perf, dict): + input_t = perf.get("input_tokens") or perf.get("prompt_tokens") + output_t = perf.get("output_tokens") or perf.get("completion_tokens") + if input_t is not None or output_t is not None: + return { + "input_tokens": int(input_t or 0), + "output_tokens": int(output_t or 0), + } + + # Check performance_budget on invocation (eval_set integration) + budget = getattr(inv, "performance_budget", None) + if budget is None and hasattr(inv, "__getitem__"): + try: + budget = inv["performance_budget"] + except (KeyError, TypeError): + pass + + # No token data available + return None + + +@evaluator +def token_efficiency(input: EvalInput) -> EvalResult: + max_tokens = input.config.get("max_tokens", 200000) + weight_input = input.config.get("weight_input", 0.7) + weight_output = input.config.get("weight_output", 0.3) + + scores: list[float] = [] + details_items: list[str] = [] + has_data = False + + for inv in input.invocations: + tokens = _extract_tokens(inv) + + if tokens is None: + # No token data — score based on tool call count as rough proxy + # More tool calls ≈ more tokens used + tool_count = len(inv.intermediate_steps.tool_calls) if inv.intermediate_steps else 0 + if tool_count == 0: + scores.append(0.5) # No data, neutral score + details_items.append(f"{inv.invocation_id}: no token data available") + else: + # Rough heuristic: assume ~5000 tokens per tool call + estimated = tool_count * 5000 + score = max(0.0, min(1.0, 1.0 - (estimated / max_tokens))) + scores.append(score) + details_items.append( + f"{inv.invocation_id}: estimated ~{estimated} tokens from {tool_count} tool calls" + ) + continue + + has_data = True + input_t = tokens["input_tokens"] + output_t = tokens["output_tokens"] + weighted_total = (input_t * weight_input) + (output_t * weight_output) + weighted_budget = max_tokens * 1.0 # Budget applies to weighted total + + score = max(0.0, min(1.0, 1.0 - (weighted_total / weighted_budget))) + scores.append(score) + details_items.append( + f"{inv.invocation_id}: {input_t} input + {output_t} output = " + f"{input_t + output_t} total (weighted: {weighted_total:.0f}/{weighted_budget:.0f})" + ) + + overall = sum(scores) / len(scores) if scores else 0.0 + + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={ + "token_details": details_items, + "has_trace_data": has_data, + "max_tokens": max_tokens, + }, + ) + + +if __name__ == "__main__": + token_efficiency.run() diff --git a/evaluators/tool_efficiency/evaluator.yaml b/evaluators/tool_efficiency/evaluator.yaml new file mode 100644 index 0000000..0092f17 --- /dev/null +++ b/evaluators/tool_efficiency/evaluator.yaml @@ -0,0 +1,6 @@ +name: tool_efficiency +description: Scores whether the agent used tools effectively — penalizes waste, duplicates, and errors +language: python +entrypoint: tool_efficiency.py 
+tags: [performance, tools, efficiency, budget] +author: henrikrexed diff --git a/evaluators/tool_efficiency/tool_efficiency.py b/evaluators/tool_efficiency/tool_efficiency.py new file mode 100644 index 0000000..c56f84d --- /dev/null +++ b/evaluators/tool_efficiency/tool_efficiency.py @@ -0,0 +1,114 @@ +"""Community evaluator: tool_efficiency + +Scores whether the agent used tools effectively. Penalizes duplicate calls +(same tool + same args), error calls, and budget overruns. + +Config options: + max_tool_calls (int): Tool call budget (default: 15) + penalize_duplicates (bool): Penalize repeated identical calls (default: true) + penalize_errors (bool): Penalize failed tool calls (default: true) +""" + +import json +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + + +def _call_signature(call) -> str: + """Create a hashable signature for a tool call (name + sorted args).""" + name = call.get("name", "") if isinstance(call, dict) else getattr(call, "name", "") + args = call.get("args", {}) if isinstance(call, dict) else getattr(call, "args", {}) + try: + args_str = json.dumps(args, sort_keys=True, default=str) + except (TypeError, ValueError): + args_str = str(args) + return f"{name}::{args_str}" + + +def _is_error_response(response) -> bool: + """Check if a tool response indicates an error.""" + output = response.get("output", "") if isinstance(response, dict) else getattr(response, "output", "") + output_str = str(output).lower() + # Check common error indicators + if any(marker in output_str for marker in ["error", "failed", "exception", "traceback"]): + return True + status = response.get("status", "") if isinstance(response, dict) else getattr(response, "status", "") + if str(status).lower() in ("error", "failed"): + return True + return False + + +@evaluator +def tool_efficiency(input: EvalInput) -> EvalResult: + max_tool_calls = input.config.get("max_tool_calls", 15) + penalize_duplicates = input.config.get("penalize_duplicates", True) + penalize_errors = input.config.get("penalize_errors", True) + + scores: list[float] = [] + details_items: list[str] = [] + + for inv in input.invocations: + tool_calls = inv.intermediate_steps.tool_calls if inv.intermediate_steps else [] + tool_responses = ( + inv.intermediate_steps.tool_responses if inv.intermediate_steps else [] + ) + total = len(tool_calls) + + if total == 0: + scores.append(1.0) # No tools needed = perfectly efficient + details_items.append(f"{inv.invocation_id}: no tool calls (score: 1.0)") + continue + + # Count duplicates + seen_signatures: dict[str, int] = {} + duplicate_count = 0 + for call in tool_calls: + sig = _call_signature(call) + seen_signatures[sig] = seen_signatures.get(sig, 0) + 1 + + if penalize_duplicates: + duplicate_count = sum(count - 1 for count in seen_signatures.values() if count > 1) + + # Count errors + error_count = 0 + if penalize_errors and tool_responses: + for resp in tool_responses: + if _is_error_response(resp): + error_count += 1 + + # Calculate useful calls + wasted = duplicate_count + error_count + useful = max(0, total - wasted) + + # Efficiency ratio: useful / total + efficiency_ratio = useful / total if total > 0 else 1.0 + + # Budget penalty: how much over budget + budget_overrun = max(0, total - max_tool_calls) / max_tool_calls + budget_factor = max(0.0, 1.0 - budget_overrun) + + score = max(0.0, min(1.0, efficiency_ratio * budget_factor)) + scores.append(score) + + parts = [f"total={total}", f"useful={useful}"] + if duplicate_count > 0: + 
parts.append(f"duplicates={duplicate_count}") + if error_count > 0: + parts.append(f"errors={error_count}") + if total > max_tool_calls: + parts.append(f"over_budget={total - max_tool_calls}") + details_items.append(f"{inv.invocation_id}: {', '.join(parts)} (score: {score:.2f})") + + overall = sum(scores) / len(scores) if scores else 0.0 + + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={ + "tool_details": details_items, + "max_tool_calls": max_tool_calls, + }, + ) + + +if __name__ == "__main__": + tool_efficiency.run() From 8371bc0041da14ffae8c4972f544c37d23801e93 Mon Sep 17 00:00:00 2001 From: Henrik Rexed Date: Mon, 30 Mar 2026 17:02:18 +0200 Subject: [PATCH 2/5] =?UTF-8?q?fix:=20address=20review=20=E2=80=94=20use?= =?UTF-8?q?=20EvalStatus.NOT=5FEVALUATED,=20remove=20dead=20code,=20trim?= =?UTF-8?q?=20comments?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix .gitignore: separate .venv/ and __pycache__/ on own lines - token_efficiency: remove unused performance_budget block, return NOT_EVALUATED when no token data available - time_efficiency: return NOT_EVALUATED when no duration data - All evaluators: trim verbose comments, cleaner code - All pass validate_evaluator.py --- .gitignore | 3 +- evaluators/time_efficiency/time_efficiency.py | 43 ++++------ .../token_efficiency/token_efficiency.py | 78 +++++------------ evaluators/tool_efficiency/tool_efficiency.py | 83 +++++-------------- 4 files changed, 62 insertions(+), 145 deletions(-) diff --git a/.gitignore b/.gitignore index 8067d78..a230a78 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -.venv/evaluators/__pycache__ +.venv/ +__pycache__/ diff --git a/evaluators/time_efficiency/time_efficiency.py b/evaluators/time_efficiency/time_efficiency.py index 2bdbc40..f425d36 100644 --- a/evaluators/time_efficiency/time_efficiency.py +++ b/evaluators/time_efficiency/time_efficiency.py @@ -1,17 +1,15 @@ """Community evaluator: time_efficiency -Scores how quickly the agent resolved relative to a time budget. -Uses performance_metrics.duration_s from trace data when available. +Scores resolution time relative to a budget. Extracts duration_s from +performance_metrics when available, otherwise returns NOT_EVALUATED. 
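A minimal sketch of the revised behavior, assuming a 120 s budget and two invocations:

    durations [45.0, None]  -> per-invocation scores [0.625, 0.0], overall 0.3125
    durations [None, None]  -> EvalResult(score=0.0, status=EvalStatus.NOT_EVALUATED)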
-Config options: - max_duration_s (float): Time budget in seconds (default: 120) +Config: max_duration_s (float, default 120) """ -from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator def _extract_duration(inv) -> float | None: - """Extract duration_s from an invocation's performance_metrics.""" perf = getattr(inv, "performance_metrics", None) if perf is None and hasattr(inv, "__getitem__"): try: @@ -20,10 +18,9 @@ def _extract_duration(inv) -> float | None: perf = None if isinstance(perf, dict): - duration = perf.get("duration_s") or perf.get("duration") - if duration is not None: - return float(duration) - + d = perf.get("duration_s") or perf.get("duration") + if d is not None: + return float(d) return None @@ -37,31 +34,25 @@ def time_efficiency(input: EvalInput) -> EvalResult: for inv in input.invocations: duration = _extract_duration(inv) - if duration is None: - # No timing data — assign neutral score - scores.append(0.5) - details_items.append(f"{inv.invocation_id}: no duration data available") + scores.append(0.0) + details_items.append(f"{inv.invocation_id}: no duration data") continue has_data = True score = max(0.0, min(1.0, 1.0 - (duration / max_duration))) scores.append(score) - details_items.append( - f"{inv.invocation_id}: {duration:.1f}s / {max_duration:.1f}s budget (score: {score:.2f})" + details_items.append(f"{inv.invocation_id}: {duration:.1f}s / {max_duration:.1f}s") + + if not has_data: + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + details={"reason": "no duration data in any invocation"}, ) overall = sum(scores) / len(scores) if scores else 0.0 - - return EvalResult( - score=overall, - per_invocation_scores=scores, - details={ - "time_details": details_items, - "has_trace_data": has_data, - "max_duration_s": max_duration, - }, - ) + return EvalResult(score=overall, per_invocation_scores=scores, details={"time_details": details_items}) if __name__ == "__main__": diff --git a/evaluators/token_efficiency/token_efficiency.py b/evaluators/token_efficiency/token_efficiency.py index b15fb24..ccc6bc6 100644 --- a/evaluators/token_efficiency/token_efficiency.py +++ b/evaluators/token_efficiency/token_efficiency.py @@ -1,21 +1,16 @@ """Community evaluator: token_efficiency -Scores how efficiently the agent used tokens relative to a budget. -Uses performance_metrics from trace data when available, falls back to -counting tool calls as a rough proxy. - -Config options: - max_tokens (int): Token budget (default: 200000) - weight_input (float): Weight for input tokens in scoring (default: 0.7) - weight_output (float): Weight for output tokens in scoring (default: 0.3) +Scores token usage relative to a budget. Extracts input/output tokens from +performance_metrics when available, otherwise returns NOT_EVALUATED. 
+ +Config: max_tokens (int, default 200000), weight_input (float, default 0.7), + weight_output (float, default 0.3) """ -from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator def _extract_tokens(inv) -> dict | None: - """Extract token counts from an invocation's performance_metrics or metadata.""" - # Check performance_metrics (primary source from OTel trace data) perf = getattr(inv, "performance_metrics", None) if perf is None and hasattr(inv, "__getitem__"): try: @@ -27,20 +22,8 @@ def _extract_tokens(inv) -> dict | None: input_t = perf.get("input_tokens") or perf.get("prompt_tokens") output_t = perf.get("output_tokens") or perf.get("completion_tokens") if input_t is not None or output_t is not None: - return { - "input_tokens": int(input_t or 0), - "output_tokens": int(output_t or 0), - } - - # Check performance_budget on invocation (eval_set integration) - budget = getattr(inv, "performance_budget", None) - if budget is None and hasattr(inv, "__getitem__"): - try: - budget = inv["performance_budget"] - except (KeyError, TypeError): - pass + return {"input_tokens": int(input_t or 0), "output_tokens": int(output_t or 0)} - # No token data available return None @@ -56,48 +39,29 @@ def token_efficiency(input: EvalInput) -> EvalResult: for inv in input.invocations: tokens = _extract_tokens(inv) - if tokens is None: - # No token data — score based on tool call count as rough proxy - # More tool calls ≈ more tokens used - tool_count = len(inv.intermediate_steps.tool_calls) if inv.intermediate_steps else 0 - if tool_count == 0: - scores.append(0.5) # No data, neutral score - details_items.append(f"{inv.invocation_id}: no token data available") - else: - # Rough heuristic: assume ~5000 tokens per tool call - estimated = tool_count * 5000 - score = max(0.0, min(1.0, 1.0 - (estimated / max_tokens))) - scores.append(score) - details_items.append( - f"{inv.invocation_id}: estimated ~{estimated} tokens from {tool_count} tool calls" - ) + scores.append(0.0) + details_items.append(f"{inv.invocation_id}: no token data") continue has_data = True - input_t = tokens["input_tokens"] - output_t = tokens["output_tokens"] - weighted_total = (input_t * weight_input) + (output_t * weight_output) - weighted_budget = max_tokens * 1.0 # Budget applies to weighted total - - score = max(0.0, min(1.0, 1.0 - (weighted_total / weighted_budget))) + weighted = (tokens["input_tokens"] * weight_input) + (tokens["output_tokens"] * weight_output) + score = max(0.0, min(1.0, 1.0 - (weighted / max_tokens))) scores.append(score) details_items.append( - f"{inv.invocation_id}: {input_t} input + {output_t} output = " - f"{input_t + output_t} total (weighted: {weighted_total:.0f}/{weighted_budget:.0f})" + f"{inv.invocation_id}: {tokens['input_tokens']}in + {tokens['output_tokens']}out " + f"(weighted {weighted:.0f}/{max_tokens})" ) - overall = sum(scores) / len(scores) if scores else 0.0 + if not has_data: + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + details={"reason": "no token data in any invocation"}, + ) - return EvalResult( - score=overall, - per_invocation_scores=scores, - details={ - "token_details": details_items, - "has_trace_data": has_data, - "max_tokens": max_tokens, - }, - ) + overall = sum(scores) / len(scores) if scores else 0.0 + return EvalResult(score=overall, per_invocation_scores=scores, details={"token_details": details_items}) if __name__ == "__main__": diff --git 
a/evaluators/tool_efficiency/tool_efficiency.py b/evaluators/tool_efficiency/tool_efficiency.py index c56f84d..e54c091 100644 --- a/evaluators/tool_efficiency/tool_efficiency.py +++ b/evaluators/tool_efficiency/tool_efficiency.py @@ -1,20 +1,17 @@ """Community evaluator: tool_efficiency -Scores whether the agent used tools effectively. Penalizes duplicate calls -(same tool + same args), error calls, and budget overruns. +Scores tool usage effectiveness. Penalizes duplicate calls (same tool + args), +error responses, and budget overruns. -Config options: - max_tool_calls (int): Tool call budget (default: 15) - penalize_duplicates (bool): Penalize repeated identical calls (default: true) - penalize_errors (bool): Penalize failed tool calls (default: true) +Config: max_tool_calls (int, default 15), penalize_duplicates (bool, default true), + penalize_errors (bool, default true) """ import json -from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator def _call_signature(call) -> str: - """Create a hashable signature for a tool call (name + sorted args).""" name = call.get("name", "") if isinstance(call, dict) else getattr(call, "name", "") args = call.get("args", {}) if isinstance(call, dict) else getattr(call, "args", {}) try: @@ -25,16 +22,11 @@ def _call_signature(call) -> str: def _is_error_response(response) -> bool: - """Check if a tool response indicates an error.""" output = response.get("output", "") if isinstance(response, dict) else getattr(response, "output", "") - output_str = str(output).lower() - # Check common error indicators - if any(marker in output_str for marker in ["error", "failed", "exception", "traceback"]): + if any(m in str(output).lower() for m in ["error", "failed", "exception", "traceback"]): return True status = response.get("status", "") if isinstance(response, dict) else getattr(response, "status", "") - if str(status).lower() in ("error", "failed"): - return True - return False + return str(status).lower() in ("error", "failed") @evaluator @@ -48,66 +40,35 @@ def tool_efficiency(input: EvalInput) -> EvalResult: for inv in input.invocations: tool_calls = inv.intermediate_steps.tool_calls if inv.intermediate_steps else [] - tool_responses = ( - inv.intermediate_steps.tool_responses if inv.intermediate_steps else [] - ) + tool_responses = inv.intermediate_steps.tool_responses if inv.intermediate_steps else [] total = len(tool_calls) if total == 0: - scores.append(1.0) # No tools needed = perfectly efficient - details_items.append(f"{inv.invocation_id}: no tool calls (score: 1.0)") + scores.append(1.0) + details_items.append(f"{inv.invocation_id}: no tool calls") continue - # Count duplicates - seen_signatures: dict[str, int] = {} - duplicate_count = 0 + seen: dict[str, int] = {} for call in tool_calls: sig = _call_signature(call) - seen_signatures[sig] = seen_signatures.get(sig, 0) + 1 - - if penalize_duplicates: - duplicate_count = sum(count - 1 for count in seen_signatures.values() if count > 1) - - # Count errors - error_count = 0 - if penalize_errors and tool_responses: - for resp in tool_responses: - if _is_error_response(resp): - error_count += 1 + seen[sig] = seen.get(sig, 0) + 1 - # Calculate useful calls - wasted = duplicate_count + error_count - useful = max(0, total - wasted) + dupes = sum(c - 1 for c in seen.values() if c > 1) if penalize_duplicates else 0 + errors = sum(1 for r in tool_responses if _is_error_response(r)) if penalize_errors else 0 + useful 
= max(0, total - dupes - errors) - # Efficiency ratio: useful / total - efficiency_ratio = useful / total if total > 0 else 1.0 - - # Budget penalty: how much over budget - budget_overrun = max(0, total - max_tool_calls) / max_tool_calls - budget_factor = max(0.0, 1.0 - budget_overrun) - - score = max(0.0, min(1.0, efficiency_ratio * budget_factor)) + efficiency = useful / total + budget_factor = max(0.0, 1.0 - max(0, total - max_tool_calls) / max_tool_calls) + score = max(0.0, min(1.0, efficiency * budget_factor)) scores.append(score) parts = [f"total={total}", f"useful={useful}"] - if duplicate_count > 0: - parts.append(f"duplicates={duplicate_count}") - if error_count > 0: - parts.append(f"errors={error_count}") - if total > max_tool_calls: - parts.append(f"over_budget={total - max_tool_calls}") - details_items.append(f"{inv.invocation_id}: {', '.join(parts)} (score: {score:.2f})") + if dupes: parts.append(f"dupes={dupes}") + if errors: parts.append(f"errors={errors}") + details_items.append(f"{inv.invocation_id}: {', '.join(parts)}") overall = sum(scores) / len(scores) if scores else 0.0 - - return EvalResult( - score=overall, - per_invocation_scores=scores, - details={ - "tool_details": details_items, - "max_tool_calls": max_tool_calls, - }, - ) + return EvalResult(score=overall, per_invocation_scores=scores, details={"tool_details": details_items}) if __name__ == "__main__": From 4e9899dc6b5f1a3a9f912dde8b72b920150f1f99 Mon Sep 17 00:00:00 2001 From: Henrik Rexed Date: Mon, 30 Mar 2026 18:18:50 +0200 Subject: [PATCH 3/5] fix: remove naive text heuristic from error detection, use status field only _is_error_response now only checks the structured status field (error/failed/failure) instead of scanning output text for substrings like "error" which would false-positive on legitimate outputs. 
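A minimal sketch of what the stricter check accepts and rejects, assuming dict-shaped tool responses:

    _is_error_response({"status": "error"})                             -> True
    _is_error_response({"status": "FAILED"})                            -> True
    _is_error_response({"output": "error: not found", "status": "ok"})  -> False (output text ignored)
    _is_error_response({"output": "Traceback ...", "status": ""})       -> False (unreported failures are missed)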
--- evaluators/tool_efficiency/tool_efficiency.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/evaluators/tool_efficiency/tool_efficiency.py b/evaluators/tool_efficiency/tool_efficiency.py index e54c091..b785f3d 100644 --- a/evaluators/tool_efficiency/tool_efficiency.py +++ b/evaluators/tool_efficiency/tool_efficiency.py @@ -8,7 +8,7 @@ """ import json -from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator def _call_signature(call) -> str: @@ -22,11 +22,9 @@ def _call_signature(call) -> str: def _is_error_response(response) -> bool: - output = response.get("output", "") if isinstance(response, dict) else getattr(response, "output", "") - if any(m in str(output).lower() for m in ["error", "failed", "exception", "traceback"]): - return True + """Check if a tool response indicates an error via its status field.""" status = response.get("status", "") if isinstance(response, dict) else getattr(response, "status", "") - return str(status).lower() in ("error", "failed") + return str(status).lower() in ("error", "failed", "failure") @evaluator From beadc5c3f0e02408f6d20714a5d932094ea1608b Mon Sep 17 00:00:00 2001 From: Henrik Rexed Date: Mon, 30 Mar 2026 18:24:35 +0200 Subject: [PATCH 4/5] =?UTF-8?q?fix:=20address=20review=20=E2=80=94=20add?= =?UTF-8?q?=20min=5Ftool=5Fcalls=20config,=20move=20signature=20counting?= =?UTF-8?q?=20into=20conditional?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add min_tool_calls config (default 0): when >0, zero tool calls scores 0.0 instead of 1.0 (zero calls often means hallucinated answer) - Move signature computation inside penalize_duplicates conditional to avoid unnecessary work when duplicate detection is disabled - Complements tool_coverage for strict tool-usage requirements --- evaluators/tool_efficiency/tool_efficiency.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/evaluators/tool_efficiency/tool_efficiency.py b/evaluators/tool_efficiency/tool_efficiency.py index b785f3d..d03e400 100644 --- a/evaluators/tool_efficiency/tool_efficiency.py +++ b/evaluators/tool_efficiency/tool_efficiency.py @@ -3,8 +3,8 @@ Scores tool usage effectiveness. Penalizes duplicate calls (same tool + args), error responses, and budget overruns. 
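A minimal sketch of the combined scoring with the defaults (max_tool_calls=15) and the new min_tool_calls option:

    10 calls, 2 duplicates, 1 error -> useful 7/10 -> efficiency 0.70, within budget -> score 0.70
    20 distinct, successful calls   -> efficiency 1.0, overrun (20-15)/15 -> budget factor 0.67 -> score 0.67
    0 calls, min_tool_calls = 0     -> score 1.0 (tools optional)
    0 calls, min_tool_calls = 1     -> score 0.0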
-Config: max_tool_calls (int, default 15), penalize_duplicates (bool, default true), - penalize_errors (bool, default true) +Config: max_tool_calls (int, default 15), min_tool_calls (int, default 0), + penalize_duplicates (bool, default true), penalize_errors (bool, default true) """ import json @@ -30,6 +30,7 @@ def _is_error_response(response) -> bool: @evaluator def tool_efficiency(input: EvalInput) -> EvalResult: max_tool_calls = input.config.get("max_tool_calls", 15) + min_tool_calls = input.config.get("min_tool_calls", 0) penalize_duplicates = input.config.get("penalize_duplicates", True) penalize_errors = input.config.get("penalize_errors", True) @@ -42,16 +43,22 @@ def tool_efficiency(input: EvalInput) -> EvalResult: total = len(tool_calls) if total == 0: - scores.append(1.0) - details_items.append(f"{inv.invocation_id}: no tool calls") + if min_tool_calls > 0: + scores.append(0.0) + details_items.append(f"{inv.invocation_id}: no tool calls (min required: {min_tool_calls})") + else: + scores.append(1.0) + details_items.append(f"{inv.invocation_id}: no tool calls (tools optional)") continue - seen: dict[str, int] = {} - for call in tool_calls: - sig = _call_signature(call) - seen[sig] = seen.get(sig, 0) + 1 + dupes = 0 + if penalize_duplicates: + seen: dict[str, int] = {} + for call in tool_calls: + sig = _call_signature(call) + seen[sig] = seen.get(sig, 0) + 1 + dupes = sum(c - 1 for c in seen.values() if c > 1) - dupes = sum(c - 1 for c in seen.values() if c > 1) if penalize_duplicates else 0 errors = sum(1 for r in tool_responses if _is_error_response(r)) if penalize_errors else 0 useful = max(0, total - dupes - errors) From c5d16842054cbf9ac298072038f815483bcb367a Mon Sep 17 00:00:00 2001 From: henrikrexed Date: Tue, 7 Apr 2026 18:27:56 +0200 Subject: [PATCH 5/5] fix: use SDK performance_metrics field directly, split token budgets - Remove getattr/dict-access fallbacks in token_efficiency and time_efficiency now that SDK 0.1.1 has performance_metrics as a proper field on InvocationData - Replace weighted max_tokens with separate max_input_tokens and max_output_tokens config (score = min of both), per review feedback - All three evaluators tested e2e with SDK 0.1.1 Co-Authored-By: Paperclip --- evaluators/time_efficiency/time_efficiency.py | 18 ++++----- .../token_efficiency/token_efficiency.py | 37 ++++++++----------- 2 files changed, 23 insertions(+), 32 deletions(-) diff --git a/evaluators/time_efficiency/time_efficiency.py b/evaluators/time_efficiency/time_efficiency.py index f425d36..f5069e1 100644 --- a/evaluators/time_efficiency/time_efficiency.py +++ b/evaluators/time_efficiency/time_efficiency.py @@ -10,17 +10,13 @@ def _extract_duration(inv) -> float | None: - perf = getattr(inv, "performance_metrics", None) - if perf is None and hasattr(inv, "__getitem__"): - try: - perf = inv["performance_metrics"] - except (KeyError, TypeError): - perf = None - - if isinstance(perf, dict): - d = perf.get("duration_s") or perf.get("duration") - if d is not None: - return float(d) + perf = inv.performance_metrics + if not isinstance(perf, dict): + return None + + d = perf.get("duration_s") or perf.get("duration") + if d is not None: + return float(d) return None diff --git a/evaluators/token_efficiency/token_efficiency.py b/evaluators/token_efficiency/token_efficiency.py index ccc6bc6..cd12975 100644 --- a/evaluators/token_efficiency/token_efficiency.py +++ b/evaluators/token_efficiency/token_efficiency.py @@ -3,35 +3,29 @@ Scores token usage relative to a budget. 
Extracts input/output tokens from performance_metrics when available, otherwise returns NOT_EVALUATED. -Config: max_tokens (int, default 200000), weight_input (float, default 0.7), - weight_output (float, default 0.3) +Config: max_input_tokens (int, default 150000), max_output_tokens (int, default 50000) """ from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator def _extract_tokens(inv) -> dict | None: - perf = getattr(inv, "performance_metrics", None) - if perf is None and hasattr(inv, "__getitem__"): - try: - perf = inv["performance_metrics"] - except (KeyError, TypeError): - perf = None - - if isinstance(perf, dict): - input_t = perf.get("input_tokens") or perf.get("prompt_tokens") - output_t = perf.get("output_tokens") or perf.get("completion_tokens") - if input_t is not None or output_t is not None: - return {"input_tokens": int(input_t or 0), "output_tokens": int(output_t or 0)} + perf = inv.performance_metrics + if not isinstance(perf, dict): + return None + + input_t = perf.get("input_tokens") or perf.get("prompt_tokens") + output_t = perf.get("output_tokens") or perf.get("completion_tokens") + if input_t is not None or output_t is not None: + return {"input_tokens": int(input_t or 0), "output_tokens": int(output_t or 0)} return None @evaluator def token_efficiency(input: EvalInput) -> EvalResult: - max_tokens = input.config.get("max_tokens", 200000) - weight_input = input.config.get("weight_input", 0.7) - weight_output = input.config.get("weight_output", 0.3) + max_input = input.config.get("max_input_tokens", 150000) + max_output = input.config.get("max_output_tokens", 50000) scores: list[float] = [] details_items: list[str] = [] @@ -45,12 +39,13 @@ def token_efficiency(input: EvalInput) -> EvalResult: continue has_data = True - weighted = (tokens["input_tokens"] * weight_input) + (tokens["output_tokens"] * weight_output) - score = max(0.0, min(1.0, 1.0 - (weighted / max_tokens))) + input_score = max(0.0, min(1.0, 1.0 - (tokens["input_tokens"] / max_input))) if max_input > 0 else 1.0 + output_score = max(0.0, min(1.0, 1.0 - (tokens["output_tokens"] / max_output))) if max_output > 0 else 1.0 + score = min(input_score, output_score) scores.append(score) details_items.append( - f"{inv.invocation_id}: {tokens['input_tokens']}in + {tokens['output_tokens']}out " - f"(weighted {weighted:.0f}/{max_tokens})" + f"{inv.invocation_id}: {tokens['input_tokens']}in/{max_input} + " + f"{tokens['output_tokens']}out/{max_output} -> {score:.2f}" ) if not has_data:
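# A minimal sketch of the split-budget scoring with the new defaults
# (max_input_tokens=150000, max_output_tokens=50000):
#   120000 in -> input_score 0.20; 10000 out -> output_score 0.80; score = min = 0.20
# Exceeding either budget on its own is enough to pull the score to 0.0.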