From 51c0bffd94c65ae049ec52f1f8f6481ad36e2bd4 Mon Sep 17 00:00:00 2001 From: Francisco Rivas Date: Mon, 20 Apr 2026 17:41:01 +0200 Subject: [PATCH 1/2] add skills_trajectory evaluator Deterministic trajectory-based evaluator that scores whether a configured set of skills (tool names) was observed in each invocation. Supports three match modes: - ANY_ORDER: fractional credit via Counter, order ignored - IN_ORDER: fractional credit via subsequence scan - EXACT: binary, called list must match required exactly Returns NOT_EVALUATED for missing, empty, or invalid config. Partial credit distinguishes this from the binary tool_sequence_match. --- evaluators/skills_trajectory/evaluator.yaml | 6 + .../skills_trajectory/skills_trajectory.py | 144 ++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 evaluators/skills_trajectory/evaluator.yaml create mode 100644 evaluators/skills_trajectory/skills_trajectory.py diff --git a/evaluators/skills_trajectory/evaluator.yaml b/evaluators/skills_trajectory/evaluator.yaml new file mode 100644 index 0000000..5a16bab --- /dev/null +++ b/evaluators/skills_trajectory/evaluator.yaml @@ -0,0 +1,6 @@ +name: skills_trajectory +description: Scores whether configured skills/tool names were observed in each invocation, optionally requiring a specific order or exact match +language: python +entrypoint: skills_trajectory.py +tags: [skills, trajectory, tools] +author: frivas-at-navteca diff --git a/evaluators/skills_trajectory/skills_trajectory.py b/evaluators/skills_trajectory/skills_trajectory.py new file mode 100644 index 0000000..d01bfd9 --- /dev/null +++ b/evaluators/skills_trajectory/skills_trajectory.py @@ -0,0 +1,144 @@ +"""Skills trajectory evaluator. + +Scores whether a configured set of skills (tool names) was observed in each +invocation, optionally requiring them to appear in the given order. + +Partial credit is awarded when only a subset of required skills were called. + +Config: + skills (list[str]): Required. Names of skills/tools that must be observed. + match_type (str, default "ANY_ORDER"): + "ANY_ORDER" - all required skills must appear; order and extras ignored. + Duplicate requirements are handled via Counter. + "IN_ORDER" - required skills must appear as a subsequence in the call + list (extras between them are allowed, order matters). + "EXACT" - called tool names must match required skills exactly + (same names, same order, no extras). + +Returns NOT_EVALUATED when ``skills`` is missing, non-list, or empty. + +Usage in eval_config.yaml: + + evaluators: + - name: skills_trajectory + type: remote + source: github + ref: evaluators/skills_trajectory/skills_trajectory.py + threshold: 0.7 + config: + skills: ["search", "summarize"] + match_type: ANY_ORDER +""" + +from __future__ import annotations + +from collections import Counter + +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator + + +def _skills_score(required: list[str], called: list[str], match_type: str) -> float: + """Return the fraction of *required* skills satisfied in *called*. + + Args: + required: Ordered list of required skill/tool names. + called: Ordered list of tool names actually called. + match_type: One of ``"ANY_ORDER"``, ``"IN_ORDER"``, or ``"EXACT"``. + + Returns: + A float in ``[0.0, 1.0]``. + """ + if not required: + return 1.0 + + if match_type == "EXACT": + return 1.0 if called == required else 0.0 + + if match_type == "IN_ORDER": + # Subsequence check: each required skill must appear after the previous hit. 
+ pos = 0 + hits = 0 + for skill in required: + while pos < len(called): + if called[pos] == skill: + hits += 1 + pos += 1 + break + pos += 1 + return hits / len(required) + + # ANY_ORDER: duplicate-aware fractional match using Counter. + called_counts = Counter(called) + required_counts = Counter(required) + hits = sum(min(required_counts[s], called_counts[s]) for s in required_counts) + return hits / len(required) + + +@evaluator +def skills_trajectory(input: EvalInput) -> EvalResult: + skills = input.config.get("skills") + n = len(input.invocations) + + if skills is None or not isinstance(skills, list): + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing or invalid config: skills (need a non-empty list of names)"}, + ) + if not skills: + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing or empty config: skills"}, + ) + + required = [str(s) for s in skills] + raw_match_type = str(input.config.get("match_type", "ANY_ORDER")).upper() + valid_match_types = {"ANY_ORDER", "IN_ORDER", "EXACT"} + if raw_match_type not in valid_match_types: + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={ + "reason": ( + f"invalid config: match_type={raw_match_type!r}; " + f"must be one of {sorted(valid_match_types)}" + ) + }, + ) + + per_invocation_scores: list[float] = [] + comparisons: list[dict] = [] + + for inv in input.invocations: + called: list[str] = [] + for call in inv.intermediate_steps.tool_calls or []: + if isinstance(call, dict): + name = call.get("name") + if name is not None: + called.append(str(name)) + + score = _skills_score(required, called, raw_match_type) + per_invocation_scores.append(score) + comparisons.append( + { + "invocation_id": inv.invocation_id, + "required_skills": required, + "called_tools": called, + "score": score, + } + ) + + overall = sum(per_invocation_scores) / len(per_invocation_scores) if per_invocation_scores else 0.0 + return EvalResult( + score=overall, + per_invocation_scores=per_invocation_scores, + details={"comparisons": comparisons}, + ) + + +if __name__ == "__main__": + skills_trajectory.run() From 24c0e8d0c42799785f38d100f43c7ba136862f60 Mon Sep 17 00:00:00 2001 From: Francisco Rivas Date: Mon, 20 Apr 2026 17:52:54 +0200 Subject: [PATCH 2/2] fix and improve skills_trajectory evaluator - Fix critical bug: tool_calls are ToolCallData objects not dicts; isinstance(call, dict) guard silently dropped all tool calls, causing score=0.0 regardless of agent behavior. Use call.name directly. 
- Merge redundant skills None+isinstance guards into single check - Add NOT_EVALUATED for empty invocations list - Replace list[dict] annotation with _Comparison TypedDict - Simplify tool name extraction to single list comprehension --- .../skills_trajectory/skills_trajectory.py | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/evaluators/skills_trajectory/skills_trajectory.py b/evaluators/skills_trajectory/skills_trajectory.py index d01bfd9..cabbe33 100644 --- a/evaluators/skills_trajectory/skills_trajectory.py +++ b/evaluators/skills_trajectory/skills_trajectory.py @@ -33,10 +33,18 @@ from __future__ import annotations from collections import Counter +from typing import TypedDict from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator +class _Comparison(TypedDict): + invocation_id: str + required_skills: list[str] + called_tools: list[str] + score: float + + def _skills_score(required: list[str], called: list[str], match_type: str) -> float: """Return the fraction of *required* skills satisfied in *called*. @@ -79,19 +87,20 @@ def skills_trajectory(input: EvalInput) -> EvalResult: skills = input.config.get("skills") n = len(input.invocations) - if skills is None or not isinstance(skills, list): + if not n: return EvalResult( score=0.0, status=EvalStatus.NOT_EVALUATED, - per_invocation_scores=[None] * n, - details={"reason": "missing or invalid config: skills (need a non-empty list of names)"}, + per_invocation_scores=[], + details={"reason": "no invocations to evaluate"}, ) - if not skills: + + if not isinstance(skills, list) or not skills: return EvalResult( score=0.0, status=EvalStatus.NOT_EVALUATED, per_invocation_scores=[None] * n, - details={"reason": "missing or empty config: skills"}, + details={"reason": "missing or empty config: skills (need a non-empty list of names)"}, ) required = [str(s) for s in skills] @@ -111,15 +120,10 @@ def skills_trajectory(input: EvalInput) -> EvalResult: ) per_invocation_scores: list[float] = [] - comparisons: list[dict] = [] + comparisons: list[_Comparison] = [] for inv in input.invocations: - called: list[str] = [] - for call in inv.intermediate_steps.tool_calls or []: - if isinstance(call, dict): - name = call.get("name") - if name is not None: - called.append(str(name)) + called = [call.name for call in inv.intermediate_steps.tool_calls] score = _skills_score(required, called, raw_match_type) per_invocation_scores.append(score)
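
For anyone who wants to sanity-check the expected scores without pulling in the
agentevals_evaluator_sdk, below is a standalone sketch of the scoring behavior.
It restates _skills_score exactly as it appears in the diff above and exercises
the three match modes; the tool names "search" and "summarize" come from the
docstring's eval_config.yaml example, while "lookup" and "log" are hypothetical
extras added only for illustration.

    from collections import Counter

    def skills_score(required, called, match_type="ANY_ORDER"):
        # Same logic as _skills_score in skills_trajectory.py.
        if not required:
            return 1.0
        if match_type == "EXACT":
            return 1.0 if called == required else 0.0
        if match_type == "IN_ORDER":
            # Greedy subsequence scan: each required skill must appear after
            # the previous match; skills not found in order earn no credit.
            pos = 0
            hits = 0
            for skill in required:
                while pos < len(called):
                    if called[pos] == skill:
                        hits += 1
                        pos += 1
                        break
                    pos += 1
            return hits / len(required)
        # ANY_ORDER: duplicate-aware count; order and extra calls are ignored.
        called_counts = Counter(called)
        required_counts = Counter(required)
        return sum(min(required_counts[s], called_counts[s]) for s in required_counts) / len(required)

    required = ["search", "summarize"]

    # ANY_ORDER: both required skills present despite extras and reordering -> 1.0
    assert skills_score(required, ["summarize", "lookup", "search"]) == 1.0

    # IN_ORDER: "search" is matched, but "summarize" never appears after it -> 0.5
    assert skills_score(required, ["summarize", "search"], "IN_ORDER") == 0.5

    # EXACT: any extra or reordered call fails the whole invocation -> 0.0
    assert skills_score(required, ["search", "summarize", "log"], "EXACT") == 0.0

The Counter-based ANY_ORDER path and the subsequence scan are what award partial
credit when only some required skills are called, which is the behavior that
distinguishes this evaluator from the binary tool_sequence_match mentioned in
the first commit message.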