From 51c0bffd94c65ae049ec52f1f8f6481ad36e2bd4 Mon Sep 17 00:00:00 2001 From: Francisco Rivas Date: Mon, 20 Apr 2026 17:41:01 +0200 Subject: [PATCH 1/2] add skills_trajectory evaluator Deterministic trajectory-based evaluator that scores whether a configured set of skills (tool names) was observed in each invocation. Supports three match modes: - ANY_ORDER: fractional credit via Counter, order ignored - IN_ORDER: fractional credit via subsequence scan - EXACT: binary, called list must match required exactly Returns NOT_EVALUATED for missing, empty, or invalid config. Partial credit distinguishes this from the binary tool_sequence_match. --- evaluators/skills_trajectory/evaluator.yaml | 6 + .../skills_trajectory/skills_trajectory.py | 144 ++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 evaluators/skills_trajectory/evaluator.yaml create mode 100644 evaluators/skills_trajectory/skills_trajectory.py diff --git a/evaluators/skills_trajectory/evaluator.yaml b/evaluators/skills_trajectory/evaluator.yaml new file mode 100644 index 0000000..5a16bab --- /dev/null +++ b/evaluators/skills_trajectory/evaluator.yaml @@ -0,0 +1,6 @@ +name: skills_trajectory +description: Scores whether configured skills/tool names were observed in each invocation, optionally requiring a specific order or exact match +language: python +entrypoint: skills_trajectory.py +tags: [skills, trajectory, tools] +author: frivas-at-navteca diff --git a/evaluators/skills_trajectory/skills_trajectory.py b/evaluators/skills_trajectory/skills_trajectory.py new file mode 100644 index 0000000..d01bfd9 --- /dev/null +++ b/evaluators/skills_trajectory/skills_trajectory.py @@ -0,0 +1,144 @@ +"""Skills trajectory evaluator. + +Scores whether a configured set of skills (tool names) was observed in each +invocation, optionally requiring them to appear in the given order. + +Partial credit is awarded when only a subset of required skills were called. + +Config: + skills (list[str]): Required. Names of skills/tools that must be observed. + match_type (str, default "ANY_ORDER"): + "ANY_ORDER" - all required skills must appear; order and extras ignored. + Duplicate requirements are handled via Counter. + "IN_ORDER" - required skills must appear as a subsequence in the call + list (extras between them are allowed, order matters). + "EXACT" - called tool names must match required skills exactly + (same names, same order, no extras). + +Returns NOT_EVALUATED when ``skills`` is missing, non-list, or empty. + +Usage in eval_config.yaml: + + evaluators: + - name: skills_trajectory + type: remote + source: github + ref: evaluators/skills_trajectory/skills_trajectory.py + threshold: 0.7 + config: + skills: ["search", "summarize"] + match_type: ANY_ORDER +""" + +from __future__ import annotations + +from collections import Counter + +from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator + + +def _skills_score(required: list[str], called: list[str], match_type: str) -> float: + """Return the fraction of *required* skills satisfied in *called*. + + Args: + required: Ordered list of required skill/tool names. + called: Ordered list of tool names actually called. + match_type: One of ``"ANY_ORDER"``, ``"IN_ORDER"``, or ``"EXACT"``. + + Returns: + A float in ``[0.0, 1.0]``. + """ + if not required: + return 1.0 + + if match_type == "EXACT": + return 1.0 if called == required else 0.0 + + if match_type == "IN_ORDER": + # Subsequence check: each required skill must appear after the previous hit. 
+ pos = 0 + hits = 0 + for skill in required: + while pos < len(called): + if called[pos] == skill: + hits += 1 + pos += 1 + break + pos += 1 + return hits / len(required) + + # ANY_ORDER: duplicate-aware fractional match using Counter. + called_counts = Counter(called) + required_counts = Counter(required) + hits = sum(min(required_counts[s], called_counts[s]) for s in required_counts) + return hits / len(required) + + +@evaluator +def skills_trajectory(input: EvalInput) -> EvalResult: + skills = input.config.get("skills") + n = len(input.invocations) + + if skills is None or not isinstance(skills, list): + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing or invalid config: skills (need a non-empty list of names)"}, + ) + if not skills: + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={"reason": "missing or empty config: skills"}, + ) + + required = [str(s) for s in skills] + raw_match_type = str(input.config.get("match_type", "ANY_ORDER")).upper() + valid_match_types = {"ANY_ORDER", "IN_ORDER", "EXACT"} + if raw_match_type not in valid_match_types: + return EvalResult( + score=0.0, + status=EvalStatus.NOT_EVALUATED, + per_invocation_scores=[None] * n, + details={ + "reason": ( + f"invalid config: match_type={raw_match_type!r}; " + f"must be one of {sorted(valid_match_types)}" + ) + }, + ) + + per_invocation_scores: list[float] = [] + comparisons: list[dict] = [] + + for inv in input.invocations: + called: list[str] = [] + for call in inv.intermediate_steps.tool_calls or []: + if isinstance(call, dict): + name = call.get("name") + if name is not None: + called.append(str(name)) + + score = _skills_score(required, called, raw_match_type) + per_invocation_scores.append(score) + comparisons.append( + { + "invocation_id": inv.invocation_id, + "required_skills": required, + "called_tools": called, + "score": score, + } + ) + + overall = sum(per_invocation_scores) / len(per_invocation_scores) if per_invocation_scores else 0.0 + return EvalResult( + score=overall, + per_invocation_scores=per_invocation_scores, + details={"comparisons": comparisons}, + ) + + +if __name__ == "__main__": + skills_trajectory.run() From 24c0e8d0c42799785f38d100f43c7ba136862f60 Mon Sep 17 00:00:00 2001 From: Francisco Rivas Date: Mon, 20 Apr 2026 17:52:54 +0200 Subject: [PATCH 2/2] fix and improve skills_trajectory evaluator - Fix critical bug: tool_calls are ToolCallData objects not dicts; isinstance(call, dict) guard silently dropped all tool calls, causing score=0.0 regardless of agent behavior. Use call.name directly. 
- Merge redundant skills None+isinstance guards into single check - Add NOT_EVALUATED for empty invocations list - Replace list[dict] annotation with _Comparison TypedDict - Simplify tool name extraction to single list comprehension --- .../skills_trajectory/skills_trajectory.py | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/evaluators/skills_trajectory/skills_trajectory.py b/evaluators/skills_trajectory/skills_trajectory.py index d01bfd9..cabbe33 100644 --- a/evaluators/skills_trajectory/skills_trajectory.py +++ b/evaluators/skills_trajectory/skills_trajectory.py @@ -33,10 +33,18 @@ from __future__ import annotations from collections import Counter +from typing import TypedDict from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator +class _Comparison(TypedDict): + invocation_id: str + required_skills: list[str] + called_tools: list[str] + score: float + + def _skills_score(required: list[str], called: list[str], match_type: str) -> float: """Return the fraction of *required* skills satisfied in *called*. @@ -79,19 +87,20 @@ def skills_trajectory(input: EvalInput) -> EvalResult: skills = input.config.get("skills") n = len(input.invocations) - if skills is None or not isinstance(skills, list): + if not n: return EvalResult( score=0.0, status=EvalStatus.NOT_EVALUATED, - per_invocation_scores=[None] * n, - details={"reason": "missing or invalid config: skills (need a non-empty list of names)"}, + per_invocation_scores=[], + details={"reason": "no invocations to evaluate"}, ) - if not skills: + + if not isinstance(skills, list) or not skills: return EvalResult( score=0.0, status=EvalStatus.NOT_EVALUATED, per_invocation_scores=[None] * n, - details={"reason": "missing or empty config: skills"}, + details={"reason": "missing or empty config: skills (need a non-empty list of names)"}, ) required = [str(s) for s in skills] @@ -111,15 +120,10 @@ def skills_trajectory(input: EvalInput) -> EvalResult: ) per_invocation_scores: list[float] = [] - comparisons: list[dict] = [] + comparisons: list[_Comparison] = [] for inv in input.invocations: - called: list[str] = [] - for call in inv.intermediate_steps.tool_calls or []: - if isinstance(call, dict): - name = call.get("name") - if name is not None: - called.append(str(name)) + called = [call.name for call in inv.intermediate_steps.tool_calls] score = _skills_score(required, called, raw_match_type) per_invocation_scores.append(score)
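
For anyone who wants to sanity-check the expected scores without pulling in the
agentevals_evaluator_sdk, below is a standalone sketch of the scoring behavior.
It restates _skills_score exactly as it appears in the diff above and exercises
the three match modes; the tool names "search" and "summarize" come from the
docstring's eval_config.yaml example, while "lookup" and "log" are hypothetical
extras added only for illustration.

    from collections import Counter

    def skills_score(required, called, match_type="ANY_ORDER"):
        # Same logic as _skills_score in skills_trajectory.py.
        if not required:
            return 1.0
        if match_type == "EXACT":
            return 1.0 if called == required else 0.0
        if match_type == "IN_ORDER":
            # Greedy subsequence scan: each required skill must appear after
            # the previous match; skills not found in order earn no credit.
            pos = 0
            hits = 0
            for skill in required:
                while pos < len(called):
                    if called[pos] == skill:
                        hits += 1
                        pos += 1
                        break
                    pos += 1
            return hits / len(required)
        # ANY_ORDER: duplicate-aware count; order and extra calls are ignored.
        called_counts = Counter(called)
        required_counts = Counter(required)
        return sum(min(required_counts[s], called_counts[s]) for s in required_counts) / len(required)

    required = ["search", "summarize"]

    # ANY_ORDER: both required skills present despite extras and reordering -> 1.0
    assert skills_score(required, ["summarize", "lookup", "search"]) == 1.0

    # IN_ORDER: "search" is matched, but "summarize" never appears after it -> 0.5
    assert skills_score(required, ["summarize", "search"], "IN_ORDER") == 0.5

    # EXACT: any extra or reordered call fails the whole invocation -> 0.0
    assert skills_score(required, ["search", "summarize", "log"], "EXACT") == 0.0

The Counter-based ANY_ORDER path and the subsequence scan are what award partial
credit when only some required skills are called, which is the behavior that
distinguishes this evaluator from the binary tool_sequence_match mentioned in
the first commit message.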