2 changes: 2 additions & 0 deletions src/agentevals/api/routes.py
@@ -507,6 +507,7 @@ async def evaluate_traces(
trace_format=trace_format,
judge_model=config_dict.get("judgeModel"),
threshold=threshold,
trajectory_match_type=config_dict.get("trajectoryMatchType"),
)

logger.info(f"Evaluating {len(trace_paths)} trace file(s) with metrics: {metrics}")
@@ -615,6 +616,7 @@ async def event_generator():
trace_format=trace_format,
judge_model=config_dict.get("judgeModel"),
threshold=threshold,
trajectory_match_type=config_dict.get("trajectoryMatchType"),
)

loader = get_loader(eval_config.trace_format)
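Both handlers read the new option from the request's camelCase config dict, defaulting to None when the key is absent. A minimal sketch of the client-side payload, assuming the evaluate routes accept a JSON config object (only judgeModel and trajectoryMatchType are confirmed by this diff; any other keys would be hypothetical):

# Hypothetical evaluate-request config; key names as read by the handlers above.
config_dict = {
    "judgeModel": "gemini-2.5-flash",
    "trajectoryMatchType": "ANY_ORDER",  # EXACT (default), IN_ORDER, or ANY_ORDER
}
trajectory_match_type = config_dict.get("trajectoryMatchType")  # None when omitted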
4 changes: 3 additions & 1 deletion src/agentevals/api/streaming_routes.py
@@ -5,7 +5,7 @@
import asyncio
import json
import logging
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal

from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import FileResponse
@@ -46,6 +46,7 @@ class EvaluateSessionsRequest(BaseModel):
eval_set_id: str
metrics: list[str] = ["tool_trajectory_avg_score"]
judge_model: str = "gemini-2.5-flash"
trajectory_match_type: Literal["EXACT", "IN_ORDER", "ANY_ORDER"] | None = None


class PrepareEvaluationRequest(BaseModel):
@@ -210,6 +211,7 @@ async def eval_one_session(session_id: str, session) -> SessionEvalResult:
eval_set_file=eval_set_file.name,
metrics=request.metrics,
judge_model=request.judge_model,
trajectory_match_type=request.trajectory_match_type,
)

eval_result = await run_evaluation(config)
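Because trajectory_match_type is declared as a Literal, Pydantic rejects unknown values when the request body is parsed, before any evaluation starts. A minimal sketch of that behavior, trimmed to the relevant fields:

from typing import Literal

from pydantic import BaseModel, ValidationError

class SessionsRequestSketch(BaseModel):  # trimmed stand-in for EvaluateSessionsRequest
    eval_set_id: str
    trajectory_match_type: Literal["EXACT", "IN_ORDER", "ANY_ORDER"] | None = None

SessionsRequestSketch(eval_set_id="s1", trajectory_match_type="IN_ORDER")  # accepted
SessionsRequestSketch(eval_set_id="s1")  # accepted, defaults to None

try:
    SessionsRequestSketch(eval_set_id="s1", trajectory_match_type="in_order")
except ValidationError:
    # Literal matching is case-sensitive, so lowercase input is rejected at the API boundary.
    pass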
15 changes: 12 additions & 3 deletions src/agentevals/builtin_metrics.py
@@ -60,14 +60,18 @@ def build_eval_metric(
judge_model: str | None,
threshold: float | None,
rubrics: list[str] | None = None,
match_type: str | None = None,
) -> EvalMetric:
"""Construct an ADK ``EvalMetric`` with the appropriate criterion."""
effective_threshold = threshold if threshold is not None else 0.5

criterion: BaseCriterion | None = None

if metric_name == "tool_trajectory_avg_score":
-criterion = ToolTrajectoryCriterion(threshold=effective_threshold)
+_match = (
+    ToolTrajectoryCriterion.MatchType[match_type] if match_type else ToolTrajectoryCriterion.MatchType.EXACT
+)
+criterion = ToolTrajectoryCriterion(threshold=effective_threshold, match_type=_match)
elif metric_name == "final_response_match_v2":
judge_opts = JudgeModelOptions()
if judge_model:
@@ -105,7 +109,11 @@ def build_eval_metric(
threshold=effective_threshold,
judge_model_options=judge_opts,
)
elif metric_name in ("response_match_score", "response_evaluation_score", "safety_v1"):
elif metric_name in (
"response_match_score",
"response_evaluation_score",
"safety_v1",
):
criterion = BaseCriterion(threshold=effective_threshold)

return EvalMetric(
@@ -179,6 +187,7 @@ async def evaluate_builtin_metric(
expected_invocations: list[Invocation] | None,
judge_model: str | None,
threshold: float | None,
match_type: str | None = None,
) -> dict[str, Any]:
"""Evaluate a single built-in ADK metric.

@@ -197,7 +206,7 @@
)

try:
-eval_metric = build_eval_metric(metric_name, judge_model, threshold)
+eval_metric = build_eval_metric(metric_name, judge_model, threshold, match_type=match_type)
evaluator: Evaluator = get_evaluator(eval_metric)

if inspect.iscoroutinefunction(evaluator.evaluate_invocations):
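The enum lookup ToolTrajectoryCriterion.MatchType[match_type] raises KeyError for anything that is not an exact member name, which is why the CLI and config layers validate and uppercase the value first. An illustrative sketch with a stand-in enum (the real MatchType comes from ADK's ToolTrajectoryCriterion; member names here mirror the values used throughout this PR):

from enum import Enum

class MatchType(Enum):  # stand-in for ToolTrajectoryCriterion.MatchType
    EXACT = "EXACT"
    IN_ORDER = "IN_ORDER"
    ANY_ORDER = "ANY_ORDER"

match_type = "ANY_ORDER"
_match = MatchType[match_type] if match_type else MatchType.EXACT  # MatchType.ANY_ORDER

try:
    MatchType["any_order"]
except KeyError:
    # Name lookup is case-sensitive; unnormalized input fails here with a bare
    # KeyError rather than a friendly validation message.
    pass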
9 changes: 9 additions & 0 deletions src/agentevals/cli.py
@@ -108,6 +108,12 @@ def main(verbose: int) -> None:
default=None,
help="Score threshold for pass/fail.",
)
@click.option(
"--trajectory-match-type",
type=click.Choice(["EXACT", "IN_ORDER", "ANY_ORDER"], case_sensitive=False),
[Contributor comment] Can you please perform the same validation at all the layers where this can be a problem in the codebase?

default=None,
help="Match type for tool_trajectory_avg_score: EXACT (default), IN_ORDER, or ANY_ORDER.",
)
@click.option(
"--output",
"-o",
@@ -130,6 +136,7 @@ def run(
trace_format: str,
judge_model: str | None,
threshold: float | None,
trajectory_match_type: str | None,
output: str,
config_file: str | None,
) -> None:
@@ -152,6 +159,7 @@ def run(
trace_format=trace_format,
judge_model=judge_model,
threshold=threshold,
trajectory_match_type=trajectory_match_type,
output_format=output,
)
config = merge_configs(file_config, cli_config)
@@ -164,6 +172,7 @@ def run(
trace_format=trace_format,
judge_model=judge_model,
threshold=threshold,
trajectory_match_type=trajectory_match_type,
output_format=output,
)

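With case_sensitive=False, click accepts any casing on the command line and (in Click 8) passes the value through in its declared casing, so --trajectory-match-type any_order reaches the config layer as "ANY_ORDER". A minimal runnable sketch of that behavior:

import click
from click.testing import CliRunner

@click.command()
@click.option(
    "--trajectory-match-type",
    type=click.Choice(["EXACT", "IN_ORDER", "ANY_ORDER"], case_sensitive=False),
    default=None,
)
def demo(trajectory_match_type: str | None) -> None:
    click.echo(f"match_type={trajectory_match_type}")

result = CliRunner().invoke(demo, ["--trajectory-match-type", "any_order"])
print(result.output)  # match_type=ANY_ORDER  (Click maps input back to the declared casing)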
13 changes: 13 additions & 0 deletions src/agentevals/config.py
@@ -132,6 +132,19 @@ class EvalRunConfig(BaseModel):
description="Score threshold for pass/fail.",
)

trajectory_match_type: str | None = Field(
default=None,
description="Match type for tool_trajectory_avg_score: 'EXACT', 'IN_ORDER', or 'ANY_ORDER'. Default: EXACT.",
)

@field_validator("trajectory_match_type")
@classmethod
def _validate_trajectory_match_type(cls, v: str | None) -> str | None:
valid = {"EXACT", "IN_ORDER", "ANY_ORDER"}
if v is not None and v.upper() not in valid:
raise ValueError(f"Invalid trajectory_match_type '{v}'. Valid values: {sorted(valid)}")
return v.upper() if v is not None else v

output_format: str = Field(
default="table",
description="Output format: 'table', 'json', or 'summary'.",
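The validator both rejects unknown values and normalizes casing, so downstream code only ever sees the uppercase form or None. A self-contained sketch of the same validator on a one-field stand-in model:

from pydantic import BaseModel, ValidationError, field_validator

class ConfigSketch(BaseModel):  # one-field stand-in for EvalRunConfig
    trajectory_match_type: str | None = None

    @field_validator("trajectory_match_type")
    @classmethod
    def _validate_trajectory_match_type(cls, v: str | None) -> str | None:
        valid = {"EXACT", "IN_ORDER", "ANY_ORDER"}
        if v is not None and v.upper() not in valid:
            raise ValueError(f"Invalid trajectory_match_type '{v}'. Valid values: {sorted(valid)}")
        return v.upper() if v is not None else v

print(ConfigSketch(trajectory_match_type="in_order").trajectory_match_type)  # IN_ORDER

try:
    ConfigSketch(trajectory_match_type="BACKWARDS")
except ValidationError as err:
    print(err)  # Invalid trajectory_match_type 'BACKWARDS'. Valid values: ['ANY_ORDER', 'EXACT', 'IN_ORDER']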
4 changes: 4 additions & 0 deletions src/agentevals/eval_config_loader.py
@@ -111,6 +111,8 @@ def load_eval_config(path: str | Path) -> EvalRunConfig:
config.judge_model = data["judge_model"]
if "threshold" in data:
config.threshold = float(data["threshold"])
if "trajectory_match_type" in data:
config.trajectory_match_type = data["trajectory_match_type"]
if "trace_format" in data:
config.trace_format = data["trace_format"]

@@ -136,6 +138,8 @@ def merge_configs(file_config: EvalRunConfig, cli_config: EvalRunConfig) -> EvalRunConfig:
merged.judge_model = cli_config.judge_model
if cli_config.threshold is not None:
merged.threshold = cli_config.threshold
if cli_config.trajectory_match_type is not None:
merged.trajectory_match_type = cli_config.trajectory_match_type
if cli_config.trace_format != "jaeger-json":
merged.trace_format = cli_config.trace_format
if cli_config.output_format != "table":
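As with judge_model and threshold, the CLI value wins only when the flag was actually provided; a None from the CLI leaves the file value intact. A sketch of that precedence with a dataclass stand-in:

from dataclasses import dataclass, replace

@dataclass
class CfgSketch:  # stand-in for EvalRunConfig
    trajectory_match_type: str | None = None

def merge(file_cfg: CfgSketch, cli_cfg: CfgSketch) -> CfgSketch:
    merged = replace(file_cfg)
    if cli_cfg.trajectory_match_type is not None:
        merged.trajectory_match_type = cli_cfg.trajectory_match_type
    return merged

print(merge(CfgSketch("IN_ORDER"), CfgSketch()).trajectory_match_type)             # IN_ORDER (file wins)
print(merge(CfgSketch("IN_ORDER"), CfgSketch("ANY_ORDER")).trajectory_match_type)  # ANY_ORDER (CLI wins)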
3 changes: 3 additions & 0 deletions src/agentevals/runner.py
@@ -140,6 +140,7 @@ async def _evaluate_trace_bounded(idx: int, conv_result: ConversionResult) -> TraceResult:
eval_set=eval_set,
judge_model=config.judge_model,
threshold=config.threshold,
trajectory_match_type=config.trajectory_match_type,
eval_semaphore=eval_semaphore,
progress_callback=progress_callback,
trace_progress_callback=trace_progress_callback,
@@ -201,6 +202,7 @@ async def _evaluate_trace(
trace_progress_callback: TraceProgressCallback | None = None,
trace=None,
performance_metrics: dict[str, Any] | None = None,
trajectory_match_type: str | None = None,
) -> TraceResult:
trace_result = TraceResult(
trace_id=conv_result.trace_id,
@@ -243,6 +245,7 @@ async def _eval_builtin_with_semaphore(metric_name: str) -> MetricResult:
expected_invocations=expected_invocations,
judge_model=judge_model,
threshold=threshold,
match_type=trajectory_match_type,
)
result.duration_ms = (time.monotonic() - t0) * 1000
return await _append_result(result)
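The runner just threads trajectory_match_type from the run config through to evaluate_builtin_metric; each metric evaluation still runs inside the shared semaphore. A minimal sketch of that bounding pattern (names simplified; the real code wraps evaluate_builtin_metric):

import asyncio

async def eval_one(metric_name: str, sem: asyncio.Semaphore) -> str:
    async with sem:  # at most N metric evaluations in flight at once
        await asyncio.sleep(0)  # placeholder for the actual metric evaluation
        return f"{metric_name}: done"

async def main() -> None:
    sem = asyncio.Semaphore(2)
    print(await asyncio.gather(*(eval_one(m, sem) for m in ["m1", "m2", "m3"])))

asyncio.run(main())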
134 changes: 133 additions & 1 deletion tests/test_runner.py
@@ -1,12 +1,101 @@
import asyncio
import json
import os

import pytest

from agentevals.config import EvalRunConfig
-from agentevals.runner import load_eval_set, run_evaluation
+from agentevals.converter import convert_traces
+from agentevals.loader.base import Span, Trace
+from agentevals.runner import _evaluate_trace, load_eval_set, run_evaluation
from agentevals.trace_metrics import extract_trace_metadata


def _make_tool_trace(tools: list[str]) -> Trace:
"""Build a minimal ADK trace calling the given tools in order."""
invoke = Span(
trace_id="t1",
span_id="invoke1",
parent_span_id=None,
operation_name="invoke_agent test_agent",
start_time=1000,
duration=10000,
tags={"otel.scope.name": "gcp.vertex.agent"},
)
call_llm_1 = Span(
trace_id="t1",
span_id="llm1",
parent_span_id="invoke1",
operation_name="call_llm",
start_time=2000,
duration=1000,
tags={
"otel.scope.name": "gcp.vertex.agent",
"gcp.vertex.agent.llm_request": json.dumps(
{"contents": [{"role": "user", "parts": [{"text": "do something"}]}]}
),
},
)
tool_spans = [
Span(
trace_id="t1",
span_id=f"tool{i}",
parent_span_id="invoke1",
operation_name=f"execute_tool {name}",
start_time=3000 + i * 100,
duration=100,
tags={"otel.scope.name": "gcp.vertex.agent"},
)
for i, name in enumerate(tools)
]
call_llm_2 = Span(
trace_id="t1",
span_id="llm2",
parent_span_id="invoke1",
operation_name="call_llm",
start_time=5000,
duration=1000,
tags={
"otel.scope.name": "gcp.vertex.agent",
"gcp.vertex.agent.llm_response": json.dumps({"content": {"role": "model", "parts": [{"text": "done"}]}}),
},
)
invoke.children = [call_llm_1, *tool_spans, call_llm_2]
return Trace(
trace_id="t1",
root_spans=[invoke],
all_spans=[invoke, call_llm_1, *tool_spans, call_llm_2],
)


def _make_eval_set_json(tools: list[str]) -> dict:
return {
"eval_set_id": "test",
"eval_cases": [
{
"eval_id": "inv_1",
"conversation": [
{
"invocation_id": "inv_1",
"user_content": {
"role": "user",
"parts": [{"text": "do something"}],
},
"final_response": {
"role": "model",
"parts": [{"text": "done"}],
},
"intermediate_data": {
"tool_uses": [{"name": t, "args": {}, "id": f"e{i}"} for i, t in enumerate(tools)],
"tool_responses": [],
},
}
],
}
],
}


SAMPLES_DIR = os.path.join(os.path.dirname(__file__), "..", "samples")
HELM_TRACE = os.path.join(SAMPLES_DIR, "helm.json")
HELM_3_TRACE = os.path.join(SAMPLES_DIR, "helm_3.json")
@@ -168,3 +257,46 @@ def test_extract_trace_metadata_adk(self):
assert "helm" in metadata["user_input_preview"].lower()
assert metadata["final_output_preview"] is not None
assert len(metadata["final_output_preview"]) > 0


class TestTrajectoryMatchType:
"""Verify trajectory_match_type produces different scores on the same trace.

Actual calls [get, list], expected calls [list, get].
EXACT and IN_ORDER fail; ANY_ORDER passes.
"""

def _run(self, match_type, tmp_path):
conv_result = convert_traces([_make_tool_trace(["helm_get_release", "helm_list_releases"])])[0]

eval_set_path = tmp_path / "eval_set.json"
eval_set_path.write_text(json.dumps(_make_eval_set_json(["helm_list_releases", "helm_get_release"])))
eval_set = load_eval_set(str(eval_set_path))

return asyncio.run(
_evaluate_trace(
conv_result=conv_result,
metrics=["tool_trajectory_avg_score"],
custom_evaluators=[],
eval_set=eval_set,
judge_model=None,
threshold=0.5,
trajectory_match_type=match_type,
eval_semaphore=asyncio.Semaphore(1),
)
)

def test_exact_fails(self, tmp_path):
mr = self._run(None, tmp_path).metric_results[0]
assert mr.score == 0.0
assert mr.eval_status == "FAILED"

def test_any_order_passes(self, tmp_path):
mr = self._run("ANY_ORDER", tmp_path).metric_results[0]
assert mr.score == 1.0
assert mr.eval_status == "PASSED"

def test_in_order_fails(self, tmp_path):
mr = self._run("IN_ORDER", tmp_path).metric_results[0]
assert mr.score == 0.0
assert mr.eval_status == "FAILED"
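On the test's data (actual [helm_get_release, helm_list_releases] vs. expected [helm_list_releases, helm_get_release]) the three semantics diverge exactly as the assertions expect. The sketch below is an illustrative reimplementation of the comparison rules described by the UI labels, not ADK's actual scoring code:

def exact(actual: list[str], expected: list[str]) -> bool:
    return actual == expected  # same tools, same order, no extras

def in_order(actual: list[str], expected: list[str]) -> bool:
    it = iter(actual)
    return all(tool in it for tool in expected)  # expected is a subsequence; extras allowed

def any_order(actual: list[str], expected: list[str]) -> bool:
    return all(tool in actual for tool in expected)  # order ignored

actual = ["helm_get_release", "helm_list_releases"]
expected = ["helm_list_releases", "helm_get_release"]

print(exact(actual, expected))      # False -> test_exact_fails
print(in_order(actual, expected))   # False -> test_in_order_fails
print(any_order(actual, expected))  # True  -> test_any_order_passes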
20 changes: 20 additions & 0 deletions ui/src/components/upload/UploadView.tsx
@@ -525,6 +525,26 @@ export const UploadView: React.FC = () => {
Minimum score to pass
</span>
</div>

{state.selectedMetrics.includes('tool_trajectory_avg_score') && (
<div className="setting-item" style={{ marginTop: 10 }}>
<label className="setting-label">Trajectory Match Type</label>
<Select
value={state.trajectoryMatchType}
onChange={actions.setTrajectoryMatchType}
options={[
{ value: 'EXACT', label: 'EXACT — tools must match in exact order' },
{ value: 'IN_ORDER', label: 'IN_ORDER — expected tools appear in order (extras allowed)' },
{ value: 'ANY_ORDER', label: 'ANY_ORDER — expected tools appear in any order' },
]}
style={{ width: '100%' }}
size="small"
/>
<span className="setting-hint">
How to compare tool call sequences for tool_trajectory_avg_score
</span>
</div>
)}
</div>
</div>

2 changes: 2 additions & 0 deletions ui/src/context/TraceContext.tsx
@@ -15,6 +15,7 @@ export interface TraceState {
selectedMetrics: string[];
judgeModel: string;
threshold: number;
trajectoryMatchType: string;
traceMetadata: Map<string, TraceMetadata>;
isLoadingMetadata: boolean;
apiKeyStatus: ApiKeyStatus | null;
@@ -54,6 +55,7 @@ export interface TraceContextType {
toggleMetric: (metric: string) => void;
setJudgeModel: (model: string) => void;
setThreshold: (threshold: number) => void;
setTrajectoryMatchType: (matchType: string) => void;
runEvaluation: () => Promise<void>;
setCurrentView: (view: ViewType) => void;
setEvaluationOrigin: (view: ViewType | null) => void;