From 2514426f16d6671f745c96e653d9567d463f132d Mon Sep 17 00:00:00 2001
From: ossama-ferjani-work <ossama.ferjani@distribusion.com>
Date: Wed, 1 Apr 2026 14:17:08 +0200
Subject: [PATCH 1/5] Expose all tool_trajectory_avg_score match types

---
 samples/eval_set_multi_tool.json        | 34 ++++++++++
 samples/helm_reversed_order.json        | 84 +++++++++++++++++++++++++
 src/agentevals/api/routes.py            |  2 +
 src/agentevals/api/streaming_routes.py  |  2 +
 src/agentevals/builtin_metrics.py       | 11 +++-
 src/agentevals/cli.py                   |  9 +++
 src/agentevals/config.py                |  5 ++
 src/agentevals/eval_config_loader.py    |  4 ++
 src/agentevals/runner.py                |  5 +-
 ui/src/components/upload/UploadView.tsx | 20 ++++++
 ui/src/context/TraceContext.tsx         |  2 +
 ui/src/context/TraceProvider.tsx        |  7 ++-
 ui/src/lib/types.ts                     |  1 +
 13 files changed, 182 insertions(+), 4 deletions(-)
 create mode 100644 samples/eval_set_multi_tool.json
 create mode 100644 samples/helm_reversed_order.json

diff --git a/samples/eval_set_multi_tool.json b/samples/eval_set_multi_tool.json
new file mode 100644
index 0000000..cafd22f
--- /dev/null
+++ b/samples/eval_set_multi_tool.json
@@ -0,0 +1,34 @@
+{
+  "eval_set_id": "helm_multi_tool_eval_set",
+  "name": "Helm Multi-Tool Eval Set",
+  "description": "Golden eval case with two tools in a specific order for match-type smoke testing.",
+  "eval_cases": [
+    {
+      "eval_id": "helm_multi_tool_inv_1",
+      "conversation": [
+        {
+          "invocation_id": "helm_multi_tool_inv_1",
+          "user_content": {
+            "role": "user",
+            "parts": [
+              {"text": "list all Helm releases and get details for kagent"}
+            ]
+          },
+          "final_response": {
+            "role": "model",
+            "parts": [
+              {"text": "I found the releases and their details."}
+            ]
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {"name": "helm_list_releases", "args": {}, "id": "call_e1"},
+              {"name": "helm_get_release", "args": {}, "id": "call_e2"}
+            ],
+            "tool_responses": []
+          }
+        }
+      ]
+    }
+  ]
+}
diff --git a/samples/helm_reversed_order.json b/samples/helm_reversed_order.json
new file mode 100644
index 0000000..2e20db1
--- /dev/null
+++ b/samples/helm_reversed_order.json
@@ -0,0 +1,84 @@
+{
+  "data": [
+    {
+      "traceID": "aabbccdd11223344aabbccdd11223344",
+      "spans": [
+        {
+          "traceID": "aabbccdd11223344aabbccdd11223344",
+          "spanID": "invoke00000001",
+          "operationName": "invoke_agent helm_agent",
+          "references": [],
+          "startTime": 1771900000000000,
+          "duration": 5000000,
+          "tags": [
+            {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"},
+            {"key": "otel.scope.version", "type": "string", "value": "1.0.0"},
+            {"key": "gcp.vertex.agent.invocation_id", "type": "string", "value": "helm_multi_tool_inv_1"},
+            {"key": "gen_ai.agent.name", "type": "string", "value": "helm_agent"}
+          ],
+          "processID": "p1"
+        },
+        {
+          "traceID": "aabbccdd11223344aabbccdd11223344",
+          "spanID": "calllm000000001",
+          "operationName": "call_llm",
+          "references": [
+            {"refType": "CHILD_OF", "traceID": "aabbccdd11223344aabbccdd11223344", "spanID": "invoke00000001"}
+          ],
+          "startTime": 1771900000100000,
+          "duration": 1000000,
+          "tags": [
+            {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"},
+            {"key": "gcp.vertex.agent.llm_request", "type": "string", "value": "{\"contents\": [{\"role\": \"user\", \"parts\": [{\"text\": \"list all Helm releases and get details for kagent\"}]}]}"},
+            {"key": "gen_ai.request.model", "type": "string", "value": "gemini-2.0-flash"}
+          ],
+          "processID": "p1"
+        },
+        {
+          "traceID": "aabbccdd11223344aabbccdd11223344",
+          "spanID": "toolget00000001",
+          "operationName": "execute_tool helm_get_release",
+          "references": [
+            {"refType": "CHILD_OF", "traceID": "aabbccdd11223344aabbccdd11223344", "spanID": "invoke00000001"}
+          ],
+          "startTime": 1771900001200000,
+          "duration": 200000,
+          "tags": [
+            {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"}
+          ],
+          "processID": "p1"
+        },
+        {
+          "traceID": "aabbccdd11223344aabbccdd11223344",
+          "spanID": "toollist0000001",
+          "operationName": "execute_tool helm_list_releases",
+          "references": [
+            {"refType": "CHILD_OF", "traceID": "aabbccdd11223344aabbccdd11223344", "spanID": "invoke00000001"}
+          ],
+          "startTime": 1771900001500000,
+          "duration": 200000,
+          "tags": [
+            {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"}
+          ],
+          "processID": "p1"
+        },
+        {
+          "traceID": "aabbccdd11223344aabbccdd11223344",
+          "spanID": "calllm000000002",
+          "operationName": "call_llm",
+          "references": [
+            {"refType": "CHILD_OF", "traceID": "aabbccdd11223344aabbccdd11223344", "spanID": "invoke00000001"}
+          ],
+          "startTime": 1771900002000000,
+          "duration": 1000000,
+          "tags": [
+            {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"},
+            {"key": "gcp.vertex.agent.llm_response", "type": "string", "value": "{\"content\": {\"role\": \"model\", \"parts\": [{\"text\": \"I found the releases and their details.\"}]}}"},
+            {"key": "gen_ai.request.model", "type": "string", "value": "gemini-2.0-flash"}
+          ],
+          "processID": "p1"
+        }
+      ]
+    }
+  ]
+}
diff --git a/src/agentevals/api/routes.py b/src/agentevals/api/routes.py
index 46803e9..0689130 100644
--- a/src/agentevals/api/routes.py
+++ b/src/agentevals/api/routes.py
@@ -507,6 +507,7 @@ async def evaluate_traces(
             trace_format=trace_format,
             judge_model=config_dict.get("judgeModel"),
             threshold=threshold,
+            trajectory_match_type=config_dict.get("trajectoryMatchType"),
         )
 
         logger.info(f"Evaluating {len(trace_paths)} trace file(s) with metrics: {metrics}")
@@ -615,6 +616,7 @@ async def event_generator():
                 trace_format=trace_format,
                 judge_model=config_dict.get("judgeModel"),
                 threshold=threshold,
+                trajectory_match_type=config_dict.get("trajectoryMatchType"),
             )
 
             loader = get_loader(eval_config.trace_format)
diff --git a/src/agentevals/api/streaming_routes.py b/src/agentevals/api/streaming_routes.py
index 6fff215..811a513 100644
--- a/src/agentevals/api/streaming_routes.py
+++ b/src/agentevals/api/streaming_routes.py
@@ -46,6 +46,7 @@ class EvaluateSessionsRequest(BaseModel):
     eval_set_id: str
     metrics: list[str] = ["tool_trajectory_avg_score"]
     judge_model: str = "gemini-2.5-flash"
+    trajectory_match_type: str | None = None
 
 
 class PrepareEvaluationRequest(BaseModel):
@@ -210,6 +211,7 @@ async def eval_one_session(session_id: str, session) -> SessionEvalResult:
                         eval_set_file=eval_set_file.name,
                         metrics=request.metrics,
                         judge_model=request.judge_model,
+                        trajectory_match_type=request.trajectory_match_type,
                     )
 
                     eval_result = await run_evaluation(config)
diff --git a/src/agentevals/builtin_metrics.py b/src/agentevals/builtin_metrics.py
index 0e03562..f70ddc2 100644
--- a/src/agentevals/builtin_metrics.py
+++ b/src/agentevals/builtin_metrics.py
@@ -60,6 +60,7 @@ def build_eval_metric(
     judge_model: str | None,
     threshold: float | None,
     rubrics: list[str] | None = None,
+    match_type: str | None = None, 
 ) -> EvalMetric:
     """Construct an ADK ``EvalMetric`` with the appropriate criterion."""
     effective_threshold = threshold if threshold is not None else 0.5
@@ -67,7 +68,12 @@ def build_eval_metric(
     criterion: BaseCriterion | None = None
 
     if metric_name == "tool_trajectory_avg_score":
-        criterion = ToolTrajectoryCriterion(threshold=effective_threshold)
+        _match = (
+            ToolTrajectoryCriterion.MatchType[match_type.upper()]
+            if match_type
+            else ToolTrajectoryCriterion.MatchType.EXACT
+        )
+        criterion = ToolTrajectoryCriterion(threshold=effective_threshold, matchType=_match)
     elif metric_name == "final_response_match_v2":
         judge_opts = JudgeModelOptions()
         if judge_model:
@@ -179,6 +185,7 @@ async def evaluate_builtin_metric(
     expected_invocations: list[Invocation] | None,
     judge_model: str | None,
     threshold: float | None,
+    match_type: str | None = None,
 ) -> dict[str, Any]:
     """Evaluate a single built-in ADK metric.
 
@@ -197,7 +204,7 @@ async def evaluate_builtin_metric(
         )
 
     try:
-        eval_metric = build_eval_metric(metric_name, judge_model, threshold)
+        eval_metric = build_eval_metric(metric_name, judge_model, threshold, match_type=match_type)
         evaluator: Evaluator = get_evaluator(eval_metric)
 
         if inspect.iscoroutinefunction(evaluator.evaluate_invocations):
diff --git a/src/agentevals/cli.py b/src/agentevals/cli.py
index bb36352..5c4195f 100644
--- a/src/agentevals/cli.py
+++ b/src/agentevals/cli.py
@@ -108,6 +108,12 @@ def main(verbose: int) -> None:
     default=None,
     help="Score threshold for pass/fail.",
 )
+@click.option(
+    "--trajectory-match-type",
+    type=click.Choice(["EXACT", "IN_ORDER", "ANY_ORDER"], case_sensitive=False),
+    default=None,
+    help="Match type for tool_trajectory_avg_score: EXACT (default), IN_ORDER, or ANY_ORDER.",
+)
 @click.option(
     "--output",
     "-o",
@@ -130,6 +136,7 @@ def run(
     trace_format: str,
     judge_model: str | None,
     threshold: float | None,
+    trajectory_match_type: str | None,
     output: str,
     config_file: str | None,
 ) -> None:
@@ -152,6 +159,7 @@ def run(
             trace_format=trace_format,
             judge_model=judge_model,
             threshold=threshold,
+            trajectory_match_type=trajectory_match_type,
             output_format=output,
         )
         config = merge_configs(file_config, cli_config)
@@ -164,6 +172,7 @@ def run(
             trace_format=trace_format,
             judge_model=judge_model,
             threshold=threshold,
+            trajectory_match_type=trajectory_match_type,
             output_format=output,
         )
 
diff --git a/src/agentevals/config.py b/src/agentevals/config.py
index 3278c19..136bfdd 100644
--- a/src/agentevals/config.py
+++ b/src/agentevals/config.py
@@ -132,6 +132,11 @@ class EvalRunConfig(BaseModel):
         description="Score threshold for pass/fail.",
     )
 
+    trajectory_match_type: str | None = Field(
+        default=None,
+        description="Match type for tool_trajectory_avg_score: 'EXACT', 'IN_ORDER', or 'ANY_ORDER'. Default: EXACT.",
+    )
+
     output_format: str = Field(
         default="table",
         description="Output format: 'table', 'json', or 'summary'.",
diff --git a/src/agentevals/eval_config_loader.py b/src/agentevals/eval_config_loader.py
index 5ddc7cc..e3b35b1 100644
--- a/src/agentevals/eval_config_loader.py
+++ b/src/agentevals/eval_config_loader.py
@@ -111,6 +111,8 @@ def load_eval_config(path: str | Path) -> EvalRunConfig:
         config.judge_model = data["judge_model"]
     if "threshold" in data:
         config.threshold = float(data["threshold"])
+    if "trajectory_match_type" in data:
+        config.trajectory_match_type = data["trajectory_match_type"]
     if "trace_format" in data:
         config.trace_format = data["trace_format"]
 
@@ -136,6 +138,8 @@ def merge_configs(file_config: EvalRunConfig, cli_config: EvalRunConfig) -> Eval
         merged.judge_model = cli_config.judge_model
     if cli_config.threshold is not None:
         merged.threshold = cli_config.threshold
+    if cli_config.trajectory_match_type is not None:
+        merged.trajectory_match_type = cli_config.trajectory_match_type
     if cli_config.trace_format != "jaeger-json":
         merged.trace_format = cli_config.trace_format
     if cli_config.output_format != "table":
diff --git a/src/agentevals/runner.py b/src/agentevals/runner.py
index 392082c..fc16990 100644
--- a/src/agentevals/runner.py
+++ b/src/agentevals/runner.py
@@ -140,6 +140,7 @@ async def _evaluate_trace_bounded(idx: int, conv_result: ConversionResult) -> Tr
                 eval_set=eval_set,
                 judge_model=config.judge_model,
                 threshold=config.threshold,
+                trajectory_match_type=config.trajectory_match_type,
                 eval_semaphore=eval_semaphore,
                 progress_callback=progress_callback,
                 trace_progress_callback=trace_progress_callback,
@@ -196,7 +197,8 @@ async def _evaluate_trace(
     eval_set: EvalSet | None,
     judge_model: str | None,
     threshold: float | None,
-    eval_semaphore: asyncio.Semaphore,
+    trajectory_match_type: str | None = None,
+    eval_semaphore: asyncio.Semaphore = None,
     progress_callback: ProgressCallback | None = None,
     trace_progress_callback: TraceProgressCallback | None = None,
     trace=None,
@@ -243,6 +245,7 @@ async def _eval_builtin_with_semaphore(metric_name: str) -> MetricResult:
                 expected_invocations=expected_invocations,
                 judge_model=judge_model,
                 threshold=threshold,
+                match_type=trajectory_match_type,
             )
             result.duration_ms = (time.monotonic() - t0) * 1000
         return await _append_result(result)
diff --git a/ui/src/components/upload/UploadView.tsx b/ui/src/components/upload/UploadView.tsx
index 7ed6d9e..6e047fe 100644
--- a/ui/src/components/upload/UploadView.tsx
+++ b/ui/src/components/upload/UploadView.tsx
@@ -525,6 +525,26 @@ export const UploadView: React.FC = () => {
               Minimum score to pass
             </span>
           </div>
+
+          {state.selectedMetrics.includes('tool_trajectory_avg_score') && (
+            <div className="setting-item" style={{ marginTop: 10 }}>
+              <label className="setting-label">Trajectory Match Type</label>
+              <Select
+                value={state.trajectoryMatchType}
+                onChange={actions.setTrajectoryMatchType}
+                options={[
+                  { value: 'EXACT', label: 'EXACT — tools must match in exact order' },
+                  { value: 'IN_ORDER', label: 'IN_ORDER — expected tools appear in order (extras allowed)' },
+                  { value: 'ANY_ORDER', label: 'ANY_ORDER — expected tools appear in any order' },
+                ]}
+                style={{ width: '100%' }}
+                size="small"
+              />
+              <span className="setting-hint">
+                How to compare tool call sequences for tool_trajectory_avg_score
+              </span>
+            </div>
+          )}
         </div>
       </div>
 
diff --git a/ui/src/context/TraceContext.tsx b/ui/src/context/TraceContext.tsx
index fb38879..b0919b8 100644
--- a/ui/src/context/TraceContext.tsx
+++ b/ui/src/context/TraceContext.tsx
@@ -15,6 +15,7 @@ export interface TraceState {
   selectedMetrics: string[];
   judgeModel: string;
   threshold: number;
+  trajectoryMatchType: string;
   traceMetadata: Map<string, TraceMetadata>;
   isLoadingMetadata: boolean;
   apiKeyStatus: ApiKeyStatus | null;
@@ -54,6 +55,7 @@ export interface TraceContextType {
     toggleMetric: (metric: string) => void;
     setJudgeModel: (model: string) => void;
     setThreshold: (threshold: number) => void;
+    setTrajectoryMatchType: (matchType: string) => void;
     runEvaluation: () => Promise<void>;
     setCurrentView: (view: ViewType) => void;
     setEvaluationOrigin: (view: ViewType | null) => void;
diff --git a/ui/src/context/TraceProvider.tsx b/ui/src/context/TraceProvider.tsx
index c724685..1c0eb40 100644
--- a/ui/src/context/TraceProvider.tsx
+++ b/ui/src/context/TraceProvider.tsx
@@ -16,6 +16,7 @@ export const TraceProvider: React.FC<TraceProviderProps> = ({ children }) => {
     selectedMetrics: ['tool_trajectory_avg_score'],
     judgeModel: 'gemini-2.5-flash',
     threshold: 0.8,
+    trajectoryMatchType: 'EXACT',
     traceMetadata: new Map(),
     isLoadingMetadata: false,
     apiKeyStatus: null,
@@ -94,6 +95,9 @@ export const TraceProvider: React.FC<TraceProviderProps> = ({ children }) => {
       setThreshold: (threshold: number) =>
         setState((prev) => ({ ...prev, threshold })),
 
+      setTrajectoryMatchType: (matchType: string) =>
+        setState((prev) => ({ ...prev, trajectoryMatchType: matchType })),
+
       runEvaluation: async () => {
         const initialRows = new Map();
         const metadataArray = Array.from(state.traceMetadata.values());
@@ -133,6 +137,7 @@ export const TraceProvider: React.FC<TraceProviderProps> = ({ children }) => {
               metrics: state.selectedMetrics,
               judgeModel: state.judgeModel,
               threshold: state.threshold,
+              trajectoryMatchType: state.trajectoryMatchType !== 'EXACT' ? state.trajectoryMatchType : undefined,
             },
             (message) => {
               setState((prev) => ({ ...prev, progressMessage: message }));
@@ -378,7 +383,7 @@ export const TraceProvider: React.FC<TraceProviderProps> = ({ children }) => {
           };
         }),
     }),
-    [state.traceFiles, state.traceMetadata, state.evalSetFile, state.selectedMetrics, state.judgeModel, state.threshold]
+    [state.traceFiles, state.traceMetadata, state.evalSetFile, state.selectedMetrics, state.judgeModel, state.threshold, state.trajectoryMatchType]
   );
 
   return (
diff --git a/ui/src/lib/types.ts b/ui/src/lib/types.ts
index f2f1b75..eced01f 100644
--- a/ui/src/lib/types.ts
+++ b/ui/src/lib/types.ts
@@ -199,6 +199,7 @@ export interface EvalConfig {
   metrics: string[];
   judgeModel: string;
   threshold: number;
+  trajectoryMatchType?: string;
 }
 
 // EvalSet types

From bac67ef75aa255c4371cb28be8df587fb916b688 Mon Sep 17 00:00:00 2001
From: ossama-ferjani-work <ossama.ferjani@distribusion.com>
Date: Wed, 1 Apr 2026 16:02:48 +0200
Subject: [PATCH 2/5] Address PR review: validate match type, add tests, drop samples

---
 samples/eval_set_multi_tool.json  |  34 ----------
 samples/helm_reversed_order.json  |  84 -----------------------
 src/agentevals/builtin_metrics.py |   4 +-
 src/agentevals/config.py          |   8 +++
 src/agentevals/runner.py          |   2 +-
 tests/test_runner.py              | 109 +++++++++++++++++++++++++++++-
 ui/src/context/TraceProvider.tsx  |   2 +-
 7 files changed, 120 insertions(+), 123 deletions(-)
 delete mode 100644 samples/eval_set_multi_tool.json
 delete mode 100644 samples/helm_reversed_order.json

diff --git a/samples/eval_set_multi_tool.json b/samples/eval_set_multi_tool.json
deleted file mode 100644
index cafd22f..0000000
--- a/samples/eval_set_multi_tool.json
+++ /dev/null
@@ -1,34 +0,0 @@
-{
-  "eval_set_id": "helm_multi_tool_eval_set",
-  "name": "Helm Multi-Tool Eval Set",
-  "description": "Golden eval case with two tools in a specific order for match-type smoke testing.",
-  "eval_cases": [
-    {
-      "eval_id": "helm_multi_tool_inv_1",
-      "conversation": [
-        {
-          "invocation_id": "helm_multi_tool_inv_1",
-          "user_content": {
-            "role": "user",
-            "parts": [
-              {"text": "list all Helm releases and get details for kagent"}
-            ]
-          },
-          "final_response": {
-            "role": "model",
-            "parts": [
-              {"text": "I found the releases and their details."}
-            ]
-          },
-          "intermediate_data": {
-            "tool_uses": [
-              {"name": "helm_list_releases", "args": {}, "id": "call_e1"},
-              {"name": "helm_get_release", "args": {}, "id": "call_e2"}
-            ],
-            "tool_responses": []
-          }
-        }
-      ]
-    }
-  ]
-}
diff --git a/samples/helm_reversed_order.json b/samples/helm_reversed_order.json
deleted file mode 100644
index 2e20db1..0000000
--- a/samples/helm_reversed_order.json
+++ /dev/null
@@ -1,84 +0,0 @@
-{
-  "data": [
-    {
-      "traceID": "aabbccdd11223344aabbccdd11223344",
-      "spans": [
-        {
-          "traceID": "aabbccdd11223344aabbccdd11223344",
-          "spanID": "invoke00000001",
-          "operationName": "invoke_agent helm_agent",
-          "references": [],
-          "startTime": 1771900000000000,
-          "duration": 5000000,
-          "tags": [
-            {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"},
-            {"key": "otel.scope.version", "type": "string", "value": "1.0.0"},
-            {"key": "gcp.vertex.agent.invocation_id", "type": "string", "value": "helm_multi_tool_inv_1"},
-            {"key": "gen_ai.agent.name", "type": "string", "value": "helm_agent"}
-          ],
-          "processID": "p1"
-        },
-        {
-          "traceID": "aabbccdd11223344aabbccdd11223344",
-          "spanID": "calllm000000001",
-          "operationName": "call_llm",
-          "references": [
-            {"refType": "CHILD_OF", "traceID": "aabbccdd11223344aabbccdd11223344", "spanID": "invoke00000001"}
-          ],
-          "startTime": 1771900000100000,
-          "duration": 1000000,
-          "tags": [
-            {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"},
-            {"key": "gcp.vertex.agent.llm_request", "type": "string", "value": "{\"contents\": [{\"role\": \"user\", \"parts\": [{\"text\": \"list all Helm releases and get details for kagent\"}]}]}"},
-            {"key": "gen_ai.request.model", "type": "string", "value": "gemini-2.0-flash"}
-          ],
-          "processID": "p1"
-        },
-        {
-          "traceID": "aabbccdd11223344aabbccdd11223344",
-          "spanID": "toolget00000001",
-          "operationName": "execute_tool helm_get_release",
-          "references": [
-            {"refType": "CHILD_OF", "traceID": "aabbccdd11223344aabbccdd11223344", "spanID": "invoke00000001"}
-          ],
-          "startTime": 1771900001200000,
-          "duration": 200000,
-          "tags": [
-            {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"}
-          ],
-          "processID": "p1"
-        },
-        {
-          "traceID": "aabbccdd11223344aabbccdd11223344",
-          "spanID": "toollist0000001",
-          "operationName": "execute_tool helm_list_releases",
-          "references": [
-            {"refType": "CHILD_OF", "traceID": "aabbccdd11223344aabbccdd11223344", "spanID": "invoke00000001"}
-          ],
-          "startTime": 1771900001500000,
-          "duration": 200000,
-          "tags": [
-            {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"}
-          ],
-          "processID": "p1"
-        },
-        {
-          "traceID": "aabbccdd11223344aabbccdd11223344",
-          "spanID": "calllm000000002",
-          "operationName": "call_llm",
-          "references": [
-            {"refType": "CHILD_OF", "traceID": "aabbccdd11223344aabbccdd11223344", "spanID": "invoke00000001"}
-          ],
-          "startTime": 1771900002000000,
-          "duration": 1000000,
-          "tags": [
-            {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"},
-            {"key": "gcp.vertex.agent.llm_response", "type": "string", "value": "{\"content\": {\"role\": \"model\", \"parts\": [{\"text\": \"I found the releases and their details.\"}]}}"},
-            {"key": "gen_ai.request.model", "type": "string", "value": "gemini-2.0-flash"}
-          ],
-          "processID": "p1"
-        }
-      ]
-    }
-  ]
-}
diff --git a/src/agentevals/builtin_metrics.py b/src/agentevals/builtin_metrics.py
index f70ddc2..808de75 100644
--- a/src/agentevals/builtin_metrics.py
+++ b/src/agentevals/builtin_metrics.py
@@ -69,11 +69,11 @@ def build_eval_metric(
 
     if metric_name == "tool_trajectory_avg_score":
         _match = (
-            ToolTrajectoryCriterion.MatchType[match_type.upper()]
+            ToolTrajectoryCriterion.MatchType[match_type]
             if match_type
             else ToolTrajectoryCriterion.MatchType.EXACT
         )
-        criterion = ToolTrajectoryCriterion(threshold=effective_threshold, matchType=_match)
+        criterion = ToolTrajectoryCriterion(threshold=effective_threshold, match_type=_match)
     elif metric_name == "final_response_match_v2":
         judge_opts = JudgeModelOptions()
         if judge_model:
diff --git a/src/agentevals/config.py b/src/agentevals/config.py
index 136bfdd..f7a3149 100644
--- a/src/agentevals/config.py
+++ b/src/agentevals/config.py
@@ -137,6 +137,14 @@ class EvalRunConfig(BaseModel):
         description="Match type for tool_trajectory_avg_score: 'EXACT', 'IN_ORDER', or 'ANY_ORDER'. Default: EXACT.",
     )
 
+    @field_validator("trajectory_match_type")
+    @classmethod
+    def _validate_trajectory_match_type(cls, v: str | None) -> str | None:
+        valid = {"EXACT", "IN_ORDER", "ANY_ORDER"}
+        if v is not None and v.upper() not in valid:
+            raise ValueError(f"Invalid trajectory_match_type '{v}'. Valid values: {sorted(valid)}")
+        return v.upper() if v is not None else v
+
     output_format: str = Field(
         default="table",
         description="Output format: 'table', 'json', or 'summary'.",
diff --git a/src/agentevals/runner.py b/src/agentevals/runner.py
index fc16990..c838040 100644
--- a/src/agentevals/runner.py
+++ b/src/agentevals/runner.py
@@ -197,12 +197,12 @@ async def _evaluate_trace(
     eval_set: EvalSet | None,
     judge_model: str | None,
     threshold: float | None,
-    trajectory_match_type: str | None = None,
     eval_semaphore: asyncio.Semaphore = None,
     progress_callback: ProgressCallback | None = None,
     trace_progress_callback: TraceProgressCallback | None = None,
     trace=None,
     performance_metrics: dict[str, Any] | None = None,
+    trajectory_match_type: str | None = None,
 ) -> TraceResult:
     trace_result = TraceResult(
         trace_id=conv_result.trace_id,
diff --git a/tests/test_runner.py b/tests/test_runner.py
index ce57950..8778422 100644
--- a/tests/test_runner.py
+++ b/tests/test_runner.py
@@ -1,12 +1,76 @@
 import asyncio
+import json
 import os
 
 import pytest
 
 from agentevals.config import EvalRunConfig
-from agentevals.runner import load_eval_set, run_evaluation
+from agentevals.converter import convert_traces
+from agentevals.loader.base import Span, Trace
+from agentevals.runner import _evaluate_trace, load_eval_set, run_evaluation
 from agentevals.trace_metrics import extract_trace_metadata
 
+
+def _make_tool_trace(tools: list[str]) -> Trace:
+    """Build a minimal ADK trace calling the given tools in order."""
+    invoke = Span(
+        trace_id="t1", span_id="invoke1", parent_span_id=None,
+        operation_name="invoke_agent test_agent", start_time=1000, duration=10000,
+        tags={"otel.scope.name": "gcp.vertex.agent"},
+    )
+    call_llm_1 = Span(
+        trace_id="t1", span_id="llm1", parent_span_id="invoke1",
+        operation_name="call_llm", start_time=2000, duration=1000,
+        tags={
+            "otel.scope.name": "gcp.vertex.agent",
+            "gcp.vertex.agent.llm_request": json.dumps(
+                {"contents": [{"role": "user", "parts": [{"text": "do something"}]}]}
+            ),
+        },
+    )
+    tool_spans = [
+        Span(
+            trace_id="t1", span_id=f"tool{i}", parent_span_id="invoke1",
+            operation_name=f"execute_tool {name}", start_time=3000 + i * 100, duration=100,
+            tags={"otel.scope.name": "gcp.vertex.agent"},
+        )
+        for i, name in enumerate(tools)
+    ]
+    call_llm_2 = Span(
+        trace_id="t1", span_id="llm2", parent_span_id="invoke1",
+        operation_name="call_llm", start_time=5000, duration=1000,
+        tags={
+            "otel.scope.name": "gcp.vertex.agent",
+            "gcp.vertex.agent.llm_response": json.dumps(
+                {"content": {"role": "model", "parts": [{"text": "done"}]}}
+            ),
+        },
+    )
+    invoke.children = [call_llm_1, *tool_spans, call_llm_2]
+    return Trace(
+        trace_id="t1",
+        root_spans=[invoke],
+        all_spans=[invoke, call_llm_1, *tool_spans, call_llm_2],
+    )
+
+
+def _make_eval_set_json(tools: list[str]) -> dict:
+    return {
+        "eval_set_id": "test",
+        "eval_cases": [{
+            "eval_id": "inv_1",
+            "conversation": [{
+                "invocation_id": "inv_1",
+                "user_content": {"role": "user", "parts": [{"text": "do something"}]},
+                "final_response": {"role": "model", "parts": [{"text": "done"}]},
+                "intermediate_data": {
+                    "tool_uses": [{"name": t, "args": {}, "id": f"e{i}"} for i, t in enumerate(tools)],
+                    "tool_responses": [],
+                },
+            }],
+        }],
+    }
+
 SAMPLES_DIR = os.path.join(os.path.dirname(__file__), "..", "samples")
 HELM_TRACE = os.path.join(SAMPLES_DIR, "helm.json")
 HELM_3_TRACE = os.path.join(SAMPLES_DIR, "helm_3.json")
@@ -168,3 +232,46 @@ def testextract_trace_metadata_adk(self):
         assert "helm" in metadata["user_input_preview"].lower()
         assert metadata["final_output_preview"] is not None
         assert len(metadata["final_output_preview"]) > 0
+
+
+class TestTrajectoryMatchType:
+    """Verify trajectory_match_type produces different scores on the same trace.
+
+    Actual calls [get, list], expected calls [list, get].
+    EXACT and IN_ORDER fail; ANY_ORDER passes.
+    """
+
+    def _run(self, match_type, tmp_path):
+        conv_result = convert_traces([_make_tool_trace(["helm_get_release", "helm_list_releases"])])[0]
+
+        eval_set_path = tmp_path / "eval_set.json"
+        eval_set_path.write_text(json.dumps(_make_eval_set_json(["helm_list_releases", "helm_get_release"])))
+        eval_set = load_eval_set(str(eval_set_path))
+
+        return asyncio.run(
+            _evaluate_trace(
+                conv_result=conv_result,
+                metrics=["tool_trajectory_avg_score"],
+                custom_evaluators=[],
+                eval_set=eval_set,
+                judge_model=None,
+                threshold=0.5,
+                trajectory_match_type=match_type,
+                eval_semaphore=asyncio.Semaphore(1),
+            )
+        )
+
+    def test_exact_fails(self, tmp_path):
+        mr = self._run(None, tmp_path).metric_results[0]
+        assert mr.score == 0.0
+        assert mr.eval_status == "FAILED"
+
+    def test_any_order_passes(self, tmp_path):
+        mr = self._run("ANY_ORDER", tmp_path).metric_results[0]
+        assert mr.score == 1.0
+        assert mr.eval_status == "PASSED"
+
+    def test_in_order_fails(self, tmp_path):
+        mr = self._run("IN_ORDER", tmp_path).metric_results[0]
+        assert mr.score == 0.0
+        assert mr.eval_status == "FAILED"
diff --git a/ui/src/context/TraceProvider.tsx b/ui/src/context/TraceProvider.tsx
index 1c0eb40..2ffdd22 100644
--- a/ui/src/context/TraceProvider.tsx
+++ b/ui/src/context/TraceProvider.tsx
@@ -137,7 +137,7 @@ export const TraceProvider: React.FC<TraceProviderProps> = ({ children }) => {
               metrics: state.selectedMetrics,
               judgeModel: state.judgeModel,
               threshold: state.threshold,
-              trajectoryMatchType: state.trajectoryMatchType !== 'EXACT' ? state.trajectoryMatchType : undefined,
+              trajectoryMatchType: state.trajectoryMatchType,
             },
             (message) => {
               setState((prev) => ({ ...prev, progressMessage: message }));

From bb3c0a28fa876011279112ea0568c9fb80045149 Mon Sep 17 00:00:00 2001
From: ossama-ferjani-work <ossama.ferjani@distribusion.com>
Date: Wed, 1 Apr 2026 16:46:10 +0200
Subject: [PATCH 3/5] fix comments

---
 src/agentevals/api/streaming_routes.py | 4 ++--
 src/agentevals/runner.py               | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/agentevals/api/streaming_routes.py b/src/agentevals/api/streaming_routes.py
index 811a513..4ad76d6 100644
--- a/src/agentevals/api/streaming_routes.py
+++ b/src/agentevals/api/streaming_routes.py
@@ -5,7 +5,7 @@
 import asyncio
 import json
 import logging
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 from fastapi import APIRouter, Depends, HTTPException
 from fastapi.responses import FileResponse
@@ -46,7 +46,7 @@ class EvaluateSessionsRequest(BaseModel):
     eval_set_id: str
     metrics: list[str] = ["tool_trajectory_avg_score"]
     judge_model: str = "gemini-2.5-flash"
-    trajectory_match_type: str | None = None
+    trajectory_match_type: Literal["EXACT", "IN_ORDER", "ANY_ORDER"] | None = None
 
 
 class PrepareEvaluationRequest(BaseModel):
diff --git a/src/agentevals/runner.py b/src/agentevals/runner.py
index c838040..0c6134e 100644
--- a/src/agentevals/runner.py
+++ b/src/agentevals/runner.py
@@ -197,7 +197,7 @@ async def _evaluate_trace(
     eval_set: EvalSet | None,
     judge_model: str | None,
     threshold: float | None,
-    eval_semaphore: asyncio.Semaphore = None,
+    eval_semaphore: asyncio.Semaphore,
     progress_callback: ProgressCallback | None = None,
     trace_progress_callback: TraceProgressCallback | None = None,
     trace=None,

From c731501f9013660ae36f0a65f746d661f313843e Mon Sep 17 00:00:00 2001
From: ossama-ferjani-work <ossama.ferjani@distribusion.com>
Date: Wed, 1 Apr 2026 17:30:03 +0200
Subject: [PATCH 4/5] fix linting

---
 src/agentevals/builtin_metrics.py | 16 +++++--
 tests/test_runner.py              | 78 ++++++++++++++++++++++---------
 2 files changed, 68 insertions(+), 26 deletions(-)

diff --git a/src/agentevals/builtin_metrics.py b/src/agentevals/builtin_metrics.py
index 808de75..3a5de7e 100644
--- a/src/agentevals/builtin_metrics.py
+++ b/src/agentevals/builtin_metrics.py
@@ -60,7 +60,7 @@ def build_eval_metric(
     judge_model: str | None,
     threshold: float | None,
     rubrics: list[str] | None = None,
-    match_type: str | None = None, 
+    match_type: str | None = None,
 ) -> EvalMetric:
     """Construct an ADK ``EvalMetric`` with the appropriate criterion."""
     effective_threshold = threshold if threshold is not None else 0.5
@@ -73,7 +73,9 @@ def build_eval_metric(
             if match_type
             else ToolTrajectoryCriterion.MatchType.EXACT
         )
-        criterion = ToolTrajectoryCriterion(threshold=effective_threshold, match_type=_match)
+        criterion = ToolTrajectoryCriterion(
+            threshold=effective_threshold, match_type=_match
+        )
     elif metric_name == "final_response_match_v2":
         judge_opts = JudgeModelOptions()
         if judge_model:
@@ -111,7 +113,11 @@ def build_eval_metric(
             threshold=effective_threshold,
             judge_model_options=judge_opts,
         )
-    elif metric_name in ("response_match_score", "response_evaluation_score", "safety_v1"):
+    elif metric_name in (
+        "response_match_score",
+        "response_evaluation_score",
+        "safety_v1",
+    ):
         criterion = BaseCriterion(threshold=effective_threshold)
 
     return EvalMetric(
@@ -204,7 +210,9 @@ async def evaluate_builtin_metric(
         )
 
     try:
-        eval_metric = build_eval_metric(metric_name, judge_model, threshold, match_type=match_type)
+        eval_metric = build_eval_metric(
+            metric_name, judge_model, threshold, match_type=match_type
+        )
         evaluator: Evaluator = get_evaluator(eval_metric)
 
         if inspect.iscoroutinefunction(evaluator.evaluate_invocations):
diff --git a/tests/test_runner.py b/tests/test_runner.py
index 8778422..67b34f6 100644
--- a/tests/test_runner.py
+++ b/tests/test_runner.py
@@ -14,13 +14,21 @@
 def _make_tool_trace(tools: list[str]) -> Trace:
     """Build a minimal ADK trace calling the given tools in order."""
     invoke = Span(
-        trace_id="t1", span_id="invoke1", parent_span_id=None,
-        operation_name="invoke_agent test_agent", start_time=1000, duration=10000,
+        trace_id="t1",
+        span_id="invoke1",
+        parent_span_id=None,
+        operation_name="invoke_agent test_agent",
+        start_time=1000,
+        duration=10000,
         tags={"otel.scope.name": "gcp.vertex.agent"},
     )
     call_llm_1 = Span(
-        trace_id="t1", span_id="llm1", parent_span_id="invoke1",
-        operation_name="call_llm", start_time=2000, duration=1000,
+        trace_id="t1",
+        span_id="llm1",
+        parent_span_id="invoke1",
+        operation_name="call_llm",
+        start_time=2000,
+        duration=1000,
         tags={
             "otel.scope.name": "gcp.vertex.agent",
             "gcp.vertex.agent.llm_request": json.dumps(
@@ -30,15 +38,23 @@ def _make_tool_trace(tools: list[str]) -> Trace:
     )
     tool_spans = [
         Span(
-            trace_id="t1", span_id=f"tool{i}", parent_span_id="invoke1",
-            operation_name=f"execute_tool {name}", start_time=3000 + i * 100, duration=100,
+            trace_id="t1",
+            span_id=f"tool{i}",
+            parent_span_id="invoke1",
+            operation_name=f"execute_tool {name}",
+            start_time=3000 + i * 100,
+            duration=100,
             tags={"otel.scope.name": "gcp.vertex.agent"},
         )
         for i, name in enumerate(tools)
     ]
     call_llm_2 = Span(
-        trace_id="t1", span_id="llm2", parent_span_id="invoke1",
-        operation_name="call_llm", start_time=5000, duration=1000,
+        trace_id="t1",
+        span_id="llm2",
+        parent_span_id="invoke1",
+        operation_name="call_llm",
+        start_time=5000,
+        duration=1000,
         tags={
             "otel.scope.name": "gcp.vertex.agent",
             "gcp.vertex.agent.llm_response": json.dumps(
@@ -57,20 +73,34 @@ def _make_tool_trace(tools: list[str]) -> Trace:
 def _make_eval_set_json(tools: list[str]) -> dict:
     return {
         "eval_set_id": "test",
-        "eval_cases": [{
-            "eval_id": "inv_1",
-            "conversation": [{
-                "invocation_id": "inv_1",
-                "user_content": {"role": "user", "parts": [{"text": "do something"}]},
-                "final_response": {"role": "model", "parts": [{"text": "done"}]},
-                "intermediate_data": {
-                    "tool_uses": [{"name": t, "args": {}, "id": f"e{i}"} for i, t in enumerate(tools)],
-                    "tool_responses": [],
-                },
-            }],
-        }],
+        "eval_cases": [
+            {
+                "eval_id": "inv_1",
+                "conversation": [
+                    {
+                        "invocation_id": "inv_1",
+                        "user_content": {
+                            "role": "user",
+                            "parts": [{"text": "do something"}],
+                        },
+                        "final_response": {
+                            "role": "model",
+                            "parts": [{"text": "done"}],
+                        },
+                        "intermediate_data": {
+                            "tool_uses": [
+                                {"name": t, "args": {}, "id": f"e{i}"}
+                                for i, t in enumerate(tools)
+                            ],
+                            "tool_responses": [],
+                        },
+                    }
+                ],
+            }
+        ],
     }
 
+
 SAMPLES_DIR = os.path.join(os.path.dirname(__file__), "..", "samples")
 HELM_TRACE = os.path.join(SAMPLES_DIR, "helm.json")
 HELM_3_TRACE = os.path.join(SAMPLES_DIR, "helm_3.json")
@@ -242,10 +272,14 @@ class TestTrajectoryMatchType:
     """
 
     def _run(self, match_type, tmp_path):
-        conv_result = convert_traces([_make_tool_trace(["helm_get_release", "helm_list_releases"])])[0]
+        conv_result = convert_traces(
+            [_make_tool_trace(["helm_get_release", "helm_list_releases"])]
+        )[0]
 
         eval_set_path = tmp_path / "eval_set.json"
-        eval_set_path.write_text(json.dumps(_make_eval_set_json(["helm_list_releases", "helm_get_release"])))
+        eval_set_path.write_text(
+            json.dumps(_make_eval_set_json(["helm_list_releases", "helm_get_release"]))
+        )
         eval_set = load_eval_set(str(eval_set_path))
 
         return asyncio.run(

From 235aa0f8012e44791e129fd3444b47c32d243b17 Mon Sep 17 00:00:00 2001
From: ossama-ferjani-work <ossama.ferjani@distribusion.com>
Date: Wed, 1 Apr 2026 17:42:51 +0200
Subject: [PATCH 5/5] ruff-linting fixed

---
 src/agentevals/builtin_metrics.py | 12 +++---------
 tests/test_runner.py              | 17 ++++-------------
 2 files changed, 7 insertions(+), 22 deletions(-)

diff --git a/src/agentevals/builtin_metrics.py b/src/agentevals/builtin_metrics.py
index 3a5de7e..59a2892 100644
--- a/src/agentevals/builtin_metrics.py
+++ b/src/agentevals/builtin_metrics.py
@@ -69,13 +69,9 @@ def build_eval_metric(
 
     if metric_name == "tool_trajectory_avg_score":
         _match = (
-            ToolTrajectoryCriterion.MatchType[match_type]
-            if match_type
-            else ToolTrajectoryCriterion.MatchType.EXACT
-        )
-        criterion = ToolTrajectoryCriterion(
-            threshold=effective_threshold, match_type=_match
+            ToolTrajectoryCriterion.MatchType[match_type] if match_type else ToolTrajectoryCriterion.MatchType.EXACT
         )
+        criterion = ToolTrajectoryCriterion(threshold=effective_threshold, match_type=_match)
     elif metric_name == "final_response_match_v2":
         judge_opts = JudgeModelOptions()
         if judge_model:
@@ -210,9 +206,7 @@ async def evaluate_builtin_metric(
         )
 
     try:
-        eval_metric = build_eval_metric(
-            metric_name, judge_model, threshold, match_type=match_type
-        )
+        eval_metric = build_eval_metric(metric_name, judge_model, threshold, match_type=match_type)
         evaluator: Evaluator = get_evaluator(eval_metric)
 
         if inspect.iscoroutinefunction(evaluator.evaluate_invocations):
diff --git a/tests/test_runner.py b/tests/test_runner.py
index 67b34f6..06d5e4b 100644
--- a/tests/test_runner.py
+++ b/tests/test_runner.py
@@ -57,9 +57,7 @@ def _make_tool_trace(tools: list[str]) -> Trace:
         duration=1000,
         tags={
             "otel.scope.name": "gcp.vertex.agent",
-            "gcp.vertex.agent.llm_response": json.dumps(
-                {"content": {"role": "model", "parts": [{"text": "done"}]}}
-            ),
+            "gcp.vertex.agent.llm_response": json.dumps({"content": {"role": "model", "parts": [{"text": "done"}]}}),
         },
     )
     invoke.children = [call_llm_1, *tool_spans, call_llm_2]
@@ -88,10 +86,7 @@ def _make_eval_set_json(tools: list[str]) -> dict:
                             "parts": [{"text": "done"}],
                         },
                         "intermediate_data": {
-                            "tool_uses": [
-                                {"name": t, "args": {}, "id": f"e{i}"}
-                                for i, t in enumerate(tools)
-                            ],
+                            "tool_uses": [{"name": t, "args": {}, "id": f"e{i}"} for i, t in enumerate(tools)],
                             "tool_responses": [],
                         },
                     }
@@ -272,14 +267,10 @@ class TestTrajectoryMatchType:
     """
 
     def _run(self, match_type, tmp_path):
-        conv_result = convert_traces(
-            [_make_tool_trace(["helm_get_release", "helm_list_releases"])]
-        )[0]
+        conv_result = convert_traces([_make_tool_trace(["helm_get_release", "helm_list_releases"])])[0]
 
         eval_set_path = tmp_path / "eval_set.json"
-        eval_set_path.write_text(
-            json.dumps(_make_eval_set_json(["helm_list_releases", "helm_get_release"]))
-        )
+        eval_set_path.write_text(json.dumps(_make_eval_set_json(["helm_list_releases", "helm_get_release"])))
         eval_set = load_eval_set(str(eval_set_path))
 
         return asyncio.run(