From 2514426f16d6671f745c96e653d9567d463f132d Mon Sep 17 00:00:00 2001 From: ossama-ferjani-work Date: Wed, 1 Apr 2026 14:17:08 +0200 Subject: [PATCH 1/5] Expose all tool_trajectory_avg_score match types --- samples/eval_set_multi_tool.json | 34 ++++++++++ samples/helm_reversed_order.json | 84 +++++++++++++++++++++++++ src/agentevals/api/routes.py | 2 + src/agentevals/api/streaming_routes.py | 2 + src/agentevals/builtin_metrics.py | 11 +++- src/agentevals/cli.py | 9 +++ src/agentevals/config.py | 5 ++ src/agentevals/eval_config_loader.py | 4 ++ src/agentevals/runner.py | 5 +- ui/src/components/upload/UploadView.tsx | 20 ++++++ ui/src/context/TraceContext.tsx | 2 + ui/src/context/TraceProvider.tsx | 7 ++- ui/src/lib/types.ts | 1 + 13 files changed, 182 insertions(+), 4 deletions(-) create mode 100644 samples/eval_set_multi_tool.json create mode 100644 samples/helm_reversed_order.json diff --git a/samples/eval_set_multi_tool.json b/samples/eval_set_multi_tool.json new file mode 100644 index 0000000..cafd22f --- /dev/null +++ b/samples/eval_set_multi_tool.json @@ -0,0 +1,34 @@ +{ + "eval_set_id": "helm_multi_tool_eval_set", + "name": "Helm Multi-Tool Eval Set", + "description": "Golden eval case with two tools in a specific order for match-type smoke testing.", + "eval_cases": [ + { + "eval_id": "helm_multi_tool_inv_1", + "conversation": [ + { + "invocation_id": "helm_multi_tool_inv_1", + "user_content": { + "role": "user", + "parts": [ + {"text": "list all Helm releases and get details for kagent"} + ] + }, + "final_response": { + "role": "model", + "parts": [ + {"text": "I found the releases and their details."} + ] + }, + "intermediate_data": { + "tool_uses": [ + {"name": "helm_list_releases", "args": {}, "id": "call_e1"}, + {"name": "helm_get_release", "args": {}, "id": "call_e2"} + ], + "tool_responses": [] + } + } + ] + } + ] +} diff --git a/samples/helm_reversed_order.json b/samples/helm_reversed_order.json new file mode 100644 index 0000000..2e20db1 --- /dev/null +++ b/samples/helm_reversed_order.json @@ -0,0 +1,84 @@ +{ + "data": [ + { + "traceID": "aabbccdd11223344aabbccdd11223344", + "spans": [ + { + "traceID": "aabbccdd11223344aabbccdd11223344", + "spanID": "invoke00000001", + "operationName": "invoke_agent helm_agent", + "references": [], + "startTime": 1771900000000000, + "duration": 5000000, + "tags": [ + {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"}, + {"key": "otel.scope.version", "type": "string", "value": "1.0.0"}, + {"key": "gcp.vertex.agent.invocation_id", "type": "string", "value": "helm_multi_tool_inv_1"}, + {"key": "gen_ai.agent.name", "type": "string", "value": "helm_agent"} + ], + "processID": "p1" + }, + { + "traceID": "aabbccdd11223344aabbccdd11223344", + "spanID": "calllm000000001", + "operationName": "call_llm", + "references": [ + {"refType": "CHILD_OF", "traceID": "aabbccdd11223344aabbccdd11223344", "spanID": "invoke00000001"} + ], + "startTime": 1771900000100000, + "duration": 1000000, + "tags": [ + {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"}, + {"key": "gcp.vertex.agent.llm_request", "type": "string", "value": "{\"contents\": [{\"role\": \"user\", \"parts\": [{\"text\": \"list all Helm releases and get details for kagent\"}]}]}"}, + {"key": "gen_ai.request.model", "type": "string", "value": "gemini-2.0-flash"} + ], + "processID": "p1" + }, + { + "traceID": "aabbccdd11223344aabbccdd11223344", + "spanID": "toolget00000001", + "operationName": "execute_tool helm_get_release", + "references": [ + {"refType": "CHILD_OF", "traceID": "aabbccdd11223344aabbccdd11223344", "spanID": "invoke00000001"} + ], + "startTime": 1771900001200000, + "duration": 200000, + "tags": [ + {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"} + ], + "processID": "p1" + }, + { + "traceID": "aabbccdd11223344aabbccdd11223344", + "spanID": "toollist0000001", + "operationName": "execute_tool helm_list_releases", + "references": [ + {"refType": "CHILD_OF", "traceID": "aabbccdd11223344aabbccdd11223344", "spanID": "invoke00000001"} + ], + "startTime": 1771900001500000, + "duration": 200000, + "tags": [ + {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"} + ], + "processID": "p1" + }, + { + "traceID": "aabbccdd11223344aabbccdd11223344", + "spanID": "calllm000000002", + "operationName": "call_llm", + "references": [ + {"refType": "CHILD_OF", "traceID": "aabbccdd11223344aabbccdd11223344", "spanID": "invoke00000001"} + ], + "startTime": 1771900002000000, + "duration": 1000000, + "tags": [ + {"key": "otel.scope.name", "type": "string", "value": "gcp.vertex.agent"}, + {"key": "gcp.vertex.agent.llm_response", "type": "string", "value": "{\"content\": {\"role\": \"model\", \"parts\": [{\"text\": \"I found the releases and their details.\"}]}}"}, + {"key": "gen_ai.request.model", "type": "string", "value": "gemini-2.0-flash"} + ], + "processID": "p1" + } + ] + } + ] +} diff --git a/src/agentevals/api/routes.py b/src/agentevals/api/routes.py index 46803e9..0689130 100644 --- a/src/agentevals/api/routes.py +++ b/src/agentevals/api/routes.py @@ -507,6 +507,7 @@ async def evaluate_traces( trace_format=trace_format, judge_model=config_dict.get("judgeModel"), threshold=threshold, + trajectory_match_type=config_dict.get("trajectoryMatchType"), ) logger.info(f"Evaluating {len(trace_paths)} trace file(s) with metrics: {metrics}") @@ -615,6 +616,7 @@ async def event_generator(): trace_format=trace_format, judge_model=config_dict.get("judgeModel"), threshold=threshold, + trajectory_match_type=config_dict.get("trajectoryMatchType"), ) loader = get_loader(eval_config.trace_format) diff --git a/src/agentevals/api/streaming_routes.py b/src/agentevals/api/streaming_routes.py index 6fff215..811a513 100644 --- a/src/agentevals/api/streaming_routes.py +++ b/src/agentevals/api/streaming_routes.py @@ -46,6 +46,7 @@ class EvaluateSessionsRequest(BaseModel): eval_set_id: str metrics: list[str] = ["tool_trajectory_avg_score"] judge_model: str = "gemini-2.5-flash" + trajectory_match_type: str | None = None class PrepareEvaluationRequest(BaseModel): @@ -210,6 +211,7 @@ async def eval_one_session(session_id: str, session) -> SessionEvalResult: eval_set_file=eval_set_file.name, metrics=request.metrics, judge_model=request.judge_model, + trajectory_match_type=request.trajectory_match_type, ) eval_result = await run_evaluation(config) diff --git a/src/agentevals/builtin_metrics.py b/src/agentevals/builtin_metrics.py index 0e03562..f70ddc2 100644 --- a/src/agentevals/builtin_metrics.py +++ b/src/agentevals/builtin_metrics.py @@ -60,6 +60,7 @@ def build_eval_metric( judge_model: str | None, threshold: float | None, rubrics: list[str] | None = None, + match_type: str | None = None, ) -> EvalMetric: """Construct an ADK ``EvalMetric`` with the appropriate criterion.""" effective_threshold = threshold if threshold is not None else 0.5 @@ -67,7 +68,12 @@ def build_eval_metric( criterion: BaseCriterion | None = None if metric_name == "tool_trajectory_avg_score": - criterion = ToolTrajectoryCriterion(threshold=effective_threshold) + _match = ( + ToolTrajectoryCriterion.MatchType[match_type.upper()] + if match_type + else ToolTrajectoryCriterion.MatchType.EXACT + ) + criterion = ToolTrajectoryCriterion(threshold=effective_threshold, matchType=_match) elif metric_name == "final_response_match_v2": judge_opts = JudgeModelOptions() if judge_model: @@ -179,6 +185,7 @@ async def evaluate_builtin_metric( expected_invocations: list[Invocation] | None, judge_model: str | None, threshold: float | None, + match_type: str | None = None, ) -> dict[str, Any]: """Evaluate a single built-in ADK metric. @@ -197,7 +204,7 @@ async def evaluate_builtin_metric( ) try: - eval_metric = build_eval_metric(metric_name, judge_model, threshold) + eval_metric = build_eval_metric(metric_name, judge_model, threshold, match_type=match_type) evaluator: Evaluator = get_evaluator(eval_metric) if inspect.iscoroutinefunction(evaluator.evaluate_invocations): diff --git a/src/agentevals/cli.py b/src/agentevals/cli.py index bb36352..5c4195f 100644 --- a/src/agentevals/cli.py +++ b/src/agentevals/cli.py @@ -108,6 +108,12 @@ def main(verbose: int) -> None: default=None, help="Score threshold for pass/fail.", ) +@click.option( + "--trajectory-match-type", + type=click.Choice(["EXACT", "IN_ORDER", "ANY_ORDER"], case_sensitive=False), + default=None, + help="Match type for tool_trajectory_avg_score: EXACT (default), IN_ORDER, or ANY_ORDER.", +) @click.option( "--output", "-o", @@ -130,6 +136,7 @@ def run( trace_format: str, judge_model: str | None, threshold: float | None, + trajectory_match_type: str | None, output: str, config_file: str | None, ) -> None: @@ -152,6 +159,7 @@ def run( trace_format=trace_format, judge_model=judge_model, threshold=threshold, + trajectory_match_type=trajectory_match_type, output_format=output, ) config = merge_configs(file_config, cli_config) @@ -164,6 +172,7 @@ def run( trace_format=trace_format, judge_model=judge_model, threshold=threshold, + trajectory_match_type=trajectory_match_type, output_format=output, ) diff --git a/src/agentevals/config.py b/src/agentevals/config.py index 3278c19..136bfdd 100644 --- a/src/agentevals/config.py +++ b/src/agentevals/config.py @@ -132,6 +132,11 @@ class EvalRunConfig(BaseModel): description="Score threshold for pass/fail.", ) + trajectory_match_type: str | None = Field( + default=None, + description="Match type for tool_trajectory_avg_score: 'EXACT', 'IN_ORDER', or 'ANY_ORDER'. Default: EXACT.", + ) + output_format: str = Field( default="table", description="Output format: 'table', 'json', or 'summary'.", diff --git a/src/agentevals/eval_config_loader.py b/src/agentevals/eval_config_loader.py index 5ddc7cc..e3b35b1 100644 --- a/src/agentevals/eval_config_loader.py +++ b/src/agentevals/eval_config_loader.py @@ -111,6 +111,8 @@ def load_eval_config(path: str | Path) -> EvalRunConfig: config.judge_model = data["judge_model"] if "threshold" in data: config.threshold = float(data["threshold"]) + if "trajectory_match_type" in data: + config.trajectory_match_type = data["trajectory_match_type"] if "trace_format" in data: config.trace_format = data["trace_format"] @@ -136,6 +138,8 @@ def merge_configs(file_config: EvalRunConfig, cli_config: EvalRunConfig) -> Eval merged.judge_model = cli_config.judge_model if cli_config.threshold is not None: merged.threshold = cli_config.threshold + if cli_config.trajectory_match_type is not None: + merged.trajectory_match_type = cli_config.trajectory_match_type if cli_config.trace_format != "jaeger-json": merged.trace_format = cli_config.trace_format if cli_config.output_format != "table": diff --git a/src/agentevals/runner.py b/src/agentevals/runner.py index 392082c..fc16990 100644 --- a/src/agentevals/runner.py +++ b/src/agentevals/runner.py @@ -140,6 +140,7 @@ async def _evaluate_trace_bounded(idx: int, conv_result: ConversionResult) -> Tr eval_set=eval_set, judge_model=config.judge_model, threshold=config.threshold, + trajectory_match_type=config.trajectory_match_type, eval_semaphore=eval_semaphore, progress_callback=progress_callback, trace_progress_callback=trace_progress_callback, @@ -196,7 +197,8 @@ async def _evaluate_trace( eval_set: EvalSet | None, judge_model: str | None, threshold: float | None, - eval_semaphore: asyncio.Semaphore, + trajectory_match_type: str | None = None, + eval_semaphore: asyncio.Semaphore = None, progress_callback: ProgressCallback | None = None, trace_progress_callback: TraceProgressCallback | None = None, trace=None, @@ -243,6 +245,7 @@ async def _eval_builtin_with_semaphore(metric_name: str) -> MetricResult: expected_invocations=expected_invocations, judge_model=judge_model, threshold=threshold, + match_type=trajectory_match_type, ) result.duration_ms = (time.monotonic() - t0) * 1000 return await _append_result(result) diff --git a/ui/src/components/upload/UploadView.tsx b/ui/src/components/upload/UploadView.tsx index 7ed6d9e..6e047fe 100644 --- a/ui/src/components/upload/UploadView.tsx +++ b/ui/src/components/upload/UploadView.tsx @@ -525,6 +525,26 @@ export const UploadView: React.FC = () => { Minimum score to pass + + {state.selectedMetrics.includes('tool_trajectory_avg_score') && ( +
+ +