diff --git a/src/agentevals/api/routes.py b/src/agentevals/api/routes.py
index 46803e9..0689130 100644
--- a/src/agentevals/api/routes.py
+++ b/src/agentevals/api/routes.py
@@ -507,6 +507,7 @@ async def evaluate_traces(
         trace_format=trace_format,
         judge_model=config_dict.get("judgeModel"),
         threshold=threshold,
+        trajectory_match_type=config_dict.get("trajectoryMatchType"),
     )
 
     logger.info(f"Evaluating {len(trace_paths)} trace file(s) with metrics: {metrics}")
@@ -615,6 +616,7 @@ async def event_generator():
             trace_format=trace_format,
             judge_model=config_dict.get("judgeModel"),
             threshold=threshold,
+            trajectory_match_type=config_dict.get("trajectoryMatchType"),
         )
 
         loader = get_loader(eval_config.trace_format)
diff --git a/src/agentevals/api/streaming_routes.py b/src/agentevals/api/streaming_routes.py
index 6fff215..4ad76d6 100644
--- a/src/agentevals/api/streaming_routes.py
+++ b/src/agentevals/api/streaming_routes.py
@@ -5,7 +5,7 @@
 import asyncio
 import json
 import logging
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 from fastapi import APIRouter, Depends, HTTPException
 from fastapi.responses import FileResponse
@@ -46,6 +46,7 @@ class EvaluateSessionsRequest(BaseModel):
     eval_set_id: str
     metrics: list[str] = ["tool_trajectory_avg_score"]
     judge_model: str = "gemini-2.5-flash"
+    trajectory_match_type: Literal["EXACT", "IN_ORDER", "ANY_ORDER"] | None = None
 
 
 class PrepareEvaluationRequest(BaseModel):
@@ -210,6 +211,7 @@ async def eval_one_session(session_id: str, session) -> SessionEvalResult:
             eval_set_file=eval_set_file.name,
             metrics=request.metrics,
             judge_model=request.judge_model,
+            trajectory_match_type=request.trajectory_match_type,
         )
 
         eval_result = await run_evaluation(config)
diff --git a/src/agentevals/builtin_metrics.py b/src/agentevals/builtin_metrics.py
index 0e03562..59a2892 100644
--- a/src/agentevals/builtin_metrics.py
+++ b/src/agentevals/builtin_metrics.py
@@ -60,6 +60,7 @@ def build_eval_metric(
     judge_model: str | None,
     threshold: float | None,
     rubrics: list[str] | None = None,
+    match_type: str | None = None,
 ) -> EvalMetric:
     """Construct an ADK ``EvalMetric`` with the appropriate criterion."""
     effective_threshold = threshold if threshold is not None else 0.5
@@ -67,7 +68,10 @@ def build_eval_metric(
     criterion: BaseCriterion | None = None
 
     if metric_name == "tool_trajectory_avg_score":
-        criterion = ToolTrajectoryCriterion(threshold=effective_threshold)
+        _match = (
+            ToolTrajectoryCriterion.MatchType[match_type] if match_type else ToolTrajectoryCriterion.MatchType.EXACT
+        )
+        criterion = ToolTrajectoryCriterion(threshold=effective_threshold, match_type=_match)
     elif metric_name == "final_response_match_v2":
         judge_opts = JudgeModelOptions()
         if judge_model:
@@ -105,7 +109,11 @@ def build_eval_metric(
             threshold=effective_threshold,
             judge_model_options=judge_opts,
         )
-    elif metric_name in ("response_match_score", "response_evaluation_score", "safety_v1"):
+    elif metric_name in (
+        "response_match_score",
+        "response_evaluation_score",
+        "safety_v1",
+    ):
         criterion = BaseCriterion(threshold=effective_threshold)
 
     return EvalMetric(
@@ -179,6 +187,7 @@ async def evaluate_builtin_metric(
     expected_invocations: list[Invocation] | None,
     judge_model: str | None,
     threshold: float | None,
+    match_type: str | None = None,
 ) -> dict[str, Any]:
     """Evaluate a single built-in ADK metric.
 
@@ -197,7 +206,7 @@ async def evaluate_builtin_metric(
     )
 
     try:
-        eval_metric = build_eval_metric(metric_name, judge_model, threshold)
+        eval_metric = build_eval_metric(metric_name, judge_model, threshold, match_type=match_type)
         evaluator: Evaluator = get_evaluator(eval_metric)
 
         if inspect.iscoroutinefunction(evaluator.evaluate_invocations):
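The three match types only change how the actual tool-call sequence from the trace is compared against the expected one from the eval set. The sketch below is illustrative only — it is not ADK's `ToolTrajectoryCriterion` implementation, `actual`/`expected` are plain lists of tool names, and the "extras tolerated" reading of IN_ORDER is an assumption — but its behaviour is consistent with the new tests further down in this patch:

```python
# Illustrative approximation of the three trajectory match types; the real
# scoring is done by ADK's ToolTrajectoryCriterion-backed evaluator.
def trajectory_matches(actual: list[str], expected: list[str], match_type: str = "EXACT") -> bool:
    if match_type == "EXACT":
        # Same tool calls, same order, nothing extra.
        return actual == expected
    if match_type == "IN_ORDER":
        # Expected calls must appear as a subsequence: order preserved,
        # extra calls in between tolerated (assumed reading of IN_ORDER).
        it = iter(actual)
        return all(name in it for name in expected)
    if match_type == "ANY_ORDER":
        # Every expected call must appear somewhere, order ignored.
        return all(actual.count(n) >= expected.count(n) for n in set(expected))
    raise ValueError(f"unknown match_type: {match_type}")


# Mirrors the scenario in tests/test_runner.py: actual [get, list] vs expected [list, get].
assert not trajectory_matches(["get", "list"], ["list", "get"], "EXACT")
assert not trajectory_matches(["get", "list"], ["list", "get"], "IN_ORDER")
assert trajectory_matches(["get", "list"], ["list", "get"], "ANY_ORDER")
```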
diff --git a/src/agentevals/cli.py b/src/agentevals/cli.py
index bb36352..5c4195f 100644
--- a/src/agentevals/cli.py
+++ b/src/agentevals/cli.py
@@ -108,6 +108,12 @@ def main(verbose: int) -> None:
     default=None,
     help="Score threshold for pass/fail.",
 )
+@click.option(
+    "--trajectory-match-type",
+    type=click.Choice(["EXACT", "IN_ORDER", "ANY_ORDER"], case_sensitive=False),
+    default=None,
+    help="Match type for tool_trajectory_avg_score: EXACT (default), IN_ORDER, or ANY_ORDER.",
+)
 @click.option(
     "--output",
     "-o",
@@ -130,6 +136,7 @@ def run(
     trace_format: str,
     judge_model: str | None,
     threshold: float | None,
+    trajectory_match_type: str | None,
     output: str,
     config_file: str | None,
 ) -> None:
@@ -152,6 +159,7 @@ def run(
         trace_format=trace_format,
         judge_model=judge_model,
         threshold=threshold,
+        trajectory_match_type=trajectory_match_type,
         output_format=output,
     )
     config = merge_configs(file_config, cli_config)
@@ -164,6 +172,7 @@ def run(
         trace_format=trace_format,
         judge_model=judge_model,
         threshold=threshold,
+        trajectory_match_type=trajectory_match_type,
         output_format=output,
     )
 
diff --git a/src/agentevals/config.py b/src/agentevals/config.py
index 3278c19..f7a3149 100644
--- a/src/agentevals/config.py
+++ b/src/agentevals/config.py
@@ -132,6 +132,19 @@ class EvalRunConfig(BaseModel):
         description="Score threshold for pass/fail.",
     )
 
+    trajectory_match_type: str | None = Field(
+        default=None,
+        description="Match type for tool_trajectory_avg_score: 'EXACT', 'IN_ORDER', or 'ANY_ORDER'. Default: EXACT.",
+    )
+
+    @field_validator("trajectory_match_type")
+    @classmethod
+    def _validate_trajectory_match_type(cls, v: str | None) -> str | None:
+        valid = {"EXACT", "IN_ORDER", "ANY_ORDER"}
+        if v is not None and v.upper() not in valid:
+            raise ValueError(f"Invalid trajectory_match_type '{v}'. Valid values: {sorted(valid)}")
+        return v.upper() if v is not None else v
+
     output_format: str = Field(
         default="table",
         description="Output format: 'table', 'json', or 'summary'.",
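Because the CLI flag is case-insensitive and config files may use lower case, the validator normalizes accepted values to upper case before they reach `build_eval_metric`, and rejects anything else when the model is constructed. A small sketch of the expected behaviour (assuming `EvalRunConfig`'s remaining fields all have defaults):

```python
import pydantic

from agentevals.config import EvalRunConfig

# Valid values are accepted in any case and normalized to upper case.
cfg = EvalRunConfig(trajectory_match_type="any_order")
assert cfg.trajectory_match_type == "ANY_ORDER"

# Unknown values fail validation when the model is built.
try:
    EvalRunConfig(trajectory_match_type="SOMETIMES")
except pydantic.ValidationError as exc:
    # Message includes: Invalid trajectory_match_type 'SOMETIMES'. Valid values: [...]
    print(exc)
```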
Valid values: {sorted(valid)}") + return v.upper() if v is not None else v + output_format: str = Field( default="table", description="Output format: 'table', 'json', or 'summary'.", diff --git a/src/agentevals/eval_config_loader.py b/src/agentevals/eval_config_loader.py index 5ddc7cc..e3b35b1 100644 --- a/src/agentevals/eval_config_loader.py +++ b/src/agentevals/eval_config_loader.py @@ -111,6 +111,8 @@ def load_eval_config(path: str | Path) -> EvalRunConfig: config.judge_model = data["judge_model"] if "threshold" in data: config.threshold = float(data["threshold"]) + if "trajectory_match_type" in data: + config.trajectory_match_type = data["trajectory_match_type"] if "trace_format" in data: config.trace_format = data["trace_format"] @@ -136,6 +138,8 @@ def merge_configs(file_config: EvalRunConfig, cli_config: EvalRunConfig) -> Eval merged.judge_model = cli_config.judge_model if cli_config.threshold is not None: merged.threshold = cli_config.threshold + if cli_config.trajectory_match_type is not None: + merged.trajectory_match_type = cli_config.trajectory_match_type if cli_config.trace_format != "jaeger-json": merged.trace_format = cli_config.trace_format if cli_config.output_format != "table": diff --git a/src/agentevals/runner.py b/src/agentevals/runner.py index 392082c..0c6134e 100644 --- a/src/agentevals/runner.py +++ b/src/agentevals/runner.py @@ -140,6 +140,7 @@ async def _evaluate_trace_bounded(idx: int, conv_result: ConversionResult) -> Tr eval_set=eval_set, judge_model=config.judge_model, threshold=config.threshold, + trajectory_match_type=config.trajectory_match_type, eval_semaphore=eval_semaphore, progress_callback=progress_callback, trace_progress_callback=trace_progress_callback, @@ -201,6 +202,7 @@ async def _evaluate_trace( trace_progress_callback: TraceProgressCallback | None = None, trace=None, performance_metrics: dict[str, Any] | None = None, + trajectory_match_type: str | None = None, ) -> TraceResult: trace_result = TraceResult( trace_id=conv_result.trace_id, @@ -243,6 +245,7 @@ async def _eval_builtin_with_semaphore(metric_name: str) -> MetricResult: expected_invocations=expected_invocations, judge_model=judge_model, threshold=threshold, + match_type=trajectory_match_type, ) result.duration_ms = (time.monotonic() - t0) * 1000 return await _append_result(result) diff --git a/tests/test_runner.py b/tests/test_runner.py index ce57950..06d5e4b 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -1,12 +1,101 @@ import asyncio +import json import os import pytest from agentevals.config import EvalRunConfig -from agentevals.runner import load_eval_set, run_evaluation +from agentevals.converter import convert_traces +from agentevals.loader.base import Span, Trace +from agentevals.runner import _evaluate_trace, load_eval_set, run_evaluation from agentevals.trace_metrics import extract_trace_metadata + +def _make_tool_trace(tools: list[str]) -> Trace: + """Build a minimal ADK trace calling the given tools in order.""" + invoke = Span( + trace_id="t1", + span_id="invoke1", + parent_span_id=None, + operation_name="invoke_agent test_agent", + start_time=1000, + duration=10000, + tags={"otel.scope.name": "gcp.vertex.agent"}, + ) + call_llm_1 = Span( + trace_id="t1", + span_id="llm1", + parent_span_id="invoke1", + operation_name="call_llm", + start_time=2000, + duration=1000, + tags={ + "otel.scope.name": "gcp.vertex.agent", + "gcp.vertex.agent.llm_request": json.dumps( + {"contents": [{"role": "user", "parts": [{"text": "do something"}]}]} + ), + }, + ) + tool_spans = 
diff --git a/tests/test_runner.py b/tests/test_runner.py
index ce57950..06d5e4b 100644
--- a/tests/test_runner.py
+++ b/tests/test_runner.py
@@ -1,12 +1,101 @@
 import asyncio
+import json
 import os
 
 import pytest
 
 from agentevals.config import EvalRunConfig
-from agentevals.runner import load_eval_set, run_evaluation
+from agentevals.converter import convert_traces
+from agentevals.loader.base import Span, Trace
+from agentevals.runner import _evaluate_trace, load_eval_set, run_evaluation
 from agentevals.trace_metrics import extract_trace_metadata
 
+
+def _make_tool_trace(tools: list[str]) -> Trace:
+    """Build a minimal ADK trace calling the given tools in order."""
+    invoke = Span(
+        trace_id="t1",
+        span_id="invoke1",
+        parent_span_id=None,
+        operation_name="invoke_agent test_agent",
+        start_time=1000,
+        duration=10000,
+        tags={"otel.scope.name": "gcp.vertex.agent"},
+    )
+    call_llm_1 = Span(
+        trace_id="t1",
+        span_id="llm1",
+        parent_span_id="invoke1",
+        operation_name="call_llm",
+        start_time=2000,
+        duration=1000,
+        tags={
+            "otel.scope.name": "gcp.vertex.agent",
+            "gcp.vertex.agent.llm_request": json.dumps(
+                {"contents": [{"role": "user", "parts": [{"text": "do something"}]}]}
+            ),
+        },
+    )
+    tool_spans = [
+        Span(
+            trace_id="t1",
+            span_id=f"tool{i}",
+            parent_span_id="invoke1",
+            operation_name=f"execute_tool {name}",
+            start_time=3000 + i * 100,
+            duration=100,
+            tags={"otel.scope.name": "gcp.vertex.agent"},
+        )
+        for i, name in enumerate(tools)
+    ]
+    call_llm_2 = Span(
+        trace_id="t1",
+        span_id="llm2",
+        parent_span_id="invoke1",
+        operation_name="call_llm",
+        start_time=5000,
+        duration=1000,
+        tags={
+            "otel.scope.name": "gcp.vertex.agent",
+            "gcp.vertex.agent.llm_response": json.dumps({"content": {"role": "model", "parts": [{"text": "done"}]}}),
+        },
+    )
+    invoke.children = [call_llm_1, *tool_spans, call_llm_2]
+    return Trace(
+        trace_id="t1",
+        root_spans=[invoke],
+        all_spans=[invoke, call_llm_1, *tool_spans, call_llm_2],
+    )
+
+
+def _make_eval_set_json(tools: list[str]) -> dict:
+    return {
+        "eval_set_id": "test",
+        "eval_cases": [
+            {
+                "eval_id": "inv_1",
+                "conversation": [
+                    {
+                        "invocation_id": "inv_1",
+                        "user_content": {
+                            "role": "user",
+                            "parts": [{"text": "do something"}],
+                        },
+                        "final_response": {
+                            "role": "model",
+                            "parts": [{"text": "done"}],
+                        },
+                        "intermediate_data": {
+                            "tool_uses": [{"name": t, "args": {}, "id": f"e{i}"} for i, t in enumerate(tools)],
+                            "tool_responses": [],
+                        },
+                    }
+                ],
+            }
+        ],
+    }
+
+
 SAMPLES_DIR = os.path.join(os.path.dirname(__file__), "..", "samples")
 HELM_TRACE = os.path.join(SAMPLES_DIR, "helm.json")
 HELM_3_TRACE = os.path.join(SAMPLES_DIR, "helm_3.json")
@@ -168,3 +257,46 @@ def testextract_trace_metadata_adk(self):
         assert "helm" in metadata["user_input_preview"].lower()
         assert metadata["final_output_preview"] is not None
         assert len(metadata["final_output_preview"]) > 0
+
+
+class TestTrajectoryMatchType:
+    """Verify trajectory_match_type produces different scores on the same trace.
+
+    Actual calls [get, list], expected calls [list, get].
+    EXACT and IN_ORDER fail; ANY_ORDER passes.
+    """
+
+    def _run(self, match_type, tmp_path):
+        conv_result = convert_traces([_make_tool_trace(["helm_get_release", "helm_list_releases"])])[0]
+
+        eval_set_path = tmp_path / "eval_set.json"
+        eval_set_path.write_text(json.dumps(_make_eval_set_json(["helm_list_releases", "helm_get_release"])))
+        eval_set = load_eval_set(str(eval_set_path))
+
+        return asyncio.run(
+            _evaluate_trace(
+                conv_result=conv_result,
+                metrics=["tool_trajectory_avg_score"],
+                custom_evaluators=[],
+                eval_set=eval_set,
+                judge_model=None,
+                threshold=0.5,
+                trajectory_match_type=match_type,
+                eval_semaphore=asyncio.Semaphore(1),
+            )
+        )
+
+    def test_exact_fails(self, tmp_path):
+        mr = self._run(None, tmp_path).metric_results[0]
+        assert mr.score == 0.0
+        assert mr.eval_status == "FAILED"
+
+    def test_any_order_passes(self, tmp_path):
+        mr = self._run("ANY_ORDER", tmp_path).metric_results[0]
+        assert mr.score == 1.0
+        assert mr.eval_status == "PASSED"
+
+    def test_in_order_fails(self, tmp_path):
+        mr = self._run("IN_ORDER", tmp_path).metric_results[0]
+        assert mr.score == 0.0
+        assert mr.eval_status == "FAILED"
diff --git a/ui/src/components/upload/UploadView.tsx b/ui/src/components/upload/UploadView.tsx
index 7ed6d9e..6e047fe 100644
--- a/ui/src/components/upload/UploadView.tsx
+++ b/ui/src/components/upload/UploadView.tsx
@@ -525,6 +525,26 @@ export const UploadView: React.FC = () => {
             Minimum score to pass
 
+
+      {state.selectedMetrics.includes('tool_trajectory_avg_score') && (
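End to end, the option only needs to be set once on the run config; the runner threads it through `_evaluate_trace` into `build_eval_metric`, and the UI exposes it only when `tool_trajectory_avg_score` is among the selected metrics. A minimal programmatic sketch — `trace_files` is an assumed name for the existing trace-input field and the paths are placeholders; every other field shown appears in this diff:

```python
import asyncio

from agentevals.config import EvalRunConfig
from agentevals.runner import run_evaluation

config = EvalRunConfig(
    trace_files=["samples/helm.json"],  # assumed field name for the input traces
    eval_set_file="samples/helm_eval_set.json",
    metrics=["tool_trajectory_avg_score"],
    threshold=0.5,
    trajectory_match_type="ANY_ORDER",  # tolerate reordered tool calls
)

result = asyncio.run(run_evaluation(config))
```

The CLI equivalent is the new `--trajectory-match-type any_order` flag, which `merge_configs` lets override whatever a config file specifies.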