2 changes: 2 additions & 0 deletions src/agentevals/api/routes.py
@@ -507,6 +507,7 @@ async def evaluate_traces(
trace_format=trace_format,
judge_model=config_dict.get("judgeModel"),
threshold=threshold,
trajectory_match_type=config_dict.get("trajectoryMatchType"),
)

logger.info(f"Evaluating {len(trace_paths)} trace file(s) with metrics: {metrics}")
@@ -615,6 +616,7 @@ async def event_generator():
trace_format=trace_format,
judge_model=config_dict.get("judgeModel"),
threshold=threshold,
trajectory_match_type=config_dict.get("trajectoryMatchType"),
)

loader = get_loader(eval_config.trace_format)
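Both handlers read the new option from the request's camelCase config dict, defaulting to None when the key is absent. A minimal sketch of the client-side payload, assuming the evaluate routes accept a JSON config object (only judgeModel and trajectoryMatchType are confirmed by this diff; any other keys would be hypothetical):

# Hypothetical evaluate-request config; key names as read by the handlers above.
config_dict = {
    "judgeModel": "gemini-2.5-flash",
    "trajectoryMatchType": "ANY_ORDER",  # EXACT (default), IN_ORDER, or ANY_ORDER
}
trajectory_match_type = config_dict.get("trajectoryMatchType")  # None when omitted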
4 changes: 3 additions & 1 deletion src/agentevals/api/streaming_routes.py
@@ -5,7 +5,7 @@
import asyncio
import json
import logging
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal

from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import FileResponse
@@ -46,6 +46,7 @@ class EvaluateSessionsRequest(BaseModel):
eval_set_id: str
metrics: list[str] = ["tool_trajectory_avg_score"]
judge_model: str = "gemini-2.5-flash"
trajectory_match_type: Literal["EXACT", "IN_ORDER", "ANY_ORDER"] | None = None


class PrepareEvaluationRequest(BaseModel):
@@ -210,6 +211,7 @@ async def eval_one_session(session_id: str, session) -> SessionEvalResult:
eval_set_file=eval_set_file.name,
metrics=request.metrics,
judge_model=request.judge_model,
trajectory_match_type=request.trajectory_match_type,
)

eval_result = await run_evaluation(config)
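Because trajectory_match_type is declared as a Literal, Pydantic rejects unknown values when the request body is parsed, before any evaluation starts. A minimal sketch of that behavior, trimmed to the relevant fields:

from typing import Literal

from pydantic import BaseModel, ValidationError

class SessionsRequestSketch(BaseModel):  # trimmed stand-in for EvaluateSessionsRequest
    eval_set_id: str
    trajectory_match_type: Literal["EXACT", "IN_ORDER", "ANY_ORDER"] | None = None

SessionsRequestSketch(eval_set_id="s1", trajectory_match_type="IN_ORDER")  # accepted
SessionsRequestSketch(eval_set_id="s1")  # accepted, defaults to None

try:
    SessionsRequestSketch(eval_set_id="s1", trajectory_match_type="in_order")
except ValidationError:
    # Literal matching is case-sensitive, so lowercase input is rejected at the API boundary.
    pass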
15 changes: 12 additions & 3 deletions src/agentevals/builtin_metrics.py
@@ -60,14 +60,18 @@ def build_eval_metric(
judge_model: str | None,
threshold: float | None,
rubrics: list[str] | None = None,
match_type: str | None = None,
) -> EvalMetric:
"""Construct an ADK ``EvalMetric`` with the appropriate criterion."""
effective_threshold = threshold if threshold is not None else 0.5

criterion: BaseCriterion | None = None

if metric_name == "tool_trajectory_avg_score":
-criterion = ToolTrajectoryCriterion(threshold=effective_threshold)
+_match = (
+    ToolTrajectoryCriterion.MatchType[match_type] if match_type else ToolTrajectoryCriterion.MatchType.EXACT
+)
+criterion = ToolTrajectoryCriterion(threshold=effective_threshold, match_type=_match)
elif metric_name == "final_response_match_v2":
judge_opts = JudgeModelOptions()
if judge_model:
@@ -105,7 +109,11 @@ def build_eval_metric(
threshold=effective_threshold,
judge_model_options=judge_opts,
)
elif metric_name in ("response_match_score", "response_evaluation_score", "safety_v1"):
elif metric_name in (
"response_match_score",
"response_evaluation_score",
"safety_v1",
):
criterion = BaseCriterion(threshold=effective_threshold)

return EvalMetric(
@@ -179,6 +187,7 @@ async def evaluate_builtin_metric(
expected_invocations: list[Invocation] | None,
judge_model: str | None,
threshold: float | None,
match_type: str | None = None,
) -> dict[str, Any]:
"""Evaluate a single built-in ADK metric.

@@ -197,7 +206,7 @@
)

try:
-eval_metric = build_eval_metric(metric_name, judge_model, threshold)
+eval_metric = build_eval_metric(metric_name, judge_model, threshold, match_type=match_type)
evaluator: Evaluator = get_evaluator(eval_metric)

if inspect.iscoroutinefunction(evaluator.evaluate_invocations):
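The enum lookup ToolTrajectoryCriterion.MatchType[match_type] raises KeyError for anything that is not an exact member name, which is why the CLI and config layers validate and uppercase the value first. An illustrative sketch with a stand-in enum (the real MatchType comes from ADK's ToolTrajectoryCriterion; member names here mirror the values used throughout this PR):

from enum import Enum

class MatchType(Enum):  # stand-in for ToolTrajectoryCriterion.MatchType
    EXACT = "EXACT"
    IN_ORDER = "IN_ORDER"
    ANY_ORDER = "ANY_ORDER"

match_type = "ANY_ORDER"
_match = MatchType[match_type] if match_type else MatchType.EXACT  # MatchType.ANY_ORDER

try:
    MatchType["any_order"]
except KeyError:
    # Name lookup is case-sensitive; unnormalized input fails here with a bare
    # KeyError rather than a friendly validation message.
    pass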
9 changes: 9 additions & 0 deletions src/agentevals/cli.py
@@ -108,6 +108,12 @@ def main(verbose: int) -> None:
default=None,
help="Score threshold for pass/fail.",
)
@click.option(
"--trajectory-match-type",
type=click.Choice(["EXACT", "IN_ORDER", "ANY_ORDER"], case_sensitive=False),
[Contributor comment] Can you please perform the same validation at all the layers where this can be a problem in the codebase?

default=None,
help="Match type for tool_trajectory_avg_score: EXACT (default), IN_ORDER, or ANY_ORDER.",
)
@click.option(
"--output",
"-o",
@@ -130,6 +136,7 @@ def run(
trace_format: str,
judge_model: str | None,
threshold: float | None,
trajectory_match_type: str | None,
output: str,
config_file: str | None,
) -> None:
@@ -152,6 +159,7 @@ def run(
trace_format=trace_format,
judge_model=judge_model,
threshold=threshold,
trajectory_match_type=trajectory_match_type,
output_format=output,
)
config = merge_configs(file_config, cli_config)
@@ -164,6 +172,7 @@ def run(
trace_format=trace_format,
judge_model=judge_model,
threshold=threshold,
trajectory_match_type=trajectory_match_type,
output_format=output,
)

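With case_sensitive=False, click accepts any casing on the command line and (in Click 8) passes the value through in its declared casing, so --trajectory-match-type any_order reaches the config layer as "ANY_ORDER". A minimal runnable sketch of that behavior:

import click
from click.testing import CliRunner

@click.command()
@click.option(
    "--trajectory-match-type",
    type=click.Choice(["EXACT", "IN_ORDER", "ANY_ORDER"], case_sensitive=False),
    default=None,
)
def demo(trajectory_match_type: str | None) -> None:
    click.echo(f"match_type={trajectory_match_type}")

result = CliRunner().invoke(demo, ["--trajectory-match-type", "any_order"])
print(result.output)  # match_type=ANY_ORDER  (Click maps input back to the declared casing)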
13 changes: 13 additions & 0 deletions src/agentevals/config.py
@@ -132,6 +132,19 @@ class EvalRunConfig(BaseModel):
description="Score threshold for pass/fail.",
)

trajectory_match_type: str | None = Field(
default=None,
description="Match type for tool_trajectory_avg_score: 'EXACT', 'IN_ORDER', or 'ANY_ORDER'. Default: EXACT.",
)

@field_validator("trajectory_match_type")
@classmethod
def _validate_trajectory_match_type(cls, v: str | None) -> str | None:
valid = {"EXACT", "IN_ORDER", "ANY_ORDER"}
if v is not None and v.upper() not in valid:
raise ValueError(f"Invalid trajectory_match_type '{v}'. Valid values: {sorted(valid)}")
return v.upper() if v is not None else v

output_format: str = Field(
default="table",
description="Output format: 'table', 'json', or 'summary'.",
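The validator both rejects unknown values and normalizes casing, so downstream code only ever sees the uppercase form or None. A self-contained sketch of the same validator on a one-field stand-in model:

from pydantic import BaseModel, ValidationError, field_validator

class ConfigSketch(BaseModel):  # one-field stand-in for EvalRunConfig
    trajectory_match_type: str | None = None

    @field_validator("trajectory_match_type")
    @classmethod
    def _validate_trajectory_match_type(cls, v: str | None) -> str | None:
        valid = {"EXACT", "IN_ORDER", "ANY_ORDER"}
        if v is not None and v.upper() not in valid:
            raise ValueError(f"Invalid trajectory_match_type '{v}'. Valid values: {sorted(valid)}")
        return v.upper() if v is not None else v

print(ConfigSketch(trajectory_match_type="in_order").trajectory_match_type)  # IN_ORDER

try:
    ConfigSketch(trajectory_match_type="BACKWARDS")
except ValidationError as err:
    print(err)  # Invalid trajectory_match_type 'BACKWARDS'. Valid values: ['ANY_ORDER', 'EXACT', 'IN_ORDER']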
4 changes: 4 additions & 0 deletions src/agentevals/eval_config_loader.py
@@ -111,6 +111,8 @@ def load_eval_config(path: str | Path) -> EvalRunConfig:
config.judge_model = data["judge_model"]
if "threshold" in data:
config.threshold = float(data["threshold"])
if "trajectory_match_type" in data:
config.trajectory_match_type = data["trajectory_match_type"]
if "trace_format" in data:
config.trace_format = data["trace_format"]

@@ -136,6 +138,8 @@ def merge_configs(file_config: EvalRunConfig, cli_config: EvalRunConfig) -> EvalRunConfig:
merged.judge_model = cli_config.judge_model
if cli_config.threshold is not None:
merged.threshold = cli_config.threshold
if cli_config.trajectory_match_type is not None:
merged.trajectory_match_type = cli_config.trajectory_match_type
if cli_config.trace_format != "jaeger-json":
merged.trace_format = cli_config.trace_format
if cli_config.output_format != "table":
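As with judge_model and threshold, the CLI value wins only when the flag was actually provided; a None from the CLI leaves the file value intact. A sketch of that precedence with a dataclass stand-in:

from dataclasses import dataclass, replace

@dataclass
class CfgSketch:  # stand-in for EvalRunConfig
    trajectory_match_type: str | None = None

def merge(file_cfg: CfgSketch, cli_cfg: CfgSketch) -> CfgSketch:
    merged = replace(file_cfg)
    if cli_cfg.trajectory_match_type is not None:
        merged.trajectory_match_type = cli_cfg.trajectory_match_type
    return merged

print(merge(CfgSketch("IN_ORDER"), CfgSketch()).trajectory_match_type)             # IN_ORDER (file wins)
print(merge(CfgSketch("IN_ORDER"), CfgSketch("ANY_ORDER")).trajectory_match_type)  # ANY_ORDER (CLI wins)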
3 changes: 3 additions & 0 deletions src/agentevals/runner.py
@@ -140,6 +140,7 @@ async def _evaluate_trace_bounded(idx: int, conv_result: ConversionResult) -> TraceResult:
eval_set=eval_set,
judge_model=config.judge_model,
threshold=config.threshold,
trajectory_match_type=config.trajectory_match_type,
eval_semaphore=eval_semaphore,
progress_callback=progress_callback,
trace_progress_callback=trace_progress_callback,
@@ -201,6 +202,7 @@ async def _evaluate_trace(
trace_progress_callback: TraceProgressCallback | None = None,
trace=None,
performance_metrics: dict[str, Any] | None = None,
trajectory_match_type: str | None = None,
) -> TraceResult:
trace_result = TraceResult(
trace_id=conv_result.trace_id,
@@ -243,6 +245,7 @@ async def _eval_builtin_with_semaphore(metric_name: str) -> MetricResult:
expected_invocations=expected_invocations,
judge_model=judge_model,
threshold=threshold,
match_type=trajectory_match_type,
)
result.duration_ms = (time.monotonic() - t0) * 1000
return await _append_result(result)
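The runner just threads trajectory_match_type from the run config through to evaluate_builtin_metric; each metric evaluation still runs inside the shared semaphore. A minimal sketch of that bounding pattern (names simplified; the real code wraps evaluate_builtin_metric):

import asyncio

async def eval_one(metric_name: str, sem: asyncio.Semaphore) -> str:
    async with sem:  # at most N metric evaluations in flight at once
        await asyncio.sleep(0)  # placeholder for the actual metric evaluation
        return f"{metric_name}: done"

async def main() -> None:
    sem = asyncio.Semaphore(2)
    print(await asyncio.gather(*(eval_one(m, sem) for m in ["m1", "m2", "m3"])))

asyncio.run(main())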
134 changes: 133 additions & 1 deletion tests/test_runner.py
@@ -1,12 +1,101 @@
import asyncio
import json
import os

import pytest

from agentevals.config import EvalRunConfig
-from agentevals.runner import load_eval_set, run_evaluation
+from agentevals.converter import convert_traces
+from agentevals.loader.base import Span, Trace
+from agentevals.runner import _evaluate_trace, load_eval_set, run_evaluation
from agentevals.trace_metrics import extract_trace_metadata


def _make_tool_trace(tools: list[str]) -> Trace:
"""Build a minimal ADK trace calling the given tools in order."""
invoke = Span(
trace_id="t1",
span_id="invoke1",
parent_span_id=None,
operation_name="invoke_agent test_agent",
start_time=1000,
duration=10000,
tags={"otel.scope.name": "gcp.vertex.agent"},
)
call_llm_1 = Span(
trace_id="t1",
span_id="llm1",
parent_span_id="invoke1",
operation_name="call_llm",
start_time=2000,
duration=1000,
tags={
"otel.scope.name": "gcp.vertex.agent",
"gcp.vertex.agent.llm_request": json.dumps(
{"contents": [{"role": "user", "parts": [{"text": "do something"}]}]}
),
},
)
tool_spans = [
Span(
trace_id="t1",
span_id=f"tool{i}",
parent_span_id="invoke1",
operation_name=f"execute_tool {name}",
start_time=3000 + i * 100,
duration=100,
tags={"otel.scope.name": "gcp.vertex.agent"},
)
for i, name in enumerate(tools)
]
call_llm_2 = Span(
trace_id="t1",
span_id="llm2",
parent_span_id="invoke1",
operation_name="call_llm",
start_time=5000,
duration=1000,
tags={
"otel.scope.name": "gcp.vertex.agent",
"gcp.vertex.agent.llm_response": json.dumps({"content": {"role": "model", "parts": [{"text": "done"}]}}),
},
)
invoke.children = [call_llm_1, *tool_spans, call_llm_2]
return Trace(
trace_id="t1",
root_spans=[invoke],
all_spans=[invoke, call_llm_1, *tool_spans, call_llm_2],
)


def _make_eval_set_json(tools: list[str]) -> dict:
return {
"eval_set_id": "test",
"eval_cases": [
{
"eval_id": "inv_1",
"conversation": [
{
"invocation_id": "inv_1",
"user_content": {
"role": "user",
"parts": [{"text": "do something"}],
},
"final_response": {
"role": "model",
"parts": [{"text": "done"}],
},
"intermediate_data": {
"tool_uses": [{"name": t, "args": {}, "id": f"e{i}"} for i, t in enumerate(tools)],
"tool_responses": [],
},
}
],
}
],
}


SAMPLES_DIR = os.path.join(os.path.dirname(__file__), "..", "samples")
HELM_TRACE = os.path.join(SAMPLES_DIR, "helm.json")
HELM_3_TRACE = os.path.join(SAMPLES_DIR, "helm_3.json")
@@ -168,3 +257,46 @@ def test_extract_trace_metadata_adk(self):
assert "helm" in metadata["user_input_preview"].lower()
assert metadata["final_output_preview"] is not None
assert len(metadata["final_output_preview"]) > 0


class TestTrajectoryMatchType:
"""Verify trajectory_match_type produces different scores on the same trace.

Actual calls [get, list], expected calls [list, get].
EXACT and IN_ORDER fail; ANY_ORDER passes.
"""

def _run(self, match_type, tmp_path):
conv_result = convert_traces([_make_tool_trace(["helm_get_release", "helm_list_releases"])])[0]

eval_set_path = tmp_path / "eval_set.json"
eval_set_path.write_text(json.dumps(_make_eval_set_json(["helm_list_releases", "helm_get_release"])))
eval_set = load_eval_set(str(eval_set_path))

return asyncio.run(
_evaluate_trace(
conv_result=conv_result,
metrics=["tool_trajectory_avg_score"],
custom_evaluators=[],
eval_set=eval_set,
judge_model=None,
threshold=0.5,
trajectory_match_type=match_type,
eval_semaphore=asyncio.Semaphore(1),
)
)

def test_exact_fails(self, tmp_path):
mr = self._run(None, tmp_path).metric_results[0]
assert mr.score == 0.0
assert mr.eval_status == "FAILED"

def test_any_order_passes(self, tmp_path):
mr = self._run("ANY_ORDER", tmp_path).metric_results[0]
assert mr.score == 1.0
assert mr.eval_status == "PASSED"

def test_in_order_fails(self, tmp_path):
mr = self._run("IN_ORDER", tmp_path).metric_results[0]
assert mr.score == 0.0
assert mr.eval_status == "FAILED"
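On the test's data (actual [helm_get_release, helm_list_releases] vs. expected [helm_list_releases, helm_get_release]) the three semantics diverge exactly as the assertions expect. The sketch below is an illustrative reimplementation of the comparison rules described by the UI labels, not ADK's actual scoring code:

def exact(actual: list[str], expected: list[str]) -> bool:
    return actual == expected  # same tools, same order, no extras

def in_order(actual: list[str], expected: list[str]) -> bool:
    it = iter(actual)
    return all(tool in it for tool in expected)  # expected is a subsequence; extras allowed

def any_order(actual: list[str], expected: list[str]) -> bool:
    return all(tool in actual for tool in expected)  # order ignored

actual = ["helm_get_release", "helm_list_releases"]
expected = ["helm_list_releases", "helm_get_release"]

print(exact(actual, expected))      # False -> test_exact_fails
print(in_order(actual, expected))   # False -> test_in_order_fails
print(any_order(actual, expected))  # True  -> test_any_order_passes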
20 changes: 20 additions & 0 deletions ui/src/components/upload/UploadView.tsx
@@ -525,6 +525,26 @@ export const UploadView: React.FC = () => {
Minimum score to pass
</span>
</div>

{state.selectedMetrics.includes('tool_trajectory_avg_score') && (
<div className="setting-item" style={{ marginTop: 10 }}>
<label className="setting-label">Trajectory Match Type</label>
<Select
value={state.trajectoryMatchType}
onChange={actions.setTrajectoryMatchType}
options={[
{ value: 'EXACT', label: 'EXACT — tools must match in exact order' },
{ value: 'IN_ORDER', label: 'IN_ORDER — expected tools appear in order (extras allowed)' },
{ value: 'ANY_ORDER', label: 'ANY_ORDER — expected tools appear in any order' },
]}
style={{ width: '100%' }}
size="small"
/>
<span className="setting-hint">
How to compare tool call sequences for tool_trajectory_avg_score
</span>
</div>
)}
</div>
</div>

2 changes: 2 additions & 0 deletions ui/src/context/TraceContext.tsx
@@ -15,6 +15,7 @@ export interface TraceState {
selectedMetrics: string[];
judgeModel: string;
threshold: number;
trajectoryMatchType: string;
traceMetadata: Map<string, TraceMetadata>;
isLoadingMetadata: boolean;
apiKeyStatus: ApiKeyStatus | null;
@@ -54,6 +55,7 @@ export interface TraceContextType {
toggleMetric: (metric: string) => void;
setJudgeModel: (model: string) => void;
setThreshold: (threshold: number) => void;
setTrajectoryMatchType: (matchType: string) => void;
runEvaluation: () => Promise<void>;
setCurrentView: (view: ViewType) => void;
setEvaluationOrigin: (view: ViewType | null) => void;