14 changes: 13 additions & 1 deletion pyproject.toml
@@ -9,7 +9,7 @@ description = "Standalone framework to evaluate agent correctness based on porta
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"google-adk[eval]>=1.25.0",
"google-adk[eval]>=1.30.0",
"click>=8.0",
"tabulate>=0.9.0",
"fastapi>=0.115.0",
@@ -62,6 +62,18 @@ dev = [
"httpx>=0.27.0",
"ruff>=0.11.0",
]
e2e = [
"strands-agents[openai]>=1.29.0",
"langchain>=1.2.0",
"langchain-openai>=1.1.10",
"openai>=2.30.0",
"openai-agents>=0.13.0",
"opentelemetry-sdk>=1.36.0",
"opentelemetry-exporter-otlp-proto-http>=1.36.0",
"opentelemetry-instrumentation-openai-v2",
"opentelemetry-instrumentation-openai-agents-v2>=0.1.0",
"python-dotenv>=1.0.0",
]

[tool.ruff]
target-version = "py311"
33 changes: 33 additions & 0 deletions src/agentevals/api/routes.py
@@ -115,6 +115,9 @@ async def list_metrics():
"hallucinations_v1": "safety",
"safety_v1": "safety",
"per_turn_user_simulator_quality_v1": "simulation",
"multi_turn_task_success_v1": "multi-turn",
"multi_turn_trajectory_quality_v1": "multi-turn",
"multi_turn_tool_use_quality_v1": "multi-turn",
}

try:
@@ -226,6 +229,36 @@ async def list_metrics():
working=False,
description="Rubric-based assessment of tool usage quality (requires rubrics config)",
),
MetricInfo(
name="multi_turn_task_success_v1",
category="multi-turn",
requires_eval_set=False,
requires_llm=False,
requires_gcp=True,
requires_rubrics=False,
working=True,
description="Evaluates if the agent achieved the goal(s) of the multi-turn conversation (Vertex AI)",
),
MetricInfo(
name="multi_turn_trajectory_quality_v1",
category="multi-turn",
requires_eval_set=False,
requires_llm=False,
requires_gcp=True,
requires_rubrics=False,
working=True,
description="Evaluates the overall trajectory the agent took across the conversation (Vertex AI)",
),
MetricInfo(
name="multi_turn_tool_use_quality_v1",
category="multi-turn",
requires_eval_set=False,
requires_llm=False,
requires_gcp=True,
requires_rubrics=False,
working=True,
description="Evaluates function calls made during a multi-turn conversation (Vertex AI)",
),
]
return StandardResponse(data=fallback)

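A quick way to confirm the new entries surface through the API, whether the ADK metric registry loads or the fallback list above is used — a hypothetical sketch assuming a locally running server and the {"data": [...]} envelope exercised in tests/test_api.py:

import httpx

# Hypothetical local base URL; adjust to wherever the agentevals API is served.
resp = httpx.get("http://localhost:8000/api/metrics")
resp.raise_for_status()
names = {metric["name"] for metric in resp.json()["data"]}

expected = {
    "multi_turn_task_success_v1",
    "multi_turn_trajectory_quality_v1",
    "multi_turn_tool_use_quality_v1",
}
assert expected <= names, f"missing metrics: {expected - names}"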
124 changes: 123 additions & 1 deletion src/agentevals/builtin_metrics.py
@@ -7,7 +7,13 @@
import logging
from typing import Any

from google.adk.evaluation.eval_case import Invocation, get_all_tool_calls
from google.adk.evaluation.eval_case import (
IntermediateData,
Invocation,
InvocationEvent,
InvocationEvents,
get_all_tool_calls,
)
from google.adk.evaluation.eval_metrics import (
BaseCriterion,
EvalMetric,
@@ -41,9 +47,117 @@
METRICS_NEEDING_GCP = {
"response_evaluation_score",
"safety_v1",
"multi_turn_task_success_v1",
"multi_turn_trajectory_quality_v1",
"multi_turn_tool_use_quality_v1",
}

_METRICS_NEEDING_INVOCATION_EVENTS = {
"multi_turn_task_success_v1",
"multi_turn_trajectory_quality_v1",
"multi_turn_tool_use_quality_v1",
}


def _to_invocation_events(inv: Invocation) -> Invocation:
"""Return a copy of *inv* with ``intermediate_data`` shaped as ``InvocationEvents``.

Multi-turn Vertex AI metrics read ``invocation.intermediate_data.invocation_events``
directly, but agentevals' trace converters populate the ``IntermediateData`` variant
of the ``IntermediateDataType`` union. This adapter pairs each tool call with its
matching tool response (by ``id`` when present, else by position) and emits them
interleaved as ``call -> response -> call -> response``. ADK's native runtime
authors both calls and responses with the agent name (no separate ``"tool"``
actor); we use ``"agent"`` to match that convention so the Vertex judges see
the dialog in the shape they expect.
"""
from google.genai import types as genai_types

if inv.intermediate_data is None or isinstance(inv.intermediate_data, InvocationEvents):
return inv

id_: IntermediateData = inv.intermediate_data
response_by_id: dict[str, genai_types.FunctionResponse] = {tr.id: tr for tr in id_.tool_responses if tr.id}

events: list[InvocationEvent] = []
for i, tool_call in enumerate(id_.tool_uses):
events.append(
InvocationEvent(
author="agent",
content=genai_types.Content(role="model", parts=[genai_types.Part(function_call=tool_call)]),
)
)

match: genai_types.FunctionResponse | None = None
if tool_call.id and tool_call.id in response_by_id:
match = response_by_id[tool_call.id]
elif not tool_call.id and i < len(id_.tool_responses):
candidate = id_.tool_responses[i]
if not candidate.id:
match = candidate

if match is not None:
events.append(
InvocationEvent(
author="agent",
content=genai_types.Content(role="user", parts=[genai_types.Part(function_response=match)]),
)
)

for author, parts in id_.intermediate_responses:
events.append(
InvocationEvent(
author=author or "agent",
content=genai_types.Content(role="model", parts=list(parts)),
)
)

return inv.model_copy(update={"intermediate_data": InvocationEvents(invocation_events=events)})


def _enrich_app_details(invocations: list[Invocation]) -> list[Invocation]:
"""Synthesize minimal ``app_details`` so multi-turn metrics can score tool quality.

Vertex AI's multi-turn evaluators read each invocation's ``app_details.agent_details``
to learn which tools the agent has access to (their declarations). Without this,
``multi_turn_tool_use_quality_v1`` cannot score tool use because it has no schema
to compare calls against. Our trace converters do not populate ``app_details``, so
we synthesize a minimal record from tool names observed across the conversation.
"""
from google.adk.evaluation.app_details import AgentDetails, AppDetails
from google.genai import types as genai_types

if any(inv.app_details and inv.app_details.agent_details for inv in invocations):
return invocations

tool_names: dict[str, None] = {}
for inv in invocations:
data = inv.intermediate_data
if data is None:
continue
if isinstance(data, IntermediateData):
for tc in data.tool_uses:
if tc.name:
tool_names.setdefault(tc.name)
elif isinstance(data, InvocationEvents):
for ev in data.invocation_events:
if not (ev.content and ev.content.parts):
continue
for part in ev.content.parts:
if part.function_call and part.function_call.name:
tool_names.setdefault(part.function_call.name)

if not tool_names:
return invocations

function_declarations = [genai_types.FunctionDeclaration(name=name) for name in tool_names]
tool = genai_types.Tool(function_declarations=function_declarations)
agent_details = AgentDetails(name="agent", instructions="", tool_declarations=[tool])
app_details = AppDetails(agent_details={"agent": agent_details})

return [inv.model_copy(update={"app_details": app_details}) for inv in invocations]


def rubric_strings_to_objects(rubric_texts: list[str]) -> list[Rubric]:
"""Convert plain-text rubric strings into ADK Rubric objects."""
return [
@@ -113,6 +227,9 @@ def build_eval_metric(
"response_match_score",
"response_evaluation_score",
"safety_v1",
"multi_turn_task_success_v1",
"multi_turn_trajectory_quality_v1",
"multi_turn_tool_use_quality_v1",
):
criterion = BaseCriterion(threshold=effective_threshold)

@@ -209,6 +326,11 @@ async def evaluate_builtin_metric(
eval_metric = build_eval_metric(metric_name, judge_model, threshold, match_type=match_type)
evaluator: Evaluator = get_evaluator(eval_metric)

if metric_name in _METRICS_NEEDING_INVOCATION_EVENTS:
actual_invocations = _enrich_app_details([_to_invocation_events(inv) for inv in actual_invocations])
if expected_invocations is not None:
expected_invocations = _enrich_app_details([_to_invocation_events(inv) for inv in expected_invocations])

if inspect.iscoroutinefunction(evaluator.evaluate_invocations):
eval_result: EvaluationResult = await evaluator.evaluate_invocations(
actual_invocations=actual_invocations,
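The subtle part of _to_invocation_events is how tool responses are paired with tool calls: by id when both carry one, otherwise by position (and only when neither side has an id). A standalone sketch of just that rule, using hypothetical tool names and the genai function-call types outside the full ADK Invocation wrapper:

from google.genai import types as genai_types

# Hypothetical sample data: the first call carries an id, the second does not.
calls = [
    genai_types.FunctionCall(id="c1", name="search_flights", args={"dest": "SFO"}),
    genai_types.FunctionCall(name="book_flight", args={"flight": "UA123"}),
]
responses = [
    genai_types.FunctionResponse(id="c1", name="search_flights", response={"results": 3}),
    genai_types.FunctionResponse(name="book_flight", response={"status": "ok"}),
]

response_by_id = {r.id: r for r in responses if r.id}
events = []
for i, call in enumerate(calls):
    events.append(("call", call.name))
    if call.id and call.id in response_by_id:
        # id-based match
        events.append(("response", response_by_id[call.id].name))
    elif not call.id and i < len(responses) and not responses[i].id:
        # positional fallback when neither side has an id
        events.append(("response", responses[i].name))
    # unmatched calls contribute a call event with no paired response

print(events)
# [('call', 'search_flights'), ('response', 'search_flights'),
#  ('call', 'book_flight'), ('response', 'book_flight')]

In the real adapter each pair becomes two InvocationEvent entries authored as "agent", and _enrich_app_details then synthesizes FunctionDeclaration stubs from the observed tool names so multi_turn_tool_use_quality_v1 has declarations to score against.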
2 changes: 1 addition & 1 deletion tests/test_api.py
@@ -386,7 +386,7 @@ def setup_class(cls):
def test_metrics_fallback(self):
with patch.dict("sys.modules", {"google.adk.evaluation.metric_evaluator_registry": None}):
body = _assert_envelope(self.client.get("/api/metrics"))
assert len(body["data"]) == 8
assert len(body["data"]) == 11

def test_metrics_envelope(self):
body = _assert_envelope(self.client.get("/api/metrics"))
77 changes: 46 additions & 31 deletions ui/src/components/upload/MetricSelector.tsx
@@ -39,16 +39,21 @@ const selectorStyle = css`

.metric-list {
display: flex;
flex-wrap: wrap;
gap: 16px 24px;
flex-direction: column;
gap: 10px;
}

.metric-item {
display: flex;
flex-direction: column;
gap: 3px;
min-width: 200px;
flex: 0 0 auto;
gap: 2px;
}

.metric-row {
display: flex;
align-items: center;
flex-wrap: wrap;
gap: 6px 10px;
}

.metric-name {
@@ -62,19 +67,27 @@
font-size: 11px;
margin-left: 24px;
line-height: 1.3;
overflow-wrap: break-word;
word-break: break-word;
display: -webkit-box;
-webkit-line-clamp: 2;
-webkit-box-orient: vertical;
overflow: hidden;
}

.metric-badges {
display: flex;
flex-wrap: wrap;
gap: 4px;
margin-left: 24px;
flex-shrink: 0;
}

.metric-badge {
font-size: 9px;
padding: 2px 5px;
font-size: 10px;
padding: 2px 6px;
border-radius: 3px;
font-weight: 500;
line-height: 1.4;
}

.badge-eval-set {
Expand Down Expand Up @@ -184,30 +197,32 @@ export const MetricSelector: React.FC<MetricSelectorProps> = ({
<div className="metric-list">
{metrics.map((metric) => (
<div key={metric.name} className="metric-item">
<Checkbox
checked={selectedMetrics.includes(metric.name)}
onChange={() => onToggleMetric(metric.name)}
>
<span className="metric-name">{metric.name}</span>
</Checkbox>
<div className="metric-description">{metric.description}</div>
<div className="metric-badges">
{metric.requiresEvalSet && (
<span className="metric-badge badge-eval-set">Requires Eval Set</span>
)}
{metric.requiresLLM && (
<span className="metric-badge badge-llm">Uses LLM</span>
)}
{metric.requiresGCP && (
<span className="metric-badge badge-gcp">Requires GCP</span>
)}
{metric.requiresRubrics && (
<span className="metric-badge badge-rubrics">Requires Rubrics</span>
)}
{metric.working === false && (
<span className="metric-badge badge-incomplete">Incomplete</span>
)}
<div className="metric-row">
<Checkbox
checked={selectedMetrics.includes(metric.name)}
onChange={() => onToggleMetric(metric.name)}
>
<span className="metric-name">{metric.name}</span>
</Checkbox>
<div className="metric-badges">
{metric.requiresEvalSet && (
<span className="metric-badge badge-eval-set">Requires Eval Set</span>
)}
{metric.requiresLLM && (
<span className="metric-badge badge-llm">Uses LLM</span>
)}
{metric.requiresGCP && (
<span className="metric-badge badge-gcp">Requires GCP</span>
)}
{metric.requiresRubrics && (
<span className="metric-badge badge-rubrics">Requires Rubrics</span>
)}
{metric.working === false && (
<span className="metric-badge badge-incomplete">Incomplete</span>
)}
</div>
</div>
<div className="metric-description" title={metric.description}>{metric.description}</div>
</div>
))}
</div>
31 changes: 31 additions & 0 deletions ui/src/lib/types.ts
@@ -330,6 +330,37 @@ export const AVAILABLE_METRICS: MetricMetadata[] = [
requiresRubrics: false,
working: true,
description: 'Safety and security assessment using Vertex AI'
},
// Multi-turn metrics (Vertex AI Gen AI Eval SDK)
{
name: 'multi_turn_task_success_v1',
category: 'multi-turn',
requiresEvalSet: false,
requiresLLM: false,
requiresGCP: true,
requiresRubrics: false,
working: true,
description: 'Evaluates if the agent achieved the goal(s) of the multi-turn conversation (Vertex AI)'
},
{
name: 'multi_turn_trajectory_quality_v1',
category: 'multi-turn',
requiresEvalSet: false,
requiresLLM: false,
requiresGCP: true,
requiresRubrics: false,
working: true,
description: 'Evaluates the overall trajectory the agent took across the conversation (Vertex AI)'
},
{
name: 'multi_turn_tool_use_quality_v1',
category: 'multi-turn',
requiresEvalSet: false,
requiresLLM: false,
requiresGCP: true,
requiresRubrics: false,
working: true,
description: 'Evaluates function calls made during a multi-turn conversation (Vertex AI)'
}
];
