14 changes: 13 additions & 1 deletion pyproject.toml
@@ -9,7 +9,7 @@ description = "Standalone framework to evaluate agent correctness based on porta
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"google-adk[eval]>=1.25.0",
"google-adk[eval]>=1.30.0",
"click>=8.0",
"tabulate>=0.9.0",
"fastapi>=0.115.0",
@@ -62,6 +62,18 @@ dev = [
"httpx>=0.27.0",
"ruff>=0.11.0",
]
e2e = [
"strands-agents[openai]>=1.29.0",
"langchain>=1.2.0",
"langchain-openai>=1.1.10",
"openai>=2.30.0",
"openai-agents>=0.13.0",
"opentelemetry-sdk>=1.36.0",
"opentelemetry-exporter-otlp-proto-http>=1.36.0",
"opentelemetry-instrumentation-openai-v2",
"opentelemetry-instrumentation-openai-agents-v2>=0.1.0",
"python-dotenv>=1.0.0",
]

[tool.ruff]
target-version = "py311"
33 changes: 33 additions & 0 deletions src/agentevals/api/routes.py
@@ -115,6 +115,9 @@ async def list_metrics():
"hallucinations_v1": "safety",
"safety_v1": "safety",
"per_turn_user_simulator_quality_v1": "simulation",
"multi_turn_task_success_v1": "multi-turn",
"multi_turn_trajectory_quality_v1": "multi-turn",
"multi_turn_tool_use_quality_v1": "multi-turn",
}

try:
@@ -226,6 +229,36 @@ async def list_metrics():
working=False,
description="Rubric-based assessment of tool usage quality (requires rubrics config)",
),
MetricInfo(
name="multi_turn_task_success_v1",
category="multi-turn",
requires_eval_set=False,
requires_llm=False,
requires_gcp=True,
requires_rubrics=False,
working=True,
description="Evaluates if the agent achieved the goal(s) of the multi-turn conversation (Vertex AI)",
),
MetricInfo(
name="multi_turn_trajectory_quality_v1",
category="multi-turn",
requires_eval_set=False,
requires_llm=False,
requires_gcp=True,
requires_rubrics=False,
working=True,
description="Evaluates the overall trajectory the agent took across the conversation (Vertex AI)",
),
MetricInfo(
name="multi_turn_tool_use_quality_v1",
category="multi-turn",
requires_eval_set=False,
requires_llm=False,
requires_gcp=True,
requires_rubrics=False,
working=True,
description="Evaluates function calls made during a multi-turn conversation (Vertex AI)",
),
]
return StandardResponse(data=fallback)

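A quick way to confirm the new entries surface through the API, whether the ADK metric registry loads or the fallback list above is used — a hypothetical sketch assuming a locally running server and the {"data": [...]} envelope exercised in tests/test_api.py:

import httpx

# Hypothetical local base URL; adjust to wherever the agentevals API is served.
resp = httpx.get("http://localhost:8000/api/metrics")
resp.raise_for_status()
names = {metric["name"] for metric in resp.json()["data"]}

expected = {
    "multi_turn_task_success_v1",
    "multi_turn_trajectory_quality_v1",
    "multi_turn_tool_use_quality_v1",
}
assert expected <= names, f"missing metrics: {expected - names}"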
124 changes: 123 additions & 1 deletion src/agentevals/builtin_metrics.py
@@ -7,7 +7,13 @@
import logging
from typing import Any

from google.adk.evaluation.eval_case import Invocation, get_all_tool_calls
from google.adk.evaluation.eval_case import (
IntermediateData,
Invocation,
InvocationEvent,
InvocationEvents,
get_all_tool_calls,
)
from google.adk.evaluation.eval_metrics import (
BaseCriterion,
EvalMetric,
@@ -41,9 +47,117 @@
METRICS_NEEDING_GCP = {
"response_evaluation_score",
"safety_v1",
"multi_turn_task_success_v1",
"multi_turn_trajectory_quality_v1",
"multi_turn_tool_use_quality_v1",
}

_METRICS_NEEDING_INVOCATION_EVENTS = {
"multi_turn_task_success_v1",
"multi_turn_trajectory_quality_v1",
"multi_turn_tool_use_quality_v1",
}


def _to_invocation_events(inv: Invocation) -> Invocation:
"""Return a copy of *inv* with ``intermediate_data`` shaped as ``InvocationEvents``.

Multi-turn Vertex AI metrics read ``invocation.intermediate_data.invocation_events``
directly, but agentevals' trace converters populate the ``IntermediateData`` variant
of the ``IntermediateDataType`` union. This adapter pairs each tool call with its
matching tool response (by ``id`` when present, else by position) and emits them
interleaved as ``call -> response -> call -> response``. ADK's native runtime
authors both calls and responses with the agent name (no separate ``"tool"``
actor); we use ``"agent"`` to match that convention so the Vertex judges see
the dialog in the shape they expect.
"""
from google.genai import types as genai_types

if inv.intermediate_data is None or isinstance(inv.intermediate_data, InvocationEvents):
return inv

id_: IntermediateData = inv.intermediate_data
response_by_id: dict[str, genai_types.FunctionResponse] = {tr.id: tr for tr in id_.tool_responses if tr.id}

events: list[InvocationEvent] = []
for i, tool_call in enumerate(id_.tool_uses):
events.append(
InvocationEvent(
author="agent",
content=genai_types.Content(role="model", parts=[genai_types.Part(function_call=tool_call)]),
)
)

match: genai_types.FunctionResponse | None = None
if tool_call.id and tool_call.id in response_by_id:
match = response_by_id[tool_call.id]
elif not tool_call.id and i < len(id_.tool_responses):
candidate = id_.tool_responses[i]
if not candidate.id:
match = candidate

if match is not None:
events.append(
InvocationEvent(
author="agent",
content=genai_types.Content(role="user", parts=[genai_types.Part(function_response=match)]),
)
)

for author, parts in id_.intermediate_responses:
events.append(
InvocationEvent(
author=author or "agent",
content=genai_types.Content(role="model", parts=list(parts)),
)
)

return inv.model_copy(update={"intermediate_data": InvocationEvents(invocation_events=events)})


def _enrich_app_details(invocations: list[Invocation]) -> list[Invocation]:
"""Synthesize minimal ``app_details`` so multi-turn metrics can score tool quality.

Vertex AI's multi-turn evaluators read each invocation's ``app_details.agent_details``
to learn which tools the agent has access to (their declarations). Without this,
``multi_turn_tool_use_quality_v1`` cannot score tool use because it has no schema
to compare calls against. Our trace converters do not populate ``app_details``, so
we synthesize a minimal record from tool names observed across the conversation.
"""
from google.adk.evaluation.app_details import AgentDetails, AppDetails
from google.genai import types as genai_types

if any(inv.app_details and inv.app_details.agent_details for inv in invocations):
return invocations

tool_names: dict[str, None] = {}
for inv in invocations:
data = inv.intermediate_data
if data is None:
continue
if isinstance(data, IntermediateData):
for tc in data.tool_uses:
if tc.name:
tool_names.setdefault(tc.name)
elif isinstance(data, InvocationEvents):
for ev in data.invocation_events:
if not (ev.content and ev.content.parts):
continue
for part in ev.content.parts:
if part.function_call and part.function_call.name:
tool_names.setdefault(part.function_call.name)

if not tool_names:
return invocations

function_declarations = [genai_types.FunctionDeclaration(name=name) for name in tool_names]
tool = genai_types.Tool(function_declarations=function_declarations)
agent_details = AgentDetails(name="agent", instructions="", tool_declarations=[tool])
app_details = AppDetails(agent_details={"agent": agent_details})

return [inv.model_copy(update={"app_details": app_details}) for inv in invocations]


def rubric_strings_to_objects(rubric_texts: list[str]) -> list[Rubric]:
"""Convert plain-text rubric strings into ADK Rubric objects."""
return [
@@ -113,6 +227,9 @@ def build_eval_metric(
"response_match_score",
"response_evaluation_score",
"safety_v1",
"multi_turn_task_success_v1",
"multi_turn_trajectory_quality_v1",
"multi_turn_tool_use_quality_v1",
):
criterion = BaseCriterion(threshold=effective_threshold)

@@ -209,6 +326,11 @@ async def evaluate_builtin_metric(
eval_metric = build_eval_metric(metric_name, judge_model, threshold, match_type=match_type)
evaluator: Evaluator = get_evaluator(eval_metric)

if metric_name in _METRICS_NEEDING_INVOCATION_EVENTS:
actual_invocations = _enrich_app_details([_to_invocation_events(inv) for inv in actual_invocations])
if expected_invocations is not None:
expected_invocations = _enrich_app_details([_to_invocation_events(inv) for inv in expected_invocations])

if inspect.iscoroutinefunction(evaluator.evaluate_invocations):
eval_result: EvaluationResult = await evaluator.evaluate_invocations(
actual_invocations=actual_invocations,
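The subtle part of _to_invocation_events is how tool responses are paired with tool calls: by id when both carry one, otherwise by position (and only when neither side has an id). A standalone sketch of just that rule, using hypothetical tool names and the genai function-call types outside the full ADK Invocation wrapper:

from google.genai import types as genai_types

# Hypothetical sample data: the first call carries an id, the second does not.
calls = [
    genai_types.FunctionCall(id="c1", name="search_flights", args={"dest": "SFO"}),
    genai_types.FunctionCall(name="book_flight", args={"flight": "UA123"}),
]
responses = [
    genai_types.FunctionResponse(id="c1", name="search_flights", response={"results": 3}),
    genai_types.FunctionResponse(name="book_flight", response={"status": "ok"}),
]

response_by_id = {r.id: r for r in responses if r.id}
events = []
for i, call in enumerate(calls):
    events.append(("call", call.name))
    if call.id and call.id in response_by_id:
        # id-based match
        events.append(("response", response_by_id[call.id].name))
    elif not call.id and i < len(responses) and not responses[i].id:
        # positional fallback when neither side has an id
        events.append(("response", responses[i].name))
    # unmatched calls contribute a call event with no paired response

print(events)
# [('call', 'search_flights'), ('response', 'search_flights'),
#  ('call', 'book_flight'), ('response', 'book_flight')]

In the real adapter each pair becomes two InvocationEvent entries authored as "agent", and _enrich_app_details then synthesizes FunctionDeclaration stubs from the observed tool names so multi_turn_tool_use_quality_v1 has declarations to score against.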
2 changes: 1 addition & 1 deletion tests/test_api.py
@@ -386,7 +386,7 @@ def setup_class(cls):
def test_metrics_fallback(self):
with patch.dict("sys.modules", {"google.adk.evaluation.metric_evaluator_registry": None}):
body = _assert_envelope(self.client.get("/api/metrics"))
assert len(body["data"]) == 8
assert len(body["data"]) == 11

def test_metrics_envelope(self):
body = _assert_envelope(self.client.get("/api/metrics"))
77 changes: 46 additions & 31 deletions ui/src/components/upload/MetricSelector.tsx
@@ -39,16 +39,21 @@ const selectorStyle = css`

.metric-list {
display: flex;
flex-wrap: wrap;
gap: 16px 24px;
flex-direction: column;
gap: 10px;
}

.metric-item {
display: flex;
flex-direction: column;
gap: 3px;
min-width: 200px;
flex: 0 0 auto;
gap: 2px;
}

.metric-row {
display: flex;
align-items: center;
flex-wrap: wrap;
gap: 6px 10px;
}

.metric-name {
@@ -62,19 +67,27 @@
font-size: 11px;
margin-left: 24px;
line-height: 1.3;
overflow-wrap: break-word;
word-break: break-word;
display: -webkit-box;
-webkit-line-clamp: 2;
-webkit-box-orient: vertical;
overflow: hidden;
}

.metric-badges {
display: flex;
flex-wrap: wrap;
gap: 4px;
margin-left: 24px;
flex-shrink: 0;
}

.metric-badge {
font-size: 9px;
padding: 2px 5px;
font-size: 10px;
padding: 2px 6px;
border-radius: 3px;
font-weight: 500;
line-height: 1.4;
}

.badge-eval-set {
Expand Down Expand Up @@ -184,30 +197,32 @@ export const MetricSelector: React.FC<MetricSelectorProps> = ({
<div className="metric-list">
{metrics.map((metric) => (
<div key={metric.name} className="metric-item">
<Checkbox
checked={selectedMetrics.includes(metric.name)}
onChange={() => onToggleMetric(metric.name)}
>
<span className="metric-name">{metric.name}</span>
</Checkbox>
<div className="metric-description">{metric.description}</div>
<div className="metric-badges">
{metric.requiresEvalSet && (
<span className="metric-badge badge-eval-set">Requires Eval Set</span>
)}
{metric.requiresLLM && (
<span className="metric-badge badge-llm">Uses LLM</span>
)}
{metric.requiresGCP && (
<span className="metric-badge badge-gcp">Requires GCP</span>
)}
{metric.requiresRubrics && (
<span className="metric-badge badge-rubrics">Requires Rubrics</span>
)}
{metric.working === false && (
<span className="metric-badge badge-incomplete">Incomplete</span>
)}
<div className="metric-row">
<Checkbox
checked={selectedMetrics.includes(metric.name)}
onChange={() => onToggleMetric(metric.name)}
>
<span className="metric-name">{metric.name}</span>
</Checkbox>
<div className="metric-badges">
{metric.requiresEvalSet && (
<span className="metric-badge badge-eval-set">Requires Eval Set</span>
)}
{metric.requiresLLM && (
<span className="metric-badge badge-llm">Uses LLM</span>
)}
{metric.requiresGCP && (
<span className="metric-badge badge-gcp">Requires GCP</span>
)}
{metric.requiresRubrics && (
<span className="metric-badge badge-rubrics">Requires Rubrics</span>
)}
{metric.working === false && (
<span className="metric-badge badge-incomplete">Incomplete</span>
)}
</div>
</div>
<div className="metric-description" title={metric.description}>{metric.description}</div>
</div>
))}
</div>
31 changes: 31 additions & 0 deletions ui/src/lib/types.ts
@@ -330,6 +330,37 @@ export const AVAILABLE_METRICS: MetricMetadata[] = [
requiresRubrics: false,
working: true,
description: 'Safety and security assessment using Vertex AI'
},
// Multi-turn metrics (Vertex AI Gen AI Eval SDK)
{
name: 'multi_turn_task_success_v1',
category: 'multi-turn',
requiresEvalSet: false,
requiresLLM: false,
requiresGCP: true,
requiresRubrics: false,
working: true,
description: 'Evaluates if the agent achieved the goal(s) of the multi-turn conversation (Vertex AI)'
},
{
name: 'multi_turn_trajectory_quality_v1',
category: 'multi-turn',
requiresEvalSet: false,
requiresLLM: false,
requiresGCP: true,
requiresRubrics: false,
working: true,
description: 'Evaluates the overall trajectory the agent took across the conversation (Vertex AI)'
},
{
name: 'multi_turn_tool_use_quality_v1',
category: 'multi-turn',
requiresEvalSet: false,
requiresLLM: false,
requiresGCP: true,
requiresRubrics: false,
working: true,
description: 'Evaluates function calls made during a multi-turn conversation (Vertex AI)'
}
];
