53 changes: 21 additions & 32 deletions src/agentevals/converter.py
@@ -18,7 +18,12 @@
from google.adk.evaluation.eval_case import IntermediateData, Invocation
from google.genai import types as genai_types

from .extraction import get_extractor, parse_json
from .extraction import (
extract_agent_response_from_attrs,
extract_user_text_from_attrs,
get_extractor,
parse_json,
)
from .loader.base import Span, Trace
from .trace_attrs import (
ADK_INVOCATION_ID,
@@ -152,50 +157,34 @@ def _walk(span: Span, op_prefix: str, acc: list[Span]) -> None:


def _extract_user_content(first_call_llm: Span) -> genai_types.Content:
"""Extract user input from the first call_llm span's llm_request tag."""
"""Extract user input from the first call_llm span's attributes via shared extractor."""
text = extract_user_text_from_attrs(first_call_llm.tags)
if text:
return genai_types.Content(
role="user",
parts=[genai_types.Part(text=text)],
)
Comment on lines 159 to +166

Copilot AI (Mar 31, 2026):

The docstring says user input is extracted from the first call_llm span’s llm_request tag, but the function now first uses extract_user_text_from_attrs() (which may read either ADK llm_request or gen_ai.input.messages). Please update the docstring to reflect the new “span attributes / shared extractor” behavior to avoid misleading future readers.
llm_request_raw = first_call_llm.get_tag(ADK_LLM_REQUEST, "{}")
llm_request = parse_json(llm_request_raw)
contents = llm_request.get("contents", [])

for content_dict in reversed(contents):
if content_dict.get("role") != "user":
continue
parts = content_dict.get("parts", [])
# Skip function_response parts — only want actual user text messages
text_parts = [p for p in parts if "text" in p]
if text_parts:
return genai_types.Content(
role="user",
parts=[genai_types.Part(text=p["text"]) for p in text_parts],
)

for content_dict in contents:
for content_dict in llm_request.get("contents", []):
if content_dict.get("role") == "user":
return _content_from_dict(content_dict)

raise ValueError(f"call_llm span {first_call_llm.span_id}: no user content found in llm_request")


def _extract_final_response(last_call_llm: Span) -> genai_types.Content:
"""Extract final text response from the last call_llm span's llm_response tag."""
"""Extract final text response from the last call_llm span's attributes via shared extractor."""
text = extract_agent_response_from_attrs(last_call_llm.tags)
if text:
return genai_types.Content(
role="model",
parts=[genai_types.Part(text=text)],
)
llm_response_raw = last_call_llm.get_tag(ADK_LLM_RESPONSE, "{}")
llm_response = parse_json(llm_response_raw)

content_dict = llm_response.get("content", {})
if not content_dict:
raise ValueError(f"call_llm span {last_call_llm.span_id}: no content in llm_response")

parts_dicts = content_dict.get("parts", [])
# Final response should have text parts, not function_call parts
text_parts = [p for p in parts_dicts if "text" in p]
if text_parts:
return genai_types.Content(
role="model",
parts=[genai_types.Part(text=p["text"]) for p in text_parts],
)

# If the last call_llm only has function_call parts, that's unexpected
# for a final response — the agent may have been cut short.
logger.warning(
"call_llm span %s: last llm_response has no text parts, may not be the actual final response",
last_call_llm.span_id,
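
Note for readers of this diff: both converters now delegate to shared helpers in src/agentevals/extraction.py, whose bodies are mostly not shown here. Going by the review comment above and the new error message in genai_converter.py ("checked gen_ai.input.messages and ADK llm_request"), a rough sketch of the shape extract_user_text_from_attrs might take is below. The constants and helpers it uses (OTEL_GENAI_INPUT_MESSAGES, ADK_LLM_REQUEST, USER_ROLES, parse_json_attr, parse_json, extract_text_from_message) all appear elsewhere in this PR, but the ordering and details are assumptions, not the actual implementation; extract_agent_response_from_attrs presumably mirrors this for gen_ai.output.messages and the ADK llm_response.

```python
# Sketch only, not the code in extraction.py. Assumes the module's existing
# constants and helpers (OTEL_GENAI_INPUT_MESSAGES, ADK_LLM_REQUEST, USER_ROLES,
# parse_json_attr, parse_json, extract_text_from_message) are in scope.
def _extract_user_text_from_attrs_sketch(attrs: dict[str, Any]) -> str | None:
    # 1) OpenTelemetry GenAI path: last user message in gen_ai.input.messages.
    messages = parse_json_attr(attrs.get(OTEL_GENAI_INPUT_MESSAGES, "[]"), "gen_ai.input.messages")
    if isinstance(messages, list):
        for msg in reversed(messages):
            if isinstance(msg, dict) and msg.get("role") in USER_ROLES:
                text = extract_text_from_message(msg)
                if text:
                    return text

    # 2) ADK fallback: last user content with plain text parts in llm_request,
    #    skipping function_response-only entries (as the old converter code did).
    llm_request = parse_json(attrs.get(ADK_LLM_REQUEST, "{}"))
    for content in reversed(llm_request.get("contents", [])):
        if content.get("role") == "user":
            texts = [p["text"] for p in content.get("parts", []) if "text" in p]
            if texts:
                return "\n".join(texts)
    return None
```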
2 changes: 1 addition & 1 deletion src/agentevals/extraction.py
@@ -100,7 +100,7 @@ def extract_agent_response_from_attrs(attrs: dict[str, Any]) -> str | None:
if messages_raw:
messages = parse_json_attr(messages_raw, "gen_ai.output.messages")
if isinstance(messages, list):
for msg in messages:
for msg in reversed(messages):
if isinstance(msg, dict) and msg.get("role") in ASSISTANT_ROLES:
text = extract_text_from_message(msg)
if text:
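
The one-line change above makes extract_agent_response_from_attrs return the last assistant entry in gen_ai.output.messages rather than the first, which presumably matters when a single LLM call records more than one assistant message (an earlier partial or tool-calling step followed by the final answer). A self-contained illustration, using the values from the new test added in tests/test_extraction.py further down:

```python
messages = [
    {"role": "assistant", "content": "First response"},
    {"role": "assistant", "content": "Second response"},
]

# Before this change the forward loop matched "First response";
# with reversed(messages) the extractor now returns "Second response".
last = next(m["content"] for m in reversed(messages) if m.get("role") == "assistant")
assert last == "Second response"
```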
60 changes: 19 additions & 41 deletions src/agentevals/genai_converter.py
@@ -14,7 +14,14 @@
from google.genai import types as genai_types

from .converter import ConversionResult
from .extraction import GenAIExtractor, is_invocation_span, is_llm_span, parse_tool_response_content
from .extraction import (
GenAIExtractor,
extract_agent_response_from_attrs,
extract_user_text_from_attrs,
is_invocation_span,
is_llm_span,
parse_tool_response_content,
)
from .loader.base import Span, Trace
from .trace_attrs import (
OTEL_GENAI_INPUT_MESSAGES,
@@ -307,50 +314,21 @@ def _turn_to_invocation(turn: _ConversationTurn) -> Invocation:


def _extract_user_text(llm_span: Span) -> str:
messages_raw = llm_span.get_tag(OTEL_GENAI_INPUT_MESSAGES, "[]")
messages = parse_json_attr(messages_raw, "gen_ai.input.messages")

if not isinstance(messages, list):
messages = []

for msg in reversed(messages):
if not isinstance(msg, dict):
continue
if msg.get("role") in USER_ROLES:
text = extract_text_from_message(msg)
if text:
logger.debug(f"Found user message: {text[:100]}")
return text

logger.warning(f"No user message found in {len(messages)} messages")
raise ValueError(f"LLM span {llm_span.span_id}: no user message found in gen_ai.input.messages")
text = extract_user_text_from_attrs(llm_span.tags)
if text:
return text
raise ValueError(
f"LLM span {llm_span.span_id}: no user message found (checked gen_ai.input.messages and ADK llm_request)"
)


def _extract_assistant_text(llm_span: Span) -> str:
messages_raw = llm_span.get_tag(OTEL_GENAI_OUTPUT_MESSAGES, "[]")
messages = parse_json_attr(messages_raw, "gen_ai.output.messages")

if not isinstance(messages, list):
messages = []

logger.debug(f"Extracting final response from {len(messages)} output messages")
for i, msg in enumerate(messages):
if isinstance(msg, dict):
logger.debug(
f" Message {i}: role={msg.get('role')}, content_len={len(msg.get('content', ''))}, has_tool_calls={bool(msg.get('tool_calls'))}"
)

for msg in reversed(messages):
if not isinstance(msg, dict):
continue
if msg.get("role") in ASSISTANT_ROLES:
text = extract_text_from_message(msg)
if text:
logger.debug(f"Found assistant message with text: {text[:100]}")
return text

text = extract_agent_response_from_attrs(llm_span.tags)
if text:
return text
logger.warning(
f"LLM span {llm_span.span_id}: no assistant message with content in gen_ai.output.messages ({len(messages)} messages)"
"LLM span %s: no assistant message with content in span attributes",
llm_span.span_id,
)
return ""

11 changes: 11 additions & 0 deletions tests/test_extraction.py
@@ -194,6 +194,17 @@ def test_adk_no_text_parts(self):
attrs = {ADK_LLM_RESPONSE: json.dumps({"content": {"parts": [{"function_call": {"name": "tool"}}]}})}
assert extract_agent_response_from_attrs(attrs) is None

def test_genai_prefers_last_assistant(self):
attrs = {
OTEL_GENAI_OUTPUT_MESSAGES: json.dumps(
[
{"role": "assistant", "content": "First response"},
{"role": "assistant", "content": "Second response"},
]
)
}
assert extract_agent_response_from_attrs(attrs) == "Second response"
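
This PR only adds a response-side test; if extract_user_text_from_attrs has a matching ADK fallback (as the converter changes above suggest), a companion test might look like the sketch below. It is hypothetical and not part of this change: it assumes ADK_LLM_REQUEST and extract_user_text_from_attrs are already imported in this test module, and that the helper accepts the same llm_request shape the old converter code parsed.

```python
    def test_adk_llm_request_fallback_sketch(self):
        # Hypothetical companion test, not part of this PR. Assumes the
        # user-side extractor falls back to the ADK llm_request attribute
        # when gen_ai.input.messages is absent.
        attrs = {
            ADK_LLM_REQUEST: json.dumps(
                {"contents": [{"role": "user", "parts": [{"text": "Hi there"}]}]}
            )
        }
        assert extract_user_text_from_attrs(attrs) == "Hi there"
```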


# ---------------------------------------------------------------------------
# extract_token_usage_from_attrs