diff --git a/src/agentevals/converter.py b/src/agentevals/converter.py index e9a01ce..b759675 100644 --- a/src/agentevals/converter.py +++ b/src/agentevals/converter.py @@ -18,7 +18,12 @@ from google.adk.evaluation.eval_case import IntermediateData, Invocation from google.genai import types as genai_types -from .extraction import get_extractor, parse_json +from .extraction import ( + extract_agent_response_from_attrs, + extract_user_text_from_attrs, + get_extractor, + parse_json, +) from .loader.base import Span, Trace from .trace_attrs import ( ADK_INVOCATION_ID, @@ -152,50 +157,34 @@ def _walk(span: Span, op_prefix: str, acc: list[Span]) -> None: def _extract_user_content(first_call_llm: Span) -> genai_types.Content: - """Extract user input from the first call_llm span's llm_request tag.""" + """Extract user input from the first call_llm span's attributes via shared extractor.""" + text = extract_user_text_from_attrs(first_call_llm.tags) + if text: + return genai_types.Content( + role="user", + parts=[genai_types.Part(text=text)], + ) llm_request_raw = first_call_llm.get_tag(ADK_LLM_REQUEST, "{}") llm_request = parse_json(llm_request_raw) - contents = llm_request.get("contents", []) - - for content_dict in reversed(contents): - if content_dict.get("role") != "user": - continue - parts = content_dict.get("parts", []) - # Skip function_response parts — only want actual user text messages - text_parts = [p for p in parts if "text" in p] - if text_parts: - return genai_types.Content( - role="user", - parts=[genai_types.Part(text=p["text"]) for p in text_parts], - ) - - for content_dict in contents: + for content_dict in llm_request.get("contents", []): if content_dict.get("role") == "user": return _content_from_dict(content_dict) - raise ValueError(f"call_llm span {first_call_llm.span_id}: no user content found in llm_request") def _extract_final_response(last_call_llm: Span) -> genai_types.Content: - """Extract final text response from the last call_llm span's llm_response tag.""" + """Extract final text response from the last call_llm span's attributes via shared extractor.""" + text = extract_agent_response_from_attrs(last_call_llm.tags) + if text: + return genai_types.Content( + role="model", + parts=[genai_types.Part(text=text)], + ) llm_response_raw = last_call_llm.get_tag(ADK_LLM_RESPONSE, "{}") llm_response = parse_json(llm_response_raw) - content_dict = llm_response.get("content", {}) if not content_dict: raise ValueError(f"call_llm span {last_call_llm.span_id}: no content in llm_response") - - parts_dicts = content_dict.get("parts", []) - # Final response should have text parts, not function_call parts - text_parts = [p for p in parts_dicts if "text" in p] - if text_parts: - return genai_types.Content( - role="model", - parts=[genai_types.Part(text=p["text"]) for p in text_parts], - ) - - # If the last call_llm only has function_call parts, that's unexpected - # for a final response — the agent may have been cut short. logger.warning( "call_llm span %s: last llm_response has no text parts, may not be the actual final response", last_call_llm.span_id, diff --git a/src/agentevals/extraction.py b/src/agentevals/extraction.py index 55a039c..ba659c4 100644 --- a/src/agentevals/extraction.py +++ b/src/agentevals/extraction.py @@ -100,7 +100,7 @@ def extract_agent_response_from_attrs(attrs: dict[str, Any]) -> str | None: if messages_raw: messages = parse_json_attr(messages_raw, "gen_ai.output.messages") if isinstance(messages, list): - for msg in messages: + for msg in reversed(messages): if isinstance(msg, dict) and msg.get("role") in ASSISTANT_ROLES: text = extract_text_from_message(msg) if text: diff --git a/src/agentevals/genai_converter.py b/src/agentevals/genai_converter.py index 3a6764d..399ef64 100644 --- a/src/agentevals/genai_converter.py +++ b/src/agentevals/genai_converter.py @@ -14,7 +14,14 @@ from google.genai import types as genai_types from .converter import ConversionResult -from .extraction import GenAIExtractor, is_invocation_span, is_llm_span, parse_tool_response_content +from .extraction import ( + GenAIExtractor, + extract_agent_response_from_attrs, + extract_user_text_from_attrs, + is_invocation_span, + is_llm_span, + parse_tool_response_content, +) from .loader.base import Span, Trace from .trace_attrs import ( OTEL_GENAI_INPUT_MESSAGES, @@ -307,50 +314,21 @@ def _turn_to_invocation(turn: _ConversationTurn) -> Invocation: def _extract_user_text(llm_span: Span) -> str: - messages_raw = llm_span.get_tag(OTEL_GENAI_INPUT_MESSAGES, "[]") - messages = parse_json_attr(messages_raw, "gen_ai.input.messages") - - if not isinstance(messages, list): - messages = [] - - for msg in reversed(messages): - if not isinstance(msg, dict): - continue - if msg.get("role") in USER_ROLES: - text = extract_text_from_message(msg) - if text: - logger.debug(f"Found user message: {text[:100]}") - return text - - logger.warning(f"No user message found in {len(messages)} messages") - raise ValueError(f"LLM span {llm_span.span_id}: no user message found in gen_ai.input.messages") + text = extract_user_text_from_attrs(llm_span.tags) + if text: + return text + raise ValueError( + f"LLM span {llm_span.span_id}: no user message found (checked gen_ai.input.messages and ADK llm_request)" + ) def _extract_assistant_text(llm_span: Span) -> str: - messages_raw = llm_span.get_tag(OTEL_GENAI_OUTPUT_MESSAGES, "[]") - messages = parse_json_attr(messages_raw, "gen_ai.output.messages") - - if not isinstance(messages, list): - messages = [] - - logger.debug(f"Extracting final response from {len(messages)} output messages") - for i, msg in enumerate(messages): - if isinstance(msg, dict): - logger.debug( - f" Message {i}: role={msg.get('role')}, content_len={len(msg.get('content', ''))}, has_tool_calls={bool(msg.get('tool_calls'))}" - ) - - for msg in reversed(messages): - if not isinstance(msg, dict): - continue - if msg.get("role") in ASSISTANT_ROLES: - text = extract_text_from_message(msg) - if text: - logger.debug(f"Found assistant message with text: {text[:100]}") - return text - + text = extract_agent_response_from_attrs(llm_span.tags) + if text: + return text logger.warning( - f"LLM span {llm_span.span_id}: no assistant message with content in gen_ai.output.messages ({len(messages)} messages)" + "LLM span %s: no assistant message with content in span attributes", + llm_span.span_id, ) return "" diff --git a/tests/test_extraction.py b/tests/test_extraction.py index fe3d112..4c7fba9 100644 --- a/tests/test_extraction.py +++ b/tests/test_extraction.py @@ -194,6 +194,17 @@ def test_adk_no_text_parts(self): attrs = {ADK_LLM_RESPONSE: json.dumps({"content": {"parts": [{"function_call": {"name": "tool"}}]}})} assert extract_agent_response_from_attrs(attrs) is None + def test_genai_prefers_last_assistant(self): + attrs = { + OTEL_GENAI_OUTPUT_MESSAGES: json.dumps( + [ + {"role": "assistant", "content": "First response"}, + {"role": "assistant", "content": "Second response"}, + ] + ) + } + assert extract_agent_response_from_attrs(attrs) == "Second response" + # --------------------------------------------------------------------------- # extract_token_usage_from_attrs