From e2f1333f1afc1523b988c3700a2ac061fe362d5b Mon Sep 17 00:00:00 2001 From: krisztianfekete Date: Tue, 31 Mar 2026 12:59:50 +0200 Subject: [PATCH 1/2] consolidate duplicated tool call extractions --- src/agentevals/converter.py | 43 ++++++------------ src/agentevals/extraction.py | 23 ++++++++++ src/agentevals/genai_converter.py | 74 +++++++------------------------ 3 files changed, 54 insertions(+), 86 deletions(-) diff --git a/src/agentevals/converter.py b/src/agentevals/converter.py index b759675..e82a046 100644 --- a/src/agentevals/converter.py +++ b/src/agentevals/converter.py @@ -20,6 +20,8 @@ from .extraction import ( extract_agent_response_from_attrs, + extract_tool_call_from_span, + extract_tool_result_from_span, extract_user_text_from_attrs, get_extractor, parse_json, @@ -30,11 +32,7 @@ ADK_LLM_REQUEST, ADK_LLM_RESPONSE, ADK_SCOPE_VALUE, - ADK_TOOL_CALL_ARGS, - ADK_TOOL_RESPONSE, OTEL_GENAI_AGENT_NAME, - OTEL_GENAI_TOOL_CALL_ID, - OTEL_GENAI_TOOL_NAME, OTEL_SCOPE, ) @@ -223,37 +221,24 @@ def _extract_tool_trajectory( def _extract_from_tool_span( tool_span: Span, ) -> tuple[genai_types.FunctionCall | None, genai_types.FunctionResponse | None]: - tool_name = tool_span.get_tag(OTEL_GENAI_TOOL_NAME) - tool_call_id = tool_span.get_tag(OTEL_GENAI_TOOL_CALL_ID) - - if not tool_name: - # Fallback: parse tool name from operationName "execute_tool " - op = tool_span.operation_name - if op.startswith("execute_tool "): - tool_name = op[len("execute_tool ") :] - else: - logger.warning("execute_tool span %s: no tool name found", tool_span.span_id) - return None, None - - args_raw = tool_span.get_tag(ADK_TOOL_CALL_ARGS, "{}") - args = parse_json(args_raw) + tool_call = extract_tool_call_from_span(tool_span) + if tool_call is None: + logger.warning("execute_tool span %s: no tool name found", tool_span.span_id) + return None, None fc = genai_types.FunctionCall( - name=tool_name, - args=args if args else {}, - id=tool_call_id, + name=tool_call["name"], + args=tool_call["args"], + id=tool_call["id"], ) - response_raw = tool_span.get_tag(ADK_TOOL_RESPONSE) + tool_result = extract_tool_result_from_span(tool_span) fr = None - if response_raw: - response_data = parse_json(response_raw) - # Response format varies: MCP uses {"content": [...], "isError": false}, - # other tools return flat dicts. We pass through as-is. + if tool_result: fr = genai_types.FunctionResponse( - name=tool_name, - response=response_data if response_data else {}, - id=tool_call_id, + name=tool_call["name"], + response=tool_result["response"], + id=tool_call["id"], ) return fc, fr diff --git a/src/agentevals/extraction.py b/src/agentevals/extraction.py index ba659c4..62f5e05 100644 --- a/src/agentevals/extraction.py +++ b/src/agentevals/extraction.py @@ -235,6 +235,29 @@ def extract_tool_result_from_attrs(attrs: dict[str, Any]) -> dict[str, Any] | No return None +# --------------------------------------------------------------------------- +# Span-level convenience wrappers +# --------------------------------------------------------------------------- + + +def extract_tool_call_from_span(span: Span) -> dict[str, Any] | None: + """Extract tool call info from a Span object. + + Delegates to extract_tool_call_from_attrs using the span's tags, operation + name, and span ID. Returns {"id", "name", "args"} or None. + """ + return extract_tool_call_from_attrs(span.tags, span.operation_name, span.span_id) + + +def extract_tool_result_from_span(span: Span) -> dict[str, Any] | None: + """Extract tool result from a Span object. + + Delegates to extract_tool_result_from_attrs using the span's tags. + Returns {"response": dict, "isError": bool} or None. + """ + return extract_tool_result_from_attrs(span.tags) + + # --------------------------------------------------------------------------- # Span classification helpers # --------------------------------------------------------------------------- diff --git a/src/agentevals/genai_converter.py b/src/agentevals/genai_converter.py index 399ef64..f37f063 100644 --- a/src/agentevals/genai_converter.py +++ b/src/agentevals/genai_converter.py @@ -17,25 +17,22 @@ from .extraction import ( GenAIExtractor, extract_agent_response_from_attrs, + extract_tool_call_from_span, + extract_tool_result_from_span, extract_user_text_from_attrs, is_invocation_span, is_llm_span, - parse_tool_response_content, ) from .loader.base import Span, Trace from .trace_attrs import ( OTEL_GENAI_INPUT_MESSAGES, OTEL_GENAI_OUTPUT_MESSAGES, - OTEL_GENAI_TOOL_CALL_ARGUMENTS, OTEL_GENAI_TOOL_CALL_ID, - OTEL_GENAI_TOOL_CALL_RESULT, - OTEL_GENAI_TOOL_NAME, ) from .utils.genai_messages import ( ASSISTANT_ROLES, USER_ROLES, extract_text_from_message, - extract_tool_call_args_from_messages, extract_tool_calls_from_message, parse_json_attr, ) @@ -385,68 +382,31 @@ def _extract_tool_calls( tool_responses: list[_ToolResponse] = [] for tool_span in tool_spans: - tool_name = tool_span.get_tag(OTEL_GENAI_TOOL_NAME) - if not tool_name: - logger.warning(f"Tool span missing gen_ai.tool.name: {tool_span.operation_name}") + tool_call = extract_tool_call_from_span(tool_span) + if tool_call is None: + logger.warning(f"Tool span missing tool name: {tool_span.operation_name}") continue - tool_call_id = tool_span.get_tag(OTEL_GENAI_TOOL_CALL_ID) + # Use the real semconv tool_call_id for dedup (None when not present on + # the span), rather than the shared function's span_id/"unknown" fallback. + real_tool_call_id = tool_span.get_tag(OTEL_GENAI_TOOL_CALL_ID) - args_raw = tool_span.get_tag(OTEL_GENAI_TOOL_CALL_ARGUMENTS, "{}") - args = parse_json_attr(args_raw, "gen_ai.tool.call.arguments") - if not isinstance(args, dict): - args = {} - - if not args: - input_msgs_raw = tool_span.get_tag(OTEL_GENAI_INPUT_MESSAGES) - if input_msgs_raw: - args, _ = extract_tool_call_args_from_messages(input_msgs_raw, tool_name) - - tc = _ToolCall(name=tool_name, args=args, id=tool_call_id) - if tool_call_id: - tool_calls_by_id[tool_call_id] = tc + tc = _ToolCall(name=tool_call["name"], args=tool_call["args"], id=real_tool_call_id) + if real_tool_call_id: + tool_calls_by_id[real_tool_call_id] = tc else: tool_calls_no_id.append(tc) - result_raw = tool_span.get_tag(OTEL_GENAI_TOOL_CALL_RESULT) - if result_raw: - result_data = parse_tool_response_content(result_raw) - logger.debug(f"Tool {tool_name} result: {str(result_data)[:100]}") + tool_result = extract_tool_result_from_span(tool_span) + if tool_result: + logger.debug(f"Tool {tool_call['name']} result: {str(tool_result['response'])[:100]}") tool_responses.append( _ToolResponse( - name=tool_name, - response=result_data, - id=tool_call_id, + name=tool_call["name"], + response=tool_result["response"], + id=real_tool_call_id, ) ) - else: - output_msgs_raw = tool_span.get_tag(OTEL_GENAI_OUTPUT_MESSAGES) - if output_msgs_raw: - output_msgs = parse_json_attr(output_msgs_raw, "gen_ai.output.messages") - if isinstance(output_msgs, list): - for msg in output_msgs: - if not isinstance(msg, dict): - continue - for part in msg.get("parts", []): - if not isinstance(part, dict): - continue - if part.get("type") == "tool_call_response" and "response" in part: - resp = part["response"] - if isinstance(resp, list): - texts = [t.get("text", "") for t in resp if isinstance(t, dict) and "text" in t] - result_data = parse_tool_response_content(" ".join(texts)) - elif isinstance(resp, dict): - result_data = resp - else: - result_data = {"result": str(resp)} - tool_responses.append( - _ToolResponse( - name=tool_name, - response=result_data, - id=tool_call_id, - ) - ) - break if llm_spans: for llm_span in llm_spans: From 6109dc509239d8fe20c001aeffd72da979ec52ce Mon Sep 17 00:00:00 2001 From: krisztianfekete Date: Tue, 31 Mar 2026 13:10:56 +0200 Subject: [PATCH 2/2] address review comments --- src/agentevals/extraction.py | 2 +- src/agentevals/genai_converter.py | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/agentevals/extraction.py b/src/agentevals/extraction.py index 62f5e05..7b0c328 100644 --- a/src/agentevals/extraction.py +++ b/src/agentevals/extraction.py @@ -207,7 +207,7 @@ def extract_tool_result_from_attrs(attrs: dict[str, Any]) -> dict[str, Any] | No if raw: parsed = parse_tool_response_content(raw) - if parsed: + if isinstance(parsed, dict): is_error = bool(parsed.get("isError", False)) return {"response": parsed, "isError": is_error} diff --git a/src/agentevals/genai_converter.py b/src/agentevals/genai_converter.py index f37f063..b635519 100644 --- a/src/agentevals/genai_converter.py +++ b/src/agentevals/genai_converter.py @@ -27,7 +27,6 @@ from .trace_attrs import ( OTEL_GENAI_INPUT_MESSAGES, OTEL_GENAI_OUTPUT_MESSAGES, - OTEL_GENAI_TOOL_CALL_ID, ) from .utils.genai_messages import ( ASSISTANT_ROLES, @@ -387,13 +386,15 @@ def _extract_tool_calls( logger.warning(f"Tool span missing tool name: {tool_span.operation_name}") continue - # Use the real semconv tool_call_id for dedup (None when not present on - # the span), rather than the shared function's span_id/"unknown" fallback. - real_tool_call_id = tool_span.get_tag(OTEL_GENAI_TOOL_CALL_ID) + # The shared function returns span_id or "unknown" as fallback IDs. + # For dedup against LLM-message tool calls, only real IDs are usable. + tc_id = tool_call["id"] + is_placeholder = tc_id in ("unknown", tool_span.span_id) + dedup_id = None if is_placeholder else tc_id - tc = _ToolCall(name=tool_call["name"], args=tool_call["args"], id=real_tool_call_id) - if real_tool_call_id: - tool_calls_by_id[real_tool_call_id] = tc + tc = _ToolCall(name=tool_call["name"], args=tool_call["args"], id=dedup_id) + if dedup_id: + tool_calls_by_id[dedup_id] = tc else: tool_calls_no_id.append(tc) @@ -404,7 +405,7 @@ def _extract_tool_calls( _ToolResponse( name=tool_call["name"], response=tool_result["response"], - id=real_tool_call_id, + id=dedup_id, ) )