Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions docs/streaming.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,18 @@ See [examples/README.md](../examples/README.md) for details on supported instrum

### OTLP/JSON Support

Native OpenTelemetry format — no conversion to Jaeger needed:
Native OpenTelemetry format. The CLI auto-detects Jaeger vs OTLP from
file contents, so `.json` and `.jsonl` exports from Tempo, Jaeger, or
the OTel collector all work without a `--format` flag:

```bash
# Load OTLP files directly
agentevals run trace.otlp.json --format otlp-json --eval-set eval.json
# Load any trace file directly; format is auto-detected
agentevals run trace.otlp.json --eval-set eval.json
```

Pass `--format otlp-json` (or `jaeger-json`) only as an override when
auto-detection fails on a non-standard export.

### Real-time Span Streaming

The `AgentEvalsStreamingProcessor` is an OTel `SpanProcessor` that streams spans over WebSocket as they complete:
Expand Down Expand Up @@ -311,6 +316,7 @@ This installs `opentelemetry-sdk>=1.20.0`. Agent code also needs `websockets` fo
## Compatibility

All existing workflows continue to work:
- Jaeger JSON files still supported: `agentevals run trace.json --eval-set ...`
- OTLP/JSON files: `agentevals run trace.otlp.json --format otlp-json --eval-set ...`
- Web UI upload flow unchanged
- Trace files (Jaeger or OTLP, including Tempo exports) are auto-detected
by content: `agentevals run trace.json --eval-set ...`
- Pass `--format` only to override detection on non-standard exports.
- Web UI upload flow unchanged.
1 change: 1 addition & 0 deletions samples/tempo_export_with_batches.json

Large diffs are not rendered by default.

30 changes: 4 additions & 26 deletions src/agentevals/api/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@
)
from ..converter import convert_traces
from ..extraction import get_extractor
from ..loader import load_traces
from ..loader.otlp import OtlpJsonLoader
from ..runner import (
RunResult,
get_loader,
load_eval_set,
load_eval_set_from_dict,
run_evaluation,
Expand Down Expand Up @@ -331,17 +331,10 @@ def _serialize_invocation(inv) -> dict[str, Any]:
return _camel_keys(inv_dict)


def _get_format_for_file(path: str, explicit_format: str) -> str:
"""Return the loader format for a single file, auto-detecting from extension."""
if explicit_format:
return explicit_format
return "otlp-json" if path.lower().endswith(".jsonl") else "jaeger-json"


@router.post("/convert", response_model=StandardResponse[ConvertTracesData])
async def convert_trace_files(
trace_files: list[UploadFile] = File(...),
trace_format: str = Form(""),
trace_format: str | None = Form(None),
):
"""Convert trace files to invocations and metadata without running evaluation."""
temp_dir = tempfile.mkdtemp()
Expand Down Expand Up @@ -380,10 +373,8 @@ async def convert_trace_files(
trace_to_filename: dict[str, str] = {}
load_warnings: list[str] = []
for path, original in saved_files:
fmt = _get_format_for_file(path, trace_format)
loader = get_loader(fmt)
try:
traces = loader.load(path)
traces = load_traces(path, format=trace_format or None)
for t in traces:
trace_to_filename[t.trace_id] = original
all_traces.extend(traces)
Expand Down Expand Up @@ -496,12 +487,6 @@ async def evaluate_traces(
)

trace_format = config_dict.get("trace_format")
if not trace_format:
first_file = trace_paths[0]
if first_file.endswith(".jsonl"):
trace_format = "otlp-json"
else:
trace_format = "jaeger-json"

eval_set_path = None
if eval_set_file and eval_set_file.filename:
Expand Down Expand Up @@ -612,12 +597,6 @@ async def event_generator():
return

trace_format = config_dict.get("trace_format")
if not trace_format:
first_file = trace_paths[0]
if first_file.endswith(".jsonl"):
trace_format = "otlp-json"
else:
trace_format = "jaeger-json"

eval_set_path = None
if eval_set_file and eval_set_file.filename:
Expand Down Expand Up @@ -663,10 +642,9 @@ async def event_generator():
trajectory_match_type=config_dict.get("trajectoryMatchType"),
)

loader = get_loader(eval_config.trace_format)
for trace_file_path in trace_paths:
try:
traces = loader.load(trace_file_path)
traces = load_traces(trace_file_path, format=eval_config.trace_format)
for trace in traces:
extractor = get_extractor(trace)
perf_metrics = _camel_keys(extract_performance_metrics(trace, extractor))
Expand Down
7 changes: 4 additions & 3 deletions src/agentevals/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,9 @@ def main(verbose: int) -> None:
"--format",
"-f",
"trace_format",
default="jaeger-json",
help="Trace file format.",
default=None,
type=click.Choice(["jaeger-json", "otlp-json"], case_sensitive=False),
help="Override the trace file format. Auto-detected from file contents when omitted.",
)
@click.option(
"--judge-model",
Expand Down Expand Up @@ -134,7 +135,7 @@ def run(
trace_files: tuple[str, ...],
eval_set: str | None,
metric: tuple[str, ...] | None,
trace_format: str,
trace_format: str | None,
judge_model: str | None,
threshold: float | None,
trajectory_match_type: str | None,
Expand Down
11 changes: 7 additions & 4 deletions src/agentevals/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,16 +160,19 @@ def _validate_trajectory_match_type(cls, v: str | None) -> str | None:
class EvalRunConfig(EvalParams):
"""Full configuration for file-based evaluation runs."""

trace_files: list[str] = Field(description="Paths to trace files (Jaeger JSON or OTLP JSON).")
trace_files: list[str] = Field(description="Paths to trace files (Jaeger or OTLP JSON, .json or .jsonl).")

eval_set_file: str | None = Field(
default=None,
description="Path to a golden eval set JSON file (ADK EvalSet format).",
)

trace_format: str = Field(
default="jaeger-json",
description="Format of the trace files (jaeger-json or otlp-json).",
trace_format: str | None = Field(
default=None,
description=(
"Optional explicit trace format override ('jaeger-json' or 'otlp-json'). "
"Leave unset to auto-detect from file contents."
),
)

output_format: str = Field(
Expand Down
25 changes: 19 additions & 6 deletions src/agentevals/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,16 @@
extract_tool_result_from_span,
extract_user_text_from_attrs,
get_extractor,
has_adk_descendant,
is_adk_scope,
parse_json,
)
from .loader.base import Span, Trace
from .trace_attrs import (
ADK_INVOCATION_ID,
ADK_LLM_REQUEST,
ADK_LLM_RESPONSE,
ADK_SCOPE_VALUE,
OTEL_GENAI_AGENT_NAME,
OTEL_SCOPE,
)

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -101,13 +101,26 @@ def convert_traces(traces: list[Trace]) -> list[ConversionResult]:


def _find_adk_spans(trace: Trace, operation: str) -> list[Span]:
    """Find ADK-instrumented spans matching an operation prefix.

    Detection delegates to ``is_adk_scope``, which accepts either the OTel
    scope marker, the ``gen_ai.system`` semconv attribute, or any
    ``gcp.vertex.agent.*`` custom attribute. The fallbacks matter for
    Tempo-exported traces where scope info gets lost during compaction.

    For ``invoke_agent`` we additionally accept spans whose subtree is ADK
    instrumented even when the parent itself lost its markers — Tempo's
    compactor can strip scope info on the parent while children retain
    their ``gcp.vertex.agent.*`` attributes.

    Args:
        trace: Trace whose ``all_spans`` are scanned.
        operation: Prefix the span's ``operation_name`` must start with.

    Returns:
        The matching spans, sorted by ``start_time``.
    """
    matches = []
    for span in trace.all_spans:
        # operation_name is e.g. "invoke_agent helm_agent" or "call_llm".
        if not span.operation_name.startswith(operation):
            continue
        if is_adk_scope(span):
            matches.append(span)
            continue
        # Only invocation parents get the "ADK somewhere below" fallback.
        if operation == "invoke_agent" and has_adk_descendant(span):
            matches.append(span)
    matches.sort(key=lambda s: s.start_time)
    return matches
Expand Down
2 changes: 1 addition & 1 deletion src/agentevals/eval_config_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def merge_configs(file_config: EvalRunConfig, cli_config: EvalRunConfig) -> Eval
merged.threshold = cli_config.threshold
if cli_config.trajectory_match_type is not None:
merged.trajectory_match_type = cli_config.trajectory_match_type
if cli_config.trace_format != "jaeger-json":
if cli_config.trace_format is not None:
merged.trace_format = cli_config.trace_format
if cli_config.output_format != "table":
merged.output_format = cli_config.output_format
Expand Down
53 changes: 51 additions & 2 deletions src/agentevals/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,8 +344,52 @@ def extract_tool_result_from_span(span: Span) -> dict[str, Any] | None:
# ---------------------------------------------------------------------------


# Span attribute keys consulted by ``is_adk_scope``. The first four are
# module-level constants; the last three are spelled out as literal
# ``gcp.vertex.agent.*`` keys.
_ADK_ATTR_MARKERS = (
    ADK_LLM_REQUEST,
    ADK_LLM_RESPONSE,
    ADK_TOOL_CALL_ARGS,
    ADK_TOOL_RESPONSE,
    "gcp.vertex.agent.invocation_id",
    "gcp.vertex.agent.session_id",
    "gcp.vertex.agent.event_id",
)


def has_adk_descendant(span: Span) -> bool:
    """Report whether the subtree below ``span`` contains an ADK span.

    ``span`` itself is never tested; only its children, their children,
    and so on are passed to ``is_adk_scope``. This lets callers recover an
    ADK invocation parent when a round-tripped export (Tempo, etc.) dropped
    scope info on the parent but kept the ADK custom attributes on the
    LLM/tool child spans.
    """
    # Explicit stack instead of recursion; visiting order does not matter
    # because the result is a plain "any" over the subtree.
    pending = list(span.children)
    while pending:
        node = pending.pop()
        if is_adk_scope(node):
            return True
        pending.extend(node.children)
    return False


def is_adk_scope(span: Span) -> bool:
    """Return True for spans emitted by Google ADK instrumentation.

    Recognized signals (any one is sufficient):
    1. ``otel.scope.name == "gcp.vertex.agent"`` — the canonical OTel scope.
    2. ``gen_ai.system == "gcp.vertex.agent"`` — the per-span semconv marker.
       Tempo's compactor sometimes drops/merges scope info, so this fallback
       is required for round-tripped Tempo exports.
    3. Any ``gcp.vertex.agent.*`` custom attribute (llm_request, llm_response,
       tool_call_args, tool_response, invocation_id, session_id, event_id).
       These are unambiguous ADK markers.
    """
    if span.get_tag(OTEL_SCOPE) == ADK_SCOPE_VALUE:
        return True
    if span.get_tag(OTEL_GENAI_SYSTEM) == ADK_SCOPE_VALUE:
        return True
    # A marker counts as present as soon as its tag is not None.
    return any(span.get_tag(marker) is not None for marker in _ADK_ATTR_MARKERS)


def is_llm_span(span: Span) -> bool:
Expand Down Expand Up @@ -423,7 +467,12 @@ def format_name(self) -> str:
return "adk"

def find_invocation_spans(self, trace: Trace) -> list[Span]:
    """Return the trace's ADK ``invoke_agent`` spans, ordered by start time.

    A span qualifies when its ``operation_name`` starts with
    ``"invoke_agent"`` and either the span itself passes ``is_adk_scope``
    or ``has_adk_descendant`` finds an ADK span somewhere beneath it.
    """
    matches: list[Span] = [
        s
        for s in trace.all_spans
        if s.operation_name.startswith("invoke_agent") and (is_adk_scope(s) or has_adk_descendant(s))
    ]
    matches.sort(key=lambda s: s.start_time)
    return matches

Expand Down
26 changes: 24 additions & 2 deletions src/agentevals/loader/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,29 @@
"""Trace loader implementations."""
"""Trace loader implementations.

Most callers should use :func:`load_traces` from
:mod:`agentevals.loader.auto`, which auto-detects the on-disk format
(Jaeger or OTLP, including Tempo's ``batches`` / wrapper variants) and
dispatches to the right underlying loader.
"""

from .auto import (
JAEGER_JSON,
OTLP_JSON,
detect_format,
get_loader_for_format,
load_traces,
)
from .base import TraceLoader
from .jaeger import JaegerJsonLoader
from .otlp import OtlpJsonLoader

__all__ = ["JaegerJsonLoader", "OtlpJsonLoader", "TraceLoader"]
__all__ = [
"JAEGER_JSON",
"OTLP_JSON",
"JaegerJsonLoader",
"OtlpJsonLoader",
"TraceLoader",
"detect_format",
"get_loader_for_format",
"load_traces",
]
Loading