From 33c4cc37e86f98b225b00f0d7b8bab6f0208c3e2 Mon Sep 17 00:00:00 2001 From: shahar-dagan <36898364+shahar-dagan@users.noreply.github.com> Date: Sun, 29 Mar 2026 20:14:17 -0700 Subject: [PATCH 1/4] add OpenAI Agents SDK zero-code example and e2e tests Self-contained dice-rolling agent showing zero-code OTLP integration with openai-agents>=0.3.3 via opentelemetry-instrumentation-openai-agents-v2. Includes run.py, requirements.txt, a golden multi-turn eval_set.json, and TestOpenAIAgentsZeroCode e2e tests for session/span/invocation/API verification. Uses result.to_input_list() for correct conversation context threading across turns, and try/finally to guarantee force_flush() even on API errors. Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 13 ++ TODOS.md | 7 + .../openai-agents/eval_set.json | 91 +++++++++++ .../openai-agents/requirements.txt | 6 + .../zero-code-examples/openai-agents/run.py | 143 ++++++++++++++++++ pyproject.toml | 2 +- tests/integration/test_live_agents.py | 57 +++++++ 7 files changed, 318 insertions(+), 1 deletion(-) create mode 100644 CHANGELOG.md create mode 100644 TODOS.md create mode 100644 examples/zero-code-examples/openai-agents/eval_set.json create mode 100644 examples/zero-code-examples/openai-agents/requirements.txt create mode 100644 examples/zero-code-examples/openai-agents/run.py diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..3f5c676 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,13 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +## [0.5.3] - 2026-03-29 + +### Added +- **OpenAI Agents SDK zero-code example** (`examples/zero-code-examples/openai-agents/`): a self-contained dice-rolling agent that demonstrates zero-code OTLP integration with the OpenAI Agents SDK (`openai-agents>=0.3.3`) via `opentelemetry-instrumentation-openai-agents-v2`. Includes `run.py`, `requirements.txt`, and a golden `eval_set.json` with a multi-turn conversation case. +- **E2E integration tests** for the OpenAI Agents SDK example (`TestOpenAIAgentsZeroCode` in `tests/integration/test_live_agents.py`): verifies session creation, span emission, invocation extraction, and API visibility. + +### Fixed +- Conversation context threading in `run.py` now uses `result.to_input_list()` (the SDK-idiomatic pattern) instead of manually appending raw role/content dicts, ensuring tool call history is preserved across turns. +- `force_flush()` is now called in a `try/finally` block, guaranteeing spans are sent to the OTLP receiver even when an API error occurs mid-conversation. diff --git a/TODOS.md b/TODOS.md new file mode 100644 index 0000000..a4b2993 --- /dev/null +++ b/TODOS.md @@ -0,0 +1,7 @@ +# TODOs + +## Eval set schema migration +- **What:** Update `examples/langchain_agent/eval_set.json` and `examples/strands_agent/eval_set.json` to use documented schema (`eval_id`/`final_response`) instead of stale format (`case_id`/`agent_content`). +- **Why:** The new openai-agents eval set uses the correct format. Without this fix, the langchain/strands examples become the odd ones out and teach new contributors the wrong pattern. +- **Context:** Discovered during openai-agents zero-code integration review (2026-03-29). Schema loader likely accepts both formats so no behavior change. Reference: `docs/eval-set-format.md`, `samples/eval_set_helm.json` (already correct). +- **Blocked by:** Nothing. Two-line JSON key rename per file. diff --git a/examples/zero-code-examples/openai-agents/eval_set.json b/examples/zero-code-examples/openai-agents/eval_set.json new file mode 100644 index 0000000..1b8a75e --- /dev/null +++ b/examples/zero-code-examples/openai-agents/eval_set.json @@ -0,0 +1,91 @@ +{ + "eval_set_id": "openai_agents_eval", + "name": "OpenAI Agents Dice Agent Eval Set", + "description": "Golden eval cases for the OpenAI Agents SDK zero-code example. One multi-turn session covering greeting, die roll, and prime check.", + "eval_cases": [ + { + "eval_id": "dice-roll-session", + "conversation": [ + { + "invocation_id": "inv-1", + "user_content": { + "role": "user", + "parts": [ + {"text": "Hi! Can you help me?"} + ] + }, + "final_response": { + "role": "model", + "parts": [ + {"text": "Hello! Yes, I can help you. I have tools that can roll dice and check if numbers are prime. What would you like me to do?"} + ] + } + }, + { + "invocation_id": "inv-2", + "user_content": { + "role": "user", + "parts": [ + {"text": "Roll a 20-sided die for me"} + ] + }, + "final_response": { + "role": "model", + "parts": [ + {"text": "I rolled a 20-sided die and got [number]. Let me know if you'd like to check if it's prime!"} + ] + }, + "intermediate_data": { + "tool_uses": [ + { + "name": "roll_die", + "args": {"sides": 20}, + "id": "call_roll_die_1" + } + ], + "tool_responses": [ + { + "name": "roll_die", + "response": { + "content": [{"type": "text", "text": "[number]"}] + } + } + ] + } + }, + { + "invocation_id": "inv-3", + "user_content": { + "role": "user", + "parts": [ + {"text": "Is the number you rolled prime?"} + ] + }, + "final_response": { + "role": "model", + "parts": [ + {"text": "[number] is [prime/not prime]."} + ] + }, + "intermediate_data": { + "tool_uses": [ + { + "name": "check_prime", + "args": {"number": 0}, + "id": "call_check_prime_1" + } + ], + "tool_responses": [ + { + "name": "check_prime", + "response": { + "content": [{"type": "text", "text": "[true/false]"}] + } + } + ] + } + } + ] + } + ] +} diff --git a/examples/zero-code-examples/openai-agents/requirements.txt b/examples/zero-code-examples/openai-agents/requirements.txt new file mode 100644 index 0000000..b3bc37f --- /dev/null +++ b/examples/zero-code-examples/openai-agents/requirements.txt @@ -0,0 +1,6 @@ +openai-agents>=0.3.3 +opentelemetry-instrumentation-openai-agents-v2>=0.1.0 + +opentelemetry-sdk>=1.36.0 +opentelemetry-exporter-otlp-proto-http>=1.36.0 +python-dotenv>=1.0.0 diff --git a/examples/zero-code-examples/openai-agents/run.py b/examples/zero-code-examples/openai-agents/run.py new file mode 100644 index 0000000..5d8c054 --- /dev/null +++ b/examples/zero-code-examples/openai-agents/run.py @@ -0,0 +1,143 @@ +"""Run a dice-rolling OpenAI Agents SDK agent with OTLP export — no agentevals SDK. + +Demonstrates zero-code integration: any OTel-instrumented agent streams +traces to agentevals by pointing the OTLP exporter at the receiver. + +Unlike the LangChain and Strands examples, this one is fully self-contained: +the agent code lives inline with no cross-folder imports. + +Prerequisites: + 1. pip install -r requirements.txt + 2. agentevals serve --dev + 3. export OPENAI_API_KEY="your-key-here" + +Usage: + python examples/zero-code-examples/openai-agents/run.py +""" + +import os +import random + +from agents import Agent, Runner, function_tool +from dotenv import load_dotenv +from opentelemetry import trace +from opentelemetry._logs import set_logger_provider +from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.openai_agents import OpenAIAgentsInstrumentor +from opentelemetry.sdk._logs import LoggerProvider +from opentelemetry.sdk._logs.export import BatchLogRecordProcessor +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor + +load_dotenv(override=True) + + +# ── Tool definitions ────────────────────────────────────────────────────────── + +@function_tool +def roll_die(sides: int) -> int: + """Roll a die with the given number of sides and return the result.""" + return random.randint(1, sides) + + +@function_tool +def check_prime(number: int) -> bool: + """Return True if the number is prime, False otherwise.""" + if number < 2: + return False + for i in range(2, int(number**0.5) + 1): + if number % i == 0: + return False + return True + + +def main(): + if not os.getenv("OPENAI_API_KEY"): + print("OPENAI_API_KEY not set.") + return + + endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318") + print(f"OTLP endpoint: {endpoint}") + + # openai-agents-v2 uses ContentCaptureMode — use "span_and_event" for maximum + # content capture. agentevals reads from span attributes, so content will be + # visible in the UI. + os.environ["OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT"] = "span_and_event" + + os.environ.setdefault( + "OTEL_RESOURCE_ATTRIBUTES", + "agentevals.eval_set_id=openai_agents_eval,agentevals.session_name=openai-agents-zero-code", + ) + + # OTel setup flow: + # + # Resource (session/eval attrs) + # │ + # ├── TracerProvider → BatchSpanProcessor → OTLPSpanExporter → :4318 + # │ + # └── LoggerProvider → BatchLogRecordProcessor → OTLPLogExporter → :4318 + # (openai-agents-v2 may route message content to span attributes + # rather than log records; OTLPLogExporter is a no-op if unused) + # + # OpenAIAgentsInstrumentor.instrument() + # └── hooks into agents SDK → emits spans + (optionally) log records + # + # Runner.run_sync(agent, input) ← called with accumulated conversation + + resource = Resource.create() + + tracer_provider = TracerProvider(resource=resource) + tracer_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(), schedule_delay_millis=1000)) + trace.set_tracer_provider(tracer_provider) + + logger_provider = LoggerProvider(resource=resource) + logger_provider.add_log_record_processor(BatchLogRecordProcessor(OTLPLogExporter(), schedule_delay_millis=1000)) + set_logger_provider(logger_provider) + + OpenAIAgentsInstrumentor().instrument() + + agent = Agent( + name="Dice Agent", + instructions="You are a helpful assistant. You can roll dice and check if numbers are prime.", + tools=[roll_die, check_prime], + ) + + test_queries = [ + "Hi! Can you help me?", + "Roll a 20-sided die for me", + "Is the number you rolled prime?", + ] + + # Accumulate conversation context so turn 3 ("Is it prime?") can reference + # the number rolled in turn 2. openai-agents Runner.run_sync() is stateless + # per call, so we thread prior messages via result.to_input_list() — the SDK's + # canonical way to carry full response history (tool calls, tool results, etc.) + # into the next turn. Raw role/content dicts can silently drop tool-call context. + conversation_input: list = [] + + try: + for i, query in enumerate(test_queries, 1): + print(f"\n[{i}/{len(test_queries)}] User: {query}") + + conversation_input.append({"role": "user", "content": query}) + result = Runner.run_sync(agent, conversation_input) + + agent_response = result.final_output or "" + print(f" Agent: {agent_response}") + + # to_input_list() returns the full turn history including tool calls and + # results, which is what the SDK expects as input for the next turn. + conversation_input = result.to_input_list() + finally: + # Always flush — even if a turn raises (rate limit, network error, etc.) + # so that whatever spans were recorded make it to the OTLP receiver. + print() + tracer_provider.force_flush() + logger_provider.force_flush() + print("All traces and logs flushed to OTLP receiver.") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index cd82fc4..8cee168 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "agentevals-cli" -version = "0.5.2" +version = "0.5.3" description = "Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces" readme = "README.md" requires-python = ">=3.11" diff --git a/tests/integration/test_live_agents.py b/tests/integration/test_live_agents.py index 74c0f53..e647a22 100644 --- a/tests/integration/test_live_agents.py +++ b/tests/integration/test_live_agents.py @@ -248,6 +248,63 @@ def test_session_visible_via_api(self, live_servers): assert session_name in session_ids +@_skip_no_openai +class TestOpenAIAgentsZeroCode: + """Run the OpenAI Agents SDK zero-code OTLP example and verify session grouping.""" + + def test_session_created_with_spans(self, live_servers): + main_port, otlp_port, mgr = live_servers + session_name = "e2e-openai-agents" + + result = _run_agent( + "examples/zero-code-examples/openai-agents/run.py", + otlp_port, + session_name, + ) + assert result.returncode == 0, f"Agent failed:\nstdout: {result.stdout}\nstderr: {result.stderr}" + + wait_for_session_complete_sync(mgr, session_name, timeout=30) + session = mgr.sessions[session_name] + + assert session.is_complete + assert session.source == "otlp" + assert len(session.spans) > 0, "Expected spans from LLM calls" + + def test_invocations_extracted(self, live_servers): + main_port, otlp_port, mgr = live_servers + session_name = "e2e-openai-agents-inv" + + result = _run_agent( + "examples/zero-code-examples/openai-agents/run.py", + otlp_port, + session_name, + ) + assert result.returncode == 0, f"Agent failed:\nstdout: {result.stdout}\nstderr: {result.stderr}" + + wait_for_session_complete_sync(mgr, session_name, timeout=30) + session = mgr.sessions[session_name] + + assert len(session.invocations) > 0, "Expected extracted invocations" + + def test_session_visible_via_api(self, live_servers): + main_port, otlp_port, mgr = live_servers + session_name = "e2e-openai-agents-api" + + result = _run_agent( + "examples/zero-code-examples/openai-agents/run.py", + otlp_port, + session_name, + ) + assert result.returncode == 0 + + wait_for_session_complete_sync(mgr, session_name, timeout=30) + + resp = httpx.get(f"http://127.0.0.1:{main_port}/api/streaming/sessions") + assert resp.status_code == 200 + session_ids = [s["sessionId"] for s in resp.json()["data"]] + assert session_name in session_ids + + @_skip_no_openai class TestAgentRerun: """Verify that re-running an agent with the same session_name creates From eaf7109231aceb053a3a3bd36cf7d5e845afd896 Mon Sep 17 00:00:00 2001 From: shahar-dagan <36898364+shahar-dagan@users.noreply.github.com> Date: Sun, 29 Mar 2026 20:29:45 -0700 Subject: [PATCH 2/4] add OpenAI Agents SDK zero-code example and e2e tests Self-contained dice-rolling agent showing zero-code OTLP integration with openai-agents>=0.3.3 via opentelemetry-instrumentation-openai-agents-v2. Includes run.py, requirements.txt, a golden multi-turn eval_set.json, and TestOpenAIAgentsZeroCode e2e tests for session/span/invocation/API verification. Uses result.to_input_list() for correct conversation context threading across turns, and try/finally to guarantee force_flush() even on API errors. Co-Authored-By: Claude Sonnet 4.6 --- .../openai-agents/eval_set.json | 74 +++++++++ .../openai-agents/requirements.txt | 6 + .../zero-code-examples/openai-agents/run.py | 143 ++++++++++++++++++ tests/integration/test_live_agents.py | 57 +++++++ 4 files changed, 280 insertions(+) create mode 100644 examples/zero-code-examples/openai-agents/eval_set.json create mode 100644 examples/zero-code-examples/openai-agents/requirements.txt create mode 100644 examples/zero-code-examples/openai-agents/run.py diff --git a/examples/zero-code-examples/openai-agents/eval_set.json b/examples/zero-code-examples/openai-agents/eval_set.json new file mode 100644 index 0000000..4904d2c --- /dev/null +++ b/examples/zero-code-examples/openai-agents/eval_set.json @@ -0,0 +1,74 @@ +{ + "eval_set_id": "openai_agents_eval", + "name": "OpenAI Agents Dice Agent Eval Set", + "description": "Golden eval cases for the OpenAI Agents SDK zero-code example. One multi-turn session covering greeting, die roll, and prime check.", + "eval_cases": [ + { + "eval_id": "dice-roll-session", + "conversation": [ + { + "invocation_id": "inv-1", + "user_content": { + "role": "user", + "parts": [ + {"text": "Hi! Can you help me?"} + ] + }, + "final_response": { + "role": "model", + "parts": [ + {"text": "Hello! Yes, I can help you. I have tools that can roll dice and check if numbers are prime. What would you like me to do?"} + ] + } + }, + { + "invocation_id": "inv-2", + "user_content": { + "role": "user", + "parts": [ + {"text": "Roll a 20-sided die for me"} + ] + }, + "final_response": { + "role": "model", + "parts": [ + {"text": "I rolled a 20-sided die and got [number]. Let me know if you'd like to check if it's prime!"} + ] + }, + "intermediate_data": { + "tool_uses": [ + { + "name": "roll_die", + "args": {"sides": 20}, + "id": "call_roll_die_1" + } + ], + "tool_responses": [ + { + "name": "roll_die", + "response": { + "content": [{"type": "text", "text": "[number]"}] + } + } + ] + } + }, + { + "invocation_id": "inv-3", + "user_content": { + "role": "user", + "parts": [ + {"text": "Is the number you rolled prime?"} + ] + }, + "final_response": { + "role": "model", + "parts": [ + {"text": "[number] is [prime/not prime]."} + ] + } + } + ] + } + ] +} diff --git a/examples/zero-code-examples/openai-agents/requirements.txt b/examples/zero-code-examples/openai-agents/requirements.txt new file mode 100644 index 0000000..b3bc37f --- /dev/null +++ b/examples/zero-code-examples/openai-agents/requirements.txt @@ -0,0 +1,6 @@ +openai-agents>=0.3.3 +opentelemetry-instrumentation-openai-agents-v2>=0.1.0 + +opentelemetry-sdk>=1.36.0 +opentelemetry-exporter-otlp-proto-http>=1.36.0 +python-dotenv>=1.0.0 diff --git a/examples/zero-code-examples/openai-agents/run.py b/examples/zero-code-examples/openai-agents/run.py new file mode 100644 index 0000000..5d8c054 --- /dev/null +++ b/examples/zero-code-examples/openai-agents/run.py @@ -0,0 +1,143 @@ +"""Run a dice-rolling OpenAI Agents SDK agent with OTLP export — no agentevals SDK. + +Demonstrates zero-code integration: any OTel-instrumented agent streams +traces to agentevals by pointing the OTLP exporter at the receiver. + +Unlike the LangChain and Strands examples, this one is fully self-contained: +the agent code lives inline with no cross-folder imports. + +Prerequisites: + 1. pip install -r requirements.txt + 2. agentevals serve --dev + 3. export OPENAI_API_KEY="your-key-here" + +Usage: + python examples/zero-code-examples/openai-agents/run.py +""" + +import os +import random + +from agents import Agent, Runner, function_tool +from dotenv import load_dotenv +from opentelemetry import trace +from opentelemetry._logs import set_logger_provider +from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.openai_agents import OpenAIAgentsInstrumentor +from opentelemetry.sdk._logs import LoggerProvider +from opentelemetry.sdk._logs.export import BatchLogRecordProcessor +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor + +load_dotenv(override=True) + + +# ── Tool definitions ────────────────────────────────────────────────────────── + +@function_tool +def roll_die(sides: int) -> int: + """Roll a die with the given number of sides and return the result.""" + return random.randint(1, sides) + + +@function_tool +def check_prime(number: int) -> bool: + """Return True if the number is prime, False otherwise.""" + if number < 2: + return False + for i in range(2, int(number**0.5) + 1): + if number % i == 0: + return False + return True + + +def main(): + if not os.getenv("OPENAI_API_KEY"): + print("OPENAI_API_KEY not set.") + return + + endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318") + print(f"OTLP endpoint: {endpoint}") + + # openai-agents-v2 uses ContentCaptureMode — use "span_and_event" for maximum + # content capture. agentevals reads from span attributes, so content will be + # visible in the UI. + os.environ["OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT"] = "span_and_event" + + os.environ.setdefault( + "OTEL_RESOURCE_ATTRIBUTES", + "agentevals.eval_set_id=openai_agents_eval,agentevals.session_name=openai-agents-zero-code", + ) + + # OTel setup flow: + # + # Resource (session/eval attrs) + # │ + # ├── TracerProvider → BatchSpanProcessor → OTLPSpanExporter → :4318 + # │ + # └── LoggerProvider → BatchLogRecordProcessor → OTLPLogExporter → :4318 + # (openai-agents-v2 may route message content to span attributes + # rather than log records; OTLPLogExporter is a no-op if unused) + # + # OpenAIAgentsInstrumentor.instrument() + # └── hooks into agents SDK → emits spans + (optionally) log records + # + # Runner.run_sync(agent, input) ← called with accumulated conversation + + resource = Resource.create() + + tracer_provider = TracerProvider(resource=resource) + tracer_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(), schedule_delay_millis=1000)) + trace.set_tracer_provider(tracer_provider) + + logger_provider = LoggerProvider(resource=resource) + logger_provider.add_log_record_processor(BatchLogRecordProcessor(OTLPLogExporter(), schedule_delay_millis=1000)) + set_logger_provider(logger_provider) + + OpenAIAgentsInstrumentor().instrument() + + agent = Agent( + name="Dice Agent", + instructions="You are a helpful assistant. You can roll dice and check if numbers are prime.", + tools=[roll_die, check_prime], + ) + + test_queries = [ + "Hi! Can you help me?", + "Roll a 20-sided die for me", + "Is the number you rolled prime?", + ] + + # Accumulate conversation context so turn 3 ("Is it prime?") can reference + # the number rolled in turn 2. openai-agents Runner.run_sync() is stateless + # per call, so we thread prior messages via result.to_input_list() — the SDK's + # canonical way to carry full response history (tool calls, tool results, etc.) + # into the next turn. Raw role/content dicts can silently drop tool-call context. + conversation_input: list = [] + + try: + for i, query in enumerate(test_queries, 1): + print(f"\n[{i}/{len(test_queries)}] User: {query}") + + conversation_input.append({"role": "user", "content": query}) + result = Runner.run_sync(agent, conversation_input) + + agent_response = result.final_output or "" + print(f" Agent: {agent_response}") + + # to_input_list() returns the full turn history including tool calls and + # results, which is what the SDK expects as input for the next turn. + conversation_input = result.to_input_list() + finally: + # Always flush — even if a turn raises (rate limit, network error, etc.) + # so that whatever spans were recorded make it to the OTLP receiver. + print() + tracer_provider.force_flush() + logger_provider.force_flush() + print("All traces and logs flushed to OTLP receiver.") + + +if __name__ == "__main__": + main() diff --git a/tests/integration/test_live_agents.py b/tests/integration/test_live_agents.py index 74c0f53..e647a22 100644 --- a/tests/integration/test_live_agents.py +++ b/tests/integration/test_live_agents.py @@ -248,6 +248,63 @@ def test_session_visible_via_api(self, live_servers): assert session_name in session_ids +@_skip_no_openai +class TestOpenAIAgentsZeroCode: + """Run the OpenAI Agents SDK zero-code OTLP example and verify session grouping.""" + + def test_session_created_with_spans(self, live_servers): + main_port, otlp_port, mgr = live_servers + session_name = "e2e-openai-agents" + + result = _run_agent( + "examples/zero-code-examples/openai-agents/run.py", + otlp_port, + session_name, + ) + assert result.returncode == 0, f"Agent failed:\nstdout: {result.stdout}\nstderr: {result.stderr}" + + wait_for_session_complete_sync(mgr, session_name, timeout=30) + session = mgr.sessions[session_name] + + assert session.is_complete + assert session.source == "otlp" + assert len(session.spans) > 0, "Expected spans from LLM calls" + + def test_invocations_extracted(self, live_servers): + main_port, otlp_port, mgr = live_servers + session_name = "e2e-openai-agents-inv" + + result = _run_agent( + "examples/zero-code-examples/openai-agents/run.py", + otlp_port, + session_name, + ) + assert result.returncode == 0, f"Agent failed:\nstdout: {result.stdout}\nstderr: {result.stderr}" + + wait_for_session_complete_sync(mgr, session_name, timeout=30) + session = mgr.sessions[session_name] + + assert len(session.invocations) > 0, "Expected extracted invocations" + + def test_session_visible_via_api(self, live_servers): + main_port, otlp_port, mgr = live_servers + session_name = "e2e-openai-agents-api" + + result = _run_agent( + "examples/zero-code-examples/openai-agents/run.py", + otlp_port, + session_name, + ) + assert result.returncode == 0 + + wait_for_session_complete_sync(mgr, session_name, timeout=30) + + resp = httpx.get(f"http://127.0.0.1:{main_port}/api/streaming/sessions") + assert resp.status_code == 200 + session_ids = [s["sessionId"] for s in resp.json()["data"]] + assert session_name in session_ids + + @_skip_no_openai class TestAgentRerun: """Verify that re-running an agent with the same session_name creates From 0dd669aa0848a9bdd891cfa61dd973b7535e18c8 Mon Sep 17 00:00:00 2001 From: shahar-dagan <36898364+shahar-dagan@users.noreply.github.com> Date: Mon, 30 Mar 2026 10:45:41 -0700 Subject: [PATCH 3/4] address PR review feedback - remove CHANGELOG.md, TODOS.md, and eval_set.json (not needed) - revert pyproject.toml version bump to 0.5.2 - reduce comments in openai-agents run.py to match other examples - remove LoggerProvider: openai-agents instrumentation only emits spans Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 13 ---- TODOS.md | 7 -- .../openai-agents/eval_set.json | 74 ------------------- .../zero-code-examples/openai-agents/run.py | 40 +--------- pyproject.toml | 2 +- 5 files changed, 2 insertions(+), 134 deletions(-) delete mode 100644 CHANGELOG.md delete mode 100644 TODOS.md delete mode 100644 examples/zero-code-examples/openai-agents/eval_set.json diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 3f5c676..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,13 +0,0 @@ -# Changelog - -All notable changes to this project will be documented in this file. - -## [0.5.3] - 2026-03-29 - -### Added -- **OpenAI Agents SDK zero-code example** (`examples/zero-code-examples/openai-agents/`): a self-contained dice-rolling agent that demonstrates zero-code OTLP integration with the OpenAI Agents SDK (`openai-agents>=0.3.3`) via `opentelemetry-instrumentation-openai-agents-v2`. Includes `run.py`, `requirements.txt`, and a golden `eval_set.json` with a multi-turn conversation case. -- **E2E integration tests** for the OpenAI Agents SDK example (`TestOpenAIAgentsZeroCode` in `tests/integration/test_live_agents.py`): verifies session creation, span emission, invocation extraction, and API visibility. - -### Fixed -- Conversation context threading in `run.py` now uses `result.to_input_list()` (the SDK-idiomatic pattern) instead of manually appending raw role/content dicts, ensuring tool call history is preserved across turns. -- `force_flush()` is now called in a `try/finally` block, guaranteeing spans are sent to the OTLP receiver even when an API error occurs mid-conversation. diff --git a/TODOS.md b/TODOS.md deleted file mode 100644 index a4b2993..0000000 --- a/TODOS.md +++ /dev/null @@ -1,7 +0,0 @@ -# TODOs - -## Eval set schema migration -- **What:** Update `examples/langchain_agent/eval_set.json` and `examples/strands_agent/eval_set.json` to use documented schema (`eval_id`/`final_response`) instead of stale format (`case_id`/`agent_content`). -- **Why:** The new openai-agents eval set uses the correct format. Without this fix, the langchain/strands examples become the odd ones out and teach new contributors the wrong pattern. -- **Context:** Discovered during openai-agents zero-code integration review (2026-03-29). Schema loader likely accepts both formats so no behavior change. Reference: `docs/eval-set-format.md`, `samples/eval_set_helm.json` (already correct). -- **Blocked by:** Nothing. Two-line JSON key rename per file. diff --git a/examples/zero-code-examples/openai-agents/eval_set.json b/examples/zero-code-examples/openai-agents/eval_set.json deleted file mode 100644 index 4904d2c..0000000 --- a/examples/zero-code-examples/openai-agents/eval_set.json +++ /dev/null @@ -1,74 +0,0 @@ -{ - "eval_set_id": "openai_agents_eval", - "name": "OpenAI Agents Dice Agent Eval Set", - "description": "Golden eval cases for the OpenAI Agents SDK zero-code example. One multi-turn session covering greeting, die roll, and prime check.", - "eval_cases": [ - { - "eval_id": "dice-roll-session", - "conversation": [ - { - "invocation_id": "inv-1", - "user_content": { - "role": "user", - "parts": [ - {"text": "Hi! Can you help me?"} - ] - }, - "final_response": { - "role": "model", - "parts": [ - {"text": "Hello! Yes, I can help you. I have tools that can roll dice and check if numbers are prime. What would you like me to do?"} - ] - } - }, - { - "invocation_id": "inv-2", - "user_content": { - "role": "user", - "parts": [ - {"text": "Roll a 20-sided die for me"} - ] - }, - "final_response": { - "role": "model", - "parts": [ - {"text": "I rolled a 20-sided die and got [number]. Let me know if you'd like to check if it's prime!"} - ] - }, - "intermediate_data": { - "tool_uses": [ - { - "name": "roll_die", - "args": {"sides": 20}, - "id": "call_roll_die_1" - } - ], - "tool_responses": [ - { - "name": "roll_die", - "response": { - "content": [{"type": "text", "text": "[number]"}] - } - } - ] - } - }, - { - "invocation_id": "inv-3", - "user_content": { - "role": "user", - "parts": [ - {"text": "Is the number you rolled prime?"} - ] - }, - "final_response": { - "role": "model", - "parts": [ - {"text": "[number] is [prime/not prime]."} - ] - } - } - ] - } - ] -} diff --git a/examples/zero-code-examples/openai-agents/run.py b/examples/zero-code-examples/openai-agents/run.py index 5d8c054..3bd9b46 100644 --- a/examples/zero-code-examples/openai-agents/run.py +++ b/examples/zero-code-examples/openai-agents/run.py @@ -21,12 +21,8 @@ from agents import Agent, Runner, function_tool from dotenv import load_dotenv from opentelemetry import trace -from opentelemetry._logs import set_logger_provider -from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from opentelemetry.instrumentation.openai_agents import OpenAIAgentsInstrumentor -from opentelemetry.sdk._logs import LoggerProvider -from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor @@ -34,8 +30,6 @@ load_dotenv(override=True) -# ── Tool definitions ────────────────────────────────────────────────────────── - @function_tool def roll_die(sides: int) -> int: """Roll a die with the given number of sides and return the result.""" @@ -61,9 +55,6 @@ def main(): endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318") print(f"OTLP endpoint: {endpoint}") - # openai-agents-v2 uses ContentCaptureMode — use "span_and_event" for maximum - # content capture. agentevals reads from span attributes, so content will be - # visible in the UI. os.environ["OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT"] = "span_and_event" os.environ.setdefault( @@ -71,31 +62,12 @@ def main(): "agentevals.eval_set_id=openai_agents_eval,agentevals.session_name=openai-agents-zero-code", ) - # OTel setup flow: - # - # Resource (session/eval attrs) - # │ - # ├── TracerProvider → BatchSpanProcessor → OTLPSpanExporter → :4318 - # │ - # └── LoggerProvider → BatchLogRecordProcessor → OTLPLogExporter → :4318 - # (openai-agents-v2 may route message content to span attributes - # rather than log records; OTLPLogExporter is a no-op if unused) - # - # OpenAIAgentsInstrumentor.instrument() - # └── hooks into agents SDK → emits spans + (optionally) log records - # - # Runner.run_sync(agent, input) ← called with accumulated conversation - resource = Resource.create() tracer_provider = TracerProvider(resource=resource) tracer_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(), schedule_delay_millis=1000)) trace.set_tracer_provider(tracer_provider) - logger_provider = LoggerProvider(resource=resource) - logger_provider.add_log_record_processor(BatchLogRecordProcessor(OTLPLogExporter(), schedule_delay_millis=1000)) - set_logger_provider(logger_provider) - OpenAIAgentsInstrumentor().instrument() agent = Agent( @@ -110,11 +82,6 @@ def main(): "Is the number you rolled prime?", ] - # Accumulate conversation context so turn 3 ("Is it prime?") can reference - # the number rolled in turn 2. openai-agents Runner.run_sync() is stateless - # per call, so we thread prior messages via result.to_input_list() — the SDK's - # canonical way to carry full response history (tool calls, tool results, etc.) - # into the next turn. Raw role/content dicts can silently drop tool-call context. conversation_input: list = [] try: @@ -127,16 +94,11 @@ def main(): agent_response = result.final_output or "" print(f" Agent: {agent_response}") - # to_input_list() returns the full turn history including tool calls and - # results, which is what the SDK expects as input for the next turn. conversation_input = result.to_input_list() finally: - # Always flush — even if a turn raises (rate limit, network error, etc.) - # so that whatever spans were recorded make it to the OTLP receiver. print() tracer_provider.force_flush() - logger_provider.force_flush() - print("All traces and logs flushed to OTLP receiver.") + print("All traces flushed to OTLP receiver.") if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 8cee168..cd82fc4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "agentevals-cli" -version = "0.5.3" +version = "0.5.2" description = "Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces" readme = "README.md" requires-python = ">=3.11" From a625609e3642da9181301c17c5d92b3326b333b2 Mon Sep 17 00:00:00 2001 From: shahar-dagan <36898364+shahar-dagan@users.noreply.github.com> Date: Mon, 30 Mar 2026 14:54:00 -0700 Subject: [PATCH 4/4] use setdefault for OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT Co-Authored-By: Claude Sonnet 4.6 --- examples/zero-code-examples/openai-agents/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/zero-code-examples/openai-agents/run.py b/examples/zero-code-examples/openai-agents/run.py index 3bd9b46..6618159 100644 --- a/examples/zero-code-examples/openai-agents/run.py +++ b/examples/zero-code-examples/openai-agents/run.py @@ -55,7 +55,7 @@ def main(): endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318") print(f"OTLP endpoint: {endpoint}") - os.environ["OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT"] = "span_and_event" + os.environ.setdefault("OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT", "span_and_event") os.environ.setdefault( "OTEL_RESOURCE_ATTRIBUTES",