From aeda08ce041def6cfaf32da0062e53dce4a16284 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 12 Apr 2026 15:27:05 +0000 Subject: [PATCH 1/5] Initial plan From 059c2cb6048c027c52d82f4508a7664e1364a5d4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 12 Apr 2026 15:31:55 +0000 Subject: [PATCH 2/5] Initial plan for OTel span events Agent-Logs-Url: https://github.com/github/gh-aw/sessions/6bdc94c1-f53f-409c-b275-415885eaf9ea Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- .github/workflows/agentics-maintenance.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/agentics-maintenance.yml b/.github/workflows/agentics-maintenance.yml index 5dac0c9a07a..8c142f3ab1a 100644 --- a/.github/workflows/agentics-maintenance.yml +++ b/.github/workflows/agentics-maintenance.yml @@ -281,7 +281,7 @@ jobs: validate_workflows: if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.operation == 'validate' && !github.event.repository.fork }} - runs-on: ubuntu-latest + runs-on: ubuntu-slim permissions: contents: read issues: write @@ -315,11 +315,6 @@ jobs: - name: Build gh-aw run: make build - - name: Start Docker daemon - run: | - sudo systemctl start docker - docker info - - name: Validate workflows and file issue on findings uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 env: From d96f92cc8ddf7654a1bc172637a16ac994b4b7d7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 12 Apr 2026 15:54:14 +0000 Subject: [PATCH 3/5] feat: add OTel exception span events for individual agent errors Following the OpenTelemetry exception event semantic convention, each agent error is now emitted as a separate span event with name="exception" and an "exception.message" attribute, making individual errors 
queryable in backends like Grafana Tempo, Honeycomb, and Datadog. Changes: - Add `events` parameter to `buildOTLPPayload` in send_otlp_span.cjs - Emit one exception span event per error in `sendJobConclusionSpan` - Add tests for events support in buildOTLPPayload (4 tests) - Add tests for exception events in sendJobConclusionSpan (6 tests) Agent-Logs-Url: https://github.com/github/gh-aw/sessions/6bdc94c1-f53f-409c-b275-415885eaf9ea Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- actions/setup/js/send_otlp_span.cjs | 20 ++- actions/setup/js/send_otlp_span.test.cjs | 201 +++++++++++++++++++++++ pkg/workflow/maintenance_workflow.go | 4 +- 3 files changed, 222 insertions(+), 3 deletions(-) diff --git a/actions/setup/js/send_otlp_span.cjs b/actions/setup/js/send_otlp_span.cjs index c70c04891a8..08f5fae68c9 100644 --- a/actions/setup/js/send_otlp_span.cjs +++ b/actions/setup/js/send_otlp_span.cjs @@ -104,6 +104,7 @@ const SPAN_KIND_CONSUMER = 5; * @property {number} [statusCode] - OTLP status code: 0=UNSET, 1=OK, 2=ERROR (defaults to 1) * @property {string} [statusMessage] - Human-readable status message (included when statusCode is 2) * @property {number} [kind] - OTLP SpanKind: use SPAN_KIND_* constants. Defaults to SPAN_KIND_INTERNAL (1). + * @property {Array<{timeUnixNano: string, name: string, attributes: Array<{key: string, value: object}>}>} [events] - Span events following the OTel events spec (e.g. exception events). 
*/ /** @@ -112,7 +113,7 @@ const SPAN_KIND_CONSUMER = 5; * @param {OTLPSpanOptions} opts * @returns {object} - Ready to be serialised as JSON and POSTed to `/v1/traces` */ -function buildOTLPPayload({ traceId, spanId, parentSpanId, spanName, startMs, endMs, serviceName, scopeVersion, attributes, resourceAttributes, statusCode, statusMessage, kind = SPAN_KIND_INTERNAL }) { +function buildOTLPPayload({ traceId, spanId, parentSpanId, spanName, startMs, endMs, serviceName, scopeVersion, attributes, resourceAttributes, statusCode, statusMessage, kind = SPAN_KIND_INTERNAL, events }) { const code = typeof statusCode === "number" ? statusCode : 1; // STATUS_CODE_OK /** @type {{ code: number, message?: string }} */ const status = { code }; @@ -144,6 +145,7 @@ function buildOTLPPayload({ traceId, spanId, parentSpanId, spanName, startMs, en endTimeUnixNano: toNanoString(endMs), status, attributes, + ...(events && events.length > 0 ? { events } : {}), }, ], }, @@ -762,6 +764,21 @@ async function sendJobConclusionSpan(spanName, options = {}) { } resourceAttributes.push(buildAttr("deployment.environment", staged ? "staging" : "production")); + // Build OTel exception span events — one per error — following the + // OpenTelemetry semantic convention for exceptions. Each event has + // name="exception" and an "exception.message" attribute, making individual + // errors queryable in backends like Grafana Tempo, Honeycomb, and Datadog. + const spanEvents = isAgentFailure + ? outputErrors + .map(e => (e && typeof e.message === "string" ? 
e.message : String(e))) + .filter(Boolean) + .map(msg => ({ + timeUnixNano: toNanoString(nowMs()), + name: "exception", + attributes: [buildAttr("exception.message", msg.slice(0, MAX_ATTR_VALUE_LENGTH))], + })) + : []; + const payload = buildOTLPPayload({ traceId, spanId: generateSpanId(), @@ -775,6 +792,7 @@ async function sendJobConclusionSpan(spanName, options = {}) { resourceAttributes, statusCode, statusMessage, + events: spanEvents, }); // Always mirror to JSONL — the artifact is useful even without a live collector. diff --git a/actions/setup/js/send_otlp_span.test.cjs b/actions/setup/js/send_otlp_span.test.cjs index c2b47aed9b0..b968047a805 100644 --- a/actions/setup/js/send_otlp_span.test.cjs +++ b/actions/setup/js/send_otlp_span.test.cjs @@ -321,6 +321,80 @@ describe("buildOTLPPayload", () => { const span = payload.resourceSpans[0].scopeSpans[0].spans[0]; expect(span.kind).toBe(SPAN_KIND_SERVER); }); + + it("includes events array in span when events are provided", () => { + const events = [ + { + timeUnixNano: toNanoString(1000), + name: "exception", + attributes: [buildAttr("exception.message", "something failed")], + }, + ]; + const payload = buildOTLPPayload({ + traceId: "a".repeat(32), + spanId: "b".repeat(16), + spanName: "test", + startMs: 0, + endMs: 1, + serviceName: "gh-aw", + attributes: [], + events, + }); + const span = payload.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toHaveLength(1); + expect(span.events[0].name).toBe("exception"); + expect(span.events[0].attributes).toContainEqual({ key: "exception.message", value: { stringValue: "something failed" } }); + }); + + it("includes multiple events when provided", () => { + const events = [ + { timeUnixNano: toNanoString(1000), name: "exception", attributes: [buildAttr("exception.message", "error A")] }, + { timeUnixNano: toNanoString(1000), name: "exception", attributes: [buildAttr("exception.message", "error B")] }, + ]; + const payload = buildOTLPPayload({ + traceId: 
"a".repeat(32), + spanId: "b".repeat(16), + spanName: "test", + startMs: 0, + endMs: 1, + serviceName: "gh-aw", + attributes: [], + events, + }); + const span = payload.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toHaveLength(2); + expect(span.events[0].attributes[0].value.stringValue).toBe("error A"); + expect(span.events[1].attributes[0].value.stringValue).toBe("error B"); + }); + + it("omits events from span when events array is empty", () => { + const payload = buildOTLPPayload({ + traceId: "a".repeat(32), + spanId: "b".repeat(16), + spanName: "test", + startMs: 0, + endMs: 1, + serviceName: "gh-aw", + attributes: [], + events: [], + }); + const span = payload.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toBeUndefined(); + }); + + it("omits events from span when events is not provided", () => { + const payload = buildOTLPPayload({ + traceId: "a".repeat(32), + spanId: "b".repeat(16), + spanName: "test", + startMs: 0, + endMs: 1, + serviceName: "gh-aw", + attributes: [], + }); + const span = payload.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toBeUndefined(); + }); }); // --------------------------------------------------------------------------- @@ -1945,6 +2019,133 @@ describe("sendJobConclusionSpan", () => { expect(keys).not.toContain("gh-aw.error.messages"); expect(span.status.message).toBe("agent failure"); }); + + it("emits one exception span event per error on agent failure", async () => { + const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" }); + vi.stubGlobal("fetch", mockFetch); + + process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com"; + process.env.GH_AW_AGENT_CONCLUSION = "failure"; + + readFileSpy.mockImplementation(filePath => { + if (filePath === "/tmp/gh-aw/agent_output.json") { + return JSON.stringify({ errors: [{ message: "Rate limit exceeded" }, { message: "Tool call failed" }] }); + } + throw Object.assign(new Error("ENOENT"), { code: 
"ENOENT" }); + }); + + await sendJobConclusionSpan("gh-aw.job.conclusion"); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + const span = body.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toHaveLength(2); + expect(span.events[0].name).toBe("exception"); + expect(span.events[0].attributes).toContainEqual({ key: "exception.message", value: { stringValue: "Rate limit exceeded" } }); + expect(span.events[1].name).toBe("exception"); + expect(span.events[1].attributes).toContainEqual({ key: "exception.message", value: { stringValue: "Tool call failed" } }); + }); + + it("truncates exception.message to 1024 characters", async () => { + const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" }); + vi.stubGlobal("fetch", mockFetch); + + process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com"; + process.env.GH_AW_AGENT_CONCLUSION = "failure"; + + const longMessage = "x".repeat(2000); + readFileSpy.mockImplementation(filePath => { + if (filePath === "/tmp/gh-aw/agent_output.json") { + return JSON.stringify({ errors: [{ message: longMessage }] }); + } + throw Object.assign(new Error("ENOENT"), { code: "ENOENT" }); + }); + + await sendJobConclusionSpan("gh-aw.job.conclusion"); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + const span = body.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toHaveLength(1); + const msg = span.events[0].attributes.find(a => a.key === "exception.message"); + expect(msg.value.stringValue.length).toBe(1024); + }); + + it("does not emit exception events when agent conclusion is success", async () => { + const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" }); + vi.stubGlobal("fetch", mockFetch); + + process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com"; + process.env.GH_AW_AGENT_CONCLUSION = "success"; + + await sendJobConclusionSpan("gh-aw.job.conclusion"); + + const body = 
JSON.parse(mockFetch.mock.calls[0][1].body); + const span = body.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toBeUndefined(); + }); + + it("does not emit exception events when agent_output.json is absent on failure", async () => { + const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" }); + vi.stubGlobal("fetch", mockFetch); + + process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com"; + process.env.GH_AW_AGENT_CONCLUSION = "failure"; + + // readFileSpy already throws ENOENT for all paths (set in beforeEach) + + await sendJobConclusionSpan("gh-aw.job.conclusion"); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + const span = body.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toBeUndefined(); + }); + + it("emits exception events for all errors (not capped at 5 like error messages attribute)", async () => { + const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" }); + vi.stubGlobal("fetch", mockFetch); + + process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com"; + process.env.GH_AW_AGENT_CONCLUSION = "failure"; + + const manyErrors = [1, 2, 3, 4, 5, 6, 7].map(i => ({ message: `Error ${i}` })); + readFileSpy.mockImplementation(filePath => { + if (filePath === "/tmp/gh-aw/agent_output.json") { + return JSON.stringify({ errors: manyErrors }); + } + throw Object.assign(new Error("ENOENT"), { code: "ENOENT" }); + }); + + await sendJobConclusionSpan("gh-aw.job.conclusion"); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + const span = body.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toHaveLength(7); + for (let i = 0; i < 7; i++) { + expect(span.events[i].name).toBe("exception"); + expect(span.events[i].attributes).toContainEqual({ key: "exception.message", value: { stringValue: `Error ${i + 1}` } }); + } + }); + + it("sets valid timeUnixNano on each exception event", async () => { + const 
mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" }); + vi.stubGlobal("fetch", mockFetch); + + process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com"; + process.env.GH_AW_AGENT_CONCLUSION = "failure"; + + readFileSpy.mockImplementation(filePath => { + if (filePath === "/tmp/gh-aw/agent_output.json") { + return JSON.stringify({ errors: [{ message: "test error" }] }); + } + throw Object.assign(new Error("ENOENT"), { code: "ENOENT" }); + }); + + await sendJobConclusionSpan("gh-aw.job.conclusion"); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + const span = body.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toHaveLength(1); + expect(span.events[0].timeUnixNano).toMatch(/^\d+$/); + }); }); describe("rate-limit enrichment in conclusion span", () => { diff --git a/pkg/workflow/maintenance_workflow.go b/pkg/workflow/maintenance_workflow.go index c7c2b1fb2ee..5f582cd2335 100644 --- a/pkg/workflow/maintenance_workflow.go +++ b/pkg/workflow/maintenance_workflow.go @@ -737,7 +737,7 @@ func buildNotDispatchOrEmptyOperation() ConditionNode { // buildNotForkAndScheduledOrOperation creates a condition for jobs that run on // schedule (or empty operation) AND when a specific operation is selected. -// Condition: !fork && (not_dispatch || operation == '' || operation == op) +// Condition: !fork && (not_dispatch || operation == "" || operation == op) func buildNotForkAndScheduledOrOperation(operation string) ConditionNode { return BuildAnd( buildNotForkCondition(), @@ -753,7 +753,7 @@ func buildNotForkAndScheduledOrOperation(operation string) ConditionNode { // buildRunOperationCondition creates the condition for the unified run_operation // job that handles all dispatch operations except the ones with dedicated jobs. -// Condition: dispatch && operation != '' && operation != each excluded && !fork. +// Condition: dispatch && operation != "" && operation != each excluded && !fork. 
func buildRunOperationCondition(excludedOperations ...string) ConditionNode { // Start with: event is workflow_dispatch AND operation is not empty condition := BuildAnd( From cfd840345440e5e1ada95a8ae97e2f5b7934d608 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 12 Apr 2026 15:58:30 +0000 Subject: [PATCH 4/5] refactor: capture nowMs() once for all exception event timestamps Agent-Logs-Url: https://github.com/github/gh-aw/sessions/6bdc94c1-f53f-409c-b275-415885eaf9ea Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- actions/setup/js/send_otlp_span.cjs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/actions/setup/js/send_otlp_span.cjs b/actions/setup/js/send_otlp_span.cjs index 08f5fae68c9..1cd009cf4b0 100644 --- a/actions/setup/js/send_otlp_span.cjs +++ b/actions/setup/js/send_otlp_span.cjs @@ -768,12 +768,13 @@ async function sendJobConclusionSpan(spanName, options = {}) { // OpenTelemetry semantic convention for exceptions. Each event has // name="exception" and an "exception.message" attribute, making individual // errors queryable in backends like Grafana Tempo, Honeycomb, and Datadog. + const errorTimeNano = toNanoString(nowMs()); const spanEvents = isAgentFailure ? outputErrors .map(e => (e && typeof e.message === "string" ? 
e.message : String(e))) .filter(Boolean) .map(msg => ({ - timeUnixNano: toNanoString(nowMs()), + timeUnixNano: errorTimeNano, name: "exception", attributes: [buildAttr("exception.message", msg.slice(0, MAX_ATTR_VALUE_LENGTH))], })) From 4afd5e9dc5b94a8ee056cadc206c23ad6e85955b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 12 Apr 2026 16:38:55 +0000 Subject: [PATCH 5/5] fix: use ubuntu-latest for validate_workflows job in maintenance workflow Agent-Logs-Url: https://github.com/github/gh-aw/sessions/5da2db3c-0a28-4ed3-bd61-f29c5af342ea Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- .github/workflows/agentics-maintenance.yml | 2 +- pkg/workflow/maintenance_workflow.go | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/agentics-maintenance.yml b/.github/workflows/agentics-maintenance.yml index 8c142f3ab1a..6eb35db87c3 100644 --- a/.github/workflows/agentics-maintenance.yml +++ b/.github/workflows/agentics-maintenance.yml @@ -281,7 +281,7 @@ jobs: validate_workflows: if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.operation == 'validate' && !github.event.repository.fork }} - runs-on: ubuntu-slim + runs-on: ubuntu-latest permissions: contents: read issues: write diff --git a/pkg/workflow/maintenance_workflow.go b/pkg/workflow/maintenance_workflow.go index 5f582cd2335..0ef83099066 100644 --- a/pkg/workflow/maintenance_workflow.go +++ b/pkg/workflow/maintenance_workflow.go @@ -525,10 +525,12 @@ jobs: `) // Add validate_workflows job for workflow_dispatch with operation == 'validate' + // This job uses ubuntu-latest by default (needs full runner for CLI installation). 
+ validateRunsOnValue := FormatRunsOn(configuredRunsOn, "ubuntu-latest") yaml.WriteString(` validate_workflows: if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.operation == 'validate' && !github.event.repository.fork }} - runs-on: ` + runsOnValue + ` + runs-on: ` + validateRunsOnValue + ` permissions: contents: read issues: write