diff --git a/.github/workflows/agentics-maintenance.yml b/.github/workflows/agentics-maintenance.yml index 5dac0c9a07a..6eb35db87c3 100644 --- a/.github/workflows/agentics-maintenance.yml +++ b/.github/workflows/agentics-maintenance.yml @@ -315,11 +315,6 @@ jobs: - name: Build gh-aw run: make build - - name: Start Docker daemon - run: | - sudo systemctl start docker - docker info - - name: Validate workflows and file issue on findings uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 env: diff --git a/actions/setup/js/send_otlp_span.cjs b/actions/setup/js/send_otlp_span.cjs index c70c04891a8..1cd009cf4b0 100644 --- a/actions/setup/js/send_otlp_span.cjs +++ b/actions/setup/js/send_otlp_span.cjs @@ -104,6 +104,7 @@ const SPAN_KIND_CONSUMER = 5; * @property {number} [statusCode] - OTLP status code: 0=UNSET, 1=OK, 2=ERROR (defaults to 1) * @property {string} [statusMessage] - Human-readable status message (included when statusCode is 2) * @property {number} [kind] - OTLP SpanKind: use SPAN_KIND_* constants. Defaults to SPAN_KIND_INTERNAL (1). + * @property {Array<{timeUnixNano: string, name: string, attributes: Array<{key: string, value: object}>}>} [events] - Span events following the OTel events spec (e.g. exception events). */ /** @@ -112,7 +113,7 @@ const SPAN_KIND_CONSUMER = 5; * @param {OTLPSpanOptions} opts * @returns {object} - Ready to be serialised as JSON and POSTed to `/v1/traces` */ -function buildOTLPPayload({ traceId, spanId, parentSpanId, spanName, startMs, endMs, serviceName, scopeVersion, attributes, resourceAttributes, statusCode, statusMessage, kind = SPAN_KIND_INTERNAL }) { +function buildOTLPPayload({ traceId, spanId, parentSpanId, spanName, startMs, endMs, serviceName, scopeVersion, attributes, resourceAttributes, statusCode, statusMessage, kind = SPAN_KIND_INTERNAL, events }) { const code = typeof statusCode === "number" ? 
statusCode : 1; // STATUS_CODE_OK /** @type {{ code: number, message?: string }} */ const status = { code }; @@ -144,6 +145,7 @@ function buildOTLPPayload({ traceId, spanId, parentSpanId, spanName, startMs, en endTimeUnixNano: toNanoString(endMs), status, attributes, + ...(events && events.length > 0 ? { events } : {}), }, ], }, @@ -762,6 +764,22 @@ async function sendJobConclusionSpan(spanName, options = {}) { } resourceAttributes.push(buildAttr("deployment.environment", staged ? "staging" : "production")); + // Build OTel exception span events — one per error — following the + // OpenTelemetry semantic convention for exceptions. Each event has + // name="exception" and an "exception.message" attribute, making individual + // errors queryable in backends like Grafana Tempo, Honeycomb, and Datadog. + const errorTimeNano = toNanoString(nowMs()); + const spanEvents = isAgentFailure + ? outputErrors + .map(e => (e && typeof e.message === "string" ? e.message : String(e))) + .filter(Boolean) + .map(msg => ({ + timeUnixNano: errorTimeNano, + name: "exception", + attributes: [buildAttr("exception.message", msg.slice(0, MAX_ATTR_VALUE_LENGTH))], + })) + : []; + const payload = buildOTLPPayload({ traceId, spanId: generateSpanId(), @@ -775,6 +793,7 @@ async function sendJobConclusionSpan(spanName, options = {}) { resourceAttributes, statusCode, statusMessage, + events: spanEvents, }); // Always mirror to JSONL — the artifact is useful even without a live collector. 
diff --git a/actions/setup/js/send_otlp_span.test.cjs b/actions/setup/js/send_otlp_span.test.cjs index c2b47aed9b0..b968047a805 100644 --- a/actions/setup/js/send_otlp_span.test.cjs +++ b/actions/setup/js/send_otlp_span.test.cjs @@ -321,6 +321,80 @@ describe("buildOTLPPayload", () => { const span = payload.resourceSpans[0].scopeSpans[0].spans[0]; expect(span.kind).toBe(SPAN_KIND_SERVER); }); + + it("includes events array in span when events are provided", () => { + const events = [ + { + timeUnixNano: toNanoString(1000), + name: "exception", + attributes: [buildAttr("exception.message", "something failed")], + }, + ]; + const payload = buildOTLPPayload({ + traceId: "a".repeat(32), + spanId: "b".repeat(16), + spanName: "test", + startMs: 0, + endMs: 1, + serviceName: "gh-aw", + attributes: [], + events, + }); + const span = payload.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toHaveLength(1); + expect(span.events[0].name).toBe("exception"); + expect(span.events[0].attributes).toContainEqual({ key: "exception.message", value: { stringValue: "something failed" } }); + }); + + it("includes multiple events when provided", () => { + const events = [ + { timeUnixNano: toNanoString(1000), name: "exception", attributes: [buildAttr("exception.message", "error A")] }, + { timeUnixNano: toNanoString(1000), name: "exception", attributes: [buildAttr("exception.message", "error B")] }, + ]; + const payload = buildOTLPPayload({ + traceId: "a".repeat(32), + spanId: "b".repeat(16), + spanName: "test", + startMs: 0, + endMs: 1, + serviceName: "gh-aw", + attributes: [], + events, + }); + const span = payload.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toHaveLength(2); + expect(span.events[0].attributes[0].value.stringValue).toBe("error A"); + expect(span.events[1].attributes[0].value.stringValue).toBe("error B"); + }); + + it("omits events from span when events array is empty", () => { + const payload = buildOTLPPayload({ + traceId: "a".repeat(32), 
+ spanId: "b".repeat(16), + spanName: "test", + startMs: 0, + endMs: 1, + serviceName: "gh-aw", + attributes: [], + events: [], + }); + const span = payload.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toBeUndefined(); + }); + + it("omits events from span when events is not provided", () => { + const payload = buildOTLPPayload({ + traceId: "a".repeat(32), + spanId: "b".repeat(16), + spanName: "test", + startMs: 0, + endMs: 1, + serviceName: "gh-aw", + attributes: [], + }); + const span = payload.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toBeUndefined(); + }); }); // --------------------------------------------------------------------------- @@ -1945,6 +2019,133 @@ describe("sendJobConclusionSpan", () => { expect(keys).not.toContain("gh-aw.error.messages"); expect(span.status.message).toBe("agent failure"); }); + + it("emits one exception span event per error on agent failure", async () => { + const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" }); + vi.stubGlobal("fetch", mockFetch); + + process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com"; + process.env.GH_AW_AGENT_CONCLUSION = "failure"; + + readFileSpy.mockImplementation(filePath => { + if (filePath === "/tmp/gh-aw/agent_output.json") { + return JSON.stringify({ errors: [{ message: "Rate limit exceeded" }, { message: "Tool call failed" }] }); + } + throw Object.assign(new Error("ENOENT"), { code: "ENOENT" }); + }); + + await sendJobConclusionSpan("gh-aw.job.conclusion"); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + const span = body.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toHaveLength(2); + expect(span.events[0].name).toBe("exception"); + expect(span.events[0].attributes).toContainEqual({ key: "exception.message", value: { stringValue: "Rate limit exceeded" } }); + expect(span.events[1].name).toBe("exception"); + expect(span.events[1].attributes).toContainEqual({ key: 
"exception.message", value: { stringValue: "Tool call failed" } }); + }); + + it("truncates exception.message to 1024 characters", async () => { + const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" }); + vi.stubGlobal("fetch", mockFetch); + + process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com"; + process.env.GH_AW_AGENT_CONCLUSION = "failure"; + + const longMessage = "x".repeat(2000); + readFileSpy.mockImplementation(filePath => { + if (filePath === "/tmp/gh-aw/agent_output.json") { + return JSON.stringify({ errors: [{ message: longMessage }] }); + } + throw Object.assign(new Error("ENOENT"), { code: "ENOENT" }); + }); + + await sendJobConclusionSpan("gh-aw.job.conclusion"); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + const span = body.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toHaveLength(1); + const msg = span.events[0].attributes.find(a => a.key === "exception.message"); + expect(msg.value.stringValue.length).toBe(1024); + }); + + it("does not emit exception events when agent conclusion is success", async () => { + const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" }); + vi.stubGlobal("fetch", mockFetch); + + process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com"; + process.env.GH_AW_AGENT_CONCLUSION = "success"; + + await sendJobConclusionSpan("gh-aw.job.conclusion"); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + const span = body.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toBeUndefined(); + }); + + it("does not emit exception events when agent_output.json is absent on failure", async () => { + const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" }); + vi.stubGlobal("fetch", mockFetch); + + process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com"; + process.env.GH_AW_AGENT_CONCLUSION = "failure"; + + // readFileSpy already throws 
ENOENT for all paths (set in beforeEach) + + await sendJobConclusionSpan("gh-aw.job.conclusion"); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + const span = body.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toBeUndefined(); + }); + + it("emits exception events for all errors (not capped at 5 like error messages attribute)", async () => { + const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" }); + vi.stubGlobal("fetch", mockFetch); + + process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com"; + process.env.GH_AW_AGENT_CONCLUSION = "failure"; + + const manyErrors = [1, 2, 3, 4, 5, 6, 7].map(i => ({ message: `Error ${i}` })); + readFileSpy.mockImplementation(filePath => { + if (filePath === "/tmp/gh-aw/agent_output.json") { + return JSON.stringify({ errors: manyErrors }); + } + throw Object.assign(new Error("ENOENT"), { code: "ENOENT" }); + }); + + await sendJobConclusionSpan("gh-aw.job.conclusion"); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + const span = body.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toHaveLength(7); + for (let i = 0; i < 7; i++) { + expect(span.events[i].name).toBe("exception"); + expect(span.events[i].attributes).toContainEqual({ key: "exception.message", value: { stringValue: `Error ${i + 1}` } }); + } + }); + + it("sets valid timeUnixNano on each exception event", async () => { + const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" }); + vi.stubGlobal("fetch", mockFetch); + + process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com"; + process.env.GH_AW_AGENT_CONCLUSION = "failure"; + + readFileSpy.mockImplementation(filePath => { + if (filePath === "/tmp/gh-aw/agent_output.json") { + return JSON.stringify({ errors: [{ message: "test error" }] }); + } + throw Object.assign(new Error("ENOENT"), { code: "ENOENT" }); + }); + + await sendJobConclusionSpan("gh-aw.job.conclusion"); + 
+ const body = JSON.parse(mockFetch.mock.calls[0][1].body); + const span = body.resourceSpans[0].scopeSpans[0].spans[0]; + expect(span.events).toHaveLength(1); + expect(span.events[0].timeUnixNano).toMatch(/^\d+$/); + }); }); describe("rate-limit enrichment in conclusion span", () => { diff --git a/pkg/workflow/maintenance_workflow.go b/pkg/workflow/maintenance_workflow.go index c7c2b1fb2ee..0ef83099066 100644 --- a/pkg/workflow/maintenance_workflow.go +++ b/pkg/workflow/maintenance_workflow.go @@ -525,10 +525,12 @@ jobs: `) // Add validate_workflows job for workflow_dispatch with operation == 'validate' + // This job uses ubuntu-latest by default (needs full runner for CLI installation). + validateRunsOnValue := FormatRunsOn(configuredRunsOn, "ubuntu-latest") yaml.WriteString(` validate_workflows: if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.operation == 'validate' && !github.event.repository.fork }} - runs-on: ` + runsOnValue + ` + runs-on: ` + validateRunsOnValue + ` permissions: contents: read issues: write @@ -737,7 +739,7 @@ func buildNotDispatchOrEmptyOperation() ConditionNode { // buildNotForkAndScheduledOrOperation creates a condition for jobs that run on // schedule (or empty operation) AND when a specific operation is selected. -// Condition: !fork && (not_dispatch || operation == '' || operation == op) +// Condition: !fork && (not_dispatch || operation == '' || operation == op) func buildNotForkAndScheduledOrOperation(operation string) ConditionNode { return BuildAnd( buildNotForkCondition(), @@ -753,7 +755,7 @@ func buildNotForkAndScheduledOrOperation(operation string) ConditionNode { // buildRunOperationCondition creates the condition for the unified run_operation // job that handles all dispatch operations except the ones with dedicated jobs. -// Condition: dispatch && operation != '' && operation != each excluded && !fork. +// Condition: dispatch && operation != '' && operation != each excluded && !fork. 
func buildRunOperationCondition(excludedOperations ...string) ConditionNode { // Start with: event is workflow_dispatch AND operation is not empty condition := BuildAnd(