Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .github/workflows/agentics-maintenance.yml
Original file line number Diff line number Diff line change
Expand Up @@ -315,11 +315,6 @@ jobs:
- name: Build gh-aw
run: make build

Copy link

Copilot AI Apr 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The validate_workflows job runs gh aw compile --validate --zizmor --actionlint --poutine (via run_validate_workflows.cjs), and those scanners are executed via Docker (see e.g. pkg/cli/zizmor.go, pkg/cli/actionlint.go, pkg/cli/poutine.go). This PR removes the step that ensures the Docker daemon is running, which can cause validation to fail on runners where Docker isn’t started by default. Consider restoring the Docker start/check (e.g. docker info with a fallback to start the service) before running validation.

Suggested change
- name: Ensure Docker is running
run: |
docker info >/dev/null 2>&1 || {
sudo systemctl start docker || sudo service docker start
}
docker info

Copilot uses AI. Check for mistakes.
- name: Start Docker daemon
run: |
sudo systemctl start docker
docker info

- name: Validate workflows and file issue on findings
uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9
env:
Expand Down
21 changes: 20 additions & 1 deletion actions/setup/js/send_otlp_span.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ const SPAN_KIND_CONSUMER = 5;
* @property {number} [statusCode] - OTLP status code: 0=UNSET, 1=OK, 2=ERROR (defaults to 1)
* @property {string} [statusMessage] - Human-readable status message (included when statusCode is 2)
* @property {number} [kind] - OTLP SpanKind: use SPAN_KIND_* constants. Defaults to SPAN_KIND_INTERNAL (1).
* @property {Array<{timeUnixNano: string, name: string, attributes: Array<{key: string, value: object}>}>} [events] - Span events following the OTel events spec (e.g. exception events).
*/

/**
Expand All @@ -112,7 +113,7 @@ const SPAN_KIND_CONSUMER = 5;
* @param {OTLPSpanOptions} opts
* @returns {object} - Ready to be serialised as JSON and POSTed to `/v1/traces`
*/
function buildOTLPPayload({ traceId, spanId, parentSpanId, spanName, startMs, endMs, serviceName, scopeVersion, attributes, resourceAttributes, statusCode, statusMessage, kind = SPAN_KIND_INTERNAL }) {
function buildOTLPPayload({ traceId, spanId, parentSpanId, spanName, startMs, endMs, serviceName, scopeVersion, attributes, resourceAttributes, statusCode, statusMessage, kind = SPAN_KIND_INTERNAL, events }) {
const code = typeof statusCode === "number" ? statusCode : 1; // STATUS_CODE_OK
/** @type {{ code: number, message?: string }} */
const status = { code };
Expand Down Expand Up @@ -144,6 +145,7 @@ function buildOTLPPayload({ traceId, spanId, parentSpanId, spanName, startMs, en
endTimeUnixNano: toNanoString(endMs),
status,
attributes,
...(events && events.length > 0 ? { events } : {}),
},
],
},
Expand Down Expand Up @@ -762,6 +764,22 @@ async function sendJobConclusionSpan(spanName, options = {}) {
}
resourceAttributes.push(buildAttr("deployment.environment", staged ? "staging" : "production"));

// Build OTel exception span events — one per error — following the
// OpenTelemetry semantic convention for exceptions. Each event has
// name="exception" and an "exception.message" attribute, making individual
// errors queryable in backends like Grafana Tempo, Honeycomb, and Datadog.
const errorTimeNano = toNanoString(nowMs());
const spanEvents = isAgentFailure
? outputErrors
.map(e => (e && typeof e.message === "string" ? e.message : String(e)))
.filter(Boolean)
.map(msg => ({
timeUnixNano: errorTimeNano,
name: "exception",
attributes: [buildAttr("exception.message", msg.slice(0, MAX_ATTR_VALUE_LENGTH))],
}))
: [];

const payload = buildOTLPPayload({
traceId,
spanId: generateSpanId(),
Expand All @@ -775,6 +793,7 @@ async function sendJobConclusionSpan(spanName, options = {}) {
resourceAttributes,
statusCode,
statusMessage,
events: spanEvents,
});

// Always mirror to JSONL — the artifact is useful even without a live collector.
Expand Down
201 changes: 201 additions & 0 deletions actions/setup/js/send_otlp_span.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,80 @@ describe("buildOTLPPayload", () => {
const span = payload.resourceSpans[0].scopeSpans[0].spans[0];
expect(span.kind).toBe(SPAN_KIND_SERVER);
});

it("includes events array in span when events are provided", () => {
  // A single OTel exception event supplied by the caller must appear
  // verbatim on the emitted span.
  const exceptionEvents = [
    {
      timeUnixNano: toNanoString(1000),
      name: "exception",
      attributes: [buildAttr("exception.message", "something failed")],
    },
  ];
  const built = buildOTLPPayload({
    traceId: "a".repeat(32),
    spanId: "b".repeat(16),
    spanName: "test",
    startMs: 0,
    endMs: 1,
    serviceName: "gh-aw",
    attributes: [],
    events: exceptionEvents,
  });
  const emittedSpan = built.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toHaveLength(1);
  expect(emittedSpan.events[0].name).toBe("exception");
  expect(emittedSpan.events[0].attributes).toContainEqual({ key: "exception.message", value: { stringValue: "something failed" } });
});

it("includes multiple events when provided", () => {
  // Two exception events, one per error message, built from a shared shape.
  const suppliedEvents = ["error A", "error B"].map(text => ({
    timeUnixNano: toNanoString(1000),
    name: "exception",
    attributes: [buildAttr("exception.message", text)],
  }));
  const built = buildOTLPPayload({
    traceId: "a".repeat(32),
    spanId: "b".repeat(16),
    spanName: "test",
    startMs: 0,
    endMs: 1,
    serviceName: "gh-aw",
    attributes: [],
    events: suppliedEvents,
  });
  const emittedSpan = built.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toHaveLength(2);
  expect(emittedSpan.events[0].attributes[0].value.stringValue).toBe("error A");
  expect(emittedSpan.events[1].attributes[0].value.stringValue).toBe("error B");
});

it("omits events from span when events array is empty", () => {
  // An explicit empty array must not produce an `events` key on the span
  // (OTLP consumers should not see an empty events list).
  const built = buildOTLPPayload({
    traceId: "a".repeat(32),
    spanId: "b".repeat(16),
    spanName: "test",
    startMs: 0,
    endMs: 1,
    serviceName: "gh-aw",
    attributes: [],
    events: [],
  });
  const emittedSpan = built.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toBeUndefined();
});

it("omits events from span when events is not provided", () => {
  // Leaving `events` out of the options entirely must also leave the span
  // without an `events` key.
  const built = buildOTLPPayload({
    traceId: "a".repeat(32),
    spanId: "b".repeat(16),
    spanName: "test",
    startMs: 0,
    endMs: 1,
    serviceName: "gh-aw",
    attributes: [],
  });
  const emittedSpan = built.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toBeUndefined();
});
});

// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -1945,6 +2019,133 @@ describe("sendJobConclusionSpan", () => {
expect(keys).not.toContain("gh-aw.error.messages");
expect(span.status.message).toBe("agent failure");
});

it("emits one exception span event per error on agent failure", async () => {
  // A failed agent run whose output file lists two errors should surface
  // each error as its own "exception" span event.
  const fetchMock = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
  vi.stubGlobal("fetch", fetchMock);

  process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com";
  process.env.GH_AW_AGENT_CONCLUSION = "failure";

  readFileSpy.mockImplementation(requestedPath => {
    if (requestedPath === "/tmp/gh-aw/agent_output.json") {
      return JSON.stringify({ errors: [{ message: "Rate limit exceeded" }, { message: "Tool call failed" }] });
    }
    throw Object.assign(new Error("ENOENT"), { code: "ENOENT" });
  });

  await sendJobConclusionSpan("gh-aw.job.conclusion");

  const postedBody = JSON.parse(fetchMock.mock.calls[0][1].body);
  const emittedSpan = postedBody.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toHaveLength(2);
  ["Rate limit exceeded", "Tool call failed"].forEach((text, idx) => {
    expect(emittedSpan.events[idx].name).toBe("exception");
    expect(emittedSpan.events[idx].attributes).toContainEqual({ key: "exception.message", value: { stringValue: text } });
  });
});

it("truncates exception.message to 1024 characters", async () => {
  // exception.message is capped (presumably at MAX_ATTR_VALUE_LENGTH = 1024
  // — the test pins the numeric value) even when the source error is longer.
  const fetchMock = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
  vi.stubGlobal("fetch", fetchMock);

  process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com";
  process.env.GH_AW_AGENT_CONCLUSION = "failure";

  const longMessage = "x".repeat(2000);
  readFileSpy.mockImplementation(requestedPath => {
    if (requestedPath === "/tmp/gh-aw/agent_output.json") {
      return JSON.stringify({ errors: [{ message: longMessage }] });
    }
    throw Object.assign(new Error("ENOENT"), { code: "ENOENT" });
  });

  await sendJobConclusionSpan("gh-aw.job.conclusion");

  const postedBody = JSON.parse(fetchMock.mock.calls[0][1].body);
  const emittedSpan = postedBody.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toHaveLength(1);
  const messageAttr = emittedSpan.events[0].attributes.find(attr => attr.key === "exception.message");
  expect(messageAttr.value.stringValue.length).toBe(1024);
});

it("does not emit exception events when agent conclusion is success", async () => {
  // Successful runs carry no exception events at all.
  const fetchMock = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
  vi.stubGlobal("fetch", fetchMock);

  process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com";
  process.env.GH_AW_AGENT_CONCLUSION = "success";

  await sendJobConclusionSpan("gh-aw.job.conclusion");

  const postedBody = JSON.parse(fetchMock.mock.calls[0][1].body);
  const emittedSpan = postedBody.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toBeUndefined();
});

it("does not emit exception events when agent_output.json is absent on failure", async () => {
  // Even on failure, a missing agent output file means there are no error
  // records to translate into events.
  const fetchMock = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
  vi.stubGlobal("fetch", fetchMock);

  process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com";
  process.env.GH_AW_AGENT_CONCLUSION = "failure";

  // No mockImplementation here: readFileSpy rejects every path with ENOENT,
  // as configured in beforeEach.

  await sendJobConclusionSpan("gh-aw.job.conclusion");

  const postedBody = JSON.parse(fetchMock.mock.calls[0][1].body);
  const emittedSpan = postedBody.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toBeUndefined();
});

it("emits exception events for all errors (not capped at 5 like error messages attribute)", async () => {
  // Seven errors -> seven events; no truncation of the event list.
  const fetchMock = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
  vi.stubGlobal("fetch", fetchMock);

  process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com";
  process.env.GH_AW_AGENT_CONCLUSION = "failure";

  const manyErrors = Array.from({ length: 7 }, (_, idx) => ({ message: `Error ${idx + 1}` }));
  readFileSpy.mockImplementation(requestedPath => {
    if (requestedPath === "/tmp/gh-aw/agent_output.json") {
      return JSON.stringify({ errors: manyErrors });
    }
    throw Object.assign(new Error("ENOENT"), { code: "ENOENT" });
  });

  await sendJobConclusionSpan("gh-aw.job.conclusion");

  const postedBody = JSON.parse(fetchMock.mock.calls[0][1].body);
  const emittedSpan = postedBody.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toHaveLength(7);
  emittedSpan.events.forEach((event, idx) => {
    expect(event.name).toBe("exception");
    expect(event.attributes).toContainEqual({ key: "exception.message", value: { stringValue: `Error ${idx + 1}` } });
  });
});

it("sets valid timeUnixNano on each exception event", async () => {
  // The event timestamp must be a decimal nanosecond string, per the OTLP
  // JSON mapping for uint64 fields.
  const fetchMock = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
  vi.stubGlobal("fetch", fetchMock);

  process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com";
  process.env.GH_AW_AGENT_CONCLUSION = "failure";

  readFileSpy.mockImplementation(requestedPath => {
    if (requestedPath === "/tmp/gh-aw/agent_output.json") {
      return JSON.stringify({ errors: [{ message: "test error" }] });
    }
    throw Object.assign(new Error("ENOENT"), { code: "ENOENT" });
  });

  await sendJobConclusionSpan("gh-aw.job.conclusion");

  const postedBody = JSON.parse(fetchMock.mock.calls[0][1].body);
  const emittedSpan = postedBody.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toHaveLength(1);
  expect(emittedSpan.events[0].timeUnixNano).toMatch(/^\d+$/);
});
});

describe("rate-limit enrichment in conclusion span", () => {
Expand Down
8 changes: 5 additions & 3 deletions pkg/workflow/maintenance_workflow.go
Original file line number Diff line number Diff line change
Expand Up @@ -525,10 +525,12 @@ jobs:
`)

// Add validate_workflows job for workflow_dispatch with operation == 'validate'
// This job uses ubuntu-latest by default (needs full runner for CLI installation).
validateRunsOnValue := FormatRunsOn(configuredRunsOn, "ubuntu-latest")
yaml.WriteString(`
validate_workflows:
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.operation == 'validate' && !github.event.repository.fork }}
runs-on: ` + runsOnValue + `
runs-on: ` + validateRunsOnValue + `
permissions:
contents: read
issues: write
Expand Down Expand Up @@ -737,7 +739,7 @@ func buildNotDispatchOrEmptyOperation() ConditionNode {

// buildNotForkAndScheduledOrOperation creates a condition for jobs that run on
// schedule (or empty operation) AND when a specific operation is selected.
// Condition: !fork && (not_dispatch || operation == '' || operation == op)
// Condition: !fork && (not_dispatch || operation == '' || operation == op)
func buildNotForkAndScheduledOrOperation(operation string) ConditionNode {
Comment on lines 740 to 743
Copy link

Copilot AI Apr 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The condition examples in this comment use a smart quote character (” ) where an empty string literal is intended. This makes the documentation misleading and can also cause copy/paste errors. Replace ” with "" (empty string) to match the actual condition logic used below.

This issue also appears on line 754 of the same file.

Copilot uses AI. Check for mistakes.
return BuildAnd(
buildNotForkCondition(),
Expand All @@ -753,7 +755,7 @@ func buildNotForkAndScheduledOrOperation(operation string) ConditionNode {

// buildRunOperationCondition creates the condition for the unified run_operation
// job that handles all dispatch operations except the ones with dedicated jobs.
// Condition: dispatch && operation != '' && operation != each excluded && !fork.
// Condition: dispatch && operation != '' && operation != each excluded && !fork.
func buildRunOperationCondition(excludedOperations ...string) ConditionNode {
// Start with: event is workflow_dispatch AND operation is not empty
condition := BuildAnd(
Expand Down