Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .github/workflows/agentics-maintenance.yml
Original file line number Diff line number Diff line change
Expand Up @@ -315,11 +315,6 @@ jobs:
- name: Build gh-aw
run: make build

Copy link

Copilot AI Apr 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The validate_workflows job runs gh aw compile --validate --zizmor --actionlint --poutine (via run_validate_workflows.cjs), and those scanners are executed via Docker (see e.g. pkg/cli/zizmor.go, pkg/cli/actionlint.go, pkg/cli/poutine.go). This PR removes the step that ensures the Docker daemon is running, which can cause validation to fail on runners where Docker isn’t started by default. Consider restoring the Docker start/check (e.g. docker info with a fallback to start the service) before running validation.

Suggested change
- name: Ensure Docker is running
run: |
docker info >/dev/null 2>&1 || {
sudo systemctl start docker || sudo service docker start
}
docker info

Copilot uses AI. Check for mistakes.
- name: Start Docker daemon
run: |
sudo systemctl start docker
docker info

- name: Validate workflows and file issue on findings
uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9
env:
Expand Down
21 changes: 20 additions & 1 deletion actions/setup/js/send_otlp_span.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ const SPAN_KIND_CONSUMER = 5;
* @property {number} [statusCode] - OTLP status code: 0=UNSET, 1=OK, 2=ERROR (defaults to 1)
* @property {string} [statusMessage] - Human-readable status message (included when statusCode is 2)
* @property {number} [kind] - OTLP SpanKind: use SPAN_KIND_* constants. Defaults to SPAN_KIND_INTERNAL (1).
* @property {Array<{timeUnixNano: string, name: string, attributes: Array<{key: string, value: object}>}>} [events] - Span events following the OTel events spec (e.g. exception events).
*/

/**
Expand All @@ -112,7 +113,7 @@ const SPAN_KIND_CONSUMER = 5;
* @param {OTLPSpanOptions} opts
* @returns {object} - Ready to be serialised as JSON and POSTed to `/v1/traces`
*/
function buildOTLPPayload({ traceId, spanId, parentSpanId, spanName, startMs, endMs, serviceName, scopeVersion, attributes, resourceAttributes, statusCode, statusMessage, kind = SPAN_KIND_INTERNAL }) {
function buildOTLPPayload({ traceId, spanId, parentSpanId, spanName, startMs, endMs, serviceName, scopeVersion, attributes, resourceAttributes, statusCode, statusMessage, kind = SPAN_KIND_INTERNAL, events }) {
const code = typeof statusCode === "number" ? statusCode : 1; // STATUS_CODE_OK
/** @type {{ code: number, message?: string }} */
const status = { code };
Expand Down Expand Up @@ -144,6 +145,7 @@ function buildOTLPPayload({ traceId, spanId, parentSpanId, spanName, startMs, en
endTimeUnixNano: toNanoString(endMs),
status,
attributes,
...(events && events.length > 0 ? { events } : {}),
},
],
},
Expand Down Expand Up @@ -762,6 +764,22 @@ async function sendJobConclusionSpan(spanName, options = {}) {
}
resourceAttributes.push(buildAttr("deployment.environment", staged ? "staging" : "production"));

// Build OTel exception span events — one per error — following the
// OpenTelemetry semantic convention for exceptions. Each event has
// name="exception" and an "exception.message" attribute, making individual
// errors queryable in backends like Grafana Tempo, Honeycomb, and Datadog.
const errorTimeNano = toNanoString(nowMs());
const spanEvents = isAgentFailure
? outputErrors
.map(e => (e && typeof e.message === "string" ? e.message : String(e)))
.filter(Boolean)
.map(msg => ({
timeUnixNano: errorTimeNano,
name: "exception",
attributes: [buildAttr("exception.message", msg.slice(0, MAX_ATTR_VALUE_LENGTH))],
}))
: [];

const payload = buildOTLPPayload({
traceId,
spanId: generateSpanId(),
Expand All @@ -775,6 +793,7 @@ async function sendJobConclusionSpan(spanName, options = {}) {
resourceAttributes,
statusCode,
statusMessage,
events: spanEvents,
});

// Always mirror to JSONL — the artifact is useful even without a live collector.
Expand Down
201 changes: 201 additions & 0 deletions actions/setup/js/send_otlp_span.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,80 @@ describe("buildOTLPPayload", () => {
const span = payload.resourceSpans[0].scopeSpans[0].spans[0];
expect(span.kind).toBe(SPAN_KIND_SERVER);
});

it("includes events array in span when events are provided", () => {
  // A single OTel exception event supplied by the caller must appear
  // verbatim on the emitted span.
  const exceptionEvents = [
    {
      timeUnixNano: toNanoString(1000),
      name: "exception",
      attributes: [buildAttr("exception.message", "something failed")],
    },
  ];
  const built = buildOTLPPayload({
    traceId: "a".repeat(32),
    spanId: "b".repeat(16),
    spanName: "test",
    startMs: 0,
    endMs: 1,
    serviceName: "gh-aw",
    attributes: [],
    events: exceptionEvents,
  });
  const emittedSpan = built.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toHaveLength(1);
  expect(emittedSpan.events[0].name).toBe("exception");
  expect(emittedSpan.events[0].attributes).toContainEqual({ key: "exception.message", value: { stringValue: "something failed" } });
});

it("includes multiple events when provided", () => {
  // Two exception events, one per error message, built from a shared shape.
  const suppliedEvents = ["error A", "error B"].map(text => ({
    timeUnixNano: toNanoString(1000),
    name: "exception",
    attributes: [buildAttr("exception.message", text)],
  }));
  const built = buildOTLPPayload({
    traceId: "a".repeat(32),
    spanId: "b".repeat(16),
    spanName: "test",
    startMs: 0,
    endMs: 1,
    serviceName: "gh-aw",
    attributes: [],
    events: suppliedEvents,
  });
  const emittedSpan = built.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toHaveLength(2);
  expect(emittedSpan.events[0].attributes[0].value.stringValue).toBe("error A");
  expect(emittedSpan.events[1].attributes[0].value.stringValue).toBe("error B");
});

it("omits events from span when events array is empty", () => {
  // An explicit empty array must not produce an `events` key on the span
  // (OTLP consumers should not see an empty events list).
  const built = buildOTLPPayload({
    traceId: "a".repeat(32),
    spanId: "b".repeat(16),
    spanName: "test",
    startMs: 0,
    endMs: 1,
    serviceName: "gh-aw",
    attributes: [],
    events: [],
  });
  const emittedSpan = built.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toBeUndefined();
});

it("omits events from span when events is not provided", () => {
  // Leaving `events` out of the options entirely must also leave the span
  // without an `events` key.
  const built = buildOTLPPayload({
    traceId: "a".repeat(32),
    spanId: "b".repeat(16),
    spanName: "test",
    startMs: 0,
    endMs: 1,
    serviceName: "gh-aw",
    attributes: [],
  });
  const emittedSpan = built.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toBeUndefined();
});
});

// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -1945,6 +2019,133 @@ describe("sendJobConclusionSpan", () => {
expect(keys).not.toContain("gh-aw.error.messages");
expect(span.status.message).toBe("agent failure");
});

it("emits one exception span event per error on agent failure", async () => {
  // A failed agent run whose output file lists two errors should surface
  // each error as its own "exception" span event.
  const fetchMock = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
  vi.stubGlobal("fetch", fetchMock);

  process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com";
  process.env.GH_AW_AGENT_CONCLUSION = "failure";

  readFileSpy.mockImplementation(requestedPath => {
    if (requestedPath === "/tmp/gh-aw/agent_output.json") {
      return JSON.stringify({ errors: [{ message: "Rate limit exceeded" }, { message: "Tool call failed" }] });
    }
    throw Object.assign(new Error("ENOENT"), { code: "ENOENT" });
  });

  await sendJobConclusionSpan("gh-aw.job.conclusion");

  const postedBody = JSON.parse(fetchMock.mock.calls[0][1].body);
  const emittedSpan = postedBody.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toHaveLength(2);
  ["Rate limit exceeded", "Tool call failed"].forEach((text, idx) => {
    expect(emittedSpan.events[idx].name).toBe("exception");
    expect(emittedSpan.events[idx].attributes).toContainEqual({ key: "exception.message", value: { stringValue: text } });
  });
});

it("truncates exception.message to 1024 characters", async () => {
  // exception.message is capped (presumably at MAX_ATTR_VALUE_LENGTH = 1024
  // — the test pins the numeric value) even when the source error is longer.
  const fetchMock = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
  vi.stubGlobal("fetch", fetchMock);

  process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com";
  process.env.GH_AW_AGENT_CONCLUSION = "failure";

  const longMessage = "x".repeat(2000);
  readFileSpy.mockImplementation(requestedPath => {
    if (requestedPath === "/tmp/gh-aw/agent_output.json") {
      return JSON.stringify({ errors: [{ message: longMessage }] });
    }
    throw Object.assign(new Error("ENOENT"), { code: "ENOENT" });
  });

  await sendJobConclusionSpan("gh-aw.job.conclusion");

  const postedBody = JSON.parse(fetchMock.mock.calls[0][1].body);
  const emittedSpan = postedBody.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toHaveLength(1);
  const messageAttr = emittedSpan.events[0].attributes.find(attr => attr.key === "exception.message");
  expect(messageAttr.value.stringValue.length).toBe(1024);
});

it("does not emit exception events when agent conclusion is success", async () => {
  // Successful runs carry no exception events at all.
  const fetchMock = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
  vi.stubGlobal("fetch", fetchMock);

  process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com";
  process.env.GH_AW_AGENT_CONCLUSION = "success";

  await sendJobConclusionSpan("gh-aw.job.conclusion");

  const postedBody = JSON.parse(fetchMock.mock.calls[0][1].body);
  const emittedSpan = postedBody.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toBeUndefined();
});

it("does not emit exception events when agent_output.json is absent on failure", async () => {
  // Even on failure, a missing agent output file means there are no error
  // records to translate into events.
  const fetchMock = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
  vi.stubGlobal("fetch", fetchMock);

  process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com";
  process.env.GH_AW_AGENT_CONCLUSION = "failure";

  // No mockImplementation here: readFileSpy rejects every path with ENOENT,
  // as configured in beforeEach.

  await sendJobConclusionSpan("gh-aw.job.conclusion");

  const postedBody = JSON.parse(fetchMock.mock.calls[0][1].body);
  const emittedSpan = postedBody.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toBeUndefined();
});

it("emits exception events for all errors (not capped at 5 like error messages attribute)", async () => {
  // Seven errors -> seven events; no truncation of the event list.
  const fetchMock = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
  vi.stubGlobal("fetch", fetchMock);

  process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com";
  process.env.GH_AW_AGENT_CONCLUSION = "failure";

  const manyErrors = Array.from({ length: 7 }, (_, idx) => ({ message: `Error ${idx + 1}` }));
  readFileSpy.mockImplementation(requestedPath => {
    if (requestedPath === "/tmp/gh-aw/agent_output.json") {
      return JSON.stringify({ errors: manyErrors });
    }
    throw Object.assign(new Error("ENOENT"), { code: "ENOENT" });
  });

  await sendJobConclusionSpan("gh-aw.job.conclusion");

  const postedBody = JSON.parse(fetchMock.mock.calls[0][1].body);
  const emittedSpan = postedBody.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toHaveLength(7);
  emittedSpan.events.forEach((event, idx) => {
    expect(event.name).toBe("exception");
    expect(event.attributes).toContainEqual({ key: "exception.message", value: { stringValue: `Error ${idx + 1}` } });
  });
});

it("sets valid timeUnixNano on each exception event", async () => {
  // The event timestamp must be a decimal nanosecond string, per the OTLP
  // JSON mapping for uint64 fields.
  const fetchMock = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
  vi.stubGlobal("fetch", fetchMock);

  process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://traces.example.com";
  process.env.GH_AW_AGENT_CONCLUSION = "failure";

  readFileSpy.mockImplementation(requestedPath => {
    if (requestedPath === "/tmp/gh-aw/agent_output.json") {
      return JSON.stringify({ errors: [{ message: "test error" }] });
    }
    throw Object.assign(new Error("ENOENT"), { code: "ENOENT" });
  });

  await sendJobConclusionSpan("gh-aw.job.conclusion");

  const postedBody = JSON.parse(fetchMock.mock.calls[0][1].body);
  const emittedSpan = postedBody.resourceSpans[0].scopeSpans[0].spans[0];
  expect(emittedSpan.events).toHaveLength(1);
  expect(emittedSpan.events[0].timeUnixNano).toMatch(/^\d+$/);
});
});

describe("rate-limit enrichment in conclusion span", () => {
Expand Down
8 changes: 5 additions & 3 deletions pkg/workflow/maintenance_workflow.go
Original file line number Diff line number Diff line change
Expand Up @@ -525,10 +525,12 @@ jobs:
`)

// Add validate_workflows job for workflow_dispatch with operation == 'validate'
// This job uses ubuntu-latest by default (needs full runner for CLI installation).
validateRunsOnValue := FormatRunsOn(configuredRunsOn, "ubuntu-latest")
yaml.WriteString(`
validate_workflows:
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.operation == 'validate' && !github.event.repository.fork }}
runs-on: ` + runsOnValue + `
runs-on: ` + validateRunsOnValue + `
permissions:
contents: read
issues: write
Expand Down Expand Up @@ -737,7 +739,7 @@ func buildNotDispatchOrEmptyOperation() ConditionNode {

// buildNotForkAndScheduledOrOperation creates a condition for jobs that run on
// schedule (or empty operation) AND when a specific operation is selected.
// Condition: !fork && (not_dispatch || operation == '' || operation == op)
// Condition: !fork && (not_dispatch || operation == '' || operation == op)
func buildNotForkAndScheduledOrOperation(operation string) ConditionNode {
Comment on lines 740 to 743
Copy link

Copilot AI Apr 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The condition examples in this comment use a smart quote character (” ) where an empty string literal is intended. This makes the documentation misleading and can also cause copy/paste errors. Replace ” with "" (empty string) to match the actual condition logic used below.

This issue also appears on line 754 of the same file.

Copilot uses AI. Check for mistakes.
return BuildAnd(
buildNotForkCondition(),
Expand All @@ -753,7 +755,7 @@ func buildNotForkAndScheduledOrOperation(operation string) ConditionNode {

// buildRunOperationCondition creates the condition for the unified run_operation
// job that handles all dispatch operations except the ones with dedicated jobs.
// Condition: dispatch && operation != '' && operation != each excluded && !fork.
// Condition: dispatch && operation != '' && operation != each excluded && !fork.
func buildRunOperationCondition(excludedOperations ...string) ConditionNode {
// Start with: event is workflow_dispatch AND operation is not empty
condition := BuildAnd(
Expand Down