Skip to content

Commit d33e99c

Browse files
authored
update all-green script to rerun failed workflows once (#7847)
Update the all-green script to rerun failed workflows once. A workflow is only rerun if it has 3 or fewer failing jobs; otherwise its failures are considered actual failures and not flakiness. Right now, whenever a workflow fails, the All Green workflow also fails. If the failure was caused by flakiness, rerunning the workflow does not rerun All Green, so All Green also has to be rerun manually. I was trying to think of a way to address this, but then realized that even if All Green were automatically rerun when the failed workflow is rerun, that workflow would still need to be rerun manually. By having All Green handle the rerun automatically, both issues are fixed, and as a bonus the Retry workflow can also be removed, as it was noisy for the main branch. Also reordered the summary table by conclusion severity.
1 parent d07ba70 commit d33e99c

3 files changed

Lines changed: 112 additions & 71 deletions

File tree

.github/workflows/all-green.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ jobs:
1515
all-green:
1616
runs-on: ubuntu-latest
1717
permissions:
18+
actions: write
1819
checks: read
1920
contents: read
2021
steps:

.github/workflows/retry.yml

Lines changed: 0 additions & 36 deletions
This file was deleted.

scripts/all-green.mjs

Lines changed: 111 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ const {
1313
RETRIES,
1414
} = process.env
1515

16+
const maxRerunFailedJobs = 3
17+
1618
const octokit = new Octokit({ auth: GITHUB_TOKEN })
1719
const owner = 'DataDog'
1820
const repo = 'dd-trace-js'
1921
const ref = context.payload.pull_request?.head.sha || GITHUB_SHA
2022
const params = { owner, repo, ref }
21-
const checkConclusionEmojis = {
23+
const conclusionEmojis = {
2224
action_required: '🔶',
2325
cancelled: '🚫',
2426
failure: '❌',
@@ -29,7 +31,19 @@ const checkConclusionEmojis = {
2931
timed_out: '⌛',
3032
}
3133

34+
const conclusionSeverity = {
35+
failure: 0,
36+
timed_out: 1,
37+
action_required: 2,
38+
cancelled: 3,
39+
stale: 4,
40+
neutral: 5,
41+
skipped: 6,
42+
success: 7,
43+
}
44+
3245
let retries = 0
46+
let hasRerun = false
3347

3448
async function hasCompleted () {
3549
const { data: inProgressRuns } = await octokit.rest.checks.listForRef({
@@ -69,55 +83,117 @@ async function checkCompleted () {
6983
}
7084
}
7185

86+
async function getLatestRuns () {
87+
const checkRuns = await octokit.paginate(
88+
'GET /repos/:owner/:repo/commits/:ref/check-runs',
89+
{
90+
...params,
91+
per_page: 100,
92+
}
93+
)
94+
95+
// When a check is re-run, older runs remain with their original conclusions.
96+
// Deduplicate by name and evaluate only the latest run for each check.
97+
const latestByName = new Map()
98+
for (const run of checkRuns) {
99+
const existing = latestByName.get(run.name)
100+
if (!existing || new Date(run.started_at) >= new Date(existing.started_at)) {
101+
latestByName.set(run.name, run)
102+
}
103+
}
104+
105+
return [...latestByName.values()]
106+
}
107+
108+
async function rerunFailedWorkflows (failedRuns) {
109+
const failedCountByCheckSuiteId = new Map()
110+
for (const run of failedRuns) {
111+
const id = run.check_suite?.id
112+
if (id !== undefined) {
113+
failedCountByCheckSuiteId.set(id, (failedCountByCheckSuiteId.get(id) ?? 0) + 1)
114+
}
115+
}
116+
117+
const eligibleSuiteIds = [...failedCountByCheckSuiteId.entries()]
118+
.filter(([, count]) => count <= maxRerunFailedJobs)
119+
.map(([id]) => id)
120+
121+
// If a workflow has many failed jobs, it's unlikely to be flakiness, so there's no
122+
// point in re-running.
123+
if (eligibleSuiteIds.length < failedCountByCheckSuiteId.size) {
124+
console.log(
125+
`Skipping rerun for ${failedCountByCheckSuiteId.size - eligibleSuiteIds.length} workflow(s) ` +
126+
`with more than ${maxRerunFailedJobs} failed job(s).`
127+
)
128+
}
129+
130+
const workflowRunsPerSuite = await Promise.all(
131+
eligibleSuiteIds.map(checkSuiteId =>
132+
octokit.rest.actions.listWorkflowRunsForRepo({ owner, repo, check_suite_id: checkSuiteId })
133+
.then(({ data }) => data.workflow_runs)
134+
)
135+
)
136+
137+
const workflowRuns = workflowRunsPerSuite.flat()
138+
139+
await Promise.all(
140+
workflowRuns.map(workflowRun => {
141+
console.log(`Rerunning failed jobs for workflow run ${workflowRun.id} (${workflowRun.name}).`)
142+
return octokit.rest.actions.reRunWorkflowFailedJobs({ owner, repo, run_id: workflowRun.id })
143+
})
144+
)
145+
146+
return workflowRuns.length > 0
147+
}
148+
72149
async function checkAllGreen () {
73150
let latestRuns
74151

75152
try {
76153
await checkCompleted()
77154
} finally {
78-
const checkRuns = await octokit.paginate(
79-
'GET /repos/:owner/:repo/commits/:ref/check-runs',
80-
{
81-
...params,
82-
per_page: 100,
83-
}
84-
)
85-
86-
// When a check is re-run, older runs remain with their original conclusions.
87-
// Deduplicate by name and evaluate only the latest run for each check.
88-
const latestByName = new Map()
89-
for (const run of checkRuns) {
90-
const existing = latestByName.get(run.name)
91-
if (!existing || new Date(run.started_at) >= new Date(existing.started_at)) {
92-
latestByName.set(run.name, run)
93-
}
94-
}
95-
latestRuns = [...latestByName.values()]
96-
97-
await printSummary(latestRuns)
155+
latestRuns = await getLatestRuns()
98156
}
99157

100-
const allGreen = !latestRuns.some(run => (
158+
const failedRuns = latestRuns.filter(run =>
101159
run.conclusion === 'failure' || run.conclusion === 'timed_out'
102-
))
160+
)
103161

104-
if (allGreen) {
162+
if (failedRuns.length === 0) {
163+
await printSummary(latestRuns)
105164
console.log('All jobs were successful.')
106-
} else {
107-
throw new Error('One or more jobs failed.')
165+
return
108166
}
167+
168+
if (!hasRerun) {
169+
hasRerun = true
170+
console.log(`${failedRuns.length} job(s) failed. Rerunning failed workflows...`)
171+
const didRerun = await rerunFailedWorkflows(failedRuns)
172+
if (didRerun) {
173+
retries = 0
174+
console.log(`Waiting for ${POLLING_INTERVAL} minutes before polling for rerun results.`)
175+
await setTimeout(POLLING_INTERVAL * 60_000)
176+
await checkAllGreen()
177+
return
178+
}
179+
}
180+
181+
await printSummary(latestRuns)
182+
throw new Error('One or more jobs failed.')
109183
}
110184

111185
async function printSummary (checkRuns) {
112-
const runs = checkRuns.map(run => ({
113-
name: run.name,
114-
status: run.status,
115-
conclusion: run.conclusion
116-
? `${run.conclusion} ${checkConclusionEmojis[run.conclusion]}`
117-
: ' ',
118-
started_at: run.started_at,
119-
completed_at: run.completed_at ?? ' ',
120-
}))
186+
const runs = [...checkRuns]
187+
.sort((a, b) => (conclusionSeverity[a.conclusion] ?? 8) - (conclusionSeverity[b.conclusion] ?? 8))
188+
.map(run => ({
189+
name: run.name,
190+
status: run.status,
191+
conclusion: run.conclusion
192+
? `${run.conclusion} ${conclusionEmojis[run.conclusion]}`
193+
: ' ',
194+
started_at: run.started_at,
195+
completed_at: run.completed_at ?? ' ',
196+
}))
121197

122198
console.table(runs)
123199

0 commit comments

Comments
 (0)