diff --git a/.github/workflows/pr-rescue.yml b/.github/workflows/pr-rescue.yml index 537ecce..081cfb6 100644 --- a/.github/workflows/pr-rescue.yml +++ b/.github/workflows/pr-rescue.yml @@ -1,12 +1,16 @@ name: PR Rescue -# Runs when main advances (PR merged). Rebases open agent PRs that are -# behind main so auto-merge can proceed. With dismiss_stale_reviews -# disabled, the existing quality-gate approval survives the rebase. +# Unsticks agent PRs at any pipeline stage: +# - No Copilot review → requests one (rebases first if behind main) +# - Unresolved threads → resolves if addressed (bot reply is last comment) +# - Behind main → rebases (removes review-response-attempted label for fresh responder run) +# - Merge conflicts → labels aw-conflict, skips in future runs on: push: branches: [main] + schedule: + - cron: '*/15 * * * *' workflow_dispatch: permissions: @@ -33,74 +37,192 @@ jobs: git config user.name "github-actions[bot]" git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - - name: Find and rebase stuck agent PRs + - name: Rescue stuck agent PRs env: GH_TOKEN: ${{ secrets.GH_AW_WRITE_TOKEN }} run: | set -euo pipefail + # Resolve the PAT owner login for thread-resolution matching + token_owner=$(gh api user --jq '.login' 2>/dev/null) || token_owner="" + + # ── Discover agent PRs ────────────────────────────────────── + # Ensure aw-conflict label exists + gh label create aw-conflict --color D93F0B --description "PR has merge conflicts — rescue skips" 2>/dev/null || true + echo "::group::Finding open agent PRs with auto-merge enabled" - prs=$(gh pr list --state open --label aw --json number,headRefName,autoMergeRequest \ - --jq '.[] | select(.autoMergeRequest != null) | "\(.number) \(.headRefName)"') + prs=$(gh pr list --state open --label aw --json number,headRefName,autoMergeRequest,reviewDecision,labels \ + --jq '[.[] | select(.autoMergeRequest != null) | select([.labels[].name] | index("aw-conflict") | not)]') - if [ -z "$prs" ]; then - echo "No agent PRs with auto-merge enabled. Nothing to rescue." + count=$(echo "$prs" | jq 'length') + if [ "$count" = "0" ] || [ -z "$prs" ]; then + echo "No rescuable agent PRs found. Exiting." exit 0 fi - echo "Found PRs to check:" - echo "$prs" + + # Sort: approved PRs first (closest to merging) + sorted=$(echo "$prs" | jq -r 'sort_by(if .reviewDecision == "APPROVED" then 0 else 1 end) | .[] | "\(.number) \(.headRefName)"') + echo "Found $count PR(s) to check (sorted by progress):" + echo "$sorted" echo "::endgroup::" rescued=0 while IFS=' ' read -r pr_number branch; do echo "::group::Processing PR #${pr_number} (branch: ${branch})" - # Check if PR is specifically behind main (not blocked for other reasons) - merge_state=$(gh pr view "$pr_number" --json mergeStateStatus --jq '.mergeStateStatus') - echo "Merge state: ${merge_state}" + # ── Gather PR state ───────────────────────────────────── + merge_state=$(gh pr view "$pr_number" --json mergeStateStatus --jq '.mergeStateStatus' 2>/dev/null) || { echo "::warning::Failed to query PR #${pr_number}. Skipping."; echo "::endgroup::"; continue; } + review_decision=$(gh pr view "$pr_number" --json reviewDecision --jq '.reviewDecision' 2>/dev/null) || { echo "::warning::Failed to query PR #${pr_number}. Skipping."; echo "::endgroup::"; continue; } + has_copilot_review=$(gh pr view "$pr_number" --json reviews \ + --jq '[.reviews[] | select(.author.login == "copilot-pull-request-reviewer")] | length' 2>/dev/null) || has_copilot_review=0 - if [ "$merge_state" != "BEHIND" ]; then - echo "PR #${pr_number} is not behind main (state: ${merge_state}). Skipping." - echo "::endgroup::" - continue - fi + echo "State: merge=${merge_state} review=${review_decision} copilot_reviews=${has_copilot_review}" - # Check if PR has an approving review (no point rebasing if not approved yet) - review_decision=$(gh pr view "$pr_number" --json reviewDecision --jq '.reviewDecision') - if [ "$review_decision" != "APPROVED" ]; then - echo "PR #${pr_number} has no approval yet (decision: ${review_decision}). Skipping." + # ── Helper: rebase onto main ──────────────────────────── + do_rebase() { + if ! git fetch origin -- "$branch" 2>&1; then + echo "::warning::Failed to fetch branch for PR #${pr_number}. Skipping." + return 1 + fi + if ! git checkout -B "$branch" "origin/$branch" 2>&1; then + echo "::warning::Failed to checkout branch for PR #${pr_number}. Skipping." + return 1 + fi + if git rebase origin/main; then + # Remove label BEFORE push so responder can run on new comments + gh pr edit "$pr_number" --remove-label review-response-attempted 2>/dev/null || true + if git push --force-with-lease origin -- "$branch" 2>&1; then + echo "✅ PR #${pr_number} rebased successfully." + git checkout main + return 0 + else + echo "::warning::Push failed for PR #${pr_number}. Skipping." + git checkout main + return 1 + fi + else + echo "::warning::Rebase failed for PR #${pr_number} — merge conflict." + git rebase --abort 2>/dev/null || true + git checkout main + gh pr edit "$pr_number" --add-label aw-conflict 2>/dev/null || true + gh pr comment "$pr_number" --body "PR Rescue: rebase onto main failed due to merge conflicts. Labeled \`aw-conflict\` — will not retry until conflict is resolved manually." 2>/dev/null || true + return 1 + fi + } + + # ── Check 1: No Copilot review ────────────────────────── + if [ "$has_copilot_review" = "0" ]; then + # If also behind main, rebase first so Copilot reviews fresh code + if [ "$merge_state" = "BEHIND" ]; then + echo "PR #${pr_number} has no review and is behind main. Rebasing first..." + if ! do_rebase; then + echo "::endgroup::" + continue + fi + fi + echo "PR #${pr_number} has no Copilot review. Requesting..." + gh pr edit "$pr_number" --add-reviewer @copilot 2>/dev/null || true + echo "✅ Copilot review requested for PR #${pr_number}. Will check again next cycle." + rescued=$((rescued + 1)) echo "::endgroup::" continue fi - echo "PR #${pr_number} is approved but behind main. Rebasing..." - - if ! git fetch origin "$branch" 2>&1; then - echo "::warning::Failed to fetch branch for PR #${pr_number}. Branch may have been deleted. Skipping." + # ── Check 2: Unresolved threads ───────────────────────── + unresolved_threads=$(gh api graphql -f query='query($pr: Int!) { + repository(owner: "'"${GITHUB_REPOSITORY_OWNER}"'", name: "'"${GITHUB_REPOSITORY#*/}"'") { + pullRequest(number: $pr) { + reviewThreads(first: 100) { + totalCount + nodes { + id + isResolved + comments(last: 1) { + nodes { + author { login } + } + } + } + } + } + } + }' -F pr="$pr_number" --jq '.data.repository.pullRequest.reviewThreads.nodes[] | select(.isResolved == false)') || { + echo "::warning::Failed to query review threads for PR #${pr_number}. Skipping." echo "::endgroup::" continue + } + + # Warn if thread count exceeds pagination cap + thread_total=$(gh api graphql -f query='query($pr: Int!) { + repository(owner: "'"${GITHUB_REPOSITORY_OWNER}"'", name: "'"${GITHUB_REPOSITORY#*/}"'") { + pullRequest(number: $pr) { reviewThreads(first: 1) { totalCount } } + } + }' -F pr="$pr_number" --jq '.data.repository.pullRequest.reviewThreads.totalCount' 2>/dev/null) || thread_total=0 + if [ "$thread_total" -gt 100 ] 2>/dev/null; then + echo "::warning::PR #${pr_number} has ${thread_total} threads (cap: 100). Some may be missed." fi - if ! git checkout "$branch" 2>&1; then - echo "::warning::Failed to checkout branch for PR #${pr_number}. Skipping." - echo "::endgroup::" - continue + if [ -n "$unresolved_threads" ]; then + echo "PR #${pr_number} has unresolved threads. Checking if safe to resolve..." + + # Write thread data to temp file to avoid subshell variable scoping + echo "$unresolved_threads" | jq -c '.' > /tmp/rescue_threads.jsonl + while read -r thread; do + thread_id=$(echo "$thread" | jq -r '.id') + last_author=$(echo "$thread" | jq -r '.comments.nodes[0].author.login // "unknown"') + + if [ "$last_author" = "github-actions[bot]" ] || [ "$last_author" = "github-actions" ] || { [ -n "$token_owner" ] && [ "$last_author" = "$token_owner" ]; }; then + echo " Thread ${thread_id}: last comment by responder (${last_author}) — resolving..." + if gh api graphql -f query='mutation($id: ID!) { resolveReviewThread(input: {threadId: $id}) { thread { isResolved } } }' -f id="$thread_id" >/dev/null 2>&1; then + echo " ✅ Resolved ${thread_id}" + else + echo " ::warning::Failed to resolve ${thread_id}" + fi + else + echo " Thread ${thread_id}: last comment by ${last_author} — not safe to resolve. Skipping." + fi + done < /tmp/rescue_threads.jsonl + + # Re-check if any unresolved threads remain + remaining=$(gh api graphql -f query='query($pr: Int!) { + repository(owner: "'"${GITHUB_REPOSITORY_OWNER}"'", name: "'"${GITHUB_REPOSITORY#*/}"'") { + pullRequest(number: $pr) { + reviewThreads(first: 100) { + nodes { isResolved } + } + } + } + }' -F pr="$pr_number" --jq '[.data.repository.pullRequest.reviewThreads.nodes[] | select(.isResolved == false)] | length') || { + echo "::warning::Failed to re-check threads for PR #${pr_number}. Skipping." + echo "::endgroup::" + continue + } + + if [ "$remaining" != "0" ]; then + echo "PR #${pr_number} still has ${remaining} unresolved thread(s). Cannot proceed." + echo "::endgroup::" + continue + fi + echo "All threads resolved for PR #${pr_number}." + rescued=$((rescued + 1)) fi - if git rebase origin/main; then - if git push --force-with-lease origin "$branch" 2>&1; then - echo "✅ PR #${pr_number} rebased. CI will rerun, approval survives, auto-merge will fire." + # ── Check 3: Behind main (approved + threads clear) ───── + # Re-query merge state — may have changed after thread resolution + merge_state=$(gh pr view "$pr_number" --json mergeStateStatus --jq '.mergeStateStatus' 2>/dev/null) || { echo "::warning::Failed to re-query PR #${pr_number}. Skipping."; echo "::endgroup::"; continue; } + review_decision=$(gh pr view "$pr_number" --json reviewDecision --jq '.reviewDecision' 2>/dev/null) || { echo "::warning::Failed to re-query PR #${pr_number}. Skipping."; echo "::endgroup::"; continue; } + if [ "$merge_state" = "BEHIND" ] && [ "$review_decision" = "APPROVED" ]; then + echo "PR #${pr_number} is approved, threads clear, but behind main. Rebasing..." + if do_rebase; then rescued=$((rescued + 1)) - else - echo "::warning::Push failed for PR #${pr_number}. Branch may have been updated. Skipping." fi + elif [ "$merge_state" != "BEHIND" ] && [ "$review_decision" = "APPROVED" ]; then + echo "PR #${pr_number} is approved, up to date, threads clear. Auto-merge should handle it." else - echo "::warning::Rebase failed for PR #${pr_number} — likely has conflicts. Skipping." - git rebase --abort 2>/dev/null || true + echo "PR #${pr_number}: merge=${merge_state} review=${review_decision}. Nothing to do." fi - git checkout main echo "::endgroup::" - done <<< "$prs" + done <<< "$sorted" echo "Rescued ${rescued} PR(s)." diff --git a/docs/agentic-workflows.md b/docs/agentic-workflows.md index 2395045..9d27164 100644 --- a/docs/agentic-workflows.md +++ b/docs/agentic-workflows.md @@ -36,7 +36,7 @@ Audit/Health Agent → creates issue (max 2) → dispatches Implementer → LOW/MEDIUM impact → approves + adds quality-gate-approved label → auto-merge fires → HIGH impact → flags for human review (auto-merge stays blocked) → PR behind main after another PR merges? - → PR Rescue workflow (push to main trigger) → rebases → CI reruns → re-approves → auto-merge fires + → PR Rescue workflow (push to main + cron 15min) → requests review / resolves threads / rebases → auto-merge fires ``` @@ -217,6 +217,7 @@ The `check_membership.cjs` script (in `github/gh-aw`) works as follows: Due to the upstream bug above, the `bots:` field alone is insufficient. The current workaround is `roles: all`, which tells the compiler to skip the permission check entirely (`check_membership.cjs` is not included in the `pre_activation` job): ```yaml +if: "contains(github.event.pull_request.labels.*.name, 'aw')" on: pull_request_review: types: [submitted] @@ -224,7 +225,9 @@ on: bots: [Copilot, copilot-pull-request-reviewer] # keep for when upstream is fixed ``` -This is overly permissive — any actor can trigger the workflow. Track removal via issue #74. +The `roles: all` makes the workflow overly permissive — any actor can trigger it. The `if:` condition compensates by gating on the `aw` label: only agent-created PRs (which carry the `aw` label) will activate the workflow. Without the `if:` gate, the agent still runs and burns tokens before noop'ing on the prompt-level label check. Track `roles: all` removal via issue #74. + +> **`pull_request_review` ref behavior**: GitHub Actions uses the workflow file from the PR's **head branch** for `pull_request_review` events. This means the `if:` condition is active immediately on the PR — no need to merge to main first. Verified empirically on PR #119. > **Actor identity note**: The event **actor** for Copilot reviews is `Copilot` (the GitHub App), NOT `copilot-pull-request-reviewer` (the review author login). `context.actor` returns `Copilot`. Keep both in the bots list for when the upstream bug is fixed. @@ -392,16 +395,19 @@ This is a known limitation for solo repos. Agent PRs don't need this — the qua ### PR Rescue Workflow -When a PR merges to main, other open agent PRs fall behind. With `strict: true` (require branch to be up to date), auto-merge blocks until the branch is rebased. +The **PR Rescue workflow** (`.github/workflows/pr-rescue.yml`) unsticks agent PRs at any pipeline stage. Runs on push to main, every 15 minutes on cron, and manual dispatch. + +PRs are sorted by progress (approved first). For each `aw`-labeled PR with auto-merge enabled (excluding `aw-conflict` labeled): + +**Check 1 — No Copilot review**: If the PR has no review from `copilot-pull-request-reviewer`, rebase first if behind main (so Copilot reviews fresh code), then request a review. Stops processing — next cycle picks up. + +**Check 2 — Unresolved threads**: Queries real thread IDs via GraphQL. For each unresolved thread, checks if the last comment is from `github-actions[bot]` (responder addressed it). If so, resolves it. If not, leaves it (human needs to look). If any remain unresolved, stops processing. + +**Check 3 — Behind main**: If approved + all threads resolved + BEHIND, rebases onto main. Removes `review-response-attempted` label BEFORE pushing so the responder can run again on any new Copilot comments. -The **PR Rescue workflow** (`.github/workflows/pr-rescue.yml`) fires on every push to main: -1. Finds open `aw`-labeled PRs with auto-merge enabled that are behind main and already approved -2. Rebases them onto latest main -3. CI reruns on the rebased commit -4. Since `dismiss_stale_reviews` is disabled, the existing approval survives -5. Auto-merge fires +**Conflict handling**: If rebase fails (merge conflict), adds `aw-conflict` label and posts a comment. Future rescue runs skip `aw-conflict` PRs. -Note: `dismiss_stale_reviews` is set to `false` to support this flow. This is safe for a solo-developer repo. If collaborators are added, re-evaluate this setting. +Note: `dismiss_stale_reviews` is set to `false` so approvals survive rebases. Safe for a solo-developer repo. If collaborators are added, re-evaluate this setting. @@ -425,8 +431,8 @@ All changes go in the `.md` file. Run `gh aw compile` to regenerate. Copilot may ### 5. `dismiss_stale_reviews` only dismisses APPROVED reviews `COMMENTED` reviews are NOT dismissed on new pushes. This means a Copilot `COMMENTED` review from before a rebase will persist. -### 6. `pull_request_review` workflows run from default branch -The workflow definition always comes from the default branch, not the PR branch. You cannot test workflow changes from a PR — they must be merged first. +### 6. `pull_request_review` workflows run from the PR's head branch +Despite common documentation claiming otherwise, `pull_request_review` events use the workflow file from the **PR's head branch**, not the default branch. Verified empirically on PR #119: a new `if:` condition in the PR branch's lock file was active immediately, without merging to main first. This means workflow changes CAN be tested from a PR. ### 7. GitHub's `action_required` is separate from gh-aw's `pre_activation` `action_required` means GitHub itself blocked the run (first-time contributor approval). No jobs run at all. `pre_activation` is gh-aw's role/bot check within the workflow. @@ -581,4 +587,18 @@ gh run view --log-failed # View failed job logs - PR #109: Reverts labels config, rewrites responder instructions with `***MUST***`/`***DOUBLE CHECK***` ordering enforcement. - **Lesson reinforced**: NEVER add config without verifying the runtime behavior. Read the source code. The compiler accepting a field does not mean the handler implements it. +### 2026-03-15 — Thread ID hallucination discovery + enhanced rescue + +- Discovered Review Responder hallucinates `PRRT_` thread IDs. The GitHub MCP server v0.32.0 doesn't expose thread node IDs to agents. Agent fabricates plausible-looking IDs that fail at GraphQL API. +- gh-aw's own smoke tests confirm this — "Resolve Review Thread" consistently skipped with "could not obtain PRRT node ID via available tools" (smoke tests #20998, #17210, #19689). +- Root cause: github/github-mcp-server#1919 (resolve thread support) merged 2026-03-13, one week after v0.32.0 released. Fix ships in next MCP server release. +- Filed upstream: github/gh-aw#21130 — requesting MCP server upgrade. +- Enhanced PR Rescue workflow: now handles three stuck states (no review, unresolved threads, behind main), runs on 15-min cron, sorts PRs by progress, labels conflicts. + +### 2026-03-16 — Agent workflow label gate + +- Discovered agent workflows (review-responder, quality-gate) fire on ALL PRs — including human-authored ones without the `aw` label. Both agents activated, burned tokens, then noop'd on the prompt-level label check. Discovered on PR #118. (Issue #120) +- Fix: Added `if: "contains(github.event.pull_request.labels.*.name, 'aw')"` to both workflow frontmatters. gh-aw compiles this to a job-level `if:` on the activation job. No `aw` label → workflow skips entirely → zero tokens burned. (PR #119) +- Key finding: `pull_request_review` events use the workflow file from the PR's **head branch**, not the default branch. The `if:` condition was active immediately on PR #119 itself — contradicts common documentation. + diff --git a/docs/changelog.md b/docs/changelog.md index b9ebd08..1b68dad 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -4,6 +4,28 @@ Append-only history of repo-level changes (CI, infra, shared config). Tool-speci --- +## fix: gate agent workflows on aw label — 2026-03-16 + +**Problem**: Agent workflows (review-responder, quality-gate) fired on every `pull_request_review` event, including human-authored PRs. The `aw` label check was only in the agent prompt — a soft guard that still burned compute and inference tokens before noop'ing. Discovered on PR #118. (Issue #120) + +**Fix**: Added `if: "contains(github.event.pull_request.labels.*.name, 'aw')"` to both workflow frontmatters. gh-aw compiles this to a job-level `if:` on the activation job — workflow skips entirely at the GitHub Actions level when the label is absent. Zero tokens burned. (PR #119) + +**Finding**: `pull_request_review` events use the workflow file from the PR's **head branch**, not the default branch. The `if:` condition was active immediately on the PR itself — no need to merge first. (Contradicts common web search results.) + +--- + +## feat: Enhanced PR Rescue — resolve threads, request reviews, handle conflicts — 2026-03-15 + +**Problem**: Agent PRs could get stuck at multiple stages: no Copilot review, unresolved threads (responder hallucinates thread IDs, #114), behind main. Old rescue only handled behind-main. + +**Fix**: Enhanced rescue workflow with three checks per PR (sorted by progress, approved first): +1. No Copilot review → rebase if behind main, then request review +2. Unresolved threads → resolve if bot reply is last comment (uses real GraphQL thread IDs) +3. Behind main → rebase, remove review-response-attempted label before push +Plus: 15-min cron schedule, `aw-conflict` label on merge conflicts, skip conflicted PRs. (PR #121, closes #116) + +--- + ## fix: revert labels config + strengthen responder resolve-before-push — 2026-03-15 **Problem 1**: PR #97 added `labels: ["aw"]` to `create-pull-request` config. This broke label application — the gh-aw handler's post-creation label API call fails non-deterministically with a node ID resolution error, and the tool description tells the agent "labels will be automatically added" so the agent stops including them. PR #104 was created without the `aw` label. (Issue #107)