diff --git a/.github/workflows/pr-rescue.yml b/.github/workflows/pr-rescue.yml index 537ecce..d5e3b5c 100644 --- a/.github/workflows/pr-rescue.yml +++ b/.github/workflows/pr-rescue.yml @@ -1,12 +1,16 @@ name: PR Rescue -# Runs when main advances (PR merged). Rebases open agent PRs that are -# behind main so auto-merge can proceed. With dismiss_stale_reviews -# disabled, the existing quality-gate approval survives the rebase. +# Unsticks agent PRs at any pipeline stage: +# - No Copilot review → requests one (rebases first if behind main) +# - Unresolved threads → resolves if addressed (bot reply is last comment) +# - Behind main → rebases (removes review-response-attempted label for fresh responder run) +# - Merge conflicts → labels aw-conflict, skips in future runs on: push: branches: [main] + schedule: + - cron: '*/15 * * * *' workflow_dispatch: permissions: @@ -33,74 +37,170 @@ jobs: git config user.name "github-actions[bot]" git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - - name: Find and rebase stuck agent PRs + - name: Rescue stuck agent PRs env: GH_TOKEN: ${{ secrets.GH_AW_WRITE_TOKEN }} run: | set -euo pipefail + # ── Discover agent PRs ────────────────────────────────────── + # Ensure aw-conflict label exists + gh label create aw-conflict --color D93F0B --description "PR has merge conflicts — rescue skips" 2>/dev/null || true + echo "::group::Finding open agent PRs with auto-merge enabled" - prs=$(gh pr list --state open --label aw --json number,headRefName,autoMergeRequest \ - --jq '.[] | select(.autoMergeRequest != null) | "\(.number) \(.headRefName)"') + prs=$(gh pr list --state open --label aw --json number,headRefName,autoMergeRequest,reviewDecision,labels \ + --jq '[.[] | select(.autoMergeRequest != null) | select([.labels[].name] | index("aw-conflict") | not)]') - if [ -z "$prs" ]; then - echo "No agent PRs with auto-merge enabled. Nothing to rescue." + count=$(echo "$prs" | jq 'length') + if [ "$count" = "0" ] || [ -z "$prs" ]; then + echo "No rescuable agent PRs found. Exiting." exit 0 fi - echo "Found PRs to check:" - echo "$prs" + + # Sort: approved PRs first (closest to merging) + sorted=$(echo "$prs" | jq -r 'sort_by(if .reviewDecision == "APPROVED" then 0 else 1 end) | .[] | "\(.number) \(.headRefName)"') + echo "Found $count PR(s) to check (sorted by progress):" + echo "$sorted" echo "::endgroup::" rescued=0 while IFS=' ' read -r pr_number branch; do echo "::group::Processing PR #${pr_number} (branch: ${branch})" - # Check if PR is specifically behind main (not blocked for other reasons) - merge_state=$(gh pr view "$pr_number" --json mergeStateStatus --jq '.mergeStateStatus') - echo "Merge state: ${merge_state}" - - if [ "$merge_state" != "BEHIND" ]; then - echo "PR #${pr_number} is not behind main (state: ${merge_state}). Skipping." - echo "::endgroup::" - continue - fi - - # Check if PR has an approving review (no point rebasing if not approved yet) - review_decision=$(gh pr view "$pr_number" --json reviewDecision --jq '.reviewDecision') - if [ "$review_decision" != "APPROVED" ]; then - echo "PR #${pr_number} has no approval yet (decision: ${review_decision}). Skipping." - echo "::endgroup::" - continue - fi + # ── Gather PR state ───────────────────────────────────── + merge_state=$(gh pr view "$pr_number" --json mergeStateStatus --jq '.mergeStateStatus' 2>/dev/null) || { echo "::warning::Failed to query PR #${pr_number}. Skipping."; echo "::endgroup::"; continue; } + review_decision=$(gh pr view "$pr_number" --json reviewDecision --jq '.reviewDecision' 2>/dev/null) || { echo "::warning::Failed to query PR #${pr_number}. Skipping."; echo "::endgroup::"; continue; } + has_copilot_review=$(gh pr view "$pr_number" --json reviews \ + --jq '[.reviews[] | select(.author.login == "copilot-pull-request-reviewer")] | length' 2>/dev/null) || has_copilot_review=0 - echo "PR #${pr_number} is approved but behind main. Rebasing..." + echo "State: merge=${merge_state} review=${review_decision} copilot_reviews=${has_copilot_review}" - if ! git fetch origin "$branch" 2>&1; then - echo "::warning::Failed to fetch branch for PR #${pr_number}. Branch may have been deleted. Skipping." + # ── Helper: rebase onto main ──────────────────────────── + do_rebase() { + if ! git fetch origin "$branch" 2>&1; then + echo "::warning::Failed to fetch branch for PR #${pr_number}. Skipping." + return 1 + fi + if ! git checkout "$branch" 2>&1; then + echo "::warning::Failed to checkout branch for PR #${pr_number}. Skipping." + return 1 + fi + if git rebase origin/main; then + # Remove label BEFORE push so responder can run on new comments + gh pr edit "$pr_number" --remove-label review-response-attempted 2>/dev/null || true + if git push --force-with-lease origin "$branch" 2>&1; then + echo "✅ PR #${pr_number} rebased successfully." + git checkout main + return 0 + else + echo "::warning::Push failed for PR #${pr_number}. Skipping." + git checkout main + return 1 + fi + else + echo "::warning::Rebase failed for PR #${pr_number} — merge conflict." + git rebase --abort 2>/dev/null || true + git checkout main + gh pr edit "$pr_number" --add-label aw-conflict 2>/dev/null || true + gh pr comment "$pr_number" --body "PR Rescue: rebase onto main failed due to merge conflicts. Labeled \`aw-conflict\` — will not retry until conflict is resolved manually." 2>/dev/null || true + return 1 + fi + } + + # ── Check 1: No Copilot review ────────────────────────── + if [ "$has_copilot_review" = "0" ]; then + # If also behind main, rebase first so Copilot reviews fresh code + if [ "$merge_state" = "BEHIND" ]; then + echo "PR #${pr_number} has no review and is behind main. Rebasing first..." + if ! do_rebase; then + echo "::endgroup::" + continue + fi + fi + echo "PR #${pr_number} has no Copilot review. Requesting..." + gh pr edit "$pr_number" --add-reviewer @copilot 2>/dev/null || true + echo "✅ Copilot review requested for PR #${pr_number}. Will check again next cycle." + rescued=$((rescued + 1)) echo "::endgroup::" continue fi - if ! git checkout "$branch" 2>&1; then - echo "::warning::Failed to checkout branch for PR #${pr_number}. Skipping." - echo "::endgroup::" - continue + # ── Check 2: Unresolved threads ───────────────────────── + unresolved_threads=$(gh api graphql -f query='query($pr: Int!) { + repository(owner: "'"${GITHUB_REPOSITORY_OWNER}"'", name: "'"${GITHUB_REPOSITORY#*/}"'") { + pullRequest(number: $pr) { + reviewThreads(first: 50) { + nodes { + id + isResolved + comments(last: 1) { + nodes { + author { login } + } + } + } + } + } + } + }' -F pr="$pr_number" --jq '.data.repository.pullRequest.reviewThreads.nodes[] | select(.isResolved == false)') + + if [ -n "$unresolved_threads" ]; then + echo "PR #${pr_number} has unresolved threads. Checking if safe to resolve..." + + # Write thread data to temp file to avoid subshell variable scoping + echo "$unresolved_threads" | jq -c '.' > /tmp/rescue_threads.jsonl + while read -r thread; do + thread_id=$(echo "$thread" | jq -r '.id') + last_author=$(echo "$thread" | jq -r '.comments.nodes[0].author.login // "unknown"') + + if [ "$last_author" = "github-actions[bot]" ] || [ "$last_author" = "github-actions" ]; then + echo " Thread ${thread_id}: last comment by bot — resolving..." + if gh api graphql -f query='mutation($id: ID!) { resolveReviewThread(input: {threadId: $id}) { thread { isResolved } } }' -f id="$thread_id" >/dev/null 2>&1; then + echo " ✅ Resolved ${thread_id}" + else + echo " ::warning::Failed to resolve ${thread_id}" + fi + else + echo " Thread ${thread_id}: last comment by ${last_author} — not safe to resolve. Skipping." + fi + done < /tmp/rescue_threads.jsonl + + # Re-check if any unresolved threads remain + remaining=$(gh api graphql -f query='query($pr: Int!) { + repository(owner: "'"${GITHUB_REPOSITORY_OWNER}"'", name: "'"${GITHUB_REPOSITORY#*/}"'") { + pullRequest(number: $pr) { + reviewThreads(first: 50) { + nodes { isResolved } + } + } + } + }' -F pr="$pr_number" --jq '[.data.repository.pullRequest.reviewThreads.nodes[] | select(.isResolved == false)] | length') + + if [ "$remaining" != "0" ]; then + echo "PR #${pr_number} still has ${remaining} unresolved thread(s). Cannot proceed." + echo "::endgroup::" + continue + fi + echo "All threads resolved for PR #${pr_number}." + rescued=$((rescued + 1)) fi - if git rebase origin/main; then - if git push --force-with-lease origin "$branch" 2>&1; then - echo "✅ PR #${pr_number} rebased. CI will rerun, approval survives, auto-merge will fire." + # ── Check 3: Behind main (approved + threads clear) ───── + # Re-query merge state — may have changed after thread resolution + merge_state=$(gh pr view "$pr_number" --json mergeStateStatus --jq '.mergeStateStatus') + review_decision=$(gh pr view "$pr_number" --json reviewDecision --jq '.reviewDecision') + if [ "$merge_state" = "BEHIND" ] && [ "$review_decision" = "APPROVED" ]; then + echo "PR #${pr_number} is approved, threads clear, but behind main. Rebasing..." + if do_rebase; then rescued=$((rescued + 1)) - else - echo "::warning::Push failed for PR #${pr_number}. Branch may have been updated. Skipping." fi + elif [ "$merge_state" != "BEHIND" ] && [ "$review_decision" = "APPROVED" ]; then + echo "PR #${pr_number} is approved, up to date, threads clear. Auto-merge should handle it." else - echo "::warning::Rebase failed for PR #${pr_number} — likely has conflicts. Skipping." - git rebase --abort 2>/dev/null || true + echo "PR #${pr_number}: merge=${merge_state} review=${review_decision}. Nothing to do." fi - git checkout main echo "::endgroup::" - done <<< "$prs" + done <<< "$sorted" echo "Rescued ${rescued} PR(s)." diff --git a/docs/agentic-workflows.md b/docs/agentic-workflows.md index 2395045..c1726fa 100644 --- a/docs/agentic-workflows.md +++ b/docs/agentic-workflows.md @@ -36,7 +36,7 @@ Audit/Health Agent → creates issue (max 2) → dispatches Implementer → LOW/MEDIUM impact → approves + adds quality-gate-approved label → auto-merge fires → HIGH impact → flags for human review (auto-merge stays blocked) → PR behind main after another PR merges? - → PR Rescue workflow (push to main trigger) → rebases → CI reruns → re-approves → auto-merge fires + → PR Rescue workflow (push to main + cron 15min) → requests review / resolves threads / rebases → auto-merge fires ``` @@ -392,16 +392,19 @@ This is a known limitation for solo repos. Agent PRs don't need this — the qua ### PR Rescue Workflow -When a PR merges to main, other open agent PRs fall behind. With `strict: true` (require branch to be up to date), auto-merge blocks until the branch is rebased. +The **PR Rescue workflow** (`.github/workflows/pr-rescue.yml`) unsticks agent PRs at any pipeline stage. Runs on push to main, every 15 minutes on cron, and manual dispatch. -The **PR Rescue workflow** (`.github/workflows/pr-rescue.yml`) fires on every push to main: -1. Finds open `aw`-labeled PRs with auto-merge enabled that are behind main and already approved -2. Rebases them onto latest main -3. CI reruns on the rebased commit -4. Since `dismiss_stale_reviews` is disabled, the existing approval survives -5. Auto-merge fires +PRs are sorted by progress (approved first). For each `aw`-labeled PR with auto-merge enabled (excluding `aw-conflict` labeled): -Note: `dismiss_stale_reviews` is set to `false` to support this flow. This is safe for a solo-developer repo. If collaborators are added, re-evaluate this setting. +**Check 1 — No Copilot review**: If the PR has no review from `copilot-pull-request-reviewer`, rebase first if behind main (so Copilot reviews fresh code), then request a review. Stops processing — next cycle picks up. + +**Check 2 — Unresolved threads**: Queries real thread IDs via GraphQL. For each unresolved thread, checks if the last comment is from `github-actions[bot]` (responder addressed it). If so, resolves it. If not, leaves it (human needs to look). If any remain unresolved, stops processing. + +**Check 3 — Behind main**: If approved + all threads resolved + BEHIND, rebases onto main. Removes `review-response-attempted` label BEFORE pushing so the responder can run again on any new Copilot comments. + +**Conflict handling**: If rebase fails (merge conflict), adds `aw-conflict` label and posts a comment. Future rescue runs skip `aw-conflict` PRs. + +Note: `dismiss_stale_reviews` is set to `false` so approvals survive rebases. Safe for a solo-developer repo. If collaborators are added, re-evaluate this setting. @@ -581,4 +584,12 @@ gh run view --log-failed # View failed job logs - PR #109: Reverts labels config, rewrites responder instructions with `***MUST***`/`***DOUBLE CHECK***` ordering enforcement. - **Lesson reinforced**: NEVER add config without verifying the runtime behavior. Read the source code. The compiler accepting a field does not mean the handler implements it. +### 2026-03-15 — Thread ID hallucination discovery + enhanced rescue + +- Discovered Review Responder hallucinates `PRRT_` thread IDs. The GitHub MCP server v0.32.0 doesn't expose thread node IDs to agents. Agent fabricates plausible-looking IDs that fail at GraphQL API. +- gh-aw's own smoke tests confirm this — "Resolve Review Thread" consistently skipped with "could not obtain PRRT node ID via available tools" (smoke tests #20998, #17210, #19689). +- Root cause: github/github-mcp-server#1919 (resolve thread support) merged 2026-03-13, one week after v0.32.0 released. Fix ships in next MCP server release. +- Filed upstream: github/gh-aw#21130 — requesting MCP server upgrade. +- Enhanced PR Rescue workflow: now handles three stuck states (no review, unresolved threads, behind main), runs on 15-min cron, sorts PRs by progress, labels conflicts. + diff --git a/docs/changelog.md b/docs/changelog.md index b9ebd08..3f6c6bb 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -4,6 +4,18 @@ Append-only history of repo-level changes (CI, infra, shared config). Tool-speci --- +## feat: Enhanced PR Rescue — resolve threads, request reviews, handle conflicts — 2026-03-15 + +**Problem**: Agent PRs could get stuck at multiple stages: no Copilot review, unresolved threads (responder hallucinates thread IDs, #114), behind main. Old rescue only handled behind-main. + +**Fix**: Enhanced rescue workflow with three checks per PR (sorted by progress, approved first): +1. No Copilot review → rebase if behind main, then request review +2. Unresolved threads → resolve if bot reply is last comment (uses real GraphQL thread IDs) +3. Behind main → rebase, remove review-response-attempted label before push +Plus: 15-min cron schedule, `aw-conflict` label on merge conflicts, skip conflicted PRs. (PR #118, closes #116) + +--- + ## fix: revert labels config + strengthen responder resolve-before-push — 2026-03-15 **Problem 1**: PR #97 added `labels: ["aw"]` to `create-pull-request` config. This broke label application — the gh-aw handler's post-creation label API call fails non-deterministically with a node ID resolution error, and the tool description tells the agent "labels will be automatically added" so the agent stops including them. PR #104 was created without the `aw` label. (Issue #107)