diff --git a/.github/workflows/upgrade-test.yml b/.github/workflows/upgrade-test.yml index 9afbe30afa4..3f96cb8c970 100644 --- a/.github/workflows/upgrade-test.yml +++ b/.github/workflows/upgrade-test.yml @@ -8,6 +8,7 @@ on: paths: - 'pkg/cli/update_extension_check.go' - '.github/workflows/upgrade-test.yml' + - 'install-gh-aw.sh' permissions: {} @@ -60,11 +61,88 @@ jobs: echo "Target release: $TARGET" echo "Previous release: $PREVIOUS" + # ────────────────────────────────────────────────────────────────────────── + # INVESTIGATION: Windows hangs for ~10 min during `gh extension install`. + # + # Evidence from failed runs (e.g. job/73303392593): + # 1. `gh extension install github/gh-aw --pin ... --force` hangs silently + # for exactly the job timeout (10 min) with zero output. + # 2. At cleanup time the runner always finds an orphan `gh-aw` process, + # which strongly suggests the `gh` CLI is *executing* the freshly + # downloaded binary as part of its installation (e.g. as a + # verification/metadata step that was added in a later gh release). + # 3. When that `gh-aw.exe` subprocess is launched, Windows Defender + # Real-Time Protection intercepts the new executable before it can + # run, causing a prolonged scan that blocks the process indefinitely. + # + # Fix strategy (this PR): + # A. Disable Windows Defender real-time scanning for the gh CLI data + # directory *before* the install so the scan cannot block execution. + # B. Add a per-step timeout of 3 min so failures are caught fast rather + # than hitting the 10-min job ceiling. + # C. Capture `GH_DEBUG=api` on Windows to trace exactly which API call + # or binary invocation causes the stall if Defender is not the cause. + # ────────────────────────────────────────────────────────────────────────── + + - name: Diagnose Windows environment before install + if: runner.os == 'Windows' + shell: pwsh + run: | + Write-Host "=== gh CLI version ===" + gh --version + + Write-Host "=== Windows Defender real-time protection status ===" + try { + $status = Get-MpComputerStatus + Write-Host "RealTimeProtectionEnabled : $($status.RealTimeProtectionEnabled)" + Write-Host "AntivirusEnabled : $($status.AntivirusEnabled)" + } catch { + Write-Host "Could not query Windows Defender status: $_" + } + + Write-Host "=== Existing gh extension directory ===" + $extDir = Join-Path $env:LOCALAPPDATA "GitHub CLI" + if (Test-Path $extDir) { + Get-ChildItem $extDir -Recurse -ErrorAction SilentlyContinue | + Select-Object FullName, Length | Format-Table -AutoSize + } else { + Write-Host "$extDir does not exist yet" + } + + Write-Host "=== Running gh* processes before install ===" + Get-Process -Name 'gh*' -ErrorAction SilentlyContinue | + Select-Object Name, Id, StartTime | Format-Table -AutoSize + + # Disable Windows Defender real-time protection for the gh CLI extensions + # directory. This is the most likely cause of the hang: Defender intercepts + # the newly downloaded gh-aw.exe before it can execute, blocking the process + # indefinitely. Excluding the gh data directory removes that barrier. + - name: Exclude gh CLI directory from Windows Defender scanning + if: runner.os == 'Windows' + shell: pwsh + run: | + $ghDataDir = Join-Path $env:LOCALAPPDATA "GitHub CLI" + Write-Host "Adding Windows Defender exclusion path: $ghDataDir" + Add-MpPreference -ExclusionPath $ghDataDir + Write-Host "Exclusion added. Current exclusions:" + (Get-MpPreference).ExclusionPath + - name: Install previous version (n-1) + # Per-step timeout: the Windows hang hit the 10-min *job* ceiling every + # time. A shorter step timeout lets us fail fast with a clear error and + # collect the diagnostic steps that follow. + timeout-minutes: 4 shell: bash env: PREVIOUS: ${{ steps.releases.outputs.previous }} run: | + # Enable gh API tracing on Windows to pinpoint exactly which network + # or subprocess call stalls (only on Windows to avoid noise elsewhere). + if [[ "$RUNNER_OS" == "Windows" ]]; then + export GH_DEBUG=api + echo "GH_DEBUG=api enabled for Windows debugging" + fi + echo "Installing gh-aw $PREVIOUS ..." gh extension install github/gh-aw --pin "$PREVIOUS" --force @@ -74,6 +152,14 @@ jobs: [ "$INSTALLED" = "$PREVIOUS" ] || { echo "❌ Expected $PREVIOUS, got $INSTALLED"; exit 1; } echo "✅ n-1 installed: $INSTALLED" + - name: Capture orphan processes after install (Windows) + if: runner.os == 'Windows' && always() + shell: pwsh + run: | + Write-Host "=== Processes still running after install step ===" + Get-Process -Name 'gh*' -ErrorAction SilentlyContinue | + Select-Object Name, Id, StartTime, CPU | Format-Table -AutoSize + - name: Run gh aw upgrade (exercises the self-upgrade code path) shell: bash env: diff --git a/actions/setup-cli/install.sh b/actions/setup-cli/install.sh index 63bd5a405a5..b1276d94f3b 100755 --- a/actions/setup-cli/install.sh +++ b/actions/setup-cli/install.sh @@ -179,16 +179,38 @@ fi # Try gh extension install if requested (and gh is available) if [ "$TRY_GH_INSTALL" = true ] && command -v gh &> /dev/null; then print_info "Attempting to install gh-aw using 'gh extension install'..." - + + # On Windows, `gh extension install` has been observed to hang for the full + # job timeout (~10 min) without producing any output. Root-cause hypothesis: + # the gh CLI executes the newly downloaded gh-aw.exe for verification, and + # Windows Defender Real-Time Protection blocks that execution while scanning + # the new binary. Apply a generous-but-bounded timeout so we fall through to + # the direct-download path when this happens instead of hanging indefinitely. + GH_INSTALL_CMD_TIMEOUT="" + if [[ "$OS_NAME" == "windows" ]] && command -v timeout &>/dev/null; then + GH_INSTALL_CMD_TIMEOUT="timeout 90" + print_info "Windows detected: wrapping gh extension install with a 90s timeout" + fi + # Call gh extension install directly to avoid command injection install_result=0 if [ -n "$VERSION" ] && [ "$VERSION" != "latest" ]; then - gh extension install "$REPO" --force --pin "$VERSION" 2>&1 | tee /tmp/gh-install.log + # shellcheck disable=SC2086 # GH_INSTALL_CMD_TIMEOUT is intentionally unquoted + $GH_INSTALL_CMD_TIMEOUT gh extension install "$REPO" --force --pin "$VERSION" 2>&1 | tee /tmp/gh-install.log install_result=${PIPESTATUS[0]} else - gh extension install "$REPO" --force 2>&1 | tee /tmp/gh-install.log + # shellcheck disable=SC2086 # GH_INSTALL_CMD_TIMEOUT is intentionally unquoted + $GH_INSTALL_CMD_TIMEOUT gh extension install "$REPO" --force 2>&1 | tee /tmp/gh-install.log install_result=${PIPESTATUS[0]} fi + + # Exit code 124 means the timeout command fired; treat it the same as a + # failed install so we fall through to the direct-download path below. + if [ $install_result -eq 124 ]; then + print_warning "gh extension install timed out (90s) — falling back to manual installation." + print_warning "This is a known issue on Windows where Defender scans the new binary." + install_result=1 + fi if [ $install_result -eq 0 ]; then # Verify the installation succeeded diff --git a/install-gh-aw.sh b/install-gh-aw.sh index 63bd5a405a5..b1276d94f3b 100755 --- a/install-gh-aw.sh +++ b/install-gh-aw.sh @@ -179,16 +179,38 @@ fi # Try gh extension install if requested (and gh is available) if [ "$TRY_GH_INSTALL" = true ] && command -v gh &> /dev/null; then print_info "Attempting to install gh-aw using 'gh extension install'..." - + + # On Windows, `gh extension install` has been observed to hang for the full + # job timeout (~10 min) without producing any output. Root-cause hypothesis: + # the gh CLI executes the newly downloaded gh-aw.exe for verification, and + # Windows Defender Real-Time Protection blocks that execution while scanning + # the new binary. Apply a generous-but-bounded timeout so we fall through to + # the direct-download path when this happens instead of hanging indefinitely. + GH_INSTALL_CMD_TIMEOUT="" + if [[ "$OS_NAME" == "windows" ]] && command -v timeout &>/dev/null; then + GH_INSTALL_CMD_TIMEOUT="timeout 90" + print_info "Windows detected: wrapping gh extension install with a 90s timeout" + fi + # Call gh extension install directly to avoid command injection install_result=0 if [ -n "$VERSION" ] && [ "$VERSION" != "latest" ]; then - gh extension install "$REPO" --force --pin "$VERSION" 2>&1 | tee /tmp/gh-install.log + # shellcheck disable=SC2086 # GH_INSTALL_CMD_TIMEOUT is intentionally unquoted + $GH_INSTALL_CMD_TIMEOUT gh extension install "$REPO" --force --pin "$VERSION" 2>&1 | tee /tmp/gh-install.log install_result=${PIPESTATUS[0]} else - gh extension install "$REPO" --force 2>&1 | tee /tmp/gh-install.log + # shellcheck disable=SC2086 # GH_INSTALL_CMD_TIMEOUT is intentionally unquoted + $GH_INSTALL_CMD_TIMEOUT gh extension install "$REPO" --force 2>&1 | tee /tmp/gh-install.log install_result=${PIPESTATUS[0]} fi + + # Exit code 124 means the timeout command fired; treat it the same as a + # failed install so we fall through to the direct-download path below. + if [ $install_result -eq 124 ]; then + print_warning "gh extension install timed out (90s) — falling back to manual installation." + print_warning "This is a known issue on Windows where Defender scans the new binary." + install_result=1 + fi if [ $install_result -eq 0 ]; then # Verify the installation succeeded diff --git a/pkg/cli/update_extension_check.go b/pkg/cli/update_extension_check.go index ddda24fa2ad..9a745f8e068 100644 --- a/pkg/cli/update_extension_check.go +++ b/pkg/cli/update_extension_check.go @@ -146,11 +146,38 @@ func upgradeExtensionIfOutdated(verbose bool, includePrereleases bool) (bool, st } } - retryCmd := exec.Command("gh", extensionUpgradeArgs()...) + // Retry path: remove + reinstall at the exact target version. + // + // Using "gh extension upgrade --force" again would call fetchLatestRelease + // (/releases/latest) internally, which returns 404 for prerelease-only repos + // and causes "unable to retrieve latest version for extension" errors. + // Using "gh extension install --pin VERSION" instead calls fetchReleaseFromTag, + // which accepts any tag (stable or prerelease). + // + // We must remove the extension first because "gh extension install" checks + // whether the extension is already present via its manifest.yml. With the + // manifest in place the install command takes the "already installed" code + // path and does nothing; removing the extension clears that guard. + // + // Note: the backup file lives inside the extension directory, so if the + // remove step succeeds the backup is also gone; we clear backupPath to + // avoid a misleading restore attempt on subsequent failures. + removeCmd := exec.Command("gh", "extension", "remove", extensionRepo) + removeCmd.Stdout = os.Stderr + removeCmd.Stderr = os.Stderr + if removeErr := removeCmd.Run(); removeErr == nil { + // Extension directory (and the backup inside it) has been deleted. + backupPath = "" + } else { + updateExtensionCheckLog.Printf("Could not remove extension before reinstall (will attempt install anyway): %v", removeErr) + } + + retryCmd := exec.Command("gh", "extension", "install", extensionRepo, "--pin", latestVersion) retryCmd.Stdout = os.Stderr retryCmd.Stderr = os.Stderr if retryErr := retryCmd.Run(); retryErr != nil { - // Retry also failed. Restore the backup so the user still has gh-aw. + // Retry also failed. Restore the backup so the user still has gh-aw + // (only possible when the remove step above did not succeed). if backupPath != "" { restoreExecutableBackup(installPath, backupPath) } @@ -162,12 +189,13 @@ func upgradeExtensionIfOutdated(verbose bool, includePrereleases bool) (bool, st fmt.Fprintln(os.Stderr, " gh extension upgrade gh-aw") fmt.Fprintln(os.Stderr, console.FormatInfoMessage("If that does not work, try reinstalling:")) fmt.Fprintln(os.Stderr, " gh extension remove gh-aw") - fmt.Fprintln(os.Stderr, " gh extension install github/gh-aw") + fmt.Fprintln(os.Stderr, " gh extension install "+extensionRepo) } return false, "", fmt.Errorf("failed to upgrade gh-aw extension: %w", retryErr) } - // Retry succeeded. Clean up the backup. + // Retry succeeded. Clean up the backup if it still exists + // (it will be gone when the remove step above succeeded). if backupPath != "" { cleanupExecutableBackup(backupPath) } @@ -262,5 +290,8 @@ func isWindowsLockError(output string, err error) bool { // --force is required so pinned installs (e.g. `gh extension install ... --pin`) // can be upgraded in-place. func extensionUpgradeArgs() []string { - return []string{"extension", "upgrade", "github/gh-aw", "--force"} + return []string{"extension", "upgrade", extensionRepo, "--force"} } + +// extensionRepo is the GitHub repo slug used in all gh-extension CLI invocations. +const extensionRepo = "github/gh-aw"