-
Notifications
You must be signed in to change notification settings - Fork 371
fix: Windows gh extension install hang and upgrade retry failure in upgrade test (#73303392593) #28860
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: Windows gh extension install hang and upgrade retry failure in upgrade test (#73303392593) #28860
Changes from all commits
cab1ac2
5467402
571a9bb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,6 +8,7 @@ on: | |
| paths: | ||
| - 'pkg/cli/update_extension_check.go' | ||
| - '.github/workflows/upgrade-test.yml' | ||
| - 'install-gh-aw.sh' | ||
|
|
||
| permissions: {} | ||
|
|
||
|
|
@@ -60,11 +61,88 @@ jobs: | |
| echo "Target release: $TARGET" | ||
| echo "Previous release: $PREVIOUS" | ||
|
|
||
| # ────────────────────────────────────────────────────────────────────────── | ||
| # INVESTIGATION: Windows hangs for ~10 min during `gh extension install`. | ||
| # | ||
| # Evidence from failed runs (e.g. job/73303392593): | ||
| # 1. `gh extension install github/gh-aw --pin ... --force` hangs silently | ||
| # for exactly the job timeout (10 min) with zero output. | ||
| # 2. At cleanup time the runner always finds an orphan `gh-aw` process, | ||
| # which strongly suggests the `gh` CLI is *executing* the freshly | ||
| # downloaded binary as part of its installation (e.g. as a | ||
| # verification/metadata step that was added in a later gh release). | ||
| # 3. When that `gh-aw.exe` subprocess is launched, Windows Defender | ||
| # Real-Time Protection intercepts the new executable before it can | ||
| # run, causing a prolonged scan that blocks the process indefinitely. | ||
| # | ||
| # Fix strategy (this PR): | ||
| # A. Disable Windows Defender real-time scanning for the gh CLI data | ||
| # directory *before* the install so the scan cannot block execution. | ||
| # B. Add a per-step timeout of 3 min so failures are caught fast rather | ||
| # than hitting the 10-min job ceiling. | ||
| # C. Capture `GH_DEBUG=api` on Windows to trace exactly which API call | ||
| # or binary invocation causes the stall if Defender is not the cause. | ||
| # ────────────────────────────────────────────────────────────────────────── | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Smoke Test Review Comment — The Windows Defender exclusion step ( |
||
| - name: Diagnose Windows environment before install | ||
| if: runner.os == 'Windows' | ||
| shell: pwsh | ||
| run: | | ||
| Write-Host "=== gh CLI version ===" | ||
| gh --version | ||
|
|
||
| Write-Host "=== Windows Defender real-time protection status ===" | ||
| try { | ||
| $status = Get-MpComputerStatus | ||
| Write-Host "RealTimeProtectionEnabled : $($status.RealTimeProtectionEnabled)" | ||
| Write-Host "AntivirusEnabled : $($status.AntivirusEnabled)" | ||
| } catch { | ||
| Write-Host "Could not query Windows Defender status: $_" | ||
| } | ||
|
|
||
| Write-Host "=== Existing gh extension directory ===" | ||
| $extDir = Join-Path $env:LOCALAPPDATA "GitHub CLI" | ||
| if (Test-Path $extDir) { | ||
| Get-ChildItem $extDir -Recurse -ErrorAction SilentlyContinue | | ||
| Select-Object FullName, Length | Format-Table -AutoSize | ||
| } else { | ||
| Write-Host "$extDir does not exist yet" | ||
| } | ||
|
|
||
| Write-Host "=== Running gh* processes before install ===" | ||
| Get-Process -Name 'gh*' -ErrorAction SilentlyContinue | | ||
| Select-Object Name, Id, StartTime | Format-Table -AutoSize | ||
|
|
||
| # Disable Windows Defender real-time protection for the gh CLI extensions | ||
| # directory. This is the most likely cause of the hang: Defender intercepts | ||
| # the newly downloaded gh-aw.exe before it can execute, blocking the process | ||
| # indefinitely. Excluding the gh data directory removes that barrier. | ||
| - name: Exclude gh CLI directory from Windows Defender scanning | ||
| if: runner.os == 'Windows' | ||
| shell: pwsh | ||
| run: | | ||
| $ghDataDir = Join-Path $env:LOCALAPPDATA "GitHub CLI" | ||
| Write-Host "Adding Windows Defender exclusion path: $ghDataDir" | ||
| Add-MpPreference -ExclusionPath $ghDataDir | ||
| Write-Host "Exclusion added. Current exclusions:" | ||
| (Get-MpPreference).ExclusionPath | ||
|
|
||
| - name: Install previous version (n-1) | ||
| # Per-step timeout: the Windows hang hit the 10-min *job* ceiling every | ||
| # time. A shorter step timeout lets us fail fast with a clear error and | ||
| # collect the diagnostic steps that follow. | ||
| timeout-minutes: 4 | ||
| shell: bash | ||
| env: | ||
| PREVIOUS: ${{ steps.releases.outputs.previous }} | ||
| run: | | ||
| # Enable gh API tracing on Windows to pinpoint exactly which network | ||
| # or subprocess call stalls (only on Windows to avoid noise elsewhere). | ||
| if [[ "$RUNNER_OS" == "Windows" ]]; then | ||
| export GH_DEBUG=api | ||
| echo "GH_DEBUG=api enabled for Windows debugging" | ||
| fi | ||
|
|
||
| echo "Installing gh-aw $PREVIOUS ..." | ||
| gh extension install github/gh-aw --pin "$PREVIOUS" --force | ||
|
|
||
|
|
@@ -74,6 +152,14 @@ jobs: | |
| [ "$INSTALLED" = "$PREVIOUS" ] || { echo "❌ Expected $PREVIOUS, got $INSTALLED"; exit 1; } | ||
| echo "✅ n-1 installed: $INSTALLED" | ||
|
|
||
| - name: Capture orphan processes after install (Windows) | ||
| if: runner.os == 'Windows' && always() | ||
| shell: pwsh | ||
| run: | | ||
| Write-Host "=== Processes still running after install step ===" | ||
| Get-Process -Name 'gh*' -ErrorAction SilentlyContinue | | ||
| Select-Object Name, Id, StartTime, CPU | Format-Table -AutoSize | ||
|
|
||
| - name: Run gh aw upgrade (exercises the self-upgrade code path) | ||
| shell: bash | ||
| env: | ||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -179,16 +179,38 @@ fi | |
| # Try gh extension install if requested (and gh is available) | ||
| if [ "$TRY_GH_INSTALL" = true ] && command -v gh &> /dev/null; then | ||
| print_info "Attempting to install gh-aw using 'gh extension install'..." | ||
|
|
||
|
|
||
| # On Windows, `gh extension install` has been observed to hang for the full | ||
| # job timeout (~10 min) without producing any output. Root-cause hypothesis: | ||
| # the gh CLI executes the newly downloaded gh-aw.exe for verification, and | ||
| # Windows Defender Real-Time Protection blocks that execution while scanning | ||
| # the new binary. Apply a generous-but-bounded timeout so we fall through to | ||
| # the direct-download path when this happens instead of hanging indefinitely. | ||
| GH_INSTALL_CMD_TIMEOUT="" | ||
| if [[ "$OS_NAME" == "windows" ]] && command -v timeout &>/dev/null; then | ||
| GH_INSTALL_CMD_TIMEOUT="timeout 90" | ||
| print_info "Windows detected: wrapping gh extension install with a 90s timeout" | ||
| fi | ||
|
|
||
| # Call gh extension install directly to avoid command injection | ||
| install_result=0 | ||
| if [ -n "$VERSION" ] && [ "$VERSION" != "latest" ]; then | ||
| gh extension install "$REPO" --force --pin "$VERSION" 2>&1 | tee /tmp/gh-install.log | ||
| # shellcheck disable=SC2086 # GH_INSTALL_CMD_TIMEOUT is intentionally unquoted | ||
| $GH_INSTALL_CMD_TIMEOUT gh extension install "$REPO" --force --pin "$VERSION" 2>&1 | tee /tmp/gh-install.log | ||
| install_result=${PIPESTATUS[0]} | ||
| else | ||
| gh extension install "$REPO" --force 2>&1 | tee /tmp/gh-install.log | ||
| # shellcheck disable=SC2086 # GH_INSTALL_CMD_TIMEOUT is intentionally unquoted | ||
| $GH_INSTALL_CMD_TIMEOUT gh extension install "$REPO" --force 2>&1 | tee /tmp/gh-install.log | ||
| install_result=${PIPESTATUS[0]} | ||
| fi | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The SC2086 disable comment is needed here, but consider using an array instead: declare the timeout as an array (GH_INSTALL_CMD_TIMEOUT=() or GH_INSTALL_CMD_TIMEOUT=(timeout 90)) so it expands safely without shellcheck suppression. |
||
| # Exit code 124 means the timeout command fired; treat it the same as a | ||
| # failed install so we fall through to the direct-download path below. | ||
| if [ $install_result -eq 124 ]; then | ||
| print_warning "gh extension install timed out (90s) — falling back to manual installation." | ||
| print_warning "This is a known issue on Windows where Defender scans the new binary." | ||
| install_result=1 | ||
| fi | ||
|
|
||
| if [ $install_result -eq 0 ]; then | ||
| # Verify the installation succeeded | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -146,11 +146,38 @@ func upgradeExtensionIfOutdated(verbose bool, includePrereleases bool) (bool, st | |
| } | ||
| } | ||
|
|
||
| retryCmd := exec.Command("gh", extensionUpgradeArgs()...) | ||
| // Retry path: remove + reinstall at the exact target version. | ||
| // | ||
| // Using "gh extension upgrade --force" again would call fetchLatestRelease | ||
| // (/releases/latest) internally, which returns 404 for prerelease-only repos | ||
| // and causes "unable to retrieve latest version for extension" errors. | ||
|
Comment on lines
+149
to
+153
|
||
| // Using "gh extension install --pin VERSION" instead calls fetchReleaseFromTag, | ||
| // which accepts any tag (stable or prerelease). | ||
| // | ||
| // We must remove the extension first because "gh extension install" checks | ||
| // whether the extension is already present via its manifest.yml. With the | ||
| // manifest in place the install command takes the "already installed" code | ||
| // path and does nothing; removing the extension clears that guard. | ||
| // | ||
| // Note: the backup file lives inside the extension directory, so if the | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Smoke Test Review Comment — The remove+reinstall retry logic correctly handles the prerelease-only repo limitation. Clearing |
||
| // remove step succeeds the backup is also gone; we clear backupPath to | ||
| // avoid a misleading restore attempt on subsequent failures. | ||
| removeCmd := exec.Command("gh", "extension", "remove", extensionRepo) | ||
| removeCmd.Stdout = os.Stderr | ||
|
Comment on lines
+162
to
+166
|
||
| removeCmd.Stderr = os.Stderr | ||
| if removeErr := removeCmd.Run(); removeErr == nil { | ||
| // Extension directory (and the backup inside it) has been deleted. | ||
| backupPath = "" | ||
| } else { | ||
| updateExtensionCheckLog.Printf("Could not remove extension before reinstall (will attempt install anyway): %v", removeErr) | ||
| } | ||
|
|
||
| retryCmd := exec.Command("gh", "extension", "install", extensionRepo, "--pin", latestVersion) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. removeCmd.Stdout is set to os.Stderr here - intentional for interleaving remove output with error output, but worth a comment explaining why stdout goes to stderr. |
||
| retryCmd.Stdout = os.Stderr | ||
| retryCmd.Stderr = os.Stderr | ||
| if retryErr := retryCmd.Run(); retryErr != nil { | ||
| // Retry also failed. Restore the backup so the user still has gh-aw. | ||
| // Retry also failed. Restore the backup so the user still has gh-aw | ||
| // (only possible when the remove step above did not succeed). | ||
| if backupPath != "" { | ||
| restoreExecutableBackup(installPath, backupPath) | ||
| } | ||
|
|
@@ -162,12 +189,13 @@ func upgradeExtensionIfOutdated(verbose bool, includePrereleases bool) (bool, st | |
| fmt.Fprintln(os.Stderr, " gh extension upgrade gh-aw") | ||
| fmt.Fprintln(os.Stderr, console.FormatInfoMessage("If that does not work, try reinstalling:")) | ||
| fmt.Fprintln(os.Stderr, " gh extension remove gh-aw") | ||
| fmt.Fprintln(os.Stderr, " gh extension install github/gh-aw") | ||
| fmt.Fprintln(os.Stderr, " gh extension install "+extensionRepo) | ||
| } | ||
| return false, "", fmt.Errorf("failed to upgrade gh-aw extension: %w", retryErr) | ||
| } | ||
|
|
||
| // Retry succeeded. Clean up the backup. | ||
| // Retry succeeded. Clean up the backup if it still exists | ||
| // (it will be gone when the remove step above succeeded). | ||
| if backupPath != "" { | ||
| cleanupExecutableBackup(backupPath) | ||
| } | ||
|
|
@@ -262,5 +290,8 @@ func isWindowsLockError(output string, err error) bool { | |
| // --force is required so pinned installs (e.g. `gh extension install ... --pin`) | ||
| // can be upgraded in-place. | ||
| func extensionUpgradeArgs() []string { | ||
| return []string{"extension", "upgrade", "github/gh-aw", "--force"} | ||
| return []string{"extension", "upgrade", extensionRepo, "--force"} | ||
| } | ||
|
|
||
| // extensionRepo is the GitHub repo slug used in all gh-extension CLI invocations. | ||
| const extensionRepo = "github/gh-aw" | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comment says the per-step timeout is "3 min", but the actual
timeout-minutesconfigured on the install step is 4. Please align the comment with the configuration (or adjust the timeout) to avoid confusion when debugging workflow timeouts.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed! The comment says 3 min but timeout-minutes is 4. A suggestion code block was already included - hope the author picks it up!