diff --git a/actions/setup/js/start_mcp_gateway.cjs b/actions/setup/js/start_mcp_gateway.cjs index 57f2e0b5603..bfca155fb2f 100644 --- a/actions/setup/js/start_mcp_gateway.cjs +++ b/actions/setup/js/start_mcp_gateway.cjs @@ -31,6 +31,7 @@ const { spawn, execSync } = require("child_process"); const fs = require("fs"); const http = require("http"); const path = require("path"); +const { withRetry } = require("./error_recovery.cjs"); // --------------------------------------------------------------------------- // Timing helpers @@ -370,43 +371,59 @@ async function main() { core.info(`Health endpoint: ${healthUrl}`); core.info(`(Note: MCP_GATEWAY_DOMAIN is '${gatewayDomain}' for container access)`); - core.info("Retrying up to 120 times with 1s delay (120s total timeout)"); + core.info("Retrying up to 120 times with exponential backoff (250ms to 1s, ~120s total timeout)"); core.info(""); - const maxRetries = 120; + const maxTotalAttempts = 120; + // withRetry's maxRetries excludes the initial attempt. + const maxRetryCount = maxTotalAttempts - 1; + const initialRetryDelayMs = 250; let httpCode = 0; let healthBody = ""; let succeeded = false; + let attemptsMade = 0; core.info("=== Health Check Progress ==="); - for (let attempt = 1; attempt <= maxRetries; attempt++) { - const elapsedSec = Math.floor((nowMs() - healthCheckStart) / 1000); - if (attempt % 10 === 1 || attempt === 1) { - core.info(`Attempt ${attempt}/${maxRetries} (${elapsedSec}s elapsed)...`); - } - - try { - const res = await httpGet(healthUrl, 2000); - httpCode = res.statusCode; - healthBody = res.body; - if (httpCode === 200 && healthBody) { - core.info(`✓ Health check succeeded on attempt ${attempt} (${elapsedSec}s elapsed)`); - succeeded = true; - break; - } - } catch { - // Connection refused / timeout – retry - } + try { + await withRetry( + async () => { + // Counts total health-check attempts, including the final successful attempt. + attemptsMade += 1; + const elapsedSec = Math.floor((nowMs() - healthCheckStart) / 1000); + if (attemptsMade % 10 === 1 || attemptsMade === 1) { + core.info(`Attempt ${attemptsMade}/${maxTotalAttempts} (${elapsedSec}s elapsed)...`); + } - if (attempt < maxRetries) { - await sleep(1000); - } + const res = await httpGet(healthUrl, 2000); + httpCode = res.statusCode; + healthBody = res.body; + if (httpCode === 200 && healthBody) { + core.info(`✓ Health check succeeded on attempt ${attemptsMade} (${elapsedSec}s elapsed)`); + succeeded = true; + return; + } + throw new Error(`Health endpoint not ready (HTTP ${httpCode || 0})`); + }, + { + maxRetries: maxRetryCount, + initialDelayMs: initialRetryDelayMs, + maxDelayMs: 1000, + backoffMultiplier: 2, + jitterMs: 0, + // Preserve previous loop behavior: retry any health-check failure until attempts are exhausted. + shouldRetry: () => true, + }, + "MCP gateway health check" + ); + } catch { + // Retry exhaustion is handled below using existing diagnostics. } + core.info("=== End Health Check Progress ==="); core.info(""); core.info(`Final HTTP code: ${httpCode}`); - core.info(`Total attempts: ${maxRetries}`); + core.info(`Total attempts: ${attemptsMade}`); if (healthBody) { core.info(`Health response body: ${healthBody}`); } else { diff --git a/actions/setup/sh/start_mcp_gateway.sh b/actions/setup/sh/start_mcp_gateway.sh index 824bff5a9a7..604428836d7 100755 --- a/actions/setup/sh/start_mcp_gateway.sh +++ b/actions/setup/sh/start_mcp_gateway.sh @@ -209,7 +209,7 @@ HEALTH_CHECK_START=$(date +%s%3N) HEALTH_CHECK_HOST="localhost" echo "Health endpoint: http://${HEALTH_CHECK_HOST}:${MCP_GATEWAY_PORT}/health" echo "(Note: MCP_GATEWAY_DOMAIN is '${MCP_GATEWAY_DOMAIN}' for container access)" -echo "Retrying up to 120 times with 1s delay (120s total timeout)" +echo "Retrying up to 120 times with exponential backoff (250ms to 1s, ~120s total timeout)" echo "" # Check health endpoint using localhost (since we're running on the host) @@ -219,7 +219,6 @@ echo "" set +e MAX_RETRIES=120 -RETRY_DELAY=1 RETRY_COUNT=0 HTTP_CODE="" HEALTH_RESPONSE="" @@ -234,7 +233,6 @@ while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do ELAPSED_MS=$(($(date +%s%3N) - HEALTH_CHECK_START)) ELAPSED_SEC=$((ELAPSED_MS / 1000)) - # Show progress every 10 retries or on first attempt if [ $((RETRY_COUNT % 10)) -eq 1 ] || [ $RETRY_COUNT -eq 1 ]; then echo "Attempt $RETRY_COUNT/$MAX_RETRIES (${ELAPSED_SEC}s elapsed)..." fi @@ -255,7 +253,16 @@ while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do # If this is not the last attempt, wait before retrying if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then - sleep $RETRY_DELAY + # Exponential backoff with 1s cap: + # attempt 1 -> 0.25s, attempt 2 -> 0.5s, attempt 3+ -> 1s + if [ $RETRY_COUNT -eq 1 ]; then + RETRY_DELAY="0.25" + elif [ $RETRY_COUNT -eq 2 ]; then + RETRY_DELAY="0.5" + else + RETRY_DELAY="1" + fi + sleep "$RETRY_DELAY" fi done echo "=== End Health Check Progress ===" diff --git a/actions/setup/sh/start_mcp_gateway_test.sh b/actions/setup/sh/start_mcp_gateway_test.sh index e1d4e1ba565..2712646be53 100755 --- a/actions/setup/sh/start_mcp_gateway_test.sh +++ b/actions/setup/sh/start_mcp_gateway_test.sh @@ -281,6 +281,21 @@ test_validation_functions_exist() { else print_result "--network host flag validation missing" "FAIL" fi + + # Check for health check retry/backoff logic + if grep -q "RETRY_COUNT -eq 1" "$SCRIPT_PATH" && + grep -q "RETRY_COUNT -eq 2" "$SCRIPT_PATH" && + grep -q "elif \[ \$RETRY_COUNT -eq 2 \]" "$SCRIPT_PATH" && + grep -q "else" "$SCRIPT_PATH" && + grep -q "RETRY_DELAY=\"0.25\"" "$SCRIPT_PATH" && + grep -q "RETRY_DELAY=\"0.5\"" "$SCRIPT_PATH" && + grep -q "RETRY_DELAY=\"1\"" "$SCRIPT_PATH" && + grep -q "attempt 3+ -> 1s" "$SCRIPT_PATH" && + grep -q "sleep \"\$RETRY_DELAY\"" "$SCRIPT_PATH"; then + print_result "Health check exponential backoff configuration exists" "PASS" + else + print_result "Health check exponential backoff configuration missing" "FAIL" + fi } # Run all tests