diff --git a/src/docker-manager.test.ts b/src/docker-manager.test.ts index b4465330..9df7b9d2 100644 --- a/src/docker-manager.test.ts +++ b/src/docker-manager.test.ts @@ -2419,9 +2419,9 @@ describe('docker-manager', () => { expect(proxy.healthcheck).toBeDefined(); const healthcheck = proxy.healthcheck!; expect(healthcheck.test).toEqual(['CMD', 'curl', '-f', 'http://localhost:10000/health']); - expect(healthcheck.timeout).toBe('2s'); - expect(healthcheck.retries).toBe(10); - expect(healthcheck.start_period).toBe('10s'); + expect(healthcheck.timeout).toBe('3s'); + expect(healthcheck.retries).toBe(15); + expect(healthcheck.start_period).toBe('30s'); }); it('should drop all capabilities', () => { @@ -3809,6 +3809,101 @@ describe('docker-manager', () => { await expect(startContainers(testDir, ['github.com'])).rejects.toThrow(); }); + + it('should retry once when awf-api-proxy fails its health check', async () => { + // 1. docker rm (initial cleanup) + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + // 2. docker compose up (first attempt - fails with api-proxy unhealthy) + mockExecaFn.mockRejectedValueOnce(new Error('dependency failed to start: container awf-api-proxy is unhealthy')); + // 3. docker logs --tail 50 awf-api-proxy (get logs for diagnosis) + mockExecaFn.mockResolvedValueOnce({ stdout: 'api-proxy startup logs', stderr: '', exitCode: 0 } as any); + // 4. docker compose down (cleanup before retry) + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + // 5. docker compose up (retry - succeeds) + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + + await expect(startContainers(testDir, ['github.com'])).resolves.toBeUndefined(); + + // Verify retry happened: compose up was called twice + const upCalls = mockExecaFn.mock.calls.filter((call: any[]) => + call[0] === 'docker' && Array.isArray(call[1]) && call[1].includes('up') + ); + expect(upCalls).toHaveLength(2); + }); + + it('should throw clear error when awf-api-proxy fails its health check on both attempts', async () => { + // 1. docker rm (initial cleanup) + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + // 2. docker compose up (first attempt - fails) + mockExecaFn.mockRejectedValueOnce(new Error('dependency failed to start: container awf-api-proxy is unhealthy')); + // 3. docker logs (first diagnosis) + mockExecaFn.mockResolvedValueOnce({ stdout: 'api-proxy logs', stderr: '', exitCode: 0 } as any); + // 4. docker compose down (cleanup before retry) + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + // 5. docker compose up (retry - also fails) + mockExecaFn.mockRejectedValueOnce(new Error('dependency failed to start: container awf-api-proxy is unhealthy')); + // 6. docker logs (second diagnosis) + mockExecaFn.mockResolvedValueOnce({ stdout: 'api-proxy logs', stderr: '', exitCode: 0 } as any); + + await expect(startContainers(testDir, ['github.com'])).rejects.toThrow( + 'AWF firewall failed to start: awf-api-proxy failed its health check on both attempts' + ); + }); + + it('should not retry for non-api-proxy healthcheck failures', async () => { + // 1. docker rm (initial cleanup) + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + // 2. docker compose up (fails with squid unhealthy - not api-proxy) + mockExecaFn.mockRejectedValueOnce(new Error('dependency failed to start: container awf-squid is unhealthy')); + + await expect(startContainers(testDir, ['github.com'])).rejects.toThrow(); + + // Only one compose up call (no retry) + const upCalls = mockExecaFn.mock.calls.filter((call: any[]) => + call[0] === 'docker' && Array.isArray(call[1]) && call[1].includes('up') + ); + expect(upCalls).toHaveLength(1); + }); + + it('should route retry error through Squid diagnostics when retry fails with non-api-proxy error', async () => { + // Create access.log with denied entries so Squid diagnostics fire + const squidLogsDir = path.join(testDir, 'squid-logs'); + fs.mkdirSync(squidLogsDir, { recursive: true }); + fs.writeFileSync( + path.join(squidLogsDir, 'access.log'), + '1760994429.358 172.30.0.20:36274 blocked.com:443 -:- 1.1 CONNECT 403 TCP_DENIED:HIER_NONE blocked.com:443 "curl/7.81.0"\n' + ); + + // 1. docker rm (initial cleanup) + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + // 2. docker compose up (first attempt - fails with api-proxy unhealthy) + mockExecaFn.mockRejectedValueOnce(new Error('dependency failed to start: container awf-api-proxy is unhealthy')); + // 3. docker logs (diagnosis before retry) + mockExecaFn.mockResolvedValueOnce({ stdout: 'some logs', stderr: '', exitCode: 0 } as any); + // 4. docker compose down (cleanup before retry) + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + // 5. docker compose up (retry - fails with a different, non-api-proxy error) + mockExecaFn.mockRejectedValueOnce(new Error('dependency failed to start: container awf-squid is unhealthy')); + + // Should surface the Squid blocked-domain error, not a raw throw + await expect(startContainers(testDir, ['github.com'])).rejects.toThrow('Firewall blocked access to:'); + }); + + it('should not emit container logs when docker logs exits non-zero (container not found)', async () => { + // 1. docker rm (initial cleanup) + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + // 2. docker compose up (fails with api-proxy unhealthy) + mockExecaFn.mockRejectedValueOnce(new Error('dependency failed to start: container awf-api-proxy is unhealthy')); + // 3. docker logs returns non-zero (container not found) + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: 'No such container: awf-api-proxy', exitCode: 1 } as any); + // 4. docker compose down (cleanup before retry) + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + // 5. docker compose up (retry - succeeds) + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + + // Should succeed without emitting "No such container" noise at error level + await expect(startContainers(testDir, ['github.com'])).resolves.toBeUndefined(); + }); }); describe('stopContainers', () => { diff --git a/src/docker-manager.ts b/src/docker-manager.ts index c098361e..276e590f 100644 --- a/src/docker-manager.ts +++ b/src/docker-manager.ts @@ -1784,10 +1784,10 @@ export function generateDockerCompose( }, healthcheck: { test: ['CMD', 'curl', '-f', `http://localhost:${API_PROXY_HEALTH_PORT}/health`], - interval: '1s', - timeout: '2s', - retries: 10, - start_period: '10s', + interval: '2s', + timeout: '3s', + retries: 15, + start_period: '30s', }, // Security hardening: Drop all capabilities cap_drop: ['ALL'], @@ -2424,6 +2424,43 @@ async function checkSquidLogs(workDir: string, proxyLogsDir?: string): Promise<{ } } +/** + * Returns true when the Docker Compose error message indicates that the + * api-proxy container specifically failed its health check. + * Docker emits "dependency failed to start: container is unhealthy" + * when a dependent container's health check does not pass. + */ +function isApiProxyUnhealthyError(errorMsg: string): boolean { + return errorMsg.includes('is unhealthy') && + errorMsg.includes(API_PROXY_CONTAINER_NAME); +} + +/** + * Dumps the tail of a container's logs to stderr for diagnosis. + * Silently skips if the container does not exist or logs are unavailable. + */ +async function logContainerLogsToStderr(containerName: string): Promise { + try { + const result = await execa('docker', ['logs', '--tail', '50', containerName], { + reject: false, + env: getLocalDockerEnv(), + }); + // Only emit stdout/stderr from a successful docker logs invocation. + // When the container does not exist, docker logs exits non-zero and writes + // "No such container" to stderr — skip that noise entirely. + if (result.exitCode === 0) { + const combined = [result.stdout, result.stderr].filter(Boolean).join('\n').trim(); + if (combined) { + logger.error(`${containerName} container logs (last 50 lines):\n${combined}`); + } + } else { + logger.debug(`docker logs exited with ${result.exitCode} for container ${containerName} — container may not exist`); + } + } catch (error) { + logger.debug(`Could not retrieve logs for container ${containerName}:`, error); + } +} + /** * Starts Docker Compose services * @param workDir - Working directory containing Docker Compose config @@ -2447,12 +2484,13 @@ export async function startContainers(workDir: string, allowedDomains: string[], logger.debug('No existing containers to remove (this is normal)'); } - try { - const composeArgs = ['compose', 'up', '-d']; - if (skipPull) { - composeArgs.push('--pull', 'never'); - logger.debug('Using --pull never (skip-pull mode)'); - } + const composeArgs = ['compose', 'up', '-d']; + if (skipPull) { + composeArgs.push('--pull', 'never'); + logger.debug('Using --pull never (skip-pull mode)'); + } + + const runDockerComposeUp = async (): Promise => { // Redirect Docker Compose stdout to stderr so it doesn't pollute the // agent command's stdout. Docker Compose outputs build progress and // container creation status to stdout, which would be captured by test @@ -2465,63 +2503,124 @@ export async function startContainers(workDir: string, allowedDomains: string[], stderr: 'inherit', env: getLocalDockerEnv(), }); + }; + + try { + await runDockerComposeUp(); logger.success('Containers started successfully'); - } catch (error) { - // Check if this is a healthcheck failure - const errorMsg = error instanceof Error ? error.message : String(error); - if (errorMsg.includes('is unhealthy') || errorMsg.includes('dependency failed')) { - // Check Squid logs to see if it's actually working and blocking traffic - const { hasDenials, blockedTargets } = await checkSquidLogs(workDir, proxyLogsDir); + } catch (firstError) { + const firstErrorMsg = firstError instanceof Error ? firstError.message : String(firstError); - if (hasDenials) { - logger.error('Firewall blocked domains during startup:'); + // When api-proxy specifically fails its health check, retry once. + // Transient failures are common on slow or busy runners (e.g. Azure-hosted runners) + // where the Node.js process inside the container takes longer to bind its port. + if (isApiProxyUnhealthyError(firstErrorMsg)) { + logger.warn(`${API_PROXY_CONTAINER_NAME} failed its health check — this may be a transient startup failure, retrying once...`); + await logContainerLogsToStderr(API_PROXY_CONTAINER_NAME); - const missingDomains: string[] = []; - const portIssues: BlockedTarget[] = []; + // Tear down before retry so Docker Compose starts fresh + try { + await execa('docker', ['compose', 'down', '-v', '-t', '1'], { + cwd: workDir, + stdout: process.stderr, + stderr: 'inherit', + env: getLocalDockerEnv(), + reject: false, + }); + } catch (cleanupError) { + // Best-effort cleanup — proceed with retry regardless + logger.debug('Cleanup before retry failed (proceeding anyway):', cleanupError); + } - blockedTargets.forEach(blocked => { - const isAllowed = allowedDomains.some(allowed => - blocked.domain === allowed || blocked.domain.endsWith('.' + allowed) + try { + await runDockerComposeUp(); + logger.success('Containers started successfully (retry succeeded)'); + return; + } catch (retryError) { + const retryErrorMsg = retryError instanceof Error ? retryError.message : String(retryError); + if (isApiProxyUnhealthyError(retryErrorMsg)) { + // Surface api-proxy logs and emit a clear, unambiguous error so + // downstream parse steps don't blame the model for never running. + await logContainerLogsToStderr(API_PROXY_CONTAINER_NAME); + throw new Error( + `AWF firewall failed to start: ${API_PROXY_CONTAINER_NAME} failed its health check on both attempts. ` + + `The agent was never invoked. ` + + `See ${API_PROXY_CONTAINER_NAME} container logs above for details.` ); + } + // Any other retry error (e.g. squid healthcheck or domain blockage) falls + // through to the Squid log diagnostic path below as if it were the first error. + // Re-assign so the shared handler at the end of the catch block can process it. + return await handleHealthcheckError(retryErrorMsg, retryError as Error, workDir, proxyLogsDir, allowedDomains); + } + } - if (!isAllowed) { - // Domain not in allowlist - logger.error(` - Blocked: ${blocked.target} (domain not in allowlist)`); - missingDomains.push(blocked.domain); - } else if (blocked.port && blocked.port !== '80' && blocked.port !== '443') { - // Domain is allowed but port is not - logger.error(` - Blocked: ${blocked.target} (port ${blocked.port} not allowed, only 80 and 443 are permitted)`); - portIssues.push(blocked); - } else { - // Other reason (shouldn't happen often) - logger.error(` - Blocked: ${blocked.target}`); - } - }); + return await handleHealthcheckError(firstErrorMsg, firstError as Error, workDir, proxyLogsDir, allowedDomains); + } +} - logger.error('Allowed domains:'); - allowedDomains.forEach(domain => { - logger.error(` - Allowed: ${domain}`); - }); +/** + * Runs the Squid-log diagnostic check and re-throws with a user-friendly message + * when blocked domains are found, or rethrows the original error otherwise. + */ +async function handleHealthcheckError( + errorMsg: string, + error: Error, + workDir: string, + proxyLogsDir: string | undefined, + allowedDomains: string[] +): Promise { + if (errorMsg.includes('is unhealthy') || errorMsg.includes('dependency failed')) { + const { hasDenials, blockedTargets } = await checkSquidLogs(workDir, proxyLogsDir); - if (missingDomains.length > 0) { - logger.error(`To fix domain issues: --allow-domains "${[...allowedDomains, ...missingDomains].join(',')}"`); - } - if (portIssues.length > 0) { - logger.error('To fix port issues: Use standard ports 80 (HTTP) or 443 (HTTPS)'); - } + if (hasDenials) { + logger.error('Firewall blocked domains during startup:'); + + const missingDomains: string[] = []; + const portIssues: BlockedTarget[] = []; - // Create a more user-friendly error - const blockedList = blockedTargets.map(b => `"${b.target}"`).join(', '); - throw new Error( - `Firewall blocked access to: ${blockedList}. ` + - `Check error messages above for details.` + blockedTargets.forEach(blocked => { + const isAllowed = allowedDomains.some(allowed => + blocked.domain === allowed || blocked.domain.endsWith('.' + allowed) ); + + if (!isAllowed) { + // Domain not in allowlist + logger.error(` - Blocked: ${blocked.target} (domain not in allowlist)`); + missingDomains.push(blocked.domain); + } else if (blocked.port && blocked.port !== '80' && blocked.port !== '443') { + // Domain is allowed but port is not + logger.error(` - Blocked: ${blocked.target} (port ${blocked.port} not allowed, only 80 and 443 are permitted)`); + portIssues.push(blocked); + } else { + // Other reason (shouldn't happen often) + logger.error(` - Blocked: ${blocked.target}`); + } + }); + + logger.error('Allowed domains:'); + allowedDomains.forEach(domain => { + logger.error(` - Allowed: ${domain}`); + }); + + if (missingDomains.length > 0) { + logger.error(`To fix domain issues: --allow-domains "${[...allowedDomains, ...missingDomains].join(',')}"`); + } + if (portIssues.length > 0) { + logger.error('To fix port issues: Use standard ports 80 (HTTP) or 443 (HTTPS)'); } - } - logger.error('Failed to start containers:', error); - throw error; + // Create a more user-friendly error + const blockedList = blockedTargets.map(b => `"${b.target}"`).join(', '); + throw new Error( + `Firewall blocked access to: ${blockedList}. ` + + `Check error messages above for details.` + ); + } } + + logger.error('Failed to start containers:', error); + throw error; } /**