-
Notifications
You must be signed in to change notification settings - Fork 17
fix: fast-kill agent container on SIGTERM/SIGINT before full cleanup #1623
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
57c2a64
bb56268
08a38d1
bb4f572
5d381e1
6009d8a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,6 +11,24 @@ import { DEFAULT_DNS_SERVERS } from './dns-resolver'; | |
|
|
||
| const SQUID_PORT = 3128; | ||
|
|
||
| /** | ||
| * Container names used in Docker Compose and referenced by docker CLI commands. | ||
| * Extracted as constants so that generateDockerCompose() and helpers like | ||
| * fastKillAgentContainer() stay in sync. | ||
| */ | ||
| export const AGENT_CONTAINER_NAME = 'awf-agent'; | ||
| const SQUID_CONTAINER_NAME = 'awf-squid'; | ||
| const IPTABLES_INIT_CONTAINER_NAME = 'awf-iptables-init'; | ||
| const API_PROXY_CONTAINER_NAME = 'awf-api-proxy'; | ||
| const DOH_PROXY_CONTAINER_NAME = 'awf-doh-proxy'; | ||
|
|
||
| /** | ||
| * Flag set by fastKillAgentContainer() to signal runAgentCommand() that | ||
| * the container was externally stopped. When true, runAgentCommand() skips | ||
| * its own docker wait / log collection to avoid racing with the signal handler. | ||
| */ | ||
| let agentExternallyKilled = false; | ||
|
|
||
| // When bundled with esbuild, this global is replaced at build time with the | ||
| // JSON content of containers/agent/seccomp-profile.json. In normal (tsc) | ||
| // builds the identifier remains undeclared, so the typeof check below is safe. | ||
|
|
@@ -418,7 +436,7 @@ export function generateDockerCompose( | |
|
|
||
| // Squid service configuration | ||
| const squidService: any = { | ||
| container_name: 'awf-squid', | ||
| container_name: SQUID_CONTAINER_NAME, | ||
| networks: { | ||
| 'awf-net': { | ||
| ipv4_address: networkConfig.squidIp, | ||
|
|
@@ -1143,7 +1161,7 @@ export function generateDockerCompose( | |
|
|
||
| // Agent service configuration | ||
| const agentService: any = { | ||
| container_name: 'awf-agent', | ||
| container_name: AGENT_CONTAINER_NAME, | ||
| networks: { | ||
| 'awf-net': { | ||
| ipv4_address: networkConfig.agentIp, | ||
|
|
@@ -1316,7 +1334,7 @@ export function generateDockerCompose( | |
| // that shares the agent's network namespace but NEVER gives NET_ADMIN to the agent. | ||
| // This eliminates the window where the agent holds NET_ADMIN during startup. | ||
| const iptablesInitService: any = { | ||
| container_name: 'awf-iptables-init', | ||
| container_name: IPTABLES_INIT_CONTAINER_NAME, | ||
| // Share agent's network namespace so iptables rules apply to agent's traffic | ||
| network_mode: 'service:agent', | ||
| // Only mount the init signal volume and the iptables setup script | ||
|
|
@@ -1379,7 +1397,7 @@ export function generateDockerCompose( | |
| // Add Node.js API proxy sidecar if enabled | ||
| if (config.enableApiProxy && networkConfig.proxyIp) { | ||
| const proxyService: any = { | ||
| container_name: 'awf-api-proxy', | ||
| container_name: API_PROXY_CONTAINER_NAME, | ||
| networks: { | ||
| 'awf-net': { | ||
| ipv4_address: networkConfig.proxyIp, | ||
|
|
@@ -1513,7 +1531,7 @@ export function generateDockerCompose( | |
| // Add DNS-over-HTTPS proxy sidecar if enabled | ||
| if (config.dnsOverHttps && networkConfig.dohProxyIp) { | ||
| const dohService: any = { | ||
| container_name: 'awf-doh-proxy', | ||
| container_name: DOH_PROXY_CONTAINER_NAME, | ||
| image: 'cloudflare/cloudflared:latest', | ||
| networks: { | ||
| 'awf-net': { | ||
|
|
@@ -1918,7 +1936,7 @@ export async function startContainers(workDir: string, allowedDomains: string[], | |
| // This handles orphaned containers from failed/interrupted previous runs | ||
| logger.debug('Removing any existing containers with conflicting names...'); | ||
| try { | ||
| await execa('docker', ['rm', '-f', 'awf-squid', 'awf-agent', 'awf-iptables-init', 'awf-api-proxy'], { | ||
| await execa('docker', ['rm', '-f', SQUID_CONTAINER_NAME, AGENT_CONTAINER_NAME, IPTABLES_INIT_CONTAINER_NAME, API_PROXY_CONTAINER_NAME], { | ||
| reject: false, | ||
| }); | ||
| } catch { | ||
|
|
@@ -2011,7 +2029,7 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[], | |
| try { | ||
| // Stream logs in real-time using docker logs -f (follow mode) | ||
| // Run this in the background and wait for the container to exit separately | ||
| const logsProcess = execa('docker', ['logs', '-f', 'awf-agent'], { | ||
| const logsProcess = execa('docker', ['logs', '-f', AGENT_CONTAINER_NAME], { | ||
| stdio: 'inherit', | ||
| reject: false, | ||
| }); | ||
|
|
@@ -2023,7 +2041,7 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[], | |
| logger.info(`Agent timeout: ${agentTimeoutMinutes} minutes`); | ||
|
|
||
| // Race docker wait against a timeout | ||
| const waitPromise = execa('docker', ['wait', 'awf-agent']).then(result => ({ | ||
| const waitPromise = execa('docker', ['wait', AGENT_CONTAINER_NAME]).then(result => ({ | ||
| type: 'completed' as const, | ||
| exitCodeStr: result.stdout, | ||
| })); | ||
|
|
@@ -2038,7 +2056,7 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[], | |
| if (raceResult.type === 'timeout') { | ||
| logger.warn(`Agent command timed out after ${agentTimeoutMinutes} minutes, stopping container...`); | ||
| // Stop the container gracefully (10 second grace period before SIGKILL) | ||
| await execa('docker', ['stop', '-t', '10', 'awf-agent'], { reject: false }); | ||
| await execa('docker', ['stop', '-t', '10', AGENT_CONTAINER_NAME], { reject: false }); | ||
| exitCode = 124; // Standard timeout exit code (same as coreutils timeout) | ||
| } else { | ||
| // Clear the timeout timer so it doesn't keep the event loop alive | ||
|
|
@@ -2047,13 +2065,21 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[], | |
| } | ||
| } else { | ||
| // No timeout - wait indefinitely | ||
| const { stdout: exitCodeStr } = await execa('docker', ['wait', 'awf-agent']); | ||
| const { stdout: exitCodeStr } = await execa('docker', ['wait', AGENT_CONTAINER_NAME]); | ||
| exitCode = parseInt(exitCodeStr.trim(), 10); | ||
| } | ||
|
|
||
| // Wait for the logs process to finish (it should exit automatically when container stops) | ||
| await logsProcess; | ||
|
|
||
| // If the container was killed externally (e.g. by fastKillAgentContainer in a | ||
| // signal handler), skip the remaining log analysis — the container state is | ||
| // unreliable and the signal handler will drive the rest of the shutdown. | ||
| if (agentExternallyKilled) { | ||
| logger.debug('Agent was externally killed, skipping post-run analysis'); | ||
| return { exitCode: exitCode || 143, blockedDomains: [] }; | ||
| } | ||
|
|
||
| logger.debug(`Agent exit code: ${exitCode}`); | ||
|
|
||
| // Small delay to ensure Squid logs are flushed to disk | ||
|
|
@@ -2108,6 +2134,44 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[], | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Fast-kills the agent container with a short grace period. | ||
| * Used in signal handlers (SIGTERM/SIGINT) to ensure the agent cannot outlive | ||
| * the awf process — e.g. when GH Actions sends SIGTERM followed by SIGKILL | ||
| * after ~10 seconds. The full `docker compose down -v` in stopContainers() is | ||
| * too slow to reliably complete in that window. | ||
| * | ||
| * @param stopTimeoutSeconds - Grace period before SIGKILL (default: 3) | ||
| */ | ||
| export async function fastKillAgentContainer(stopTimeoutSeconds = 3): Promise<void> { | ||
| agentExternallyKilled = true; | ||
| try { | ||
| await execa('docker', ['stop', '-t', String(stopTimeoutSeconds), AGENT_CONTAINER_NAME], { | ||
| reject: false, | ||
| timeout: (stopTimeoutSeconds + 5) * 1000, // hard deadline on the stop command itself | ||
| }); | ||
|
Comment on lines
+2146
to
+2152
|
||
| } catch { | ||
| // Best-effort — if docker CLI is unavailable or hangs, we still proceed | ||
| // to performCleanup which will attempt docker compose down. | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Returns whether the agent was externally killed via fastKillAgentContainer(). | ||
| * @internal Exported for testing. | ||
| */ | ||
| export function isAgentExternallyKilled(): boolean { | ||
| return agentExternallyKilled; | ||
| } | ||
|
|
||
| /** | ||
| * Resets the externally-killed flag. Only used in tests. | ||
| * @internal Exported for testing. | ||
| */ | ||
| export function resetAgentExternallyKilled(): void { | ||
| agentExternallyKilled = false; | ||
| } | ||
|
|
||
| /** | ||
| * Stops and removes Docker Compose services | ||
| */ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The signal handler fast-stops
awf-agenteven when--keep-containersis enabled. That contradicts the existing semantics inperformCleanup()/stopContainers()wherekeepContainerspreserves running containers. Consider gating this call on!config.keepContainers(or passing the flag through) so--keep-containerscontinues to behave as documented during SIGINT shutdown.This issue also appears on line 1915 of the same file.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done. Added
!config.keepContainersguard to both signal handlers so--keep-containerscontinues to preserve running containers during shutdown.