From 84dd44b7255ab17d77879c2e6c69a6f6574c5e2b Mon Sep 17 00:00:00 2001 From: "claude[bot]" Date: Thu, 5 Feb 2026 02:52:24 +0000 Subject: [PATCH] fix: add timeout to health check exec calls to prevent blocking The health monitor was getting blocked when trying to run 'fcvm exec' to check container status via podman inspect. The exec command has built-in retry logic with exponential backoff that can take 50+ seconds when the exec server isn't ready yet (e.g., during VM startup). This caused the health check to hang for extended periods, preventing VMs from being marked as healthy in a timely manner. The test-packaging-e2e test was timing out after 120 seconds because the health checks were taking too long. Solution: Add a 2-second timeout to exec calls in both check_container_running and check_podman_healthcheck. This allows the health monitor to fail fast and retry on the next iteration (every 100ms during startup), rather than blocking for the full exec retry duration. Fixes CI #21696494896 --- src/health.rs | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/src/health.rs b/src/health.rs index df428021..07efff23 100644 --- a/src/health.rs +++ b/src/health.rs @@ -201,7 +201,12 @@ async fn check_container_running(pid: u32) -> bool { None => return false, // Can't find fcvm binary }; - let output = match tokio::process::Command::new(&exe) + // Use a short timeout (2s) for health checks to avoid blocking + // The exec command has built-in retry logic that can take 50+ seconds if the server isn't ready + // We want to fail fast and try again on the next health check iteration + let timeout = Duration::from_secs(2); + + let output_future = tokio::process::Command::new(&exe) .args([ "exec", "--pid", @@ -214,14 +219,18 @@ async fn check_container_running(pid: u32) -> bool { "{{.State.Running}}", "fcvm-container", ]) - .output() - .await - { - Ok(o) => o, - Err(e) => { + .output(); + + let output = match tokio::time::timeout(timeout, output_future).await { + Ok(Ok(o)) => o, + Ok(Err(e)) => { debug!(target: "health-monitor", error = %e, "podman inspect exec failed"); return false; } + Err(_) => { + debug!(target: "health-monitor", "podman inspect exec timed out after {:?}", timeout); + return false; + } }; if !output.status.success() { @@ -249,7 +258,12 @@ async fn check_podman_healthcheck(pid: u32) -> Option { None => return Some(true), // Can't find fcvm binary, assume healthy }; - let output = match tokio::process::Command::new(&exe) + // Use a short timeout (2s) for health checks to avoid blocking + // The exec command has built-in retry logic that can take 50+ seconds if the server isn't ready + // We want to fail fast and try again on the next health check iteration + let timeout = Duration::from_secs(2); + + let output_future = tokio::process::Command::new(&exe) .args([ "exec", "--pid", @@ -262,15 +276,20 @@ async fn check_podman_healthcheck(pid: u32) -> Option { "{{.State.Health.Status}}", "fcvm-container", ]) - .output() - .await - { - Ok(o) => o, - Err(e) => { + .output(); + + let output = match tokio::time::timeout(timeout, output_future).await { + Ok(Ok(o)) => o, + Ok(Err(e)) => { // Exec not available yet, don't assume healthy - keep checking debug!(target: "health-monitor", error = %e, "podman healthcheck exec failed, will retry"); return Some(false); } + Err(_) => { + // Timeout - exec server not ready yet, keep checking + debug!(target: "health-monitor", "podman healthcheck exec timed out after {:?}, will retry", timeout); + return Some(false); + } }; if !output.status.success() {