Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 31 additions & 12 deletions src/health.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,12 @@ async fn check_container_running(pid: u32) -> bool {
None => return false, // Can't find fcvm binary
};

let output = match tokio::process::Command::new(&exe)
// Use a short timeout (2s) for health checks to avoid blocking
// The exec command has built-in retry logic that can take 50+ seconds if the server isn't ready
// We want to fail fast and try again on the next health check iteration
let timeout = Duration::from_secs(2);

let output_future = tokio::process::Command::new(&exe)
.args([
"exec",
"--pid",
Expand All @@ -214,14 +219,18 @@ async fn check_container_running(pid: u32) -> bool {
"{{.State.Running}}",
"fcvm-container",
])
.output()
.await
{
Ok(o) => o,
Err(e) => {
.output();

let output = match tokio::time::timeout(timeout, output_future).await {
Ok(Ok(o)) => o,
Ok(Err(e)) => {
debug!(target: "health-monitor", error = %e, "podman inspect exec failed");
return false;
}
Err(_) => {
debug!(target: "health-monitor", "podman inspect exec timed out after {:?}", timeout);
return false;
}
};

if !output.status.success() {
Expand Down Expand Up @@ -249,7 +258,12 @@ async fn check_podman_healthcheck(pid: u32) -> Option<bool> {
None => return Some(true), // Can't find fcvm binary, assume healthy
};

let output = match tokio::process::Command::new(&exe)
// Use a short timeout (2s) for health checks to avoid blocking
// The exec command has built-in retry logic that can take 50+ seconds if the server isn't ready
// We want to fail fast and try again on the next health check iteration
let timeout = Duration::from_secs(2);

let output_future = tokio::process::Command::new(&exe)
.args([
"exec",
"--pid",
Expand All @@ -262,15 +276,20 @@ async fn check_podman_healthcheck(pid: u32) -> Option<bool> {
"{{.State.Health.Status}}",
"fcvm-container",
])
.output()
.await
{
Ok(o) => o,
Err(e) => {
.output();

let output = match tokio::time::timeout(timeout, output_future).await {
Ok(Ok(o)) => o,
Ok(Err(e)) => {
// Exec not available yet, don't assume healthy - keep checking
debug!(target: "health-monitor", error = %e, "podman healthcheck exec failed, will retry");
return Some(false);
}
Err(_) => {
// Timeout - exec server not ready yet, keep checking
debug!(target: "health-monitor", "podman healthcheck exec timed out after {:?}, will retry", timeout);
return Some(false);
}
};

if !output.status.success() {
Expand Down
Loading