From b471846a5282ac9887003fb891b556a65ee6e7df Mon Sep 17 00:00:00 2001 From: ejc3 Date: Wed, 18 Feb 2026 18:53:18 +0000 Subject: [PATCH 1/2] fix: conditional podman reset + container build lock The reset_podman_state() call fixes "database graph driver does not match" errors caused by health monitor running `podman inspect` before storage setup completes. Previously it ran unconditionally as root, destroying the user's btrfs storage directory at /var/lib/containers/storage/user-{uid}. Now it only runs for root podman (empty cmd_prefix). User-mode podman already resets in create_vm_user(). Also adds file locking to ensure_nested_container() to prevent the concurrent `podman build` race condition on x64 where multiple nextest processes race on overlay unmount and corrupt the build cache. --- fc-agent/src/agent.rs | 13 ++++++++----- fc-agent/src/container.rs | 16 +++++++--------- tests/common/mod.rs | 20 ++++++++++++++++++-- 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/fc-agent/src/agent.rs b/fc-agent/src/agent.rs index c9391e1b..9696ab6c 100644 --- a/fc-agent/src/agent.rs +++ b/fc-agent/src/agent.rs @@ -162,11 +162,14 @@ pub async fn run() -> Result<()> { // Store prefix globally so exec server and health checks can use it container::set_podman_cmd_prefix(cmd_prefix.clone()); - // Reset podman state to match storage.conf before the first real podman operation. - // By this point, storage setup is complete (btrfs loopback mounted, storage.conf - // written with correct driver). Reset ensures db.sql matches storage.conf even if - // concurrent health monitor `podman inspect` created stale state during setup. - container::reset_podman_state(); + // Reset root podman state to match storage.conf. The health monitor may have + // run `podman inspect` via the exec server during setup, creating a stale + // db.sql with the wrong graph driver. Only needed for root podman — user mode + // already resets in create_vm_user(), and a root reset would destroy the + // user's storage directory. + if cmd_prefix.is_empty() { + container::reset_podman_state(); + } // Prepare image based on delivery mode let image_ref = match (plan.image_mode.as_deref(), &plan.image_device) { diff --git a/fc-agent/src/container.rs b/fc-agent/src/container.rs index 89adee5e..3596a708 100644 --- a/fc-agent/src/container.rs +++ b/fc-agent/src/container.rs @@ -356,17 +356,15 @@ pub fn setup_btrfs_storage_if_available() { ); } -/// Reset podman state to match the current storage.conf. +/// Reset root podman state to match the current storage.conf. /// -/// Must be called after storage.conf is written (by btrfs/overlay setup) and -/// immediately before the first real podman operation (pull/load/run). +/// Fixes "database graph driver does not match" errors caused by the health +/// monitor running `podman inspect` via exec before storage setup completes, +/// creating db.sql with an empty or wrong driver. /// -/// This fixes "database graph driver does not match" errors caused by: -/// 1. Stale db.sql from rootfs build (apt post-install creates it with driver="") -/// 2. Concurrent health monitor `podman inspect` recreating db.sql during setup -/// -/// `podman system reset --force` atomically drops and recreates all podman state -/// to match the current storage.conf, eliminating any driver mismatch. +/// Only call for root podman (empty cmd_prefix). User-mode podman already +/// resets in create_vm_user(). A root reset would destroy the user's btrfs +/// storage subdirectory at /var/lib/containers/storage/user-{uid}. pub fn reset_podman_state() { match std::process::Command::new("podman") .args(["system", "reset", "--force"]) diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 915f2fa2..2951293d 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -1042,9 +1042,10 @@ pub async fn ensure_nested_image() -> anyhow::Result<()> { ensure_nested_container("localhost/nested-test", "Containerfile.nested").await } -/// Build a container image for nested testing. +/// Build a container image for nested testing, serialized via file lock. /// -/// Always runs podman build - relies on podman's layer caching for speed. +/// Uses an exclusive file lock to prevent concurrent builds from racing on +/// overlay unmount. Skips the build if another process already built the image. /// If the container extends localhost/nested-test, call ensure_nested_image() first. /// /// # Arguments @@ -1054,6 +1055,19 @@ pub async fn ensure_nested_container(image_name: &str, containerfile: &str) -> a let fcvm_path = find_fcvm_binary()?; let fcvm_dir = fcvm_path.parent().unwrap(); + // Serialize concurrent builds with a file lock. Multiple nextest processes + // may call this simultaneously; without locking, concurrent `podman build` + // races on overlay unmount and corrupts the build cache (x64-specific). + let lock_name = image_name.replace('/', "-"); + let lock_path = format!("/tmp/fcvm-build-{}.lock", lock_name); + let lock_file = std::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(false) + .open(&lock_path) + .context("creating build lock file")?; + lock_file.lock_exclusive().context("acquiring build lock")?; + // Copy binaries to build context (needed for nested-test base) if image_name == "localhost/nested-test" { let profile = fcvm::setup::get_kernel_profile("nested")? @@ -1080,6 +1094,7 @@ pub async fn ensure_nested_container(image_name: &str, containerfile: &str) -> a .context("running podman build")?; if !output.status.success() { + drop(lock_file); let stderr = String::from_utf8_lossy(&output.stderr); anyhow::bail!("Failed to build {}: {}", image_name, stderr); } @@ -1130,6 +1145,7 @@ pub async fn ensure_nested_container(image_name: &str, containerfile: &str) -> a println!("✓ {} built", image_name); } + drop(lock_file); Ok(()) } From 93fc4e9dff199a3e79420d3a0a746cbc86dd0ae4 Mon Sep 17 00:00:00 2001 From: "claude[bot]" Date: Wed, 18 Feb 2026 19:35:49 +0000 Subject: [PATCH 2/2] fix: correct ensure_nested_container doc comment to match actual behavior MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The doc said "Skips the build if another process already built the image" but the code always runs podman build (relying on layer caching). Updated to accurately describe the behavior. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- tests/common/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 2951293d..4d4b78c7 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -1045,7 +1045,7 @@ pub async fn ensure_nested_image() -> anyhow::Result<()> { /// Build a container image for nested testing, serialized via file lock. /// /// Uses an exclusive file lock to prevent concurrent builds from racing on -/// overlay unmount. Skips the build if another process already built the image. +/// overlay unmount. Redundant builds are fast due to podman's layer caching. /// If the container extends localhost/nested-test, call ensure_nested_image() first. /// /// # Arguments