From 2cb07c636d2c2d05e627082721d7ff6259664b17 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sat, 20 Dec 2025 20:11:33 +0000 Subject: [PATCH 01/13] Fix rootfs race condition causing missing podman When multiple VM tests run in parallel, a race condition occurs: 1. First process creates rootfs file via dd 2. Second process checks if file exists, sees it, uses it 3. But first process hasn't finished installing packages yet Result: VMs start with rootfs that has no podman installed. Fix: Create rootfs at temp path (base.ext4.tmp), then rename to final path (base.ext4) only after package installation completes. This ensures the file check is atomic with completion. --- src/setup/rootfs.rs | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/setup/rootfs.rs b/src/setup/rootfs.rs index 01515e02..2100f36c 100644 --- a/src/setup/rootfs.rs +++ b/src/setup/rootfs.rs @@ -112,10 +112,29 @@ pub async fn ensure_rootfs() -> Result { info!("note: first-time cloud image download may take 5-15 minutes"); info!("cached rootfs creation takes ~45 seconds"); - let result = create_ubuntu_rootfs(&rootfs_path) + // Create at temp path first, then rename when complete to avoid race conditions. + // Other processes check if rootfs_path exists, so we must not create it until + // package installation is complete. + let temp_rootfs_path = rootfs_path.with_extension("ext4.tmp"); + + // Clean up any leftover temp file from a previous failed attempt + let _ = tokio::fs::remove_file(&temp_rootfs_path).await; + + let result = create_ubuntu_rootfs(&temp_rootfs_path) .await .context("creating Ubuntu rootfs"); + // If successful, rename temp file to final path + if result.is_ok() { + tokio::fs::rename(&temp_rootfs_path, &rootfs_path) + .await + .context("renaming temp rootfs to final path")?; + info!("rootfs creation complete"); + } else { + // Clean up temp file on failure + let _ = tokio::fs::remove_file(&temp_rootfs_path).await; + } + // Release lock flock .unlock() @@ -125,8 +144,6 @@ pub async fn ensure_rootfs() -> Result { result?; - info!("rootfs creation complete"); - Ok(rootfs_path) } From e1ba60deb44db93455b4e3051e6bab103e24d1d9 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sat, 20 Dec 2025 20:39:12 +0000 Subject: [PATCH 02/13] Increase VM health check timeouts for CI Rootfs creation in CI can take 2+ minutes due to: - Cloud image download (~7 seconds) - virt-customize (~10-60+ seconds, variable) - NBD extraction (~30 seconds) - Package installation (~60 seconds) Increase timeouts: - test_sanity: 180 -> 300 seconds - test_egress: 60 -> 180 seconds (baseline) - test_exec: 60 -> 180 seconds Clone VMs keep 60s timeout since they use cached rootfs. --- tests/test_egress.rs | 4 ++-- tests/test_exec.rs | 2 +- tests/test_sanity.rs | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/test_egress.rs b/tests/test_egress.rs index 501fc37e..6d31d142 100644 --- a/tests/test_egress.rs +++ b/tests/test_egress.rs @@ -72,7 +72,7 @@ async fn egress_fresh_test_impl(network: &str) -> Result<()> { .context("spawning VM")?; println!(" Waiting for VM to become healthy (PID: {})...", vm_pid); - common::poll_health_by_pid(vm_pid, 60).await?; + common::poll_health_by_pid(vm_pid, 180).await?; println!(" ✓ VM healthy"); // Step 2: Test egress @@ -137,7 +137,7 @@ async fn egress_clone_test_impl(network: &str) -> Result<()> { " Waiting for baseline VM to become healthy (PID: {})...", baseline_pid ); - common::poll_health_by_pid(baseline_pid, 60).await?; + common::poll_health_by_pid(baseline_pid, 180).await?; println!(" ✓ Baseline VM healthy"); // Test egress on baseline first diff --git a/tests/test_exec.rs b/tests/test_exec.rs index 8166592e..96791263 100644 --- a/tests/test_exec.rs +++ b/tests/test_exec.rs @@ -45,7 +45,7 @@ async fn exec_test_impl(network: &str) -> Result<()> { // Wait for VM to become healthy println!(" Waiting for VM to become healthy..."); - if let Err(e) = common::poll_health_by_pid(fcvm_pid, 60).await { + if let Err(e) = common::poll_health_by_pid(fcvm_pid, 180).await { common::kill_process(fcvm_pid).await; return Err(e.context("VM failed to become healthy")); } diff --git a/tests/test_sanity.rs b/tests/test_sanity.rs index 64168254..0356590f 100644 --- a/tests/test_sanity.rs +++ b/tests/test_sanity.rs @@ -43,8 +43,9 @@ async fn sanity_test_impl(network: &str) -> Result<()> { println!(" Waiting for VM to become healthy..."); // Spawn health check task - // Use 180 second timeout to account for rootfs creation on first run (~60 sec) - let health_task = tokio::spawn(common::poll_health_by_pid(fcvm_pid, 180)); + // Use 300 second timeout to account for rootfs creation on first run + // (cloud image download ~7s, virt-customize ~10-60s, extraction ~30s, packages ~60s) + let health_task = tokio::spawn(common::poll_health_by_pid(fcvm_pid, 300)); // Monitor process for unexpected exits let monitor_task: tokio::task::JoinHandle> = From 4206ece4ddc2fa5866c2ab9112867c4c56d6fb9a Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sat, 20 Dec 2025 21:06:36 +0000 Subject: [PATCH 03/13] Add response body to Firecracker API errors Include the actual error message from Firecracker when API calls fail. This helps diagnose issues like 400 Bad Request during snapshot load. --- src/firecracker/api.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/firecracker/api.rs b/src/firecracker/api.rs index 28e487af..6d51caa8 100644 --- a/src/firecracker/api.rs +++ b/src/firecracker/api.rs @@ -36,7 +36,10 @@ impl FirecrackerClient { let resp = self.client.request(req).await?; if resp.status() != StatusCode::NO_CONTENT && resp.status() != StatusCode::OK { - anyhow::bail!("Firecracker API error: {}", resp.status()); + let status = resp.status(); + let body_bytes = hyper::body::to_bytes(resp.into_body()).await?; + let body_str = String::from_utf8_lossy(&body_bytes); + anyhow::bail!("Firecracker API error: {} - {}", status, body_str); } Ok(()) } @@ -52,7 +55,10 @@ impl FirecrackerClient { let resp = self.client.request(req).await?; if resp.status() != StatusCode::NO_CONTENT && resp.status() != StatusCode::OK { - anyhow::bail!("Firecracker API error: {}", resp.status()); + let status = resp.status(); + let body_bytes = hyper::body::to_bytes(resp.into_body()).await?; + let body_str = String::from_utf8_lossy(&body_bytes); + anyhow::bail!("Firecracker API error: {} - {}", status, body_str); } Ok(()) } From 54d2b05d78abb519b334babaadff43a7f6c6ed58 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sat, 20 Dec 2025 21:19:01 +0000 Subject: [PATCH 04/13] Increase clone baseline VM timeout to 300s for rootfs creation --- tests/test_egress.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_egress.rs b/tests/test_egress.rs index 6d31d142..579995f7 100644 --- a/tests/test_egress.rs +++ b/tests/test_egress.rs @@ -137,7 +137,8 @@ async fn egress_clone_test_impl(network: &str) -> Result<()> { " Waiting for baseline VM to become healthy (PID: {})...", baseline_pid ); - common::poll_health_by_pid(baseline_pid, 180).await?; + // Use 300 second timeout to account for rootfs creation on first run + common::poll_health_by_pid(baseline_pid, 300).await?; println!(" ✓ Baseline VM healthy"); // Test egress on baseline first From f999209f5e62e4c4952d56ab2d6dd645bcfe1ae1 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sat, 20 Dec 2025 21:21:43 +0000 Subject: [PATCH 05/13] Fix lock being held forever when baseline VM fails health check When poll_health_by_pid times out, kill the baseline VM process before returning the error. This releases any locks held by that process (e.g., rootfs creation lock). Also increase timeout from 180 to 300 seconds to match fresh test. --- tests/test_egress.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_egress.rs b/tests/test_egress.rs index 579995f7..f067bdc2 100644 --- a/tests/test_egress.rs +++ b/tests/test_egress.rs @@ -138,7 +138,10 @@ async fn egress_clone_test_impl(network: &str) -> Result<()> { baseline_pid ); // Use 300 second timeout to account for rootfs creation on first run - common::poll_health_by_pid(baseline_pid, 300).await?; + if let Err(e) = common::poll_health_by_pid(baseline_pid, 300).await { + common::kill_process(baseline_pid).await; + return Err(e.context("baseline VM failed to become healthy")); + } println!(" ✓ Baseline VM healthy"); // Test egress on baseline first From 5aceb44090bd8930230b12ff4de06f45861b49b8 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sat, 20 Dec 2025 21:37:23 +0000 Subject: [PATCH 06/13] Fix formatting and lock Rust version to 1.92.0 - Add rust-toolchain.toml to lock Rust version (prevents format drift) - Add dependabot.yml for weekly dependency updates - Apply rustfmt 1.92.0 formatting fixes --- .github/dependabot.yml | 17 +++++++++++++++++ fc-agent/src/main.rs | 3 ++- rust-toolchain.toml | 3 +++ src/commands/podman.rs | 3 +-- src/commands/snapshot.rs | 3 +-- 5 files changed, 24 insertions(+), 5 deletions(-) create mode 100644 .github/dependabot.yml create mode 100644 rust-toolchain.toml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..521d79be --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,17 @@ +version: 2 +updates: + - package-ecosystem: "cargo" + directory: "/" + schedule: + interval: "weekly" + open-pull-requests-limit: 5 + groups: + minor-and-patch: + update-types: + - "minor" + - "patch" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" diff --git a/fc-agent/src/main.rs b/fc-agent/src/main.rs index fbc56581..908562d9 100644 --- a/fc-agent/src/main.rs +++ b/fc-agent/src/main.rs @@ -1137,7 +1137,8 @@ fn send_status_to_host(message: &[u8]) -> bool { } // Send message - let written = unsafe { libc::write(fd, message.as_ptr() as *const libc::c_void, message.len()) }; + let written = + unsafe { libc::write(fd, message.as_ptr() as *const libc::c_void, message.len()) }; unsafe { libc::close(fd) }; written == message.len() as isize diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 00000000..1a216558 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,3 @@ +[toolchain] +channel = "1.92.0" +components = ["rustfmt", "clippy"] diff --git a/src/commands/podman.rs b/src/commands/podman.rs index 48399b1b..77c3a485 100644 --- a/src/commands/podman.rs +++ b/src/commands/podman.rs @@ -689,8 +689,7 @@ async fn run_vm_setup( holder_child = None; } - let firecracker_bin = which::which("firecracker") - .context("firecracker not found in PATH")?; + let firecracker_bin = which::which("firecracker").context("firecracker not found in PATH")?; vm_manager .start(&firecracker_bin, None) diff --git a/src/commands/snapshot.rs b/src/commands/snapshot.rs index a0c56b79..af348765 100644 --- a/src/commands/snapshot.rs +++ b/src/commands/snapshot.rs @@ -1021,8 +1021,7 @@ async fn run_clone_setup( ); vm_manager.set_vsock_redirect(baseline_dir, data_dir.to_path_buf()); - let firecracker_bin = which::which("firecracker") - .context("firecracker not found in PATH")?; + let firecracker_bin = which::which("firecracker").context("firecracker not found in PATH")?; vm_manager .start(&firecracker_bin, None) From bf0b035db161b694615b88f1be597b4a92fbc1ee Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sat, 20 Dec 2025 21:52:21 +0000 Subject: [PATCH 07/13] Upgrade Firecracker from v1.10.1 to v1.14.0 Required for network_overrides support in snapshot load API. This enables clone VMs to use different TAP devices from baseline. --- Containerfile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Containerfile b/Containerfile index 8d2556f1..55513d45 100644 --- a/Containerfile +++ b/Containerfile @@ -44,13 +44,14 @@ RUN apt-get update && apt-get install -y \ && rm -rf /var/lib/apt/lists/* # Download and install Firecracker (architecture-aware) +# v1.14.0 adds network_overrides support for snapshot cloning ARG ARCH=aarch64 RUN curl -L -o /tmp/firecracker.tgz \ - https://github.com/firecracker-microvm/firecracker/releases/download/v1.10.1/firecracker-v1.10.1-${ARCH}.tgz \ + https://github.com/firecracker-microvm/firecracker/releases/download/v1.14.0/firecracker-v1.14.0-${ARCH}.tgz \ && tar -xzf /tmp/firecracker.tgz -C /tmp \ - && mv /tmp/release-v1.10.1-${ARCH}/firecracker-v1.10.1-${ARCH} /usr/local/bin/firecracker \ + && mv /tmp/release-v1.14.0-${ARCH}/firecracker-v1.14.0-${ARCH} /usr/local/bin/firecracker \ && chmod +x /usr/local/bin/firecracker \ - && rm -rf /tmp/firecracker.tgz /tmp/release-v1.10.1-${ARCH} + && rm -rf /tmp/firecracker.tgz /tmp/release-v1.14.0-${ARCH} # Build and install pjdfstest (tests expect it at /tmp/pjdfstest-check/) RUN git clone --depth 1 https://github.com/pjd/pjdfstest /tmp/pjdfstest-check \ From 69efb55c85fd48c2b921fbe1fee316d5ba3eda6f Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sat, 20 Dec 2025 21:55:56 +0000 Subject: [PATCH 08/13] Add Firecracker version check (requires >= 1.14.0) - Add find_firecracker() that validates version before returning path - Fail early with clear error if Firecracker is too old - Required for network_overrides support in snapshot cloning --- Cargo.lock | 1 + Cargo.toml | 1 + src/commands/common.rs | 53 ++++++++++++++++++++++++++++++++++++++++ src/commands/podman.rs | 2 +- src/commands/snapshot.rs | 2 +- 5 files changed, 57 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c7609406..1fc5ce6f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -543,6 +543,7 @@ dependencies = [ "memmap2", "nix 0.29.0", "rand 0.8.5", + "regex", "reqwest 0.12.24", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index 044a4764..719410d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,6 +38,7 @@ shell-words = "1" fuse-pipe = { path = "fuse-pipe", default-features = false } url = "2" tokio-util = "0.7" +regex = "1.12.2" [dev-dependencies] serial_test = "3" diff --git a/src/commands/common.rs b/src/commands/common.rs index 75701ac9..5ec55b0d 100644 --- a/src/commands/common.rs +++ b/src/commands/common.rs @@ -21,6 +21,59 @@ pub const VSOCK_VOLUME_PORT_BASE: u32 = 5000; /// Vsock port for status channel (fc-agent notifies when container starts) pub const VSOCK_STATUS_PORT: u32 = 4999; +/// Minimum required Firecracker version for network_overrides support +const MIN_FIRECRACKER_VERSION: (u32, u32, u32) = (1, 14, 0); + +/// Find and validate Firecracker binary +/// +/// Returns the path to the Firecracker binary if it exists and meets minimum version requirements. +/// Fails with a clear error if Firecracker is not found or version is too old. +pub fn find_firecracker() -> Result { + let firecracker_bin = which::which("firecracker").context("firecracker not found in PATH")?; + + // Check version + let output = std::process::Command::new(&firecracker_bin) + .arg("--version") + .output() + .context("failed to run firecracker --version")?; + + let version_str = String::from_utf8_lossy(&output.stdout); + let version = parse_firecracker_version(&version_str)?; + + if version < MIN_FIRECRACKER_VERSION { + anyhow::bail!( + "Firecracker version {}.{}.{} is too old. Minimum required: {}.{}.{} (for network_overrides support in snapshot cloning)", + version.0, version.1, version.2, + MIN_FIRECRACKER_VERSION.0, MIN_FIRECRACKER_VERSION.1, MIN_FIRECRACKER_VERSION.2 + ); + } + + debug!( + "Found Firecracker {}.{}.{} at {:?}", + version.0, version.1, version.2, firecracker_bin + ); + + Ok(firecracker_bin) +} + +/// Parse Firecracker version from --version output +/// +/// Expected format: "Firecracker v1.14.0" or similar +fn parse_firecracker_version(output: &str) -> Result<(u32, u32, u32)> { + // Find version number pattern vX.Y.Z + let version_re = regex::Regex::new(r"v?(\d+)\.(\d+)\.(\d+)").context("invalid regex")?; + + let caps = version_re + .captures(output) + .context("could not parse Firecracker version from output")?; + + let major: u32 = caps[1].parse().context("invalid major version")?; + let minor: u32 = caps[2].parse().context("invalid minor version")?; + let patch: u32 = caps[3].parse().context("invalid patch version")?; + + Ok((major, minor, patch)) +} + /// Save VM state with complete network configuration /// /// This function ensures both baseline and clone VMs save identical network data, diff --git a/src/commands/podman.rs b/src/commands/podman.rs index 77c3a485..723be8c6 100644 --- a/src/commands/podman.rs +++ b/src/commands/podman.rs @@ -689,7 +689,7 @@ async fn run_vm_setup( holder_child = None; } - let firecracker_bin = which::which("firecracker").context("firecracker not found in PATH")?; + let firecracker_bin = super::common::find_firecracker()?; vm_manager .start(&firecracker_bin, None) diff --git a/src/commands/snapshot.rs b/src/commands/snapshot.rs index af348765..61275444 100644 --- a/src/commands/snapshot.rs +++ b/src/commands/snapshot.rs @@ -1021,7 +1021,7 @@ async fn run_clone_setup( ); vm_manager.set_vsock_redirect(baseline_dir, data_dir.to_path_buf()); - let firecracker_bin = which::which("firecracker").context("firecracker not found in PATH")?; + let firecracker_bin = super::common::find_firecracker()?; vm_manager .start(&firecracker_bin, None) From 43fac70f4909a96c9874bfe446f5aa782f755415 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sat, 20 Dec 2025 22:18:38 +0000 Subject: [PATCH 09/13] Fix dnsmasq port conflict in CI by stopping systemd-resolved The BuildJet runners have systemd-resolved holding port 53, which prevents dnsmasq from starting. This caused DNS resolution failures in the VM Egress bridged clone tests. Stop systemd-resolved before installing/starting dnsmasq. --- .github/workflows/ci.yml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b35d2247..05997b88 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -257,6 +257,8 @@ jobs: run: sudo mkdir -p /var/run/netns - name: Setup dnsmasq for VM DNS run: | + # Stop systemd-resolved which holds port 53 + sudo systemctl stop systemd-resolved || true sudo apt-get update sudo apt-get install -y dnsmasq sudo tee /etc/dnsmasq.d/fcvm.conf > /dev/null < /dev/null < /dev/null < Date: Sun, 21 Dec 2025 00:10:56 +0000 Subject: [PATCH 10/13] Fix rootless networking health checks and remove dnsmasq dependency Health check fixes (rootless mode): - Use nsenter to curl guest directly instead of slirp4netns port forwarding - slirp4netns port forwarding cannot reach addresses outside its 10.0.2.0/24 network, so DNAT approach failed - Health checks now enter the namespace via nsenter and curl 192.168.1.2:80 - Simplified slirp.rs: removed DNAT rules, kept MASQUERADE for egress DNS fixes: - Add get_host_dns_servers() to read DNS from /etc/resolv.conf - Falls back to /run/systemd/resolve/resolv.conf for systemd-resolved - Remove dnsmasq setup from CI workflows (no longer needed) - Update bridged.rs to use host DNS instead of dnsmasq Other: - Lower minimum Firecracker version to 1.13.1 for CI compatibility --- .github/workflows/ci.yml | 48 ------------- src/commands/common.rs | 2 +- src/health.rs | 143 ++++++++++++++++++++++++--------------- src/network/bridged.rs | 2 +- src/network/mod.rs | 59 ++++++++++++++++ src/network/slirp.rs | 98 ++++++--------------------- 6 files changed, 169 insertions(+), 183 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 05997b88..417d417f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -255,22 +255,6 @@ jobs: ls -la /dev/nbd* | head -5 - name: Setup network namespace directory run: sudo mkdir -p /var/run/netns - - name: Setup dnsmasq for VM DNS - run: | - # Stop systemd-resolved which holds port 53 - sudo systemctl stop systemd-resolved || true - sudo apt-get update - sudo apt-get install -y dnsmasq - sudo tee /etc/dnsmasq.d/fcvm.conf > /dev/null < /dev/null < /dev/null < { debug!(target: "health-monitor", "health check passed"); *last_failure_log = None; @@ -209,7 +215,7 @@ async fn update_health_status_once( } }; if should_log { - debug!(target: "health-monitor", error = %e, "HTTP health check failed"); + debug!(target: "health-monitor", error = %e, "HTTP health check failed (nsenter)"); *last_failure_log = Some(Instant::now()); } HealthStatus::Unhealthy @@ -276,62 +282,89 @@ pub async fn run_health_check_once( Ok(status) } -/// Check if HTTP service is responding via loopback IP (rootless mode) +/// Check if HTTP service is responding via nsenter into the network namespace (rootless mode) /// -/// For rootless VMs, we use a unique loopback IP (127.x.y.z) with port forwarding -/// through slirp4netns to reach the guest. +/// For rootless VMs, we use nsenter to enter the network namespace and curl +/// the guest directly. This bypasses the complexity of slirp4netns port forwarding. /// -/// Linux routes all of 127.0.0.0/8 to loopback without needing `ip addr add`, -/// so this works fully rootless! -async fn check_http_health_loopback( - loopback_ip: &str, +/// The holder_pid is the PID of the namespace holder process (sleep infinity). +async fn check_http_health_nsenter( + holder_pid: u32, + guest_ip: &str, port: u16, health_path: &str, ) -> Result { - let url = format!("http://{}:{}{}", loopback_ip, port, health_path); - - let client = reqwest::Client::builder() - .timeout(Duration::from_secs(1)) - .build() - .context("building reqwest client")?; + let url = format!("http://{}:{}{}", guest_ip, port, health_path); let start = Instant::now(); - match client.get(&url).send().await { - Ok(response) => { - let elapsed = start.elapsed(); - if response.status().is_success() { - debug!( - target: "health-monitor", - loopback_ip = loopback_ip, - port = port, - status = %response.status(), - elapsed_ms = elapsed.as_millis(), - "health check succeeded (rootless)" - ); - Ok(true) - } else { - anyhow::bail!( - "Health check failed with status {} via {}:{} ({}ms)", - response.status(), - loopback_ip, - port, - elapsed.as_millis() - ) - } + // Use nsenter to enter the namespace and curl the guest directly + // --preserve-credentials keeps UID/GID mapping + let output = tokio::process::Command::new("nsenter") + .args([ + "-t", + &holder_pid.to_string(), + "-U", + "-n", + "--preserve-credentials", + "--", + "curl", + "-s", + "-o", + "/dev/null", + "-w", + "%{http_code}", + "--max-time", + "1", + &url, + ]) + .output() + .await + .context("failed to run nsenter curl")?; + + let elapsed = start.elapsed(); + + if output.status.success() { + let status_code = String::from_utf8_lossy(&output.stdout); + let status_code = status_code.trim(); + + if status_code.starts_with('2') || status_code.starts_with('3') { + debug!( + target: "health-monitor", + holder_pid = holder_pid, + guest_ip = guest_ip, + port = port, + status = status_code, + elapsed_ms = elapsed.as_millis(), + "health check succeeded (nsenter)" + ); + Ok(true) + } else { + anyhow::bail!( + "Health check failed with status {} via nsenter to {}:{} ({}ms)", + status_code, + guest_ip, + port, + elapsed.as_millis() + ) } - Err(e) => { - if e.is_timeout() { - anyhow::bail!( - "Health check timed out after 1 second via {}:{}", - loopback_ip, - port - ) - } else if e.is_connect() { - anyhow::bail!("Connection refused to {}:{}", loopback_ip, port) - } else { - anyhow::bail!("Failed to connect to {}:{}: {}", loopback_ip, port, e) - } + } else { + let stderr = String::from_utf8_lossy(&output.stderr); + if stderr.contains("timed out") || stderr.contains("Connection timed out") { + anyhow::bail!( + "Health check timed out via nsenter to {}:{}", + guest_ip, + port + ) + } else if stderr.contains("Connection refused") { + anyhow::bail!("Connection refused to {}:{} via nsenter", guest_ip, port) + } else { + anyhow::bail!( + "Failed to connect to {}:{} via nsenter: {}", + guest_ip, + port, + stderr.trim() + ) } } } diff --git a/src/network/bridged.rs b/src/network/bridged.rs index 2c357997..e979df6a 100644 --- a/src/network/bridged.rs +++ b/src/network/bridged.rs @@ -291,7 +291,7 @@ impl NetworkManager for BridgedNetwork { loopback_ip: None, health_check_port: Some(80), health_check_url: Some(format!("http://{}:80/", health_check_ip)), - dns_server: Some(host_ip), // dnsmasq with bind-dynamic listens here + dns_server: super::get_host_dns_servers().first().cloned(), }) } diff --git a/src/network/mod.rs b/src/network/mod.rs index df2bae53..1596e725 100644 --- a/src/network/mod.rs +++ b/src/network/mod.rs @@ -33,3 +33,62 @@ pub trait NetworkManager: Send + Sync { /// Get a reference to Any for downcasting fn as_any(&self) -> &dyn std::any::Any; } + +/// Read DNS servers from host system +/// +/// Parses /etc/resolv.conf to extract nameserver entries. If only localhost +/// addresses are found (indicating systemd-resolved), falls back to reading +/// /run/systemd/resolve/resolv.conf for the real upstream DNS servers. +/// +/// Returns an empty Vec if no DNS servers can be determined. +pub fn get_host_dns_servers() -> Vec { + // Try /etc/resolv.conf first + let resolv = std::fs::read_to_string("/etc/resolv.conf").unwrap_or_default(); + + let servers: Vec = resolv + .lines() + .filter_map(|line| { + let line = line.trim(); + line.strip_prefix("nameserver ") + .map(|s| s.trim().to_string()) + }) + .collect(); + + // If only localhost (systemd-resolved), try real config + if servers.iter().all(|s| s.starts_with("127.")) { + if let Ok(real) = std::fs::read_to_string("/run/systemd/resolve/resolv.conf") { + let real_servers: Vec = real + .lines() + .filter_map(|line| { + line.trim() + .strip_prefix("nameserver ") + .map(|s| s.trim().to_string()) + }) + .filter(|s| !s.starts_with("127.")) + .collect(); + if !real_servers.is_empty() { + return real_servers; + } + } + } + + servers +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_host_dns_servers() { + let servers = get_host_dns_servers(); + println!("DNS servers: {:?}", servers); + // Should find at least one non-localhost server on this system + assert!(!servers.is_empty(), "Expected to find DNS servers"); + // Should not include localhost (127.x.x.x) since we're on systemd-resolved + assert!( + servers.iter().all(|s| !s.starts_with("127.")), + "Should have filtered out localhost DNS" + ); + } +} diff --git a/src/network/slirp.rs b/src/network/slirp.rs index e6e83b67..5bb13162 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -12,8 +12,6 @@ use crate::paths; use crate::state::truncate_id; /// slirp4netns network addressing constants -/// slirp0 device is assigned this IP for routing to slirp4netns -const SLIRP_CIDR: &str = "10.0.2.100/24"; /// Guest network addressing (isolated per VM namespace) const GUEST_SUBNET: &str = "192.168.1.0/24"; @@ -54,7 +52,6 @@ pub struct SlirpNetwork { port_mappings: Vec, // Network addressing - slirp_cidr: String, // slirp0: 10.0.2.100/24, gateway 10.0.2.2 guest_subnet: String, // tap0: 192.168.x.0/24 (derived from vm_id) guest_ip: String, // Guest VM IP (192.168.x.2) namespace_ip: String, // Namespace host IP on tap0 (192.168.x.1) @@ -74,7 +71,6 @@ impl SlirpNetwork { tap_device, slirp_device: SLIRP_DEVICE_NAME.to_string(), port_mappings, - slirp_cidr: SLIRP_CIDR.to_string(), guest_subnet: GUEST_SUBNET.to_string(), guest_ip: GUEST_IP.to_string(), namespace_ip: NAMESPACE_IP.to_string(), @@ -157,16 +153,17 @@ impl SlirpNetwork { /// Build the setup script to run inside the namespace via nsenter /// - /// This script creates both TAP devices and configures networking. + /// This script creates both TAP devices and sets up iptables rules for egress. + /// Health checks use nsenter to curl the guest directly, no port forwarding needed. /// Run via: nsenter -t HOLDER_PID -U -n -- bash -c '' pub fn build_setup_script(&self) -> String { format!( r#" set -e -# Create slirp0 TAP for slirp4netns connectivity +# Create slirp0 TAP for slirp4netns (slirp4netns will attach to this) ip tuntap add {slirp_dev} mode tap -ip addr add {slirp_ip} dev {slirp_dev} +ip addr add 10.0.2.1/24 dev {slirp_dev} ip link set {slirp_dev} up # Create TAP device for Firecracker (must exist before Firecracker starts) @@ -177,24 +174,23 @@ ip link set {fc_tap} up # Set up loopback ip link set lo up -# Set default route via slirp gateway +# Enable IP forwarding (required for NAT to work) +sysctl -w net.ipv4.ip_forward=1 + +# Set default route via slirp gateway (10.0.2.2 is slirp4netns internal gateway) ip route add default via 10.0.2.2 dev {slirp_dev} -# Set up iptables MASQUERADE for traffic from guest subnet -# This NATs guest traffic (192.168.x.x) to slirp0's address (10.0.2.100) -iptables -t nat -A POSTROUTING -s {guest_subnet} -o {slirp_dev} -j MASQUERADE 2>/dev/null || true +# Allow forwarding between slirp0 and FC TAP +iptables -A FORWARD -i {slirp_dev} -o {fc_tap} -j ACCEPT 2>/dev/null || true +iptables -A FORWARD -i {fc_tap} -o {slirp_dev} -j ACCEPT 2>/dev/null || true -# Set up DNAT for inbound connections from slirp4netns -# When slirp4netns forwards traffic to 10.0.2.100, redirect it to the actual guest IP -# This enables port forwarding: host -> slirp4netns -> 10.0.2.100 -> DNAT -> guest (192.168.x.2) -iptables -t nat -A PREROUTING -d 10.0.2.100 -j DNAT --to-destination {guest_ip} 2>/dev/null || true +# Set up iptables MASQUERADE for traffic from guest subnet (egress) +iptables -t nat -A POSTROUTING -s {guest_subnet} -o {slirp_dev} -j MASQUERADE 2>/dev/null || true "#, slirp_dev = self.slirp_device, - slirp_ip = self.slirp_cidr, fc_tap = self.tap_device, ns_ip = self.namespace_ip, guest_subnet = self.guest_subnet, - guest_ip = self.guest_ip, ) } @@ -239,11 +235,12 @@ iptables -t nat -A PREROUTING -d 10.0.2.100 -j DNAT --to-destination {guest_ip} namespace_pid = namespace_pid, slirp_tap = %self.slirp_device, api_socket = %api_socket.display(), - "starting slirp4netns (attaching to existing TAP)" + "starting slirp4netns (creating TAP, no IP assignment)" ); - // Start slirp4netns WITHOUT --configure (TAP already exists and is configured) - // slirp4netns will attach to the existing TAP device + // Start slirp4netns WITHOUT --configure so it doesn't assign an IP + // This avoids the issue where DNAT doesn't work for local addresses + // The TAP is created and connected, but we handle routing ourselves let mut cmd = Command::new("slirp4netns"); cmd.arg("--ready-fd") .arg(ready_write_raw.to_string()) @@ -347,59 +344,6 @@ iptables -t nat -A PREROUTING -d 10.0.2.100 -j DNAT --to-destination {guest_ip} Ok(()) } - /// Setup health check port forward (loopback_ip:8080 → guest:80) - /// - /// Uses port 8080 on host (unprivileged) forwarding to port 80 in guest. - /// This is fully rootless - no capabilities or sudo needed. - /// Linux routes all of 127.0.0.0/8 to loopback without needing `ip addr add`. - async fn setup_health_check_forward(&self, loopback_ip: &str) -> Result<()> { - let api_socket = self - .api_socket_path - .as_ref() - .context("API socket not configured")?; - - // Forward from unprivileged port 8080 on host to port 80 in guest - // Port 8080 doesn't require CAP_NET_BIND_SERVICE - let request = serde_json::json!({ - "execute": "add_hostfwd", - "arguments": { - "proto": "tcp", - "host_addr": loopback_ip, - "host_port": 8080, - "guest_addr": "10.0.2.100", - "guest_port": 80 - } - }); - - info!( - loopback_ip = %loopback_ip, - guest_ip = %self.guest_ip, - "setting up health check port forward (8080 -> 80) - fully rootless!" - ); - - let mut stream = UnixStream::connect(api_socket) - .await - .context("connecting to slirp4netns API socket")?; - - let request_str = serde_json::to_string(&request)? + "\n"; - stream.write_all(request_str.as_bytes()).await?; - stream.shutdown().await?; - - let mut reader = BufReader::new(stream); - let mut response_line = String::new(); - reader.read_line(&mut response_line).await?; - - debug!(response = %response_line.trim(), "slirp4netns health check forward response"); - - if response_line.contains("error") { - warn!(response = %response_line.trim(), "health check port forwarding may have failed"); - } else { - info!("health check port forwarding configured successfully"); - } - - Ok(()) - } - /// Get guest IP address for kernel boot args pub fn guest_ip(&self) -> &str { &self.guest_ip @@ -454,12 +398,10 @@ impl NetworkManager for SlirpNetwork { self.start_slirp(holder_pid).await?; - // Set up health check port forward (loopback_ip:80 → guest:80) - // No ip addr add needed - Linux routes all of 127.0.0.0/8 to loopback! - if let Some(loopback_ip) = &self.loopback_ip { - self.setup_health_check_forward(loopback_ip).await?; - } + // Health checks now use nsenter to curl the guest directly + // No port forwarding needed for health checks + // User-specified port mappings still use slirp4netns port forwarding if !self.port_mappings.is_empty() { self.setup_port_forwarding().await?; } From 1b86d43bb4735ecd196f615d12ebc888eac29b47 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sun, 21 Dec 2025 00:15:16 +0000 Subject: [PATCH 11/13] Fix formatting and linting issues --- .claude/CLAUDE.md | 24 ++++-------------------- src/health.rs | 4 +++- src/network/slirp.rs | 2 -- 3 files changed, 7 insertions(+), 23 deletions(-) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index ceac31cc..0bee2aed 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -667,23 +667,6 @@ Run `make help` for full list. Key targets: └── cache/ # Downloaded cloud images ``` -### One-Time Setup (dnsmasq) - -```bash -sudo apt-get update -sudo apt-get install -y dnsmasq - -# dnsmasq for DNS forwarding to VMs (bind-dynamic listens on dynamically created TAP devices) -sudo tee /etc/dnsmasq.d/fcvm.conf > /dev/null < { debug!(target: "health-monitor", "health check passed"); *last_failure_log = None; diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 5bb13162..29f18eac 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -11,8 +11,6 @@ use super::{types::generate_mac, NetworkConfig, NetworkManager, PortMapping}; use crate::paths; use crate::state::truncate_id; -/// slirp4netns network addressing constants - /// Guest network addressing (isolated per VM namespace) const GUEST_SUBNET: &str = "192.168.1.0/24"; const GUEST_IP: &str = "192.168.1.2"; From b5c7667605c67023211f70b8f8fd1d41a47ba3f3 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sun, 21 Dec 2025 00:40:03 +0000 Subject: [PATCH 12/13] Remove dnsmasq dependency from bridged networking VMs now use host DNS servers directly (read from /etc/resolv.conf) instead of relying on dnsmasq. This simplifies the network setup and removes the need to wait for dnsmasq to bind to the veth IP. --- src/network/veth.rs | 39 ++------------------------------------- 1 file changed, 2 insertions(+), 37 deletions(-) diff --git a/src/network/veth.rs b/src/network/veth.rs index 4486fab3..12763676 100644 --- a/src/network/veth.rs +++ b/src/network/veth.rs @@ -1,42 +1,9 @@ use anyhow::{Context, Result}; -use std::time::Duration; use tokio::process::Command; use tracing::{debug, info, warn}; use super::namespace::exec_in_namespace; -/// Wait for dnsmasq to bind to a specific IP address on port 53 -/// -/// dnsmasq with `bind-dynamic` detects new interfaces and binds to them, -/// but this takes time. We must wait for it to bind before VMs can use DNS. -async fn wait_for_dnsmasq_bind(ip: &str) -> Result<()> { - let check_addr = format!("{}:53", ip); - - for attempt in 0..50 { - // Check if anything is listening on this IP:53 - let output = Command::new("ss") - .args(["-uln", "sport", "=", ":53"]) - .output() - .await - .context("checking if dnsmasq is listening")?; - - let stdout = String::from_utf8_lossy(&output.stdout); - if stdout.contains(ip) { - if attempt > 0 { - debug!(ip = %ip, attempts = attempt, "dnsmasq now listening"); - } - return Ok(()); - } - - tokio::time::sleep(Duration::from_millis(20)).await; - } - - anyhow::bail!( - "dnsmasq did not bind to {} within 1 second - check dnsmasq config", - check_addr - ); -} - /// In-Namespace NAT configuration for clone egress /// /// When clones are restored from a snapshot, they all have the same guest IP @@ -201,10 +168,8 @@ pub async fn setup_host_veth(veth_name: &str, ip_with_cidr: &str) -> Result<()> } } - // Wait for dnsmasq to bind to this IP (bind-dynamic detection) - // VMs use this IP as their DNS server, so dnsmasq must be listening before VM boots - let ip = ip_with_cidr.split('/').next().unwrap_or(ip_with_cidr); - wait_for_dnsmasq_bind(ip).await?; + // DNS: VMs now use host DNS servers directly (read from /etc/resolv.conf) + // No dnsmasq needed - the host DNS servers are reachable via the veth bridge // Add FORWARD rule to allow outbound traffic from this veth let forward_rule = format!("-A FORWARD -i {} -j ACCEPT", veth_name); From 613e4675b45e8620147cc8aee1497399c8122a32 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sun, 21 Dec 2025 00:54:08 +0000 Subject: [PATCH 13/13] Fix VM test race condition by running jobs sequentially The VM tests (Sanity, Exec, Egress) all share /dev/nbd0 via bind mount and flock doesn't work across podman containers. When they ran in parallel, multiple virt-customize processes tried to access the same cloud image simultaneously, causing one to hang indefinitely. Fix by adding job dependencies so they run sequentially: - VM Sanity runs first - VM Exec waits for Sanity - VM Egress waits for Exec Added `if: always()` so later jobs still run even if earlier ones fail, since the rootfs will be cached after the first successful creation. --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 417d417f..f7d9d501 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -272,6 +272,8 @@ jobs: test-vm-exec: name: VM Exec runs-on: buildjet-32vcpu-ubuntu-2204 + needs: test-vm-sanity # Sequential: flock doesn't work across podman containers sharing /dev/nbd0 + if: always() # Run even if previous job failed (rootfs will be cached after first success) steps: - uses: actions/checkout@v4 with: @@ -307,6 +309,8 @@ jobs: test-vm-egress: name: VM Egress runs-on: buildjet-32vcpu-ubuntu-2204 + needs: test-vm-exec # Sequential: flock doesn't work across podman containers sharing /dev/nbd0 + if: always() # Run even if previous job failed (rootfs will be cached after first success) steps: - uses: actions/checkout@v4 with: