From c6a591d9c461158abd37b6d833a21af3b99d2f18 Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Thu, 5 Feb 2026 23:12:03 -0800 Subject: [PATCH 01/12] Enable snapshot caching for localhost images Localhost images were excluded from snapshot caching because the FUSE volume path wouldn't exist on restore. Now that images are attached as raw block devices (CAS-cached at image-cache/{digest}.docker.tar), the path is stable across runs. This enables instant snapshot restore for localhost images instead of re-loading all blobs every startup. --- src/commands/podman.rs | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/commands/podman.rs b/src/commands/podman.rs index 155320f3..45d9f21d 100644 --- a/src/commands/podman.rs +++ b/src/commands/podman.rs @@ -989,15 +989,13 @@ async fn cmd_podman_run(args: RunArgs) -> Result<()> { None }; - // Check for snapshot cache (unless --no-snapshot is set, FCVM_NO_SNAPSHOT env var, or localhost image) - // Localhost images have tarball paths in MMDS that won't exist on restore + // Check for snapshot cache (unless --no-snapshot is set or FCVM_NO_SNAPSHOT env var) // Keep fc_config and snapshot_key available for later snapshot creation on miss let no_snapshot = args.no_snapshot || std::env::var("FCVM_NO_SNAPSHOT").is_ok(); - let is_localhost_image = args.image.starts_with("localhost/"); let (fc_config, snapshot_key): ( Option, Option, - ) = if !no_snapshot && !is_localhost_image { + ) = if !no_snapshot { // Get image identifier for cache key computation let image_identifier = get_image_identifier(&args.image).await?; let config = build_firecracker_config( @@ -1068,9 +1066,6 @@ async fn cmd_podman_run(args: RunArgs) -> Result<()> { "Snapshot miss, will create snapshot after image load" ); (Some(config), Some(key)) - } else if is_localhost_image { - info!("Snapshot disabled for localhost image (tarball path won't exist on restore)"); - (None, None) } else { if std::env::var("FCVM_NO_SNAPSHOT").is_ok() { 
info!("Snapshot disabled via FCVM_NO_SNAPSHOT environment variable"); @@ -1361,9 +1356,7 @@ async fn cmd_podman_run(args: RunArgs) -> Result<()> { // Skip snapshot creation when: // - --no-snapshot flag or FCVM_NO_SNAPSHOT env var is set // - Volumes are specified (FUSE-over-vsock breaks during snapshot pause) - // - Localhost images (tarball path in MMDS won't exist on restore) - // Note: no_snapshot and is_localhost_image are already defined above - let skip_snapshot_creation = no_snapshot || !args.map.is_empty() || is_localhost_image; + let skip_snapshot_creation = no_snapshot || !args.map.is_empty(); if !args.map.is_empty() && !no_snapshot { info!( "Skipping snapshot creation: volumes specified (FUSE doesn't survive snapshot pause)" From a064337d8f4f21f116c1b72ee5b95fd764fd8b4e Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Thu, 5 Feb 2026 23:22:19 -0800 Subject: [PATCH 02/12] Add Known Limitations section to DESIGN.md Documents FUSE cache coherency behavior, NV2 nested VM constraints, and snapshot + FUSE volume interaction. --- DESIGN.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/DESIGN.md b/DESIGN.md index a8db8ea7..ae83a1b9 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -1677,6 +1677,28 @@ The fuse-pipe library passes the pjdfstest POSIX compliance suite. Tests run via --- +## Known Limitations + +### FUSE Volume Cache Coherency + +`--map` volumes use FUSE-over-vsock with `WRITEBACK_CACHE` and `AUTO_INVAL_DATA`. When a host process modifies a file in a mapped directory, the guest sees the change on its next read — but only after the kernel detects the mtime change (up to ~1 second granularity). Writes within the same second may not be visible immediately. + +Directory changes (new files, deletions) are subject to the kernel's directory entry cache TTL. A new file created on the host may not appear in guest `readdir()` until the cache expires. + +There are no push notifications from host to guest. 
The guest discovers changes only on access. inotify/fanotify in the guest watches the FUSE mount, not the host filesystem, so host-side changes don't trigger guest notifications. + +**Potential fix**: Use `FUSE_NOTIFY_INVAL_INODE` and `FUSE_NOTIFY_INVAL_ENTRY` — server-initiated invalidation notifications. The host VolumeServer would watch directories with inotify and push invalidations through the FUSE connection when files change. This is how production network filesystems (NFS, CIFS) handle it. + +### Nested VM Performance (NV2) + +ARM64 FEAT_NV2 has architectural issues with cache coherency under double Stage 2 translation. The DSB SY kernel patch fixes this for vsock/FUSE data paths, but multi-vCPU L2 VMs still hit interrupt delivery issues (NETDEV WATCHDOG). L2 VMs are limited to single vCPU. + +### Snapshot + FUSE Volumes + +Snapshots are disabled when `--map` volumes are present because the FUSE-over-vsock connection state may not survive the pause/resume cycle cleanly. This means VMs with volume mounts always do a fresh boot. Block device mounts (`--disk`, `--disk-dir`) do not have this limitation. + +--- + ## Future Enhancements ### Phase 2 (Post-MVP) From 4a6959f74bab9b044495ee2ebd4eb822f011ae37 Mon Sep 17 00:00:00 2001 From: "claude[bot]" Date: Fri, 6 Feb 2026 07:16:21 +0000 Subject: [PATCH 03/12] fix: update comment to reflect localhost snapshot support Remove outdated reference to 'not localhost image' in comment on line 1377. Localhost images are now supported for snapshot caching (as of this PR). 
--- src/commands/podman.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/commands/podman.rs b/src/commands/podman.rs index 45d9f21d..190a8d75 100644 --- a/src/commands/podman.rs +++ b/src/commands/podman.rs @@ -1374,7 +1374,7 @@ async fn cmd_podman_run(args: RunArgs) -> Result<()> { // Create startup snapshot channel for health-triggered snapshot creation // Only create startup snapshots if: - // - Not skipping snapshots (no --no-snapshot, no volumes, not localhost image) + // - Not skipping snapshots (no --no-snapshot, no volumes) // - Have a snapshot key // - Have a health_check URL configured (HTTP health check, not just container-ready) let (startup_tx, mut startup_rx): ( From 9d2dcab7f3e7548ad1ffacacc804452e9eb2d74c Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Fri, 6 Feb 2026 00:12:10 -0800 Subject: [PATCH 04/12] Fix health check process leak: kill orphaned podman inspect on timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Health checks spawn podman inspect via fcvm exec with a 5s timeout. When podman is busy (e.g., importing a large image), inspect blocks on the storage lock. On timeout, the process was orphaned — it kept running and holding the lock. New health checks spawned every poll interval, stacking up dozens of blocked processes (~35MB each). Fix: use kill_on_drop(true) so the child is killed when the timeout drops the future. 
--- src/health.rs | 53 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/src/health.rs b/src/health.rs index 4c034039..f43dfc5e 100644 --- a/src/health.rs +++ b/src/health.rs @@ -204,7 +204,7 @@ async fn check_container_running(pid: u32) -> bool { None => return false, // Can't find fcvm binary }; - let cmd_future = tokio::process::Command::new(&exe) + let child = match tokio::process::Command::new(&exe) .args([ "exec", "--pid", @@ -217,20 +217,31 @@ async fn check_container_running(pid: u32) -> bool { "{{.State.Running}}", "fcvm-container", ]) - .output(); - - let output = match tokio::time::timeout(HEALTH_CHECK_EXEC_TIMEOUT, cmd_future).await { - Ok(Ok(o)) => o, - Ok(Err(e)) => { - debug!(target: "health-monitor", error = %e, "podman inspect exec failed"); - return false; - } - Err(_) => { - debug!(target: "health-monitor", "podman inspect exec timed out"); + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .kill_on_drop(true) + .spawn() + { + Ok(c) => c, + Err(e) => { + debug!(target: "health-monitor", error = %e, "podman inspect spawn failed"); return false; } }; + let output = + match tokio::time::timeout(HEALTH_CHECK_EXEC_TIMEOUT, child.wait_with_output()).await { + Ok(Ok(o)) => o, + Ok(Err(e)) => { + debug!(target: "health-monitor", error = %e, "podman inspect exec failed"); + return false; + } + Err(_) => { + debug!(target: "health-monitor", "podman inspect exec timed out"); + return false; + } + }; + if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); debug!(target: "health-monitor", stderr = %stderr, "podman inspect failed"); @@ -256,7 +267,7 @@ async fn check_podman_healthcheck(pid: u32) -> Option { None => return Some(true), // Can't find fcvm binary, assume healthy }; - let cmd_future = tokio::process::Command::new(&exe) + let child = match tokio::process::Command::new(&exe) .args([ "exec", "--pid", @@ -269,12 +280,24 @@ async 
fn check_podman_healthcheck(pid: u32) -> Option { "{{.State.Health.Status}}", "fcvm-container", ]) - .output(); + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .kill_on_drop(true) + .spawn() + { + Ok(c) => c, + Err(e) => { + debug!(target: "health-monitor", error = %e, "podman healthcheck spawn failed"); + return Some(false); + } + }; - let output = match tokio::time::timeout(HEALTH_CHECK_EXEC_TIMEOUT, cmd_future).await { + // kill_on_drop ensures the child is killed if the timeout fires + let output = match tokio::time::timeout(HEALTH_CHECK_EXEC_TIMEOUT, child.wait_with_output()) + .await + { Ok(Ok(o)) => o, Ok(Err(e)) => { - // Exec not available yet, don't assume healthy - keep checking debug!(target: "health-monitor", error = %e, "podman healthcheck exec failed, will retry"); return Some(false); } From db956531d873aaab07f68d8961e0cff4082fb5be Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Fri, 6 Feb 2026 00:12:32 -0800 Subject: [PATCH 05/12] Remove output listener read timeout for large image imports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 5-minute read timeout on the container output vsock caused the listener to exit during long image imports (10+ min). When the container finally started, its stdout/stderr had nowhere to go. Remove the timeout — the listener stays alive until EOF (connection closed) or the VM exits. The VM exit handler already cleans up. 
--- src/commands/podman.rs | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/commands/podman.rs b/src/commands/podman.rs index 190a8d75..377a6505 100644 --- a/src/commands/podman.rs +++ b/src/commands/podman.rs @@ -841,21 +841,17 @@ pub(crate) async fn run_output_listener( None }; - // Read lines until connection closes + // Read lines until connection closes (no read timeout — large image imports + // can take 10+ minutes during which fc-agent produces no output) loop { line_buf.clear(); - match tokio::time::timeout( - std::time::Duration::from_secs(300), // 5 min read timeout - reader.read_line(&mut line_buf), - ) - .await - { - Ok(Ok(0)) => { + match reader.read_line(&mut line_buf).await { + Ok(0) => { // EOF - connection closed debug!(vm_id = %vm_id, "Output connection closed"); break; } - Ok(Ok(_)) => { + Ok(_) => { // Parse raw line format: stream:content let line = line_buf.trim_end(); if let Some((stream, content)) = line.split_once(':') { @@ -873,15 +869,10 @@ pub(crate) async fn run_output_listener( let _ = w.write_all(b"ack\n").await; } } - Ok(Err(e)) => { + Err(e) => { warn!(vm_id = %vm_id, error = %e, "Error reading output"); break; } - Err(_) => { - // Read timeout - debug!(vm_id = %vm_id, "Output read timeout"); - break; - } } } From 161b5128115d8d0c683c3cb7724eef0ae8471d76 Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Thu, 5 Feb 2026 23:02:02 -0800 Subject: [PATCH 06/12] Fix FUSE mount race: poll for readiness instead of fixed 500ms sleep The 500ms sleep wasn't enough for large images or slow hosts. Replace with a poll loop that waits up to 30s for each FUSE mount to become accessible via read_dir before starting the container. 
--- fc-agent/src/main.rs | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/fc-agent/src/main.rs b/fc-agent/src/main.rs index 04721fb1..49963c7b 100644 --- a/fc-agent/src/main.rs +++ b/fc-agent/src/main.rs @@ -1693,23 +1693,29 @@ fn mount_fuse_volumes(volumes: &[VolumeMount]) -> Result> { mounted_paths.push(vol.guest_path.clone()); } - // Give FUSE mounts time to initialize - if !volumes.is_empty() { - eprintln!("[fc-agent] waiting for FUSE mounts to initialize..."); - std::thread::sleep(std::time::Duration::from_millis(500)); - - // Verify each mount point is accessible - for vol in volumes { - let path = std::path::Path::new(&vol.guest_path); + // Wait for each FUSE mount to become accessible (up to 30s per mount) + for vol in volumes { + let path = std::path::Path::new(&vol.guest_path); + let mut ready = false; + for attempt in 1..=60 { if let Ok(entries) = std::fs::read_dir(path) { let count = entries.count(); eprintln!( - "[fc-agent] ✓ mount {} accessible ({} entries)", - vol.guest_path, count + "[fc-agent] ✓ mount {} ready ({} entries, {}ms)", + vol.guest_path, + count, + attempt * 500 ); - } else { - eprintln!("[fc-agent] ✗ mount {} NOT accessible", vol.guest_path); + ready = true; + break; } + std::thread::sleep(std::time::Duration::from_millis(500)); + } + if !ready { + eprintln!( + "[fc-agent] ✗ mount {} NOT accessible after 30s", + vol.guest_path + ); } } From a0be637c816812d2af4512a0ddaf3148ce566e79 Mon Sep 17 00:00:00 2001 From: "claude[bot]" Date: Fri, 6 Feb 2026 07:07:28 +0000 Subject: [PATCH 07/12] fix: return error when FUSE mount timeout + correct timing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Return error when mount not ready after 30s (was silently continuing) - Fix elapsed time calculation: (attempt - 1) * 500 instead of attempt * 500 - Ensures containers don't start with inaccessible mounts 🤖 Generated with [Claude 
Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- fc-agent/src/main.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fc-agent/src/main.rs b/fc-agent/src/main.rs index 49963c7b..dac28b98 100644 --- a/fc-agent/src/main.rs +++ b/fc-agent/src/main.rs @@ -1704,7 +1704,7 @@ fn mount_fuse_volumes(volumes: &[VolumeMount]) -> Result> { "[fc-agent] ✓ mount {} ready ({} entries, {}ms)", vol.guest_path, count, - attempt * 500 + (attempt - 1) * 500 ); ready = true; break; @@ -1712,10 +1712,10 @@ fn mount_fuse_volumes(volumes: &[VolumeMount]) -> Result> { std::thread::sleep(std::time::Duration::from_millis(500)); } if !ready { - eprintln!( - "[fc-agent] ✗ mount {} NOT accessible after 30s", + return Err(anyhow::anyhow!( + "mount {} not accessible after 30s", vol.guest_path - ); + )); } } From bbd2458246088e8e03721ca826fda789793cf8eb Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Fri, 6 Feb 2026 07:54:39 -0800 Subject: [PATCH 08/12] Add --user flag for rootless podman in VM (--userns=keep-id) Replicates host podman behavior: creates user in VM, sets up subuid/subgid, delegates cgroup, runs podman as the target user with --userns=keep-id. Container sees the same UID as on the host. Also adds uidmap package to rootfs for rootless user namespace support. --- fc-agent/src/main.rs | 77 ++++++++++++++++++++++++++++++++++++++++++ rootfs-config.toml | 2 +- src/cli/args.rs | 5 +++ src/commands/podman.rs | 1 + 4 files changed, 84 insertions(+), 1 deletion(-) diff --git a/fc-agent/src/main.rs b/fc-agent/src/main.rs index dac28b98..d89e77e7 100644 --- a/fc-agent/src/main.rs +++ b/fc-agent/src/main.rs @@ -33,6 +33,9 @@ struct Plan { /// Path to OCI archive for localhost/ images (run directly without import) #[serde(default)] image_archive: Option, + /// Run container as USER:GROUP (e.g., "1000:1000") + #[serde(default)] + user: Option, /// Run container in privileged mode (allows mknod, device access, etc.) 
#[serde(default)] privileged: bool, @@ -2550,6 +2553,80 @@ async fn run_agent() -> Result<()> { "nofile=65536:65536".to_string(), ]; + // User mapping: run podman as the specified user with --userns=keep-id + // This replicates host rootless podman with keep-id: the user keeps the same + // UID/GID inside the container (instead of root), so shared mounts show the real UID. + if let Some(ref user_spec) = plan.user { + // Parse "uid:gid" format + let parts: Vec<&str> = user_spec.split(':').collect(); + let uid = parts[0]; + let gid = parts.get(1).unwrap_or(&"100"); + let username = format!("fcvm-user"); + + eprintln!( + "[fc-agent] setting up user mapping: uid={} gid={}", + uid, gid + ); + + // Create group and user in the VM + let _ = std::process::Command::new("groupadd") + .args(["-g", gid, &username]) + .output(); + let _ = std::process::Command::new("useradd") + .args(["-u", uid, "-g", gid, "-m", "-s", "/bin/sh", &username]) + .output(); + + // Set up subuid/subgid for rootless podman + let subuid_entry = format!("{}:100000:65536\n", username); + let _ = std::fs::write("/etc/subuid", &subuid_entry); + let _ = std::fs::write("/etc/subgid", &subuid_entry); + + // Ensure XDG_RUNTIME_DIR exists for rootless podman + let runtime_dir = format!("/run/user/{}", uid); + let _ = std::fs::create_dir_all(&runtime_dir); + let _ = std::process::Command::new("chown") + .args([&format!("{}:{}", uid, gid), &runtime_dir]) + .output(); + + // Delegate cgroup subtree to the user for rootless podman + let cgroup_dir = format!("/sys/fs/cgroup/user.slice/user-{}.slice", uid); + let _ = std::fs::create_dir_all(&cgroup_dir); + let _ = std::process::Command::new("chown") + .args(["-R", &format!("{}:{}", uid, gid), &cgroup_dir]) + .output(); + // Enable controllers in the user's cgroup + for path in &[ + "/sys/fs/cgroup/cgroup.subtree_control", + &format!("{}/cgroup.subtree_control", cgroup_dir), + ] { + let _ = std::fs::write(path, "+cpu +memory +pids"); + } + + // Delegate fc-agent's own cgroup to the
user so rootless podman can create sub-cgroups + if let Ok(cgroup_path) = std::fs::read_to_string("/proc/self/cgroup") { + // Format: "0::/system.slice/fc-agent.service" + if let Some(path) = cgroup_path.trim().strip_prefix("0::") { + let full_path = format!("/sys/fs/cgroup{}", path); + let _ = std::process::Command::new("chown") + .args(["-R", &format!("{}:{}", uid, gid), &full_path]) + .output(); + eprintln!("[fc-agent] delegated cgroup {} to user {}", full_path, uid); + } + } + + // Remove --cgroups=split (rootless podman uses cgroupfs, not split) + podman_args.retain(|a| a != "--cgroups=split"); + + // Add --userns=keep-id to podman args (replicates host behavior) + podman_args.push("--userns=keep-id".to_string()); + + // Wrap entire command with runuser to run podman as the target user + podman_args.insert(0, "--".to_string()); + podman_args.insert(0, username.clone()); + podman_args.insert(0, "-u".to_string()); + podman_args.insert(0, "runuser".to_string()); + } + // Privileged mode: allows mknod, device access, etc. for POSIX compliance tests if plan.privileged { eprintln!("[fc-agent] privileged mode enabled"); diff --git a/rootfs-config.toml b/rootfs-config.toml index f1e90ca7..e51cd6f0 100644 --- a/rootfs-config.toml +++ b/rootfs-config.toml @@ -57,7 +57,7 @@ path = "opt/kata/share/kata-containers/vmlinux-6.12.47-173" [packages] # Container runtime -runtime = ["podman", "crun", "fuse-overlayfs", "skopeo"] +runtime = ["podman", "crun", "fuse-overlayfs", "skopeo", "uidmap"] # FUSE support for overlay filesystem fuse = ["fuse3"] diff --git a/src/cli/args.rs b/src/cli/args.rs index 9e41e7c5..3d6cea98 100644 --- a/src/cli/args.rs +++ b/src/cli/args.rs @@ -169,6 +169,11 @@ pub struct RunArgs { #[arg(long)] pub health_check: Option, + /// Run container as USER:GROUP (e.g., --user 1000:1000) + /// Equivalent to podman run --userns=keep-id on the host + #[arg(long)] + pub user: Option, + /// Run container in privileged mode (allows mknod, device access, etc.) 
/// Use for POSIX compliance tests that need full filesystem capabilities #[arg(long)] diff --git a/src/commands/podman.rs b/src/commands/podman.rs index 377a6505..705a6fc4 100644 --- a/src/commands/podman.rs +++ b/src/commands/podman.rs @@ -2625,6 +2625,7 @@ async fn run_vm_setup( "nfs_mounts": nfs_mounts, "image_archive": image_device.clone(), "privileged": args.privileged, + "user": args.user.as_deref(), "interactive": args.interactive, "tty": args.tty, // Use network-provided proxy, or fall back to environment variables. From 111ccc6f7ccfc020cf84a00a0e763f0b22203fe7 Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Fri, 6 Feb 2026 10:26:07 -0800 Subject: [PATCH 09/12] Add --user flag, /etc-host mount approach, chmod block device for userns - CLI: --user uid:gid flag for rootless podman in VM - fc-agent: creates user, subuid/subgid, cgroup delegation, runuser wrapper - fc-agent: chmod 444 block device for docker-archive with --userns=keep-id - rootfs-config: add uidmap package for rootless user namespaces --- fc-agent/src/main.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fc-agent/src/main.rs b/fc-agent/src/main.rs index d89e77e7..37b410a3 100644 --- a/fc-agent/src/main.rs +++ b/fc-agent/src/main.rs @@ -2366,6 +2366,13 @@ async fn run_agent() -> Result<()> { let image_ref = if let Some(archive_path) = &plan.image_archive { eprintln!("[fc-agent] using Docker archive: {}", archive_path); + // Make block device readable by non-root (needed with --userns=keep-id) + if archive_path.starts_with("/dev/") { + let _ = std::process::Command::new("chmod") + .args(["444", archive_path]) + .output(); + } + format!("docker-archive:{}", archive_path) } else { // Pull image with retries to handle transient DNS/network errors From 9c38c082772b95f3980701f6b4825b5f08b06f89 Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Fri, 6 Feb 2026 15:04:41 -0800 Subject: [PATCH 10/12] Add --forward-localhost flag: redirect VM localhost to host gateway Opt-in iptables DNAT rule 
that redirects 127.0.0.0/8 in the VM to the host via the slirp gateway (10.0.2.2). This allows containers to reach host-only services (e.g., service discovery, config proxies) via localhost, matching the behavior of --network=host on the physical host. Requires: sysctl route_localnet=1 + iptables nat DNAT Only applied when --forward-localhost is passed. --- README.md | 2 + fc-agent/src/main.rs | 35 ++++++++++++++++ src/cli/args.rs | 6 +++ src/commands/podman.rs | 1 + tests/test_forward_localhost.rs | 71 +++++++++++++++++++++++++++++++++ 5 files changed, 115 insertions(+) create mode 100644 tests/test_forward_localhost.rs diff --git a/README.md b/README.md index 817814cb..3a03481f 100644 --- a/README.md +++ b/README.md @@ -780,6 +780,8 @@ See [DESIGN.md](DESIGN.md#cli-interface) for architecture and design decisions. -t, --tty Allocate pseudo-TTY (for vim, colors, etc.) --setup Auto-setup if kernel/rootfs missing (rootless only) --no-snapshot Disable automatic snapshot creation (for testing) +--forward-localhost Forward localhost ports to host (e.g., 1421,9099) +--rootfs-size Minimum free space on rootfs (default: 10G) ``` **`fcvm exec`** - Execute in VM/container: diff --git a/fc-agent/src/main.rs b/fc-agent/src/main.rs index 37b410a3..540b4532 100644 --- a/fc-agent/src/main.rs +++ b/fc-agent/src/main.rs @@ -36,6 +36,9 @@ struct Plan { /// Run container as USER:GROUP (e.g., "1000:1000") #[serde(default)] user: Option, + /// Localhost ports to forward to host gateway via iptables DNAT + #[serde(default)] + forward_localhost: Vec, /// Run container in privileged mode (allows mknod, device access, etc.) #[serde(default)] privileged: bool, @@ -2249,6 +2252,38 @@ async fn run_agent() -> Result<()> { // Save proxy settings for exec commands to use save_proxy_settings(&plan); + // Forward specific localhost ports to host gateway via iptables DNAT. + // Only the listed ports are redirected — other localhost traffic stays local. 
+ if !plan.forward_localhost.is_empty() { + let _ = std::process::Command::new("sysctl") + .args(["-w", "net.ipv4.conf.all.route_localnet=1"]) + .output(); + for port in &plan.forward_localhost { + let _ = std::process::Command::new("iptables") + .args([ + "-t", + "nat", + "-A", + "OUTPUT", + "-d", + "127.0.0.0/8", + "-p", + "tcp", + "--dport", + port, + "-j", + "DNAT", + "--to-destination", + "10.0.2.2", + ]) + .output(); + } + eprintln!( + "[fc-agent] ✓ forwarding localhost ports to host: {:?}", + plan.forward_localhost + ); + } + // Sync VM clock from host before launching container // This ensures TLS certificate validation works immediately if let Err(e) = sync_clock_from_host().await { diff --git a/src/cli/args.rs b/src/cli/args.rs index 3d6cea98..cf7ec45c 100644 --- a/src/cli/args.rs +++ b/src/cli/args.rs @@ -174,6 +174,12 @@ pub struct RunArgs { #[arg(long)] pub user: Option, + /// Forward specific localhost ports to the host gateway via iptables DNAT. + /// Enables containers to reach host-only services via localhost. + /// Comma-separated port list, e.g., --forward-localhost 1421,9099 + #[arg(long, value_delimiter = ',')] + pub forward_localhost: Vec, + /// Run container in privileged mode (allows mknod, device access, etc.) 
/// Use for POSIX compliance tests that need full filesystem capabilities #[arg(long)] diff --git a/src/commands/podman.rs b/src/commands/podman.rs index 705a6fc4..b8cf4880 100644 --- a/src/commands/podman.rs +++ b/src/commands/podman.rs @@ -2625,6 +2625,7 @@ async fn run_vm_setup( "nfs_mounts": nfs_mounts, "image_archive": image_device.clone(), "privileged": args.privileged, + "forward_localhost": args.forward_localhost.iter().map(|p| p.to_string()).collect::>(), "user": args.user.as_deref(), "interactive": args.interactive, "tty": args.tty, diff --git a/tests/test_forward_localhost.rs b/tests/test_forward_localhost.rs new file mode 100644 index 00000000..6b71e49d --- /dev/null +++ b/tests/test_forward_localhost.rs @@ -0,0 +1,71 @@ +//! Test --forward-localhost flag: VM localhost reaches host services. + +#![cfg(feature = "privileged-tests")] + +mod common; + +use anyhow::{Context, Result}; +use std::net::TcpListener; + +/// Test that --forward-localhost makes VM's 127.0.0.1 reach host services. 
+#[tokio::test] +async fn test_forward_localhost() -> Result<()> { + println!("\nTest --forward-localhost"); + println!("========================"); + + // Start a TCP server on host 127.0.0.1 + let listener = TcpListener::bind("127.0.0.1:0")?; + let port = listener.local_addr()?.port(); + println!(" Host server on 127.0.0.1:{}", port); + + // Accept in background + let accept_handle = tokio::task::spawn_blocking(move || { + listener.set_nonblocking(false).ok(); + if let Ok((mut conn, _)) = listener.accept() { + use std::io::Write; + let _ = conn.write_all(b"HELLO\n"); + } + }); + + let (vm_name, _, _, _) = common::unique_names("fwd-localhost"); + + let port_str = port.to_string(); + let (_, pid) = common::spawn_fcvm(&[ + "podman", + "run", + "--name", + &vm_name, + "--forward-localhost", + &port_str, + "--no-snapshot", + common::TEST_IMAGE, + ]) + .await + .context("spawning fcvm")?; + + common::poll_health_by_pid(pid, 300).await?; + println!(" VM healthy"); + + // From inside VM, connect to localhost:port — should reach host + let result = common::exec_in_vm( + pid, + &[ + "sh", + "-c", + &format!("nc -w5 127.0.0.1 {} || echo FAILED", port), + ], + ) + .await?; + + println!(" Result: {}", result.trim()); + assert!( + result.contains("HELLO"), + "VM localhost should reach host with --forward-localhost (got: {})", + result.trim() + ); + + common::kill_process(pid).await; + accept_handle.abort(); + println!("✅ FORWARD LOCALHOST TEST PASSED!"); + Ok(()) +} From ad9f230b4400c4248a7d4e88163ae7b9f4a0617a Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Fri, 6 Feb 2026 16:41:42 -0800 Subject: [PATCH 11/12] Fix 8 bugs from codebase review (Wave 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Wire protocol Written size u32 → u64 to prevent truncation on copy_file_range/remap_file_range returns exceeding 4GB - Loopback IP exhaustion now returns error instead of silently reusing 127.0.0.2 (would cause IP conflicts) - Remove 
security.capability xattr fast-path that returned ENODATA for all files, hiding real capabilities - Check e2fsck exit code before resize2fs (exit >= 4 means uncorrectable filesystem errors) - slirp4netns stdout/stderr changed from Stdio::piped() to Stdio::null() to prevent pipe buffer deadlock - Check truncate exit code in create_disk_from_dir - parse_size uses checked_mul to prevent silent overflow - Delete dead code mount_vsock_with_readers in fc-agent Tested: cargo test -p fuse-pipe --lib (42 pass), cargo test -p fcvm --lib (48 pass) --- fc-agent/src/fuse/mod.rs | 17 ----------------- fuse-pipe/src/client/fuse.rs | 26 +++----------------------- fuse-pipe/src/protocol/response.rs | 2 +- fuse-pipe/src/server/passthrough.rs | 10 +++++----- src/commands/podman.rs | 6 +++++- src/network/slirp.rs | 4 ++-- src/state/manager.rs | 10 ++++------ src/storage/disk.rs | 17 ++++++++++++++--- 8 files changed, 34 insertions(+), 58 deletions(-) diff --git a/fc-agent/src/fuse/mod.rs b/fc-agent/src/fuse/mod.rs index 05d18344..666cdf74 100644 --- a/fc-agent/src/fuse/mod.rs +++ b/fc-agent/src/fuse/mod.rs @@ -121,20 +121,3 @@ pub fn mount_vsock(port: u32, mount_point: &str) -> anyhow::Result<()> { ); fuse_pipe::mount_vsock_with_options(HOST_CID, port, mount_point, num_readers, trace_rate) } - -/// Mount a FUSE filesystem with multiple reader threads. -/// -/// Same as `mount_vsock` but creates multiple FUSE reader threads for -/// better parallel performance. 
-#[allow(dead_code)] -pub fn mount_vsock_with_readers( - port: u32, - mount_point: &str, - num_readers: usize, -) -> anyhow::Result<()> { - eprintln!( - "[fc-agent] mounting FUSE volume at {} via vsock port {} ({} readers)", - mount_point, port, num_readers - ); - fuse_pipe::mount_vsock_with_readers(HOST_CID, port, mount_point, num_readers) -} diff --git a/fuse-pipe/src/client/fuse.rs b/fuse-pipe/src/client/fuse.rs index 92a27ea0..92b25a45 100644 --- a/fuse-pipe/src/client/fuse.rs +++ b/fuse-pipe/src/client/fuse.rs @@ -605,7 +605,7 @@ impl Filesystem for FuseClient { }); match response { - VolumeResponse::Written { size } => reply.written(size), + VolumeResponse::Written { size } => reply.written(size as u32), VolumeResponse::Error { errno } => reply.error(Errno::from_i32(errno)), _ => reply.error(Errno::EIO), } @@ -990,26 +990,6 @@ impl Filesystem for FuseClient { } fn getxattr(&self, req: &Request, ino: INodeNo, name: &OsStr, size: u32, reply: ReplyXattr) { - // Fast path: The kernel calls getxattr("security.capability") on every write - // to check if file capabilities need to be cleared. This is extremely common - // and almost always returns ENODATA (no capabilities set). Short-circuit this - // to avoid the expensive server round-trip (~32µs savings per write). - // - // This is safe because: - // 1. If capabilities ARE set, they're preserved (we'd need setxattr to clear) - // 2. The kernel's capability check is advisory - it clears caps on successful write - // 3. Container workloads rarely use file capabilities - // - // Can be disabled via FCVM_NO_XATTR_FASTPATH=1 for debugging. 
- if std::env::var("FCVM_NO_XATTR_FASTPATH").is_err() { - if let Some(name_str) = name.to_str() { - if name_str == "security.capability" { - reply.error(Errno::ENODATA); - return; - } - } - } - let response = self.send_request_sync(VolumeRequest::Getxattr { ino: ino.into(), name: name.to_string_lossy().to_string(), @@ -1198,7 +1178,7 @@ impl Filesystem for FuseClient { }); match response { - VolumeResponse::Written { size } => reply.written(size), + VolumeResponse::Written { size } => reply.written(size as u32), VolumeResponse::Error { errno } => reply.error(Errno::from_i32(errno)), _ => reply.error(Errno::EIO), } @@ -1241,7 +1221,7 @@ impl Filesystem for FuseClient { ); match response { - VolumeResponse::Written { size } => reply.written(size), + VolumeResponse::Written { size } => reply.written(size as u32), VolumeResponse::Error { errno } => reply.error(Errno::from_i32(errno)), _ => reply.error(Errno::EIO), } diff --git a/fuse-pipe/src/protocol/response.rs b/fuse-pipe/src/protocol/response.rs index ea9279b9..9079136d 100644 --- a/fuse-pipe/src/protocol/response.rs +++ b/fuse-pipe/src/protocol/response.rs @@ -31,7 +31,7 @@ pub enum VolumeResponse { Data { data: Vec }, /// Number of bytes written. - Written { size: u32 }, + Written { size: u64 }, /// File opened response. 
Opened { fh: u64, flags: u32 }, diff --git a/fuse-pipe/src/server/passthrough.rs b/fuse-pipe/src/server/passthrough.rs index 730bf856..597b4466 100644 --- a/fuse-pipe/src/server/passthrough.rs +++ b/fuse-pipe/src/server/passthrough.rs @@ -656,7 +656,7 @@ impl FilesystemHandler for PassthroughFs { ) { Ok(n) => { tracing::debug!(target: "passthrough", fh, written = n, "write succeeded"); - VolumeResponse::Written { size: n as u32 } + VolumeResponse::Written { size: n as u64 } } Err(e) => { tracing::debug!(target: "passthrough", fh, error = ?e, "write failed"); @@ -1149,7 +1149,7 @@ impl FilesystemHandler for PassthroughFs { ) { Ok(n) => { tracing::debug!(target: "passthrough", copied = n, "copy_file_range succeeded"); - VolumeResponse::Written { size: n as u32 } + VolumeResponse::Written { size: n as u64 } } Err(e) => { tracing::debug!(target: "passthrough", error = ?e, "copy_file_range failed"); @@ -1190,7 +1190,7 @@ impl FilesystemHandler for PassthroughFs { ) { Ok(n) => { tracing::debug!(target: "passthrough", cloned = n, "remap_file_range succeeded"); - VolumeResponse::Written { size: n as u32 } + VolumeResponse::Written { size: n as u64 } } Err(e) => { tracing::debug!(target: "passthrough", error = ?e, "remap_file_range failed"); @@ -1607,7 +1607,7 @@ mod tests { // For whole-file clone (len=0), we return the file size on success assert_eq!( size, - test_data.len() as u32, + test_data.len() as u64, "FICLONE should return file size for whole file (len=0)" ); @@ -1726,7 +1726,7 @@ mod tests { match resp { VolumeResponse::Written { size } => { eprintln!("FICLONERANGE succeeded, size={}", size); - assert_eq!(size, block_size as u32, "should clone requested size"); + assert_eq!(size, block_size as u64, "should clone requested size"); // Verify: first block of dest should equal second block of source let resp = fs.read(dst_ino, dst_fh, 0, block_size as u32, uid, gid, 0); diff --git a/src/commands/podman.rs b/src/commands/podman.rs index b8cf4880..0591bcc0 100644 --- 
a/src/commands/podman.rs +++ b/src/commands/podman.rs @@ -479,12 +479,16 @@ async fn create_disk_from_dir( ); // Create sparse file - tokio::process::Command::new("truncate") + let truncate_status = tokio::process::Command::new("truncate") .args(["-s", &image_size.to_string(), output_path.to_str().unwrap()]) .status() .await .context("creating sparse file")?; + if !truncate_status.success() { + bail!("truncate failed with exit code: {:?}", truncate_status.code()); + } + // Format as ext4 let mkfs = tokio::process::Command::new("mkfs.ext4") .args(["-q", "-F", output_path.to_str().unwrap()]) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index d662170c..371d77dd 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -343,8 +343,8 @@ ip addr add {namespace_ip}/24 dev {bridge} cmd.arg(namespace_pid.to_string()) .arg(&self.slirp_device) .stdin(Stdio::null()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()); + .stdout(Stdio::null()) + .stderr(Stdio::null()); let child = cmd.spawn().context("failed to spawn slirp4netns")?; diff --git a/src/state/manager.rs b/src/state/manager.rs index 0b596ec7..80efe296 100644 --- a/src/state/manager.rs +++ b/src/state/manager.rs @@ -524,20 +524,18 @@ impl StateManager { // Note: We rely on state file cleanup (cleanup_stale_state) to handle dead processes. // We don't check if port 8080 is available because wildcard binds (0.0.0.0:8080) // would cause false negatives. Real port conflicts are detected at slirp4netns add_hostfwd time. 
- let ip = (|| { + let ip = (|| -> Result { for b2 in 0..=255u8 { for b3 in 2..=254u8 { // Skip 127.0.0.1 (localhost) let ip = format!("127.0.{}.{}", b2, b3); if !used_ips.contains(&ip) { - return ip; + return Ok(ip); } } } - // Fallback if all IPs are used (very unlikely - 65,000+ IPs) - tracing::warn!("all loopback IPs in use, reusing 127.0.0.2"); - "127.0.0.2".to_string() - })(); + anyhow::bail!("all loopback IPs exhausted (65,000+ VMs)") + })()?; // Update VM state with the allocated IP and SAVE WHILE HOLDING THE LOCK // This ensures no other process can allocate the same IP diff --git a/src/storage/disk.rs b/src/storage/disk.rs index 25c0e8ed..cedf2ac5 100644 --- a/src/storage/disk.rs +++ b/src/storage/disk.rs @@ -196,10 +196,20 @@ pub async fn ensure_free_space( } // Check filesystem before resize (required by resize2fs) - let _ = tokio::process::Command::new("e2fsck") + let e2fsck_output = tokio::process::Command::new("e2fsck") .args(["-f", "-y", disk_path.to_string_lossy().as_ref()]) .output() - .await; + .await + .context("running e2fsck")?; + + // e2fsck exit codes: 0=clean, 1=corrected, 2=corrected+reboot needed + // Exit code >= 4 means uncorrected errors + if e2fsck_output.status.code().unwrap_or(1) >= 4 { + bail!( + "e2fsck found uncorrectable errors: {}", + String::from_utf8_lossy(&e2fsck_output.stderr) + ); + } // Resize ext4 filesystem to fill the new space let output = tokio::process::Command::new("resize2fs") @@ -256,5 +266,6 @@ pub fn parse_size(s: &str) -> Result { .parse() .with_context(|| format!("parsing size number '{}'", num_str))?; - Ok(num * multiplier) + num.checked_mul(multiplier) + .with_context(|| format!("size overflow: {} * {}", num, multiplier)) } From dd526b1f37226135be8718cdf26e75127f4b10e5 Mon Sep 17 00:00:00 2001 From: "claude[bot]" Date: Sat, 7 Feb 2026 00:49:23 +0000 Subject: [PATCH 12/12] fix: e2fsck signal-kill default and stale README reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit - Change e2fsck exit code default from unwrap_or(1) to unwrap_or(8) so signal-killed processes are treated as fatal errors instead of passing the >= 4 check. A signal-killed e2fsck means the filesystem check did not complete, so resize2fs should not proceed. - Remove stale FCVM_NO_XATTR_FASTPATH env var from README since the xattr fast-path was removed in this PR. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- README.md | 1 - src/storage/disk.rs | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3a03481f..9fe4e381 100644 --- a/README.md +++ b/README.md @@ -877,7 +877,6 @@ See [DESIGN.md](DESIGN.md#guest-agent) for details. | `RUST_LOG` | `warn` | Logging level (quiet by default; use `info` or `debug` for verbose) | | `FCVM_NO_SNAPSHOT` | unset | Set to `1` to disable automatic snapshot creation (same as `--no-snapshot` flag) | | `FCVM_NO_WRITEBACK_CACHE` | unset | Set to `1` to disable FUSE writeback cache (see below) | -| `FCVM_NO_XATTR_FASTPATH` | unset | Set to `1` to disable security.capability xattr fast path | ### FUSE Writeback Cache diff --git a/src/storage/disk.rs b/src/storage/disk.rs index cedf2ac5..9dd5b641 100644 --- a/src/storage/disk.rs +++ b/src/storage/disk.rs @@ -204,7 +204,8 @@ pub async fn ensure_free_space( // e2fsck exit codes: 0=clean, 1=corrected, 2=corrected+reboot needed // Exit code >= 4 means uncorrected errors - if e2fsck_output.status.code().unwrap_or(1) >= 4 { + // If killed by signal (code() returns None), treat as fatal (8 = operational error) + if e2fsck_output.status.code().unwrap_or(8) >= 4 { bail!( "e2fsck found uncorrectable errors: {}", String::from_utf8_lossy(&e2fsck_output.stderr)