diff --git a/DESIGN.md b/DESIGN.md index a8db8ea7..ae83a1b9 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -1677,6 +1677,28 @@ The fuse-pipe library passes the pjdfstest POSIX compliance suite. Tests run via --- +## Known Limitations + +### FUSE Volume Cache Coherency + +`--map` volumes use FUSE-over-vsock with `WRITEBACK_CACHE` and `AUTO_INVAL_DATA`. When a host process modifies a file in a mapped directory, the guest sees the change on its next read — but only after the kernel detects the mtime change (up to ~1 second granularity). Writes within the same second may not be visible immediately. + +Directory changes (new files, deletions) are subject to the kernel's directory entry cache TTL. A new file created on the host may not appear in guest `readdir()` until the cache expires. + +There are no push notifications from host to guest. The guest discovers changes only on access. inotify/fanotify in the guest watches the FUSE mount, not the host filesystem, so host-side changes don't trigger guest notifications. + +**Potential fix**: Use `FUSE_NOTIFY_INVAL_INODE` and `FUSE_NOTIFY_INVAL_ENTRY` — server-initiated invalidation notifications. The host VolumeServer would watch directories with inotify and push invalidations through the FUSE connection when files change. This is how production network filesystems (NFS, CIFS) handle it. + +### Nested VM Performance (NV2) + +ARM64 FEAT_NV2 has architectural issues with cache coherency under double Stage 2 translation. The DSB SY kernel patch fixes this for vsock/FUSE data paths, but multi-vCPU L2 VMs still hit interrupt delivery issues (NETDEV WATCHDOG). L2 VMs are limited to single vCPU. + +### Snapshot + FUSE Volumes + +Snapshots are disabled when `--map` volumes are present because the FUSE-over-vsock connection state may not survive the pause/resume cycle cleanly. This means VMs with volume mounts always do a fresh boot. Block device mounts (`--disk`, `--disk-dir`) do not have this limitation. + +--- + ## Future Enhancements ### Phase 2 (Post-MVP) diff --git a/README.md b/README.md index 817814cb..3a03481f 100644 --- a/README.md +++ b/README.md @@ -780,6 +780,8 @@ See [DESIGN.md](DESIGN.md#cli-interface) for architecture and design decisions. -t, --tty Allocate pseudo-TTY (for vim, colors, etc.) --setup Auto-setup if kernel/rootfs missing (rootless only) --no-snapshot Disable automatic snapshot creation (for testing) +--forward-localhost Forward localhost ports to host (e.g., 1421,9099) +--rootfs-size Minimum free space on rootfs (default: 10G) ``` **`fcvm exec`** - Execute in VM/container: diff --git a/fc-agent/src/fuse/mod.rs b/fc-agent/src/fuse/mod.rs index 05d18344..666cdf74 100644 --- a/fc-agent/src/fuse/mod.rs +++ b/fc-agent/src/fuse/mod.rs @@ -121,20 +121,3 @@ pub fn mount_vsock(port: u32, mount_point: &str) -> anyhow::Result<()> { ); fuse_pipe::mount_vsock_with_options(HOST_CID, port, mount_point, num_readers, trace_rate) } - -/// Mount a FUSE filesystem with multiple reader threads. -/// -/// Same as `mount_vsock` but creates multiple FUSE reader threads for -/// better parallel performance. -#[allow(dead_code)] -pub fn mount_vsock_with_readers( - port: u32, - mount_point: &str, - num_readers: usize, -) -> anyhow::Result<()> { - eprintln!( - "[fc-agent] mounting FUSE volume at {} via vsock port {} ({} readers)", - mount_point, port, num_readers - ); - fuse_pipe::mount_vsock_with_readers(HOST_CID, port, mount_point, num_readers) -} diff --git a/fc-agent/src/main.rs b/fc-agent/src/main.rs index 04721fb1..540b4532 100644 --- a/fc-agent/src/main.rs +++ b/fc-agent/src/main.rs @@ -33,6 +33,12 @@ struct Plan { /// Path to OCI archive for localhost/ images (run directly without import) #[serde(default)] image_archive: Option, + /// Run container as USER:GROUP (e.g., "1000:1000") + #[serde(default)] + user: Option, + /// Localhost ports to forward to host gateway via iptables DNAT + #[serde(default)] + forward_localhost: Vec, /// Run container in privileged mode (allows mknod, device access, etc.) #[serde(default)] privileged: bool, @@ -1693,23 +1699,29 @@ fn mount_fuse_volumes(volumes: &[VolumeMount]) -> Result> { mounted_paths.push(vol.guest_path.clone()); } - // Give FUSE mounts time to initialize - if !volumes.is_empty() { - eprintln!("[fc-agent] waiting for FUSE mounts to initialize..."); - std::thread::sleep(std::time::Duration::from_millis(500)); - - // Verify each mount point is accessible - for vol in volumes { - let path = std::path::Path::new(&vol.guest_path); + // Wait for each FUSE mount to become accessible (up to 30s per mount) + for vol in volumes { + let path = std::path::Path::new(&vol.guest_path); + let mut ready = false; + for attempt in 1..=60 { if let Ok(entries) = std::fs::read_dir(path) { let count = entries.count(); eprintln!( - "[fc-agent] ✓ mount {} accessible ({} entries)", - vol.guest_path, count + "[fc-agent] ✓ mount {} ready ({} entries, {}ms)", + vol.guest_path, + count, + (attempt - 1) * 500 ); - } else { - eprintln!("[fc-agent] ✗ mount {} NOT accessible", vol.guest_path); + ready = true; + break; } + std::thread::sleep(std::time::Duration::from_millis(500)); + } + if !ready { + return Err(anyhow::anyhow!( + "mount {} not accessible after 30s", + vol.guest_path + )); } } @@ -2240,6 +2252,38 @@ async fn run_agent() -> Result<()> { // Save proxy settings for exec commands to use save_proxy_settings(&plan); + // Forward specific localhost ports to host gateway via iptables DNAT. + // Only the listed ports are redirected — other localhost traffic stays local. + if !plan.forward_localhost.is_empty() { + let _ = std::process::Command::new("sysctl") + .args(["-w", "net.ipv4.conf.all.route_localnet=1"]) + .output(); + for port in &plan.forward_localhost { + let _ = std::process::Command::new("iptables") + .args([ + "-t", + "nat", + "-A", + "OUTPUT", + "-d", + "127.0.0.0/8", + "-p", + "tcp", + "--dport", + port, + "-j", + "DNAT", + "--to-destination", + "10.0.2.2", + ]) + .output(); + } + eprintln!( + "[fc-agent] ✓ forwarding localhost ports to host: {:?}", + plan.forward_localhost + ); + } + // Sync VM clock from host before launching container // This ensures TLS certificate validation works immediately if let Err(e) = sync_clock_from_host().await { @@ -2357,6 +2401,13 @@ async fn run_agent() -> Result<()> { let image_ref = if let Some(archive_path) = &plan.image_archive { eprintln!("[fc-agent] using Docker archive: {}", archive_path); + // Make block device readable by non-root (needed with --userns=keep-id) + if archive_path.starts_with("/dev/") { + let _ = std::process::Command::new("chmod") + .args(["444", archive_path]) + .output(); + } + format!("docker-archive:{}", archive_path) } else { // Pull image with retries to handle transient DNS/network errors @@ -2544,6 +2595,80 @@ async fn run_agent() -> Result<()> { "nofile=65536:65536".to_string(), ]; + // User mapping: run podman as the specified user with --userns=keep-id + // This replicates host behavior where rootless podman maps the user as root + // inside the container while keeping the real UID on shared mounts. + if let Some(ref user_spec) = plan.user { + // Parse "uid:gid" format + let parts: Vec<&str> = user_spec.split(':').collect(); + let uid = parts[0]; + let gid = parts.get(1).unwrap_or(&"100"); + let username = format!("fcvm-user"); + + eprintln!( + "[fc-agent] setting up user mapping: uid={} gid={}", + uid, gid + ); + + // Create group and user in the VM + let _ = std::process::Command::new("groupadd") + .args(["-g", gid, &username]) + .output(); + let _ = std::process::Command::new("useradd") + .args(["-u", uid, "-g", gid, "-m", "-s", "/bin/sh", &username]) + .output(); + + // Set up subuid/subgid for rootless podman + let subuid_entry = format!("{}:100000:65536\n", username); + let _ = std::fs::write("/etc/subuid", &subuid_entry); + let _ = std::fs::write("/etc/subgid", &subuid_entry); + + // Ensure XDG_RUNTIME_DIR exists for rootless podman + let runtime_dir = format!("/run/user/{}", uid); + let _ = std::fs::create_dir_all(&runtime_dir); + let _ = std::process::Command::new("chown") + .args([&format!("{}:{}", uid, gid), &runtime_dir]) + .output(); + + // Delegate cgroup subtree to the user for rootless podman + let cgroup_dir = format!("/sys/fs/cgroup/user.slice/user-{}.slice", uid); + let _ = std::fs::create_dir_all(&cgroup_dir); + let _ = std::process::Command::new("chown") + .args(["-R", &format!("{}:{}", uid, gid), &cgroup_dir]) + .output(); + // Enable controllers in the user's cgroup + for path in &[ + "/sys/fs/cgroup/cgroup.subtree_control", + &format!("{}/cgroup.subtree_control", cgroup_dir), + ] { + let _ = std::fs::write(path, "+cpu +memory +pids"); + } + + // Delegate fc-agent's own cgroup to the user so rootless podman can create sub-cgroups + if let Ok(cgroup_path) = std::fs::read_to_string("/proc/self/cgroup") { + // Format: "0::/system.slice/fc-agent.service" + if let Some(path) = cgroup_path.trim().strip_prefix("0::") { + let full_path = format!("/sys/fs/cgroup{}", path); + let _ = std::process::Command::new("chown") + .args(["-R", &format!("{}:{}", uid, gid), &full_path]) + .output(); + eprintln!("[fc-agent] delegated cgroup {} to user {}", full_path, uid); + } + } + + // Remove --cgroups=split (rootless podman uses cgroupfs, not split) + podman_args.retain(|a| a != "--cgroups=split"); + + // Add --userns=keep-id to podman args (replicates host behavior) + podman_args.push("--userns=keep-id".to_string()); + + // Wrap entire command with runuser to run podman as the target user + podman_args.insert(0, "--".to_string()); + podman_args.insert(0, username.clone()); + podman_args.insert(0, "-u".to_string()); + podman_args.insert(0, "runuser".to_string()); + } + // Privileged mode: allows mknod, device access, etc. for POSIX compliance tests if plan.privileged { eprintln!("[fc-agent] privileged mode enabled"); diff --git a/fuse-pipe/src/client/fuse.rs b/fuse-pipe/src/client/fuse.rs index 92a27ea0..92b25a45 100644 --- a/fuse-pipe/src/client/fuse.rs +++ b/fuse-pipe/src/client/fuse.rs @@ -605,7 +605,7 @@ impl Filesystem for FuseClient { }); match response { - VolumeResponse::Written { size } => reply.written(size), + VolumeResponse::Written { size } => reply.written(size as u32), VolumeResponse::Error { errno } => reply.error(Errno::from_i32(errno)), _ => reply.error(Errno::EIO), } @@ -990,26 +990,6 @@ impl Filesystem for FuseClient { } fn getxattr(&self, req: &Request, ino: INodeNo, name: &OsStr, size: u32, reply: ReplyXattr) { - // Fast path: The kernel calls getxattr("security.capability") on every write - // to check if file capabilities need to be cleared. This is extremely common - // and almost always returns ENODATA (no capabilities set). Short-circuit this - // to avoid the expensive server round-trip (~32µs savings per write). - // - // This is safe because: - // 1. If capabilities ARE set, they're preserved (we'd need setxattr to clear) - // 2. The kernel's capability check is advisory - it clears caps on successful write - // 3. Container workloads rarely use file capabilities - // - // Can be disabled via FCVM_NO_XATTR_FASTPATH=1 for debugging. - if std::env::var("FCVM_NO_XATTR_FASTPATH").is_err() { - if let Some(name_str) = name.to_str() { - if name_str == "security.capability" { - reply.error(Errno::ENODATA); - return; - } - } - } - let response = self.send_request_sync(VolumeRequest::Getxattr { ino: ino.into(), name: name.to_string_lossy().to_string(), @@ -1198,7 +1178,7 @@ impl Filesystem for FuseClient { }); match response { - VolumeResponse::Written { size } => reply.written(size), + VolumeResponse::Written { size } => reply.written(size as u32), VolumeResponse::Error { errno } => reply.error(Errno::from_i32(errno)), _ => reply.error(Errno::EIO), } @@ -1241,7 +1221,7 @@ impl Filesystem for FuseClient { ); match response { - VolumeResponse::Written { size } => reply.written(size), + VolumeResponse::Written { size } => reply.written(size as u32), VolumeResponse::Error { errno } => reply.error(Errno::from_i32(errno)), _ => reply.error(Errno::EIO), } diff --git a/fuse-pipe/src/protocol/response.rs b/fuse-pipe/src/protocol/response.rs index ea9279b9..9079136d 100644 --- a/fuse-pipe/src/protocol/response.rs +++ b/fuse-pipe/src/protocol/response.rs @@ -31,7 +31,7 @@ pub enum VolumeResponse { Data { data: Vec }, /// Number of bytes written. - Written { size: u32 }, + Written { size: u64 }, /// File opened response. Opened { fh: u64, flags: u32 }, diff --git a/fuse-pipe/src/server/passthrough.rs b/fuse-pipe/src/server/passthrough.rs index 730bf856..597b4466 100644 --- a/fuse-pipe/src/server/passthrough.rs +++ b/fuse-pipe/src/server/passthrough.rs @@ -656,7 +656,7 @@ impl FilesystemHandler for PassthroughFs { ) { Ok(n) => { tracing::debug!(target: "passthrough", fh, written = n, "write succeeded"); - VolumeResponse::Written { size: n as u32 } + VolumeResponse::Written { size: n as u64 } } Err(e) => { tracing::debug!(target: "passthrough", fh, error = ?e, "write failed"); @@ -1149,7 +1149,7 @@ impl FilesystemHandler for PassthroughFs { ) { Ok(n) => { tracing::debug!(target: "passthrough", copied = n, "copy_file_range succeeded"); - VolumeResponse::Written { size: n as u32 } + VolumeResponse::Written { size: n as u64 } } Err(e) => { tracing::debug!(target: "passthrough", error = ?e, "copy_file_range failed"); @@ -1190,7 +1190,7 @@ impl FilesystemHandler for PassthroughFs { ) { Ok(n) => { tracing::debug!(target: "passthrough", cloned = n, "remap_file_range succeeded"); - VolumeResponse::Written { size: n as u32 } + VolumeResponse::Written { size: n as u64 } } Err(e) => { tracing::debug!(target: "passthrough", error = ?e, "remap_file_range failed"); @@ -1607,7 +1607,7 @@ mod tests { // For whole-file clone (len=0), we return the file size on success assert_eq!( size, - test_data.len() as u32, + test_data.len() as u64, "FICLONE should return file size for whole file (len=0)" ); @@ -1726,7 +1726,7 @@ mod tests { match resp { VolumeResponse::Written { size } => { eprintln!("FICLONERANGE succeeded, size={}", size); - assert_eq!(size, block_size as u32, "should clone requested size"); + assert_eq!(size, block_size as u64, "should clone requested size"); // Verify: first block of dest should equal second block of source let resp = fs.read(dst_ino, dst_fh, 0, block_size as u32, uid, gid, 0); diff --git a/rootfs-config.toml b/rootfs-config.toml index f1e90ca7..e51cd6f0 100644 --- a/rootfs-config.toml +++ b/rootfs-config.toml @@ -57,7 +57,7 @@ path = "opt/kata/share/kata-containers/vmlinux-6.12.47-173" [packages] # Container runtime -runtime = ["podman", "crun", "fuse-overlayfs", "skopeo"] +runtime = ["podman", "crun", "fuse-overlayfs", "skopeo", "uidmap"] # FUSE support for overlay filesystem fuse = ["fuse3"] diff --git a/src/cli/args.rs b/src/cli/args.rs index 9e41e7c5..cf7ec45c 100644 --- a/src/cli/args.rs +++ b/src/cli/args.rs @@ -169,6 +169,17 @@ pub struct RunArgs { #[arg(long)] pub health_check: Option, + /// Run container as USER:GROUP (e.g., --user 1000:1000) + /// Equivalent to podman run --userns=keep-id on the host + #[arg(long)] + pub user: Option, + + /// Forward specific localhost ports to the host gateway via iptables DNAT. + /// Enables containers to reach host-only services via localhost. + /// Comma-separated port list, e.g., --forward-localhost 1421,9099 + #[arg(long, value_delimiter = ',')] + pub forward_localhost: Vec, + /// Run container in privileged mode (allows mknod, device access, etc.) /// Use for POSIX compliance tests that need full filesystem capabilities #[arg(long)] diff --git a/src/commands/podman.rs b/src/commands/podman.rs index 155320f3..f4f0fc8c 100644 --- a/src/commands/podman.rs +++ b/src/commands/podman.rs @@ -479,12 +479,19 @@ async fn create_disk_from_dir( ); // Create sparse file - tokio::process::Command::new("truncate") + let truncate_status = tokio::process::Command::new("truncate") .args(["-s", &image_size.to_string(), output_path.to_str().unwrap()]) .status() .await .context("creating sparse file")?; + if !truncate_status.success() { + bail!( + "truncate failed with exit code: {:?}", + truncate_status.code() + ); + } + // Format as ext4 let mkfs = tokio::process::Command::new("mkfs.ext4") .args(["-q", "-F", output_path.to_str().unwrap()]) @@ -841,21 +848,17 @@ pub(crate) async fn run_output_listener( None }; - // Read lines until connection closes + // Read lines until connection closes (no read timeout — large image imports + // can take 10+ minutes during which fc-agent produces no output) loop { line_buf.clear(); - match tokio::time::timeout( - std::time::Duration::from_secs(300), // 5 min read timeout - reader.read_line(&mut line_buf), - ) - .await - { - Ok(Ok(0)) => { + match reader.read_line(&mut line_buf).await { + Ok(0) => { // EOF - connection closed debug!(vm_id = %vm_id, "Output connection closed"); break; } - Ok(Ok(_)) => { + Ok(_) => { // Parse raw line format: stream:content let line = line_buf.trim_end(); if let Some((stream, content)) = line.split_once(':') { @@ -873,15 +876,10 @@ pub(crate) async fn run_output_listener( let _ = w.write_all(b"ack\n").await; } } - Ok(Err(e)) => { + Err(e) => { warn!(vm_id = %vm_id, error = %e, "Error reading output"); break; } - Err(_) => { - // Read timeout - debug!(vm_id = %vm_id, "Output read timeout"); - break; - } } } @@ -989,15 +987,13 @@ async fn cmd_podman_run(args: RunArgs) -> Result<()> { None }; - // Check for snapshot cache (unless --no-snapshot is set, FCVM_NO_SNAPSHOT env var, or localhost image) - // Localhost images have tarball paths in MMDS that won't exist on restore + // Check for snapshot cache (unless --no-snapshot is set or FCVM_NO_SNAPSHOT env var) // Keep fc_config and snapshot_key available for later snapshot creation on miss let no_snapshot = args.no_snapshot || std::env::var("FCVM_NO_SNAPSHOT").is_ok(); - let is_localhost_image = args.image.starts_with("localhost/"); let (fc_config, snapshot_key): ( Option, Option, - ) = if !no_snapshot && !is_localhost_image { + ) = if !no_snapshot { // Get image identifier for cache key computation let image_identifier = get_image_identifier(&args.image).await?; let config = build_firecracker_config( @@ -1068,9 +1064,6 @@ async fn cmd_podman_run(args: RunArgs) -> Result<()> { "Snapshot miss, will create snapshot after image load" ); (Some(config), Some(key)) - } else if is_localhost_image { - info!("Snapshot disabled for localhost image (tarball path won't exist on restore)"); - (None, None) } else { if std::env::var("FCVM_NO_SNAPSHOT").is_ok() { info!("Snapshot disabled via FCVM_NO_SNAPSHOT environment variable"); @@ -1361,9 +1354,7 @@ async fn cmd_podman_run(args: RunArgs) -> Result<()> { // Skip snapshot creation when: // - --no-snapshot flag or FCVM_NO_SNAPSHOT env var is set // - Volumes are specified (FUSE-over-vsock breaks during snapshot pause) - // - Localhost images (tarball path in MMDS won't exist on restore) - // Note: no_snapshot and is_localhost_image are already defined above - let skip_snapshot_creation = no_snapshot || !args.map.is_empty() || is_localhost_image; + let skip_snapshot_creation = no_snapshot || !args.map.is_empty(); if !args.map.is_empty() && !no_snapshot { info!( "Skipping snapshot creation: volumes specified (FUSE doesn't survive snapshot pause)" @@ -1381,7 +1372,7 @@ async fn cmd_podman_run(args: RunArgs) -> Result<()> { // Create startup snapshot channel for health-triggered snapshot creation // Only create startup snapshots if: - // - Not skipping snapshots (no --no-snapshot, no volumes, not localhost image) + // - Not skipping snapshots (no --no-snapshot, no volumes) // - Have a snapshot key // - Have a health_check URL configured (HTTP health check, not just container-ready) let (startup_tx, mut startup_rx): ( @@ -2641,6 +2632,8 @@ async fn run_vm_setup( "nfs_mounts": nfs_mounts, "image_archive": image_device.clone(), "privileged": args.privileged, + "forward_localhost": args.forward_localhost.iter().map(|p| p.to_string()).collect::>(), + "user": args.user.as_deref(), "interactive": args.interactive, "tty": args.tty, // Use network-provided proxy, or fall back to environment variables. diff --git a/src/health.rs b/src/health.rs index 4c034039..f43dfc5e 100644 --- a/src/health.rs +++ b/src/health.rs @@ -204,7 +204,7 @@ async fn check_container_running(pid: u32) -> bool { None => return false, // Can't find fcvm binary }; - let cmd_future = tokio::process::Command::new(&exe) + let child = match tokio::process::Command::new(&exe) .args([ "exec", "--pid", @@ -217,20 +217,31 @@ async fn check_container_running(pid: u32) -> bool { "{{.State.Running}}", "fcvm-container", ]) - .output(); - - let output = match tokio::time::timeout(HEALTH_CHECK_EXEC_TIMEOUT, cmd_future).await { - Ok(Ok(o)) => o, - Ok(Err(e)) => { - debug!(target: "health-monitor", error = %e, "podman inspect exec failed"); - return false; - } - Err(_) => { - debug!(target: "health-monitor", "podman inspect exec timed out"); + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .kill_on_drop(true) + .spawn() + { + Ok(c) => c, + Err(e) => { + debug!(target: "health-monitor", error = %e, "podman inspect spawn failed"); return false; } }; + let output = + match tokio::time::timeout(HEALTH_CHECK_EXEC_TIMEOUT, child.wait_with_output()).await { + Ok(Ok(o)) => o, + Ok(Err(e)) => { + debug!(target: "health-monitor", error = %e, "podman inspect exec failed"); + return false; + } + Err(_) => { + debug!(target: "health-monitor", "podman inspect exec timed out"); + return false; + } + }; + if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); debug!(target: "health-monitor", stderr = %stderr, "podman inspect failed"); @@ -256,7 +267,7 @@ async fn check_podman_healthcheck(pid: u32) -> Option { None => return Some(true), // Can't find fcvm binary, assume healthy }; - let cmd_future = tokio::process::Command::new(&exe) + let child = match tokio::process::Command::new(&exe) .args([ "exec", "--pid", @@ -269,12 +280,24 @@ async fn check_podman_healthcheck(pid: u32) -> Option { "{{.State.Health.Status}}", "fcvm-container", ]) - .output(); + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .kill_on_drop(true) + .spawn() + { + Ok(c) => c, + Err(e) => { + debug!(target: "health-monitor", error = %e, "podman healthcheck spawn failed"); + return Some(false); + } + }; - let output = match tokio::time::timeout(HEALTH_CHECK_EXEC_TIMEOUT, cmd_future).await { + // kill_on_drop ensures the child is killed if the timeout fires + let output = match tokio::time::timeout(HEALTH_CHECK_EXEC_TIMEOUT, child.wait_with_output()) + .await + { Ok(Ok(o)) => o, Ok(Err(e)) => { - // Exec not available yet, don't assume healthy - keep checking debug!(target: "health-monitor", error = %e, "podman healthcheck exec failed, will retry"); return Some(false); } diff --git a/src/network/slirp.rs b/src/network/slirp.rs index d662170c..371d77dd 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -343,8 +343,8 @@ ip addr add {namespace_ip}/24 dev {bridge} cmd.arg(namespace_pid.to_string()) .arg(&self.slirp_device) .stdin(Stdio::null()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()); + .stdout(Stdio::null()) + .stderr(Stdio::null()); let child = cmd.spawn().context("failed to spawn slirp4netns")?; diff --git a/src/state/manager.rs b/src/state/manager.rs index 0b596ec7..80efe296 100644 --- a/src/state/manager.rs +++ b/src/state/manager.rs @@ -524,20 +524,18 @@ impl StateManager { // Note: We rely on state file cleanup (cleanup_stale_state) to handle dead processes. // We don't check if port 8080 is available because wildcard binds (0.0.0.0:8080) // would cause false negatives. Real port conflicts are detected at slirp4netns add_hostfwd time. - let ip = (|| { + let ip = (|| -> Result { for b2 in 0..=255u8 { for b3 in 2..=254u8 { // Skip 127.0.0.1 (localhost) let ip = format!("127.0.{}.{}", b2, b3); if !used_ips.contains(&ip) { - return ip; + return Ok(ip); } } } - // Fallback if all IPs are used (very unlikely - 65,000+ IPs) - tracing::warn!("all loopback IPs in use, reusing 127.0.0.2"); - "127.0.0.2".to_string() - })(); + anyhow::bail!("all loopback IPs exhausted (65,000+ VMs)") + })()?; // Update VM state with the allocated IP and SAVE WHILE HOLDING THE LOCK // This ensures no other process can allocate the same IP diff --git a/src/storage/disk.rs b/src/storage/disk.rs index 25c0e8ed..cedf2ac5 100644 --- a/src/storage/disk.rs +++ b/src/storage/disk.rs @@ -196,10 +196,20 @@ pub async fn ensure_free_space( } // Check filesystem before resize (required by resize2fs) - let _ = tokio::process::Command::new("e2fsck") + let e2fsck_output = tokio::process::Command::new("e2fsck") .args(["-f", "-y", disk_path.to_string_lossy().as_ref()]) .output() - .await; + .await + .context("running e2fsck")?; + + // e2fsck exit codes: 0=clean, 1=corrected, 2=corrected+reboot needed + // Exit code >= 4 means uncorrected errors + if e2fsck_output.status.code().unwrap_or(1) >= 4 { + bail!( + "e2fsck found uncorrectable errors: {}", + String::from_utf8_lossy(&e2fsck_output.stderr) + ); + } // Resize ext4 filesystem to fill the new space let output = tokio::process::Command::new("resize2fs") @@ -256,5 +266,6 @@ pub fn parse_size(s: &str) -> Result { .parse() .with_context(|| format!("parsing size number '{}'", num_str))?; - Ok(num * multiplier) + num.checked_mul(multiplier) + .with_context(|| format!("size overflow: {} * {}", num, multiplier)) } diff --git a/tests/test_forward_localhost.rs b/tests/test_forward_localhost.rs new file mode 100644 index 00000000..6b71e49d --- /dev/null +++ b/tests/test_forward_localhost.rs @@ -0,0 +1,71 @@ +//! Test --forward-localhost flag: VM localhost reaches host services. + +#![cfg(feature = "privileged-tests")] + +mod common; + +use anyhow::{Context, Result}; +use std::net::TcpListener; + +/// Test that --forward-localhost makes VM's 127.0.0.1 reach host services. +#[tokio::test] +async fn test_forward_localhost() -> Result<()> { + println!("\nTest --forward-localhost"); + println!("========================"); + + // Start a TCP server on host 127.0.0.1 + let listener = TcpListener::bind("127.0.0.1:0")?; + let port = listener.local_addr()?.port(); + println!(" Host server on 127.0.0.1:{}", port); + + // Accept in background + let accept_handle = tokio::task::spawn_blocking(move || { + listener.set_nonblocking(false).ok(); + if let Ok((mut conn, _)) = listener.accept() { + use std::io::Write; + let _ = conn.write_all(b"HELLO\n"); + } + }); + + let (vm_name, _, _, _) = common::unique_names("fwd-localhost"); + + let port_str = port.to_string(); + let (_, pid) = common::spawn_fcvm(&[ + "podman", + "run", + "--name", + &vm_name, + "--forward-localhost", + &port_str, + "--no-snapshot", + common::TEST_IMAGE, + ]) + .await + .context("spawning fcvm")?; + + common::poll_health_by_pid(pid, 300).await?; + println!(" VM healthy"); + + // From inside VM, connect to localhost:port — should reach host + let result = common::exec_in_vm( + pid, + &[ + "sh", + "-c", + &format!("nc -w5 127.0.0.1 {} || echo FAILED", port), + ], + ) + .await?; + + println!(" Result: {}", result.trim()); + assert!( + result.contains("HELLO"), + "VM localhost should reach host with --forward-localhost (got: {})", + result.trim() + ); + + common::kill_process(pid).await; + accept_handle.abort(); + println!("✅ FORWARD LOCALHOST TEST PASSED!"); + Ok(()) +}