diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a2381ba5..d3ca6346 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -104,9 +104,7 @@ jobs: path: fcvm - uses: ./fcvm/.github/actions/checkout-deps - name: Install Rust - uses: dtolnay/rust-toolchain@stable - with: - components: rustfmt, clippy + uses: dtolnay/rust-toolchain@1.93.0 - name: Get dependency SHAs id: deps run: | @@ -151,7 +149,7 @@ jobs: path: fcvm - uses: ./fcvm/.github/actions/checkout-deps - name: Install Rust - uses: dtolnay/rust-toolchain@stable + uses: dtolnay/rust-toolchain@1.93.0 - name: Get dependency SHAs id: deps run: | @@ -182,7 +180,7 @@ jobs: path: fcvm - uses: ./fcvm/.github/actions/checkout-deps - name: Install Rust - uses: dtolnay/rust-toolchain@stable + uses: dtolnay/rust-toolchain@1.93.0 - name: Get dependency SHAs id: deps run: | @@ -247,8 +245,10 @@ jobs: - uses: ./fcvm/.github/actions/checkout-deps - name: Install Rust run: | - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + curl --proto '=https' --tlsv1.2 --retry 10 --retry-connrefused --location --silent --show-error --fail https://sh.rustup.rs | sh -s -- --default-toolchain none -y echo "$HOME/.cargo/bin" >> $GITHUB_PATH + # rust-toolchain.toml handles version + components; just trigger install + rustup show active-toolchain || true - name: Get dependency SHAs id: deps run: | @@ -462,8 +462,10 @@ jobs: - uses: ./fcvm/.github/actions/checkout-deps - name: Install Rust run: | - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + curl --proto '=https' --tlsv1.2 --retry 10 --retry-connrefused --location --silent --show-error --fail https://sh.rustup.rs | sh -s -- --default-toolchain none -y echo "$HOME/.cargo/bin" >> $GITHUB_PATH + # rust-toolchain.toml handles version + components; just trigger install + rustup show active-toolchain || true - name: Get dependency SHAs id: deps run: | diff --git a/Makefile b/Makefile index c2fcbfc1..74d433d8 
100644 --- a/Makefile +++ b/Makefile @@ -242,7 +242,12 @@ check-disk: # CRITICAL: Uses fcvm's proper cleanup commands to handle btrfs CoW correctly clean-test-data: build @echo "==> Killing stale VM processes from previous runs..." - @sudo pkill -9 firecracker 2>/dev/null; sudo pkill -9 pasta 2>/dev/null; sleep 1; true + @sudo pkill -9 firecracker 2>/dev/null; sudo pkill -9 pasta 2>/dev/null; sudo kill -9 $$(pgrep -x sleep -P 1) 2>/dev/null; sleep 1; true + @echo "==> Cleaning stale network namespaces..." + @for ns in $$(sudo ip netns list 2>/dev/null | grep '^fcvm-' | awk '{print $$1}'); do sudo ip netns del "$$ns" 2>/dev/null && echo " deleted $$ns"; done; true + @echo "==> Cleaning stale iptables rules from fcvm VMs..." + @sudo iptables-save -t nat 2>/dev/null | grep -E 'MASQUERADE.*(172\.30\.|10\.0\.)' | sed 's/^-A//' | while read rule; do sudo iptables -t nat -D $$rule 2>/dev/null; done; true + @sudo ip6tables-save -t nat 2>/dev/null | grep 'MASQUERADE' | grep -v NETAVARK | sed 's/^-A//' | while read rule; do sudo ip6tables -t nat -D $$rule 2>/dev/null; done; true @echo "==> Force unmounting stale FUSE mounts..." 
@# Find and force unmount any FUSE mounts from previous test runs @mount | grep fuse | grep -E '/tmp|/var/tmp' | cut -d' ' -f3 | xargs -r -I{} fusermount3 -u -z {} 2>/dev/null || true diff --git a/benches/clone.rs b/benches/clone.rs index dff8f705..55794835 100644 --- a/benches/clone.rs +++ b/benches/clone.rs @@ -295,9 +295,48 @@ impl CloneFixture { } if !last_response.contains("200 OK") { + let clone_log = std::fs::read_to_string(&clone_log_path).unwrap_or_default(); + + let ss_check = Command::new("ss") + .args(["-tlnp", "src", &loopback_ip]) + .output() + .map(|o| String::from_utf8_lossy(&o.stdout).to_string()) + .unwrap_or_default(); + + let connect_err = TcpStream::connect_timeout( + &format!("{}:{}", loopback_ip, health_port).parse().unwrap(), + Duration::from_secs(2), + ) + .err() + .map(|e| format!("{}", e)) + .unwrap_or_else(|| "connect succeeded".to_string()); + + let log_tail: String = clone_log + .lines() + .rev() + .take(30) + .collect::<Vec<_>>() + .into_iter() + .rev() + .collect::<Vec<_>>() + .join("\n"); + panic!( - "unexpected HTTP response after 3 attempts: {}", - &last_response[..std::cmp::min(200, last_response.len())] + "clone HTTP failed after 3 attempts\n\ + addr: {}:{}\n\ + last_response: {} bytes\n\ + clone_pid: {}\n\ + connect_err: {}\n\ + \n=== listening sockets on {} ===\n{}\ + \n=== clone log (last 30 lines) ===\n{}", + loopback_ip, + health_port, + last_response.len(), + clone_pid, + connect_err, + loopback_ip, + ss_check, + log_tail, ); } diff --git a/benches/exec.rs b/benches/exec.rs index 98458064..549751a4 100644 --- a/benches/exec.rs +++ b/benches/exec.rs @@ -601,9 +601,73 @@ impl CloneFixture { } } if !http_ok { + // Comprehensive diagnostics for CI debugging + let clone_log = std::fs::read_to_string(&clone_log_path).unwrap_or_default(); + + // Check if pasta is running for this clone + let pasta_check = Command::new("pgrep") + .args(["-af", "pasta"]) + .output() + .map(|o| String::from_utf8_lossy(&o.stdout).to_string()) + .unwrap_or_else(|e|
format!("pgrep failed: {}", e)); + + // Check listening sockets on the loopback IP + let ss_check = Command::new("ss") + .args(["-tlnp", "src", &loopback_ip]) + .output() + .map(|o| String::from_utf8_lossy(&o.stdout).to_string()) + .unwrap_or_else(|e| format!("ss failed: {}", e)); + + // Try connecting and report exact error + let connect_err = TcpStream::connect_timeout( + &format!("{}:{}", loopback_ip, health_port).parse().unwrap(), + Duration::from_secs(2), + ) + .err() + .map(|e| format!("{}", e)) + .unwrap_or_else(|| "connect succeeded".to_string()); + + // Check stale sleep/pasta/fc processes + let stale_check = Command::new("sh") + .args([ + "-c", + "echo 'sleep:' $(pgrep -cx sleep); echo 'pasta:' $(pgrep -cx pasta); echo 'firecracker:' $(pgrep -cf firecracker)", + ]) + .output() + .map(|o| String::from_utf8_lossy(&o.stdout).to_string()) + .unwrap_or_default(); + + // Last 30 lines of clone log (full, not filtered) + let log_tail: String = clone_log + .lines() + .rev() + .take(30) + .collect::<Vec<_>>() + .into_iter() + .rev() + .collect::<Vec<_>>() + .join("\n"); + panic!( - "unexpected HTTP response after 10 attempts: {}", - &last_response[..std::cmp::min(200, last_response.len())] + "clone HTTP failed after 10 attempts\n\ + addr: {}:{}\n\ + last_response: {} bytes\n\ + clone_pid: {}\n\ + connect_err: {}\n\ + \n=== listening sockets on {} ===\n{}\ + \n=== pasta processes ===\n{}\ + \n=== stale process counts ===\n{}\ + \n=== clone log (last 30 lines) ===\n{}", + loopback_ip, + health_port, + last_response.len(), + clone_pid, + connect_err, + loopback_ip, + ss_check, + pasta_check, + stale_check, + log_tail, ); } diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 9b822e37..d66c90e4 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.92.0" +channel = "1.93.0" components = ["rustfmt", "clippy"] # musl target for statically linked fc-agent (portable across glibc versions) targets = ["aarch64-unknown-linux-musl",
"x86_64-unknown-linux-musl"] diff --git a/src/commands/common.rs b/src/commands/common.rs index f0b90580..dc808b68 100644 --- a/src/commands/common.rs +++ b/src/commands/common.rs @@ -87,7 +87,7 @@ async fn setup_namespace_mappings(pid: u32) -> anyhow::Result<()> { let self_ino = std::fs::metadata("/proc/self/ns/user") .context("reading own user namespace inode")? .ino(); - let ns_deadline = std::time::Instant::now() + std::time::Duration::from_secs(5); + let ns_deadline = std::time::Instant::now() + std::time::Duration::from_secs(30); loop { if std::fs::metadata(format!("/proc/{pid}/ns/user")) .map(|m| m.ino() != self_ino) @@ -157,18 +157,29 @@ pub async fn spawn_namespace_holder( loop { attempt += 1; - let child = tokio::process::Command::new(&holder_cmd[0]) - .args(&holder_cmd[1..]) + let mut cmd = tokio::process::Command::new(&holder_cmd[0]); + cmd.args(&holder_cmd[1..]) .stdin(std::process::Stdio::null()) .stdout(std::process::Stdio::null()) - .stderr(std::process::Stdio::piped()) - .spawn() - .with_context(|| { - format!( - "spawning namespace holder (attempt {}): {:?}", - attempt, holder_cmd - ) - })?; + .stderr(std::process::Stdio::piped()); + + // Kill holder if parent (fcvm) dies. Without this, holders orphan to init + // and accumulate when fcvm is SIGKILL'd or crashes. 
+ unsafe { + cmd.pre_exec(|| { + if libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) != 0 { + return Err(std::io::Error::last_os_error()); + } + Ok(()) + }); + } + + let child = cmd.spawn().with_context(|| { + format!( + "spawning namespace holder (attempt {}): {:?}", + attempt, holder_cmd + ) + })?; let holder_pid = child.id().context("getting holder process PID")?; if attempt > 1 { diff --git a/src/commands/podman/namespace.rs b/src/commands/podman/namespace.rs index 35395296..f7c731b5 100644 --- a/src/commands/podman/namespace.rs +++ b/src/commands/podman/namespace.rs @@ -212,7 +212,20 @@ pub(super) async fn setup_rootless_namespace( } debug!(tap_device = %tap_device, "TAP device verified"); - // Set holder_pid so VmManager uses nsenter + // Use pre_exec setns path (not nsenter) for rootless baselines. + // nsenter enters the user namespace internally, which clears PR_SET_PDEATHSIG + // (kernel zeros task->pdeath_signal on credential changes). The pre_exec setns + // path sets pdeathsig AFTER entering the user namespace, so Firecracker gets + // SIGKILL when fcvm dies. 
+ vm_manager.set_user_namespace_path(std::path::PathBuf::from(format!( + "/proc/{}/ns/user", + holder_pid + ))); + vm_manager.set_net_namespace_path(std::path::PathBuf::from(format!( + "/proc/{}/ns/net", + holder_pid + ))); + // Still track holder_pid for health checks (nsenter curl) and cleanup vm_manager.set_holder_pid(holder_pid); // Store holder_pid in state for health checks and cleanup diff --git a/src/commands/setup.rs b/src/commands/setup.rs index 0bb73a9c..0d9216e9 100644 --- a/src/commands/setup.rs +++ b/src/commands/setup.rs @@ -126,8 +126,7 @@ pub async fn cmd_setup(args: SetupArgs) -> Result<()> { .context("setting up fc-agent initrd")?; println!(" ✓ Initrd ready: {}", initrd_path.display()); - if args.kernel_profile.is_some() { - let profile_name = args.kernel_profile.as_ref().unwrap(); + if let Some(profile_name) = &args.kernel_profile { println!("\nFor '{}' profile, use:", profile_name); println!( " fcvm podman run --kernel-profile {} --privileged ...", diff --git a/src/firecracker/vm.rs b/src/firecracker/vm.rs index b8524cfc..91e1abc6 100644 --- a/src/firecracker/vm.rs +++ b/src/firecracker/vm.rs @@ -154,17 +154,19 @@ impl VmManager { // because pre_exec runs BEFORE nsenter would enter the namespace, and we need CAP_SYS_ADMIN // from the user namespace to do mount operations. let mut cmd = if self.user_namespace_path.is_some() { - // Use direct Firecracker - namespaces will be entered via setns in pre_exec - // This is required for rootless clones that need mount namespace isolation - info!(target: "vm", vm_id = %self.vm_id, "using pre_exec setns for rootless clone"); + // Use direct Firecracker - namespaces will be entered via setns in pre_exec. + // Used for ALL rootless VMs (baselines + clones) because nsenter's internal + // setns(CLONE_NEWUSER) clears PR_SET_PDEATHSIG, leaving Firecracker orphaned + // if fcvm is SIGKILL'd. The pre_exec path sets pdeathsig AFTER setns. 
+ info!(target: "vm", vm_id = %self.vm_id, "using pre_exec setns for rootless VM"); let mut c = Command::new(firecracker_bin); c.arg("--api-sock").arg(&self.socket_path); c } else if let Some(holder_pid) = self.holder_pid { - // Use nsenter to enter user+network namespace with preserved credentials - // --preserve-credentials keeps UID, GID, and supplementary groups (including kvm) - // This allows KVM access while being in the isolated network namespace - // NOTE: This path is for baseline VMs that don't need mount namespace isolation + // Fallback: nsenter to enter user+network namespace. + // NOTE: This path is not normally reached — rootless VMs now use pre_exec + // setns (above) which correctly preserves PR_SET_PDEATHSIG. nsenter's + // internal setns(CLONE_NEWUSER) clears pdeathsig. Kept as safety net. info!(target: "vm", vm_id = %self.vm_id, holder_pid = holder_pid, "using nsenter for rootless networking"); let mut c = Command::new("nsenter"); c.args([ diff --git a/tests/test_signal_cleanup.rs b/tests/test_signal_cleanup.rs index b809eae5..454542cf 100644 --- a/tests/test_signal_cleanup.rs +++ b/tests/test_signal_cleanup.rs @@ -503,9 +503,13 @@ fn test_sigkill_kills_firecracker_rootless() -> Result<()> { anyhow::bail!("VM did not become healthy within 120 seconds"); } - // Find the specific firecracker process for THIS VM + // Find the specific child processes for THIS VM let our_fc_pid = find_firecracker_for_fcvm(fcvm_pid); - println!("Our firecracker PID: {:?}", our_fc_pid); + let our_holder_pid = find_holder_for_fcvm(fcvm_pid); + println!( + "Our firecracker PID: {:?}, holder PID: {:?}", + our_fc_pid, our_holder_pid + ); assert!( our_fc_pid.is_some(), @@ -525,18 +529,20 @@ fn test_sigkill_kills_firecracker_rootless() -> Result<()> { let _ = fcvm.wait(); println!("fcvm process reaped"); - // PR_SET_PDEATHSIG delivers SIGKILL to Firecracker immediately when parent dies. + // PR_SET_PDEATHSIG delivers SIGKILL to child processes when parent dies. 
// Poll briefly in case of scheduling delay. let start = std::time::Instant::now(); while start.elapsed() < Duration::from_secs(5) { - if !process_exists(fc_pid) { + let fc_alive = process_exists(fc_pid); + let holder_alive = our_holder_pid.is_some_and(process_exists); + if !fc_alive && !holder_alive { break; } std::thread::sleep(common::POLL_INTERVAL); } - let still_running = process_exists(fc_pid); - if still_running { + let fc_still_running = process_exists(fc_pid); + if fc_still_running { println!( "BUG: firecracker (PID {}) still running after fcvm SIGKILL!", fc_pid @@ -544,11 +550,28 @@ let _ = send_signal(fc_pid, "KILL"); } assert!( - !still_running, + !fc_still_running, "firecracker (PID {}) should die via PR_SET_PDEATHSIG when fcvm is SIGKILL'd", fc_pid ); + if let Some(holder_pid) = our_holder_pid { + let holder_still_running = process_exists(holder_pid); + if holder_still_running { + println!( + "BUG: namespace holder (PID {}) still running after fcvm SIGKILL!", + holder_pid + ); + let _ = send_signal(holder_pid, "KILL"); + } + assert!( + !holder_still_running, + "namespace holder (PID {}) should die via PR_SET_PDEATHSIG when fcvm is SIGKILL'd", + holder_pid + ); + println!("Holder PID {} correctly cleaned up", holder_pid); + } + assert!( !process_exists(fcvm_pid), "fcvm process (PID {}) should be dead", @@ -624,6 +647,27 @@ fn find_pasta_for_fcvm(fcvm_pid: u32) -> Option<u32> { None } +fn find_holder_for_fcvm(fcvm_pid: u32) -> Option<u32> { + let output = Command::new("pgrep") + .args(["-f", "sleep infinity"]) + .output() + .ok()?; + + if !output.status.success() { + return None; + } + + let stdout = String::from_utf8_lossy(&output.stdout); + for line in stdout.lines() { + if let Ok(pid) = line.trim().parse::<u32>() { + if is_descendant_of(pid, fcvm_pid) { + return Some(pid); + } + } + } + None +} + /// Check if a process is a descendant of another process fn is_descendant_of(pid: u32, ancestor_pid: u32) ->
bool { let mut current = pid; diff --git a/tests/test_vsock_connect_stress.rs b/tests/test_vsock_connect_stress.rs index 4e7f7344..38cc4f1f 100644 --- a/tests/test_vsock_connect_stress.rs +++ b/tests/test_vsock_connect_stress.rs @@ -23,7 +23,7 @@ use std::time::{Duration, Instant}; const EXEC_ITERATIONS: usize = 20; /// Maximum acceptable time for a single exec (milliseconds) -const MAX_EXEC_MS: u128 = 2000; +const MAX_EXEC_MS: u128 = 5000; /// Number of kill-on-drop iterations per round const KILL_ON_DROP_ITERATIONS: usize = 5; @@ -176,7 +176,7 @@ async fn kill_on_drop_stress(pid: u32, label: &str) -> Result<()> { /// Full lifecycle stress test for vsock CONNECT reliability. /// /// Tests exec through: baseline → clone → clone-of-clone → parallel clones → sibling death. -/// Every exec must succeed and complete in under 2 seconds. +/// Every exec must succeed and complete within MAX_EXEC_MS. #[cfg(feature = "privileged-tests")] #[tokio::test] async fn test_vsock_connect_lifecycle_stress_bridged() -> Result<()> {