diff --git a/DESIGN.md b/DESIGN.md index e4b2815d..c1c573a4 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -469,7 +469,7 @@ pasta <──────────────────┼── pasta0 2. Pre-pasta setup via nsenter: create Firecracker TAP device only 3. Start pasta attached to holder's namespace (creates pasta0 TAP) 4. Post-pasta setup via nsenter: create bridge, attach pasta0 + tap-fc, add namespace IP -5. Run Firecracker via nsenter: `nsenter -t HOLDER_PID -U -n -- firecracker ...` +5. Run Firecracker via pre_exec setns (enters user+net namespace, preserves PR_SET_PDEATHSIG) 6. Health checks via nsenter: `nsenter -t HOLDER_PID -U -n -- curl 10.0.2.100:80` **Pre-Pasta Setup Script** (Phase 2, executed via nsenter): diff --git a/src/firecracker/vm.rs b/src/firecracker/vm.rs index 9de15e88..03936cf9 100644 --- a/src/firecracker/vm.rs +++ b/src/firecracker/vm.rs @@ -36,9 +36,9 @@ pub struct VmManager { socket_path: PathBuf, log_path: Option, namespace_id: Option, - holder_pid: Option, // namespace holder PID for rootless mode (use nsenter to run FC) - user_namespace_path: Option, // User namespace path for rootless clones (enter via setns in pre_exec) - net_namespace_path: Option, // Net namespace path for rootless clones (enter via setns in pre_exec) + holder_pid: Option, // namespace holder PID for rootless mode (health checks, cleanup) + user_namespace_path: Option, // User namespace path for rootless VMs (enter via setns in pre_exec) + net_namespace_path: Option, // Net namespace path for rootless VMs (enter via setns in pre_exec) mount_redirects: Option<(Vec, PathBuf)>, // (baseline_dirs, clone_dir) for mount namespace isolation process: Option, client: Option, @@ -88,14 +88,11 @@ impl VmManager { self.holder_pid = Some(pid); } - /// Set user namespace path for rootless clones + /// Set user namespace path for rootless VMs (baselines + clones) /// - /// When set along with mount_redirects, pre_exec will enter this user namespace - /// first (via setns) before doing mount operations. This gives CAP_SYS_ADMIN - /// inside the user namespace, allowing unshare(CLONE_NEWNS) to succeed. - /// - /// Use this instead of set_holder_pid when mount namespace isolation is needed, - /// since nsenter wrapper runs AFTER pre_exec. + /// Pre_exec will enter this user namespace via setns before starting Firecracker. + /// This preserves PR_SET_PDEATHSIG (nsenter's internal setns clears it) and gives + /// CAP_SYS_ADMIN for mount operations when mount_redirects are set. pub fn set_user_namespace_path(&mut self, path: PathBuf) { self.user_namespace_path = Some(path); } @@ -147,12 +144,9 @@ impl VmManager { // Build command based on mode: // 1. user_namespace_path set: direct Firecracker (namespaces entered via pre_exec setns) - // 2. holder_pid set (no user_namespace_path): use nsenter to enter existing namespace (rootless baseline) + // — used for ALL rootless VMs to preserve PR_SET_PDEATHSIG + // 2. holder_pid set (no user_namespace_path): nsenter fallback (not normally reached) // 3. neither: direct Firecracker (privileged/bridged mode) - // - // For rootless clones with mount_redirects, we MUST use pre_exec setns instead of nsenter, - // because pre_exec runs BEFORE nsenter would enter the namespace, and we need CAP_SYS_ADMIN - // from the user namespace to do mount operations. let mut cmd = if self.user_namespace_path.is_some() { // Use direct Firecracker - namespaces will be entered via setns in pre_exec. // Used for ALL rootless VMs (baselines + clones) because nsenter's internal @@ -163,10 +157,10 @@ impl VmManager { c.arg("--api-sock").arg(&self.socket_path); c } else if let Some(holder_pid) = self.holder_pid { - // Use nsenter to enter user+network namespace with preserved credentials - // --preserve-credentials keeps UID, GID, and supplementary groups (including kvm) - // This allows KVM access while being in the isolated network namespace - // NOTE: This path is for baseline VMs that don't need mount namespace isolation + // Fallback: nsenter to enter user+network namespace. + // NOTE: This path is not normally reached — rootless VMs now use pre_exec + // setns (above) which correctly preserves PR_SET_PDEATHSIG. nsenter's + // internal setns(CLONE_NEWUSER) clears pdeathsig. Kept as safety net. info!(target: "vm", vm_id = %self.vm_id, holder_pid = holder_pid, "using nsenter for rootless networking"); let mut c = Command::new("nsenter"); c.args([ diff --git a/tests/test_vsock_connect_stress.rs b/tests/test_vsock_connect_stress.rs index cc08312e..38cc4f1f 100644 --- a/tests/test_vsock_connect_stress.rs +++ b/tests/test_vsock_connect_stress.rs @@ -176,7 +176,7 @@ async fn kill_on_drop_stress(pid: u32, label: &str) -> Result<()> { /// Full lifecycle stress test for vsock CONNECT reliability. /// /// Tests exec through: baseline → clone → clone-of-clone → parallel clones → sibling death. -/// Every exec must succeed and complete in under 2 seconds. +/// Every exec must succeed and complete within MAX_EXEC_MS. #[cfg(feature = "privileged-tests")] #[tokio::test] async fn test_vsock_connect_lifecycle_stress_bridged() -> Result<()> {