Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESIGN.md
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,7 @@ pasta <──────────────────┼── pasta0
2. Pre-pasta setup via nsenter: create Firecracker TAP device only
3. Start pasta attached to holder's namespace (creates pasta0 TAP)
4. Post-pasta setup via nsenter: create bridge, attach pasta0 + tap-fc, add namespace IP
5. Run Firecracker via nsenter: `nsenter -t HOLDER_PID -U -n -- firecracker ...`
5. Run Firecracker via pre_exec setns (enters user+net namespace, preserves PR_SET_PDEATHSIG)
6. Health checks via nsenter: `nsenter -t HOLDER_PID -U -n -- curl 10.0.2.100:80`

**Pre-Pasta Setup Script** (Phase 2, executed via nsenter):
Expand Down
32 changes: 13 additions & 19 deletions src/firecracker/vm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ pub struct VmManager {
socket_path: PathBuf,
log_path: Option<PathBuf>,
namespace_id: Option<String>,
holder_pid: Option<u32>, // namespace holder PID for rootless mode (use nsenter to run FC)
user_namespace_path: Option<PathBuf>, // User namespace path for rootless clones (enter via setns in pre_exec)
net_namespace_path: Option<PathBuf>, // Net namespace path for rootless clones (enter via setns in pre_exec)
holder_pid: Option<u32>, // namespace holder PID for rootless mode (health checks, cleanup)
user_namespace_path: Option<PathBuf>, // User namespace path for rootless VMs (enter via setns in pre_exec)
net_namespace_path: Option<PathBuf>, // Net namespace path for rootless VMs (enter via setns in pre_exec)
mount_redirects: Option<(Vec<PathBuf>, PathBuf)>, // (baseline_dirs, clone_dir) for mount namespace isolation
process: Option<Child>,
client: Option<FirecrackerClient>,
Expand Down Expand Up @@ -88,14 +88,11 @@ impl VmManager {
self.holder_pid = Some(pid);
}

/// Set user namespace path for rootless clones
/// Set user namespace path for rootless VMs (baselines + clones)
///
/// When set along with mount_redirects, pre_exec will enter this user namespace
/// first (via setns) before doing mount operations. This gives CAP_SYS_ADMIN
/// inside the user namespace, allowing unshare(CLONE_NEWNS) to succeed.
///
/// Use this instead of set_holder_pid when mount namespace isolation is needed,
/// since nsenter wrapper runs AFTER pre_exec.
/// Pre_exec will enter this user namespace via setns before starting Firecracker.
/// This preserves PR_SET_PDEATHSIG (nsenter's internal setns clears it) and gives
/// CAP_SYS_ADMIN for mount operations when mount_redirects are set.
pub fn set_user_namespace_path(&mut self, path: PathBuf) {
// Only records the path (overwriting any previously stored value); no namespace
// is entered by this call. Per the doc comment above, the setns happens later in pre_exec.
self.user_namespace_path = Some(path);
}
Expand Down Expand Up @@ -147,12 +144,9 @@ impl VmManager {

// Build command based on mode:
// 1. user_namespace_path set: direct Firecracker (namespaces entered via pre_exec setns)
// 2. holder_pid set (no user_namespace_path): use nsenter to enter existing namespace (rootless baseline)
// — used for ALL rootless VMs to preserve PR_SET_PDEATHSIG
// 2. holder_pid set (no user_namespace_path): nsenter fallback (not normally reached)
// 3. neither: direct Firecracker (privileged/bridged mode)
//
// For rootless clones with mount_redirects, we MUST use pre_exec setns instead of nsenter,
// because pre_exec runs BEFORE nsenter would enter the namespace, and we need CAP_SYS_ADMIN
// from the user namespace to do mount operations.
let mut cmd = if self.user_namespace_path.is_some() {
// Use direct Firecracker - namespaces will be entered via setns in pre_exec.
// Used for ALL rootless VMs (baselines + clones) because nsenter's internal
Expand All @@ -163,10 +157,10 @@ impl VmManager {
c.arg("--api-sock").arg(&self.socket_path);
c
} else if let Some(holder_pid) = self.holder_pid {
// Use nsenter to enter user+network namespace with preserved credentials
// --preserve-credentials keeps UID, GID, and supplementary groups (including kvm)
// This allows KVM access while being in the isolated network namespace
// NOTE: This path is for baseline VMs that don't need mount namespace isolation
// Fallback: nsenter to enter user+network namespace.
// NOTE: This path is not normally reached — rootless VMs now use pre_exec
// setns (above) which correctly preserves PR_SET_PDEATHSIG. nsenter's
// internal setns(CLONE_NEWUSER) clears pdeathsig. Kept as safety net.
info!(target: "vm", vm_id = %self.vm_id, holder_pid = holder_pid, "using nsenter for rootless networking");
let mut c = Command::new("nsenter");
c.args([
Expand Down
2 changes: 1 addition & 1 deletion tests/test_vsock_connect_stress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ async fn kill_on_drop_stress(pid: u32, label: &str) -> Result<()> {
/// Full lifecycle stress test for vsock CONNECT reliability.
///
/// Tests exec through: baseline → clone → clone-of-clone → parallel clones → sibling death.
/// Every exec must succeed and complete in under 2 seconds.
/// Every exec must succeed and complete within MAX_EXEC_MS.
#[cfg(feature = "privileged-tests")]
#[tokio::test]
async fn test_vsock_connect_lifecycle_stress_bridged() -> Result<()> {
Expand Down
Loading