Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,7 @@ jobs:
path: fcvm
- uses: ./fcvm/.github/actions/checkout-deps
- name: Install Rust
uses: dtolnay/rust-toolchain@stable
with:
components: rustfmt, clippy
uses: dtolnay/rust-toolchain@1.93.0
- name: Get dependency SHAs
id: deps
run: |
Expand Down Expand Up @@ -151,7 +149,7 @@ jobs:
path: fcvm
- uses: ./fcvm/.github/actions/checkout-deps
- name: Install Rust
uses: dtolnay/rust-toolchain@stable
uses: dtolnay/rust-toolchain@1.93.0
- name: Get dependency SHAs
id: deps
run: |
Expand Down Expand Up @@ -182,7 +180,7 @@ jobs:
path: fcvm
- uses: ./fcvm/.github/actions/checkout-deps
- name: Install Rust
uses: dtolnay/rust-toolchain@stable
uses: dtolnay/rust-toolchain@1.93.0
- name: Get dependency SHAs
id: deps
run: |
Expand Down Expand Up @@ -247,8 +245,10 @@ jobs:
- uses: ./fcvm/.github/actions/checkout-deps
- name: Install Rust
run: |
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
curl --proto '=https' --tlsv1.2 --retry 10 --retry-connrefused --location --silent --show-error --fail https://sh.rustup.rs | sh -s -- --default-toolchain none -y
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
# rust-toolchain.toml handles version + components; just trigger install
rustup show active-toolchain || true
- name: Get dependency SHAs
id: deps
run: |
Expand Down Expand Up @@ -462,8 +462,10 @@ jobs:
- uses: ./fcvm/.github/actions/checkout-deps
- name: Install Rust
run: |
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
curl --proto '=https' --tlsv1.2 --retry 10 --retry-connrefused --location --silent --show-error --fail https://sh.rustup.rs | sh -s -- --default-toolchain none -y
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
# rust-toolchain.toml handles version + components; just trigger install
rustup show active-toolchain || true
- name: Get dependency SHAs
id: deps
run: |
Expand Down
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,12 @@ check-disk:
# CRITICAL: Uses fcvm's proper cleanup commands to handle btrfs CoW correctly
clean-test-data: build
@echo "==> Killing stale VM processes from previous runs..."
@sudo pkill -9 firecracker 2>/dev/null; sudo pkill -9 pasta 2>/dev/null; sleep 1; true
@sudo pkill -9 firecracker 2>/dev/null; sudo pkill -9 pasta 2>/dev/null; sudo kill -9 $$(pgrep -x sleep -P 1) 2>/dev/null; sleep 1; true
@echo "==> Cleaning stale network namespaces..."
@for ns in $$(sudo ip netns list 2>/dev/null | grep '^fcvm-' | awk '{print $$1}'); do sudo ip netns del "$$ns" 2>/dev/null && echo " deleted $$ns"; done; true
@echo "==> Cleaning stale iptables rules from fcvm VMs..."
@sudo iptables-save -t nat 2>/dev/null | grep -E 'MASQUERADE.*(172\.30\.|10\.0\.)' | sed 's/^-A//' | while read rule; do sudo iptables -t nat -D $$rule 2>/dev/null; done; true
@sudo ip6tables-save -t nat 2>/dev/null | grep 'MASQUERADE' | grep -v NETAVARK | sed 's/^-A//' | while read rule; do sudo ip6tables -t nat -D $$rule 2>/dev/null; done; true
@echo "==> Force unmounting stale FUSE mounts..."
@# Find and force unmount any FUSE mounts from previous test runs
@mount | grep fuse | grep -E '/tmp|/var/tmp' | cut -d' ' -f3 | xargs -r -I{} fusermount3 -u -z {} 2>/dev/null || true
Expand Down
43 changes: 41 additions & 2 deletions benches/clone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -295,9 +295,48 @@ impl CloneFixture {
}

if !last_response.contains("200 OK") {
let clone_log = std::fs::read_to_string(&clone_log_path).unwrap_or_default();

let ss_check = Command::new("ss")
.args(["-tlnp", "src", &loopback_ip])
.output()
.map(|o| String::from_utf8_lossy(&o.stdout).to_string())
.unwrap_or_default();

let connect_err = TcpStream::connect_timeout(
&format!("{}:{}", loopback_ip, health_port).parse().unwrap(),
Duration::from_secs(2),
)
.err()
.map(|e| format!("{}", e))
.unwrap_or_else(|| "connect succeeded".to_string());

let log_tail: String = clone_log
.lines()
.rev()
.take(30)
.collect::<Vec<_>>()
.into_iter()
.rev()
.collect::<Vec<_>>()
.join("\n");

panic!(
"unexpected HTTP response after 3 attempts: {}",
&last_response[..std::cmp::min(200, last_response.len())]
"clone HTTP failed after 3 attempts\n\
addr: {}:{}\n\
last_response: {} bytes\n\
clone_pid: {}\n\
connect_err: {}\n\
\n=== listening sockets on {} ===\n{}\
\n=== clone log (last 30 lines) ===\n{}",
loopback_ip,
health_port,
last_response.len(),
clone_pid,
connect_err,
loopback_ip,
ss_check,
log_tail,
);
}

Expand Down
68 changes: 66 additions & 2 deletions benches/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -601,9 +601,73 @@ impl CloneFixture {
}
}
if !http_ok {
// Comprehensive diagnostics for CI debugging
let clone_log = std::fs::read_to_string(&clone_log_path).unwrap_or_default();

// Check if pasta is running for this clone
let pasta_check = Command::new("pgrep")
.args(["-af", "pasta"])
.output()
.map(|o| String::from_utf8_lossy(&o.stdout).to_string())
.unwrap_or_else(|e| format!("pgrep failed: {}", e));

// Check listening sockets on the loopback IP
let ss_check = Command::new("ss")
.args(["-tlnp", "src", &loopback_ip])
.output()
.map(|o| String::from_utf8_lossy(&o.stdout).to_string())
.unwrap_or_else(|e| format!("ss failed: {}", e));

// Try connecting and report exact error
let connect_err = TcpStream::connect_timeout(
&format!("{}:{}", loopback_ip, health_port).parse().unwrap(),
Duration::from_secs(2),
)
.err()
.map(|e| format!("{}", e))
.unwrap_or_else(|| "connect succeeded".to_string());

// Check stale sleep/pasta/fc processes
let stale_check = Command::new("sh")
.args([
"-c",
"echo 'sleep:' $(pgrep -cx sleep); echo 'pasta:' $(pgrep -cx pasta); echo 'firecracker:' $(pgrep -cf firecracker)",
])
.output()
.map(|o| String::from_utf8_lossy(&o.stdout).to_string())
.unwrap_or_default();

// Last 30 lines of clone log (full, not filtered)
let log_tail: String = clone_log
.lines()
.rev()
.take(30)
.collect::<Vec<_>>()
.into_iter()
.rev()
.collect::<Vec<_>>()
.join("\n");

panic!(
"unexpected HTTP response after 10 attempts: {}",
&last_response[..std::cmp::min(200, last_response.len())]
"clone HTTP failed after 10 attempts\n\
addr: {}:{}\n\
last_response: {} bytes\n\
clone_pid: {}\n\
connect_err: {}\n\
\n=== listening sockets on {} ===\n{}\
\n=== pasta processes ===\n{}\
\n=== stale process counts ===\n{}\
\n=== clone log (last 30 lines) ===\n{}",
loopback_ip,
health_port,
last_response.len(),
clone_pid,
connect_err,
loopback_ip,
ss_check,
pasta_check,
stale_check,
log_tail,
);
}

Expand Down
2 changes: 1 addition & 1 deletion rust-toolchain.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[toolchain]
channel = "1.92.0"
channel = "1.93.0"
components = ["rustfmt", "clippy"]
# musl target for statically linked fc-agent (portable across glibc versions)
targets = ["aarch64-unknown-linux-musl", "x86_64-unknown-linux-musl"]
33 changes: 22 additions & 11 deletions src/commands/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ async fn setup_namespace_mappings(pid: u32) -> anyhow::Result<()> {
let self_ino = std::fs::metadata("/proc/self/ns/user")
.context("reading own user namespace inode")?
.ino();
let ns_deadline = std::time::Instant::now() + std::time::Duration::from_secs(5);
let ns_deadline = std::time::Instant::now() + std::time::Duration::from_secs(30);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Keep namespace readiness deadline consistent

setup_namespace_mappings() now waits up to 30s for /proc/<pid>/ns/user, but spawn_namespace_holder() still computes its absolute deadline from HOLDER_RETRY_TIMEOUT (5s) and passes that same deadline into wait_for_namespace_ready(). If namespace creation takes longer than 5s (the case this commit targets), wait_for_namespace_ready() starts with an already-expired deadline and will time out after the first failed probe, so rootless startup can still fail under load instead of getting the intended extended grace period.

Useful? React with 👍 / 👎.

loop {
if std::fs::metadata(format!("/proc/{pid}/ns/user"))
.map(|m| m.ino() != self_ino)
Expand Down Expand Up @@ -157,18 +157,29 @@ pub async fn spawn_namespace_holder(
loop {
attempt += 1;

let child = tokio::process::Command::new(&holder_cmd[0])
.args(&holder_cmd[1..])
let mut cmd = tokio::process::Command::new(&holder_cmd[0]);
cmd.args(&holder_cmd[1..])
.stdin(std::process::Stdio::null())
.stdout(std::process::Stdio::null())
.stderr(std::process::Stdio::piped())
.spawn()
.with_context(|| {
format!(
"spawning namespace holder (attempt {}): {:?}",
attempt, holder_cmd
)
})?;
.stderr(std::process::Stdio::piped());

// Kill holder if parent (fcvm) dies. Without this, holders orphan to init
// and accumulate when fcvm is SIGKILL'd or crashes.
unsafe {
cmd.pre_exec(|| {
if libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) != 0 {
return Err(std::io::Error::last_os_error());
}
Ok(())
});
}

let child = cmd.spawn().with_context(|| {
format!(
"spawning namespace holder (attempt {}): {:?}",
attempt, holder_cmd
)
})?;

let holder_pid = child.id().context("getting holder process PID")?;
if attempt > 1 {
Expand Down
15 changes: 14 additions & 1 deletion src/commands/podman/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,20 @@ pub(super) async fn setup_rootless_namespace(
}
debug!(tap_device = %tap_device, "TAP device verified");

// Set holder_pid so VmManager uses nsenter
// Use pre_exec setns path (not nsenter) for rootless baselines.
// nsenter enters the user namespace internally, which clears PR_SET_PDEATHSIG
// (kernel zeros task->pdeath_signal on credential changes). The pre_exec setns
// path sets pdeathsig AFTER entering the user namespace, so Firecracker gets
// SIGKILL when fcvm dies.
vm_manager.set_user_namespace_path(std::path::PathBuf::from(format!(
"/proc/{}/ns/user",
holder_pid
)));
vm_manager.set_net_namespace_path(std::path::PathBuf::from(format!(
"/proc/{}/ns/net",
holder_pid
)));
// Still track holder_pid for health checks (nsenter curl) and cleanup
vm_manager.set_holder_pid(holder_pid);

// Store holder_pid in state for health checks and cleanup
Expand Down
3 changes: 1 addition & 2 deletions src/commands/setup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,7 @@ pub async fn cmd_setup(args: SetupArgs) -> Result<()> {
.context("setting up fc-agent initrd")?;
println!(" ✓ Initrd ready: {}", initrd_path.display());

if args.kernel_profile.is_some() {
let profile_name = args.kernel_profile.as_ref().unwrap();
if let Some(profile_name) = &args.kernel_profile {
println!("\nFor '{}' profile, use:", profile_name);
println!(
" fcvm podman run --kernel-profile {} --privileged ...",
Expand Down
16 changes: 9 additions & 7 deletions src/firecracker/vm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,17 +154,19 @@ impl VmManager {
// because pre_exec runs BEFORE nsenter would enter the namespace, and we need CAP_SYS_ADMIN
// from the user namespace to do mount operations.
let mut cmd = if self.user_namespace_path.is_some() {
// Use direct Firecracker - namespaces will be entered via setns in pre_exec
// This is required for rootless clones that need mount namespace isolation
info!(target: "vm", vm_id = %self.vm_id, "using pre_exec setns for rootless clone");
// Use direct Firecracker - namespaces will be entered via setns in pre_exec.
// Used for ALL rootless VMs (baselines + clones) because nsenter's internal
// setns(CLONE_NEWUSER) clears PR_SET_PDEATHSIG, leaving Firecracker orphaned
// if fcvm is SIGKILL'd. The pre_exec path sets pdeathsig AFTER setns.
info!(target: "vm", vm_id = %self.vm_id, "using pre_exec setns for rootless VM");
let mut c = Command::new(firecracker_bin);
c.arg("--api-sock").arg(&self.socket_path);
c
} else if let Some(holder_pid) = self.holder_pid {
// Use nsenter to enter user+network namespace with preserved credentials
// --preserve-credentials keeps UID, GID, and supplementary groups (including kvm)
// This allows KVM access while being in the isolated network namespace
// NOTE: This path is for baseline VMs that don't need mount namespace isolation
// Fallback: nsenter to enter user+network namespace.
// NOTE: This path is not normally reached — rootless VMs now use pre_exec
// setns (above) which correctly preserves PR_SET_PDEATHSIG. nsenter's
// internal setns(CLONE_NEWUSER) clears pdeathsig. Kept as safety net.
info!(target: "vm", vm_id = %self.vm_id, holder_pid = holder_pid, "using nsenter for rootless networking");
let mut c = Command::new("nsenter");
c.args([
Expand Down
Loading
Loading