From 64f70c7baf3ce56d56fa1c1636f251dbb9648f33 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 16:58:23 +0000 Subject: [PATCH 01/23] Fix Layer 2 setup: proper error handling and package resolution Key changes: 1. dpkg failures now fail loudly with captured output - Uses tee to log dpkg output to /tmp/dpkg-install.log - Shows specific error messages on failure - Exits with clear error instead of continuing silently 2. Setup completion verified with marker file - Writes /etc/fcvm-setup-complete on successful setup - Rust code mounts rootfs and verifies marker exists - Detects FCVM_SETUP_FAILED in serial output for early bail 3. Fixed package download using apt-get install --download-only - Previous apt-cache depends pulled conflicting alternatives (e.g., libqt5gui5t64 vs libqt5gui5-gles both downloaded) - Now uses apt-get which properly resolves dependencies 4. Fixed dangling symlinks when writing config files - /etc/resolv.conf is symlink to /run/systemd/... in cloud image - Now removes symlinks before writing files 5. 
Added codename field to rootfs-plan.toml - Specifies target Ubuntu version (noble) for package download - Ensures packages match target, not host OS Tested: sudo fcvm setup && sudo fcvm podman run --name test --network bridged nginx:alpine - Setup completes in ~15 seconds - VM boots, pulls image, nginx serves HTTP - Health checks pass --- rootfs-plan.toml | 2 + src/setup/rootfs.rs | 211 ++++++++++++++++++++++++++++++++------------ 2 files changed, 158 insertions(+), 55 deletions(-) diff --git a/rootfs-plan.toml b/rootfs-plan.toml index 066b74f6..8425cf4e 100644 --- a/rootfs-plan.toml +++ b/rootfs-plan.toml @@ -12,6 +12,8 @@ # Ubuntu 24.04 LTS (Noble Numbat) cloud images # Using "current" for latest updates - URL changes trigger plan SHA change version = "24.04" +# Codename used to download packages from correct Ubuntu release +codename = "noble" [base.arm64] url = "https://cloud-images.ubuntu.com/noble/current/noble-server-cloudimg-arm64.img" diff --git a/src/setup/rootfs.rs b/src/setup/rootfs.rs index 7aa6cfa4..7acd0f84 100644 --- a/src/setup/rootfs.rs +++ b/src/setup/rootfs.rs @@ -34,6 +34,8 @@ pub struct Plan { #[derive(Debug, Deserialize, Clone)] pub struct BaseConfig { pub version: String, + /// Ubuntu codename (e.g., "noble" for 24.04) - used to download packages + pub codename: String, pub arm64: ArchConfig, pub amd64: ArchConfig, } @@ -121,21 +123,71 @@ pub struct CleanupConfig { /// This script installs packages from /mnt/packages and removes conflicting packages. pub fn generate_install_script() -> String { r#"#!/bin/bash -set -e +set -euo pipefail + echo 'FCVM: Removing conflicting packages before install...' 
# Remove time-daemon provider that conflicts with chrony -apt-get remove -y --purge systemd-timesyncd 2>/dev/null || true +apt-get remove -y --purge systemd-timesyncd || true # Remove packages we don't need in microVM (also frees space) -apt-get remove -y --purge cloud-init snapd ubuntu-server 2>/dev/null || true +apt-get remove -y --purge cloud-init snapd ubuntu-server || true echo 'FCVM: Installing packages from initrd...' -dpkg -i /mnt/packages/*.deb || true -apt-get -f install -y || true +PKG_COUNT=$(ls /mnt/packages/*.deb 2>/dev/null | wc -l) +echo "FCVM: Found $PKG_COUNT .deb files" + +# Capture dpkg output for error reporting +DPKG_LOG=/tmp/dpkg-install.log +dpkg -i /mnt/packages/*.deb 2>&1 | tee "$DPKG_LOG" +DPKG_STATUS=${PIPESTATUS[0]} + +if [ $DPKG_STATUS -ne 0 ]; then + echo '' + echo '==========================================' + echo 'FCVM ERROR: dpkg -i failed!' + echo '==========================================' + echo 'Failed packages:' + grep -E '^dpkg: error|^Errors were encountered' "$DPKG_LOG" || true + echo '' + echo 'Dependency problems:' + grep -E 'dependency problems|depends on' "$DPKG_LOG" || true + echo '==========================================' + exit 1 +fi + echo 'FCVM: Packages installed successfully' "# .to_string() } +/// Generate the script used to download packages via podman. +/// This script is included in the hash to ensure cache invalidation +/// when the download method or target Ubuntu version changes. 
+pub fn generate_download_script(plan: &Plan) -> String { + let packages = plan.packages.all_packages(); + let packages_str = packages.join(" "); + let codename = &plan.base.codename; + + format!( + r#"#!/bin/bash +# Download packages for Ubuntu {codename} using podman +# This script is hashed to invalidate cache when download method changes +set -euo pipefail +CONTAINER_IMAGE="ubuntu:{codename}" +PACKAGES="{packages}" + +podman run --rm -v "$PACKAGES_DIR:/packages" "$CONTAINER_IMAGE" bash -c ' +set -euo pipefail +apt-get update -qq +# Use apt-get install --download-only to properly resolve dependencies +apt-get install --download-only --yes --no-install-recommends '"$PACKAGES"' +cp /var/cache/apt/archives/*.deb /packages/ 2>/dev/null || true +' +"#, + codename = codename, + packages = packages_str + ) +} + /// Generate the init script that runs in the initrd during Layer 2 setup. /// This script mounts filesystems, runs install + setup scripts, then powers off. /// @@ -205,12 +257,20 @@ echo "FCVM Layer 2 Setup: Installing packages..." chroot /newroot /bin/bash /tmp/install-packages.sh INSTALL_RESULT=$? echo "FCVM Layer 2 Setup: Package installation returned: $INSTALL_RESULT" +if [ $INSTALL_RESULT -ne 0 ]; then + echo "FCVM_SETUP_FAILED: Package installation failed with exit code $INSTALL_RESULT" + poweroff -f +fi # Run setup script using chroot echo "FCVM Layer 2 Setup: Running setup script..." chroot /newroot /bin/bash /tmp/fcvm-setup.sh SETUP_RESULT=$? echo "FCVM Layer 2 Setup: Setup script returned: $SETUP_RESULT" +if [ $SETUP_RESULT -ne 0 ]; then + echo "FCVM_SETUP_FAILED: Setup script failed with exit code $SETUP_RESULT" + poweroff -f +fi # Cleanup chroot mounts (use lazy unmount as fallback) echo "FCVM Layer 2 Setup: Cleaning up..." 
@@ -221,6 +281,10 @@ rm -rf /newroot/mnt/packages rm -f /newroot/tmp/install-packages.sh rm -f /newroot/tmp/fcvm-setup.sh +# Write marker file to rootfs (proves setup completed successfully) +date -u '+%Y-%m-%dT%H:%M:%SZ' > /newroot/etc/fcvm-setup-complete +echo "FCVM Layer 2 Setup: Wrote marker file /etc/fcvm-setup-complete" + # Sync and unmount rootfs sync umount /newroot 2>/dev/null || umount -l /newroot 2>/dev/null || true @@ -269,6 +333,8 @@ pub fn generate_setup_script(plan: &Plan) -> String { s.push_str(&format!("mkdir -p {}\n", parent.display())); } } + // Remove dangling symlinks (e.g., /etc/resolv.conf -> /run/systemd/...) + s.push_str(&format!("rm -f {} 2>/dev/null || true\n", path)); s.push_str(&format!("cat > {} << 'FCVM_EOF'\n", path)); s.push_str(&config.content); if !config.content.ends_with('\n') { @@ -439,19 +505,23 @@ pub async fn ensure_rootfs(allow_create: bool) -> Result { let setup_script = generate_setup_script(&plan); let install_script = generate_install_script(); let init_script = generate_init_script(&install_script, &setup_script); + let download_script = generate_download_script(&plan); // Get kernel URL for the current architecture let kernel_config = plan.kernel.current_arch()?; let kernel_url = &kernel_config.url; - // Hash the complete init script + kernel URL + // Hash the complete init script + kernel URL + download script // Any change to: // - init logic, install script, or setup script // - kernel URL (different kernel version/release) + // - download method (podman image, codename, packages) // invalidates the cache let mut combined = init_script.clone(); combined.push_str("\n# KERNEL_URL: "); combined.push_str(kernel_url); + combined.push_str("\n# DOWNLOAD_SCRIPT:\n"); + combined.push_str(&download_script); let script_sha = compute_sha256(combined.as_bytes()); let script_sha_short = &script_sha[..12]; @@ -1403,61 +1473,54 @@ async fn download_packages(plan: &Plan, script_sha_short: &str) -> Result/dev/null || true +"#, + 
packages = packages_str + ); - // Also download dependencies - info!("downloading package dependencies"); - let deps_output = Command::new("sh") + let output = Command::new("podman") .args([ + "run", + "--rm", + "-v", + &format!("{}:/packages", packages_dir.display()), + &container_image, + "bash", "-c", - &format!( - "apt-cache depends --recurse --no-recommends --no-suggests --no-conflicts \ - --no-breaks --no-replaces --no-enhances {} | \ - grep '^\\w' | sort -u | xargs apt-get download 2>/dev/null || true", - packages_str - ), + &download_script, ]) - .current_dir(&packages_dir) .output() - .await; + .await + .context("downloading packages with podman")?; - if let Err(e) = deps_output { - warn!(error = %e, "failed to download some dependencies, continuing..."); + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + warn!(stderr = %stderr, "podman download had errors, checking results..."); } // Count downloaded packages @@ -1474,10 +1537,15 @@ async fn download_packages(plan: &Plan, script_sha_short: &str) -> Result Result<()> { let serial_content = tokio::fs::read_to_string(&serial_path) .await .unwrap_or_default(); + if serial_content.contains("FCVM_SETUP_FAILED") { + warn!("Setup failed! Serial console output:\n{}", serial_content); + if let Ok(log_content) = tokio::fs::read_to_string(&log_path).await { + warn!("Firecracker log:\n{}", log_content); + } + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + bail!("Layer 2 setup failed (script exited with error - check logs above)"); + } if !serial_content.contains("FCVM_SETUP_COMPLETE") { warn!("Setup failed! 
Serial console output:\n{}", serial_content); if let Ok(log_content) = tokio::fs::read_to_string(&log_path).await { @@ -1727,6 +1803,31 @@ async fn boot_vm_for_setup(disk_path: &Path, initrd_path: &Path) -> Result<()> { let _ = tokio::fs::remove_dir_all(&temp_dir).await; bail!("Layer 2 setup failed (no FCVM_SETUP_COMPLETE marker found)"); } + + // Verify marker file exists in the rootfs + let mount_dir = temp_dir.join("verify-mount"); + tokio::fs::create_dir_all(&mount_dir).await?; + let mount_output = Command::new("mount") + .args(["-o", "ro", path_to_str(disk_path)?, path_to_str(&mount_dir)?]) + .output() + .await?; + if !mount_output.status.success() { + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + bail!( + "Failed to mount rootfs for verification: {}", + String::from_utf8_lossy(&mount_output.stderr) + ); + } + let marker_path = mount_dir.join("etc/fcvm-setup-complete"); + let marker_exists = marker_path.exists(); + // Unmount before checking result + let _ = Command::new("umount").arg(&mount_dir).output().await; + if !marker_exists { + warn!("Setup failed! 
Serial console output:\n{}", serial_content); + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + bail!("Layer 2 setup failed: marker file /etc/fcvm-setup-complete not found in rootfs"); + } + let _ = tokio::fs::remove_dir_all(&temp_dir).await; info!( elapsed_secs = elapsed.as_secs(), From b0654f7e3a47a78b520dec22068f170454edc958 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 17:00:16 +0000 Subject: [PATCH 02/23] Update docs to reflect Layer 2 setup improvements CLAUDE.md: - Document package download via podman run ubuntu:noble - Add setup verification with marker file - Update hash calculation components DESIGN.md: - Expand fcvm setup command description with steps - Add packages cache directory to data layout - Document rootfs hash calculation - Bump version to 2.3 --- .claude/CLAUDE.md | 29 ++++++++++++++++++++++++++--- DESIGN.md | 27 ++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index a60f9790..ab17ede9 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -727,9 +727,25 @@ fuse-pipe/benches/ - Initrd: `/mnt/fcvm-btrfs/initrd/fc-agent-{sha}.initrd` (injects fc-agent at boot) **Layer System:** -The rootfs is named after the SHA of the setup script + kernel URL. This ensures automatic cache invalidation when: +The rootfs is named after the SHA of a combined script that includes: +- Init script (embeds install script + setup script) +- Kernel URL +- Download script (packages + Ubuntu codename) + +This ensures automatic cache invalidation when: - The init logic, install script, or setup script changes - The kernel URL changes (different kernel version) +- The package list or target Ubuntu version changes + +**Package Download:** +Packages are downloaded using `podman run ubuntu:{codename}` with `apt-get install --download-only`. +This ensures packages match the target Ubuntu version (Noble/24.04), not the host OS. 
+The `codename` is specified in `rootfs-plan.toml`. + +**Setup Verification:** +Layer 2 setup writes a marker file `/etc/fcvm-setup-complete` on successful completion. +After the setup VM exits, fcvm mounts the rootfs and verifies this marker exists. +If missing, setup fails with a clear error. The initrd contains a statically-linked busybox and fc-agent binary, injected at boot before systemd. @@ -887,8 +903,15 @@ ERROR fcvm: Error: setting up rootfs: Rootfs not found. Run 'fcvm setup' first, **What `fcvm setup` does:** 1. Downloads Kata kernel from URL in `rootfs-plan.toml` (~15MB, cached by URL hash) -2. Creates Layer 2 rootfs (~10GB, downloads Ubuntu cloud image, boots VM to install packages) -3. Creates fc-agent initrd (embeds statically-linked fc-agent binary) +2. Downloads packages using `podman run ubuntu:noble` with `apt-get install --download-only` + - Packages specified in `rootfs-plan.toml` (podman, crun, fuse-overlayfs, skopeo, fuse3, haveged, chrony, strace) + - Uses target Ubuntu version (noble/24.04) to get correct package versions +3. Creates Layer 2 rootfs (~10GB): + - Downloads Ubuntu cloud image + - Boots VM with packages embedded in initrd + - Runs install script (dpkg) + setup script (config files, services) + - Verifies setup completed by checking for `/etc/fcvm-setup-complete` marker file +4. Creates fc-agent initrd (embeds statically-linked fc-agent binary) **Kernel source**: Kata Containers kernel (6.12.47 from Kata 3.24.0 release) with `CONFIG_FUSE_FS=y` built-in. diff --git a/DESIGN.md b/DESIGN.md index 5866df08..6b689880 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -920,7 +920,14 @@ The guest is configured to support rootless Podman: fcvm setup ``` -This downloads the Kata kernel (~15MB) and creates the Layer 2 rootfs (~10GB with Ubuntu + Podman). Takes 5-10 minutes on first run. +**What it does:** +1. Downloads Kata kernel (~15MB, cached by URL hash) +2. 
Downloads packages via `podman run ubuntu:noble` with `apt-get install --download-only` +3. Creates Layer 2 rootfs (~10GB): boots VM, installs packages, writes config +4. Verifies setup by checking `/etc/fcvm-setup-complete` marker file +5. Creates fc-agent initrd (embeds statically-linked fc-agent binary) + +Takes 5-10 minutes on first run. Subsequent runs are instant (cached by content hash). **Note**: Must be run before `fcvm podman run` with bridged networking. For rootless mode, you can use `--setup` flag on `fcvm podman run` instead. @@ -1310,7 +1317,7 @@ Override with `FCVM_BASE_DIR` environment variable. /mnt/fcvm-btrfs/ ├── kernels/ # Kernel binaries │ └── vmlinux-{sha}.bin -├── rootfs/ # Base rootfs images +├── rootfs/ # Base rootfs images (contains /etc/fcvm-setup-complete marker) │ └── layer2-{sha}.raw ├── initrd/ # fc-agent injection initrds │ └── fc-agent-{sha}.initrd @@ -1319,9 +1326,19 @@ Override with `FCVM_BASE_DIR` environment variable. ├── snapshots/ # Firecracker snapshots ├── state/ # VM state JSON files │ └── {vm-id}.json -└── cache/ # Downloaded images +└── cache/ # Downloaded images and packages + ├── ubuntu-24.04-arm64-{sha}.img # Cloud image cache + └── packages-{sha}/ # Downloaded .deb files ``` +**Rootfs Hash Calculation:** +The layer2-{sha}.raw name is computed from: +- Init script (embeds install + setup scripts) +- Kernel URL +- Download script (package list + Ubuntu codename) + +This ensures automatic cache invalidation when any component changes. 
+ ### State Persistence **VM State** (`/mnt/fcvm-btrfs/state/{vm-id}.json`): @@ -1726,6 +1743,6 @@ The 64 CPUs help within each crate (LLVM codegen), but crate-level parallelism i **End of Design Specification** -*Version: 2.2* -*Date: 2025-12-24* +*Version: 2.3* +*Date: 2025-12-25* *Author: fcvm project* From f12d7d9700fd5f21591a74d05c6c5796d8c4a82f Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 17:02:29 +0000 Subject: [PATCH 03/23] README: Expand setup section with detailed steps --- README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 596e6fcb..1be7c431 100644 --- a/README.md +++ b/README.md @@ -83,11 +83,22 @@ cargo build --release --workspace ### Setup (First Time) ```bash -# Create btrfs filesystem and download kernel + rootfs (takes 5-10 minutes) +# Create btrfs filesystem make setup-btrfs + +# Download kernel and create rootfs (takes 5-10 minutes first time) fcvm setup ``` +**What `fcvm setup` does:** +1. Downloads Kata kernel (~15MB, cached by URL hash) +2. Downloads packages via `podman run ubuntu:noble` (ensures correct Ubuntu 24.04 versions) +3. Creates Layer 2 rootfs (~10GB): boots VM, installs packages, writes config files +4. Verifies setup completed successfully (checks marker file) +5. Creates fc-agent initrd + +Subsequent runs are instant - everything is cached by content hash. + ### Run a Container ```bash # Run nginx in a Firecracker VM (using AWS ECR public registry to avoid Docker Hub rate limits) From 053cec83fc2365409183f4eacc743bec4f8ef098 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 17:03:35 +0000 Subject: [PATCH 04/23] README: Document --setup flag for auto-setup on first run --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 1be7c431..d31c9870 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,13 @@ fcvm setup Subsequent runs are instant - everything is cached by content hash. 
+**Alternative: Auto-setup on first run (rootless only)** +```bash +# Skip explicit setup - does it automatically on first run +fcvm podman run --name web1 --network rootless --setup nginx:alpine +``` +The `--setup` flag triggers setup if kernel/rootfs are missing. Only works with `--network rootless` (no sudo). + ### Run a Container ```bash # Run nginx in a Firecracker VM (using AWS ECR public registry to avoid Docker Hub rate limits) From 8a908210afb9b4143c1e43b74fb184b717428c36 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 17:07:42 +0000 Subject: [PATCH 05/23] README: Clarify why --setup is rootless only --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d31c9870..fb5f6d5d 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ Subsequent runs are instant - everything is cached by content hash. # Skip explicit setup - does it automatically on first run fcvm podman run --name web1 --network rootless --setup nginx:alpine ``` -The `--setup` flag triggers setup if kernel/rootfs are missing. Only works with `--network rootless` (no sudo). +The `--setup` flag triggers setup if kernel/rootfs are missing. Only works with `--network rootless` to avoid file ownership issues when running as root. 
### Run a Container ```bash From 0210025bea1457a2c93acc6e7ab0558d64078ce5 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 17:15:17 +0000 Subject: [PATCH 06/23] CI: Auto-cancel in-progress runs on new push --- .github/workflows/ci.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4ed2efcd..cf870eea 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,6 +6,11 @@ on: push: branches: [main] +# Cancel in-progress runs when a new revision is pushed +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + env: CARGO_TERM_COLOR: always FUSE_BACKEND_RS: ${{ github.workspace }}/fuse-backend-rs From e3e75bc114106206aa5b74ec78ffad4308ace311 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 17:34:24 +0000 Subject: [PATCH 07/23] CI: Add missing dependencies to Container job Container job needs qemu-utils, e2fsprogs, podman, skopeo, busybox-static, cpio, zstd on the host for setup-fcvm to work (rootfs creation). --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cf870eea..20ea8dae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -104,6 +104,10 @@ jobs: repository: ejc3/fuser ref: master path: fuser + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y qemu-utils e2fsprogs podman skopeo busybox-static cpio zstd - name: Setup KVM and rootless podman run: | sudo chmod 666 /dev/kvm From 5f1853a21694cbd700f4a065e3d8f5196da455e6 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 17:36:42 +0000 Subject: [PATCH 08/23] Fix VM shutdown in Layer 2 setup Use sysrq trigger (echo o > /proc/sysrq-trigger) for reliable shutdown instead of poweroff -f which doesn't work in minimal initrd environment. 
The CI was timing out because poweroff -f failed silently and the VM kept running for 15 minutes after setup completed. --- src/setup/rootfs.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/setup/rootfs.rs b/src/setup/rootfs.rs index 7acd0f84..28a89132 100644 --- a/src/setup/rootfs.rs +++ b/src/setup/rootfs.rs @@ -224,7 +224,8 @@ mount -o rw /dev/vda /newroot if [ $? -ne 0 ]; then echo "ERROR: Failed to mount rootfs" sleep 5 - poweroff -f + echo 1 > /proc/sys/kernel/sysrq 2>/dev/null || true + echo o > /proc/sysrq-trigger 2>/dev/null || poweroff -f fi # Copy embedded packages from initrd to rootfs @@ -259,7 +260,8 @@ INSTALL_RESULT=$? echo "FCVM Layer 2 Setup: Package installation returned: $INSTALL_RESULT" if [ $INSTALL_RESULT -ne 0 ]; then echo "FCVM_SETUP_FAILED: Package installation failed with exit code $INSTALL_RESULT" - poweroff -f + echo 1 > /proc/sys/kernel/sysrq 2>/dev/null || true + echo o > /proc/sysrq-trigger 2>/dev/null || poweroff -f fi # Run setup script using chroot @@ -269,7 +271,8 @@ SETUP_RESULT=$? echo "FCVM Layer 2 Setup: Setup script returned: $SETUP_RESULT" if [ $SETUP_RESULT -ne 0 ]; then echo "FCVM_SETUP_FAILED: Setup script failed with exit code $SETUP_RESULT" - poweroff -f + echo 1 > /proc/sys/kernel/sysrq 2>/dev/null || true + echo o > /proc/sysrq-trigger 2>/dev/null || poweroff -f fi # Cleanup chroot mounts (use lazy unmount as fallback) @@ -291,7 +294,13 @@ umount /newroot 2>/dev/null || umount -l /newroot 2>/dev/null || true echo "FCVM_SETUP_COMPLETE" echo "FCVM Layer 2 Setup: Complete! Powering off..." 
-umount /proc /sys /dev 2>/dev/null || true + +# Use sysrq trigger for reliable shutdown (poweroff may not work in minimal initrd) +echo 1 > /proc/sys/kernel/sysrq 2>/dev/null || true +echo o > /proc/sysrq-trigger 2>/dev/null || true + +# Fallback: try poweroff if sysrq didn't work +sleep 1 poweroff -f "#, install_script, setup_script From 4b640fa3072a8038c787597dc9c1bffa8d6c44b6 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 18:02:47 +0000 Subject: [PATCH 09/23] CI: Run setup inside container, add sanity checks - Add container-setup-fcvm target that runs setup inside the container (container already has Firecracker, qemu-utils, etc.) - Remove host Firecracker installation from Container CI job - Use debugfs instead of mount for marker file verification (no root needed) - Add sanity checks before writing marker file: - Verify podman, crun, skopeo binaries exist - Verify systemd exists - Verify /etc/resolv.conf exists - Improved VM shutdown with /proc re-mount and multiple fallbacks --- .github/workflows/ci.yml | 8 ++--- Makefile | 13 ++++++++ src/setup/rootfs.rs | 64 ++++++++++++++++++++++++++++------------ 3 files changed, 60 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 20ea8dae..33182b0d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -104,10 +104,6 @@ jobs: repository: ejc3/fuser ref: master path: fuser - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y qemu-utils e2fsprogs podman skopeo busybox-static cpio zstd - name: Setup KVM and rootless podman run: | sudo chmod 666 /dev/kvm @@ -117,9 +113,9 @@ jobs: - name: container-test-unit working-directory: fcvm run: make container-test-unit - - name: setup-fcvm + - name: container-setup-fcvm working-directory: fcvm - run: make setup-fcvm + run: make container-setup-fcvm - name: container-test working-directory: fcvm run: make container-test diff --git a/Makefile b/Makefile index 
65db587b..68f5c862 100644 --- a/Makefile +++ b/Makefile @@ -135,6 +135,19 @@ setup-fcvm: build setup-btrfs @echo "==> Running fcvm setup..." ./target/release/fcvm setup +# Run setup inside container (for CI - container has Firecracker) +container-setup-fcvm: container-build setup-btrfs + @echo "==> Running fcvm setup in container..." + $(CONTAINER_RUN) $(CONTAINER_TAG) make _setup-fcvm + +_setup-fcvm: + @FREE_GB=$$(df -BG /mnt/fcvm-btrfs 2>/dev/null | awk 'NR==2 {gsub("G",""); print $$4}'); \ + if [ -n "$$FREE_GB" ] && [ "$$FREE_GB" -lt 15 ]; then \ + echo "ERROR: Need 15GB on /mnt/fcvm-btrfs (have $${FREE_GB}GB)"; \ + exit 1; \ + fi + ./target/release/fcvm setup + bench: build @echo "==> Running benchmarks..." sudo cargo bench -p fuse-pipe --bench throughput diff --git a/src/setup/rootfs.rs b/src/setup/rootfs.rs index 28a89132..02a7364e 100644 --- a/src/setup/rootfs.rs +++ b/src/setup/rootfs.rs @@ -284,6 +284,38 @@ rm -rf /newroot/mnt/packages rm -f /newroot/tmp/install-packages.sh rm -f /newroot/tmp/fcvm-setup.sh +# Sanity checks before writing marker file +echo "FCVM Layer 2 Setup: Running sanity checks..." +SANITY_FAILED=0 + +# Check critical binaries exist +for bin in podman crun skopeo; do + if [ ! -x "/newroot/usr/bin/$bin" ]; then + echo "FCVM ERROR: $bin not found at /newroot/usr/bin/$bin" + SANITY_FAILED=1 + fi +done + +# Check systemd exists +if [ ! -x "/newroot/lib/systemd/systemd" ] && [ ! -x "/newroot/usr/lib/systemd/systemd" ]; then + echo "FCVM ERROR: systemd not found" + SANITY_FAILED=1 +fi + +# Check resolv.conf exists +if [ ! 
-f "/newroot/etc/resolv.conf" ]; then + echo "FCVM ERROR: /etc/resolv.conf not found" + SANITY_FAILED=1 +fi + +if [ $SANITY_FAILED -ne 0 ]; then + echo "FCVM_SETUP_FAILED: Sanity checks failed" + mount -t proc proc /proc 2>/dev/null || true + echo o > /proc/sysrq-trigger 2>/dev/null || poweroff -f +fi + +echo "FCVM Layer 2 Setup: Sanity checks passed" + # Write marker file to rootfs (proves setup completed successfully) date -u '+%Y-%m-%dT%H:%M:%SZ' > /newroot/etc/fcvm-setup-complete echo "FCVM Layer 2 Setup: Wrote marker file /etc/fcvm-setup-complete" @@ -295,13 +327,18 @@ umount /newroot 2>/dev/null || umount -l /newroot 2>/dev/null || true echo "FCVM_SETUP_COMPLETE" echo "FCVM Layer 2 Setup: Complete! Powering off..." -# Use sysrq trigger for reliable shutdown (poweroff may not work in minimal initrd) +# Re-mount /proc in case bind unmount affected it, then use sysrq for reliable shutdown +mount -t proc proc /proc 2>/dev/null || true echo 1 > /proc/sys/kernel/sysrq 2>/dev/null || true echo o > /proc/sysrq-trigger 2>/dev/null || true -# Fallback: try poweroff if sysrq didn't work +# Fallback methods if sysrq didn't work sleep 1 -poweroff -f +reboot -f 2>/dev/null || true +poweroff -f 2>/dev/null || true + +# Last resort: halt via kernel +echo b > /proc/sysrq-trigger 2>/dev/null || true "#, install_script, setup_script ) @@ -1813,24 +1850,13 @@ async fn boot_vm_for_setup(disk_path: &Path, initrd_path: &Path) -> Result<()> { bail!("Layer 2 setup failed (no FCVM_SETUP_COMPLETE marker found)"); } - // Verify marker file exists in the rootfs - let mount_dir = temp_dir.join("verify-mount"); - tokio::fs::create_dir_all(&mount_dir).await?; - let mount_output = Command::new("mount") - .args(["-o", "ro", path_to_str(disk_path)?, path_to_str(&mount_dir)?]) + // Verify marker file exists in the rootfs using debugfs (no root needed) + let debugfs_output = Command::new("debugfs") + .args(["-R", "stat /etc/fcvm-setup-complete", path_to_str(disk_path)?]) .output() .await?; - if 
!mount_output.status.success() { - let _ = tokio::fs::remove_dir_all(&temp_dir).await; - bail!( - "Failed to mount rootfs for verification: {}", - String::from_utf8_lossy(&mount_output.stderr) - ); - } - let marker_path = mount_dir.join("etc/fcvm-setup-complete"); - let marker_exists = marker_path.exists(); - // Unmount before checking result - let _ = Command::new("umount").arg(&mount_dir).output().await; + let marker_exists = debugfs_output.status.success() + && !String::from_utf8_lossy(&debugfs_output.stdout).contains("not found"); if !marker_exists { warn!("Setup failed! Serial console output:\n{}", serial_content); let _ = tokio::fs::remove_dir_all(&temp_dir).await; From 62aeb8cc9e75cae2518e882ca5fa1865f5b38d45 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 19:14:20 +0000 Subject: [PATCH 10/23] CI: Run setup inside container, add sanity checks - Add container-setup-fcvm target that runs setup inside the container (container already has Firecracker, qemu-utils, etc.) - Update container-test-fast/all to depend on container-setup-fcvm - Add fdisk package to Containerfile (provides sfdisk for partition info) - Use debugfs instead of mount for marker file verification (no root needed) - Add sanity checks before writing marker file: - Verify podman, crun, skopeo binaries exist - Verify systemd exists - Verify /etc/resolv.conf exists - Improved VM shutdown with /proc re-mount and multiple fallbacks - Fix cargo fmt issues --- Containerfile | 2 +- Makefile | 6 +++--- fuse-pipe/src/server/passthrough.rs | 11 +++++++++-- fuse-pipe/tests/common/mod.rs | 5 ++++- fuse-pipe/tests/integration.rs | 8 +++++++- src/setup/rootfs.rs | 11 +++++++++-- 6 files changed, 33 insertions(+), 10 deletions(-) diff --git a/Containerfile b/Containerfile index dbeb849a..5e854f90 100644 --- a/Containerfile +++ b/Containerfile @@ -15,7 +15,7 @@ RUN cargo install cargo-nextest cargo-audit cargo-deny --locked RUN apt-get update && apt-get install -y \ fuse3 libfuse3-dev autoconf 
automake libtool perl libclang-dev clang \ musl-tools iproute2 iptables slirp4netns dnsmasq qemu-utils e2fsprogs \ - parted podman skopeo git curl sudo procps zstd busybox-static cpio uidmap \ + parted fdisk podman skopeo git curl sudo procps zstd busybox-static cpio uidmap \ && rm -rf /var/lib/apt/lists/* # Install Firecracker diff --git a/Makefile b/Makefile index 68f5c862..48c7195d 100644 --- a/Makefile +++ b/Makefile @@ -84,11 +84,11 @@ container-test-unit: container-build @echo "==> Running unit tests in container..." $(CONTAINER_RUN) $(CONTAINER_TAG) make build _test-unit -container-test-fast: setup-fcvm container-build +container-test-fast: container-setup-fcvm @echo "==> Running fast tests in container..." $(CONTAINER_RUN) $(CONTAINER_TAG) make _test-fast -container-test-all: setup-fcvm container-build +container-test-all: container-setup-fcvm @echo "==> Running all tests in container..." $(CONTAINER_RUN) $(CONTAINER_TAG) make _test-all @@ -138,7 +138,7 @@ setup-fcvm: build setup-btrfs # Run setup inside container (for CI - container has Firecracker) container-setup-fcvm: container-build setup-btrfs @echo "==> Running fcvm setup in container..." - $(CONTAINER_RUN) $(CONTAINER_TAG) make _setup-fcvm + $(CONTAINER_RUN) $(CONTAINER_TAG) make build _setup-fcvm _setup-fcvm: @FREE_GB=$$(df -BG /mnt/fcvm-btrfs 2>/dev/null | awk 'NR==2 {gsub("G",""); print $$4}'); \ diff --git a/fuse-pipe/src/server/passthrough.rs b/fuse-pipe/src/server/passthrough.rs index 335238ed..90d09d0a 100644 --- a/fuse-pipe/src/server/passthrough.rs +++ b/fuse-pipe/src/server/passthrough.rs @@ -1355,7 +1355,10 @@ mod tests { }; // Create hardlink - eprintln!("Calling link(source_ino={}, parent=1, name='link.txt')...", source_ino); + eprintln!( + "Calling link(source_ino={}, parent=1, name='link.txt')...", + source_ino + ); let resp = fs.link(source_ino, 1, "link.txt", uid, gid, 0); let link_ino = match resp { VolumeResponse::Entry { attr, .. 
} => { @@ -1369,7 +1372,11 @@ mod tests { let src_path = dir.path().join("source.txt"); let link_path = dir.path().join("link.txt"); eprintln!("=== link() FAILED ==="); - eprintln!("errno: {} ({})", errno, std::io::Error::from_raw_os_error(errno)); + eprintln!( + "errno: {} ({})", + errno, + std::io::Error::from_raw_os_error(errno) + ); eprintln!("source.txt exists: {}", src_path.exists()); eprintln!("link.txt exists: {}", link_path.exists()); eprintln!( diff --git a/fuse-pipe/tests/common/mod.rs b/fuse-pipe/tests/common/mod.rs index e7478a09..9d3118e4 100644 --- a/fuse-pipe/tests/common/mod.rs +++ b/fuse-pipe/tests/common/mod.rs @@ -365,7 +365,10 @@ pub fn supports_at_empty_path(dir: &Path) -> bool { eprintln!("AT_EMPTY_PATH: supported"); } else { let err = std::io::Error::last_os_error(); - eprintln!("AT_EMPTY_PATH: not supported ({}) - skipping hardlink test", err); + eprintln!( + "AT_EMPTY_PATH: not supported ({}) - skipping hardlink test", + err + ); } supported } diff --git a/fuse-pipe/tests/integration.rs b/fuse-pipe/tests/integration.rs index 641b1109..0f8c25d1 100644 --- a/fuse-pipe/tests/integration.rs +++ b/fuse-pipe/tests/integration.rs @@ -210,7 +210,13 @@ fn test_hardlink_survives_source_removal() { eprintln!("=== Hardlink failed ==="); eprintln!("source: {:?} exists={}", source, source.exists()); eprintln!("link: {:?}", link); - eprintln!("mount contents: {:?}", fs::read_dir(mount).ok().map(|d| d.filter_map(|e| e.ok()).map(|e| e.file_name()).collect::<Vec<_>>())); + eprintln!( + "mount contents: {:?}", + fs::read_dir(mount).ok().map(|d| d + .filter_map(|e| e.ok()) + .map(|e| e.file_name()) + .collect::<Vec<_>>()) + ); panic!("create hardlink failed: {}", e); } diff --git a/src/setup/rootfs.rs b/src/setup/rootfs.rs index 02a7364e..28a2c770 100644 --- a/src/setup/rootfs.rs +++ b/src/setup/rootfs.rs @@ -1852,7 +1852,11 @@ async fn boot_vm_for_setup(disk_path: &Path, initrd_path: &Path) -> Result<()> { // Verify marker file exists in the rootfs using debugfs (no root 
needed) let debugfs_output = Command::new("debugfs") - .args(["-R", "stat /etc/fcvm-setup-complete", path_to_str(disk_path)?]) + .args([ + "-R", + "stat /etc/fcvm-setup-complete", + path_to_str(disk_path)?, + ]) .output() .await?; let marker_exists = debugfs_output.status.success() @@ -1877,7 +1881,10 @@ async fn boot_vm_for_setup(disk_path: &Path, initrd_path: &Path) -> Result<()> { Err(_) => { // Print serial log on timeout for debugging if let Ok(serial_content) = tokio::fs::read_to_string(&serial_path).await { - eprintln!("=== Layer 2 setup VM timed out! Serial console output: ===\n{}", serial_content); + eprintln!( + "=== Layer 2 setup VM timed out! Serial console output: ===\n{}", + serial_content + ); } if let Ok(log_content) = tokio::fs::read_to_string(&log_path).await { eprintln!("=== Firecracker log: ===\n{}", log_content); From 7e4fa8b9635b5129308c2a19967f1585974b7019 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 20:09:31 +0000 Subject: [PATCH 11/23] Fix podman-in-podman for rootless container setup Add --cgroups=disabled to inner podman run command when downloading packages. This allows package download to work inside rootless containers where cgroup creation is not permitted. 
The error was: "crun: create /sys/fs/cgroup/libpod_parent: Permission denied" Tested: make container-setup-fcvm (completes in ~1 min) --- src/setup/rootfs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/setup/rootfs.rs b/src/setup/rootfs.rs index 28a2c770..a1197ed5 100644 --- a/src/setup/rootfs.rs +++ b/src/setup/rootfs.rs @@ -175,7 +175,7 @@ set -euo pipefail CONTAINER_IMAGE="ubuntu:{codename}" PACKAGES="{packages}" -podman run --rm -v "$PACKAGES_DIR:/packages" "$CONTAINER_IMAGE" bash -c ' +podman run --rm --cgroups=disabled -v "$PACKAGES_DIR:/packages" "$CONTAINER_IMAGE" bash -c ' set -euo pipefail apt-get update -qq # Use apt-get install --download-only to properly resolve dependencies From 80590980f28927bac183cf68670b89fbfeef47b8 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 20:22:13 +0000 Subject: [PATCH 12/23] CI: Add Rust cache for faster builds --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 33182b0d..15a5604c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,6 +41,10 @@ jobs: run: | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y echo "$HOME/.cargo/bin" >> $GITHUB_PATH + - uses: Swatinem/rust-cache@v2 + with: + cache-provider: buildjet + workspaces: fcvm -> target - name: Install dependencies run: | sudo apt-get update From b30f67aeb85d362b4fd65decb82e445df301b995 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 20:23:54 +0000 Subject: [PATCH 13/23] CI: Add cargo cache for container builds - Add CARGO_CACHE_DIR variable to Makefile for mounting cache volumes - Add actions/cache step to cache cargo registry and target between runs - Mount cache into container for faster rebuilds This caches both the cargo registry and target directory, so subsequent runs skip downloading crates and recompiling unchanged dependencies. 
--- .github/workflows/ci.yml | 14 ++++++++++++++ Makefile | 12 ++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 15a5604c..60861cc3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -114,12 +114,26 @@ jobs: # Configure rootless podman to use cgroupfs (no systemd session on CI) mkdir -p ~/.config/containers printf '[engine]\ncgroup_manager = "cgroupfs"\nevents_logger = "file"\n' > ~/.config/containers/containers.conf + # Create cargo cache directory for container + mkdir -p ${{ github.workspace }}/cargo-cache/registry ${{ github.workspace }}/cargo-cache/target + - name: Cache container cargo + uses: actions/cache@v4 + with: + path: ${{ github.workspace }}/cargo-cache + key: container-cargo-${{ hashFiles('fcvm/Cargo.lock') }} + restore-keys: container-cargo- - name: container-test-unit + env: + CARGO_CACHE_DIR: ${{ github.workspace }}/cargo-cache working-directory: fcvm run: make container-test-unit - name: container-setup-fcvm + env: + CARGO_CACHE_DIR: ${{ github.workspace }}/cargo-cache working-directory: fcvm run: make container-setup-fcvm - name: container-test + env: + CARGO_CACHE_DIR: ${{ github.workspace }}/cargo-cache working-directory: fcvm run: make container-test diff --git a/Makefile b/Makefile index 48c7195d..27eb0edd 100644 --- a/Makefile +++ b/Makefile @@ -30,11 +30,19 @@ endif # Base test command NEXTEST := CARGO_TARGET_DIR=target cargo nextest $(NEXTEST_CMD) --release -# Container run command (runs as testuser via Containerfile USER directive) +# Optional cargo cache directory (for CI caching) +CARGO_CACHE_DIR ?= +ifneq ($(CARGO_CACHE_DIR),) +CARGO_CACHE_MOUNT := -v $(CARGO_CACHE_DIR)/registry:/usr/local/cargo/registry -v $(CARGO_CACHE_DIR)/target:/workspace/fcvm/target +else +CARGO_CACHE_MOUNT := +endif + +# Container run command CONTAINER_RUN := podman run --rm --privileged \ -v .:/workspace/fcvm -v $(FUSE_BACKEND_RS):/workspace/fuse-backend-rs 
-v $(FUSER):/workspace/fuser \ --device /dev/fuse --device /dev/kvm \ - --ulimit nofile=65536:65536 --pids-limit=65536 -v /mnt/fcvm-btrfs:/mnt/fcvm-btrfs + --ulimit nofile=65536:65536 --pids-limit=65536 -v /mnt/fcvm-btrfs:/mnt/fcvm-btrfs $(CARGO_CACHE_MOUNT) .PHONY: all help build clean test test-unit test-fast test-all test-root \ _test-unit _test-fast _test-all _test-root \ From 676388d0eba312da02148fae911026750ab47ef2 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 20:34:38 +0000 Subject: [PATCH 14/23] Fix: Add --cgroups=disabled to actual podman command The previous fix only updated the hash function, not the actual Command that executes podman. This adds --cgroups=disabled to the real download command at line 1552. --- src/setup/rootfs.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/setup/rootfs.rs b/src/setup/rootfs.rs index a1197ed5..4ecb982b 100644 --- a/src/setup/rootfs.rs +++ b/src/setup/rootfs.rs @@ -1553,6 +1553,7 @@ cp /var/cache/apt/archives/*.deb /packages/ 2>/dev/null || true .args([ "run", "--rm", + "--cgroups=disabled", "-v", &format!("{}:/packages", packages_dir.display()), &container_image, From b27bb306c122b65afc0c0b5baee3b2a58d7201f7 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 20:36:15 +0000 Subject: [PATCH 15/23] Refactor: Use single source for download script Remove duplicate script definition - now generate_download_script() is used for both hashing AND execution. This prevents the bug where the hash version had --cgroups=disabled but the execution version didn't. --- src/setup/rootfs.rs | 54 +++++++++------------------------------------ 1 file changed, 11 insertions(+), 43 deletions(-) diff --git a/src/setup/rootfs.rs b/src/setup/rootfs.rs index 4ecb982b..c9550970 100644 --- a/src/setup/rootfs.rs +++ b/src/setup/rootfs.rs @@ -159,29 +159,23 @@ echo 'FCVM: Packages installed successfully' .to_string() } -/// Generate the script used to download packages via podman. 
-/// This script is included in the hash to ensure cache invalidation -/// when the download method or target Ubuntu version changes. +/// Generate the bash script that runs INSIDE the ubuntu container to download packages. +/// This script is included in the hash to ensure cache invalidation when the +/// download method or package list changes. The same script is used for execution +/// in download_packages(). pub fn generate_download_script(plan: &Plan) -> String { let packages = plan.packages.all_packages(); let packages_str = packages.join(" "); let codename = &plan.base.codename; + // This is the script that runs inside the ubuntu container + // Format: codename is used for the container image, packages for apt-get format!( - r#"#!/bin/bash -# Download packages for Ubuntu {codename} using podman -# This script is hashed to invalidate cache when download method changes -set -euo pipefail -CONTAINER_IMAGE="ubuntu:{codename}" -PACKAGES="{packages}" - -podman run --rm --cgroups=disabled -v "$PACKAGES_DIR:/packages" "$CONTAINER_IMAGE" bash -c ' + r#"# Download packages for Ubuntu {codename} set -euo pipefail apt-get update -qq -# Use apt-get install --download-only to properly resolve dependencies -apt-get install --download-only --yes --no-install-recommends '"$PACKAGES"' +apt-get install --download-only --yes --no-install-recommends {packages} cp /var/cache/apt/archives/*.deb /packages/ 2>/dev/null || true -' "#, codename = codename, packages = packages_str @@ -1515,39 +1509,13 @@ async fn download_packages(plan: &Plan, script_sha_short: &str) -> Result/dev/null || true -"#, - packages = packages_str - ); + // Use the same script that's included in the hash + let download_script = generate_download_script(plan); let output = Command::new("podman") .args([ From 92902559425ed77fdd9fa44a5914e73cd04b4da7 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 20:56:32 +0000 Subject: [PATCH 16/23] Separate lint tests from integration tests Add lint-tests feature 
to gate fmt/clippy/audit/deny tests. These were causing test-fast to fail due to corrupt cargo-audit DB. Now run lint explicitly with: make lint --- Cargo.toml | 3 +++ Makefile | 2 +- tests/lint.rs | 4 +++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b9a664ad..0df6e800 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,6 +56,9 @@ integration-slow = [] # Slow VM tests, > 30s each (clone, snapshot, fuse # Privileged tests require sudo (bridged networking, pjdfstest, iptables) privileged-tests = [] +# Lint tests (fmt, clippy, audit, deny) - run separately via: make lint +lint-tests = [] + [dev-dependencies] serial_test = "3" criterion = "0.5" diff --git a/Makefile b/Makefile index 27eb0edd..e87192b4 100644 --- a/Makefile +++ b/Makefile @@ -163,7 +163,7 @@ bench: build cargo bench -p fuse-pipe --bench protocol lint: - cargo test --test lint + cargo test --test lint --features lint-tests fmt: cargo fmt diff --git a/tests/lint.rs b/tests/lint.rs index 223092df..11889e9c 100644 --- a/tests/lint.rs +++ b/tests/lint.rs @@ -1,6 +1,8 @@ //! Lint tests - run fmt, clippy, audit, deny in parallel via cargo test. +//! Gated by "lint-tests" feature so they don't run during VM tests. +//! Run explicitly with: make lint -#![cfg(feature = "integration-fast")] +#![cfg(feature = "lint-tests")] use std::process::Command; From f0b9cec3a1057e1d20ef836cee890bb9a671bad4 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 20:58:36 +0000 Subject: [PATCH 17/23] CI: Install cargo-audit/deny for CVSS 4.0 support The Host job was missing cargo-audit and cargo-deny, causing lint tests to fail with 'unsupported CVSS version: 4.0' from the RustSec DB. Added cargo install for both tools alongside cargo-nextest. 
--- .github/workflows/ci.yml | 4 ++-- Cargo.toml | 3 --- Makefile | 2 +- tests/lint.rs | 4 +--- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 60861cc3..0702c91a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -60,8 +60,8 @@ jobs: release-v1.14.0-x86_64/jailer-v1.14.0-x86_64 sudo mv /usr/local/bin/firecracker-v1.14.0-x86_64 /usr/local/bin/firecracker sudo mv /usr/local/bin/jailer-v1.14.0-x86_64 /usr/local/bin/jailer - - name: Install cargo-nextest - run: cargo install cargo-nextest --locked + - name: Install cargo tools + run: cargo install cargo-nextest cargo-audit cargo-deny --locked - name: Setup KVM and networking run: | sudo chmod 666 /dev/kvm diff --git a/Cargo.toml b/Cargo.toml index 0df6e800..b9a664ad 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,9 +56,6 @@ integration-slow = [] # Slow VM tests, > 30s each (clone, snapshot, fuse # Privileged tests require sudo (bridged networking, pjdfstest, iptables) privileged-tests = [] -# Lint tests (fmt, clippy, audit, deny) - run separately via: make lint -lint-tests = [] - [dev-dependencies] serial_test = "3" criterion = "0.5" diff --git a/Makefile b/Makefile index e87192b4..27eb0edd 100644 --- a/Makefile +++ b/Makefile @@ -163,7 +163,7 @@ bench: build cargo bench -p fuse-pipe --bench protocol lint: - cargo test --test lint --features lint-tests + cargo test --test lint fmt: cargo fmt diff --git a/tests/lint.rs b/tests/lint.rs index 11889e9c..223092df 100644 --- a/tests/lint.rs +++ b/tests/lint.rs @@ -1,8 +1,6 @@ //! Lint tests - run fmt, clippy, audit, deny in parallel via cargo test. -//! Gated by "lint-tests" feature so they don't run during VM tests. -//! 
Run explicitly with: make lint -#![cfg(feature = "lint-tests")] +#![cfg(feature = "integration-fast")] use std::process::Command; From d98ed1da11b20272efb45f75c0c17c1f8fdc9197 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 20:59:01 +0000 Subject: [PATCH 18/23] docs: Add NO HACKS policy to CLAUDE.md --- .claude/CLAUDE.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index ab17ede9..407723d0 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -1,5 +1,18 @@ # fcvm Development Log +## NO HACKS + +**Fix the root cause, not the symptom.** When something fails: +1. Understand WHY it's failing +2. Fix the actual problem +3. Don't hide errors, disable tests, or add workarounds + +Examples of hacks to avoid: +- Gating tests behind feature flags to skip failures +- Adding sleeps or retries without understanding the race +- Clearing caches instead of updating tools +- Using `|| true` to ignore errors + ## Overview fcvm is a Firecracker VM manager for running Podman containers in lightweight microVMs. This document tracks implementation findings and decisions. 
From a83bdbe0c1337ab2ab4ae6592bf9d839369b6a6d Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 21:04:29 +0000 Subject: [PATCH 19/23] CI: Add shared-key to rust-cache for cache reuse --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0702c91a..32b76773 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,6 +45,7 @@ jobs: with: cache-provider: buildjet workspaces: fcvm -> target + shared-key: build-and-test - name: Install dependencies run: | sudo apt-get update From d2b678a332675ce6be97a841166b6c5957d4a1c6 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 21:09:02 +0000 Subject: [PATCH 20/23] CI: Save rust cache even on failure --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 32b76773..fc426d87 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,7 +45,7 @@ jobs: with: cache-provider: buildjet workspaces: fcvm -> target - shared-key: build-and-test + cache-on-failure: "true" - name: Install dependencies run: | sudo apt-get update From 8e714fdbd9e9ae493e896e541974f3bebe69a65a Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 21:47:20 +0000 Subject: [PATCH 21/23] Fix disk space exhaustion in CI snapshot tests Root cause: 15 snapshot tests running in parallel, each creating a 5.6GB snapshot (2GB memory + 3.6GB disk). With 20GB btrfs, only ~3 tests fit. Changes: - Increase btrfs loopback from 20G to 60G - Add snapshot-tests group with max-threads=3 in nextest.toml - Assign snapshot/clone tests to this group This limits concurrent snapshots to ~17GB disk usage, well under the 60GB limit. Belt and suspenders approach ensures CI stability. 
--- .config/nextest.toml | 14 ++++++++++++-- Makefile | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.config/nextest.toml b/.config/nextest.toml index 755d4a35..4700846f 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -42,6 +42,10 @@ retries = 0 [test-groups.stress-tests] max-threads = 1 +# Snapshot tests limited to 3 concurrent (each snapshot is ~5.6GB on disk) +[test-groups.snapshot-tests] +max-threads = 3 + # VM tests run at full parallelism (num-cpus) [test-groups.vm-tests] max-threads = "num-cpus" @@ -51,9 +55,15 @@ filter = "package(fcvm) & test(/stress_100/)" test-group = "stress-tests" slow-timeout = { period = "600s", terminate-after = 1 } -# VM tests get 10 minute timeout +# Snapshot tests: limited to 3 concurrent (each creates ~5.6GB snapshot on disk) +[[profile.default.overrides]] +filter = "package(fcvm) & (test(/snapshot/) | test(/clone/))" +test-group = "snapshot-tests" +slow-timeout = { period = "600s", terminate-after = 1 } + +# VM tests get 10 minute timeout (non-snapshot tests) [[profile.default.overrides]] -filter = "package(fcvm) & test(/test_/) & !test(/stress_100/) & !test(/pjdfstest_vm/)" +filter = "package(fcvm) & test(/test_/) & !test(/stress_100/) & !test(/pjdfstest_vm/) & !test(/snapshot/) & !test(/clone/)" test-group = "vm-tests" slow-timeout = { period = "600s", terminate-after = 1 } diff --git a/Makefile b/Makefile index 27eb0edd..af3817d0 100644 --- a/Makefile +++ b/Makefile @@ -125,7 +125,7 @@ setup-btrfs: @if ! mountpoint -q /mnt/fcvm-btrfs 2>/dev/null; then \ echo '==> Creating btrfs loopback...'; \ if [ ! 
-f /var/fcvm-btrfs.img ]; then \ - sudo truncate -s 20G /var/fcvm-btrfs.img && sudo mkfs.btrfs /var/fcvm-btrfs.img; \ + sudo truncate -s 60G /var/fcvm-btrfs.img && sudo mkfs.btrfs /var/fcvm-btrfs.img; \ fi && \ sudo mkdir -p /mnt/fcvm-btrfs && \ sudo mount -o loop /var/fcvm-btrfs.img /mnt/fcvm-btrfs && \ From 0cad682ac02a5370f348d3ce4889be3ecb3f6f41 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 21:55:00 +0000 Subject: [PATCH 22/23] CI: Enable userfaultfd in Container job Container tests were failing with "userfaultfd access check failed" because the Container job wasn't setting vm.unprivileged_userfaultfd=1. The Host job already had this, but Container was missing it. Containers inherit host sysctl settings, so setting it on the host before running podman allows snapshot cloning to work inside the container. --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fc426d87..a32e9634 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -112,6 +112,8 @@ jobs: - name: Setup KVM and rootless podman run: | sudo chmod 666 /dev/kvm + # Enable userfaultfd for snapshot cloning (container inherits host sysctl) + sudo sysctl -w vm.unprivileged_userfaultfd=1 # Configure rootless podman to use cgroupfs (no systemd session on CI) mkdir -p ~/.config/containers printf '[engine]\ncgroup_manager = "cgroupfs"\nevents_logger = "file"\n' > ~/.config/containers/containers.conf From 39512ce20fb5bed48332a27f3710862605b25bfc Mon Sep 17 00:00:00 2001 From: ejc3 Date: Thu, 25 Dec 2025 22:07:59 +0000 Subject: [PATCH 23/23] CI: Create and pass /dev/userfaultfd to container Snapshot cloning requires /dev/userfaultfd device, not just the sysctl. 
- Create device with mknod in CI setup - Pass device to container via --device flag --- .github/workflows/ci.yml | 6 +++++- Makefile | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a32e9634..d7fe9247 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -112,7 +112,11 @@ jobs: - name: Setup KVM and rootless podman run: | sudo chmod 666 /dev/kvm - # Enable userfaultfd for snapshot cloning (container inherits host sysctl) + # Create userfaultfd device for snapshot cloning + if [ ! -e /dev/userfaultfd ]; then + sudo mknod /dev/userfaultfd c 10 126 + fi + sudo chmod 666 /dev/userfaultfd sudo sysctl -w vm.unprivileged_userfaultfd=1 # Configure rootless podman to use cgroupfs (no systemd session on CI) mkdir -p ~/.config/containers diff --git a/Makefile b/Makefile index af3817d0..e6af819a 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,7 @@ endif # Container run command CONTAINER_RUN := podman run --rm --privileged \ -v .:/workspace/fcvm -v $(FUSE_BACKEND_RS):/workspace/fuse-backend-rs -v $(FUSER):/workspace/fuser \ - --device /dev/fuse --device /dev/kvm \ + --device /dev/fuse --device /dev/kvm --device /dev/userfaultfd \ --ulimit nofile=65536:65536 --pids-limit=65536 -v /mnt/fcvm-btrfs:/mnt/fcvm-btrfs $(CARGO_CACHE_MOUNT) .PHONY: all help build clean test test-unit test-fast test-all test-root \