diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f7d9d501..84ef3a94 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,42 +10,9 @@ env: CARGO_TERM_COLOR: always jobs: - # Fast jobs run in parallel on every PR and push - - lint: - name: Lint - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - path: fcvm - - uses: actions/checkout@v4 - with: - repository: ejc3/fuse-backend-rs - ref: master - path: fuse-backend-rs - - uses: actions/checkout@v4 - with: - repository: ejc3/fuser - ref: master - path: fuser - - uses: dtolnay/rust-toolchain@stable - with: - components: clippy, rustfmt - - name: Install cargo-machete - run: cargo install cargo-machete - - name: Check formatting - working-directory: fcvm - run: cargo fmt --all -- --check - - name: Clippy - working-directory: fcvm - run: cargo clippy --all-targets --all-features -- -D warnings - - name: Check unused dependencies - working-directory: fcvm - run: cargo machete - + # Build inside container, upload artifacts for parallel test jobs build: - name: Build + name: Build [container/ubuntu-latest] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -61,16 +28,29 @@ jobs: repository: ejc3/fuser ref: master path: fuser - - uses: dtolnay/rust-toolchain@stable - - uses: Swatinem/rust-cache@v2 - with: - workspaces: fcvm - - name: Build + - name: Build inside container working-directory: fcvm - run: cargo build --release --all-targets + run: | + export FUSE_BACKEND_RS=${{ github.workspace }}/fuse-backend-rs + export FUSER=${{ github.workspace }}/fuser + export CONTAINER_ARCH=x86_64 + export CI=1 + make container-build-only + - name: Upload build artifacts + uses: actions/upload-artifact@v4 + with: + name: container-build + path: | + fcvm/target/release + !fcvm/target/release/.fingerprint + !fcvm/target/release/build + !fcvm/target/release/deps + !fcvm/target/release/incremental + retention-days: 1 - test-unit: - name: Unit Tests + # Lint runs in 
parallel with build (just needs source) + lint: + name: Lint (fmt+clippy+machete) [host/ubuntu-latest] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -87,68 +67,26 @@ jobs: ref: master path: fuser - uses: dtolnay/rust-toolchain@stable - - uses: Swatinem/rust-cache@v2 - with: - workspaces: fcvm - - name: Run unit tests - working-directory: fcvm - run: cargo test --release --lib --all - - test-fuse-integration: - name: FUSE Integration - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 with: - path: fcvm - - uses: actions/checkout@v4 - with: - repository: ejc3/fuse-backend-rs - ref: master - path: fuse-backend-rs - - uses: actions/checkout@v4 - with: - repository: ejc3/fuser - ref: master - path: fuser - - uses: dtolnay/rust-toolchain@stable + components: clippy, rustfmt - uses: Swatinem/rust-cache@v2 with: workspaces: fcvm - - name: Build + - name: Check formatting working-directory: fcvm - run: cargo build --release -p fuse-pipe - - name: Run integration_root tests + run: cargo fmt --all -- --check + - name: Clippy working-directory: fcvm - run: sudo -E env "PATH=$PATH" cargo test --release -p fuse-pipe --test integration_root -- --test-threads=1 - - test-fuse-noroot: - name: FUSE No-Root - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - path: fcvm - - uses: actions/checkout@v4 - with: - repository: ejc3/fuse-backend-rs - ref: master - path: fuse-backend-rs - - uses: actions/checkout@v4 - with: - repository: ejc3/fuser - ref: master - path: fuser - - name: Run no-root FUSE tests (container) + run: cargo clippy --all-targets --all-features -- -D warnings + - name: Install cargo-machete + run: cargo install cargo-machete + - name: Check unused dependencies working-directory: fcvm - run: | - export FUSE_BACKEND_RS=${{ github.workspace }}/fuse-backend-rs - export FUSER=${{ github.workspace }}/fuser - export CONTAINER_ARCH=x86_64 - make container-test-noroot + run: cargo machete - test-cli: - name: CLI Tests + # Native 
tests use rust-cache (compiles incrementally) + test-native: + name: Unit+CLI+FUSE-root [host/ubuntu-latest] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -168,12 +106,20 @@ jobs: - uses: Swatinem/rust-cache@v2 with: workspaces: fcvm - - name: Run CLI tests + - name: Unit tests + working-directory: fcvm + run: cargo test --release --lib --all + - name: CLI tests working-directory: fcvm run: cargo test --release --test test_cli_parsing --test test_state_manager + - name: FUSE integration tests (root) + working-directory: fcvm + run: sudo -E env "PATH=$PATH" cargo test --release -p fuse-pipe --test integration_root -- --test-threads=1 - test-fuse-permissions: - name: FUSE Permissions + # Container FUSE tests - download pre-built artifacts + fuse-tests: + name: FUSE (noroot+root) [container/ubuntu-latest] + needs: build runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -189,16 +135,25 @@ jobs: repository: ejc3/fuser ref: master path: fuser - - name: Run permission tests (container) + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + name: container-build + path: fcvm/target/release + - name: Run FUSE tests (container, no rebuild) working-directory: fcvm run: | export FUSE_BACKEND_RS=${{ github.workspace }}/fuse-backend-rs export FUSER=${{ github.workspace }}/fuser export CONTAINER_ARCH=x86_64 - make container-test-root + export CI=1 + mkdir -p cargo-home + make container-test - test-pjdfstest: - name: POSIX Compliance + # POSIX compliance - download pre-built artifacts + posix-compliance: + name: POSIX (pjdfstest 8789) [container/ubuntu-latest] + needs: build runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -214,66 +169,25 @@ jobs: repository: ejc3/fuser ref: master path: fuser - - name: Run pjdfstest (container) - working-directory: fcvm - run: | - export FUSE_BACKEND_RS=${{ github.workspace }}/fuse-backend-rs - export FUSER=${{ github.workspace }}/fuser - export CONTAINER_ARCH=x86_64 - make 
container-test-pjdfstest - - test-vm-sanity: - name: VM Sanity - runs-on: buildjet-32vcpu-ubuntu-2204 - steps: - - uses: actions/checkout@v4 - with: - path: fcvm - - uses: actions/checkout@v4 + - name: Download build artifacts + uses: actions/download-artifact@v4 with: - repository: ejc3/fuse-backend-rs - ref: master - path: fuse-backend-rs - - uses: actions/checkout@v4 - with: - repository: ejc3/fuser - ref: master - path: fuser - - name: Check KVM availability - run: | - echo "=== KVM device ===" - ls -la /dev/kvm || echo "No /dev/kvm" - echo "=== CPU virtualization ===" - grep -E "(vmx|svm)" /proc/cpuinfo | head -1 || echo "No VMX/SVM" - echo "=== KVM modules ===" - lsmod | grep kvm || echo "No KVM modules" - - name: Setup KVM permissions - run: sudo chmod 666 /dev/kvm - - name: Setup NBD module for rootfs extraction - run: | - sudo modprobe nbd max_part=8 - ls -la /dev/nbd* | head -5 - - name: Setup network namespace directory - run: sudo mkdir -p /var/run/netns - - name: Setup iptables for VM networking - run: | - # BuildJet runners have FORWARD chain set to DROP by default - # Set to ACCEPT and add MASQUERADE rule for VM NAT - sudo iptables -P FORWARD ACCEPT - sudo iptables -t nat -A POSTROUTING -s 172.30.0.0/16 -o eth0 -j MASQUERADE || true - - name: Run VM sanity test (bridged) + name: container-build + path: fcvm/target/release + - name: Run pjdfstest (container, no rebuild) working-directory: fcvm run: | export FUSE_BACKEND_RS=${{ github.workspace }}/fuse-backend-rs export FUSER=${{ github.workspace }}/fuser export CONTAINER_ARCH=x86_64 - make container-test-vm-bridged + export CI=1 + mkdir -p cargo-home + make container-test-pjdfstest - test-vm-exec: - name: VM Exec + # VM tests on BuildJet - builds inside container (separate from ubuntu-latest) + vm-tests: + name: VM (bridged+rootless) [container/buildjet-32cpu] runs-on: buildjet-32vcpu-ubuntu-2204 - needs: test-vm-sanity # Sequential: flock doesn't work across podman containers sharing /dev/nbd0 - if: 
always() # Run even if previous job failed (rootfs will be cached after first success) steps: - uses: actions/checkout@v4 with: @@ -298,47 +212,17 @@ jobs: run: | sudo iptables -P FORWARD ACCEPT sudo iptables -t nat -A POSTROUTING -s 172.30.0.0/16 -o eth0 -j MASQUERADE || true - - name: Run VM exec tests - working-directory: fcvm + - name: Setup userfaultfd for snapshot cloning run: | - export FUSE_BACKEND_RS=${{ github.workspace }}/fuse-backend-rs - export FUSER=${{ github.workspace }}/fuser - export CONTAINER_ARCH=x86_64 - make container-test-vm-exec - - test-vm-egress: - name: VM Egress - runs-on: buildjet-32vcpu-ubuntu-2204 - needs: test-vm-exec # Sequential: flock doesn't work across podman containers sharing /dev/nbd0 - if: always() # Run even if previous job failed (rootfs will be cached after first success) - steps: - - uses: actions/checkout@v4 - with: - path: fcvm - - uses: actions/checkout@v4 - with: - repository: ejc3/fuse-backend-rs - ref: master - path: fuse-backend-rs - - uses: actions/checkout@v4 - with: - repository: ejc3/fuser - ref: master - path: fuser - - name: Setup KVM permissions - run: sudo chmod 666 /dev/kvm - - name: Setup NBD module - run: sudo modprobe nbd max_part=8 - - name: Setup network namespace directory - run: sudo mkdir -p /var/run/netns - - name: Setup iptables for VM networking - run: | - sudo iptables -P FORWARD ACCEPT - sudo iptables -t nat -A POSTROUTING -s 172.30.0.0/16 -o eth0 -j MASQUERADE || true - - name: Run VM egress tests + if [ ! 
-e /dev/userfaultfd ]; then + sudo mknod /dev/userfaultfd c 10 126 + fi + sudo chmod 666 /dev/userfaultfd + sudo sysctl -w vm.unprivileged_userfaultfd=1 + - name: Run all VM tests working-directory: fcvm run: | export FUSE_BACKEND_RS=${{ github.workspace }}/fuse-backend-rs export FUSER=${{ github.workspace }}/fuser export CONTAINER_ARCH=x86_64 - make container-test-vm-egress + make container-test-vm diff --git a/Cargo.lock b/Cargo.lock index 1fc5ce6f..d50c9806 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -175,6 +175,15 @@ version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "bumpalo" version = "3.19.0" @@ -347,6 +356,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + [[package]] name = "criterion" version = "0.5.1" @@ -423,6 +441,16 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -436,6 +464,16 @@ dependencies = [ "parking_lot_core", ] 
+[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "dirs" version = "6.0.0" @@ -537,6 +575,7 @@ dependencies = [ "clap", "criterion", "fuse-pipe", + "hex", "hyper 0.14.32", "hyperlocal", "libc", @@ -548,11 +587,13 @@ dependencies = [ "serde", "serde_json", "serial_test", + "sha2", "shell-words", "shellexpand", "tempfile", "tokio", "tokio-util", + "toml", "tracing", "tracing-subscriber", "url", @@ -737,6 +778,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.16" @@ -2051,6 +2102,15 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -2088,6 +2148,17 @@ dependencies = [ "syn", ] +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -2382,6 +2453,47 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + 
"toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + [[package]] name = "tower" version = "0.5.2" @@ -2507,6 +2619,12 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "typenum" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + [[package]] name = "unicode-ident" version = "1.0.22" @@ -2586,6 +2704,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "vm-memory" version = "0.14.1" @@ -3061,6 +3185,15 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winnow" +version = "0.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +dependencies = [ + "memchr", +] + [[package]] name = "winreg" version = "0.50.0" diff --git a/Cargo.toml b/Cargo.toml index 719410d6..be5d4880 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,9 @@ atty = "0.2" clap = { version = "4", features = ["derive", "env"] } serde = { version = "1", features = ["derive"] } serde_json = "1" +sha2 = "0.10" +hex = "0.4" +toml = "0.8" tokio = { version = "1", features = ["rt-multi-thread", "macros", "process", "fs", "signal", "io-util", "sync", "time"] } reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } which = "6" @@ -40,6 +43,11 @@ url = "2" tokio-util = "0.7" regex = "1.12.2" +[features] +# Test category - only gate tests that require sudo +# Unprivileged tests run by default (no feature flag needed) +privileged-tests = [] # Tests requiring sudo (iptables, root podman storage) + [dev-dependencies] serial_test = "3" criterion = "0.5" diff --git a/Containerfile b/Containerfile index 55513d45..424cfae2 100644 --- a/Containerfile +++ b/Containerfile @@ -50,6 +50,7 @@ RUN curl -L -o /tmp/firecracker.tgz \ https://github.com/firecracker-microvm/firecracker/releases/download/v1.14.0/firecracker-v1.14.0-${ARCH}.tgz \ && tar -xzf /tmp/firecracker.tgz -C /tmp \ && mv /tmp/release-v1.14.0-${ARCH}/firecracker-v1.14.0-${ARCH} /usr/local/bin/firecracker \ + && chown root:root /usr/local/bin/firecracker \ && chmod +x /usr/local/bin/firecracker \ && rm -rf /tmp/firecracker.tgz /tmp/release-v1.14.0-${ARCH} diff --git a/DESIGN.md b/DESIGN.md index f4869d4c..da566686 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -378,37 +378,89 @@ Each VM has: ## Networking -### Rootless Mode (slirp4netns) +### Rootless Mode (slirp4netns with Dual-TAP Architecture) + +**Key Insight**: slirp4netns and Firecracker CANNOT share a TAP device (both need exclusive access). 
+**Solution**: Use two TAP devices with IP forwarding between them inside a user namespace. **Topology**: ``` -┌─────────────┐ -│ Host Process│ -└──────┬──────┘ - │ - ├─── Firecracker VM (VM namespace) - │ └─── eth0: 10.0.2.15 - │ - └─── slirp4netns (User namespace) - └─── Provides NAT + port forwarding +Host │ User Namespace (unshare --user --map-root-user --net) + │ +slirp4netns <────────────┼── slirp0 (10.0.2.100/24) + (userspace NAT) │ │ + │ │ IP forwarding + iptables NAT + │ ▼ + │ tap0 (192.168.1.1/24) + │ │ + │ ▼ + │ Firecracker VM + │ eth0: 192.168.1.2 +``` + +**Setup Sequence** (3-phase with nsenter): +1. Spawn holder process: `unshare --user --map-root-user --net -- sleep infinity` +2. Run setup via nsenter: create TAPs, iptables, enable IP forwarding +3. Start slirp4netns attached to holder's namespace +4. Run Firecracker via nsenter: `nsenter -t HOLDER_PID -U -n -- firecracker ...` +5. Health checks via nsenter: `nsenter -t HOLDER_PID -U -n -- curl guest_ip:80` + +**Network Setup Script** (executed via nsenter): +```bash +# Create slirp0 TAP for slirp4netns connectivity +ip tuntap add slirp0 mode tap +ip addr add 10.0.2.100/24 dev slirp0 +ip link set slirp0 up +ip route add default via 10.0.2.2 dev slirp0 + +# Create tap0 for Firecracker (guest uses 192.168.1.2) +ip tuntap add tap0 mode tap +ip addr add 192.168.1.1/24 dev tap0 +ip link set tap0 up + +# Enable IP forwarding +echo 1 > /proc/sys/net/ipv4/ip_forward + +# Allow forwarding between slirp0 and FC TAP +iptables -A FORWARD -i slirp0 -o tap0 -j ACCEPT +iptables -A FORWARD -i tap0 -o slirp0 -j ACCEPT + +# NAT guest traffic (192.168.x.x) to slirp0's address (10.0.2.100) +iptables -t nat -A POSTROUTING -s 192.168.1.0/24 -o slirp0 -j MASQUERADE ``` -**Port Forwarding**: +**Port Forwarding** (unique loopback IPs): ```bash +# Each VM gets a unique loopback IP (127.x.y.z) for port forwarding +# No IP aliasing needed - Linux routes all 127.0.0.0/8 to loopback slirp4netns \ --configure \ --mtu=65520 \ - 
--port tcp:8080:80 \ - --port udp:53:53 \ - \ - tap0 + --api-socket /tmp/slirp-{vm_id}.sock \ + \ + slirp0 + +# Port forwarding via JSON-RPC API: +echo '{"execute":"add_hostfwd","arguments":{"proto":"tcp","host_addr":"127.0.0.2","host_port":8080,"guest_addr":"10.0.2.100","guest_port":8080}}' | nc -U /tmp/slirp-{vm_id}.sock +``` + +**Traffic Flow** (VM to Internet): +``` +Guest (192.168.1.2) → tap0 → iptables MASQUERADE → slirp0 (10.0.2.100) → slirp4netns → Host → Internet +``` + +**Traffic Flow** (Host to VM port forward): +``` +Host (127.0.0.2:8080) → slirp4netns → slirp0 (10.0.2.100:8080) → IP forward → tap0 → Guest (192.168.1.2:80) ``` **Characteristics**: -- No root required -- Slightly slower than native networking -- Works in nested VMs -- Fully compatible with rootless Podman +- No root required (runs entirely in user namespace) +- Isolated 192.168.1.0/24 subnet per VM (no conflicts) +- Unique loopback IP per VM enables same port on multiple VMs +- Slightly slower than bridged (~10-20% overhead) +- Works in nested VMs and restricted environments +- Fully compatible with rootless Podman in guest ### Privileged Mode (nftables + bridge) @@ -1326,6 +1378,28 @@ RUST_LOG=trace fcvm run nginx:latest ## Testing Strategy +### Test Infrastructure + +**Network Mode Guards**: The fcvm binary enforces proper network mode usage: +- **Bridged without root**: Fails with helpful error message suggesting `sudo` or `--network rootless` +- **Rootless with root**: Runs but prints warning that bridged would be faster + +**Test Isolation**: All tests use unique resource names to enable parallel execution: +- `unique_names()` helper generates timestamp+counter-based names +- PID-based naming for additional uniqueness +- Automatic cleanup on test exit + +**Dynamic NBD Device Selection**: When creating rootfs (extracting qcow2 images): +- Scans `/dev/nbd0` through `/dev/nbd15` to find a free device +- Checks `/sys/block/nbdN/pid` to detect in-use devices +- Includes retry logic for 
race conditions during parallel execution + +**Root/Rootless Test Organization**: +- Rootless tests: Use `require_non_root()` guard, fail loudly if run as root +- Bridged tests: Rely on fcvm binary's built-in check +- Makefile targets: Split by network mode (`test-vm-exec-bridged`/`test-vm-exec-rootless`) +- Container tests: Use appropriate container run configurations (CONTAINER_RUN_FCVM vs CONTAINER_RUN_ROOTLESS) + ### Unit Tests Test individual components in isolation: @@ -1541,6 +1615,6 @@ kill $CLONE_PID $SERVE_PID $BASELINE_PID **End of Design Specification** -*Version: 2.0* -*Date: 2025-12-14* +*Version: 2.1* +*Date: 2025-12-21* *Author: fcvm project* diff --git a/Makefile b/Makefile index e7bec4aa..817e1c1a 100644 --- a/Makefile +++ b/Makefile @@ -21,8 +21,12 @@ TEST_FUSE_ROOT := cargo test --release -p fuse-pipe --test integration_root TEST_FUSE_PERMISSION := cargo test --release -p fuse-pipe --test test_permission_edge_cases TEST_PJDFSTEST := cargo test --release -p fuse-pipe --test pjdfstest_full -- --nocapture TEST_VM_BRIDGED := sh -c "cargo build --release && cargo test --release --test test_sanity test_sanity_bridged -- --nocapture" -TEST_VM_EXEC := sh -c "cargo build --release && cargo test --release --test test_exec -- --nocapture --test-threads=1" -TEST_VM_EGRESS := sh -c "cargo build --release && cargo test --release --test test_egress -- --nocapture --test-threads=1" +TEST_VM_EXEC_BRIDGED := sh -c "cargo build --release && cargo test --release --test test_exec test_exec_bridged -- --nocapture" +TEST_VM_EGRESS_BRIDGED := sh -c "cargo build --release && cargo test --release --test test_egress bridged -- --nocapture" + +# No root required (rootless networking): +TEST_VM_EXEC_ROOTLESS := sh -c "cargo build --release && cargo test --release --test test_exec test_exec_rootless -- --nocapture" +TEST_VM_EGRESS_ROOTLESS := sh -c "cargo build --release && cargo test --release --test test_egress rootless -- --nocapture" # Legacy alias TEST_VM := cargo test 
--release --test test_sanity -- --nocapture @@ -37,11 +41,15 @@ BENCH_EXEC := cargo bench --bench exec .PHONY: all help build clean \ test test-noroot test-root test-unit test-fuse test-vm test-vm-rootless test-vm-bridged test-all \ + test-vm-exec test-vm-exec-bridged test-vm-exec-rootless \ + test-vm-egress test-vm-egress-bridged test-vm-egress-rootless \ bench bench-throughput bench-operations bench-protocol bench-exec bench-quick bench-logs bench-clean \ lint clippy fmt fmt-check \ rootfs rebuild \ container-test container-test-unit container-test-noroot container-test-root container-test-fuse \ - container-test-vm container-test-vm-rootless container-test-vm-bridged container-test-vm-exec container-test-vm-egress container-test-fcvm \ + container-test-vm container-test-vm-rootless container-test-vm-bridged container-test-fcvm \ + container-test-vm-exec container-test-vm-exec-bridged container-test-vm-exec-rootless \ + container-test-vm-egress container-test-vm-egress-bridged container-test-vm-egress-rootless \ container-test-pjdfstest container-test-all container-test-allow-other container-build-allow-other \ container-bench container-bench-throughput container-bench-operations container-bench-protocol container-bench-exec \ container-shell container-clean \ @@ -62,9 +70,11 @@ help: @echo " make test-root - Tests requiring root: integration_root (sudo)" @echo " make test-unit - Unit tests only (no root)" @echo " make test-fuse - fuse-pipe: integration + permission + stress" - @echo " make test-vm - VM tests: rootless + bridged" - @echo " make test-vm-rootless - VM test with slirp4netns (no root)" - @echo " make test-vm-bridged - VM test with bridged networking (sudo)" + @echo " make test-vm - VM tests: rootless + bridged sanity" + @echo " make test-vm-rootless - VM sanity test with slirp4netns (no sudo)" + @echo " make test-vm-bridged - VM sanity test with bridged networking (sudo)" + @echo " make test-vm-exec - VM exec tests: rootless + bridged" + @echo " make 
test-vm-egress - VM egress tests: rootless + bridged" @echo " make test-all - Everything: test + test-vm" @echo "" @echo "Benchmarks:" @@ -89,9 +99,11 @@ help: @echo " make container-test-root - Tests as root" @echo " make container-test-unit - Unit tests only (non-root)" @echo " make container-test-fuse - All fuse-pipe tests explicitly" - @echo " make container-test-vm - VM tests (rootless + bridged)" - @echo " make container-test-vm-rootless - VM test with slirp4netns" - @echo " make container-test-vm-bridged - VM test with bridged networking" + @echo " make container-test-vm - VM sanity tests (rootless + bridged)" + @echo " make container-test-vm-rootless - VM sanity with slirp4netns" + @echo " make container-test-vm-bridged - VM sanity with bridged networking" + @echo " make container-test-vm-exec - VM exec tests (rootless + bridged)" + @echo " make container-test-vm-egress - VM egress tests (rootless + bridged)" @echo " make container-test-pjdfstest - POSIX compliance (8789 tests)" @echo " make container-test-all - Everything: test + vm + pjdfstest" @echo " make container-test-allow-other - Test AllowOther with fuse.conf" @@ -219,6 +231,24 @@ test-vm-rootless: build setup-kernel test-vm-bridged: build setup-kernel sudo $(TEST_VM_BRIDGED) +# VM exec tests +test-vm-exec-bridged: build setup-kernel + sudo $(TEST_VM_EXEC_BRIDGED) + +test-vm-exec-rootless: build setup-kernel + $(TEST_VM_EXEC_ROOTLESS) + +test-vm-exec: test-vm-exec-rootless test-vm-exec-bridged + +# VM egress tests +test-vm-egress-bridged: build setup-kernel + sudo $(TEST_VM_EGRESS_BRIDGED) + +test-vm-egress-rootless: build setup-kernel + $(TEST_VM_EGRESS_ROOTLESS) + +test-vm-egress: test-vm-egress-rootless test-vm-egress-bridged + # All VM tests: rootless first, then bridged test-vm: test-vm-rootless test-vm-bridged @@ -309,14 +339,25 @@ rebuild: rootfs # Marker file for container build state CONTAINER_MARKER := .container-built +# CI mode: use host directories instead of named volumes (for 
artifact sharing) +# Set CI=1 to enable artifact-compatible mode +CI ?= 0 +ifeq ($(CI),1) +VOLUME_TARGET := -v ./target:/workspace/fcvm/target +VOLUME_CARGO := -v ./cargo-home:/home/testuser/.cargo +else +VOLUME_TARGET := -v fcvm-cargo-target:/workspace/fcvm/target +VOLUME_CARGO := -v fcvm-cargo-home:/home/testuser/.cargo +endif + # Container run with source mounts (code always fresh, can't run stale) # Cargo cache goes to testuser's home so non-root builds work CONTAINER_RUN_BASE := sudo podman run --rm --privileged \ -v .:/workspace/fcvm \ -v $(FUSE_BACKEND_RS):/workspace/fuse-backend-rs \ -v $(FUSER):/workspace/fuser \ - -v fcvm-cargo-target:/workspace/fcvm/target \ - -v fcvm-cargo-home:/home/testuser/.cargo \ + $(VOLUME_TARGET) \ + $(VOLUME_CARGO) \ -e CARGO_HOME=/home/testuser/.cargo # Container run options for fuse-pipe tests @@ -340,22 +381,32 @@ CONTAINER_RUN_FCVM := $(CONTAINER_RUN_BASE) \ -v /var/run/netns:/var/run/netns:rshared \ --network host -# Truly rootless container run - matches unprivileged host user exactly -# Runs podman WITHOUT sudo (rootless podman) - this is the true unprivileged test -# Uses separate storage (--root) to avoid conflicts with root-owned storage -# --network host so slirp4netns can bind to loopback addresses (127.x.y.z) -# --security-opt seccomp=unconfined allows unshare syscall (no extra capabilities granted) -# No --privileged, no CAP_SYS_ADMIN - matches real unprivileged user +# Container run for rootless networking tests +# Uses rootless podman (no sudo!) with --privileged for user namespace capabilities. +# --privileged with rootless podman grants capabilities within the user namespace, +# not actual host root. We're root inside the container but unprivileged on host. +# --group-add keep-groups preserves host user's groups (kvm) for /dev/kvm access. +# --device /dev/userfaultfd needed for snapshot/clone UFFD memory sharing. +# The container's user namespace is the isolation boundary. 
+ifeq ($(CI),1) +VOLUME_TARGET_ROOTLESS := -v ./target:/workspace/fcvm/target +VOLUME_CARGO_ROOTLESS := -v ./cargo-home:/home/testuser/.cargo +else +VOLUME_TARGET_ROOTLESS := -v fcvm-cargo-target-rootless:/workspace/fcvm/target +VOLUME_CARGO_ROOTLESS := -v fcvm-cargo-home-rootless:/home/testuser/.cargo +endif CONTAINER_RUN_ROOTLESS := podman --root=/tmp/podman-rootless run --rm \ - --security-opt seccomp=unconfined \ + --privileged \ + --group-add keep-groups \ -v .:/workspace/fcvm \ -v $(FUSE_BACKEND_RS):/workspace/fuse-backend-rs \ -v $(FUSER):/workspace/fuser \ - -v fcvm-cargo-target-rootless:/workspace/fcvm/target \ - -v fcvm-cargo-home-rootless:/home/testuser/.cargo \ + $(VOLUME_TARGET_ROOTLESS) \ + $(VOLUME_CARGO_ROOTLESS) \ -e CARGO_HOME=/home/testuser/.cargo \ --device /dev/kvm \ --device /dev/net/tun \ + --device /dev/userfaultfd \ -v /mnt/fcvm-btrfs:/mnt/fcvm-btrfs \ --network host @@ -368,6 +419,13 @@ $(CONTAINER_MARKER): Containerfile container-build: $(CONTAINER_MARKER) +# Build inside container only (no tests) - useful for CI artifact caching +# Creates target/ with compiled binaries that can be uploaded/downloaded +container-build-only: container-build + @echo "==> Building inside container (CI mode)..." 
+ @mkdir -p target cargo-home + $(CONTAINER_RUN_FUSE) $(CONTAINER_IMAGE) cargo build --release --all-targets -p fuse-pipe + # Export container image for rootless podman (needed for container-test-vm-rootless) # Rootless podman has separate image storage, so we export from root and import CONTAINER_ROOTLESS_MARKER := .container-rootless-imported @@ -420,9 +478,9 @@ container-test-allow-other: container-build-allow-other # All fuse-pipe tests: noroot first, then root container-test: container-test-noroot container-test-root -# VM tests - rootless (truly unprivileged - no --privileged, runs as testuser) -# Uses CONTAINER_RUN_ROOTLESS which drops privileges to match a normal host user -# Depends on container-build-rootless to export image to rootless podman storage +# VM tests - rootless (tests fcvm's rootless networking mode inside container) +# Uses CONTAINER_RUN_ROOTLESS with rootless podman --privileged +# Tests that fcvm can set up slirp4netns + user namespace networking container-test-vm-rootless: container-build-rootless setup-kernel $(CONTAINER_RUN_ROOTLESS) $(CONTAINER_IMAGE) $(TEST_VM_ROOTLESS) @@ -430,16 +488,30 @@ container-test-vm-rootless: container-build-rootless setup-kernel container-test-vm-bridged: container-build setup-kernel $(CONTAINER_RUN_FCVM) $(CONTAINER_IMAGE) $(TEST_VM_BRIDGED) -# VM exec tests - tests fcvm exec functionality -container-test-vm-exec: container-build setup-kernel - $(CONTAINER_RUN_FCVM) $(CONTAINER_IMAGE) $(TEST_VM_EXEC) +# VM exec tests - bridged (needs root) +container-test-vm-exec-bridged: container-build setup-kernel + $(CONTAINER_RUN_FCVM) $(CONTAINER_IMAGE) $(TEST_VM_EXEC_BRIDGED) -# VM egress tests - tests network egress from VMs -container-test-vm-egress: container-build setup-kernel - $(CONTAINER_RUN_FCVM) $(CONTAINER_IMAGE) $(TEST_VM_EGRESS) +# VM exec tests - rootless (tests fcvm's rootless networking mode) +container-test-vm-exec-rootless: container-build-rootless setup-kernel + $(CONTAINER_RUN_ROOTLESS) 
$(CONTAINER_IMAGE) $(TEST_VM_EXEC_ROOTLESS) -# All VM tests: rootless first, then bridged -container-test-vm: container-test-vm-rootless container-test-vm-bridged +# VM exec tests - all (bridged first to create rootfs, then rootless) +container-test-vm-exec: container-test-vm-exec-bridged container-test-vm-exec-rootless + +# VM egress tests - bridged (needs root) +container-test-vm-egress-bridged: container-build setup-kernel + $(CONTAINER_RUN_FCVM) $(CONTAINER_IMAGE) $(TEST_VM_EGRESS_BRIDGED) + +# VM egress tests - rootless (tests fcvm's rootless networking mode) +container-test-vm-egress-rootless: container-build-rootless setup-kernel + $(CONTAINER_RUN_ROOTLESS) $(CONTAINER_IMAGE) $(TEST_VM_EGRESS_ROOTLESS) + +# VM egress tests - all (bridged first to create rootfs, then rootless) +container-test-vm-egress: container-test-vm-egress-bridged container-test-vm-egress-rootless + +# All VM tests: bridged first (creates rootfs), then rootless +container-test-vm: container-test-vm-bridged container-test-vm-rootless # Legacy alias (runs both VM tests) container-test-fcvm: container-test-vm diff --git a/README.md b/README.md index f4788f47..15595bff 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ A Rust implementation that launches Firecracker microVMs to run Podman container > - Instant VM cloning via UFFD memory server + btrfs reflinks (~3ms) > - Multiple VMs share memory via kernel page cache (50 VMs = ~512MB, not 25GB!) > - Dual networking: bridged (iptables) or rootless (slirp4netns) +> - Port forwarding for both regular VMs and clones > - FUSE-based host directory mapping via fuse-pipe > - Container exit code forwarding @@ -138,7 +139,13 @@ sudo fcvm snapshot ls sudo fcvm snapshot run --pid --name clone1 sudo fcvm snapshot run --pid --name clone2 -# 7. Clone and execute command (auto-cleans up after) +# 7. 
Clone with port forwarding (each clone can have unique ports) +sudo fcvm snapshot run --pid --name web1 --publish 8081:80 +sudo fcvm snapshot run --pid --name web2 --publish 8082:80 +curl localhost:8081 # Reaches clone web1 +curl localhost:8082 # Reaches clone web2 + +# 8. Clone and execute command (auto-cleans up after) sudo fcvm snapshot run --pid --exec "curl localhost" # Clone starts → execs command in container → returns result → cleans up ``` @@ -537,7 +544,8 @@ Run `make help` for the full list. Key targets: | `test_fuse_posix.rs` | POSIX FUSE compliance tests | | `test_fuse_in_vm.rs` | FUSE-in-VM integration | | `test_localhost_image.rs` | Local image tests | -| `test_snapshot_clone.rs` | Snapshot/clone workflow | +| `test_snapshot_clone.rs` | Snapshot/clone workflow, clone port forwarding | +| `test_port_forward.rs` | Port forwarding for regular VMs | #### fuse-pipe Tests (`fuse-pipe/tests/`) | File | Description | diff --git a/rootfs-plan.toml b/rootfs-plan.toml new file mode 100644 index 00000000..be8083d4 --- /dev/null +++ b/rootfs-plan.toml @@ -0,0 +1,116 @@ +# Rootfs Modification Plan +# +# This file describes all modifications applied to the base Ubuntu cloud image. +# The SHA256 of the generated setup script determines the image name: layer2-{sha}.raw +# If this file changes, Layer 2 is rebuilt automatically. +# +# fc-agent is NOT in Layer 2 at all (neither binary nor service). +# Both are injected per-VM at boot time via initrd. +# This allows updating fc-agent without rebuilding Layer 2. 
+ +[base] +# Ubuntu 24.04 LTS (Noble Numbat) cloud images +# Using "current" for latest updates - URL changes trigger plan SHA change +version = "24.04" + +[base.arm64] +url = "https://cloud-images.ubuntu.com/noble/current/noble-server-cloudimg-arm64.img" + +[base.amd64] +url = "https://cloud-images.ubuntu.com/noble/current/noble-server-cloudimg-amd64.img" + +[kernel] +# Kata Containers kernel with FUSE support built-in +# Firecracker's official kernel lacks FUSE, but Kata's has it +# URL hash is included in Layer 2 SHA calculation + +[kernel.arm64] +# Kata 3.24.0 release - kernel 6.12.47 with CONFIG_FUSE_FS=y +url = "https://github.com/kata-containers/kata-containers/releases/download/3.24.0/kata-static-3.24.0-arm64.tar.zst" +# Path within the tarball to extract +path = "opt/kata/share/kata-containers/vmlinux-6.12.47-173" + +[kernel.amd64] +url = "https://github.com/kata-containers/kata-containers/releases/download/3.24.0/kata-static-3.24.0-amd64.tar.zst" +path = "opt/kata/share/kata-containers/vmlinux-6.12.47-173" + +[packages] +# Container runtime +runtime = ["podman", "crun", "fuse-overlayfs", "skopeo"] + +# FUSE support for overlay filesystem +fuse = ["fuse3"] + +# System services +system = ["haveged", "chrony"] + +[services] +# Services to enable +# NOTE: fc-agent is NOT enabled here - it's injected per-VM via initrd +# NOTE: systemd-resolved is NOT enabled - DNS comes from kernel cmdline via fc-agent +enable = [ + "haveged", + "chrony", + "systemd-networkd", +] + +# Services to disable +disable = [ + "multipathd", + "snapd", + "cloud-init", + "cloud-config", + "cloud-final", +] + +[files] +# Files to create/modify in the rootfs + +[files."/etc/resolv.conf"] +content = """ +# Placeholder - fc-agent configures DNS at boot from kernel cmdline +nameserver 127.0.0.53 +""" + +[files."/etc/chrony/chrony.conf"] +content = """ +# NTP servers from pool.ntp.org +pool pool.ntp.org iburst + +# Allow clock to be stepped (not slewed) for large time differences +makestep 
1.0 3 + +# Directory for drift and other runtime files +driftfile /var/lib/chrony/drift +""" + +[files."/etc/systemd/network/10-eth0.network"] +content = """ +[Match] +Name=eth0 + +[Network] +# Keep kernel IP configuration from ip= boot parameter +KeepConfiguration=yes +""" + +[files."/etc/systemd/network/10-eth0.network.d/mmds.conf"] +content = """ +[Route] +Destination=169.254.169.254/32 +Scope=link +""" + +# NOTE: fc-agent.service is NOT defined here - it's injected per-VM via initrd + +[fstab] +# Lines to remove from /etc/fstab (patterns to filter out) +remove_patterns = ["LABEL=BOOT", "LABEL=UEFI"] + +[cleanup] +# Patterns to remove for smaller image +remove_dirs = [ + "/usr/share/doc/*", + "/usr/share/man/*", + "/var/cache/apt/archives/*", +] diff --git a/src/commands/podman.rs b/src/commands/podman.rs index 723be8c6..418668f5 100644 --- a/src/commands/podman.rs +++ b/src/commands/podman.rs @@ -274,6 +274,22 @@ async fn cmd_podman_run(args: RunArgs) -> Result<()> { state_manager.init().await?; // Setup networking based on mode + // Bridged mode requires root for iptables and network namespace setup + if matches!(args.network, NetworkMode::Bridged) && !nix::unistd::geteuid().is_root() { + bail!( + "Bridged networking requires root. Either:\n \ + - Run with sudo: sudo fcvm podman run ...\n \ + - Use rootless mode: fcvm podman run --network rootless ..." + ); + } + // Rootless with sudo is pointless - bridged would be faster + if matches!(args.network, NetworkMode::Rootless) && nix::unistd::geteuid().is_root() { + warn!( + "Running rootless mode as root is unnecessary. \ + Consider using --network bridged for better performance." 
+ ); + } + let tap_device = format!("tap-{}", truncate_id(&vm_id, 8)); let mut network: Box = match args.network { NetworkMode::Bridged => Box::new(BridgedNetwork::new( diff --git a/src/commands/snapshot.rs b/src/commands/snapshot.rs index 61275444..d3dbc47b 100644 --- a/src/commands/snapshot.rs +++ b/src/commands/snapshot.rs @@ -18,6 +18,80 @@ use crate::storage::{DiskManager, SnapshotManager}; use crate::uffd::UffdServer; use crate::volume::{spawn_volume_servers, VolumeConfig}; +const USERFAULTFD_DEVICE: &str = "/dev/userfaultfd"; + +/// Check if /dev/userfaultfd is accessible for clone operations. +/// Clones use UFFD (userfaultfd) to share memory pages on-demand from the serve process. +/// Returns Ok(()) if accessible, or an error with detailed fix instructions. +fn check_userfaultfd_access() -> Result<()> { + use std::fs::OpenOptions; + use std::path::Path; + + let path = Path::new(USERFAULTFD_DEVICE); + + // Check if device exists + if !path.exists() { + bail!( + r#" +╔══════════════════════════════════════════════════════════════════════════════╗ +║ USERFAULTFD DEVICE NOT FOUND ║ +╠══════════════════════════════════════════════════════════════════════════════╣ +║ {USERFAULTFD_DEVICE} does not exist on this system. ║ +║ ║ +║ This device is required for snapshot cloning (UFFD memory sharing). ║ +║ It's available on Linux 5.11+ kernels. ║ +║ ║ +║ Check your kernel version: ║ +║ uname -r ║ +╚══════════════════════════════════════════════════════════════════════════════╝ +"# + ); + } + + // Check if we have read/write access + match OpenOptions::new().read(true).write(true).open(path) { + Ok(_) => Ok(()), + Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => { + bail!( + r#" +╔══════════════════════════════════════════════════════════════════════════════╗ +║ USERFAULTFD PERMISSION DENIED ║ +╠══════════════════════════════════════════════════════════════════════════════╣ +║ Cannot access /dev/userfaultfd - permission denied. 
║ +║ ║ +║ Snapshot clones require access to userfaultfd for memory sharing. ║ +║ ║ +║ FIX (choose one): ║ +║ ║ +║ Option 1 - Device permissions (recommended): ║ +║ # Persistent udev rule (survives reboots): ║ +║ echo 'KERNEL=="userfaultfd", MODE="0666"' | \ ║ +║ sudo tee /etc/udev/rules.d/99-userfaultfd.rules ║ +║ sudo udevadm control --reload-rules ║ +║ sudo chmod 666 /dev/userfaultfd ║ +║ ║ +║ Option 2 - Sysctl (system-wide, affects syscall fallback): ║ +║ sudo sysctl vm.unprivileged_userfaultfd=1 ║ +║ # To persist: add 'vm.unprivileged_userfaultfd=1' to /etc/sysctl.conf ║ +║ ║ +║ Option 3 - One-time fix (must redo after reboot): ║ +║ sudo chmod 666 /dev/userfaultfd ║ +║ ║ +║ After fixing, retry your clone command. ║ +╚══════════════════════════════════════════════════════════════════════════════╝ +"# + ); + } + Err(e) => { + bail!( + "Cannot access {}: {} - ensure the device exists and is readable", + USERFAULTFD_DEVICE, + e + ); + } + } +} + /// Main dispatcher for snapshot commands pub async fn cmd_snapshot(args: SnapshotArgs) -> Result<()> { match args.cmd { @@ -400,7 +474,11 @@ async fn cmd_snapshot_serve(args: SnapshotServeArgs) -> Result<()> { /// Run clone from snapshot async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> { - // First verify the serve process is actually alive before attempting any work + // Check userfaultfd access FIRST - this is a system requirement + // Give a clear error message if permissions aren't configured + check_userfaultfd_access().context("userfaultfd access check failed")?; + + // Now verify the serve process is actually alive before attempting any work // This prevents wasted setup if the serve process died between state file creation and now if !crate::utils::is_process_alive(args.pid) { anyhow::bail!( @@ -543,6 +621,22 @@ async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> { // Extract guest_ip from snapshot metadata for network config reuse let saved_network = 
&snapshot_config.metadata.network_config; + // Bridged mode requires root for iptables and network namespace setup + if matches!(args.network, NetworkMode::Bridged) && !nix::unistd::geteuid().is_root() { + bail!( + "Bridged networking requires root. Either:\n \ + - Run with sudo: sudo fcvm snapshot run ...\n \ + - Use rootless mode: fcvm snapshot run --network rootless ..." + ); + } + // Rootless with sudo is pointless - bridged would be faster + if matches!(args.network, NetworkMode::Rootless) && nix::unistd::geteuid().is_root() { + warn!( + "Running rootless mode as root is unnecessary. \ + Consider using --network bridged for better performance." + ); + } + // Setup networking based on mode - reuse guest_ip from snapshot if available let mut network: Box = match args.network { NetworkMode::Bridged => { @@ -991,8 +1085,19 @@ async fn run_clone_setup( "parallel disk + network setup complete" ); - // Step 3: Set holder_pid so VmManager uses nsenter - vm_manager.set_holder_pid(holder_pid); + // Step 3: Set namespace paths for pre_exec setns (NOT nsenter wrapper) + // For clones, we need to enter namespaces in pre_exec because: + // - pre_exec runs BEFORE nsenter would enter the namespace + // - We need CAP_SYS_ADMIN (from user namespace) for mount operations + // - Entering user namespace first gives us CAP_SYS_ADMIN for unshare(CLONE_NEWNS) + vm_manager.set_user_namespace_path(std::path::PathBuf::from(format!( + "/proc/{}/ns/user", + holder_pid + ))); + vm_manager.set_net_namespace_path(std::path::PathBuf::from(format!( + "/proc/{}/ns/net", + holder_pid + ))); // Store holder_pid in state for health checks vm_state.holder_pid = Some(holder_pid); diff --git a/src/firecracker/vm.rs b/src/firecracker/vm.rs index f198233c..7da888a7 100644 --- a/src/firecracker/vm.rs +++ b/src/firecracker/vm.rs @@ -36,6 +36,8 @@ pub struct VmManager { log_path: Option, namespace_id: Option, holder_pid: Option, // namespace holder PID for rootless mode (use nsenter to run FC) + 
user_namespace_path: Option, // User namespace path for rootless clones (enter via setns in pre_exec) + net_namespace_path: Option, // Net namespace path for rootless clones (enter via setns in pre_exec) vsock_redirect: Option<(PathBuf, PathBuf)>, // (baseline_dir, clone_dir) for mount namespace isolation process: Option, client: Option, @@ -50,6 +52,8 @@ impl VmManager { log_path, namespace_id: None, holder_pid: None, + user_namespace_path: None, + net_namespace_path: None, vsock_redirect: None, process: None, client: None, @@ -80,6 +84,27 @@ impl VmManager { self.holder_pid = Some(pid); } + /// Set user namespace path for rootless clones + /// + /// When set along with vsock_redirect, pre_exec will enter this user namespace + /// first (via setns) before doing mount operations. This gives CAP_SYS_ADMIN + /// inside the user namespace, allowing unshare(CLONE_NEWNS) to succeed. + /// + /// Use this instead of set_holder_pid when mount namespace isolation is needed, + /// since nsenter wrapper runs AFTER pre_exec. + pub fn set_user_namespace_path(&mut self, path: PathBuf) { + self.user_namespace_path = Some(path); + } + + /// Set network namespace path for rootless clones + /// + /// When set, pre_exec will enter this network namespace (via setns) after + /// completing mount operations. Use with set_user_namespace_path for + /// rootless clones that need mount namespace isolation. + pub fn set_net_namespace_path(&mut self, path: PathBuf) { + self.net_namespace_path = Some(path); + } + /// Set vsock redirect for mount namespace isolation /// /// When set, Firecracker will be launched in a new mount namespace with @@ -109,12 +134,25 @@ impl VmManager { let _ = std::fs::remove_file(&self.socket_path); // Build command based on mode: - // 1. holder_pid set: use nsenter to enter existing namespace (rootless) - // 2. direct Firecracker (privileged/bridged mode) - let mut cmd = if let Some(holder_pid) = self.holder_pid { + // 1. 
user_namespace_path set: direct Firecracker (namespaces entered via pre_exec setns) + // 2. holder_pid set (no user_namespace_path): use nsenter to enter existing namespace (rootless baseline) + // 3. neither: direct Firecracker (privileged/bridged mode) + // + // For rootless clones with vsock_redirect, we MUST use pre_exec setns instead of nsenter, + // because pre_exec runs BEFORE nsenter would enter the namespace, and we need CAP_SYS_ADMIN + // from the user namespace to do mount operations. + let mut cmd = if self.user_namespace_path.is_some() { + // Use direct Firecracker - namespaces will be entered via setns in pre_exec + // This is required for rootless clones that need mount namespace isolation + info!(target: "vm", vm_id = %self.vm_id, "using pre_exec setns for rootless clone"); + let mut c = Command::new(firecracker_bin); + c.arg("--api-sock").arg(&self.socket_path); + c + } else if let Some(holder_pid) = self.holder_pid { // Use nsenter to enter user+network namespace with preserved credentials // --preserve-credentials keeps UID, GID, and supplementary groups (including kvm) // This allows KVM access while being in the isolated network namespace + // NOTE: This path is for baseline VMs that don't need mount namespace isolation info!(target: "vm", vm_id = %self.vm_id, holder_pid = holder_pid, "using nsenter for rootless networking"); let mut c = Command::new("nsenter"); c.args([ @@ -155,6 +193,8 @@ impl VmManager { // We need to handle these in a single pre_exec because it can only be called once let ns_id_clone = self.namespace_id.clone(); let vsock_redirect_clone = self.vsock_redirect.clone(); + let user_ns_path_clone = self.user_namespace_path.clone(); + let net_ns_path_clone = self.net_namespace_path.clone(); // Ensure baseline directory exists for bind mount target // The baseline VM may have been cleaned up, but we need the directory for mount @@ -165,7 +205,11 @@ impl VmManager { } } - if ns_id_clone.is_some() || vsock_redirect_clone.is_some() { 
+ if ns_id_clone.is_some() + || vsock_redirect_clone.is_some() + || user_ns_path_clone.is_some() + || net_ns_path_clone.is_some() + { use std::ffi::CString; // Prepare CStrings outside the closure (async-signal-safe requirement) @@ -179,6 +223,28 @@ impl VmManager { None }; + // User namespace path (for rootless clones that need CAP_SYS_ADMIN for mount ops) + let user_ns_cstr = if let Some(ref path) = user_ns_path_clone { + info!(target: "vm", vm_id = %self.vm_id, path = %path.display(), "will enter user namespace in pre_exec"); + Some( + CString::new(path.to_string_lossy().as_bytes()) + .context("user namespace path contains invalid characters")?, + ) + } else { + None + }; + + // Network namespace path (for rootless clones via /proc/PID/ns/net) + let net_ns_cstr = if let Some(ref path) = net_ns_path_clone { + info!(target: "vm", vm_id = %self.vm_id, path = %path.display(), "will enter net namespace in pre_exec"); + Some( + CString::new(path.to_string_lossy().as_bytes()) + .context("net namespace path contains invalid characters")?, + ) + } else { + None + }; + let vsock_paths = if let Some((ref baseline_dir, ref clone_dir)) = vsock_redirect_clone { info!(target: "vm", vm_id = %self.vm_id, @@ -210,8 +276,31 @@ impl VmManager { use nix::sys::stat::Mode; use std::os::unix::io::{FromRawFd, OwnedFd}; + // Step 0: Enter user namespace if specified (for rootless clones) + // This MUST be done first to get CAP_SYS_ADMIN for mount operations. + // The user namespace was created by the holder process with --map-root-user, + // so entering it gives us UID 0 with full capabilities inside the namespace. 
+ if let Some(ref user_ns_path) = user_ns_cstr { + let ns_fd_raw = open( + user_ns_path.as_c_str(), + OFlag::O_RDONLY, + Mode::empty(), + ) + .map_err(|e| { + std::io::Error::other(format!("failed to open user namespace: {}", e)) + })?; + + let ns_fd = OwnedFd::from_raw_fd(ns_fd_raw); + + setns(&ns_fd, CloneFlags::CLONE_NEWUSER).map_err(|e| { + std::io::Error::other(format!("failed to enter user namespace: {}", e)) + })?; + // Now we have CAP_SYS_ADMIN inside the user namespace! + } + // Step 1: Set up mount namespace for vsock redirect if needed // This must be done BEFORE entering network namespace + // Note: This now succeeds because we entered user namespace first (if needed) if let Some((ref baseline_cstr, ref clone_cstr)) = vsock_paths { // Create a new mount namespace so our bind mount is isolated unshare(CloneFlags::CLONE_NEWNS).map_err(|e| { @@ -252,21 +341,24 @@ impl VmManager { } // Step 2: Enter network namespace if specified - if let Some(ref ns_path_cstr) = ns_path_cstr { - let ns_fd_raw = open( - ns_path_cstr.as_c_str(), - OFlag::O_RDONLY, - Mode::empty(), - ) - .map_err(|e| { - std::io::Error::other(format!("failed to open namespace: {}", e)) - })?; + // This can come from either: + // - net_ns_cstr: /proc/PID/ns/net (rootless clones via pre_exec) - preferred + // - ns_path_cstr: /var/run/netns/NAME (bridged mode) + let net_ns_to_enter = net_ns_cstr.as_ref().or(ns_path_cstr.as_ref()); + if let Some(ns_path) = net_ns_to_enter { + let ns_fd_raw = open(ns_path.as_c_str(), OFlag::O_RDONLY, Mode::empty()) + .map_err(|e| { + std::io::Error::other(format!( + "failed to open net namespace: {}", + e + )) + })?; // SAFETY: from_raw_fd takes ownership of the file descriptor. 
let ns_fd = OwnedFd::from_raw_fd(ns_fd_raw); setns(&ns_fd, CloneFlags::CLONE_NEWNET).map_err(|e| { - std::io::Error::other(format!("failed to enter namespace: {}", e)) + std::io::Error::other(format!("failed to enter net namespace: {}", e)) })?; // fd is automatically closed when OwnedFd is dropped } diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 29f18eac..600e7e9e 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -151,17 +151,17 @@ impl SlirpNetwork { /// Build the setup script to run inside the namespace via nsenter /// - /// This script creates both TAP devices and sets up iptables rules for egress. - /// Health checks use nsenter to curl the guest directly, no port forwarding needed. + /// This script creates both TAP devices and configures networking. /// Run via: nsenter -t HOLDER_PID -U -n -- bash -c '' pub fn build_setup_script(&self) -> String { format!( r#" set -e -# Create slirp0 TAP for slirp4netns (slirp4netns will attach to this) +# Create slirp0 TAP for slirp4netns connectivity +# Use 10.0.2.100 as the address for DNAT to work with port forwarding ip tuntap add {slirp_dev} mode tap -ip addr add 10.0.2.1/24 dev {slirp_dev} +ip addr add 10.0.2.100/24 dev {slirp_dev} ip link set {slirp_dev} up # Create TAP device for Firecracker (must exist before Firecracker starts) @@ -183,12 +183,19 @@ iptables -A FORWARD -i {slirp_dev} -o {fc_tap} -j ACCEPT 2>/dev/null || true iptables -A FORWARD -i {fc_tap} -o {slirp_dev} -j ACCEPT 2>/dev/null || true # Set up iptables MASQUERADE for traffic from guest subnet (egress) +# This NATs guest traffic (192.168.x.x) to slirp0's address (10.0.2.100) iptables -t nat -A POSTROUTING -s {guest_subnet} -o {slirp_dev} -j MASQUERADE 2>/dev/null || true + +# Set up DNAT for inbound connections from slirp4netns +# When slirp4netns forwards traffic to 10.0.2.100, redirect it to the actual guest IP +# This enables port forwarding: host -> slirp4netns -> 10.0.2.100 -> DNAT -> guest (192.168.x.2) +iptables 
-t nat -A PREROUTING -d 10.0.2.100 -j DNAT --to-destination {guest_ip} 2>/dev/null || true "#, slirp_dev = self.slirp_device, fc_tap = self.tap_device, ns_ip = self.namespace_ip, guest_subnet = self.guest_subnet, + guest_ip = self.guest_ip, ) } diff --git a/src/setup/kernel.rs b/src/setup/kernel.rs index ed0373b8..f698b7cd 100644 --- a/src/setup/kernel.rs +++ b/src/setup/kernel.rs @@ -1,121 +1,135 @@ use anyhow::{bail, Context, Result}; -use std::path::{Path, PathBuf}; -use std::process::Command; +use sha2::{Digest, Sha256}; +use std::path::PathBuf; +use tokio::process::Command; use tracing::info; use crate::paths; +use crate::setup::rootfs::{load_plan, KernelArchConfig}; + +/// Compute SHA256 of bytes, return hex string (first 12 chars) +fn compute_sha256_short(data: &[u8]) -> String { + let mut hasher = Sha256::new(); + hasher.update(data); + let result = hasher.finalize(); + hex::encode(&result[..6]) // 12 hex chars +} + +/// Get the kernel URL hash for the current architecture +/// This is used to include in Layer 2 SHA calculation +pub fn get_kernel_url_hash() -> Result { + let (plan, _, _) = load_plan()?; + let kernel_config = plan.kernel.current_arch()?; + Ok(compute_sha256_short(kernel_config.url.as_bytes())) +} -/// Ensure kernel exists, extracting from host if needed +/// Ensure kernel exists, downloading from Kata release if needed pub async fn ensure_kernel() -> Result { + let (plan, _, _) = load_plan()?; + let kernel_config = plan.kernel.current_arch()?; + + download_kernel(kernel_config).await +} + +/// Download kernel from Kata release tarball +async fn download_kernel(config: &KernelArchConfig) -> Result { let kernel_dir = paths::kernel_dir(); - let kernel_path = kernel_dir.join("vmlinux.bin"); + + // Cache by URL hash - changing URL triggers re-download + let url_hash = compute_sha256_short(config.url.as_bytes()); + let kernel_path = kernel_dir.join(format!("vmlinux-{}.bin", url_hash)); if kernel_path.exists() { - info!(path = 
%kernel_path.display(), "kernel already exists"); + info!(path = %kernel_path.display(), url_hash = %url_hash, "kernel already exists"); return Ok(kernel_path); } - println!("⚙️ Setting up kernel (first run)..."); + println!("⚙️ Downloading kernel (first run)..."); + info!(url = %config.url, path_in_archive = %config.path, "downloading kernel from Kata release"); // Create directory tokio::fs::create_dir_all(&kernel_dir) .await .context("creating kernel directory")?; - // Find host kernel - let host_kernel = find_host_kernel().context("finding host kernel")?; + // Download and extract in one pipeline: + // curl -> zstd -d -> tar --extract + let cache_dir = paths::base_dir().join("cache"); + tokio::fs::create_dir_all(&cache_dir).await?; - info!(host_kernel = %host_kernel.display(), "found host kernel"); - println!(" → Extracting from {}...", host_kernel.display()); + let tarball_path = cache_dir.join(format!("kata-kernel-{}.tar.zst", url_hash)); - // Extract kernel - extract_kernel(&host_kernel, &kernel_path) - .await - .context("extracting kernel")?; - - println!(" ✓ Kernel ready"); - - Ok(kernel_path) -} - -/// Find host kernel in /boot -fn find_host_kernel() -> Result { - // Try current running kernel first - let uname_output = Command::new("uname") - .arg("-r") - .output() - .context("running uname -r")?; + // Download if not cached + if !tarball_path.exists() { + println!(" → Downloading Kata release tarball..."); - let kernel_version = String::from_utf8_lossy(&uname_output.stdout) - .trim() - .to_string(); + let output = Command::new("curl") + .args(["-fSL", &config.url, "-o"]) + .arg(&tarball_path) + .output() + .await + .context("running curl")?; - let kernel_path = PathBuf::from(format!("/boot/vmlinuz-{}", kernel_version)); + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + bail!("Failed to download kernel: {}", stderr); + } - if kernel_path.exists() { - return Ok(kernel_path); + info!(path = %tarball_path.display(), 
"downloaded Kata tarball"); + } else { + info!(path = %tarball_path.display(), "using cached Kata tarball"); } - // Fallback: find any vmlinuz in /boot - let boot_dir = std::fs::read_dir("/boot").context("reading /boot directory")?; + // Extract just the kernel file using tar with zstd + println!(" → Extracting kernel from tarball..."); + + // Use tar to extract, piping through zstd + // tar expects path with ./ prefix based on how Kata packages it + let extract_path = format!("./{}", config.path); + + let output = Command::new("tar") + .args([ + "--use-compress-program=zstd", + "-xf", + ]) + .arg(&tarball_path) + .arg("-C") + .arg(&cache_dir) + .arg(&extract_path) + .output() + .await + .context("extracting kernel from tarball")?; - for entry in boot_dir { - let entry = entry?; - let file_name = entry.file_name(); - let name = file_name.to_string_lossy(); + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + bail!("Failed to extract kernel: {}", stderr); + } - if name.starts_with("vmlinuz") && !name.contains("rescue") { - return Ok(entry.path()); - } + // Move extracted kernel to final location + let extracted_path = cache_dir.join(&config.path); + if !extracted_path.exists() { + bail!( + "Kernel not found after extraction at {}", + extracted_path.display() + ); } - bail!("no kernel found in /boot") -} + tokio::fs::copy(&extracted_path, &kernel_path) + .await + .context("copying kernel to final location")?; -/// Extract uncompressed kernel from potentially compressed vmlinuz -async fn extract_kernel(src: &Path, dst: &Path) -> Result<()> { - // Most modern kernels are self-extracting ELF with embedded compressed payload - // We need the uncompressed ELF - - // Try finding extract-vmlinux in common locations - let extract_vmlinux_paths = vec![ - "/usr/src/linux-headers-*/scripts/extract-vmlinux", - "/usr/src/*/scripts/extract-vmlinux", - ]; - - for pattern in &extract_vmlinux_paths { - if let Ok(output) = Command::new("sh") - 
.arg("-c") - .arg(format!("ls {} 2>/dev/null | head -1", pattern)) - .output() - { - if let Ok(script_path) = String::from_utf8(output.stdout) { - let script_path = script_path.trim(); - if !script_path.is_empty() { - info!(script = %script_path, "using extract-vmlinux script"); - let output = Command::new(script_path) - .arg(src) - .output() - .context("running extract-vmlinux")?; - - if output.status.success() && !output.stdout.is_empty() { - tokio::fs::write(dst, &output.stdout) - .await - .context("writing extracted kernel")?; - return Ok(()); - } - } - } - } + // Clean up extracted files (keep tarball for cache) + let opt_dir = cache_dir.join("opt"); + if opt_dir.exists() { + tokio::fs::remove_dir_all(&opt_dir).await.ok(); } - bail!( - "extract-vmlinux script not found. Please install it or download a pre-built kernel from Firecracker releases. - - To install extract-vmlinux: - sudo apt-get install linux-tools-generic + println!(" ✓ Kernel ready"); + info!( + path = %kernel_path.display(), + url_hash = %url_hash, + "kernel downloaded and cached" + ); - Or download a pre-built kernel: - wget https://github.com/firecracker-microvm/firecracker/releases/download/v1.13.1/vmlinux-5.10.217" - ) + Ok(kernel_path) } diff --git a/src/setup/rootfs.rs b/src/setup/rootfs.rs index 2100f36c..789b84d8 100644 --- a/src/setup/rootfs.rs +++ b/src/setup/rootfs.rs @@ -1,79 +1,460 @@ use anyhow::{bail, Context, Result}; +use serde::Deserialize; +use sha2::{Digest, Sha256}; +use std::collections::HashMap; use std::path::{Path, PathBuf}; -use tokio::fs::File; -use tokio::io::AsyncWriteExt; use tokio::process::Command; -use tracing::{info, warn}; +use tracing::{debug, info, warn}; use crate::paths; -/// Find the fc-agent binary +/// Plan file location (relative to workspace root) +const PLAN_FILE: &str = "rootfs-plan.toml"; + +/// Size of the Layer 2 disk image +const LAYER2_SIZE: &str = "10G"; + +// ============================================================================ +// Plan 
File Data Structures +// ============================================================================ + +#[derive(Debug, Deserialize, Clone)] +pub struct Plan { + pub base: BaseConfig, + pub kernel: KernelConfig, + pub packages: PackagesConfig, + pub services: ServicesConfig, + pub files: HashMap, + pub fstab: FstabConfig, + #[serde(default)] + pub cleanup: CleanupConfig, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct BaseConfig { + pub version: String, + pub arm64: ArchConfig, + pub amd64: ArchConfig, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct ArchConfig { + pub url: String, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct KernelConfig { + pub arm64: KernelArchConfig, + pub amd64: KernelArchConfig, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct KernelArchConfig { + /// URL to the kernel archive (e.g., Kata release tarball) + pub url: String, + /// Path within the archive to extract + pub path: String, +} + +impl KernelConfig { + /// Get the kernel config for the current architecture + pub fn current_arch(&self) -> anyhow::Result<&KernelArchConfig> { + match std::env::consts::ARCH { + "x86_64" => Ok(&self.amd64), + "aarch64" => Ok(&self.arm64), + other => anyhow::bail!("unsupported architecture: {}", other), + } + } +} + +#[derive(Debug, Deserialize, Clone)] +pub struct PackagesConfig { + pub runtime: Vec, + pub fuse: Vec, + pub system: Vec, +} + +impl PackagesConfig { + pub fn all_packages(&self) -> Vec<&str> { + self.runtime + .iter() + .chain(&self.fuse) + .chain(&self.system) + .map(|s| s.as_str()) + .collect() + } +} + +#[derive(Debug, Deserialize, Clone)] +pub struct ServicesConfig { + pub enable: Vec, + pub disable: Vec, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct FileConfig { + pub content: String, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct FstabConfig { + pub remove_patterns: Vec, +} + +#[derive(Debug, Deserialize, Default, Clone)] +pub struct CleanupConfig { + #[serde(default)] + pub remove_dirs: 
Vec, +} + +// ============================================================================ +// Script Generation +// ============================================================================ + +/// Generate a setup script from the plan /// -/// Both fcvm and fc-agent are workspace members built together with: -/// cargo build --release +/// Generate the install script that runs BEFORE the setup script. +/// This script installs packages from /mnt/packages and removes conflicting packages. +pub fn generate_install_script() -> String { + r#"#!/bin/bash +set -e +echo 'FCVM: Removing conflicting packages before install...' +# Remove time-daemon provider that conflicts with chrony +apt-get remove -y --purge systemd-timesyncd 2>/dev/null || true +# Remove packages we don't need in microVM (also frees space) +apt-get remove -y --purge cloud-init snapd ubuntu-server 2>/dev/null || true + +echo 'FCVM: Installing packages from initrd...' +dpkg -i /mnt/packages/*.deb || true +apt-get -f install -y || true +echo 'FCVM: Packages installed successfully' +"# + .to_string() +} + +/// Generate the init script that runs in the initrd during Layer 2 setup. +/// This script mounts filesystems, runs install + setup scripts, then powers off. /// -/// Search order: -/// 1. Same directory as current exe (for cargo install) -/// 2. Parent directory (for tests running from target/release/deps/) -/// 3. FC_AGENT_PATH environment variable -fn find_fc_agent_binary() -> Result { - let exe_path = std::env::current_exe().context("getting current executable path")?; - let exe_dir = exe_path.parent().context("getting executable directory")?; +/// The SHA256 of this complete script determines the rootfs name, ensuring +/// any changes to mounts, commands, or embedded scripts invalidate the cache. 
+pub fn generate_init_script(install_script: &str, setup_script: &str) -> String { + format!( + r#"#!/bin/busybox sh +# FCVM Layer 2 setup initrd +# Runs package installation before systemd +# Packages are embedded in the initrd at /packages + +echo "FCVM Layer 2 Setup: Starting..." + +# Install busybox commands +/bin/busybox mkdir -p /bin /sbin /proc /sys /dev /newroot +/bin/busybox --install -s /bin +/bin/busybox --install -s /sbin + +# Mount essential filesystems +mount -t proc proc /proc +mount -t sysfs sys /sys +mount -t devtmpfs dev /dev + +# Populate /dev with device nodes from sysfs +mdev -s + +# Debug: show available block devices +echo "FCVM Layer 2 Setup: Available block devices:" +ls -la /dev/vd* 2>/dev/null || echo "No /dev/vd* devices found" + +echo "FCVM Layer 2 Setup: Mounting rootfs..." +mount -o rw /dev/vda /newroot +if [ $? -ne 0 ]; then + echo "ERROR: Failed to mount rootfs" + sleep 5 + poweroff -f +fi + +# Copy embedded packages from initrd to rootfs +# Packages are in /packages directory inside the initrd (loaded in RAM) +echo "FCVM Layer 2 Setup: Copying packages from initrd to rootfs..." +mkdir -p /newroot/mnt/packages +cp -a /packages/* /newroot/mnt/packages/ +echo "FCVM Layer 2 Setup: Copied $(ls /newroot/mnt/packages/*.deb 2>/dev/null | wc -l) packages" + +# Write the install script to rootfs +cat > /newroot/tmp/install-packages.sh << 'INSTALL_SCRIPT_EOF' +{} +INSTALL_SCRIPT_EOF +chmod 755 /newroot/tmp/install-packages.sh + +# Write the setup script to rootfs +cat > /newroot/tmp/fcvm-setup.sh << 'SETUP_SCRIPT_EOF' +{} +SETUP_SCRIPT_EOF +chmod 755 /newroot/tmp/fcvm-setup.sh + +# Set up chroot environment (proc, sys, dev) +echo "FCVM Layer 2 Setup: Setting up chroot environment..." +mount --bind /proc /newroot/proc +mount --bind /sys /newroot/sys +mount --bind /dev /newroot/dev + +# Install packages using chroot +echo "FCVM Layer 2 Setup: Installing packages..." +chroot /newroot /bin/bash /tmp/install-packages.sh +INSTALL_RESULT=$? 
+echo "FCVM Layer 2 Setup: Package installation returned: $INSTALL_RESULT" + +# Run setup script using chroot +echo "FCVM Layer 2 Setup: Running setup script..." +chroot /newroot /bin/bash /tmp/fcvm-setup.sh +SETUP_RESULT=$? +echo "FCVM Layer 2 Setup: Setup script returned: $SETUP_RESULT" + +# Cleanup chroot mounts (use lazy unmount as fallback) +echo "FCVM Layer 2 Setup: Cleaning up..." +umount /newroot/dev 2>/dev/null || umount -l /newroot/dev 2>/dev/null || true +umount /newroot/sys 2>/dev/null || umount -l /newroot/sys 2>/dev/null || true +umount /newroot/proc 2>/dev/null || umount -l /newroot/proc 2>/dev/null || true +rm -rf /newroot/mnt/packages +rm -f /newroot/tmp/install-packages.sh +rm -f /newroot/tmp/fcvm-setup.sh + +# Sync and unmount rootfs +sync +umount /newroot 2>/dev/null || umount -l /newroot 2>/dev/null || true + +echo "FCVM_SETUP_COMPLETE" +echo "FCVM Layer 2 Setup: Complete! Powering off..." +umount /proc /sys /dev 2>/dev/null || true +poweroff -f +"#, + install_script, setup_script + ) +} - // Check same directory (cargo install case) - let fc_agent = exe_dir.join("fc-agent"); - if fc_agent.exists() { - return Ok(fc_agent); +/// The script content is deterministic - same plan always produces same script. +/// The SHA256 of this script determines the rootfs image name. +/// +/// NOTE: This script does NOT install packages - they are installed from +/// install-packages.sh before this script runs. 
+pub fn generate_setup_script(plan: &Plan) -> String { + let mut s = String::new(); + + // Script header - runs after packages are installed from initrd + s.push_str("#!/bin/bash\n"); + s.push_str("set -euo pipefail\n\n"); + + // Note: No partition resize needed - filesystem is already resized on host + // (we use a raw ext4 filesystem without partition table)\n + + // Note: Packages are already installed by install-packages.sh + // We just need to include the package list in the script for SHA calculation + let packages = plan.packages.all_packages(); + s.push_str("# Packages (installed from initrd): "); + s.push_str(&packages.join(", ")); + s.push_str("\n\n"); + + // Write configuration files (sorted for deterministic output) + let mut file_paths: Vec<_> = plan.files.keys().collect(); + file_paths.sort(); + + s.push_str("# Write configuration files\n"); + for path in file_paths { + let config = &plan.files[path]; + // Create parent directory if needed + if let Some(parent) = std::path::Path::new(path).parent() { + if parent != std::path::Path::new("") && parent != std::path::Path::new("/") { + s.push_str(&format!("mkdir -p {}\n", parent.display())); + } + } + s.push_str(&format!("cat > {} << 'FCVM_EOF'\n", path)); + s.push_str(&config.content); + if !config.content.ends_with('\n') { + s.push('\n'); + } + s.push_str("FCVM_EOF\n\n"); } - // Check parent directory (test case: exe in target/release/deps/, agent in target/release/) - if let Some(parent) = exe_dir.parent() { - let fc_agent_parent = parent.join("fc-agent"); - if fc_agent_parent.exists() { - return Ok(fc_agent_parent); + // Fix fstab (remove problematic entries) + if !plan.fstab.remove_patterns.is_empty() { + s.push_str("# Fix /etc/fstab\n"); + for pattern in &plan.fstab.remove_patterns { + // Use sed to remove lines containing the pattern + s.push_str(&format!("sed -i '/{}/d' /etc/fstab\n", pattern.replace('/', "\\/"))); } + s.push('\n'); } - // Fallback: environment variable override for special cases 
- if let Ok(path) = std::env::var("FC_AGENT_PATH") { - let p = PathBuf::from(&path); - if p.exists() { - return Ok(p); + // Configure container registries + s.push_str("# Configure Podman registries\n"); + s.push_str("cat > /etc/containers/registries.conf << 'FCVM_EOF'\n"); + s.push_str("unqualified-search-registries = [\"docker.io\"]\n\n"); + s.push_str("[[registry]]\n"); + s.push_str("location = \"docker.io\"\n"); + s.push_str("FCVM_EOF\n\n"); + + // Enable services + if !plan.services.enable.is_empty() { + s.push_str("# Enable services\n"); + s.push_str("systemctl enable"); + for svc in &plan.services.enable { + s.push_str(&format!(" {}", svc)); + } + s.push('\n'); + } + + // Also enable serial console + s.push_str("systemctl enable serial-getty@ttyS0\n\n"); + + // Disable services + if !plan.services.disable.is_empty() { + s.push_str("# Disable services\n"); + s.push_str("systemctl disable"); + for svc in &plan.services.disable { + s.push_str(&format!(" {}", svc)); + } + s.push_str(" || true\n\n"); + } + + // Cleanup + if !plan.cleanup.remove_dirs.is_empty() { + s.push_str("# Cleanup unnecessary files\n"); + for pattern in &plan.cleanup.remove_dirs { + s.push_str(&format!("rm -rf {}\n", pattern)); + } + s.push('\n'); + } + + // Clean apt cache for smaller image + s.push_str("# Clean apt cache\n"); + s.push_str("apt-get clean\n"); + s.push_str("rm -rf /var/lib/apt/lists/*\n\n"); + + s.push_str("echo 'FCVM_SETUP_COMPLETE'\n"); + s.push_str("# Shutdown to signal completion\n"); + s.push_str("shutdown -h now\n"); + s +} + + +// ============================================================================ +// Plan Loading and SHA256 +// ============================================================================ + +/// Find the plan file in the workspace +fn find_plan_file() -> Result { + // Try relative to current exe (for installed binary) + let exe_path = std::env::current_exe().context("getting current executable path")?; + let exe_dir = 
exe_path.parent().context("getting executable directory")?; + + // Check various locations + let candidates = [ + exe_dir.join(PLAN_FILE), + exe_dir.join("..").join(PLAN_FILE), + exe_dir.join("../..").join(PLAN_FILE), + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(PLAN_FILE), + ]; + + for path in &candidates { + if path.exists() { + return Ok(path.canonicalize().context("canonicalizing plan file path")?); } } + // Fallback to CARGO_MANIFEST_DIR for development + let manifest_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(PLAN_FILE); + if manifest_path.exists() { + return Ok(manifest_path); + } + bail!( - "fc-agent binary not found at {} or via FC_AGENT_PATH env var.\n\ - Build with: cargo build --release", - fc_agent.display() + "rootfs-plan.toml not found. Checked: {:?}", + candidates.iter().map(|p| p.display().to_string()).collect::>() ) } -/// Helper to convert Path to str with proper error handling -fn path_to_str(path: &Path) -> Result<&str> { - path.to_str() - .ok_or_else(|| anyhow::anyhow!("path contains invalid UTF-8: {:?}", path)) +/// Load and parse the plan file +pub fn load_plan() -> Result<(Plan, String, String)> { + let plan_path = find_plan_file()?; + let plan_content = std::fs::read_to_string(&plan_path) + .with_context(|| format!("reading plan file: {}", plan_path.display()))?; + + // Compute SHA256 of plan content (first 12 chars for image naming) + let plan_sha = compute_sha256(plan_content.as_bytes()); + let plan_sha_short = plan_sha[..12].to_string(); + + let plan: Plan = toml::from_str(&plan_content) + .with_context(|| format!("parsing plan file: {}", plan_path.display()))?; + + info!( + plan_file = %plan_path.display(), + plan_sha = %plan_sha_short, + "loaded rootfs plan" + ); + + Ok((plan, plan_sha, plan_sha_short)) +} + +/// Compute SHA256 of bytes, return hex string +pub fn compute_sha256(data: &[u8]) -> String { + let mut hasher = Sha256::new(); + hasher.update(data); + format!("{:x}", hasher.finalize()) } -/// Ensure rootfs 
exists, creating minimal Ubuntu + Podman if needed
+// ============================================================================
+// Public API
+// ============================================================================
+
+/// Ensure rootfs exists, creating if needed (NO ROOT REQUIRED)
+///
+/// The rootfs is named after the generated setup script SHA256: layer2-{script_sha}.raw
+/// If the script changes (due to plan changes), a new rootfs is created automatically.
 ///
-/// Caches the rootfs filesystem - only creates it once.
-/// The base rootfs is immutable after creation to prevent corruption when VMs start in parallel.
+/// Layer 2 creation flow (all rootless):
+/// 1. Download Ubuntu cloud image (qcow2)
+/// 2. Convert to raw with qemu-img
+/// 3. Expand to 10GB with truncate
+/// 4. Download packages
+/// 5. Create initrd with embedded packages
+/// 6. Boot VM with initrd to install packages (no network needed)
+/// 7. Wait for VM to shut down
+/// 8. Rename to layer2-{sha}.raw
+///
+/// NOTE: fc-agent is NOT included in Layer 2. It will be injected per-VM at boot time.
+/// Layer 2 only contains packages (podman, crun, etc.).
pub async fn ensure_rootfs() -> Result { + let (plan, _plan_sha_full, _plan_sha_short) = load_plan()?; + + // Generate all scripts and compute hash of the complete init script + let setup_script = generate_setup_script(&plan); + let install_script = generate_install_script(); + let init_script = generate_init_script(&install_script, &setup_script); + + // Get kernel URL for the current architecture + let kernel_config = plan.kernel.current_arch()?; + let kernel_url = &kernel_config.url; + + // Hash the complete init script + kernel URL + // Any change to: + // - init logic, install script, or setup script + // - kernel URL (different kernel version/release) + // invalidates the cache + let mut combined = init_script.clone(); + combined.push_str("\n# KERNEL_URL: "); + combined.push_str(kernel_url); + let script_sha = compute_sha256(combined.as_bytes()); + let script_sha_short = &script_sha[..12]; + let rootfs_dir = paths::rootfs_dir(); - let rootfs_path = paths::base_rootfs(); + let rootfs_path = rootfs_dir.join(format!("layer2-{}.raw", script_sha_short)); let lock_file = rootfs_dir.join(".rootfs-creation.lock"); - // If rootfs exists, return it immediately (it's immutable after creation) - // DO NOT modify the base rootfs on every VM start - this causes: - // 1. Filesystem corruption when VMs start in parallel - // 2. Unnecessary latency (~100ms per VM start) - // 3. 
Violates the "base rootfs is immutable" principle - // - // To update fc-agent: delete the rootfs and it will be recreated, OR - // explicitly run `fcvm setup rootfs` (TODO: implement setup command) + // If rootfs exists for this script, return it if rootfs_path.exists() { - info!(path = %rootfs_path.display(), "rootfs exists (using cached)"); + info!( + path = %rootfs_path.display(), + script_sha = %script_sha_short, + "rootfs exists for current script (using cached)" + ); return Ok(rootfs_path); } @@ -83,7 +464,6 @@ pub async fn ensure_rootfs() -> Result { .context("creating rootfs directory")?; // Acquire lock to prevent concurrent rootfs creation - // If multiple VMs start simultaneously, only one creates the rootfs info!("acquiring rootfs creation lock"); use std::os::unix::fs::OpenOptionsExt; let lock_fd = std::fs::OpenOptions::new() @@ -99,39 +479,41 @@ pub async fn ensure_rootfs() -> Result { .map_err(|(_, err)| err) .context("acquiring rootfs creation lock")?; - // Check again after acquiring lock (another process may have created it) + // Check again after acquiring lock if rootfs_path.exists() { - info!(path = %rootfs_path.display(), "rootfs exists (created by another process)"); + info!( + path = %rootfs_path.display(), + "rootfs exists (created by another process)" + ); flock.unlock().map_err(|(_, err)| err).ok(); let _ = std::fs::remove_file(&lock_file); return Ok(rootfs_path); } - // Now we have exclusive access, create the rootfs - info!("creating base rootfs from Ubuntu cloud image"); - info!("note: first-time cloud image download may take 5-15 minutes"); - info!("cached rootfs creation takes ~45 seconds"); + // Create the rootfs + info!( + script_sha = %script_sha_short, + "creating Layer 2 rootfs (first-time may take 5-15 minutes)" + ); - // Create at temp path first, then rename when complete to avoid race conditions. - // Other processes check if rootfs_path exists, so we must not create it until - // package installation is complete. 
- let temp_rootfs_path = rootfs_path.with_extension("ext4.tmp"); + // Log the generated script for debugging + debug!("generated setup script:\n{}", setup_script); - // Clean up any leftover temp file from a previous failed attempt + let temp_rootfs_path = rootfs_path.with_extension("raw.tmp"); let _ = tokio::fs::remove_file(&temp_rootfs_path).await; - let result = create_ubuntu_rootfs(&temp_rootfs_path) - .await - .context("creating Ubuntu rootfs"); + let result = create_layer2_rootless(&plan, script_sha_short, &setup_script, &temp_rootfs_path).await; - // If successful, rename temp file to final path if result.is_ok() { tokio::fs::rename(&temp_rootfs_path, &rootfs_path) .await .context("renaming temp rootfs to final path")?; - info!("rootfs creation complete"); + info!( + path = %rootfs_path.display(), + script_sha = %script_sha_short, + "Layer 2 rootfs creation complete" + ); } else { - // Clean up temp file on failure let _ = tokio::fs::remove_file(&temp_rootfs_path).await; } @@ -143,599 +525,1057 @@ pub async fn ensure_rootfs() -> Result { let _ = std::fs::remove_file(&lock_file); result?; - Ok(rootfs_path) } -/// Create Ubuntu rootfs from official cloud image +/// Find the fc-agent binary for per-VM injection /// -/// Downloads Ubuntu 24.04 cloud image (cached), customizes it with virt-customize, -/// extracts to ext4, then installs packages. -async fn create_ubuntu_rootfs(output_path: &Path) -> Result<()> { - // Download Ubuntu cloud image (cached) - let cloud_image = download_ubuntu_cloud_image().await?; - - info!("customizing Ubuntu cloud image with virt-customize"); +/// fc-agent is NOT included in Layer 2 (the base rootfs). Instead, it is +/// injected per-VM at boot time via initrd. This function is used to locate +/// the binary for that injection. +/// +/// Both fcvm and fc-agent are workspace members built together. +/// Search order: +/// 1. Same directory as current exe +/// 2. Parent directory (for tests in target/release/deps/) +/// 3. 
FC_AGENT_PATH environment variable +pub fn find_fc_agent_binary() -> Result { + let exe_path = std::env::current_exe().context("getting current executable path")?; + let exe_dir = exe_path.parent().context("getting executable directory")?; - // Customize the qcow2 image BEFORE extracting - customize_ubuntu_cloud_image(&cloud_image).await?; + // Check same directory + let fc_agent = exe_dir.join("fc-agent"); + if fc_agent.exists() { + return Ok(fc_agent); + } - // Extract root partition from customized cloud image - info!("extracting customized root partition"); - extract_root_partition(&cloud_image, output_path).await?; + // Check parent directory (test case) + if let Some(parent) = exe_dir.parent() { + let fc_agent_parent = parent.join("fc-agent"); + if fc_agent_parent.exists() { + return Ok(fc_agent_parent); + } + } - // Install packages after extraction (virt-customize has networking issues) - info!("installing packages in extracted rootfs"); - install_packages_in_rootfs(output_path).await?; + // Fallback: environment variable + if let Ok(path) = std::env::var("FC_AGENT_PATH") { + let p = PathBuf::from(&path); + if p.exists() { + return Ok(p); + } + } - Ok(()) + bail!( + "fc-agent binary not found at {} or via FC_AGENT_PATH env var.\n\ + Build with: cargo build --release", + fc_agent.display() + ) } -/// Download Ubuntu cloud image (cached) -async fn download_ubuntu_cloud_image() -> Result { - let cache_dir = paths::base_dir().join("cache"); - tokio::fs::create_dir_all(&cache_dir) - .await - .context("creating cache directory")?; - - // Detect architecture and use appropriate cloud image - let (arch_name, cloud_arch) = match std::env::consts::ARCH { - "x86_64" => ("amd64", "amd64"), - "aarch64" => ("arm64", "arm64"), - other => bail!("unsupported architecture: {}", other), - }; - - let image_url = format!( - "https://cloud-images.ubuntu.com/releases/24.04/release/ubuntu-24.04-server-cloudimg-{cloud_arch}.img" - ); - let image_path = 
cache_dir.join(format!("ubuntu-24.04-{arch_name}.img")); - - // Return cached image if it exists - if image_path.exists() { - info!(path = %image_path.display(), "using cached Ubuntu cloud image"); - return Ok(image_path); +// ============================================================================ +// fc-agent Initrd Creation +// ============================================================================ + +/// The fc-agent systemd service unit file content +const FC_AGENT_SERVICE: &str = r#"[Unit] +Description=fcvm guest agent for container orchestration +After=network.target + +[Service] +Type=simple +ExecStart=/usr/local/bin/fc-agent +Restart=on-failure +RestartSec=1 + +[Install] +WantedBy=multi-user.target +"#; + +/// The init script for the initrd +/// This runs before the real init, copies fc-agent to the rootfs, then switches root +const INITRD_INIT_SCRIPT: &str = r#"#!/bin/busybox sh +# fc-agent injection initrd +# This runs before systemd, copies fc-agent to the rootfs, then switch_root + +# Install busybox applets +/bin/busybox mkdir -p /bin /sbin /proc /sys /dev /newroot +/bin/busybox --install -s /bin +/bin/busybox --install -s /sbin + +# Mount essential filesystems +mount -t proc proc /proc +mount -t sysfs sys /sys +mount -t devtmpfs dev /dev + +# Parse kernel cmdline to find root device +ROOT="" +for param in $(cat /proc/cmdline); do + case "$param" in + root=*) + ROOT="${param#root=}" + ;; + esac +done + +if [ -z "$ROOT" ]; then + echo "ERROR: No root= parameter found in kernel cmdline" + exec /bin/sh +fi + +# Handle /dev/vda1 style paths +case "$ROOT" in + /dev/*) + # Wait for device to appear + for i in 1 2 3 4 5; do + if [ -b "$ROOT" ]; then + break + fi + echo "Waiting for $ROOT..." + sleep 1 + done + ;; +esac + +# Mount the real root filesystem +echo "Mounting $ROOT as real root..." +mount -o rw "$ROOT" /newroot + +if [ ! 
-d /newroot/usr ]; then + echo "ERROR: Failed to mount root filesystem" + exec /bin/sh +fi + +# Copy fc-agent binary +echo "Installing fc-agent..." +cp /fc-agent /newroot/usr/local/bin/fc-agent +chmod 755 /newroot/usr/local/bin/fc-agent + +# Copy service file +cp /fc-agent.service /newroot/etc/systemd/system/fc-agent.service + +# Enable the service (create symlink) +mkdir -p /newroot/etc/systemd/system/multi-user.target.wants +ln -sf ../fc-agent.service /newroot/etc/systemd/system/multi-user.target.wants/fc-agent.service + +echo "fc-agent installed successfully" + +# Also ensure MMDS route config exists (in case setup script failed) +mkdir -p /newroot/etc/systemd/network/10-eth0.network.d +if [ ! -f /newroot/etc/systemd/network/10-eth0.network.d/mmds.conf ]; then + echo "Adding MMDS route config..." + cat > /newroot/etc/systemd/network/10-eth0.network.d/mmds.conf << 'MMDSCONF' +[Route] +Destination=169.254.169.254/32 +Scope=link +MMDSCONF +fi + +# Also create the base network config if missing +if [ ! -f /newroot/etc/systemd/network/10-eth0.network ]; then + echo "Adding base network config..." + cat > /newroot/etc/systemd/network/10-eth0.network << 'NETCONF' +[Match] +Name=eth0 + +[Network] +KeepConfiguration=yes +NETCONF +fi + +# Cleanup +umount /proc +umount /sys +umount /dev + +# Switch to the real root and exec init +exec switch_root /newroot /sbin/init +"#; + +/// Ensure the fc-agent initrd exists, creating if needed +/// +/// The initrd is cached by fc-agent binary hash. When fc-agent is rebuilt, +/// a new initrd is automatically created. +/// +/// Returns the path to the initrd file. 
+pub async fn ensure_fc_agent_initrd() -> Result { + // Find fc-agent binary + let fc_agent_path = find_fc_agent_binary()?; + let fc_agent_bytes = std::fs::read(&fc_agent_path) + .with_context(|| format!("reading fc-agent binary at {}", fc_agent_path.display()))?; + let fc_agent_sha = compute_sha256(&fc_agent_bytes); + let fc_agent_sha_short = &fc_agent_sha[..12]; + + // Check if initrd already exists for this fc-agent version + let initrd_dir = paths::base_dir().join("initrd"); + let initrd_path = initrd_dir.join(format!("fc-agent-{}.initrd", fc_agent_sha_short)); + + if initrd_path.exists() { + debug!( + path = %initrd_path.display(), + fc_agent_sha = %fc_agent_sha_short, + "using cached fc-agent initrd" + ); + return Ok(initrd_path); } - info!(url = %image_url, "downloading Ubuntu 24.04 cloud image"); - info!("download size: ~644MB (one-time, cached for future use)"); - info!("download may take 5-15 minutes depending on network speed"); - - // Download with reqwest - let client = reqwest::Client::new(); - let response = client - .get(image_url) - .send() + // Create initrd directory + tokio::fs::create_dir_all(&initrd_dir) .await - .context("downloading cloud image")?; + .context("creating initrd directory")?; - if !response.status().is_success() { - bail!("download failed with status: {}", response.status()); - } - - // Get content length for progress reporting - let total_size = response.content_length().unwrap_or(0); - let total_mb = total_size as f64 / 1024.0 / 1024.0; - - // Stream to file with progress - let mut file = File::create(&image_path) - .await - .context("creating image file")?; + info!( + fc_agent = %fc_agent_path.display(), + fc_agent_sha = %fc_agent_sha_short, + "creating fc-agent initrd" + ); - let bytes = response.bytes().await.context("reading response body")?; - let downloaded_mb = bytes.len() as f64 / 1024.0 / 1024.0; + // Create temporary directory for initrd contents + let temp_dir = initrd_dir.join(format!(".initrd-build-{}", 
fc_agent_sha_short)); + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + tokio::fs::create_dir_all(&temp_dir).await?; - file.write_all(&bytes).await.context("writing image file")?; - file.flush().await.context("flushing image file")?; + // Create directory structure + for dir in &["bin", "sbin", "dev", "proc", "sys", "newroot"] { + tokio::fs::create_dir_all(temp_dir.join(dir)).await?; + } - info!(path = %image_path.display(), - downloaded_mb = downloaded_mb, - expected_mb = total_mb, - "cloud image download complete"); + // Find busybox (prefer static version) + let busybox_path = find_busybox()?; - Ok(image_path) -} + // Copy busybox + tokio::fs::copy(&busybox_path, temp_dir.join("bin/busybox")).await?; -/// Extract root partition from qcow2 cloud image to a raw ext4 file -async fn extract_root_partition(qcow2_path: &Path, output_path: &Path) -> Result<()> { - info!("extracting root partition from cloud image"); + // Make busybox executable + Command::new("chmod") + .args(["755", temp_dir.join("bin/busybox").to_str().unwrap()]) + .output() + .await?; - // Find a free NBD device - let nbd_device = "/dev/nbd0"; + // Write init script + tokio::fs::write(temp_dir.join("init"), INITRD_INIT_SCRIPT).await?; + Command::new("chmod") + .args(["755", temp_dir.join("init").to_str().unwrap()]) + .output() + .await?; - // Load nbd kernel module if not already loaded - let _ = Command::new("modprobe") - .arg("nbd") - .arg("max_part=8") + // Copy fc-agent binary + tokio::fs::copy(&fc_agent_path, temp_dir.join("fc-agent")).await?; + Command::new("chmod") + .args(["755", temp_dir.join("fc-agent").to_str().unwrap()]) .output() - .await; + .await?; - // Connect qcow2 to NBD device - info!("connecting qcow2 to NBD device"); - let output = Command::new("qemu-nbd") - .args(["--connect", nbd_device, "-r", path_to_str(qcow2_path)?]) + // Write service file + tokio::fs::write(temp_dir.join("fc-agent.service"), FC_AGENT_SERVICE).await?; + + // Create cpio archive (initrd format) + let 
temp_initrd = initrd_path.with_extension("initrd.tmp"); + let output = Command::new("sh") + .args([ + "-c", + &format!( + "cd {} && find . | cpio -o -H newc 2>/dev/null | gzip > {}", + temp_dir.display(), + temp_initrd.display() + ), + ]) .output() .await - .context("running qemu-nbd connect")?; + .context("creating initrd cpio archive")?; if !output.status.success() { bail!( - "qemu-nbd connect failed: {}", + "Failed to create initrd: {}", String::from_utf8_lossy(&output.stderr) ); } - // Force kernel to re-read partition table - required on some systems (e.g., CI runners) - // Try partprobe first (from parted), fall back to partx (from util-linux) - info!("scanning partition table"); - let partprobe_result = Command::new("partprobe").arg(nbd_device).output().await; - if partprobe_result.is_err() - || !partprobe_result - .as_ref() - .map(|o| o.status.success()) - .unwrap_or(false) - { - // Fallback to partx - let _ = Command::new("partx") - .args(["-a", nbd_device]) - .output() - .await; - } - - // Wait for partition to appear with retry loop - let partition = format!("{}p1", nbd_device); - - // Small delay to allow kernel to create partition device nodes - // This is needed because partprobe/partx returns before udev creates the nodes - tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - - let mut retries = 10; - while retries > 0 && !std::path::Path::new(&partition).exists() { - info!( - partition = %partition, - retries_left = retries, - "waiting for partition to appear" - ); - tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; - retries -= 1; - } + // Rename to final path + tokio::fs::rename(&temp_initrd, &initrd_path).await?; - // If partition still doesn't exist, try to create the device node manually. - // This is needed when running in a container where the host kernel creates - // the partition device on the host's devtmpfs, but the container has its own. - // NBD major is 43, partition 1 is minor 1. 
- if !std::path::Path::new(&partition).exists() { - info!("partition not auto-created, trying mknod"); + // Cleanup temp directory + let _ = tokio::fs::remove_dir_all(&temp_dir).await; - // Get partition info from sysfs - let sysfs_path = "/sys/block/nbd0/nbd0p1/dev"; - let dev_info = tokio::fs::read_to_string(sysfs_path).await; + info!( + path = %initrd_path.display(), + fc_agent_sha = %fc_agent_sha_short, + "fc-agent initrd created" + ); - if let Ok(dev_str) = dev_info { - // dev_str is "major:minor" e.g., "43:1" - let dev_str = dev_str.trim(); - info!(dev = %dev_str, "found partition info in sysfs"); + Ok(initrd_path) +} - // Create device node with mknod - let mknod_result = Command::new("mknod") - .args([&partition, "b", "43", "1"]) - .output() - .await; +/// Find busybox binary (prefer static version) +fn find_busybox() -> Result { + // Check for busybox-static first + for path in &["/bin/busybox-static", "/usr/bin/busybox-static", "/bin/busybox", "/usr/bin/busybox"] { + let p = PathBuf::from(path); + if p.exists() { + return Ok(p); + } + } - if let Ok(output) = mknod_result { - if output.status.success() { - info!(partition = %partition, "created partition device node"); - } else { - warn!("mknod failed: {}", String::from_utf8_lossy(&output.stderr)); - } + // Try which + if let Ok(output) = std::process::Command::new("which").arg("busybox").output() { + if output.status.success() { + let path = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if !path.is_empty() { + return Ok(PathBuf::from(path)); } - } else { - // Try mknod with assumed minor number (1 for first partition) - info!("sysfs info not available, trying mknod with assumed minor 1"); - let _ = Command::new("mknod") - .args([&partition, "b", "43", "1"]) - .output() - .await; } } - // Final check - if !std::path::Path::new(&partition).exists() { - // List what devices exist for debugging - let ls_output = Command::new("sh") - .args([ - "-c", - "ls -la /dev/nbd0* 2>/dev/null || echo 'no 
nbd devices'", - ]) - .output() - .await; - let devices = ls_output - .map(|o| String::from_utf8_lossy(&o.stdout).to_string()) - .unwrap_or_else(|_| "failed to list".to_string()); - - // Also check sysfs for partition info - let sysfs_output = Command::new("sh") - .args([ - "-c", - "cat /sys/block/nbd0/nbd0p1/dev 2>/dev/null || echo 'no sysfs info'", - ]) - .output() - .await; - let sysfs_info = sysfs_output - .map(|o| String::from_utf8_lossy(&o.stdout).to_string()) - .unwrap_or_else(|_| "no sysfs".to_string()); + bail!("busybox not found. Install with: apt-get install busybox-static") +} + +// ============================================================================ +// Layer 2 Creation (Rootless) +// ============================================================================ +/// Create Layer 2 rootfs without requiring root +/// +/// 1. Download cloud image (qcow2, cached) +/// 2. Convert to raw with qemu-img (no root) +/// 3. Expand to 10GB (no root) +/// 4. Download .deb packages on host (has network) +/// 5. Create initrd with embedded packages +/// 6. Boot VM with initrd to install packages (no network needed) +/// 7. Wait for VM to shut down +/// +/// NOTE: fc-agent is NOT included - it will be injected per-VM at boot time. +async fn create_layer2_rootless( + plan: &Plan, + script_sha_short: &str, + script: &str, + output_path: &Path, +) -> Result<()> { + // Step 1: Download cloud image (cached by URL) + let cloud_image = download_cloud_image(plan).await?; + + // Step 2: Convert qcow2 to raw (no root required!) + info!("converting qcow2 to raw format (no root required)"); + let full_disk_path = output_path.with_extension("full"); + let output = Command::new("qemu-img") + .args([ + "convert", + "-f", "qcow2", + "-O", "raw", + path_to_str(&cloud_image)?, + path_to_str(&full_disk_path)?, + ]) + .output() + .await + .context("running qemu-img convert")?; + + if !output.status.success() { bail!( - "partition {} not found after waiting. 
Devices: {}, Sysfs: {}", - partition, - devices.trim(), - sysfs_info.trim() + "qemu-img convert failed: {}", + String::from_utf8_lossy(&output.stderr) ); } - info!(partition = %partition, "copying root partition"); + // Step 3: Extract partition 1 (root filesystem) using fdisk and dd + // This avoids GPT partition table issues with Firecracker + info!("extracting root partition from GPT disk (no root required)"); + let partition_path = output_path.with_extension("converting"); + + // Get partition info using sfdisk + let output = Command::new("sfdisk") + .args(["-J", path_to_str(&full_disk_path)?]) + .output() + .await + .context("getting partition info")?; + + if !output.status.success() { + bail!("sfdisk failed: {}", String::from_utf8_lossy(&output.stderr)); + } + + // Parse sfdisk JSON output to find partition 1 + #[derive(serde::Deserialize)] + struct SfdiskOutput { + partitiontable: PartitionTable, + } + #[derive(serde::Deserialize)] + struct PartitionTable { + partitions: Vec, + } + #[derive(serde::Deserialize)] + struct Partition { + node: String, + start: u64, + size: u64, + #[serde(rename = "type")] + ptype: String, + } + + let sfdisk_output: SfdiskOutput = serde_json::from_slice(&output.stdout) + .context("parsing sfdisk JSON output")?; + + // Find the Linux filesystem partition (type ends with 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or similar) + let root_part = sfdisk_output.partitiontable.partitions.iter() + .find(|p| p.ptype.contains("0FC63DAF") || p.node.ends_with("1")) + .ok_or_else(|| anyhow::anyhow!("Could not find root partition in GPT disk"))?; + + info!( + partition = %root_part.node, + start_sector = root_part.start, + size_sectors = root_part.size, + "found root partition" + ); + + // Extract partition using dd (sector size is 512 bytes) let output = Command::new("dd") .args([ - &format!("if={}", partition), - &format!("of={}", path_to_str(output_path)?), - "bs=4M", + &format!("if={}", path_to_str(&full_disk_path)?), + &format!("of={}", 
path_to_str(&partition_path)?), + "bs=512", + &format!("skip={}", root_part.start), + &format!("count={}", root_part.size), + "status=progress", ]) .output() - .await; + .await + .context("extracting partition with dd")?; + + if !output.status.success() { + bail!("dd failed: {}", String::from_utf8_lossy(&output.stderr)); + } - // Always disconnect NBD - let disconnect_output = Command::new("qemu-nbd") - .args(["--disconnect", nbd_device]) + // Remove full disk image (no longer needed) + let _ = tokio::fs::remove_file(&full_disk_path).await; + + // Step 4: Expand the extracted partition to 10GB + info!("expanding partition to {}", LAYER2_SIZE); + let output = Command::new("truncate") + .args(["-s", LAYER2_SIZE, path_to_str(&partition_path)?]) .output() - .await; + .await + .context("expanding partition")?; - // Check dd result - let output = output.context("running dd")?; if !output.status.success() { - bail!("dd failed: {}", String::from_utf8_lossy(&output.stderr)); + bail!("truncate failed: {}", String::from_utf8_lossy(&output.stderr)); } - // Check disconnect result - if let Ok(disc_out) = disconnect_output { - if !disc_out.status.success() { - warn!( - "qemu-nbd disconnect warning: {}", - String::from_utf8_lossy(&disc_out.stderr) - ); - } + // Resize the ext4 filesystem to fill the partition + info!("resizing ext4 filesystem"); + let _output = Command::new("e2fsck") + .args(["-f", "-y", path_to_str(&partition_path)?]) + .output() + .await + .context("running e2fsck")?; + // e2fsck may return non-zero even on success (exit code 1 = errors corrected) + + let output = Command::new("resize2fs") + .args([path_to_str(&partition_path)?]) + .output() + .await + .context("running resize2fs")?; + + if !output.status.success() { + bail!("resize2fs failed: {}", String::from_utf8_lossy(&output.stderr)); } - // Resize the extracted ext4 to 10GB (plenty of space for containers) - info!("resizing filesystem to 10GB"); + // Step 4b: Fix /etc/fstab to remove BOOT and UEFI entries 
+ // This MUST happen before booting - systemd reads fstab before cloud-init runs + info!("fixing /etc/fstab to remove non-existent partition entries"); + fix_fstab_in_image(&partition_path).await?; + + // Step 5: Download packages on host (host has network!) + let packages_dir = download_packages(plan, script_sha_short).await?; + + // Step 6: Create initrd for Layer 2 setup with embedded packages + // The initrd runs before systemd and: + // - Mounts rootfs at /newroot + // - Copies packages from initrd to rootfs + // - Runs dpkg -i to install packages + // - Runs the setup script + // - Powers off + // Packages are embedded in the initrd (no second disk needed) + let install_script = generate_install_script(); + + let setup_initrd = create_layer2_setup_initrd(&install_script, script, &packages_dir).await?; + + // Step 7: Boot VM with initrd to run setup (no cloud-init needed!) + // Now we boot a pure ext4 partition (no GPT), so root=/dev/vda works + // Only one disk needed - packages are in the initrd + info!( + script_sha = %script_sha_short, + "booting VM with setup initrd (packages embedded)" + ); - // First resize the file itself to 10GB - let output = Command::new("truncate") - .args(["-s", "10G", path_to_str(output_path)?]) + boot_vm_for_setup(&partition_path, &setup_initrd).await?; + + // Step 8: Rename to final path + tokio::fs::rename(&partition_path, output_path) + .await + .context("renaming partition to output path")?; + + info!("Layer 2 creation complete (packages embedded in initrd)"); + Ok(()) +} + +/// Fix /etc/fstab in an ext4 image to remove BOOT and UEFI partition entries +/// +/// The Ubuntu cloud image has fstab entries for LABEL=BOOT and LABEL=UEFI +/// which cause systemd to enter emergency mode when these partitions don't exist. +/// We use debugfs to modify fstab directly in the ext4 image without mounting. 
+async fn fix_fstab_in_image(image_path: &Path) -> Result<()> { + // Read current fstab using debugfs + let output = Command::new("debugfs") + .args(["-R", "cat /etc/fstab", path_to_str(image_path)?]) .output() .await - .context("running truncate")?; + .context("reading fstab with debugfs")?; if !output.status.success() { bail!( - "truncate failed: {}", + "debugfs read failed: {}", String::from_utf8_lossy(&output.stderr) ); } - // Check and fix filesystem - let output = Command::new("e2fsck") - .args(["-f", "-y", path_to_str(output_path)?]) + let fstab_content = String::from_utf8_lossy(&output.stdout); + + // Filter out BOOT and UEFI entries + let new_fstab: String = fstab_content + .lines() + .filter(|line| { + !line.contains("LABEL=BOOT") && !line.contains("LABEL=UEFI") + }) + .collect::>() + .join("\n"); + + debug!("new fstab content:\n{}", new_fstab); + + // Write new fstab to a temp file + let temp_fstab = std::env::temp_dir().join("fstab.new"); + tokio::fs::write(&temp_fstab, format!("{}\n", new_fstab)) + .await + .context("writing temp fstab")?; + + // Write the new fstab back using debugfs -w + // debugfs command: rm /etc/fstab; write /tmp/fstab.new /etc/fstab + let output = Command::new("debugfs") + .args([ + "-w", + "-R", + &format!("rm /etc/fstab"), + path_to_str(image_path)?, + ]) .output() .await - .context("running e2fsck")?; + .context("removing old fstab with debugfs")?; - if !output.status.success() - && !output - .status - .code() - .map(|c| c == 1 || c == 2) - .unwrap_or(false) - { - // Exit codes 1-2 are warnings, not errors - warn!( - "e2fsck warnings: {}", + // rm might fail if file doesn't exist, that's OK + if !output.status.success() { + debug!( + "debugfs rm fstab (might be expected): {}", String::from_utf8_lossy(&output.stderr) ); } - // Resize filesystem to fill the file - let output = Command::new("resize2fs") - .arg(path_to_str(output_path)?) 
+ let output = Command::new("debugfs") + .args([ + "-w", + "-R", + &format!("write {} /etc/fstab", temp_fstab.display()), + path_to_str(image_path)?, + ]) .output() .await - .context("running resize2fs")?; + .context("writing new fstab with debugfs")?; if !output.status.success() { bail!( - "resize2fs failed: {}", + "debugfs write failed: {}", String::from_utf8_lossy(&output.stderr) ); } + // Cleanup temp file + let _ = tokio::fs::remove_file(&temp_fstab).await; + + // Verify the change + let output = Command::new("debugfs") + .args(["-R", "cat /etc/fstab", path_to_str(image_path)?]) + .output() + .await + .context("verifying fstab with debugfs")?; + + let new_content = String::from_utf8_lossy(&output.stdout); + if new_content.contains("LABEL=BOOT") || new_content.contains("LABEL=UEFI") { + warn!("fstab still contains BOOT/UEFI entries after fix - VM may enter emergency mode"); + } else { + info!("fstab fixed - removed BOOT and UEFI entries"); + } + Ok(()) } -/// Customize Ubuntu cloud image using virt-customize +/// Create a Layer 2 setup initrd with embedded packages /// -/// This modifies the qcow2 image in-place, adding Podman, fc-agent, and all configs. -/// Much simpler and more robust than manual mount/chroot/unmount. -async fn customize_ubuntu_cloud_image(image_path: &Path) -> Result<()> { - // Find fc-agent binary - let fc_agent_src = find_fc_agent_binary()?; - - info!("running virt-customize on cloud image"); - - let mut cmd = Command::new("virt-customize"); - cmd.arg("-a").arg(path_to_str(image_path)?); - - // Disable networking to avoid passt errors (packages installed later via chroot) - cmd.arg("--no-network"); - - // 1. Fix /etc/fstab - remove BOOT and UEFI partitions that don't exist - cmd.arg("--run-command") - .arg("sed -i '/LABEL=BOOT/d;/LABEL=UEFI/d' /etc/fstab"); - - // 2. 
Copy fc-agent binary (packages installed later via chroot) - // Note: universe repository already enabled in base cloud image - info!("adding fc-agent binary"); - cmd.arg("--run-command").arg("mkdir -p /usr/local/bin"); - cmd.arg("--copy-in") - .arg(format!("{}:/usr/local/bin/", fc_agent_src.display())); - cmd.arg("--chmod").arg("0755:/usr/local/bin/fc-agent"); - - // 4. Write chrony config (create directory first) - info!("adding chrony config"); - cmd.arg("--run-command").arg("mkdir -p /etc/chrony"); - let chrony_conf = "# NTP servers from pool.ntp.org\npool pool.ntp.org iburst\n\n\ - # Allow clock to be stepped (not slewed) for large time differences\n\ - makestep 1.0 3\n\n\ - # Directory for drift and other runtime files\n\ - driftfile /var/lib/chrony/drift\n"; - cmd.arg("--write") - .arg(format!("/etc/chrony/chrony.conf:{}", chrony_conf)); - - // 5. Write systemd-networkd config - info!("adding network config"); - cmd.arg("--run-command") - .arg("mkdir -p /etc/systemd/network /etc/systemd/network/10-eth0.network.d"); - - let network_config = "[Match]\nName=eth0\n\n[Network]\n# Keep kernel IP configuration from ip= boot parameter\nKeepConfiguration=yes\n# DNS is provided via kernel ip= boot parameter (gateway IP where dnsmasq listens)\n"; - cmd.arg("--write").arg(format!( - "/etc/systemd/network/10-eth0.network:{}", - network_config - )); +/// This creates a busybox-based initrd that: +/// 1. Mounts /dev/vda (rootfs) at /newroot +/// 2. Copies packages from /packages (embedded in initrd) to rootfs +/// 3. Runs dpkg -i to install packages inside rootfs +/// 4. Runs the setup script +/// 5. Powers off the VM +/// +/// Packages are embedded directly in the initrd, no second disk needed. +/// This allows using Kata's kernel which has FUSE but no ISO9660/SquashFS. 
+async fn create_layer2_setup_initrd( + install_script: &str, + setup_script: &str, + packages_dir: &Path, +) -> Result { + info!("creating Layer 2 setup initrd with embedded packages"); + + let temp_dir = PathBuf::from("/tmp/fcvm-layer2-initrd"); + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + tokio::fs::create_dir_all(&temp_dir).await?; - let mmds_route = "[Route]\nDestination=169.254.169.254/32\nScope=link\n"; - cmd.arg("--write").arg(format!( - "/etc/systemd/network/10-eth0.network.d/mmds.conf:{}", - mmds_route - )); + // Create the init script that runs before systemd + let init_script = generate_init_script(install_script, setup_script); - // 6. DNS configuration note - // DNS is now handled by fc-agent at startup (parses kernel cmdline, writes /etc/resolv.conf) - // This avoids relying on systemd service ordering which was unreliable on some CI runners - - // 7. Write fc-agent systemd service - info!("adding fc-agent service"); - let fc_agent_service = "[Unit]\nDescription=fcvm guest agent for container orchestration\n\ - After=network.target\nWants=network.target\n\n\ - [Service]\nType=simple\nExecStart=/usr/local/bin/fc-agent\n\ - Restart=on-failure\nRestartSec=5\n\ - StandardOutput=journal+console\nStandardError=journal+console\n\n\ - [Install]\nWantedBy=multi-user.target\n"; - cmd.arg("--write").arg(format!( - "/etc/systemd/system/fc-agent.service:{}", - fc_agent_service - )); + // Write init script + let init_path = temp_dir.join("init"); + tokio::fs::write(&init_path, &init_script).await?; - // 9. 
Enable services (fc-agent, other services enabled after package install) - info!("enabling systemd services"); - cmd.arg("--run-command") - .arg("systemctl enable fc-agent systemd-networkd serial-getty@ttyS0"); + // Make init executable + let output = Command::new("chmod") + .args(["755", path_to_str(&init_path)?]) + .output() + .await + .context("making init executable")?; - info!("executing virt-customize (this should be quick)"); + if !output.status.success() { + bail!("Failed to chmod init: {}", String::from_utf8_lossy(&output.stderr)); + } - let output = cmd.output().await.context("running virt-customize")?; + // Copy busybox static binary + let busybox_src = PathBuf::from("/bin/busybox"); + let busybox_dst = temp_dir.join("bin").join("busybox"); + tokio::fs::create_dir_all(temp_dir.join("bin")).await?; + tokio::fs::copy(&busybox_src, &busybox_dst) + .await + .context("copying busybox")?; + + let output = Command::new("chmod") + .args(["755", path_to_str(&busybox_dst)?]) + .output() + .await + .context("making busybox executable")?; if !output.status.success() { + bail!("Failed to chmod busybox: {}", String::from_utf8_lossy(&output.stderr)); + } + + // Copy packages into initrd + let initrd_packages_dir = temp_dir.join("packages"); + tokio::fs::create_dir_all(&initrd_packages_dir).await?; + + // Copy all .deb files from packages_dir to initrd + let mut entries = tokio::fs::read_dir(packages_dir).await?; + let mut package_count = 0; + while let Some(entry) = entries.next_entry().await? { + let path = entry.path(); + if path.extension().map(|e| e == "deb").unwrap_or(false) { + let dest = initrd_packages_dir.join(entry.file_name()); + tokio::fs::copy(&path, &dest).await?; + package_count += 1; + } + } + info!(count = package_count, "embedded packages in initrd"); + + // Create the initrd using cpio + let initrd_path = temp_dir.join("initrd.cpio.gz"); + let cpio_output = Command::new("sh") + .args([ + "-c", + &format!( + "cd {} && find . 
| cpio -o -H newc 2>/dev/null | gzip > {}", + temp_dir.display(), + initrd_path.display() + ), + ]) + .output() + .await + .context("creating initrd cpio archive")?; + + if !cpio_output.status.success() { bail!( - "virt-customize failed:\n{}", - String::from_utf8_lossy(&output.stderr) + "Failed to create initrd: {}", + String::from_utf8_lossy(&cpio_output.stderr) ); } - info!("virt-customize completed successfully"); + // Log initrd size + if let Ok(meta) = tokio::fs::metadata(&initrd_path).await { + let size_mb = meta.len() as f64 / 1024.0 / 1024.0; + info!(path = %initrd_path.display(), size_mb = format!("{:.1}", size_mb), "Layer 2 setup initrd created"); + } - Ok(()) + Ok(initrd_path) } -/// Install packages in extracted rootfs using mount + chroot +/// Download all required .deb packages on the host /// -/// This is done AFTER extraction because virt-customize has networking issues. -/// Still much simpler than the old approach - single-purpose mount+chroot. -async fn install_packages_in_rootfs(rootfs_path: &Path) -> Result<()> { - let temp_dir = PathBuf::from("/tmp/fcvm-rootfs-install"); - let mount_point = temp_dir.join("mnt"); - - // Cleanup any previous mounts - let _ = Command::new("umount") - .arg("-R") - .arg(path_to_str(&mount_point).unwrap_or("/tmp/fcvm-rootfs-install/mnt")) - .output() - .await; - let _ = tokio::fs::remove_dir_all(&temp_dir).await; +/// Returns the path to the packages directory (not an ISO). +/// Packages will be embedded directly in the initrd. +/// +/// NOTE: fc-agent is NOT included - it will be injected per-VM at boot time. 
+async fn download_packages(plan: &Plan, script_sha_short: &str) -> Result { + let cache_dir = paths::base_dir().join("cache"); + let packages_dir = cache_dir.join(format!("packages-{}", script_sha_short)); + + // If packages directory already exists with .deb files, use it + if packages_dir.exists() { + if let Ok(mut entries) = tokio::fs::read_dir(&packages_dir).await { + let mut has_debs = false; + while let Ok(Some(entry)) = entries.next_entry().await { + if entry.path().extension().map(|e| e == "deb").unwrap_or(false) { + has_debs = true; + break; + } + } + if has_debs { + info!(path = %packages_dir.display(), "using cached packages directory"); + return Ok(packages_dir); + } + } + } - tokio::fs::create_dir_all(&mount_point) - .await - .context("creating temp mount directory")?; + // Create packages directory + let _ = tokio::fs::remove_dir_all(&packages_dir).await; + tokio::fs::create_dir_all(&packages_dir).await?; - // Mount the rootfs - let output = Command::new("mount") + // Get list of packages + let packages = plan.packages.all_packages(); + let packages_str = packages.join(" "); + + info!(packages = %packages_str, "downloading .deb packages on host"); + + // Download packages with dependencies using apt-get download + // We need to run this in a way that downloads packages for the target system + // Using apt-get download with proper architecture + let output = Command::new("apt-get") .args([ - "-o", - "loop", - path_to_str(rootfs_path)?, - path_to_str(&mount_point)?, + "download", + "-o", &format!("Dir::Cache::archives={}", packages_dir.display()), ]) + .args(&packages) + .current_dir(&packages_dir) .output() .await - .context("mounting rootfs for package installation")?; + .context("downloading packages with apt-get")?; if !output.status.success() { - bail!( - "mount failed: {}. 
Are you running as root?", - String::from_utf8_lossy(&output.stderr) - ); + // apt-get download might fail, try with apt-cache to get dependencies first + warn!("apt-get download failed, trying alternative method"); + + // Alternative: use apt-rdepends or manually download + for pkg in &packages { + let output = Command::new("apt-get") + .args(["download", pkg]) + .current_dir(&packages_dir) + .output() + .await; + + if let Ok(out) = output { + if !out.status.success() { + warn!(package = %pkg, "failed to download package, continuing..."); + } + } + } } - // Mount required filesystems for chroot - for (fs, target) in [ - ("proc", "proc"), - ("sysfs", "sys"), - ("devtmpfs", "dev"), - ("devpts", "dev/pts"), - ] { - let target_path = mount_point.join(target); - let _ = Command::new("mount") - .args(["-t", fs, fs, path_to_str(&target_path)?]) - .output() - .await; - } - - // Copy DNS resolution config into chroot for apt-get update - let resolv_conf_dest = mount_point.join("etc/resolv.conf"); - // Remove existing resolv.conf (might be a symlink) - let _ = tokio::fs::remove_file(&resolv_conf_dest).await; - tokio::fs::copy("/etc/resolv.conf", &resolv_conf_dest) - .await - .context("copying /etc/resolv.conf into chroot")?; - - // Install packages via chroot - let result = async { - // Update apt cache (universe already enabled in base cloud image) - info!("running apt-get update in chroot"); - let output = Command::new("chroot") - .arg(path_to_str(&mount_point)?) 
- .args(["apt-get", "update", "-y"]) - .output() - .await - .context("running apt-get update in chroot")?; + // Also download dependencies + info!("downloading package dependencies"); + let deps_output = Command::new("sh") + .args([ + "-c", + &format!( + "apt-cache depends --recurse --no-recommends --no-suggests --no-conflicts \ + --no-breaks --no-replaces --no-enhances {} | \ + grep '^\\w' | sort -u | xargs apt-get download 2>/dev/null || true", + packages_str + ), + ]) + .current_dir(&packages_dir) + .output() + .await; - // apt-get update completed successfully - no need to log verbose output + if let Err(e) = deps_output { + warn!(error = %e, "failed to download some dependencies, continuing..."); + } - if !output.status.success() { - bail!( - "apt-get update failed: {}", - String::from_utf8_lossy(&output.stderr) - ); + // Count downloaded packages + let mut count = 0; + if let Ok(mut entries) = tokio::fs::read_dir(&packages_dir).await { + while let Ok(Some(entry)) = entries.next_entry().await { + if entry.path().extension().map(|e| e == "deb").unwrap_or(false) { + count += 1; + } } + } + info!(count = count, "downloaded .deb packages"); - // Install packages (with verbose output) - info!("installing packages: podman crun fuse-overlayfs fuse3 haveged chrony"); - info!("package installation typically takes 30-60 seconds"); - - let output = Command::new("chroot") - .arg(path_to_str(&mount_point)?) - .env("DEBIAN_FRONTEND", "noninteractive") - .args([ - "apt-get", - "install", - "-y", - "-o", - "Dpkg::Options::=--force-confnew", // Force install new config files - "podman", - "crun", - "fuse-overlayfs", - "fuse3", - "haveged", - "chrony", - ]) - .output() - .await - .context("installing packages in chroot")?; + if count == 0 { + bail!("No packages downloaded. 
Check network and apt configuration."); + } - // Log apt output for debugging - info!( - "apt-get install stdout:\n{}", - String::from_utf8_lossy(&output.stdout) - ); - if !output.stderr.is_empty() { - info!( - "apt-get install stderr:\n{}", - String::from_utf8_lossy(&output.stderr) - ); - } + info!(path = %packages_dir.display(), count = count, "packages downloaded"); + Ok(packages_dir) +} - if !output.status.success() { - bail!( - "apt-get install failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - } +/// Download cloud image (cached by URL hash) +async fn download_cloud_image(plan: &Plan) -> Result { + let cache_dir = paths::base_dir().join("cache"); + tokio::fs::create_dir_all(&cache_dir) + .await + .context("creating cache directory")?; - // Enable services - let output = Command::new("chroot") - .arg(path_to_str(&mount_point)?) - .args(["systemctl", "enable", "haveged", "chrony"]) - .output() - .await - .context("enabling services in chroot")?; + // Get arch-specific config + let arch_config = match std::env::consts::ARCH { + "x86_64" => &plan.base.amd64, + "aarch64" => &plan.base.arm64, + other => bail!("unsupported architecture: {}", other), + }; - if !output.status.success() { - bail!( - "systemctl enable failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - } + let arch_name = match std::env::consts::ARCH { + "x86_64" => "amd64", + "aarch64" => "arm64", + other => other, + }; - // Configure Podman registries (after packages installed to avoid conffile conflict) - info!("configuring Podman container registries"); - let registries_conf_path = mount_point.join("etc/containers/registries.conf"); - let registries_content = "unqualified-search-registries = [\"docker.io\"]\n\n\ - [[registry]]\n\ - location = \"docker.io\"\n"; - tokio::fs::write(®istries_conf_path, registries_content) - .await - .context("writing registries.conf")?; - - // Write initial resolv.conf - will be overwritten by fcvm-setup-dns.service at boot - // The startup script 
extracts gateway IP from kernel cmdline and configures DNS - info!("configuring initial resolv.conf (will be updated at boot)"); - let resolv_conf_path = mount_point.join("etc/resolv.conf"); - tokio::fs::write( - &resolv_conf_path, - "# Placeholder - fcvm-setup-dns.service configures DNS at boot from kernel cmdline\nnameserver 127.0.0.53\n", - ) - .await - .context("writing resolv.conf")?; + // Cache by URL hash - changing URL triggers re-download + let url_hash = &compute_sha256(arch_config.url.as_bytes())[..12]; + let image_path = cache_dir.join(format!( + "ubuntu-{}-{}-{}.img", + plan.base.version, + arch_name, + url_hash + )); - Ok(()) + // If cached, use it + if image_path.exists() { + info!(path = %image_path.display(), "using cached cloud image"); + return Ok(image_path); } - .await; - // Always unmount (in reverse order) - for target in ["dev/pts", "dev", "sys", "proc", ""] { - let target_path = if target.is_empty() { - mount_point.clone() - } else { - mount_point.join(target) - }; - let _ = Command::new("umount") - .arg(path_to_str(&target_path).unwrap_or("")) - .output() - .await; + // Download + info!( + url = %arch_config.url, + "downloading Ubuntu cloud image (this may take several minutes)" + ); + + let temp_path = image_path.with_extension("img.download"); + let output = Command::new("curl") + .args([ + "-L", + "-o", + path_to_str(&temp_path)?, + "--progress-bar", + &arch_config.url, + ]) + .status() + .await + .context("downloading cloud image")?; + + if !output.success() { + bail!("curl failed to download cloud image"); } - // Cleanup + // Rename to final path + tokio::fs::rename(&temp_path, &image_path) + .await + .context("renaming downloaded image")?; + + info!( + path = %image_path.display(), + "cloud image downloaded" + ); + + Ok(image_path) +} + +/// Boot a Firecracker VM to run the Layer 2 setup initrd +/// +/// This boots with an initrd that has packages embedded: +/// - Mounts rootfs (/dev/vda) at /newroot +/// - Copies packages from 
/packages (in initrd RAM) to rootfs +/// - Runs dpkg -i to install packages inside rootfs via chroot +/// - Runs the setup script +/// - Powers off when complete +/// +/// Only one disk is needed - packages are embedded in the initrd. +/// This allows using Kata's kernel which has FUSE but no ISO9660/SquashFS. +async fn boot_vm_for_setup(disk_path: &Path, initrd_path: &Path) -> Result<()> { + use std::time::Duration; + use tokio::time::timeout; + + // Create a temporary directory for this setup VM + let temp_dir = PathBuf::from("/tmp/fcvm-layer2-setup"); let _ = tokio::fs::remove_dir_all(&temp_dir).await; + tokio::fs::create_dir_all(&temp_dir).await?; - result?; + let api_socket = temp_dir.join("firecracker.sock"); + let log_path = temp_dir.join("firecracker.log"); - info!("packages installed successfully"); + // Find kernel - downloaded from Kata release if needed + let kernel_path = crate::setup::kernel::ensure_kernel().await?; - Ok(()) + // Create serial console output file + let serial_path = temp_dir.join("serial.log"); + let serial_file = std::fs::File::create(&serial_path) + .context("creating serial console file")?; + + // Start Firecracker with serial console output + info!("starting Firecracker for Layer 2 setup (serial output: {})", serial_path.display()); + let mut fc_process = Command::new("firecracker") + .args([ + "--api-sock", path_to_str(&api_socket)?, + "--log-path", path_to_str(&log_path)?, + "--level", "Info", + ]) + .stdout(serial_file.try_clone().context("cloning serial file")?) 
+ .stderr(std::process::Stdio::null()) + .spawn() + .context("starting Firecracker")?; + + // Wait for socket to be ready + for _ in 0..50 { + if api_socket.exists() { + break; + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + + if !api_socket.exists() { + fc_process.kill().await.ok(); + bail!("Firecracker API socket not created"); + } + + // Configure VM via API + let client = crate::firecracker::api::FirecrackerClient::new(api_socket.clone())?; + + // Set boot source - boot from raw ext4 partition (no GPT) + // The disk IS the filesystem, so use root=/dev/vda directly + // No cloud-init needed - scripts are injected via debugfs and run by rc.local + client + .set_boot_source(crate::firecracker::api::BootSource { + kernel_image_path: kernel_path.display().to_string(), + // Boot with initrd that runs setup before trying to use systemd + // The initrd handles everything and powers off, so we don't need to worry about systemd + boot_args: Some("console=ttyS0 reboot=k panic=1 pci=off".to_string()), + initrd_path: Some(initrd_path.display().to_string()), + }) + .await?; + + // Add root drive (raw ext4 filesystem, no partition table) + client + .add_drive( + "rootfs", + crate::firecracker::api::Drive { + drive_id: "rootfs".to_string(), + path_on_host: disk_path.display().to_string(), + is_root_device: true, + is_read_only: false, + partuuid: None, + rate_limiter: None, + }, + ) + .await?; + + // No packages drive needed - packages are embedded in the initrd + + // Configure machine (minimal for setup) + client + .set_machine_config(crate::firecracker::api::MachineConfig { + vcpu_count: 2, + mem_size_mib: 2048, // 2GB for package installation + smt: Some(false), + cpu_template: None, + track_dirty_pages: None, + }) + .await?; + + // No network needed! Packages are installed from local ISO. 
+ + // Start the VM + client.put_action(crate::firecracker::api::InstanceAction::InstanceStart).await?; + info!("Layer 2 setup VM started, waiting for completion (this takes several minutes)"); + + // Wait for VM to shut down (setup script runs shutdown -h now when done) + // Timeout after 15 minutes + let start = std::time::Instant::now(); + let mut last_serial_len = 0usize; + let result = timeout(Duration::from_secs(900), async { + loop { + // Check if Firecracker process has exited + match fc_process.try_wait() { + Ok(Some(status)) => { + let elapsed = start.elapsed(); + info!("Firecracker exited with status: {:?} after {:?}", status, elapsed); + return Ok(elapsed); + } + Ok(None) => { + // Still running, check for new serial output and log it + if let Ok(serial_content) = tokio::fs::read_to_string(&serial_path).await { + if serial_content.len() > last_serial_len { + // Log new output (trimmed to avoid excessive logging) + let new_output = &serial_content[last_serial_len..]; + for line in new_output.lines() { + // Skip empty lines and lines that are just timestamps + if !line.trim().is_empty() { + debug!(target: "layer2_setup", "{}", line); + } + } + last_serial_len = serial_content.len(); + } + } + tokio::time::sleep(Duration::from_secs(5)).await; + } + Err(e) => { + return Err(anyhow::anyhow!("Error checking Firecracker status: {}", e)); + } + } + } + }) + .await; + + // Cleanup + fc_process.kill().await.ok(); + + match result { + Ok(Ok(elapsed)) => { + // Check for completion marker in serial output + let serial_content = tokio::fs::read_to_string(&serial_path).await.unwrap_or_default(); + if !serial_content.contains("FCVM_SETUP_COMPLETE") { + warn!("Setup failed! 
Serial console output:\n{}", serial_content); + if let Ok(log_content) = tokio::fs::read_to_string(&log_path).await { + warn!("Firecracker log:\n{}", log_content); + } + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + bail!("Layer 2 setup failed (no FCVM_SETUP_COMPLETE marker found)"); + } + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + info!(elapsed_secs = elapsed.as_secs(), "Layer 2 setup VM completed successfully"); + Ok(()) + } + Ok(Err(e)) => { + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + Err(e) + } + Err(_) => { + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + bail!("Layer 2 setup VM timed out after 15 minutes") + } + } +} + +/// Helper to convert Path to str +fn path_to_str(path: &Path) -> Result<&str> { + path.to_str() + .ok_or_else(|| anyhow::anyhow!("path contains invalid UTF-8: {:?}", path)) } diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 26a73f3d..16041926 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -13,6 +13,68 @@ use tokio::time::sleep; /// Global counter for unique test IDs static TEST_COUNTER: AtomicUsize = AtomicUsize::new(0); +/// Fail loudly if running as actual host root. +/// +/// Rootless tests break when run with `sudo` on the host because user namespace +/// mapping doesn't work correctly when you're already root. +/// +/// However, running as root inside a container is fine - the container provides +/// the isolation boundary, not the UID inside it. +/// +/// Call this at the start of any rootless test function. +pub fn require_non_root(test_name: &str) -> anyhow::Result<()> { + // Skip check if we're in a container - container is the isolation boundary + if is_in_container() { + return Ok(()); + } + + if nix::unistd::geteuid().is_root() { + anyhow::bail!( + "Rootless test '{}' cannot run as root! Run without sudo.", + test_name + ); + } + Ok(()) +} + +/// Check if we're running inside a container. 
+/// +/// Containers create marker files that we can use to detect containerized environments. +fn is_in_container() -> bool { + // Podman creates /run/.containerenv + if std::path::Path::new("/run/.containerenv").exists() { + return true; + } + // Docker creates /.dockerenv + if std::path::Path::new("/.dockerenv").exists() { + return true; + } + false +} + +/// Generate unique names for snapshot/clone tests. +/// +/// Returns (baseline_name, clone_name, snapshot_name, serve_name) with unique suffixes. +/// Uses process ID and atomic counter to ensure uniqueness across parallel tests. +/// +/// # Arguments +/// * `prefix` - Base name for the test (e.g., "portfwd", "internet") +/// +/// # Returns +/// Tuple of (baseline, clone, snapshot, serve) names +pub fn unique_names(prefix: &str) -> (String, String, String, String) { + let id = TEST_COUNTER.fetch_add(1, Ordering::SeqCst); + let pid = std::process::id(); + let suffix = format!("{}-{}", pid, id); + + ( + format!("{}-base-{}", prefix, suffix), + format!("{}-clone-{}", prefix, suffix), + format!("{}-snap-{}", prefix, suffix), + format!("{}-serve-{}", prefix, suffix), + ) +} + /// Fixture for managing a VM with FUSE volume for testing pub struct VmFixture { pub child: tokio::process::Child, diff --git a/tests/test_egress.rs b/tests/test_egress.rs index f067bdc2..5b672290 100644 --- a/tests/test_egress.rs +++ b/tests/test_egress.rs @@ -26,6 +26,7 @@ async fn test_egress_fresh_bridged() -> Result<()> { /// Test egress connectivity for fresh VM with rootless networking #[tokio::test] async fn test_egress_fresh_rootless() -> Result<()> { + common::require_non_root("test_egress_fresh_rootless")?; egress_fresh_test_impl("rootless").await } @@ -38,12 +39,13 @@ async fn test_egress_clone_bridged() -> Result<()> { /// Test egress connectivity for cloned VM with rootless networking #[tokio::test] async fn test_egress_clone_rootless() -> Result<()> { + common::require_non_root("test_egress_clone_rootless")?; 
egress_clone_test_impl("rootless").await } /// Implementation for testing egress on a fresh (non-cloned) VM async fn egress_fresh_test_impl(network: &str) -> Result<()> { - let vm_name = format!("egress-fresh-{}", network); + let (vm_name, _, _, _) = common::unique_names(&format!("egress-fresh-{}", network)); println!("\n╔═══════════════════════════════════════════════════════════════╗"); println!( @@ -103,9 +105,8 @@ async fn egress_fresh_test_impl(network: &str) -> Result<()> { /// Implementation for testing egress on a cloned VM async fn egress_clone_test_impl(network: &str) -> Result<()> { - let snapshot_name = format!("egress-snapshot-{}", network); - let baseline_name = format!("egress-baseline-{}", network); - let clone_name = format!("egress-clone-{}", network); + let (baseline_name, clone_name, snapshot_name, _) = + common::unique_names(&format!("egress-{}", network)); println!("\n╔═══════════════════════════════════════════════════════════════╗"); println!( diff --git a/tests/test_egress_stress.rs b/tests/test_egress_stress.rs index 6250e5ff..dc3c9dee 100644 --- a/tests/test_egress_stress.rs +++ b/tests/test_egress_stress.rs @@ -37,6 +37,7 @@ async fn test_egress_stress_bridged() -> Result<()> { /// Test egress stress with rootless networking using local HTTP server #[tokio::test] async fn test_egress_stress_rootless() -> Result<()> { + common::require_non_root("test_egress_stress_rootless")?; egress_stress_impl("rootless", NUM_CLONES, REQUESTS_PER_CLONE).await } @@ -45,7 +46,10 @@ async fn egress_stress_impl( num_clones: usize, requests_per_clone: usize, ) -> Result<()> { - let test_name = format!("egress-stress-{}", network); + // Use unique prefix for all resources + let (baseline_name, _, snapshot_name, _) = + common::unique_names(&format!("estress-{}", network)); + let test_name = baseline_name.clone(); // Use for clone naming println!("\n╔═══════════════════════════════════════════════════════════════╗"); println!( @@ -84,7 +88,6 @@ async fn 
egress_stress_impl( let fcvm_path = common::find_fcvm_binary()?; // Step 1: Start baseline VM - let baseline_name = format!("{}-baseline", test_name); println!("\nStep 1: Starting baseline VM '{}'...", baseline_name); let (_baseline_child, baseline_pid) = common::spawn_fcvm_with_logs( @@ -146,7 +149,6 @@ async fn egress_stress_impl( println!(" ✓ Baseline egress works"); // Step 2: Create snapshot - let snapshot_name = format!("{}-snapshot", test_name); println!("\nStep 2: Creating snapshot '{}'...", snapshot_name); let output = tokio::process::Command::new(&fcvm_path) diff --git a/tests/test_exec.rs b/tests/test_exec.rs index 96791263..8ce334ed 100644 --- a/tests/test_exec.rs +++ b/tests/test_exec.rs @@ -18,6 +18,7 @@ async fn test_exec_bridged() -> Result<()> { #[tokio::test] async fn test_exec_rootless() -> Result<()> { + common::require_non_root("test_exec_rootless")?; exec_test_impl("rootless").await } @@ -26,7 +27,7 @@ async fn exec_test_impl(network: &str) -> Result<()> { println!("================================"); let fcvm_path = common::find_fcvm_binary()?; - let vm_name = format!("exec-test-{}", network); + let (vm_name, _, _, _) = common::unique_names(&format!("exec-{}", network)); // Start the VM using spawn_fcvm helper (uses Stdio::inherit to prevent deadlock) println!("Starting VM..."); diff --git a/tests/test_port_forward.rs b/tests/test_port_forward.rs index 4fe4357c..e09d5302 100644 --- a/tests/test_port_forward.rs +++ b/tests/test_port_forward.rs @@ -22,15 +22,10 @@ struct VmDisplay { /// Test port forwarding with bridged networking #[test] fn test_port_forward_bridged() -> Result<()> { - // Requires root for bridged networking - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_port_forward_bridged: requires root"); - return Ok(()); - } - println!("\ntest_port_forward_bridged"); let fcvm_path = common::find_fcvm_binary()?; + let vm_name = format!("port-bridged-{}", std::process::id()); // Start VM with port forwarding let mut 
fcvm = Command::new(&fcvm_path) @@ -38,7 +33,7 @@ fn test_port_forward_bridged() -> Result<()> { "podman", "run", "--name", - "port-test", + &vm_name, "--network", "bridged", "--publish", @@ -187,9 +182,11 @@ fn test_port_forward_bridged() -> Result<()> { /// allowing multiple VMs to all forward the same port. #[test] fn test_port_forward_rootless() -> Result<()> { + common::require_non_root("test_port_forward_rootless")?; println!("\ntest_port_forward_rootless"); let fcvm_path = common::find_fcvm_binary()?; + let vm_name = format!("port-rootless-{}", std::process::id()); // Start VM with rootless networking and port forwarding // Use unprivileged port 8080 since rootless can't bind to 80 @@ -198,7 +195,7 @@ fn test_port_forward_rootless() -> Result<()> { "podman", "run", "--name", - "port-test-rootless", + &vm_name, "--network", "rootless", "--publish", diff --git a/tests/test_readme_examples.rs b/tests/test_readme_examples.rs index 17362444..28223f10 100644 --- a/tests/test_readme_examples.rs +++ b/tests/test_readme_examples.rs @@ -30,12 +30,6 @@ async fn test_readonly_volume() -> Result<()> { println!("\ntest_readonly_volume"); println!("===================="); - // Requires root for bridged networking (more reliable health checks) - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_readonly_volume: requires root for bridged networking"); - return Ok(()); - } - let test_id = format!("ro-{}", std::process::id()); let vm_name = format!("ro-vol-{}", std::process::id()); @@ -133,12 +127,6 @@ async fn test_env_variables() -> Result<()> { println!("\ntest_env_variables"); println!("=================="); - // Requires root for bridged networking (more reliable health checks) - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_env_variables: requires root for bridged networking"); - return Ok(()); - } - let vm_name = format!("env-test-{}", std::process::id()); // Start VM with environment variables using bridged mode for reliable health 
checks @@ -218,12 +206,6 @@ async fn test_custom_resources() -> Result<()> { println!("\ntest_custom_resources"); println!("====================="); - // Requires root for bridged networking (more reliable health checks) - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_custom_resources: requires root for bridged networking"); - return Ok(()); - } - let vm_name = format!("resources-test-{}", std::process::id()); // Start VM with custom resources using bridged mode for reliable health checks @@ -303,12 +285,6 @@ async fn test_fcvm_ls() -> Result<()> { println!("\ntest_fcvm_ls"); println!("============"); - // Requires root for bridged networking (more reliable health checks) - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_fcvm_ls: requires root for bridged networking"); - return Ok(()); - } - let fcvm_path = common::find_fcvm_binary()?; let vm_name = format!("ls-test-{}", std::process::id()); @@ -440,12 +416,6 @@ async fn test_custom_command() -> Result<()> { println!("\ntest_custom_command"); println!("==================="); - // Requires root for bridged networking (more reliable for custom commands) - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_custom_command: requires root for bridged networking"); - return Ok(()); - } - let vm_name = format!("cmd-test-{}", std::process::id()); // Use nginx:alpine with a custom command that: diff --git a/tests/test_sanity.rs b/tests/test_sanity.rs index 0356590f..65355c00 100644 --- a/tests/test_sanity.rs +++ b/tests/test_sanity.rs @@ -14,6 +14,7 @@ async fn test_sanity_bridged() -> Result<()> { #[tokio::test] async fn test_sanity_rootless() -> Result<()> { + common::require_non_root("test_sanity_rootless")?; sanity_test_impl("rootless").await } @@ -26,7 +27,7 @@ async fn sanity_test_impl(network: &str) -> Result<()> { // Start the VM using spawn_fcvm helper (uses Stdio::inherit to prevent deadlock) println!("Starting VM..."); - let vm_name = format!("sanity-test-{}", 
network); + let (vm_name, _, _, _) = common::unique_names(&format!("sanity-{}", network)); let (mut child, fcvm_pid) = common::spawn_fcvm(&[ "podman", "run", diff --git a/tests/test_signal_cleanup.rs b/tests/test_signal_cleanup.rs index 6bb62676..beb6930f 100644 --- a/tests/test_signal_cleanup.rs +++ b/tests/test_signal_cleanup.rs @@ -52,12 +52,6 @@ fn send_signal(pid: u32, signal: &str) -> Result<()> { /// Test that SIGINT properly kills the VM and cleans up firecracker #[test] fn test_sigint_kills_firecracker() -> Result<()> { - // This test requires root for bridged networking - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_sigint_kills_firecracker: requires root"); - return Ok(()); - } - println!("\ntest_sigint_kills_firecracker"); // Get initial firecracker count @@ -76,12 +70,13 @@ fn test_sigint_kills_firecracker() -> Result<()> { // Start fcvm in background let fcvm_path = common::find_fcvm_binary()?; + let vm_name = format!("signal-int-{}", std::process::id()); let mut fcvm = Command::new(&fcvm_path) .args([ "podman", "run", "--name", - "signal-test", + &vm_name, "--network", "bridged", "nginx:alpine", @@ -210,22 +205,17 @@ fn test_sigint_kills_firecracker() -> Result<()> { /// Test that SIGTERM properly kills the VM and cleans up firecracker #[test] fn test_sigterm_kills_firecracker() -> Result<()> { - // This test requires root for bridged networking - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_sigterm_kills_firecracker: requires root"); - return Ok(()); - } - println!("\ntest_sigterm_kills_firecracker"); // Start fcvm in background let fcvm_path = common::find_fcvm_binary()?; + let vm_name = format!("signal-term-{}", std::process::id()); let mut fcvm = Command::new(&fcvm_path) .args([ "podman", "run", "--name", - "signal-test-term", + &vm_name, "--network", "bridged", "nginx:alpine", diff --git a/tests/test_snapshot_clone.rs b/tests/test_snapshot_clone.rs index 6f8716f6..6d6d5a9b 100644 --- 
a/tests/test_snapshot_clone.rs +++ b/tests/test_snapshot_clone.rs @@ -17,12 +17,14 @@ use tokio::sync::Mutex; /// Full snapshot/clone workflow test with rootless networking (10 clones) #[tokio::test] async fn test_snapshot_clone_rootless_10() -> Result<()> { + common::require_non_root("test_snapshot_clone_rootless_10")?; snapshot_clone_test_impl("rootless", 10).await } /// Stress test with 100 clones using rootless networking #[tokio::test] async fn test_snapshot_clone_stress_100() -> Result<()> { + common::require_non_root("test_snapshot_clone_stress_100")?; snapshot_clone_test_impl("rootless", 100).await } @@ -36,8 +38,7 @@ struct CloneResult { } async fn snapshot_clone_test_impl(network: &str, num_clones: usize) -> Result<()> { - let snapshot_name = format!("test-snapshot-{}", network); - let baseline_name = format!("baseline-{}", network); + let (baseline_name, _, snapshot_name, _) = common::unique_names(&format!("snap-{}", network)); let test_start = Instant::now(); println!("\n╔═══════════════════════════════════════════════════════════════╗"); @@ -145,7 +146,7 @@ async fn snapshot_clone_test_impl(network: &str, num_clones: usize) -> Result<() let mut spawn_handles = Vec::new(); for i in 0..num_clones { - let clone_name = format!("clone-{}-{}", network, i); + let clone_name = format!("{}-{}", baseline_name.replace("-base-", "-clone-"), i); let network = network.to_string(); let results = Arc::clone(&results); let clone_pids = Arc::clone(&clone_pids); @@ -191,7 +192,7 @@ async fn snapshot_clone_test_impl(network: &str, num_clones: usize) -> Result<() }; results.lock().await.push(CloneResult { - name: clone_name, + name: clone_name.clone(), pid: clone_pid, spawn_time_ms: spawn_ms, health_time_secs: health_time, @@ -200,7 +201,7 @@ async fn snapshot_clone_test_impl(network: &str, num_clones: usize) -> Result<() } Err(e) => { results.lock().await.push(CloneResult { - name: clone_name, + name: clone_name.clone(), pid: 0, spawn_time_ms: 
spawn_start.elapsed().as_secs_f64() * 1000.0, health_time_secs: None, @@ -378,8 +379,7 @@ async fn snapshot_clone_test_impl(network: &str, num_clones: usize) -> Result<() /// isolation, Firecracker would try to bind to the same socket path stored in vmstate.bin. #[tokio::test] async fn test_clone_while_baseline_running() -> Result<()> { - let snapshot_name = "test-clone-running"; - let baseline_name = "baseline-running"; + let (baseline_name, clone_name, snapshot_name, _) = common::unique_names("running"); println!("\n╔═══════════════════════════════════════════════════════════════╗"); println!("║ Clone While Baseline Running Test ║"); @@ -394,12 +394,12 @@ async fn test_clone_while_baseline_running() -> Result<()> { "podman", "run", "--name", - baseline_name, + &baseline_name, "--network", "bridged", common::TEST_IMAGE, ], - baseline_name, + &baseline_name, ) .await .context("spawning baseline VM")?; @@ -417,7 +417,7 @@ async fn test_clone_while_baseline_running() -> Result<()> { "--pid", &baseline_pid.to_string(), "--tag", - snapshot_name, + &snapshot_name, ]) .output() .await @@ -437,19 +437,18 @@ async fn test_clone_while_baseline_running() -> Result<()> { // Step 4: Start memory server println!("\nStep 4: Starting memory server..."); let (_serve_child, serve_pid) = - common::spawn_fcvm_with_logs(&["snapshot", "serve", snapshot_name], "uffd-server") + common::spawn_fcvm_with_logs(&["snapshot", "serve", &snapshot_name], "uffd-server") .await .context("spawning memory server")?; // Wait for serve to be ready (poll for socket) - common::poll_serve_ready(snapshot_name, serve_pid, 30).await?; + common::poll_serve_ready(&snapshot_name, serve_pid, 30).await?; println!(" ✓ Memory server ready (PID: {})", serve_pid); // Step 5: Clone WHILE baseline is still running (this is the key test!) 
println!("\nStep 5: Spawning clone while baseline is STILL RUNNING..."); println!(" (This tests vsock socket isolation via mount namespace)"); - let clone_name = "clone-running"; let serve_pid_str = serve_pid.to_string(); let (_clone_child, clone_pid) = common::spawn_fcvm_with_logs( &[ @@ -458,11 +457,11 @@ async fn test_clone_while_baseline_running() -> Result<()> { "--pid", &serve_pid_str, "--name", - clone_name, + &clone_name, "--network", "bridged", ], - clone_name, + &clone_name, ) .await .context("spawning clone while baseline running")?; @@ -533,12 +532,13 @@ async fn test_clone_internet_bridged() -> Result<()> { /// Test that clones can reach the internet in rootless mode #[tokio::test] async fn test_clone_internet_rootless() -> Result<()> { + common::require_non_root("test_clone_internet_rootless")?; clone_internet_test_impl("rootless").await } async fn clone_internet_test_impl(network: &str) -> Result<()> { - let snapshot_name = format!("test-internet-{}", network); - let baseline_name = format!("baseline-internet-{}", network); + let (baseline_name, clone_name, snapshot_name, _) = + common::unique_names(&format!("inet-{}", network)); println!("\n╔═══════════════════════════════════════════════════════════════╗"); println!( @@ -608,7 +608,6 @@ async fn clone_internet_test_impl(network: &str) -> Result<()> { // Step 4: Spawn clone println!("\nStep 4: Spawning clone..."); - let clone_name = format!("clone-internet-{}", network); let serve_pid_str = serve_pid.to_string(); let (_clone_child, clone_pid) = common::spawn_fcvm_with_logs( &[ @@ -762,6 +761,429 @@ async fn test_clone_http(fcvm_path: &std::path::Path, clone_pid: u32) -> Result< } } +/// Test port forwarding on clones with bridged networking +/// +/// Verifies that --publish correctly forwards ports to cloned VMs. +/// This tests the full port forwarding path: host → iptables DNAT → clone VM → nginx. 
+#[tokio::test] +async fn test_clone_port_forward_bridged() -> Result<()> { + let (baseline_name, clone_name, snapshot_name, _) = common::unique_names("pf-bridged"); + + println!("\n╔═══════════════════════════════════════════════════════════════╗"); + println!("║ Clone Port Forwarding Test (bridged) ║"); + println!("╚═══════════════════════════════════════════════════════════════╝\n"); + + let fcvm_path = common::find_fcvm_binary()?; + + // Step 1: Start baseline VM with nginx + println!("Step 1: Starting baseline VM with nginx..."); + let (_baseline_child, baseline_pid) = common::spawn_fcvm_with_logs( + &[ + "podman", + "run", + "--name", + &baseline_name, + "--network", + "bridged", + common::TEST_IMAGE, + ], + &baseline_name, + ) + .await + .context("spawning baseline VM")?; + + println!(" Waiting for baseline VM to become healthy..."); + common::poll_health_by_pid(baseline_pid, 60).await?; + println!(" ✓ Baseline VM healthy (PID: {})", baseline_pid); + + // Step 2: Create snapshot + println!("\nStep 2: Creating snapshot..."); + let output = tokio::process::Command::new(&fcvm_path) + .args([ + "snapshot", + "create", + "--pid", + &baseline_pid.to_string(), + "--tag", + &snapshot_name, + ]) + .output() + .await + .context("running snapshot create")?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!("Snapshot creation failed: {}", stderr); + } + println!(" ✓ Snapshot created"); + + // Kill baseline - we only need the snapshot for clones + common::kill_process(baseline_pid).await; + println!(" Killed baseline VM (only need snapshot)"); + + // Step 3: Start memory server + println!("\nStep 3: Starting memory server..."); + let (_serve_child, serve_pid) = + common::spawn_fcvm_with_logs(&["snapshot", "serve", &snapshot_name], "uffd-server") + .await + .context("spawning memory server")?; + + // Wait for serve to be ready (poll for socket) + common::poll_serve_ready(&snapshot_name, serve_pid, 30).await?; + 
println!(" ✓ Memory server ready (PID: {})", serve_pid); + + // Step 4: Spawn clone WITH port forwarding + println!("\nStep 4: Spawning clone with --publish 19080:80..."); + let serve_pid_str = serve_pid.to_string(); + let (_clone_child, clone_pid) = common::spawn_fcvm_with_logs( + &[ + "snapshot", + "run", + "--pid", + &serve_pid_str, + "--name", + &clone_name, + "--network", + "bridged", + "--publish", + "19080:80", + ], + &clone_name, + ) + .await + .context("spawning clone with port forward")?; + + // Wait for clone to become healthy + println!(" Waiting for clone to become healthy..."); + common::poll_health_by_pid(clone_pid, 60).await?; + println!(" ✓ Clone is healthy (PID: {})", clone_pid); + + // Step 5: Test port forwarding + println!("\nStep 5: Testing port forwarding..."); + + // Get clone's guest IP from state + let output = tokio::process::Command::new(&fcvm_path) + .args(["ls", "--json", "--pid", &clone_pid.to_string()]) + .output() + .await + .context("getting clone state")?; + + let stdout = String::from_utf8_lossy(&output.stdout); + let guest_ip: String = serde_json::from_str::>(&stdout) + .ok() + .and_then(|v| v.first().cloned()) + .and_then(|v| { + v.get("config")? + .get("network")? + .get("guest_ip")? 
+ .as_str() + .map(|s| s.to_string()) + }) + .unwrap_or_default(); + + println!(" Clone guest IP: {}", guest_ip); + + // Test 1: Direct access to guest IP + println!(" Testing direct access to guest..."); + let direct_result = tokio::process::Command::new("curl") + .args(["-s", "--max-time", "10", &format!("http://{}:80", guest_ip)]) + .output() + .await; + + let direct_works = direct_result + .map(|o| o.status.success() && !o.stdout.is_empty()) + .unwrap_or(false); + println!( + " Direct access: {}", + if direct_works { "✓ OK" } else { "✗ FAIL" } + ); + + // Test 2: Access via host's primary IP and forwarded port + let host_ip = tokio::process::Command::new("hostname") + .arg("-I") + .output() + .await + .ok() + .and_then(|o| String::from_utf8(o.stdout).ok()) + .and_then(|s| s.split_whitespace().next().map(|ip| ip.to_string())) + .unwrap_or_else(|| "127.0.0.1".to_string()); + + println!(" Testing access via host IP {}:19080...", host_ip); + let forward_result = tokio::process::Command::new("curl") + .args([ + "-s", + "--max-time", + "10", + &format!("http://{}:19080", host_ip), + ]) + .output() + .await; + + let forward_works = forward_result + .map(|o| o.status.success() && !o.stdout.is_empty()) + .unwrap_or(false); + println!( + " Port forward (host IP): {}", + if forward_works { "✓ OK" } else { "✗ FAIL" } + ); + + // Test 3: Access via localhost + println!(" Testing access via localhost:19080..."); + let localhost_result = tokio::process::Command::new("curl") + .args(["-s", "--max-time", "10", "http://127.0.0.1:19080"]) + .output() + .await; + + let localhost_works = localhost_result + .map(|o| o.status.success() && !o.stdout.is_empty()) + .unwrap_or(false); + println!( + " Localhost access: {}", + if localhost_works { + "✓ OK" + } else { + "✗ FAIL" + } + ); + + // Cleanup + println!("\nCleaning up..."); + common::kill_process(clone_pid).await; + println!(" Killed clone"); + common::kill_process(serve_pid).await; + println!(" Killed memory server"); + + // 
Results + println!("\n╔═══════════════════════════════════════════════════════════════╗"); + println!("║ RESULTS ║"); + println!("╠═══════════════════════════════════════════════════════════════╣"); + println!( + "║ Direct access to guest: {} ║", + if direct_works { + "✓ PASSED" + } else { + "✗ FAILED" + } + ); + println!( + "║ Port forward (host IP): {} ║", + if forward_works { + "✓ PASSED" + } else { + "✗ FAILED" + } + ); + println!( + "║ Localhost port forward: {} ║", + if localhost_works { + "✓ PASSED" + } else { + "✗ FAILED" + } + ); + println!("╚═══════════════════════════════════════════════════════════════╝"); + + // All port forwarding methods must work + if direct_works && forward_works && localhost_works { + println!("\n✅ CLONE PORT FORWARDING TEST PASSED!"); + Ok(()) + } else { + anyhow::bail!( + "Clone port forwarding test failed: direct={}, forward={}, localhost={}", + direct_works, + forward_works, + localhost_works + ) + } +} + +/// Test port forwarding on clones with rootless networking +/// +/// This is the key test - rootless clones with port forwarding. +/// Port forwarding is done via slirp4netns API, accessing via unique loopback IP. 
+#[tokio::test] +async fn test_clone_port_forward_rootless() -> Result<()> { + common::require_non_root("test_clone_port_forward_rootless")?; + + let (baseline_name, clone_name, snapshot_name, _) = common::unique_names("pf-rootless"); + + println!("\n╔═══════════════════════════════════════════════════════════════╗"); + println!("║ Clone Port Forwarding Test (rootless) ║"); + println!("╚═══════════════════════════════════════════════════════════════╝\n"); + + let fcvm_path = common::find_fcvm_binary()?; + + // Step 1: Start baseline VM with nginx (rootless) + println!("Step 1: Starting baseline VM with nginx (rootless)..."); + let (_baseline_child, baseline_pid) = common::spawn_fcvm_with_logs( + &[ + "podman", + "run", + "--name", + &baseline_name, + "--network", + "rootless", + common::TEST_IMAGE, + ], + &baseline_name, + ) + .await + .context("spawning baseline VM")?; + + println!(" Waiting for baseline VM to become healthy..."); + common::poll_health_by_pid(baseline_pid, 90).await?; + println!(" ✓ Baseline VM healthy (PID: {})", baseline_pid); + + // Step 2: Create snapshot + println!("\nStep 2: Creating snapshot..."); + let output = tokio::process::Command::new(&fcvm_path) + .args([ + "snapshot", + "create", + "--pid", + &baseline_pid.to_string(), + "--tag", + &snapshot_name, + ]) + .output() + .await + .context("running snapshot create")?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!("Snapshot creation failed: {}", stderr); + } + println!(" ✓ Snapshot created"); + + // Kill baseline - we only need the snapshot for clones + common::kill_process(baseline_pid).await; + println!(" Killed baseline VM (only need snapshot)"); + + // Step 3: Start memory server + println!("\nStep 3: Starting memory server..."); + let (_serve_child, serve_pid) = + common::spawn_fcvm_with_logs(&["snapshot", "serve", &snapshot_name], "uffd-server") + .await + .context("spawning memory server")?; + + // Wait for serve to be 
ready (poll for socket) + common::poll_serve_ready(&snapshot_name, serve_pid, 30).await?; + println!(" ✓ Memory server ready (PID: {})", serve_pid); + + // Step 4: Spawn clone WITH port forwarding (rootless) + // Use port 8080 (unprivileged) since rootless can't bind to 80 + println!("\nStep 4: Spawning clone with --publish 8080:80 (rootless)..."); + let serve_pid_str = serve_pid.to_string(); + let (_clone_child, clone_pid) = common::spawn_fcvm_with_logs( + &[ + "snapshot", + "run", + "--pid", + &serve_pid_str, + "--name", + &clone_name, + "--network", + "rootless", + "--publish", + "8080:80", + ], + &clone_name, + ) + .await + .context("spawning clone with port forward")?; + + // Wait for clone to become healthy + println!(" Waiting for clone to become healthy..."); + common::poll_health_by_pid(clone_pid, 60).await?; + println!(" ✓ Clone is healthy (PID: {})", clone_pid); + + // Step 5: Test port forwarding via loopback IP + println!("\nStep 5: Testing port forwarding..."); + + // Get clone's loopback IP from state (rootless uses 127.x.y.z) + let output = tokio::process::Command::new(&fcvm_path) + .args(["ls", "--json", "--pid", &clone_pid.to_string()]) + .output() + .await + .context("getting clone state")?; + + let stdout = String::from_utf8_lossy(&output.stdout); + let loopback_ip: String = serde_json::from_str::>(&stdout) + .ok() + .and_then(|v| v.first().cloned()) + .and_then(|v| { + v.get("config")? + .get("network")? + .get("loopback_ip")? 
+ .as_str() + .map(|s| s.to_string()) + }) + .unwrap_or_default(); + + println!(" Clone loopback IP: {}", loopback_ip); + + // Test: Access via loopback IP and forwarded port + println!(" Testing access via loopback {}:8080...", loopback_ip); + let loopback_result = tokio::process::Command::new("curl") + .args([ + "-s", + "--max-time", + "10", + &format!("http://{}:8080", loopback_ip), + ]) + .output() + .await; + + let loopback_works = loopback_result + .as_ref() + .map(|o| o.status.success() && !o.stdout.is_empty()) + .unwrap_or(false); + + if let Ok(ref out) = loopback_result { + if loopback_works { + println!(" Loopback access: ✓ OK"); + let response = String::from_utf8_lossy(&out.stdout); + println!( + " Response: {} bytes (nginx welcome page)", + response.len() + ); + } else { + println!(" Loopback access: ✗ FAIL"); + println!(" stderr: {}", String::from_utf8_lossy(&out.stderr)); + } + } else { + println!(" Loopback access: ✗ FAIL (request error)"); + } + + // Cleanup + println!("\nCleaning up..."); + common::kill_process(clone_pid).await; + println!(" Killed clone"); + common::kill_process(serve_pid).await; + println!(" Killed memory server"); + + // Results + println!("\n╔═══════════════════════════════════════════════════════════════╗"); + println!("║ RESULTS ║"); + println!("╠═══════════════════════════════════════════════════════════════╣"); + println!( + "║ Loopback port forward: {} ║", + if loopback_works { + "✓ PASSED" + } else { + "✗ FAILED" + } + ); + println!("╚═══════════════════════════════════════════════════════════════╝"); + + if loopback_works { + println!("\n✅ ROOTLESS CLONE PORT FORWARDING TEST PASSED!"); + Ok(()) + } else { + anyhow::bail!("Rootless clone port forwarding test failed") + } +} + /// Test snapshot run --exec with bridged networking #[tokio::test] async fn test_snapshot_run_exec_bridged() -> Result<()> { @@ -771,13 +1193,13 @@ async fn test_snapshot_run_exec_bridged() -> Result<()> { /// Test snapshot run --exec with rootless 
networking #[tokio::test] async fn test_snapshot_run_exec_rootless() -> Result<()> { + common::require_non_root("test_snapshot_run_exec_rootless")?; snapshot_run_exec_test_impl("rootless").await } /// Implementation of snapshot run --exec test async fn snapshot_run_exec_test_impl(network: &str) -> Result<()> { - let snapshot_name = format!("test-exec-{}", network); - let baseline_name = format!("baseline-exec-{}", network); + let (baseline_name, _, snapshot_name, _) = common::unique_names(&format!("exec-{}", network)); println!("\n╔═══════════════════════════════════════════════════════════════╗"); println!(