diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f7d9d501..84ef3a94 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,42 +10,9 @@ env: CARGO_TERM_COLOR: always jobs: - # Fast jobs run in parallel on every PR and push - - lint: - name: Lint - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - path: fcvm - - uses: actions/checkout@v4 - with: - repository: ejc3/fuse-backend-rs - ref: master - path: fuse-backend-rs - - uses: actions/checkout@v4 - with: - repository: ejc3/fuser - ref: master - path: fuser - - uses: dtolnay/rust-toolchain@stable - with: - components: clippy, rustfmt - - name: Install cargo-machete - run: cargo install cargo-machete - - name: Check formatting - working-directory: fcvm - run: cargo fmt --all -- --check - - name: Clippy - working-directory: fcvm - run: cargo clippy --all-targets --all-features -- -D warnings - - name: Check unused dependencies - working-directory: fcvm - run: cargo machete - + # Build inside container, upload artifacts for parallel test jobs build: - name: Build + name: Build [container/ubuntu-latest] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -61,16 +28,29 @@ jobs: repository: ejc3/fuser ref: master path: fuser - - uses: dtolnay/rust-toolchain@stable - - uses: Swatinem/rust-cache@v2 - with: - workspaces: fcvm - - name: Build + - name: Build inside container working-directory: fcvm - run: cargo build --release --all-targets + run: | + export FUSE_BACKEND_RS=${{ github.workspace }}/fuse-backend-rs + export FUSER=${{ github.workspace }}/fuser + export CONTAINER_ARCH=x86_64 + export CI=1 + make container-build-only + - name: Upload build artifacts + uses: actions/upload-artifact@v4 + with: + name: container-build + path: | + fcvm/target/release + !fcvm/target/release/.fingerprint + !fcvm/target/release/build + !fcvm/target/release/deps + !fcvm/target/release/incremental + retention-days: 1 - test-unit: - name: Unit Tests + # Lint runs in 
parallel with build (just needs source) + lint: + name: Lint (fmt+clippy+machete) [host/ubuntu-latest] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -87,68 +67,26 @@ jobs: ref: master path: fuser - uses: dtolnay/rust-toolchain@stable - - uses: Swatinem/rust-cache@v2 - with: - workspaces: fcvm - - name: Run unit tests - working-directory: fcvm - run: cargo test --release --lib --all - - test-fuse-integration: - name: FUSE Integration - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 with: - path: fcvm - - uses: actions/checkout@v4 - with: - repository: ejc3/fuse-backend-rs - ref: master - path: fuse-backend-rs - - uses: actions/checkout@v4 - with: - repository: ejc3/fuser - ref: master - path: fuser - - uses: dtolnay/rust-toolchain@stable + components: clippy, rustfmt - uses: Swatinem/rust-cache@v2 with: workspaces: fcvm - - name: Build + - name: Check formatting working-directory: fcvm - run: cargo build --release -p fuse-pipe - - name: Run integration_root tests + run: cargo fmt --all -- --check + - name: Clippy working-directory: fcvm - run: sudo -E env "PATH=$PATH" cargo test --release -p fuse-pipe --test integration_root -- --test-threads=1 - - test-fuse-noroot: - name: FUSE No-Root - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - path: fcvm - - uses: actions/checkout@v4 - with: - repository: ejc3/fuse-backend-rs - ref: master - path: fuse-backend-rs - - uses: actions/checkout@v4 - with: - repository: ejc3/fuser - ref: master - path: fuser - - name: Run no-root FUSE tests (container) + run: cargo clippy --all-targets --all-features -- -D warnings + - name: Install cargo-machete + run: cargo install cargo-machete + - name: Check unused dependencies working-directory: fcvm - run: | - export FUSE_BACKEND_RS=${{ github.workspace }}/fuse-backend-rs - export FUSER=${{ github.workspace }}/fuser - export CONTAINER_ARCH=x86_64 - make container-test-noroot + run: cargo machete - test-cli: - name: CLI Tests + # Native 
tests use rust-cache (compiles incrementally) + test-native: + name: Unit+CLI+FUSE-root [host/ubuntu-latest] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -168,12 +106,20 @@ jobs: - uses: Swatinem/rust-cache@v2 with: workspaces: fcvm - - name: Run CLI tests + - name: Unit tests + working-directory: fcvm + run: cargo test --release --lib --all + - name: CLI tests working-directory: fcvm run: cargo test --release --test test_cli_parsing --test test_state_manager + - name: FUSE integration tests (root) + working-directory: fcvm + run: sudo -E env "PATH=$PATH" cargo test --release -p fuse-pipe --test integration_root -- --test-threads=1 - test-fuse-permissions: - name: FUSE Permissions + # Container FUSE tests - download pre-built artifacts + fuse-tests: + name: FUSE (noroot+root) [container/ubuntu-latest] + needs: build runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -189,16 +135,25 @@ jobs: repository: ejc3/fuser ref: master path: fuser - - name: Run permission tests (container) + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + name: container-build + path: fcvm/target/release + - name: Run FUSE tests (container, no rebuild) working-directory: fcvm run: | export FUSE_BACKEND_RS=${{ github.workspace }}/fuse-backend-rs export FUSER=${{ github.workspace }}/fuser export CONTAINER_ARCH=x86_64 - make container-test-root + export CI=1 + mkdir -p cargo-home + make container-test - test-pjdfstest: - name: POSIX Compliance + # POSIX compliance - download pre-built artifacts + posix-compliance: + name: POSIX (pjdfstest 8789) [container/ubuntu-latest] + needs: build runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -214,66 +169,25 @@ jobs: repository: ejc3/fuser ref: master path: fuser - - name: Run pjdfstest (container) - working-directory: fcvm - run: | - export FUSE_BACKEND_RS=${{ github.workspace }}/fuse-backend-rs - export FUSER=${{ github.workspace }}/fuser - export CONTAINER_ARCH=x86_64 - make 
container-test-pjdfstest - - test-vm-sanity: - name: VM Sanity - runs-on: buildjet-32vcpu-ubuntu-2204 - steps: - - uses: actions/checkout@v4 - with: - path: fcvm - - uses: actions/checkout@v4 + - name: Download build artifacts + uses: actions/download-artifact@v4 with: - repository: ejc3/fuse-backend-rs - ref: master - path: fuse-backend-rs - - uses: actions/checkout@v4 - with: - repository: ejc3/fuser - ref: master - path: fuser - - name: Check KVM availability - run: | - echo "=== KVM device ===" - ls -la /dev/kvm || echo "No /dev/kvm" - echo "=== CPU virtualization ===" - grep -E "(vmx|svm)" /proc/cpuinfo | head -1 || echo "No VMX/SVM" - echo "=== KVM modules ===" - lsmod | grep kvm || echo "No KVM modules" - - name: Setup KVM permissions - run: sudo chmod 666 /dev/kvm - - name: Setup NBD module for rootfs extraction - run: | - sudo modprobe nbd max_part=8 - ls -la /dev/nbd* | head -5 - - name: Setup network namespace directory - run: sudo mkdir -p /var/run/netns - - name: Setup iptables for VM networking - run: | - # BuildJet runners have FORWARD chain set to DROP by default - # Set to ACCEPT and add MASQUERADE rule for VM NAT - sudo iptables -P FORWARD ACCEPT - sudo iptables -t nat -A POSTROUTING -s 172.30.0.0/16 -o eth0 -j MASQUERADE || true - - name: Run VM sanity test (bridged) + name: container-build + path: fcvm/target/release + - name: Run pjdfstest (container, no rebuild) working-directory: fcvm run: | export FUSE_BACKEND_RS=${{ github.workspace }}/fuse-backend-rs export FUSER=${{ github.workspace }}/fuser export CONTAINER_ARCH=x86_64 - make container-test-vm-bridged + export CI=1 + mkdir -p cargo-home + make container-test-pjdfstest - test-vm-exec: - name: VM Exec + # VM tests on BuildJet - builds inside container (separate from ubuntu-latest) + vm-tests: + name: VM (bridged+rootless) [container/buildjet-32cpu] runs-on: buildjet-32vcpu-ubuntu-2204 - needs: test-vm-sanity # Sequential: flock doesn't work across podman containers sharing /dev/nbd0 - if: 
always() # Run even if previous job failed (rootfs will be cached after first success) steps: - uses: actions/checkout@v4 with: @@ -298,47 +212,17 @@ jobs: run: | sudo iptables -P FORWARD ACCEPT sudo iptables -t nat -A POSTROUTING -s 172.30.0.0/16 -o eth0 -j MASQUERADE || true - - name: Run VM exec tests - working-directory: fcvm + - name: Setup userfaultfd for snapshot cloning run: | - export FUSE_BACKEND_RS=${{ github.workspace }}/fuse-backend-rs - export FUSER=${{ github.workspace }}/fuser - export CONTAINER_ARCH=x86_64 - make container-test-vm-exec - - test-vm-egress: - name: VM Egress - runs-on: buildjet-32vcpu-ubuntu-2204 - needs: test-vm-exec # Sequential: flock doesn't work across podman containers sharing /dev/nbd0 - if: always() # Run even if previous job failed (rootfs will be cached after first success) - steps: - - uses: actions/checkout@v4 - with: - path: fcvm - - uses: actions/checkout@v4 - with: - repository: ejc3/fuse-backend-rs - ref: master - path: fuse-backend-rs - - uses: actions/checkout@v4 - with: - repository: ejc3/fuser - ref: master - path: fuser - - name: Setup KVM permissions - run: sudo chmod 666 /dev/kvm - - name: Setup NBD module - run: sudo modprobe nbd max_part=8 - - name: Setup network namespace directory - run: sudo mkdir -p /var/run/netns - - name: Setup iptables for VM networking - run: | - sudo iptables -P FORWARD ACCEPT - sudo iptables -t nat -A POSTROUTING -s 172.30.0.0/16 -o eth0 -j MASQUERADE || true - - name: Run VM egress tests + if [ ! 
-e /dev/userfaultfd ]; then + sudo mknod /dev/userfaultfd c 10 126 + fi + sudo chmod 666 /dev/userfaultfd + sudo sysctl -w vm.unprivileged_userfaultfd=1 + - name: Run all VM tests working-directory: fcvm run: | export FUSE_BACKEND_RS=${{ github.workspace }}/fuse-backend-rs export FUSER=${{ github.workspace }}/fuser export CONTAINER_ARCH=x86_64 - make container-test-vm-egress + make container-test-vm diff --git a/Cargo.lock b/Cargo.lock index 1fc5ce6f..d50c9806 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -175,6 +175,15 @@ version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "bumpalo" version = "3.19.0" @@ -347,6 +356,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + [[package]] name = "criterion" version = "0.5.1" @@ -423,6 +441,16 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -436,6 +464,16 @@ dependencies = [ "parking_lot_core", ] 
+[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "dirs" version = "6.0.0" @@ -537,6 +575,7 @@ dependencies = [ "clap", "criterion", "fuse-pipe", + "hex", "hyper 0.14.32", "hyperlocal", "libc", @@ -548,11 +587,13 @@ dependencies = [ "serde", "serde_json", "serial_test", + "sha2", "shell-words", "shellexpand", "tempfile", "tokio", "tokio-util", + "toml", "tracing", "tracing-subscriber", "url", @@ -737,6 +778,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.16" @@ -2051,6 +2102,15 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -2088,6 +2148,17 @@ dependencies = [ "syn", ] +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -2382,6 +2453,47 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + 
"toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + [[package]] name = "tower" version = "0.5.2" @@ -2507,6 +2619,12 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "typenum" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + [[package]] name = "unicode-ident" version = "1.0.22" @@ -2586,6 +2704,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "vm-memory" version = "0.14.1" @@ -3061,6 +3185,15 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winnow" +version = "0.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +dependencies = [ + "memchr", +] + [[package]] name = "winreg" version = "0.50.0" diff --git a/Cargo.toml b/Cargo.toml index 719410d6..be5d4880 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,9 @@ atty = "0.2" clap = { version = "4", features = ["derive", "env"] } serde = { version = "1", features = ["derive"] } serde_json = "1" +sha2 = "0.10" +hex = "0.4" +toml = "0.8" tokio = { version = "1", features = ["rt-multi-thread", "macros", "process", "fs", "signal", "io-util", "sync", "time"] } reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } which = "6" @@ -40,6 +43,11 @@ url = "2" tokio-util = "0.7" regex = "1.12.2" +[features] +# Test category - only gate tests that require sudo +# Unprivileged tests run by default (no feature flag needed) +privileged-tests = [] # Tests requiring sudo (iptables, root podman storage) + [dev-dependencies] serial_test = "3" criterion = "0.5" diff --git a/Containerfile b/Containerfile index 55513d45..424cfae2 100644 --- a/Containerfile +++ b/Containerfile @@ -50,6 +50,7 @@ RUN curl -L -o /tmp/firecracker.tgz \ https://github.com/firecracker-microvm/firecracker/releases/download/v1.14.0/firecracker-v1.14.0-${ARCH}.tgz \ && tar -xzf /tmp/firecracker.tgz -C /tmp \ && mv /tmp/release-v1.14.0-${ARCH}/firecracker-v1.14.0-${ARCH} /usr/local/bin/firecracker \ + && chown root:root /usr/local/bin/firecracker \ && chmod +x /usr/local/bin/firecracker \ && rm -rf /tmp/firecracker.tgz /tmp/release-v1.14.0-${ARCH} diff --git a/DESIGN.md b/DESIGN.md index f4869d4c..da566686 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -378,37 +378,89 @@ Each VM has: ## Networking -### Rootless Mode (slirp4netns) +### Rootless Mode (slirp4netns with Dual-TAP Architecture) + +**Key Insight**: slirp4netns and Firecracker CANNOT share a TAP device (both need exclusive access). 
+**Solution**: Use two TAP devices with IP forwarding between them inside a user namespace. **Topology**: ``` -┌─────────────┐ -│ Host Process│ -└──────┬──────┘ - │ - ├─── Firecracker VM (VM namespace) - │ └─── eth0: 10.0.2.15 - │ - └─── slirp4netns (User namespace) - └─── Provides NAT + port forwarding +Host │ User Namespace (unshare --user --map-root-user --net) + │ +slirp4netns <────────────┼── slirp0 (10.0.2.100/24) + (userspace NAT) │ │ + │ │ IP forwarding + iptables NAT + │ ▼ + │ tap0 (192.168.1.1/24) + │ │ + │ ▼ + │ Firecracker VM + │ eth0: 192.168.1.2 +``` + +**Setup Sequence** (3-phase with nsenter): +1. Spawn holder process: `unshare --user --map-root-user --net -- sleep infinity` +2. Run setup via nsenter: create TAPs, iptables, enable IP forwarding +3. Start slirp4netns attached to holder's namespace +4. Run Firecracker via nsenter: `nsenter -t HOLDER_PID -U -n -- firecracker ...` +5. Health checks via nsenter: `nsenter -t HOLDER_PID -U -n -- curl guest_ip:80` + +**Network Setup Script** (executed via nsenter): +```bash +# Create slirp0 TAP for slirp4netns connectivity +ip tuntap add slirp0 mode tap +ip addr add 10.0.2.100/24 dev slirp0 +ip link set slirp0 up +ip route add default via 10.0.2.2 dev slirp0 + +# Create tap0 for Firecracker (guest uses 192.168.1.2) +ip tuntap add tap0 mode tap +ip addr add 192.168.1.1/24 dev tap0 +ip link set tap0 up + +# Enable IP forwarding +echo 1 > /proc/sys/net/ipv4/ip_forward + +# Allow forwarding between slirp0 and FC TAP +iptables -A FORWARD -i slirp0 -o tap0 -j ACCEPT +iptables -A FORWARD -i tap0 -o slirp0 -j ACCEPT + +# NAT guest traffic (192.168.x.x) to slirp0's address (10.0.2.100) +iptables -t nat -A POSTROUTING -s 192.168.1.0/24 -o slirp0 -j MASQUERADE ``` -**Port Forwarding**: +**Port Forwarding** (unique loopback IPs): ```bash +# Each VM gets a unique loopback IP (127.x.y.z) for port forwarding +# No IP aliasing needed - Linux routes all 127.0.0.0/8 to loopback slirp4netns \ --configure \ --mtu=65520 \ - 
--port tcp:8080:80 \ - --port udp:53:53 \ - \ - tap0 + --api-socket /tmp/slirp-{vm_id}.sock \ + \ + slirp0 + +# Port forwarding via JSON-RPC API: +echo '{"execute":"add_hostfwd","arguments":{"proto":"tcp","host_addr":"127.0.0.2","host_port":8080,"guest_addr":"10.0.2.100","guest_port":8080}}' | nc -U /tmp/slirp-{vm_id}.sock +``` + +**Traffic Flow** (VM to Internet): +``` +Guest (192.168.1.2) → tap0 → iptables MASQUERADE → slirp0 (10.0.2.100) → slirp4netns → Host → Internet +``` + +**Traffic Flow** (Host to VM port forward): +``` +Host (127.0.0.2:8080) → slirp4netns → slirp0 (10.0.2.100:8080) → IP forward → tap0 → Guest (192.168.1.2:80) ``` **Characteristics**: -- No root required -- Slightly slower than native networking -- Works in nested VMs -- Fully compatible with rootless Podman +- No root required (runs entirely in user namespace) +- Isolated 192.168.1.0/24 subnet per VM (no conflicts) +- Unique loopback IP per VM enables same port on multiple VMs +- Slightly slower than bridged (~10-20% overhead) +- Works in nested VMs and restricted environments +- Fully compatible with rootless Podman in guest ### Privileged Mode (nftables + bridge) @@ -1326,6 +1378,28 @@ RUST_LOG=trace fcvm run nginx:latest ## Testing Strategy +### Test Infrastructure + +**Network Mode Guards**: The fcvm binary enforces proper network mode usage: +- **Bridged without root**: Fails with helpful error message suggesting `sudo` or `--network rootless` +- **Rootless with root**: Runs but prints warning that bridged would be faster + +**Test Isolation**: All tests use unique resource names to enable parallel execution: +- `unique_names()` helper generates timestamp+counter-based names +- PID-based naming for additional uniqueness +- Automatic cleanup on test exit + +**Dynamic NBD Device Selection**: When creating rootfs (extracting qcow2 images): +- Scans `/dev/nbd0` through `/dev/nbd15` to find a free device +- Checks `/sys/block/nbdN/pid` to detect in-use devices +- Includes retry logic for 
race conditions during parallel execution + +**Root/Rootless Test Organization**: +- Rootless tests: Use `require_non_root()` guard, fail loudly if run as root +- Bridged tests: Rely on fcvm binary's built-in check +- Makefile targets: Split by network mode (`test-vm-exec-bridged`/`test-vm-exec-rootless`) +- Container tests: Use appropriate container run configurations (CONTAINER_RUN_FCVM vs CONTAINER_RUN_ROOTLESS) + ### Unit Tests Test individual components in isolation: @@ -1541,6 +1615,6 @@ kill $CLONE_PID $SERVE_PID $BASELINE_PID **End of Design Specification** -*Version: 2.0* -*Date: 2025-12-14* +*Version: 2.1* +*Date: 2025-12-21* *Author: fcvm project* diff --git a/Makefile b/Makefile index e7bec4aa..817e1c1a 100644 --- a/Makefile +++ b/Makefile @@ -21,8 +21,12 @@ TEST_FUSE_ROOT := cargo test --release -p fuse-pipe --test integration_root TEST_FUSE_PERMISSION := cargo test --release -p fuse-pipe --test test_permission_edge_cases TEST_PJDFSTEST := cargo test --release -p fuse-pipe --test pjdfstest_full -- --nocapture TEST_VM_BRIDGED := sh -c "cargo build --release && cargo test --release --test test_sanity test_sanity_bridged -- --nocapture" -TEST_VM_EXEC := sh -c "cargo build --release && cargo test --release --test test_exec -- --nocapture --test-threads=1" -TEST_VM_EGRESS := sh -c "cargo build --release && cargo test --release --test test_egress -- --nocapture --test-threads=1" +TEST_VM_EXEC_BRIDGED := sh -c "cargo build --release && cargo test --release --test test_exec test_exec_bridged -- --nocapture" +TEST_VM_EGRESS_BRIDGED := sh -c "cargo build --release && cargo test --release --test test_egress bridged -- --nocapture" + +# No root required (rootless networking): +TEST_VM_EXEC_ROOTLESS := sh -c "cargo build --release && cargo test --release --test test_exec test_exec_rootless -- --nocapture" +TEST_VM_EGRESS_ROOTLESS := sh -c "cargo build --release && cargo test --release --test test_egress rootless -- --nocapture" # Legacy alias TEST_VM := cargo test 
--release --test test_sanity -- --nocapture @@ -37,11 +41,15 @@ BENCH_EXEC := cargo bench --bench exec .PHONY: all help build clean \ test test-noroot test-root test-unit test-fuse test-vm test-vm-rootless test-vm-bridged test-all \ + test-vm-exec test-vm-exec-bridged test-vm-exec-rootless \ + test-vm-egress test-vm-egress-bridged test-vm-egress-rootless \ bench bench-throughput bench-operations bench-protocol bench-exec bench-quick bench-logs bench-clean \ lint clippy fmt fmt-check \ rootfs rebuild \ container-test container-test-unit container-test-noroot container-test-root container-test-fuse \ - container-test-vm container-test-vm-rootless container-test-vm-bridged container-test-vm-exec container-test-vm-egress container-test-fcvm \ + container-test-vm container-test-vm-rootless container-test-vm-bridged container-test-fcvm \ + container-test-vm-exec container-test-vm-exec-bridged container-test-vm-exec-rootless \ + container-test-vm-egress container-test-vm-egress-bridged container-test-vm-egress-rootless \ container-test-pjdfstest container-test-all container-test-allow-other container-build-allow-other \ container-bench container-bench-throughput container-bench-operations container-bench-protocol container-bench-exec \ container-shell container-clean \ @@ -62,9 +70,11 @@ help: @echo " make test-root - Tests requiring root: integration_root (sudo)" @echo " make test-unit - Unit tests only (no root)" @echo " make test-fuse - fuse-pipe: integration + permission + stress" - @echo " make test-vm - VM tests: rootless + bridged" - @echo " make test-vm-rootless - VM test with slirp4netns (no root)" - @echo " make test-vm-bridged - VM test with bridged networking (sudo)" + @echo " make test-vm - VM tests: rootless + bridged sanity" + @echo " make test-vm-rootless - VM sanity test with slirp4netns (no sudo)" + @echo " make test-vm-bridged - VM sanity test with bridged networking (sudo)" + @echo " make test-vm-exec - VM exec tests: rootless + bridged" + @echo " make 
test-vm-egress - VM egress tests: rootless + bridged" @echo " make test-all - Everything: test + test-vm" @echo "" @echo "Benchmarks:" @@ -89,9 +99,11 @@ help: @echo " make container-test-root - Tests as root" @echo " make container-test-unit - Unit tests only (non-root)" @echo " make container-test-fuse - All fuse-pipe tests explicitly" - @echo " make container-test-vm - VM tests (rootless + bridged)" - @echo " make container-test-vm-rootless - VM test with slirp4netns" - @echo " make container-test-vm-bridged - VM test with bridged networking" + @echo " make container-test-vm - VM sanity tests (rootless + bridged)" + @echo " make container-test-vm-rootless - VM sanity with slirp4netns" + @echo " make container-test-vm-bridged - VM sanity with bridged networking" + @echo " make container-test-vm-exec - VM exec tests (rootless + bridged)" + @echo " make container-test-vm-egress - VM egress tests (rootless + bridged)" @echo " make container-test-pjdfstest - POSIX compliance (8789 tests)" @echo " make container-test-all - Everything: test + vm + pjdfstest" @echo " make container-test-allow-other - Test AllowOther with fuse.conf" @@ -219,6 +231,24 @@ test-vm-rootless: build setup-kernel test-vm-bridged: build setup-kernel sudo $(TEST_VM_BRIDGED) +# VM exec tests +test-vm-exec-bridged: build setup-kernel + sudo $(TEST_VM_EXEC_BRIDGED) + +test-vm-exec-rootless: build setup-kernel + $(TEST_VM_EXEC_ROOTLESS) + +test-vm-exec: test-vm-exec-rootless test-vm-exec-bridged + +# VM egress tests +test-vm-egress-bridged: build setup-kernel + sudo $(TEST_VM_EGRESS_BRIDGED) + +test-vm-egress-rootless: build setup-kernel + $(TEST_VM_EGRESS_ROOTLESS) + +test-vm-egress: test-vm-egress-rootless test-vm-egress-bridged + # All VM tests: rootless first, then bridged test-vm: test-vm-rootless test-vm-bridged @@ -309,14 +339,25 @@ rebuild: rootfs # Marker file for container build state CONTAINER_MARKER := .container-built +# CI mode: use host directories instead of named volumes (for 
artifact sharing) +# Set CI=1 to enable artifact-compatible mode +CI ?= 0 +ifeq ($(CI),1) +VOLUME_TARGET := -v ./target:/workspace/fcvm/target +VOLUME_CARGO := -v ./cargo-home:/home/testuser/.cargo +else +VOLUME_TARGET := -v fcvm-cargo-target:/workspace/fcvm/target +VOLUME_CARGO := -v fcvm-cargo-home:/home/testuser/.cargo +endif + # Container run with source mounts (code always fresh, can't run stale) # Cargo cache goes to testuser's home so non-root builds work CONTAINER_RUN_BASE := sudo podman run --rm --privileged \ -v .:/workspace/fcvm \ -v $(FUSE_BACKEND_RS):/workspace/fuse-backend-rs \ -v $(FUSER):/workspace/fuser \ - -v fcvm-cargo-target:/workspace/fcvm/target \ - -v fcvm-cargo-home:/home/testuser/.cargo \ + $(VOLUME_TARGET) \ + $(VOLUME_CARGO) \ -e CARGO_HOME=/home/testuser/.cargo # Container run options for fuse-pipe tests @@ -340,22 +381,32 @@ CONTAINER_RUN_FCVM := $(CONTAINER_RUN_BASE) \ -v /var/run/netns:/var/run/netns:rshared \ --network host -# Truly rootless container run - matches unprivileged host user exactly -# Runs podman WITHOUT sudo (rootless podman) - this is the true unprivileged test -# Uses separate storage (--root) to avoid conflicts with root-owned storage -# --network host so slirp4netns can bind to loopback addresses (127.x.y.z) -# --security-opt seccomp=unconfined allows unshare syscall (no extra capabilities granted) -# No --privileged, no CAP_SYS_ADMIN - matches real unprivileged user +# Container run for rootless networking tests +# Uses rootless podman (no sudo!) with --privileged for user namespace capabilities. +# --privileged with rootless podman grants capabilities within the user namespace, +# not actual host root. We're root inside the container but unprivileged on host. +# --group-add keep-groups preserves host user's groups (kvm) for /dev/kvm access. +# --device /dev/userfaultfd needed for snapshot/clone UFFD memory sharing. +# The container's user namespace is the isolation boundary. 
+ifeq ($(CI),1) +VOLUME_TARGET_ROOTLESS := -v ./target:/workspace/fcvm/target +VOLUME_CARGO_ROOTLESS := -v ./cargo-home:/home/testuser/.cargo +else +VOLUME_TARGET_ROOTLESS := -v fcvm-cargo-target-rootless:/workspace/fcvm/target +VOLUME_CARGO_ROOTLESS := -v fcvm-cargo-home-rootless:/home/testuser/.cargo +endif CONTAINER_RUN_ROOTLESS := podman --root=/tmp/podman-rootless run --rm \ - --security-opt seccomp=unconfined \ + --privileged \ + --group-add keep-groups \ -v .:/workspace/fcvm \ -v $(FUSE_BACKEND_RS):/workspace/fuse-backend-rs \ -v $(FUSER):/workspace/fuser \ - -v fcvm-cargo-target-rootless:/workspace/fcvm/target \ - -v fcvm-cargo-home-rootless:/home/testuser/.cargo \ + $(VOLUME_TARGET_ROOTLESS) \ + $(VOLUME_CARGO_ROOTLESS) \ -e CARGO_HOME=/home/testuser/.cargo \ --device /dev/kvm \ --device /dev/net/tun \ + --device /dev/userfaultfd \ -v /mnt/fcvm-btrfs:/mnt/fcvm-btrfs \ --network host @@ -368,6 +419,13 @@ $(CONTAINER_MARKER): Containerfile container-build: $(CONTAINER_MARKER) +# Build inside container only (no tests) - useful for CI artifact caching +# Creates target/ with compiled binaries that can be uploaded/downloaded +container-build-only: container-build + @echo "==> Building inside container (CI mode)..." 
+ @mkdir -p target cargo-home + $(CONTAINER_RUN_FUSE) $(CONTAINER_IMAGE) cargo build --release --all-targets -p fuse-pipe + # Export container image for rootless podman (needed for container-test-vm-rootless) # Rootless podman has separate image storage, so we export from root and import CONTAINER_ROOTLESS_MARKER := .container-rootless-imported @@ -420,9 +478,9 @@ container-test-allow-other: container-build-allow-other # All fuse-pipe tests: noroot first, then root container-test: container-test-noroot container-test-root -# VM tests - rootless (truly unprivileged - no --privileged, runs as testuser) -# Uses CONTAINER_RUN_ROOTLESS which drops privileges to match a normal host user -# Depends on container-build-rootless to export image to rootless podman storage +# VM tests - rootless (tests fcvm's rootless networking mode inside container) +# Uses CONTAINER_RUN_ROOTLESS with rootless podman --privileged +# Tests that fcvm can set up slirp4netns + user namespace networking container-test-vm-rootless: container-build-rootless setup-kernel $(CONTAINER_RUN_ROOTLESS) $(CONTAINER_IMAGE) $(TEST_VM_ROOTLESS) @@ -430,16 +488,30 @@ container-test-vm-rootless: container-build-rootless setup-kernel container-test-vm-bridged: container-build setup-kernel $(CONTAINER_RUN_FCVM) $(CONTAINER_IMAGE) $(TEST_VM_BRIDGED) -# VM exec tests - tests fcvm exec functionality -container-test-vm-exec: container-build setup-kernel - $(CONTAINER_RUN_FCVM) $(CONTAINER_IMAGE) $(TEST_VM_EXEC) +# VM exec tests - bridged (needs root) +container-test-vm-exec-bridged: container-build setup-kernel + $(CONTAINER_RUN_FCVM) $(CONTAINER_IMAGE) $(TEST_VM_EXEC_BRIDGED) -# VM egress tests - tests network egress from VMs -container-test-vm-egress: container-build setup-kernel - $(CONTAINER_RUN_FCVM) $(CONTAINER_IMAGE) $(TEST_VM_EGRESS) +# VM exec tests - rootless (tests fcvm's rootless networking mode) +container-test-vm-exec-rootless: container-build-rootless setup-kernel + $(CONTAINER_RUN_ROOTLESS) 
$(CONTAINER_IMAGE) $(TEST_VM_EXEC_ROOTLESS) -# All VM tests: rootless first, then bridged -container-test-vm: container-test-vm-rootless container-test-vm-bridged +# VM exec tests - all (bridged first to create rootfs, then rootless) +container-test-vm-exec: container-test-vm-exec-bridged container-test-vm-exec-rootless + +# VM egress tests - bridged (needs root) +container-test-vm-egress-bridged: container-build setup-kernel + $(CONTAINER_RUN_FCVM) $(CONTAINER_IMAGE) $(TEST_VM_EGRESS_BRIDGED) + +# VM egress tests - rootless (tests fcvm's rootless networking mode) +container-test-vm-egress-rootless: container-build-rootless setup-kernel + $(CONTAINER_RUN_ROOTLESS) $(CONTAINER_IMAGE) $(TEST_VM_EGRESS_ROOTLESS) + +# VM egress tests - all (bridged first to create rootfs, then rootless) +container-test-vm-egress: container-test-vm-egress-bridged container-test-vm-egress-rootless + +# All VM tests: bridged first (creates rootfs), then rootless +container-test-vm: container-test-vm-bridged container-test-vm-rootless # Legacy alias (runs both VM tests) container-test-fcvm: container-test-vm diff --git a/README.md b/README.md index f4788f47..15595bff 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ A Rust implementation that launches Firecracker microVMs to run Podman container > - Instant VM cloning via UFFD memory server + btrfs reflinks (~3ms) > - Multiple VMs share memory via kernel page cache (50 VMs = ~512MB, not 25GB!) > - Dual networking: bridged (iptables) or rootless (slirp4netns) +> - Port forwarding for both regular VMs and clones > - FUSE-based host directory mapping via fuse-pipe > - Container exit code forwarding @@ -138,7 +139,13 @@ sudo fcvm snapshot ls sudo fcvm snapshot run --pid --name clone1 sudo fcvm snapshot run --pid --name clone2 -# 7. Clone and execute command (auto-cleans up after) +# 7. 
Clone with port forwarding (each clone can have unique ports) +sudo fcvm snapshot run --pid --name web1 --publish 8081:80 +sudo fcvm snapshot run --pid --name web2 --publish 8082:80 +curl localhost:8081 # Reaches clone web1 +curl localhost:8082 # Reaches clone web2 + +# 8. Clone and execute command (auto-cleans up after) sudo fcvm snapshot run --pid --exec "curl localhost" # Clone starts → execs command in container → returns result → cleans up ``` @@ -537,7 +544,8 @@ Run `make help` for the full list. Key targets: | `test_fuse_posix.rs` | POSIX FUSE compliance tests | | `test_fuse_in_vm.rs` | FUSE-in-VM integration | | `test_localhost_image.rs` | Local image tests | -| `test_snapshot_clone.rs` | Snapshot/clone workflow | +| `test_snapshot_clone.rs` | Snapshot/clone workflow, clone port forwarding | +| `test_port_forward.rs` | Port forwarding for regular VMs | #### fuse-pipe Tests (`fuse-pipe/tests/`) | File | Description | diff --git a/rootfs-plan.toml b/rootfs-plan.toml new file mode 100644 index 00000000..be8083d4 --- /dev/null +++ b/rootfs-plan.toml @@ -0,0 +1,116 @@ +# Rootfs Modification Plan +# +# This file describes all modifications applied to the base Ubuntu cloud image. +# The SHA256 of the generated setup script determines the image name: layer2-{sha}.raw +# If this file changes, Layer 2 is rebuilt automatically. +# +# fc-agent is NOT in Layer 2 at all (neither binary nor service). +# Both are injected per-VM at boot time via initrd. +# This allows updating fc-agent without rebuilding Layer 2. 
+ +[base] +# Ubuntu 24.04 LTS (Noble Numbat) cloud images +# Using "current" for latest updates - URL changes trigger plan SHA change +version = "24.04" + +[base.arm64] +url = "https://cloud-images.ubuntu.com/noble/current/noble-server-cloudimg-arm64.img" + +[base.amd64] +url = "https://cloud-images.ubuntu.com/noble/current/noble-server-cloudimg-amd64.img" + +[kernel] +# Kata Containers kernel with FUSE support built-in +# Firecracker's official kernel lacks FUSE, but Kata's has it +# URL hash is included in Layer 2 SHA calculation + +[kernel.arm64] +# Kata 3.24.0 release - kernel 6.12.47 with CONFIG_FUSE_FS=y +url = "https://github.com/kata-containers/kata-containers/releases/download/3.24.0/kata-static-3.24.0-arm64.tar.zst" +# Path within the tarball to extract +path = "opt/kata/share/kata-containers/vmlinux-6.12.47-173" + +[kernel.amd64] +url = "https://github.com/kata-containers/kata-containers/releases/download/3.24.0/kata-static-3.24.0-amd64.tar.zst" +path = "opt/kata/share/kata-containers/vmlinux-6.12.47-173" + +[packages] +# Container runtime +runtime = ["podman", "crun", "fuse-overlayfs", "skopeo"] + +# FUSE support for overlay filesystem +fuse = ["fuse3"] + +# System services +system = ["haveged", "chrony"] + +[services] +# Services to enable +# NOTE: fc-agent is NOT enabled here - it's injected per-VM via initrd +# NOTE: systemd-resolved is NOT enabled - DNS comes from kernel cmdline via fc-agent +enable = [ + "haveged", + "chrony", + "systemd-networkd", +] + +# Services to disable +disable = [ + "multipathd", + "snapd", + "cloud-init", + "cloud-config", + "cloud-final", +] + +[files] +# Files to create/modify in the rootfs + +[files."/etc/resolv.conf"] +content = """ +# Placeholder - fc-agent configures DNS at boot from kernel cmdline +nameserver 127.0.0.53 +""" + +[files."/etc/chrony/chrony.conf"] +content = """ +# NTP servers from pool.ntp.org +pool pool.ntp.org iburst + +# Allow clock to be stepped (not slewed) for large time differences +makestep 
1.0 3 + +# Directory for drift and other runtime files +driftfile /var/lib/chrony/drift +""" + +[files."/etc/systemd/network/10-eth0.network"] +content = """ +[Match] +Name=eth0 + +[Network] +# Keep kernel IP configuration from ip= boot parameter +KeepConfiguration=yes +""" + +[files."/etc/systemd/network/10-eth0.network.d/mmds.conf"] +content = """ +[Route] +Destination=169.254.169.254/32 +Scope=link +""" + +# NOTE: fc-agent.service is NOT defined here - it's injected per-VM via initrd + +[fstab] +# Lines to remove from /etc/fstab (patterns to filter out) +remove_patterns = ["LABEL=BOOT", "LABEL=UEFI"] + +[cleanup] +# Patterns to remove for smaller image +remove_dirs = [ + "/usr/share/doc/*", + "/usr/share/man/*", + "/var/cache/apt/archives/*", +] diff --git a/src/commands/podman.rs b/src/commands/podman.rs index 723be8c6..418668f5 100644 --- a/src/commands/podman.rs +++ b/src/commands/podman.rs @@ -274,6 +274,22 @@ async fn cmd_podman_run(args: RunArgs) -> Result<()> { state_manager.init().await?; // Setup networking based on mode + // Bridged mode requires root for iptables and network namespace setup + if matches!(args.network, NetworkMode::Bridged) && !nix::unistd::geteuid().is_root() { + bail!( + "Bridged networking requires root. Either:\n \ + - Run with sudo: sudo fcvm podman run ...\n \ + - Use rootless mode: fcvm podman run --network rootless ..." + ); + } + // Rootless with sudo is pointless - bridged would be faster + if matches!(args.network, NetworkMode::Rootless) && nix::unistd::geteuid().is_root() { + warn!( + "Running rootless mode as root is unnecessary. \ + Consider using --network bridged for better performance." 
+ ); + } + let tap_device = format!("tap-{}", truncate_id(&vm_id, 8)); let mut network: Box = match args.network { NetworkMode::Bridged => Box::new(BridgedNetwork::new( diff --git a/src/commands/snapshot.rs b/src/commands/snapshot.rs index 61275444..d3dbc47b 100644 --- a/src/commands/snapshot.rs +++ b/src/commands/snapshot.rs @@ -18,6 +18,80 @@ use crate::storage::{DiskManager, SnapshotManager}; use crate::uffd::UffdServer; use crate::volume::{spawn_volume_servers, VolumeConfig}; +const USERFAULTFD_DEVICE: &str = "/dev/userfaultfd"; + +/// Check if /dev/userfaultfd is accessible for clone operations. +/// Clones use UFFD (userfaultfd) to share memory pages on-demand from the serve process. +/// Returns Ok(()) if accessible, or an error with detailed fix instructions. +fn check_userfaultfd_access() -> Result<()> { + use std::fs::OpenOptions; + use std::path::Path; + + let path = Path::new(USERFAULTFD_DEVICE); + + // Check if device exists + if !path.exists() { + bail!( + r#" +╔══════════════════════════════════════════════════════════════════════════════╗ +║ USERFAULTFD DEVICE NOT FOUND ║ +╠══════════════════════════════════════════════════════════════════════════════╣ +║ {USERFAULTFD_DEVICE} does not exist on this system. ║ +║ ║ +║ This device is required for snapshot cloning (UFFD memory sharing). ║ +║ It's available on Linux 5.11+ kernels. ║ +║ ║ +║ Check your kernel version: ║ +║ uname -r ║ +╚══════════════════════════════════════════════════════════════════════════════╝ +"# + ); + } + + // Check if we have read/write access + match OpenOptions::new().read(true).write(true).open(path) { + Ok(_) => Ok(()), + Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => { + bail!( + r#" +╔══════════════════════════════════════════════════════════════════════════════╗ +║ USERFAULTFD PERMISSION DENIED ║ +╠══════════════════════════════════════════════════════════════════════════════╣ +║ Cannot access /dev/userfaultfd - permission denied. 
║ +║ ║ +║ Snapshot clones require access to userfaultfd for memory sharing. ║ +║ ║ +║ FIX (choose one): ║ +║ ║ +║ Option 1 - Device permissions (recommended): ║ +║ # Persistent udev rule (survives reboots): ║ +║ echo 'KERNEL=="userfaultfd", MODE="0666"' | \ ║ +║ sudo tee /etc/udev/rules.d/99-userfaultfd.rules ║ +║ sudo udevadm control --reload-rules ║ +║ sudo chmod 666 /dev/userfaultfd ║ +║ ║ +║ Option 2 - Sysctl (system-wide, affects syscall fallback): ║ +║ sudo sysctl vm.unprivileged_userfaultfd=1 ║ +║ # To persist: add 'vm.unprivileged_userfaultfd=1' to /etc/sysctl.conf ║ +║ ║ +║ Option 3 - One-time fix (must redo after reboot): ║ +║ sudo chmod 666 /dev/userfaultfd ║ +║ ║ +║ After fixing, retry your clone command. ║ +╚══════════════════════════════════════════════════════════════════════════════╝ +"# + ); + } + Err(e) => { + bail!( + "Cannot access {}: {} - ensure the device exists and is readable", + USERFAULTFD_DEVICE, + e + ); + } + } +} + /// Main dispatcher for snapshot commands pub async fn cmd_snapshot(args: SnapshotArgs) -> Result<()> { match args.cmd { @@ -400,7 +474,11 @@ async fn cmd_snapshot_serve(args: SnapshotServeArgs) -> Result<()> { /// Run clone from snapshot async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> { - // First verify the serve process is actually alive before attempting any work + // Check userfaultfd access FIRST - this is a system requirement + // Give a clear error message if permissions aren't configured + check_userfaultfd_access().context("userfaultfd access check failed")?; + + // Now verify the serve process is actually alive before attempting any work // This prevents wasted setup if the serve process died between state file creation and now if !crate::utils::is_process_alive(args.pid) { anyhow::bail!( @@ -543,6 +621,22 @@ async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> { // Extract guest_ip from snapshot metadata for network config reuse let saved_network = 
&snapshot_config.metadata.network_config; + // Bridged mode requires root for iptables and network namespace setup + if matches!(args.network, NetworkMode::Bridged) && !nix::unistd::geteuid().is_root() { + bail!( + "Bridged networking requires root. Either:\n \ + - Run with sudo: sudo fcvm snapshot run ...\n \ + - Use rootless mode: fcvm snapshot run --network rootless ..." + ); + } + // Rootless with sudo is pointless - bridged would be faster + if matches!(args.network, NetworkMode::Rootless) && nix::unistd::geteuid().is_root() { + warn!( + "Running rootless mode as root is unnecessary. \ + Consider using --network bridged for better performance." + ); + } + // Setup networking based on mode - reuse guest_ip from snapshot if available let mut network: Box = match args.network { NetworkMode::Bridged => { @@ -991,8 +1085,19 @@ async fn run_clone_setup( "parallel disk + network setup complete" ); - // Step 3: Set holder_pid so VmManager uses nsenter - vm_manager.set_holder_pid(holder_pid); + // Step 3: Set namespace paths for pre_exec setns (NOT nsenter wrapper) + // For clones, we need to enter namespaces in pre_exec because: + // - pre_exec runs BEFORE nsenter would enter the namespace + // - We need CAP_SYS_ADMIN (from user namespace) for mount operations + // - Entering user namespace first gives us CAP_SYS_ADMIN for unshare(CLONE_NEWNS) + vm_manager.set_user_namespace_path(std::path::PathBuf::from(format!( + "/proc/{}/ns/user", + holder_pid + ))); + vm_manager.set_net_namespace_path(std::path::PathBuf::from(format!( + "/proc/{}/ns/net", + holder_pid + ))); // Store holder_pid in state for health checks vm_state.holder_pid = Some(holder_pid); diff --git a/src/firecracker/vm.rs b/src/firecracker/vm.rs index f198233c..7da888a7 100644 --- a/src/firecracker/vm.rs +++ b/src/firecracker/vm.rs @@ -36,6 +36,8 @@ pub struct VmManager { log_path: Option, namespace_id: Option, holder_pid: Option, // namespace holder PID for rootless mode (use nsenter to run FC) + 
user_namespace_path: Option, // User namespace path for rootless clones (enter via setns in pre_exec) + net_namespace_path: Option, // Net namespace path for rootless clones (enter via setns in pre_exec) vsock_redirect: Option<(PathBuf, PathBuf)>, // (baseline_dir, clone_dir) for mount namespace isolation process: Option, client: Option, @@ -50,6 +52,8 @@ impl VmManager { log_path, namespace_id: None, holder_pid: None, + user_namespace_path: None, + net_namespace_path: None, vsock_redirect: None, process: None, client: None, @@ -80,6 +84,27 @@ impl VmManager { self.holder_pid = Some(pid); } + /// Set user namespace path for rootless clones + /// + /// When set along with vsock_redirect, pre_exec will enter this user namespace + /// first (via setns) before doing mount operations. This gives CAP_SYS_ADMIN + /// inside the user namespace, allowing unshare(CLONE_NEWNS) to succeed. + /// + /// Use this instead of set_holder_pid when mount namespace isolation is needed, + /// since nsenter wrapper runs AFTER pre_exec. + pub fn set_user_namespace_path(&mut self, path: PathBuf) { + self.user_namespace_path = Some(path); + } + + /// Set network namespace path for rootless clones + /// + /// When set, pre_exec will enter this network namespace (via setns) after + /// completing mount operations. Use with set_user_namespace_path for + /// rootless clones that need mount namespace isolation. + pub fn set_net_namespace_path(&mut self, path: PathBuf) { + self.net_namespace_path = Some(path); + } + /// Set vsock redirect for mount namespace isolation /// /// When set, Firecracker will be launched in a new mount namespace with @@ -109,12 +134,25 @@ impl VmManager { let _ = std::fs::remove_file(&self.socket_path); // Build command based on mode: - // 1. holder_pid set: use nsenter to enter existing namespace (rootless) - // 2. direct Firecracker (privileged/bridged mode) - let mut cmd = if let Some(holder_pid) = self.holder_pid { + // 1. 
user_namespace_path set: direct Firecracker (namespaces entered via pre_exec setns) + // 2. holder_pid set (no user_namespace_path): use nsenter to enter existing namespace (rootless baseline) + // 3. neither: direct Firecracker (privileged/bridged mode) + // + // For rootless clones with vsock_redirect, we MUST use pre_exec setns instead of nsenter, + // because pre_exec runs BEFORE nsenter would enter the namespace, and we need CAP_SYS_ADMIN + // from the user namespace to do mount operations. + let mut cmd = if self.user_namespace_path.is_some() { + // Use direct Firecracker - namespaces will be entered via setns in pre_exec + // This is required for rootless clones that need mount namespace isolation + info!(target: "vm", vm_id = %self.vm_id, "using pre_exec setns for rootless clone"); + let mut c = Command::new(firecracker_bin); + c.arg("--api-sock").arg(&self.socket_path); + c + } else if let Some(holder_pid) = self.holder_pid { // Use nsenter to enter user+network namespace with preserved credentials // --preserve-credentials keeps UID, GID, and supplementary groups (including kvm) // This allows KVM access while being in the isolated network namespace + // NOTE: This path is for baseline VMs that don't need mount namespace isolation info!(target: "vm", vm_id = %self.vm_id, holder_pid = holder_pid, "using nsenter for rootless networking"); let mut c = Command::new("nsenter"); c.args([ @@ -155,6 +193,8 @@ impl VmManager { // We need to handle these in a single pre_exec because it can only be called once let ns_id_clone = self.namespace_id.clone(); let vsock_redirect_clone = self.vsock_redirect.clone(); + let user_ns_path_clone = self.user_namespace_path.clone(); + let net_ns_path_clone = self.net_namespace_path.clone(); // Ensure baseline directory exists for bind mount target // The baseline VM may have been cleaned up, but we need the directory for mount @@ -165,7 +205,11 @@ impl VmManager { } } - if ns_id_clone.is_some() || vsock_redirect_clone.is_some() { 
+ if ns_id_clone.is_some() + || vsock_redirect_clone.is_some() + || user_ns_path_clone.is_some() + || net_ns_path_clone.is_some() + { use std::ffi::CString; // Prepare CStrings outside the closure (async-signal-safe requirement) @@ -179,6 +223,28 @@ impl VmManager { None }; + // User namespace path (for rootless clones that need CAP_SYS_ADMIN for mount ops) + let user_ns_cstr = if let Some(ref path) = user_ns_path_clone { + info!(target: "vm", vm_id = %self.vm_id, path = %path.display(), "will enter user namespace in pre_exec"); + Some( + CString::new(path.to_string_lossy().as_bytes()) + .context("user namespace path contains invalid characters")?, + ) + } else { + None + }; + + // Network namespace path (for rootless clones via /proc/PID/ns/net) + let net_ns_cstr = if let Some(ref path) = net_ns_path_clone { + info!(target: "vm", vm_id = %self.vm_id, path = %path.display(), "will enter net namespace in pre_exec"); + Some( + CString::new(path.to_string_lossy().as_bytes()) + .context("net namespace path contains invalid characters")?, + ) + } else { + None + }; + let vsock_paths = if let Some((ref baseline_dir, ref clone_dir)) = vsock_redirect_clone { info!(target: "vm", vm_id = %self.vm_id, @@ -210,8 +276,31 @@ impl VmManager { use nix::sys::stat::Mode; use std::os::unix::io::{FromRawFd, OwnedFd}; + // Step 0: Enter user namespace if specified (for rootless clones) + // This MUST be done first to get CAP_SYS_ADMIN for mount operations. + // The user namespace was created by the holder process with --map-root-user, + // so entering it gives us UID 0 with full capabilities inside the namespace. 
+ if let Some(ref user_ns_path) = user_ns_cstr { + let ns_fd_raw = open( + user_ns_path.as_c_str(), + OFlag::O_RDONLY, + Mode::empty(), + ) + .map_err(|e| { + std::io::Error::other(format!("failed to open user namespace: {}", e)) + })?; + + let ns_fd = OwnedFd::from_raw_fd(ns_fd_raw); + + setns(&ns_fd, CloneFlags::CLONE_NEWUSER).map_err(|e| { + std::io::Error::other(format!("failed to enter user namespace: {}", e)) + })?; + // Now we have CAP_SYS_ADMIN inside the user namespace! + } + // Step 1: Set up mount namespace for vsock redirect if needed // This must be done BEFORE entering network namespace + // Note: This now succeeds because we entered user namespace first (if needed) if let Some((ref baseline_cstr, ref clone_cstr)) = vsock_paths { // Create a new mount namespace so our bind mount is isolated unshare(CloneFlags::CLONE_NEWNS).map_err(|e| { @@ -252,21 +341,24 @@ impl VmManager { } // Step 2: Enter network namespace if specified - if let Some(ref ns_path_cstr) = ns_path_cstr { - let ns_fd_raw = open( - ns_path_cstr.as_c_str(), - OFlag::O_RDONLY, - Mode::empty(), - ) - .map_err(|e| { - std::io::Error::other(format!("failed to open namespace: {}", e)) - })?; + // This can come from either: + // - net_ns_cstr: /proc/PID/ns/net (rootless clones via pre_exec) - preferred + // - ns_path_cstr: /var/run/netns/NAME (bridged mode) + let net_ns_to_enter = net_ns_cstr.as_ref().or(ns_path_cstr.as_ref()); + if let Some(ns_path) = net_ns_to_enter { + let ns_fd_raw = open(ns_path.as_c_str(), OFlag::O_RDONLY, Mode::empty()) + .map_err(|e| { + std::io::Error::other(format!( + "failed to open net namespace: {}", + e + )) + })?; // SAFETY: from_raw_fd takes ownership of the file descriptor. 
let ns_fd = OwnedFd::from_raw_fd(ns_fd_raw); setns(&ns_fd, CloneFlags::CLONE_NEWNET).map_err(|e| { - std::io::Error::other(format!("failed to enter namespace: {}", e)) + std::io::Error::other(format!("failed to enter net namespace: {}", e)) })?; // fd is automatically closed when OwnedFd is dropped } diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 29f18eac..600e7e9e 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -151,17 +151,17 @@ impl SlirpNetwork { /// Build the setup script to run inside the namespace via nsenter /// - /// This script creates both TAP devices and sets up iptables rules for egress. - /// Health checks use nsenter to curl the guest directly, no port forwarding needed. + /// This script creates both TAP devices and configures networking. /// Run via: nsenter -t HOLDER_PID -U -n -- bash -c '' pub fn build_setup_script(&self) -> String { format!( r#" set -e -# Create slirp0 TAP for slirp4netns (slirp4netns will attach to this) +# Create slirp0 TAP for slirp4netns connectivity +# Use 10.0.2.100 as the address for DNAT to work with port forwarding ip tuntap add {slirp_dev} mode tap -ip addr add 10.0.2.1/24 dev {slirp_dev} +ip addr add 10.0.2.100/24 dev {slirp_dev} ip link set {slirp_dev} up # Create TAP device for Firecracker (must exist before Firecracker starts) @@ -183,12 +183,19 @@ iptables -A FORWARD -i {slirp_dev} -o {fc_tap} -j ACCEPT 2>/dev/null || true iptables -A FORWARD -i {fc_tap} -o {slirp_dev} -j ACCEPT 2>/dev/null || true # Set up iptables MASQUERADE for traffic from guest subnet (egress) +# This NATs guest traffic (192.168.x.x) to slirp0's address (10.0.2.100) iptables -t nat -A POSTROUTING -s {guest_subnet} -o {slirp_dev} -j MASQUERADE 2>/dev/null || true + +# Set up DNAT for inbound connections from slirp4netns +# When slirp4netns forwards traffic to 10.0.2.100, redirect it to the actual guest IP +# This enables port forwarding: host -> slirp4netns -> 10.0.2.100 -> DNAT -> guest (192.168.x.2) +iptables 
-t nat -A PREROUTING -d 10.0.2.100 -j DNAT --to-destination {guest_ip} 2>/dev/null || true "#, slirp_dev = self.slirp_device, fc_tap = self.tap_device, ns_ip = self.namespace_ip, guest_subnet = self.guest_subnet, + guest_ip = self.guest_ip, ) } diff --git a/src/setup/kernel.rs b/src/setup/kernel.rs index ed0373b8..f698b7cd 100644 --- a/src/setup/kernel.rs +++ b/src/setup/kernel.rs @@ -1,121 +1,135 @@ use anyhow::{bail, Context, Result}; -use std::path::{Path, PathBuf}; -use std::process::Command; +use sha2::{Digest, Sha256}; +use std::path::PathBuf; +use tokio::process::Command; use tracing::info; use crate::paths; +use crate::setup::rootfs::{load_plan, KernelArchConfig}; + +/// Compute SHA256 of bytes, return hex string (first 12 chars) +fn compute_sha256_short(data: &[u8]) -> String { + let mut hasher = Sha256::new(); + hasher.update(data); + let result = hasher.finalize(); + hex::encode(&result[..6]) // 12 hex chars +} + +/// Get the kernel URL hash for the current architecture +/// This is used to include in Layer 2 SHA calculation +pub fn get_kernel_url_hash() -> Result { + let (plan, _, _) = load_plan()?; + let kernel_config = plan.kernel.current_arch()?; + Ok(compute_sha256_short(kernel_config.url.as_bytes())) +} -/// Ensure kernel exists, extracting from host if needed +/// Ensure kernel exists, downloading from Kata release if needed pub async fn ensure_kernel() -> Result { + let (plan, _, _) = load_plan()?; + let kernel_config = plan.kernel.current_arch()?; + + download_kernel(kernel_config).await +} + +/// Download kernel from Kata release tarball +async fn download_kernel(config: &KernelArchConfig) -> Result { let kernel_dir = paths::kernel_dir(); - let kernel_path = kernel_dir.join("vmlinux.bin"); + + // Cache by URL hash - changing URL triggers re-download + let url_hash = compute_sha256_short(config.url.as_bytes()); + let kernel_path = kernel_dir.join(format!("vmlinux-{}.bin", url_hash)); if kernel_path.exists() { - info!(path = 
%kernel_path.display(), "kernel already exists"); + info!(path = %kernel_path.display(), url_hash = %url_hash, "kernel already exists"); return Ok(kernel_path); } - println!("⚙️ Setting up kernel (first run)..."); + println!("⚙️ Downloading kernel (first run)..."); + info!(url = %config.url, path_in_archive = %config.path, "downloading kernel from Kata release"); // Create directory tokio::fs::create_dir_all(&kernel_dir) .await .context("creating kernel directory")?; - // Find host kernel - let host_kernel = find_host_kernel().context("finding host kernel")?; + // Download and extract in one pipeline: + // curl -> zstd -d -> tar --extract + let cache_dir = paths::base_dir().join("cache"); + tokio::fs::create_dir_all(&cache_dir).await?; - info!(host_kernel = %host_kernel.display(), "found host kernel"); - println!(" → Extracting from {}...", host_kernel.display()); + let tarball_path = cache_dir.join(format!("kata-kernel-{}.tar.zst", url_hash)); - // Extract kernel - extract_kernel(&host_kernel, &kernel_path) - .await - .context("extracting kernel")?; - - println!(" ✓ Kernel ready"); - - Ok(kernel_path) -} - -/// Find host kernel in /boot -fn find_host_kernel() -> Result { - // Try current running kernel first - let uname_output = Command::new("uname") - .arg("-r") - .output() - .context("running uname -r")?; + // Download if not cached + if !tarball_path.exists() { + println!(" → Downloading Kata release tarball..."); - let kernel_version = String::from_utf8_lossy(&uname_output.stdout) - .trim() - .to_string(); + let output = Command::new("curl") + .args(["-fSL", &config.url, "-o"]) + .arg(&tarball_path) + .output() + .await + .context("running curl")?; - let kernel_path = PathBuf::from(format!("/boot/vmlinuz-{}", kernel_version)); + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + bail!("Failed to download kernel: {}", stderr); + } - if kernel_path.exists() { - return Ok(kernel_path); + info!(path = %tarball_path.display(), 
"downloaded Kata tarball"); + } else { + info!(path = %tarball_path.display(), "using cached Kata tarball"); } - // Fallback: find any vmlinuz in /boot - let boot_dir = std::fs::read_dir("/boot").context("reading /boot directory")?; + // Extract just the kernel file using tar with zstd + println!(" → Extracting kernel from tarball..."); + + // Use tar to extract, piping through zstd + // tar expects path with ./ prefix based on how Kata packages it + let extract_path = format!("./{}", config.path); + + let output = Command::new("tar") + .args([ + "--use-compress-program=zstd", + "-xf", + ]) + .arg(&tarball_path) + .arg("-C") + .arg(&cache_dir) + .arg(&extract_path) + .output() + .await + .context("extracting kernel from tarball")?; - for entry in boot_dir { - let entry = entry?; - let file_name = entry.file_name(); - let name = file_name.to_string_lossy(); + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + bail!("Failed to extract kernel: {}", stderr); + } - if name.starts_with("vmlinuz") && !name.contains("rescue") { - return Ok(entry.path()); - } + // Move extracted kernel to final location + let extracted_path = cache_dir.join(&config.path); + if !extracted_path.exists() { + bail!( + "Kernel not found after extraction at {}", + extracted_path.display() + ); } - bail!("no kernel found in /boot") -} + tokio::fs::copy(&extracted_path, &kernel_path) + .await + .context("copying kernel to final location")?; -/// Extract uncompressed kernel from potentially compressed vmlinuz -async fn extract_kernel(src: &Path, dst: &Path) -> Result<()> { - // Most modern kernels are self-extracting ELF with embedded compressed payload - // We need the uncompressed ELF - - // Try finding extract-vmlinux in common locations - let extract_vmlinux_paths = vec![ - "/usr/src/linux-headers-*/scripts/extract-vmlinux", - "/usr/src/*/scripts/extract-vmlinux", - ]; - - for pattern in &extract_vmlinux_paths { - if let Ok(output) = Command::new("sh") - 
.arg("-c") - .arg(format!("ls {} 2>/dev/null | head -1", pattern)) - .output() - { - if let Ok(script_path) = String::from_utf8(output.stdout) { - let script_path = script_path.trim(); - if !script_path.is_empty() { - info!(script = %script_path, "using extract-vmlinux script"); - let output = Command::new(script_path) - .arg(src) - .output() - .context("running extract-vmlinux")?; - - if output.status.success() && !output.stdout.is_empty() { - tokio::fs::write(dst, &output.stdout) - .await - .context("writing extracted kernel")?; - return Ok(()); - } - } - } - } + // Clean up extracted files (keep tarball for cache) + let opt_dir = cache_dir.join("opt"); + if opt_dir.exists() { + tokio::fs::remove_dir_all(&opt_dir).await.ok(); } - bail!( - "extract-vmlinux script not found. Please install it or download a pre-built kernel from Firecracker releases. - - To install extract-vmlinux: - sudo apt-get install linux-tools-generic + println!(" ✓ Kernel ready"); + info!( + path = %kernel_path.display(), + url_hash = %url_hash, + "kernel downloaded and cached" + ); - Or download a pre-built kernel: - wget https://github.com/firecracker-microvm/firecracker/releases/download/v1.13.1/vmlinux-5.10.217" - ) + Ok(kernel_path) } diff --git a/src/setup/rootfs.rs b/src/setup/rootfs.rs index 2100f36c..789b84d8 100644 --- a/src/setup/rootfs.rs +++ b/src/setup/rootfs.rs @@ -1,79 +1,460 @@ use anyhow::{bail, Context, Result}; +use serde::Deserialize; +use sha2::{Digest, Sha256}; +use std::collections::HashMap; use std::path::{Path, PathBuf}; -use tokio::fs::File; -use tokio::io::AsyncWriteExt; use tokio::process::Command; -use tracing::{info, warn}; +use tracing::{debug, info, warn}; use crate::paths; -/// Find the fc-agent binary +/// Plan file location (relative to workspace root) +const PLAN_FILE: &str = "rootfs-plan.toml"; + +/// Size of the Layer 2 disk image +const LAYER2_SIZE: &str = "10G"; + +// ============================================================================ +// Plan 
File Data Structures +// ============================================================================ + +#[derive(Debug, Deserialize, Clone)] +pub struct Plan { + pub base: BaseConfig, + pub kernel: KernelConfig, + pub packages: PackagesConfig, + pub services: ServicesConfig, + pub files: HashMap, + pub fstab: FstabConfig, + #[serde(default)] + pub cleanup: CleanupConfig, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct BaseConfig { + pub version: String, + pub arm64: ArchConfig, + pub amd64: ArchConfig, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct ArchConfig { + pub url: String, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct KernelConfig { + pub arm64: KernelArchConfig, + pub amd64: KernelArchConfig, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct KernelArchConfig { + /// URL to the kernel archive (e.g., Kata release tarball) + pub url: String, + /// Path within the archive to extract + pub path: String, +} + +impl KernelConfig { + /// Get the kernel config for the current architecture + pub fn current_arch(&self) -> anyhow::Result<&KernelArchConfig> { + match std::env::consts::ARCH { + "x86_64" => Ok(&self.amd64), + "aarch64" => Ok(&self.arm64), + other => anyhow::bail!("unsupported architecture: {}", other), + } + } +} + +#[derive(Debug, Deserialize, Clone)] +pub struct PackagesConfig { + pub runtime: Vec, + pub fuse: Vec, + pub system: Vec, +} + +impl PackagesConfig { + pub fn all_packages(&self) -> Vec<&str> { + self.runtime + .iter() + .chain(&self.fuse) + .chain(&self.system) + .map(|s| s.as_str()) + .collect() + } +} + +#[derive(Debug, Deserialize, Clone)] +pub struct ServicesConfig { + pub enable: Vec, + pub disable: Vec, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct FileConfig { + pub content: String, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct FstabConfig { + pub remove_patterns: Vec, +} + +#[derive(Debug, Deserialize, Default, Clone)] +pub struct CleanupConfig { + #[serde(default)] + pub remove_dirs: 
Vec, +} + +// ============================================================================ +// Script Generation +// ============================================================================ + +/// Generate a setup script from the plan /// -/// Both fcvm and fc-agent are workspace members built together with: -/// cargo build --release +/// Generate the install script that runs BEFORE the setup script. +/// This script installs packages from /mnt/packages and removes conflicting packages. +pub fn generate_install_script() -> String { + r#"#!/bin/bash +set -e +echo 'FCVM: Removing conflicting packages before install...' +# Remove time-daemon provider that conflicts with chrony +apt-get remove -y --purge systemd-timesyncd 2>/dev/null || true +# Remove packages we don't need in microVM (also frees space) +apt-get remove -y --purge cloud-init snapd ubuntu-server 2>/dev/null || true + +echo 'FCVM: Installing packages from initrd...' +dpkg -i /mnt/packages/*.deb || true +apt-get -f install -y || true +echo 'FCVM: Packages installed successfully' +"# + .to_string() +} + +/// Generate the init script that runs in the initrd during Layer 2 setup. +/// This script mounts filesystems, runs install + setup scripts, then powers off. /// -/// Search order: -/// 1. Same directory as current exe (for cargo install) -/// 2. Parent directory (for tests running from target/release/deps/) -/// 3. FC_AGENT_PATH environment variable -fn find_fc_agent_binary() -> Result { - let exe_path = std::env::current_exe().context("getting current executable path")?; - let exe_dir = exe_path.parent().context("getting executable directory")?; +/// The SHA256 of this complete script determines the rootfs name, ensuring +/// any changes to mounts, commands, or embedded scripts invalidate the cache. 
+pub fn generate_init_script(install_script: &str, setup_script: &str) -> String { + format!( + r#"#!/bin/busybox sh +# FCVM Layer 2 setup initrd +# Runs package installation before systemd +# Packages are embedded in the initrd at /packages + +echo "FCVM Layer 2 Setup: Starting..." + +# Install busybox commands +/bin/busybox mkdir -p /bin /sbin /proc /sys /dev /newroot +/bin/busybox --install -s /bin +/bin/busybox --install -s /sbin + +# Mount essential filesystems +mount -t proc proc /proc +mount -t sysfs sys /sys +mount -t devtmpfs dev /dev + +# Populate /dev with device nodes from sysfs +mdev -s + +# Debug: show available block devices +echo "FCVM Layer 2 Setup: Available block devices:" +ls -la /dev/vd* 2>/dev/null || echo "No /dev/vd* devices found" + +echo "FCVM Layer 2 Setup: Mounting rootfs..." +mount -o rw /dev/vda /newroot +if [ $? -ne 0 ]; then + echo "ERROR: Failed to mount rootfs" + sleep 5 + poweroff -f +fi + +# Copy embedded packages from initrd to rootfs +# Packages are in /packages directory inside the initrd (loaded in RAM) +echo "FCVM Layer 2 Setup: Copying packages from initrd to rootfs..." +mkdir -p /newroot/mnt/packages +cp -a /packages/* /newroot/mnt/packages/ +echo "FCVM Layer 2 Setup: Copied $(ls /newroot/mnt/packages/*.deb 2>/dev/null | wc -l) packages" + +# Write the install script to rootfs +cat > /newroot/tmp/install-packages.sh << 'INSTALL_SCRIPT_EOF' +{} +INSTALL_SCRIPT_EOF +chmod 755 /newroot/tmp/install-packages.sh + +# Write the setup script to rootfs +cat > /newroot/tmp/fcvm-setup.sh << 'SETUP_SCRIPT_EOF' +{} +SETUP_SCRIPT_EOF +chmod 755 /newroot/tmp/fcvm-setup.sh + +# Set up chroot environment (proc, sys, dev) +echo "FCVM Layer 2 Setup: Setting up chroot environment..." +mount --bind /proc /newroot/proc +mount --bind /sys /newroot/sys +mount --bind /dev /newroot/dev + +# Install packages using chroot +echo "FCVM Layer 2 Setup: Installing packages..." +chroot /newroot /bin/bash /tmp/install-packages.sh +INSTALL_RESULT=$? 
+echo "FCVM Layer 2 Setup: Package installation returned: $INSTALL_RESULT" + +# Run setup script using chroot +echo "FCVM Layer 2 Setup: Running setup script..." +chroot /newroot /bin/bash /tmp/fcvm-setup.sh +SETUP_RESULT=$? +echo "FCVM Layer 2 Setup: Setup script returned: $SETUP_RESULT" + +# Cleanup chroot mounts (use lazy unmount as fallback) +echo "FCVM Layer 2 Setup: Cleaning up..." +umount /newroot/dev 2>/dev/null || umount -l /newroot/dev 2>/dev/null || true +umount /newroot/sys 2>/dev/null || umount -l /newroot/sys 2>/dev/null || true +umount /newroot/proc 2>/dev/null || umount -l /newroot/proc 2>/dev/null || true +rm -rf /newroot/mnt/packages +rm -f /newroot/tmp/install-packages.sh +rm -f /newroot/tmp/fcvm-setup.sh + +# Sync and unmount rootfs +sync +umount /newroot 2>/dev/null || umount -l /newroot 2>/dev/null || true + +echo "FCVM_SETUP_COMPLETE" +echo "FCVM Layer 2 Setup: Complete! Powering off..." +umount /proc /sys /dev 2>/dev/null || true +poweroff -f +"#, + install_script, setup_script + ) +} - // Check same directory (cargo install case) - let fc_agent = exe_dir.join("fc-agent"); - if fc_agent.exists() { - return Ok(fc_agent); +/// The script content is deterministic - same plan always produces same script. +/// The SHA256 of this script determines the rootfs image name. +/// +/// NOTE: This script does NOT install packages - they are installed from +/// install-packages.sh before this script runs. 
+pub fn generate_setup_script(plan: &Plan) -> String { + let mut s = String::new(); + + // Script header - runs after packages are installed from initrd + s.push_str("#!/bin/bash\n"); + s.push_str("set -euo pipefail\n\n"); + + // Note: No partition resize needed - filesystem is already resized on host + // (we use a raw ext4 filesystem without partition table)\n + + // Note: Packages are already installed by install-packages.sh + // We just need to include the package list in the script for SHA calculation + let packages = plan.packages.all_packages(); + s.push_str("# Packages (installed from initrd): "); + s.push_str(&packages.join(", ")); + s.push_str("\n\n"); + + // Write configuration files (sorted for deterministic output) + let mut file_paths: Vec<_> = plan.files.keys().collect(); + file_paths.sort(); + + s.push_str("# Write configuration files\n"); + for path in file_paths { + let config = &plan.files[path]; + // Create parent directory if needed + if let Some(parent) = std::path::Path::new(path).parent() { + if parent != std::path::Path::new("") && parent != std::path::Path::new("/") { + s.push_str(&format!("mkdir -p {}\n", parent.display())); + } + } + s.push_str(&format!("cat > {} << 'FCVM_EOF'\n", path)); + s.push_str(&config.content); + if !config.content.ends_with('\n') { + s.push('\n'); + } + s.push_str("FCVM_EOF\n\n"); } - // Check parent directory (test case: exe in target/release/deps/, agent in target/release/) - if let Some(parent) = exe_dir.parent() { - let fc_agent_parent = parent.join("fc-agent"); - if fc_agent_parent.exists() { - return Ok(fc_agent_parent); + // Fix fstab (remove problematic entries) + if !plan.fstab.remove_patterns.is_empty() { + s.push_str("# Fix /etc/fstab\n"); + for pattern in &plan.fstab.remove_patterns { + // Use sed to remove lines containing the pattern + s.push_str(&format!("sed -i '/{}/d' /etc/fstab\n", pattern.replace('/', "\\/"))); } + s.push('\n'); } - // Fallback: environment variable override for special cases 
- if let Ok(path) = std::env::var("FC_AGENT_PATH") { - let p = PathBuf::from(&path); - if p.exists() { - return Ok(p); + // Configure container registries + s.push_str("# Configure Podman registries\n"); + s.push_str("cat > /etc/containers/registries.conf << 'FCVM_EOF'\n"); + s.push_str("unqualified-search-registries = [\"docker.io\"]\n\n"); + s.push_str("[[registry]]\n"); + s.push_str("location = \"docker.io\"\n"); + s.push_str("FCVM_EOF\n\n"); + + // Enable services + if !plan.services.enable.is_empty() { + s.push_str("# Enable services\n"); + s.push_str("systemctl enable"); + for svc in &plan.services.enable { + s.push_str(&format!(" {}", svc)); + } + s.push('\n'); + } + + // Also enable serial console + s.push_str("systemctl enable serial-getty@ttyS0\n\n"); + + // Disable services + if !plan.services.disable.is_empty() { + s.push_str("# Disable services\n"); + s.push_str("systemctl disable"); + for svc in &plan.services.disable { + s.push_str(&format!(" {}", svc)); + } + s.push_str(" || true\n\n"); + } + + // Cleanup + if !plan.cleanup.remove_dirs.is_empty() { + s.push_str("# Cleanup unnecessary files\n"); + for pattern in &plan.cleanup.remove_dirs { + s.push_str(&format!("rm -rf {}\n", pattern)); + } + s.push('\n'); + } + + // Clean apt cache for smaller image + s.push_str("# Clean apt cache\n"); + s.push_str("apt-get clean\n"); + s.push_str("rm -rf /var/lib/apt/lists/*\n\n"); + + s.push_str("echo 'FCVM_SETUP_COMPLETE'\n"); + s.push_str("# Shutdown to signal completion\n"); + s.push_str("shutdown -h now\n"); + s +} + + +// ============================================================================ +// Plan Loading and SHA256 +// ============================================================================ + +/// Find the plan file in the workspace +fn find_plan_file() -> Result { + // Try relative to current exe (for installed binary) + let exe_path = std::env::current_exe().context("getting current executable path")?; + let exe_dir = 
exe_path.parent().context("getting executable directory")?; + + // Check various locations + let candidates = [ + exe_dir.join(PLAN_FILE), + exe_dir.join("..").join(PLAN_FILE), + exe_dir.join("../..").join(PLAN_FILE), + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(PLAN_FILE), + ]; + + for path in &candidates { + if path.exists() { + return Ok(path.canonicalize().context("canonicalizing plan file path")?); } } + // Fallback to CARGO_MANIFEST_DIR for development + let manifest_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(PLAN_FILE); + if manifest_path.exists() { + return Ok(manifest_path); + } + bail!( - "fc-agent binary not found at {} or via FC_AGENT_PATH env var.\n\ - Build with: cargo build --release", - fc_agent.display() + "rootfs-plan.toml not found. Checked: {:?}", + candidates.iter().map(|p| p.display().to_string()).collect::>() ) } -/// Helper to convert Path to str with proper error handling -fn path_to_str(path: &Path) -> Result<&str> { - path.to_str() - .ok_or_else(|| anyhow::anyhow!("path contains invalid UTF-8: {:?}", path)) +/// Load and parse the plan file +pub fn load_plan() -> Result<(Plan, String, String)> { + let plan_path = find_plan_file()?; + let plan_content = std::fs::read_to_string(&plan_path) + .with_context(|| format!("reading plan file: {}", plan_path.display()))?; + + // Compute SHA256 of plan content (first 12 chars for image naming) + let plan_sha = compute_sha256(plan_content.as_bytes()); + let plan_sha_short = plan_sha[..12].to_string(); + + let plan: Plan = toml::from_str(&plan_content) + .with_context(|| format!("parsing plan file: {}", plan_path.display()))?; + + info!( + plan_file = %plan_path.display(), + plan_sha = %plan_sha_short, + "loaded rootfs plan" + ); + + Ok((plan, plan_sha, plan_sha_short)) +} + +/// Compute SHA256 of bytes, return hex string +pub fn compute_sha256(data: &[u8]) -> String { + let mut hasher = Sha256::new(); + hasher.update(data); + format!("{:x}", hasher.finalize()) } -/// Ensure rootfs 
exists, creating minimal Ubuntu + Podman if needed
+// ============================================================================
+// Public API
+// ============================================================================
+
+/// Ensure rootfs exists, creating if needed (NO ROOT REQUIRED)
+///
+/// The rootfs is named after the generated setup script SHA256: layer2-{script_sha}.raw
+/// If the script changes (due to plan changes), a new rootfs is created automatically.
 ///
-/// Caches the rootfs filesystem - only creates it once.
-/// The base rootfs is immutable after creation to prevent corruption when VMs start in parallel.
+/// Layer 2 creation flow (all rootless):
+/// 1. Download Ubuntu cloud image (qcow2)
+/// 2. Convert to raw with qemu-img
+/// 3. Expand to 10GB with truncate
+/// 4. Download packages
+/// 5. Create initrd with embedded packages
+/// 6. Boot VM with initrd to install packages (no network needed)
+/// 7. Wait for VM to shut down
+/// 8. Rename to layer2-{sha}.raw
+///
+/// NOTE: fc-agent is NOT included in Layer 2. It will be injected per-VM at boot time.
+/// Layer 2 only contains packages (podman, crun, etc.).
pub async fn ensure_rootfs() -> Result { + let (plan, _plan_sha_full, _plan_sha_short) = load_plan()?; + + // Generate all scripts and compute hash of the complete init script + let setup_script = generate_setup_script(&plan); + let install_script = generate_install_script(); + let init_script = generate_init_script(&install_script, &setup_script); + + // Get kernel URL for the current architecture + let kernel_config = plan.kernel.current_arch()?; + let kernel_url = &kernel_config.url; + + // Hash the complete init script + kernel URL + // Any change to: + // - init logic, install script, or setup script + // - kernel URL (different kernel version/release) + // invalidates the cache + let mut combined = init_script.clone(); + combined.push_str("\n# KERNEL_URL: "); + combined.push_str(kernel_url); + let script_sha = compute_sha256(combined.as_bytes()); + let script_sha_short = &script_sha[..12]; + let rootfs_dir = paths::rootfs_dir(); - let rootfs_path = paths::base_rootfs(); + let rootfs_path = rootfs_dir.join(format!("layer2-{}.raw", script_sha_short)); let lock_file = rootfs_dir.join(".rootfs-creation.lock"); - // If rootfs exists, return it immediately (it's immutable after creation) - // DO NOT modify the base rootfs on every VM start - this causes: - // 1. Filesystem corruption when VMs start in parallel - // 2. Unnecessary latency (~100ms per VM start) - // 3. 
Violates the "base rootfs is immutable" principle - // - // To update fc-agent: delete the rootfs and it will be recreated, OR - // explicitly run `fcvm setup rootfs` (TODO: implement setup command) + // If rootfs exists for this script, return it if rootfs_path.exists() { - info!(path = %rootfs_path.display(), "rootfs exists (using cached)"); + info!( + path = %rootfs_path.display(), + script_sha = %script_sha_short, + "rootfs exists for current script (using cached)" + ); return Ok(rootfs_path); } @@ -83,7 +464,6 @@ pub async fn ensure_rootfs() -> Result { .context("creating rootfs directory")?; // Acquire lock to prevent concurrent rootfs creation - // If multiple VMs start simultaneously, only one creates the rootfs info!("acquiring rootfs creation lock"); use std::os::unix::fs::OpenOptionsExt; let lock_fd = std::fs::OpenOptions::new() @@ -99,39 +479,41 @@ pub async fn ensure_rootfs() -> Result { .map_err(|(_, err)| err) .context("acquiring rootfs creation lock")?; - // Check again after acquiring lock (another process may have created it) + // Check again after acquiring lock if rootfs_path.exists() { - info!(path = %rootfs_path.display(), "rootfs exists (created by another process)"); + info!( + path = %rootfs_path.display(), + "rootfs exists (created by another process)" + ); flock.unlock().map_err(|(_, err)| err).ok(); let _ = std::fs::remove_file(&lock_file); return Ok(rootfs_path); } - // Now we have exclusive access, create the rootfs - info!("creating base rootfs from Ubuntu cloud image"); - info!("note: first-time cloud image download may take 5-15 minutes"); - info!("cached rootfs creation takes ~45 seconds"); + // Create the rootfs + info!( + script_sha = %script_sha_short, + "creating Layer 2 rootfs (first-time may take 5-15 minutes)" + ); - // Create at temp path first, then rename when complete to avoid race conditions. - // Other processes check if rootfs_path exists, so we must not create it until - // package installation is complete. 
- let temp_rootfs_path = rootfs_path.with_extension("ext4.tmp"); + // Log the generated script for debugging + debug!("generated setup script:\n{}", setup_script); - // Clean up any leftover temp file from a previous failed attempt + let temp_rootfs_path = rootfs_path.with_extension("raw.tmp"); let _ = tokio::fs::remove_file(&temp_rootfs_path).await; - let result = create_ubuntu_rootfs(&temp_rootfs_path) - .await - .context("creating Ubuntu rootfs"); + let result = create_layer2_rootless(&plan, script_sha_short, &setup_script, &temp_rootfs_path).await; - // If successful, rename temp file to final path if result.is_ok() { tokio::fs::rename(&temp_rootfs_path, &rootfs_path) .await .context("renaming temp rootfs to final path")?; - info!("rootfs creation complete"); + info!( + path = %rootfs_path.display(), + script_sha = %script_sha_short, + "Layer 2 rootfs creation complete" + ); } else { - // Clean up temp file on failure let _ = tokio::fs::remove_file(&temp_rootfs_path).await; } @@ -143,599 +525,1057 @@ pub async fn ensure_rootfs() -> Result { let _ = std::fs::remove_file(&lock_file); result?; - Ok(rootfs_path) } -/// Create Ubuntu rootfs from official cloud image +/// Find the fc-agent binary for per-VM injection /// -/// Downloads Ubuntu 24.04 cloud image (cached), customizes it with virt-customize, -/// extracts to ext4, then installs packages. -async fn create_ubuntu_rootfs(output_path: &Path) -> Result<()> { - // Download Ubuntu cloud image (cached) - let cloud_image = download_ubuntu_cloud_image().await?; - - info!("customizing Ubuntu cloud image with virt-customize"); +/// fc-agent is NOT included in Layer 2 (the base rootfs). Instead, it is +/// injected per-VM at boot time via initrd. This function is used to locate +/// the binary for that injection. +/// +/// Both fcvm and fc-agent are workspace members built together. +/// Search order: +/// 1. Same directory as current exe +/// 2. Parent directory (for tests in target/release/deps/) +/// 3. 
FC_AGENT_PATH environment variable +pub fn find_fc_agent_binary() -> Result { + let exe_path = std::env::current_exe().context("getting current executable path")?; + let exe_dir = exe_path.parent().context("getting executable directory")?; - // Customize the qcow2 image BEFORE extracting - customize_ubuntu_cloud_image(&cloud_image).await?; + // Check same directory + let fc_agent = exe_dir.join("fc-agent"); + if fc_agent.exists() { + return Ok(fc_agent); + } - // Extract root partition from customized cloud image - info!("extracting customized root partition"); - extract_root_partition(&cloud_image, output_path).await?; + // Check parent directory (test case) + if let Some(parent) = exe_dir.parent() { + let fc_agent_parent = parent.join("fc-agent"); + if fc_agent_parent.exists() { + return Ok(fc_agent_parent); + } + } - // Install packages after extraction (virt-customize has networking issues) - info!("installing packages in extracted rootfs"); - install_packages_in_rootfs(output_path).await?; + // Fallback: environment variable + if let Ok(path) = std::env::var("FC_AGENT_PATH") { + let p = PathBuf::from(&path); + if p.exists() { + return Ok(p); + } + } - Ok(()) + bail!( + "fc-agent binary not found at {} or via FC_AGENT_PATH env var.\n\ + Build with: cargo build --release", + fc_agent.display() + ) } -/// Download Ubuntu cloud image (cached) -async fn download_ubuntu_cloud_image() -> Result { - let cache_dir = paths::base_dir().join("cache"); - tokio::fs::create_dir_all(&cache_dir) - .await - .context("creating cache directory")?; - - // Detect architecture and use appropriate cloud image - let (arch_name, cloud_arch) = match std::env::consts::ARCH { - "x86_64" => ("amd64", "amd64"), - "aarch64" => ("arm64", "arm64"), - other => bail!("unsupported architecture: {}", other), - }; - - let image_url = format!( - "https://cloud-images.ubuntu.com/releases/24.04/release/ubuntu-24.04-server-cloudimg-{cloud_arch}.img" - ); - let image_path = 
cache_dir.join(format!("ubuntu-24.04-{arch_name}.img")); - - // Return cached image if it exists - if image_path.exists() { - info!(path = %image_path.display(), "using cached Ubuntu cloud image"); - return Ok(image_path); +// ============================================================================ +// fc-agent Initrd Creation +// ============================================================================ + +/// The fc-agent systemd service unit file content +const FC_AGENT_SERVICE: &str = r#"[Unit] +Description=fcvm guest agent for container orchestration +After=network.target + +[Service] +Type=simple +ExecStart=/usr/local/bin/fc-agent +Restart=on-failure +RestartSec=1 + +[Install] +WantedBy=multi-user.target +"#; + +/// The init script for the initrd +/// This runs before the real init, copies fc-agent to the rootfs, then switches root +const INITRD_INIT_SCRIPT: &str = r#"#!/bin/busybox sh +# fc-agent injection initrd +# This runs before systemd, copies fc-agent to the rootfs, then switch_root + +# Install busybox applets +/bin/busybox mkdir -p /bin /sbin /proc /sys /dev /newroot +/bin/busybox --install -s /bin +/bin/busybox --install -s /sbin + +# Mount essential filesystems +mount -t proc proc /proc +mount -t sysfs sys /sys +mount -t devtmpfs dev /dev + +# Parse kernel cmdline to find root device +ROOT="" +for param in $(cat /proc/cmdline); do + case "$param" in + root=*) + ROOT="${param#root=}" + ;; + esac +done + +if [ -z "$ROOT" ]; then + echo "ERROR: No root= parameter found in kernel cmdline" + exec /bin/sh +fi + +# Handle /dev/vda1 style paths +case "$ROOT" in + /dev/*) + # Wait for device to appear + for i in 1 2 3 4 5; do + if [ -b "$ROOT" ]; then + break + fi + echo "Waiting for $ROOT..." + sleep 1 + done + ;; +esac + +# Mount the real root filesystem +echo "Mounting $ROOT as real root..." +mount -o rw "$ROOT" /newroot + +if [ ! 
-d /newroot/usr ]; then + echo "ERROR: Failed to mount root filesystem" + exec /bin/sh +fi + +# Copy fc-agent binary +echo "Installing fc-agent..." +cp /fc-agent /newroot/usr/local/bin/fc-agent +chmod 755 /newroot/usr/local/bin/fc-agent + +# Copy service file +cp /fc-agent.service /newroot/etc/systemd/system/fc-agent.service + +# Enable the service (create symlink) +mkdir -p /newroot/etc/systemd/system/multi-user.target.wants +ln -sf ../fc-agent.service /newroot/etc/systemd/system/multi-user.target.wants/fc-agent.service + +echo "fc-agent installed successfully" + +# Also ensure MMDS route config exists (in case setup script failed) +mkdir -p /newroot/etc/systemd/network/10-eth0.network.d +if [ ! -f /newroot/etc/systemd/network/10-eth0.network.d/mmds.conf ]; then + echo "Adding MMDS route config..." + cat > /newroot/etc/systemd/network/10-eth0.network.d/mmds.conf << 'MMDSCONF' +[Route] +Destination=169.254.169.254/32 +Scope=link +MMDSCONF +fi + +# Also create the base network config if missing +if [ ! -f /newroot/etc/systemd/network/10-eth0.network ]; then + echo "Adding base network config..." + cat > /newroot/etc/systemd/network/10-eth0.network << 'NETCONF' +[Match] +Name=eth0 + +[Network] +KeepConfiguration=yes +NETCONF +fi + +# Cleanup +umount /proc +umount /sys +umount /dev + +# Switch to the real root and exec init +exec switch_root /newroot /sbin/init +"#; + +/// Ensure the fc-agent initrd exists, creating if needed +/// +/// The initrd is cached by fc-agent binary hash. When fc-agent is rebuilt, +/// a new initrd is automatically created. +/// +/// Returns the path to the initrd file. 
+pub async fn ensure_fc_agent_initrd() -> Result { + // Find fc-agent binary + let fc_agent_path = find_fc_agent_binary()?; + let fc_agent_bytes = std::fs::read(&fc_agent_path) + .with_context(|| format!("reading fc-agent binary at {}", fc_agent_path.display()))?; + let fc_agent_sha = compute_sha256(&fc_agent_bytes); + let fc_agent_sha_short = &fc_agent_sha[..12]; + + // Check if initrd already exists for this fc-agent version + let initrd_dir = paths::base_dir().join("initrd"); + let initrd_path = initrd_dir.join(format!("fc-agent-{}.initrd", fc_agent_sha_short)); + + if initrd_path.exists() { + debug!( + path = %initrd_path.display(), + fc_agent_sha = %fc_agent_sha_short, + "using cached fc-agent initrd" + ); + return Ok(initrd_path); } - info!(url = %image_url, "downloading Ubuntu 24.04 cloud image"); - info!("download size: ~644MB (one-time, cached for future use)"); - info!("download may take 5-15 minutes depending on network speed"); - - // Download with reqwest - let client = reqwest::Client::new(); - let response = client - .get(image_url) - .send() + // Create initrd directory + tokio::fs::create_dir_all(&initrd_dir) .await - .context("downloading cloud image")?; + .context("creating initrd directory")?; - if !response.status().is_success() { - bail!("download failed with status: {}", response.status()); - } - - // Get content length for progress reporting - let total_size = response.content_length().unwrap_or(0); - let total_mb = total_size as f64 / 1024.0 / 1024.0; - - // Stream to file with progress - let mut file = File::create(&image_path) - .await - .context("creating image file")?; + info!( + fc_agent = %fc_agent_path.display(), + fc_agent_sha = %fc_agent_sha_short, + "creating fc-agent initrd" + ); - let bytes = response.bytes().await.context("reading response body")?; - let downloaded_mb = bytes.len() as f64 / 1024.0 / 1024.0; + // Create temporary directory for initrd contents + let temp_dir = initrd_dir.join(format!(".initrd-build-{}", 
fc_agent_sha_short)); + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + tokio::fs::create_dir_all(&temp_dir).await?; - file.write_all(&bytes).await.context("writing image file")?; - file.flush().await.context("flushing image file")?; + // Create directory structure + for dir in &["bin", "sbin", "dev", "proc", "sys", "newroot"] { + tokio::fs::create_dir_all(temp_dir.join(dir)).await?; + } - info!(path = %image_path.display(), - downloaded_mb = downloaded_mb, - expected_mb = total_mb, - "cloud image download complete"); + // Find busybox (prefer static version) + let busybox_path = find_busybox()?; - Ok(image_path) -} + // Copy busybox + tokio::fs::copy(&busybox_path, temp_dir.join("bin/busybox")).await?; -/// Extract root partition from qcow2 cloud image to a raw ext4 file -async fn extract_root_partition(qcow2_path: &Path, output_path: &Path) -> Result<()> { - info!("extracting root partition from cloud image"); + // Make busybox executable + Command::new("chmod") + .args(["755", temp_dir.join("bin/busybox").to_str().unwrap()]) + .output() + .await?; - // Find a free NBD device - let nbd_device = "/dev/nbd0"; + // Write init script + tokio::fs::write(temp_dir.join("init"), INITRD_INIT_SCRIPT).await?; + Command::new("chmod") + .args(["755", temp_dir.join("init").to_str().unwrap()]) + .output() + .await?; - // Load nbd kernel module if not already loaded - let _ = Command::new("modprobe") - .arg("nbd") - .arg("max_part=8") + // Copy fc-agent binary + tokio::fs::copy(&fc_agent_path, temp_dir.join("fc-agent")).await?; + Command::new("chmod") + .args(["755", temp_dir.join("fc-agent").to_str().unwrap()]) .output() - .await; + .await?; - // Connect qcow2 to NBD device - info!("connecting qcow2 to NBD device"); - let output = Command::new("qemu-nbd") - .args(["--connect", nbd_device, "-r", path_to_str(qcow2_path)?]) + // Write service file + tokio::fs::write(temp_dir.join("fc-agent.service"), FC_AGENT_SERVICE).await?; + + // Create cpio archive (initrd format) + let 
temp_initrd = initrd_path.with_extension("initrd.tmp"); + let output = Command::new("sh") + .args([ + "-c", + &format!( + "cd {} && find . | cpio -o -H newc 2>/dev/null | gzip > {}", + temp_dir.display(), + temp_initrd.display() + ), + ]) .output() .await - .context("running qemu-nbd connect")?; + .context("creating initrd cpio archive")?; if !output.status.success() { bail!( - "qemu-nbd connect failed: {}", + "Failed to create initrd: {}", String::from_utf8_lossy(&output.stderr) ); } - // Force kernel to re-read partition table - required on some systems (e.g., CI runners) - // Try partprobe first (from parted), fall back to partx (from util-linux) - info!("scanning partition table"); - let partprobe_result = Command::new("partprobe").arg(nbd_device).output().await; - if partprobe_result.is_err() - || !partprobe_result - .as_ref() - .map(|o| o.status.success()) - .unwrap_or(false) - { - // Fallback to partx - let _ = Command::new("partx") - .args(["-a", nbd_device]) - .output() - .await; - } - - // Wait for partition to appear with retry loop - let partition = format!("{}p1", nbd_device); - - // Small delay to allow kernel to create partition device nodes - // This is needed because partprobe/partx returns before udev creates the nodes - tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - - let mut retries = 10; - while retries > 0 && !std::path::Path::new(&partition).exists() { - info!( - partition = %partition, - retries_left = retries, - "waiting for partition to appear" - ); - tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; - retries -= 1; - } + // Rename to final path + tokio::fs::rename(&temp_initrd, &initrd_path).await?; - // If partition still doesn't exist, try to create the device node manually. - // This is needed when running in a container where the host kernel creates - // the partition device on the host's devtmpfs, but the container has its own. - // NBD major is 43, partition 1 is minor 1. 
- if !std::path::Path::new(&partition).exists() { - info!("partition not auto-created, trying mknod"); + // Cleanup temp directory + let _ = tokio::fs::remove_dir_all(&temp_dir).await; - // Get partition info from sysfs - let sysfs_path = "/sys/block/nbd0/nbd0p1/dev"; - let dev_info = tokio::fs::read_to_string(sysfs_path).await; + info!( + path = %initrd_path.display(), + fc_agent_sha = %fc_agent_sha_short, + "fc-agent initrd created" + ); - if let Ok(dev_str) = dev_info { - // dev_str is "major:minor" e.g., "43:1" - let dev_str = dev_str.trim(); - info!(dev = %dev_str, "found partition info in sysfs"); + Ok(initrd_path) +} - // Create device node with mknod - let mknod_result = Command::new("mknod") - .args([&partition, "b", "43", "1"]) - .output() - .await; +/// Find busybox binary (prefer static version) +fn find_busybox() -> Result { + // Check for busybox-static first + for path in &["/bin/busybox-static", "/usr/bin/busybox-static", "/bin/busybox", "/usr/bin/busybox"] { + let p = PathBuf::from(path); + if p.exists() { + return Ok(p); + } + } - if let Ok(output) = mknod_result { - if output.status.success() { - info!(partition = %partition, "created partition device node"); - } else { - warn!("mknod failed: {}", String::from_utf8_lossy(&output.stderr)); - } + // Try which + if let Ok(output) = std::process::Command::new("which").arg("busybox").output() { + if output.status.success() { + let path = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if !path.is_empty() { + return Ok(PathBuf::from(path)); } - } else { - // Try mknod with assumed minor number (1 for first partition) - info!("sysfs info not available, trying mknod with assumed minor 1"); - let _ = Command::new("mknod") - .args([&partition, "b", "43", "1"]) - .output() - .await; } } - // Final check - if !std::path::Path::new(&partition).exists() { - // List what devices exist for debugging - let ls_output = Command::new("sh") - .args([ - "-c", - "ls -la /dev/nbd0* 2>/dev/null || echo 'no 
nbd devices'", - ]) - .output() - .await; - let devices = ls_output - .map(|o| String::from_utf8_lossy(&o.stdout).to_string()) - .unwrap_or_else(|_| "failed to list".to_string()); - - // Also check sysfs for partition info - let sysfs_output = Command::new("sh") - .args([ - "-c", - "cat /sys/block/nbd0/nbd0p1/dev 2>/dev/null || echo 'no sysfs info'", - ]) - .output() - .await; - let sysfs_info = sysfs_output - .map(|o| String::from_utf8_lossy(&o.stdout).to_string()) - .unwrap_or_else(|_| "no sysfs".to_string()); + bail!("busybox not found. Install with: apt-get install busybox-static") +} + +// ============================================================================ +// Layer 2 Creation (Rootless) +// ============================================================================ +/// Create Layer 2 rootfs without requiring root +/// +/// 1. Download cloud image (qcow2, cached) +/// 2. Convert to raw with qemu-img (no root) +/// 3. Expand to 10GB (no root) +/// 4. Download .deb packages on host (has network) +/// 5. Create initrd with embedded packages +/// 6. Boot VM with initrd to install packages (no network needed) +/// 7. Wait for VM to shut down +/// +/// NOTE: fc-agent is NOT included - it will be injected per-VM at boot time. +async fn create_layer2_rootless( + plan: &Plan, + script_sha_short: &str, + script: &str, + output_path: &Path, +) -> Result<()> { + // Step 1: Download cloud image (cached by URL) + let cloud_image = download_cloud_image(plan).await?; + + // Step 2: Convert qcow2 to raw (no root required!) + info!("converting qcow2 to raw format (no root required)"); + let full_disk_path = output_path.with_extension("full"); + let output = Command::new("qemu-img") + .args([ + "convert", + "-f", "qcow2", + "-O", "raw", + path_to_str(&cloud_image)?, + path_to_str(&full_disk_path)?, + ]) + .output() + .await + .context("running qemu-img convert")?; + + if !output.status.success() { bail!( - "partition {} not found after waiting. 
Devices: {}, Sysfs: {}", - partition, - devices.trim(), - sysfs_info.trim() + "qemu-img convert failed: {}", + String::from_utf8_lossy(&output.stderr) ); } - info!(partition = %partition, "copying root partition"); + // Step 3: Extract partition 1 (root filesystem) using fdisk and dd + // This avoids GPT partition table issues with Firecracker + info!("extracting root partition from GPT disk (no root required)"); + let partition_path = output_path.with_extension("converting"); + + // Get partition info using sfdisk + let output = Command::new("sfdisk") + .args(["-J", path_to_str(&full_disk_path)?]) + .output() + .await + .context("getting partition info")?; + + if !output.status.success() { + bail!("sfdisk failed: {}", String::from_utf8_lossy(&output.stderr)); + } + + // Parse sfdisk JSON output to find partition 1 + #[derive(serde::Deserialize)] + struct SfdiskOutput { + partitiontable: PartitionTable, + } + #[derive(serde::Deserialize)] + struct PartitionTable { + partitions: Vec, + } + #[derive(serde::Deserialize)] + struct Partition { + node: String, + start: u64, + size: u64, + #[serde(rename = "type")] + ptype: String, + } + + let sfdisk_output: SfdiskOutput = serde_json::from_slice(&output.stdout) + .context("parsing sfdisk JSON output")?; + + // Find the Linux filesystem partition (type ends with 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or similar) + let root_part = sfdisk_output.partitiontable.partitions.iter() + .find(|p| p.ptype.contains("0FC63DAF") || p.node.ends_with("1")) + .ok_or_else(|| anyhow::anyhow!("Could not find root partition in GPT disk"))?; + + info!( + partition = %root_part.node, + start_sector = root_part.start, + size_sectors = root_part.size, + "found root partition" + ); + + // Extract partition using dd (sector size is 512 bytes) let output = Command::new("dd") .args([ - &format!("if={}", partition), - &format!("of={}", path_to_str(output_path)?), - "bs=4M", + &format!("if={}", path_to_str(&full_disk_path)?), + &format!("of={}", 
path_to_str(&partition_path)?), + "bs=512", + &format!("skip={}", root_part.start), + &format!("count={}", root_part.size), + "status=progress", ]) .output() - .await; + .await + .context("extracting partition with dd")?; + + if !output.status.success() { + bail!("dd failed: {}", String::from_utf8_lossy(&output.stderr)); + } - // Always disconnect NBD - let disconnect_output = Command::new("qemu-nbd") - .args(["--disconnect", nbd_device]) + // Remove full disk image (no longer needed) + let _ = tokio::fs::remove_file(&full_disk_path).await; + + // Step 4: Expand the extracted partition to 10GB + info!("expanding partition to {}", LAYER2_SIZE); + let output = Command::new("truncate") + .args(["-s", LAYER2_SIZE, path_to_str(&partition_path)?]) .output() - .await; + .await + .context("expanding partition")?; - // Check dd result - let output = output.context("running dd")?; if !output.status.success() { - bail!("dd failed: {}", String::from_utf8_lossy(&output.stderr)); + bail!("truncate failed: {}", String::from_utf8_lossy(&output.stderr)); } - // Check disconnect result - if let Ok(disc_out) = disconnect_output { - if !disc_out.status.success() { - warn!( - "qemu-nbd disconnect warning: {}", - String::from_utf8_lossy(&disc_out.stderr) - ); - } + // Resize the ext4 filesystem to fill the partition + info!("resizing ext4 filesystem"); + let _output = Command::new("e2fsck") + .args(["-f", "-y", path_to_str(&partition_path)?]) + .output() + .await + .context("running e2fsck")?; + // e2fsck may return non-zero even on success (exit code 1 = errors corrected) + + let output = Command::new("resize2fs") + .args([path_to_str(&partition_path)?]) + .output() + .await + .context("running resize2fs")?; + + if !output.status.success() { + bail!("resize2fs failed: {}", String::from_utf8_lossy(&output.stderr)); } - // Resize the extracted ext4 to 10GB (plenty of space for containers) - info!("resizing filesystem to 10GB"); + // Step 4b: Fix /etc/fstab to remove BOOT and UEFI entries 
+ // This MUST happen before booting - systemd reads fstab before cloud-init runs + info!("fixing /etc/fstab to remove non-existent partition entries"); + fix_fstab_in_image(&partition_path).await?; + + // Step 5: Download packages on host (host has network!) + let packages_dir = download_packages(plan, script_sha_short).await?; + + // Step 6: Create initrd for Layer 2 setup with embedded packages + // The initrd runs before systemd and: + // - Mounts rootfs at /newroot + // - Copies packages from initrd to rootfs + // - Runs dpkg -i to install packages + // - Runs the setup script + // - Powers off + // Packages are embedded in the initrd (no second disk needed) + let install_script = generate_install_script(); + + let setup_initrd = create_layer2_setup_initrd(&install_script, script, &packages_dir).await?; + + // Step 7: Boot VM with initrd to run setup (no cloud-init needed!) + // Now we boot a pure ext4 partition (no GPT), so root=/dev/vda works + // Only one disk needed - packages are in the initrd + info!( + script_sha = %script_sha_short, + "booting VM with setup initrd (packages embedded)" + ); - // First resize the file itself to 10GB - let output = Command::new("truncate") - .args(["-s", "10G", path_to_str(output_path)?]) + boot_vm_for_setup(&partition_path, &setup_initrd).await?; + + // Step 8: Rename to final path + tokio::fs::rename(&partition_path, output_path) + .await + .context("renaming partition to output path")?; + + info!("Layer 2 creation complete (packages embedded in initrd)"); + Ok(()) +} + +/// Fix /etc/fstab in an ext4 image to remove BOOT and UEFI partition entries +/// +/// The Ubuntu cloud image has fstab entries for LABEL=BOOT and LABEL=UEFI +/// which cause systemd to enter emergency mode when these partitions don't exist. +/// We use debugfs to modify fstab directly in the ext4 image without mounting. 
+async fn fix_fstab_in_image(image_path: &Path) -> Result<()> { + // Read current fstab using debugfs + let output = Command::new("debugfs") + .args(["-R", "cat /etc/fstab", path_to_str(image_path)?]) .output() .await - .context("running truncate")?; + .context("reading fstab with debugfs")?; if !output.status.success() { bail!( - "truncate failed: {}", + "debugfs read failed: {}", String::from_utf8_lossy(&output.stderr) ); } - // Check and fix filesystem - let output = Command::new("e2fsck") - .args(["-f", "-y", path_to_str(output_path)?]) + let fstab_content = String::from_utf8_lossy(&output.stdout); + + // Filter out BOOT and UEFI entries + let new_fstab: String = fstab_content + .lines() + .filter(|line| { + !line.contains("LABEL=BOOT") && !line.contains("LABEL=UEFI") + }) + .collect::>() + .join("\n"); + + debug!("new fstab content:\n{}", new_fstab); + + // Write new fstab to a temp file + let temp_fstab = std::env::temp_dir().join("fstab.new"); + tokio::fs::write(&temp_fstab, format!("{}\n", new_fstab)) + .await + .context("writing temp fstab")?; + + // Write the new fstab back using debugfs -w + // debugfs command: rm /etc/fstab; write /tmp/fstab.new /etc/fstab + let output = Command::new("debugfs") + .args([ + "-w", + "-R", + &format!("rm /etc/fstab"), + path_to_str(image_path)?, + ]) .output() .await - .context("running e2fsck")?; + .context("removing old fstab with debugfs")?; - if !output.status.success() - && !output - .status - .code() - .map(|c| c == 1 || c == 2) - .unwrap_or(false) - { - // Exit codes 1-2 are warnings, not errors - warn!( - "e2fsck warnings: {}", + // rm might fail if file doesn't exist, that's OK + if !output.status.success() { + debug!( + "debugfs rm fstab (might be expected): {}", String::from_utf8_lossy(&output.stderr) ); } - // Resize filesystem to fill the file - let output = Command::new("resize2fs") - .arg(path_to_str(output_path)?) 
+ let output = Command::new("debugfs") + .args([ + "-w", + "-R", + &format!("write {} /etc/fstab", temp_fstab.display()), + path_to_str(image_path)?, + ]) .output() .await - .context("running resize2fs")?; + .context("writing new fstab with debugfs")?; if !output.status.success() { bail!( - "resize2fs failed: {}", + "debugfs write failed: {}", String::from_utf8_lossy(&output.stderr) ); } + // Cleanup temp file + let _ = tokio::fs::remove_file(&temp_fstab).await; + + // Verify the change + let output = Command::new("debugfs") + .args(["-R", "cat /etc/fstab", path_to_str(image_path)?]) + .output() + .await + .context("verifying fstab with debugfs")?; + + let new_content = String::from_utf8_lossy(&output.stdout); + if new_content.contains("LABEL=BOOT") || new_content.contains("LABEL=UEFI") { + warn!("fstab still contains BOOT/UEFI entries after fix - VM may enter emergency mode"); + } else { + info!("fstab fixed - removed BOOT and UEFI entries"); + } + Ok(()) } -/// Customize Ubuntu cloud image using virt-customize +/// Create a Layer 2 setup initrd with embedded packages /// -/// This modifies the qcow2 image in-place, adding Podman, fc-agent, and all configs. -/// Much simpler and more robust than manual mount/chroot/unmount. -async fn customize_ubuntu_cloud_image(image_path: &Path) -> Result<()> { - // Find fc-agent binary - let fc_agent_src = find_fc_agent_binary()?; - - info!("running virt-customize on cloud image"); - - let mut cmd = Command::new("virt-customize"); - cmd.arg("-a").arg(path_to_str(image_path)?); - - // Disable networking to avoid passt errors (packages installed later via chroot) - cmd.arg("--no-network"); - - // 1. Fix /etc/fstab - remove BOOT and UEFI partitions that don't exist - cmd.arg("--run-command") - .arg("sed -i '/LABEL=BOOT/d;/LABEL=UEFI/d' /etc/fstab"); - - // 2. 
Copy fc-agent binary (packages installed later via chroot) - // Note: universe repository already enabled in base cloud image - info!("adding fc-agent binary"); - cmd.arg("--run-command").arg("mkdir -p /usr/local/bin"); - cmd.arg("--copy-in") - .arg(format!("{}:/usr/local/bin/", fc_agent_src.display())); - cmd.arg("--chmod").arg("0755:/usr/local/bin/fc-agent"); - - // 4. Write chrony config (create directory first) - info!("adding chrony config"); - cmd.arg("--run-command").arg("mkdir -p /etc/chrony"); - let chrony_conf = "# NTP servers from pool.ntp.org\npool pool.ntp.org iburst\n\n\ - # Allow clock to be stepped (not slewed) for large time differences\n\ - makestep 1.0 3\n\n\ - # Directory for drift and other runtime files\n\ - driftfile /var/lib/chrony/drift\n"; - cmd.arg("--write") - .arg(format!("/etc/chrony/chrony.conf:{}", chrony_conf)); - - // 5. Write systemd-networkd config - info!("adding network config"); - cmd.arg("--run-command") - .arg("mkdir -p /etc/systemd/network /etc/systemd/network/10-eth0.network.d"); - - let network_config = "[Match]\nName=eth0\n\n[Network]\n# Keep kernel IP configuration from ip= boot parameter\nKeepConfiguration=yes\n# DNS is provided via kernel ip= boot parameter (gateway IP where dnsmasq listens)\n"; - cmd.arg("--write").arg(format!( - "/etc/systemd/network/10-eth0.network:{}", - network_config - )); +/// This creates a busybox-based initrd that: +/// 1. Mounts /dev/vda (rootfs) at /newroot +/// 2. Copies packages from /packages (embedded in initrd) to rootfs +/// 3. Runs dpkg -i to install packages inside rootfs +/// 4. Runs the setup script +/// 5. Powers off the VM +/// +/// Packages are embedded directly in the initrd, no second disk needed. +/// This allows using Kata's kernel which has FUSE but no ISO9660/SquashFS. 
+async fn create_layer2_setup_initrd( + install_script: &str, + setup_script: &str, + packages_dir: &Path, +) -> Result { + info!("creating Layer 2 setup initrd with embedded packages"); + + let temp_dir = PathBuf::from("/tmp/fcvm-layer2-initrd"); + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + tokio::fs::create_dir_all(&temp_dir).await?; - let mmds_route = "[Route]\nDestination=169.254.169.254/32\nScope=link\n"; - cmd.arg("--write").arg(format!( - "/etc/systemd/network/10-eth0.network.d/mmds.conf:{}", - mmds_route - )); + // Create the init script that runs before systemd + let init_script = generate_init_script(install_script, setup_script); - // 6. DNS configuration note - // DNS is now handled by fc-agent at startup (parses kernel cmdline, writes /etc/resolv.conf) - // This avoids relying on systemd service ordering which was unreliable on some CI runners - - // 7. Write fc-agent systemd service - info!("adding fc-agent service"); - let fc_agent_service = "[Unit]\nDescription=fcvm guest agent for container orchestration\n\ - After=network.target\nWants=network.target\n\n\ - [Service]\nType=simple\nExecStart=/usr/local/bin/fc-agent\n\ - Restart=on-failure\nRestartSec=5\n\ - StandardOutput=journal+console\nStandardError=journal+console\n\n\ - [Install]\nWantedBy=multi-user.target\n"; - cmd.arg("--write").arg(format!( - "/etc/systemd/system/fc-agent.service:{}", - fc_agent_service - )); + // Write init script + let init_path = temp_dir.join("init"); + tokio::fs::write(&init_path, &init_script).await?; - // 9. 
Enable services (fc-agent, other services enabled after package install) - info!("enabling systemd services"); - cmd.arg("--run-command") - .arg("systemctl enable fc-agent systemd-networkd serial-getty@ttyS0"); + // Make init executable + let output = Command::new("chmod") + .args(["755", path_to_str(&init_path)?]) + .output() + .await + .context("making init executable")?; - info!("executing virt-customize (this should be quick)"); + if !output.status.success() { + bail!("Failed to chmod init: {}", String::from_utf8_lossy(&output.stderr)); + } - let output = cmd.output().await.context("running virt-customize")?; + // Copy busybox static binary + let busybox_src = PathBuf::from("/bin/busybox"); + let busybox_dst = temp_dir.join("bin").join("busybox"); + tokio::fs::create_dir_all(temp_dir.join("bin")).await?; + tokio::fs::copy(&busybox_src, &busybox_dst) + .await + .context("copying busybox")?; + + let output = Command::new("chmod") + .args(["755", path_to_str(&busybox_dst)?]) + .output() + .await + .context("making busybox executable")?; if !output.status.success() { + bail!("Failed to chmod busybox: {}", String::from_utf8_lossy(&output.stderr)); + } + + // Copy packages into initrd + let initrd_packages_dir = temp_dir.join("packages"); + tokio::fs::create_dir_all(&initrd_packages_dir).await?; + + // Copy all .deb files from packages_dir to initrd + let mut entries = tokio::fs::read_dir(packages_dir).await?; + let mut package_count = 0; + while let Some(entry) = entries.next_entry().await? { + let path = entry.path(); + if path.extension().map(|e| e == "deb").unwrap_or(false) { + let dest = initrd_packages_dir.join(entry.file_name()); + tokio::fs::copy(&path, &dest).await?; + package_count += 1; + } + } + info!(count = package_count, "embedded packages in initrd"); + + // Create the initrd using cpio + let initrd_path = temp_dir.join("initrd.cpio.gz"); + let cpio_output = Command::new("sh") + .args([ + "-c", + &format!( + "cd {} && find . 
| cpio -o -H newc 2>/dev/null | gzip > {}", + temp_dir.display(), + initrd_path.display() + ), + ]) + .output() + .await + .context("creating initrd cpio archive")?; + + if !cpio_output.status.success() { bail!( - "virt-customize failed:\n{}", - String::from_utf8_lossy(&output.stderr) + "Failed to create initrd: {}", + String::from_utf8_lossy(&cpio_output.stderr) ); } - info!("virt-customize completed successfully"); + // Log initrd size + if let Ok(meta) = tokio::fs::metadata(&initrd_path).await { + let size_mb = meta.len() as f64 / 1024.0 / 1024.0; + info!(path = %initrd_path.display(), size_mb = format!("{:.1}", size_mb), "Layer 2 setup initrd created"); + } - Ok(()) + Ok(initrd_path) } -/// Install packages in extracted rootfs using mount + chroot +/// Download all required .deb packages on the host /// -/// This is done AFTER extraction because virt-customize has networking issues. -/// Still much simpler than the old approach - single-purpose mount+chroot. -async fn install_packages_in_rootfs(rootfs_path: &Path) -> Result<()> { - let temp_dir = PathBuf::from("/tmp/fcvm-rootfs-install"); - let mount_point = temp_dir.join("mnt"); - - // Cleanup any previous mounts - let _ = Command::new("umount") - .arg("-R") - .arg(path_to_str(&mount_point).unwrap_or("/tmp/fcvm-rootfs-install/mnt")) - .output() - .await; - let _ = tokio::fs::remove_dir_all(&temp_dir).await; +/// Returns the path to the packages directory (not an ISO). +/// Packages will be embedded directly in the initrd. +/// +/// NOTE: fc-agent is NOT included - it will be injected per-VM at boot time. 
+async fn download_packages(plan: &Plan, script_sha_short: &str) -> Result { + let cache_dir = paths::base_dir().join("cache"); + let packages_dir = cache_dir.join(format!("packages-{}", script_sha_short)); + + // If packages directory already exists with .deb files, use it + if packages_dir.exists() { + if let Ok(mut entries) = tokio::fs::read_dir(&packages_dir).await { + let mut has_debs = false; + while let Ok(Some(entry)) = entries.next_entry().await { + if entry.path().extension().map(|e| e == "deb").unwrap_or(false) { + has_debs = true; + break; + } + } + if has_debs { + info!(path = %packages_dir.display(), "using cached packages directory"); + return Ok(packages_dir); + } + } + } - tokio::fs::create_dir_all(&mount_point) - .await - .context("creating temp mount directory")?; + // Create packages directory + let _ = tokio::fs::remove_dir_all(&packages_dir).await; + tokio::fs::create_dir_all(&packages_dir).await?; - // Mount the rootfs - let output = Command::new("mount") + // Get list of packages + let packages = plan.packages.all_packages(); + let packages_str = packages.join(" "); + + info!(packages = %packages_str, "downloading .deb packages on host"); + + // Download packages with dependencies using apt-get download + // We need to run this in a way that downloads packages for the target system + // Using apt-get download with proper architecture + let output = Command::new("apt-get") .args([ - "-o", - "loop", - path_to_str(rootfs_path)?, - path_to_str(&mount_point)?, + "download", + "-o", &format!("Dir::Cache::archives={}", packages_dir.display()), ]) + .args(&packages) + .current_dir(&packages_dir) .output() .await - .context("mounting rootfs for package installation")?; + .context("downloading packages with apt-get")?; if !output.status.success() { - bail!( - "mount failed: {}. 
Are you running as root?", - String::from_utf8_lossy(&output.stderr) - ); + // apt-get download might fail, try with apt-cache to get dependencies first + warn!("apt-get download failed, trying alternative method"); + + // Alternative: use apt-rdepends or manually download + for pkg in &packages { + let output = Command::new("apt-get") + .args(["download", pkg]) + .current_dir(&packages_dir) + .output() + .await; + + if let Ok(out) = output { + if !out.status.success() { + warn!(package = %pkg, "failed to download package, continuing..."); + } + } + } } - // Mount required filesystems for chroot - for (fs, target) in [ - ("proc", "proc"), - ("sysfs", "sys"), - ("devtmpfs", "dev"), - ("devpts", "dev/pts"), - ] { - let target_path = mount_point.join(target); - let _ = Command::new("mount") - .args(["-t", fs, fs, path_to_str(&target_path)?]) - .output() - .await; - } - - // Copy DNS resolution config into chroot for apt-get update - let resolv_conf_dest = mount_point.join("etc/resolv.conf"); - // Remove existing resolv.conf (might be a symlink) - let _ = tokio::fs::remove_file(&resolv_conf_dest).await; - tokio::fs::copy("/etc/resolv.conf", &resolv_conf_dest) - .await - .context("copying /etc/resolv.conf into chroot")?; - - // Install packages via chroot - let result = async { - // Update apt cache (universe already enabled in base cloud image) - info!("running apt-get update in chroot"); - let output = Command::new("chroot") - .arg(path_to_str(&mount_point)?) 
- .args(["apt-get", "update", "-y"]) - .output() - .await - .context("running apt-get update in chroot")?; + // Also download dependencies + info!("downloading package dependencies"); + let deps_output = Command::new("sh") + .args([ + "-c", + &format!( + "apt-cache depends --recurse --no-recommends --no-suggests --no-conflicts \ + --no-breaks --no-replaces --no-enhances {} | \ + grep '^\\w' | sort -u | xargs apt-get download 2>/dev/null || true", + packages_str + ), + ]) + .current_dir(&packages_dir) + .output() + .await; - // apt-get update completed successfully - no need to log verbose output + if let Err(e) = deps_output { + warn!(error = %e, "failed to download some dependencies, continuing..."); + } - if !output.status.success() { - bail!( - "apt-get update failed: {}", - String::from_utf8_lossy(&output.stderr) - ); + // Count downloaded packages + let mut count = 0; + if let Ok(mut entries) = tokio::fs::read_dir(&packages_dir).await { + while let Ok(Some(entry)) = entries.next_entry().await { + if entry.path().extension().map(|e| e == "deb").unwrap_or(false) { + count += 1; + } } + } + info!(count = count, "downloaded .deb packages"); - // Install packages (with verbose output) - info!("installing packages: podman crun fuse-overlayfs fuse3 haveged chrony"); - info!("package installation typically takes 30-60 seconds"); - - let output = Command::new("chroot") - .arg(path_to_str(&mount_point)?) - .env("DEBIAN_FRONTEND", "noninteractive") - .args([ - "apt-get", - "install", - "-y", - "-o", - "Dpkg::Options::=--force-confnew", // Force install new config files - "podman", - "crun", - "fuse-overlayfs", - "fuse3", - "haveged", - "chrony", - ]) - .output() - .await - .context("installing packages in chroot")?; + if count == 0 { + bail!("No packages downloaded. 
Check network and apt configuration."); + } - // Log apt output for debugging - info!( - "apt-get install stdout:\n{}", - String::from_utf8_lossy(&output.stdout) - ); - if !output.stderr.is_empty() { - info!( - "apt-get install stderr:\n{}", - String::from_utf8_lossy(&output.stderr) - ); - } + info!(path = %packages_dir.display(), count = count, "packages downloaded"); + Ok(packages_dir) +} - if !output.status.success() { - bail!( - "apt-get install failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - } +/// Download cloud image (cached by URL hash) +async fn download_cloud_image(plan: &Plan) -> Result { + let cache_dir = paths::base_dir().join("cache"); + tokio::fs::create_dir_all(&cache_dir) + .await + .context("creating cache directory")?; - // Enable services - let output = Command::new("chroot") - .arg(path_to_str(&mount_point)?) - .args(["systemctl", "enable", "haveged", "chrony"]) - .output() - .await - .context("enabling services in chroot")?; + // Get arch-specific config + let arch_config = match std::env::consts::ARCH { + "x86_64" => &plan.base.amd64, + "aarch64" => &plan.base.arm64, + other => bail!("unsupported architecture: {}", other), + }; - if !output.status.success() { - bail!( - "systemctl enable failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - } + let arch_name = match std::env::consts::ARCH { + "x86_64" => "amd64", + "aarch64" => "arm64", + other => other, + }; - // Configure Podman registries (after packages installed to avoid conffile conflict) - info!("configuring Podman container registries"); - let registries_conf_path = mount_point.join("etc/containers/registries.conf"); - let registries_content = "unqualified-search-registries = [\"docker.io\"]\n\n\ - [[registry]]\n\ - location = \"docker.io\"\n"; - tokio::fs::write(®istries_conf_path, registries_content) - .await - .context("writing registries.conf")?; - - // Write initial resolv.conf - will be overwritten by fcvm-setup-dns.service at boot - // The startup script 
extracts gateway IP from kernel cmdline and configures DNS - info!("configuring initial resolv.conf (will be updated at boot)"); - let resolv_conf_path = mount_point.join("etc/resolv.conf"); - tokio::fs::write( - &resolv_conf_path, - "# Placeholder - fcvm-setup-dns.service configures DNS at boot from kernel cmdline\nnameserver 127.0.0.53\n", - ) - .await - .context("writing resolv.conf")?; + // Cache by URL hash - changing URL triggers re-download + let url_hash = &compute_sha256(arch_config.url.as_bytes())[..12]; + let image_path = cache_dir.join(format!( + "ubuntu-{}-{}-{}.img", + plan.base.version, + arch_name, + url_hash + )); - Ok(()) + // If cached, use it + if image_path.exists() { + info!(path = %image_path.display(), "using cached cloud image"); + return Ok(image_path); } - .await; - // Always unmount (in reverse order) - for target in ["dev/pts", "dev", "sys", "proc", ""] { - let target_path = if target.is_empty() { - mount_point.clone() - } else { - mount_point.join(target) - }; - let _ = Command::new("umount") - .arg(path_to_str(&target_path).unwrap_or("")) - .output() - .await; + // Download + info!( + url = %arch_config.url, + "downloading Ubuntu cloud image (this may take several minutes)" + ); + + let temp_path = image_path.with_extension("img.download"); + let output = Command::new("curl") + .args([ + "-L", + "-o", + path_to_str(&temp_path)?, + "--progress-bar", + &arch_config.url, + ]) + .status() + .await + .context("downloading cloud image")?; + + if !output.success() { + bail!("curl failed to download cloud image"); } - // Cleanup + // Rename to final path + tokio::fs::rename(&temp_path, &image_path) + .await + .context("renaming downloaded image")?; + + info!( + path = %image_path.display(), + "cloud image downloaded" + ); + + Ok(image_path) +} + +/// Boot a Firecracker VM to run the Layer 2 setup initrd +/// +/// This boots with an initrd that has packages embedded: +/// - Mounts rootfs (/dev/vda) at /newroot +/// - Copies packages from 
/packages (in initrd RAM) to rootfs +/// - Runs dpkg -i to install packages inside rootfs via chroot +/// - Runs the setup script +/// - Powers off when complete +/// +/// Only one disk is needed - packages are embedded in the initrd. +/// This allows using Kata's kernel which has FUSE but no ISO9660/SquashFS. +async fn boot_vm_for_setup(disk_path: &Path, initrd_path: &Path) -> Result<()> { + use std::time::Duration; + use tokio::time::timeout; + + // Create a temporary directory for this setup VM + let temp_dir = PathBuf::from("/tmp/fcvm-layer2-setup"); let _ = tokio::fs::remove_dir_all(&temp_dir).await; + tokio::fs::create_dir_all(&temp_dir).await?; - result?; + let api_socket = temp_dir.join("firecracker.sock"); + let log_path = temp_dir.join("firecracker.log"); - info!("packages installed successfully"); + // Find kernel - downloaded from Kata release if needed + let kernel_path = crate::setup::kernel::ensure_kernel().await?; - Ok(()) + // Create serial console output file + let serial_path = temp_dir.join("serial.log"); + let serial_file = std::fs::File::create(&serial_path) + .context("creating serial console file")?; + + // Start Firecracker with serial console output + info!("starting Firecracker for Layer 2 setup (serial output: {})", serial_path.display()); + let mut fc_process = Command::new("firecracker") + .args([ + "--api-sock", path_to_str(&api_socket)?, + "--log-path", path_to_str(&log_path)?, + "--level", "Info", + ]) + .stdout(serial_file.try_clone().context("cloning serial file")?) 
+ .stderr(std::process::Stdio::null()) + .spawn() + .context("starting Firecracker")?; + + // Wait for socket to be ready + for _ in 0..50 { + if api_socket.exists() { + break; + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + + if !api_socket.exists() { + fc_process.kill().await.ok(); + bail!("Firecracker API socket not created"); + } + + // Configure VM via API + let client = crate::firecracker::api::FirecrackerClient::new(api_socket.clone())?; + + // Set boot source - boot from raw ext4 partition (no GPT) + // The disk IS the filesystem, so use root=/dev/vda directly + // No cloud-init needed - scripts are injected via debugfs and run by rc.local + client + .set_boot_source(crate::firecracker::api::BootSource { + kernel_image_path: kernel_path.display().to_string(), + // Boot with initrd that runs setup before trying to use systemd + // The initrd handles everything and powers off, so we don't need to worry about systemd + boot_args: Some("console=ttyS0 reboot=k panic=1 pci=off".to_string()), + initrd_path: Some(initrd_path.display().to_string()), + }) + .await?; + + // Add root drive (raw ext4 filesystem, no partition table) + client + .add_drive( + "rootfs", + crate::firecracker::api::Drive { + drive_id: "rootfs".to_string(), + path_on_host: disk_path.display().to_string(), + is_root_device: true, + is_read_only: false, + partuuid: None, + rate_limiter: None, + }, + ) + .await?; + + // No packages drive needed - packages are embedded in the initrd + + // Configure machine (minimal for setup) + client + .set_machine_config(crate::firecracker::api::MachineConfig { + vcpu_count: 2, + mem_size_mib: 2048, // 2GB for package installation + smt: Some(false), + cpu_template: None, + track_dirty_pages: None, + }) + .await?; + + // No network needed! Packages are installed from local ISO. 
+ + // Start the VM + client.put_action(crate::firecracker::api::InstanceAction::InstanceStart).await?; + info!("Layer 2 setup VM started, waiting for completion (this takes several minutes)"); + + // Wait for VM to shut down (setup script runs shutdown -h now when done) + // Timeout after 15 minutes + let start = std::time::Instant::now(); + let mut last_serial_len = 0usize; + let result = timeout(Duration::from_secs(900), async { + loop { + // Check if Firecracker process has exited + match fc_process.try_wait() { + Ok(Some(status)) => { + let elapsed = start.elapsed(); + info!("Firecracker exited with status: {:?} after {:?}", status, elapsed); + return Ok(elapsed); + } + Ok(None) => { + // Still running, check for new serial output and log it + if let Ok(serial_content) = tokio::fs::read_to_string(&serial_path).await { + if serial_content.len() > last_serial_len { + // Log new output (trimmed to avoid excessive logging) + let new_output = &serial_content[last_serial_len..]; + for line in new_output.lines() { + // Skip empty lines and lines that are just timestamps + if !line.trim().is_empty() { + debug!(target: "layer2_setup", "{}", line); + } + } + last_serial_len = serial_content.len(); + } + } + tokio::time::sleep(Duration::from_secs(5)).await; + } + Err(e) => { + return Err(anyhow::anyhow!("Error checking Firecracker status: {}", e)); + } + } + } + }) + .await; + + // Cleanup + fc_process.kill().await.ok(); + + match result { + Ok(Ok(elapsed)) => { + // Check for completion marker in serial output + let serial_content = tokio::fs::read_to_string(&serial_path).await.unwrap_or_default(); + if !serial_content.contains("FCVM_SETUP_COMPLETE") { + warn!("Setup failed! 
Serial console output:\n{}", serial_content); + if let Ok(log_content) = tokio::fs::read_to_string(&log_path).await { + warn!("Firecracker log:\n{}", log_content); + } + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + bail!("Layer 2 setup failed (no FCVM_SETUP_COMPLETE marker found)"); + } + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + info!(elapsed_secs = elapsed.as_secs(), "Layer 2 setup VM completed successfully"); + Ok(()) + } + Ok(Err(e)) => { + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + Err(e) + } + Err(_) => { + let _ = tokio::fs::remove_dir_all(&temp_dir).await; + bail!("Layer 2 setup VM timed out after 15 minutes") + } + } +} + +/// Helper to convert Path to str +fn path_to_str(path: &Path) -> Result<&str> { + path.to_str() + .ok_or_else(|| anyhow::anyhow!("path contains invalid UTF-8: {:?}", path)) } diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 26a73f3d..16041926 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -13,6 +13,68 @@ use tokio::time::sleep; /// Global counter for unique test IDs static TEST_COUNTER: AtomicUsize = AtomicUsize::new(0); +/// Fail loudly if running as actual host root. +/// +/// Rootless tests break when run with `sudo` on the host because user namespace +/// mapping doesn't work correctly when you're already root. +/// +/// However, running as root inside a container is fine - the container provides +/// the isolation boundary, not the UID inside it. +/// +/// Call this at the start of any rootless test function. +pub fn require_non_root(test_name: &str) -> anyhow::Result<()> { + // Skip check if we're in a container - container is the isolation boundary + if is_in_container() { + return Ok(()); + } + + if nix::unistd::geteuid().is_root() { + anyhow::bail!( + "Rootless test '{}' cannot run as root! Run without sudo.", + test_name + ); + } + Ok(()) +} + +/// Check if we're running inside a container. 
+/// +/// Containers create marker files that we can use to detect containerized environments. +fn is_in_container() -> bool { + // Podman creates /run/.containerenv + if std::path::Path::new("/run/.containerenv").exists() { + return true; + } + // Docker creates /.dockerenv + if std::path::Path::new("/.dockerenv").exists() { + return true; + } + false +} + +/// Generate unique names for snapshot/clone tests. +/// +/// Returns (baseline_name, clone_name, snapshot_name, serve_name) with unique suffixes. +/// Uses process ID and atomic counter to ensure uniqueness across parallel tests. +/// +/// # Arguments +/// * `prefix` - Base name for the test (e.g., "portfwd", "internet") +/// +/// # Returns +/// Tuple of (baseline, clone, snapshot, serve) names +pub fn unique_names(prefix: &str) -> (String, String, String, String) { + let id = TEST_COUNTER.fetch_add(1, Ordering::SeqCst); + let pid = std::process::id(); + let suffix = format!("{}-{}", pid, id); + + ( + format!("{}-base-{}", prefix, suffix), + format!("{}-clone-{}", prefix, suffix), + format!("{}-snap-{}", prefix, suffix), + format!("{}-serve-{}", prefix, suffix), + ) +} + /// Fixture for managing a VM with FUSE volume for testing pub struct VmFixture { pub child: tokio::process::Child, diff --git a/tests/test_egress.rs b/tests/test_egress.rs index f067bdc2..5b672290 100644 --- a/tests/test_egress.rs +++ b/tests/test_egress.rs @@ -26,6 +26,7 @@ async fn test_egress_fresh_bridged() -> Result<()> { /// Test egress connectivity for fresh VM with rootless networking #[tokio::test] async fn test_egress_fresh_rootless() -> Result<()> { + common::require_non_root("test_egress_fresh_rootless")?; egress_fresh_test_impl("rootless").await } @@ -38,12 +39,13 @@ async fn test_egress_clone_bridged() -> Result<()> { /// Test egress connectivity for cloned VM with rootless networking #[tokio::test] async fn test_egress_clone_rootless() -> Result<()> { + common::require_non_root("test_egress_clone_rootless")?; 
egress_clone_test_impl("rootless").await } /// Implementation for testing egress on a fresh (non-cloned) VM async fn egress_fresh_test_impl(network: &str) -> Result<()> { - let vm_name = format!("egress-fresh-{}", network); + let (vm_name, _, _, _) = common::unique_names(&format!("egress-fresh-{}", network)); println!("\n╔═══════════════════════════════════════════════════════════════╗"); println!( @@ -103,9 +105,8 @@ async fn egress_fresh_test_impl(network: &str) -> Result<()> { /// Implementation for testing egress on a cloned VM async fn egress_clone_test_impl(network: &str) -> Result<()> { - let snapshot_name = format!("egress-snapshot-{}", network); - let baseline_name = format!("egress-baseline-{}", network); - let clone_name = format!("egress-clone-{}", network); + let (baseline_name, clone_name, snapshot_name, _) = + common::unique_names(&format!("egress-{}", network)); println!("\n╔═══════════════════════════════════════════════════════════════╗"); println!( diff --git a/tests/test_egress_stress.rs b/tests/test_egress_stress.rs index 6250e5ff..dc3c9dee 100644 --- a/tests/test_egress_stress.rs +++ b/tests/test_egress_stress.rs @@ -37,6 +37,7 @@ async fn test_egress_stress_bridged() -> Result<()> { /// Test egress stress with rootless networking using local HTTP server #[tokio::test] async fn test_egress_stress_rootless() -> Result<()> { + common::require_non_root("test_egress_stress_rootless")?; egress_stress_impl("rootless", NUM_CLONES, REQUESTS_PER_CLONE).await } @@ -45,7 +46,10 @@ async fn egress_stress_impl( num_clones: usize, requests_per_clone: usize, ) -> Result<()> { - let test_name = format!("egress-stress-{}", network); + // Use unique prefix for all resources + let (baseline_name, _, snapshot_name, _) = + common::unique_names(&format!("estress-{}", network)); + let test_name = baseline_name.clone(); // Use for clone naming println!("\n╔═══════════════════════════════════════════════════════════════╗"); println!( @@ -84,7 +88,6 @@ async fn 
egress_stress_impl( let fcvm_path = common::find_fcvm_binary()?; // Step 1: Start baseline VM - let baseline_name = format!("{}-baseline", test_name); println!("\nStep 1: Starting baseline VM '{}'...", baseline_name); let (_baseline_child, baseline_pid) = common::spawn_fcvm_with_logs( @@ -146,7 +149,6 @@ async fn egress_stress_impl( println!(" ✓ Baseline egress works"); // Step 2: Create snapshot - let snapshot_name = format!("{}-snapshot", test_name); println!("\nStep 2: Creating snapshot '{}'...", snapshot_name); let output = tokio::process::Command::new(&fcvm_path) diff --git a/tests/test_exec.rs b/tests/test_exec.rs index 96791263..8ce334ed 100644 --- a/tests/test_exec.rs +++ b/tests/test_exec.rs @@ -18,6 +18,7 @@ async fn test_exec_bridged() -> Result<()> { #[tokio::test] async fn test_exec_rootless() -> Result<()> { + common::require_non_root("test_exec_rootless")?; exec_test_impl("rootless").await } @@ -26,7 +27,7 @@ async fn exec_test_impl(network: &str) -> Result<()> { println!("================================"); let fcvm_path = common::find_fcvm_binary()?; - let vm_name = format!("exec-test-{}", network); + let (vm_name, _, _, _) = common::unique_names(&format!("exec-{}", network)); // Start the VM using spawn_fcvm helper (uses Stdio::inherit to prevent deadlock) println!("Starting VM..."); diff --git a/tests/test_port_forward.rs b/tests/test_port_forward.rs index 4fe4357c..e09d5302 100644 --- a/tests/test_port_forward.rs +++ b/tests/test_port_forward.rs @@ -22,15 +22,10 @@ struct VmDisplay { /// Test port forwarding with bridged networking #[test] fn test_port_forward_bridged() -> Result<()> { - // Requires root for bridged networking - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_port_forward_bridged: requires root"); - return Ok(()); - } - println!("\ntest_port_forward_bridged"); let fcvm_path = common::find_fcvm_binary()?; + let vm_name = format!("port-bridged-{}", std::process::id()); // Start VM with port forwarding let mut 
fcvm = Command::new(&fcvm_path) @@ -38,7 +33,7 @@ fn test_port_forward_bridged() -> Result<()> { "podman", "run", "--name", - "port-test", + &vm_name, "--network", "bridged", "--publish", @@ -187,9 +182,11 @@ fn test_port_forward_bridged() -> Result<()> { /// allowing multiple VMs to all forward the same port. #[test] fn test_port_forward_rootless() -> Result<()> { + common::require_non_root("test_port_forward_rootless")?; println!("\ntest_port_forward_rootless"); let fcvm_path = common::find_fcvm_binary()?; + let vm_name = format!("port-rootless-{}", std::process::id()); // Start VM with rootless networking and port forwarding // Use unprivileged port 8080 since rootless can't bind to 80 @@ -198,7 +195,7 @@ fn test_port_forward_rootless() -> Result<()> { "podman", "run", "--name", - "port-test-rootless", + &vm_name, "--network", "rootless", "--publish", diff --git a/tests/test_readme_examples.rs b/tests/test_readme_examples.rs index 17362444..28223f10 100644 --- a/tests/test_readme_examples.rs +++ b/tests/test_readme_examples.rs @@ -30,12 +30,6 @@ async fn test_readonly_volume() -> Result<()> { println!("\ntest_readonly_volume"); println!("===================="); - // Requires root for bridged networking (more reliable health checks) - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_readonly_volume: requires root for bridged networking"); - return Ok(()); - } - let test_id = format!("ro-{}", std::process::id()); let vm_name = format!("ro-vol-{}", std::process::id()); @@ -133,12 +127,6 @@ async fn test_env_variables() -> Result<()> { println!("\ntest_env_variables"); println!("=================="); - // Requires root for bridged networking (more reliable health checks) - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_env_variables: requires root for bridged networking"); - return Ok(()); - } - let vm_name = format!("env-test-{}", std::process::id()); // Start VM with environment variables using bridged mode for reliable health 
checks @@ -218,12 +206,6 @@ async fn test_custom_resources() -> Result<()> { println!("\ntest_custom_resources"); println!("====================="); - // Requires root for bridged networking (more reliable health checks) - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_custom_resources: requires root for bridged networking"); - return Ok(()); - } - let vm_name = format!("resources-test-{}", std::process::id()); // Start VM with custom resources using bridged mode for reliable health checks @@ -303,12 +285,6 @@ async fn test_fcvm_ls() -> Result<()> { println!("\ntest_fcvm_ls"); println!("============"); - // Requires root for bridged networking (more reliable health checks) - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_fcvm_ls: requires root for bridged networking"); - return Ok(()); - } - let fcvm_path = common::find_fcvm_binary()?; let vm_name = format!("ls-test-{}", std::process::id()); @@ -440,12 +416,6 @@ async fn test_custom_command() -> Result<()> { println!("\ntest_custom_command"); println!("==================="); - // Requires root for bridged networking (more reliable for custom commands) - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_custom_command: requires root for bridged networking"); - return Ok(()); - } - let vm_name = format!("cmd-test-{}", std::process::id()); // Use nginx:alpine with a custom command that: diff --git a/tests/test_sanity.rs b/tests/test_sanity.rs index 0356590f..65355c00 100644 --- a/tests/test_sanity.rs +++ b/tests/test_sanity.rs @@ -14,6 +14,7 @@ async fn test_sanity_bridged() -> Result<()> { #[tokio::test] async fn test_sanity_rootless() -> Result<()> { + common::require_non_root("test_sanity_rootless")?; sanity_test_impl("rootless").await } @@ -26,7 +27,7 @@ async fn sanity_test_impl(network: &str) -> Result<()> { // Start the VM using spawn_fcvm helper (uses Stdio::inherit to prevent deadlock) println!("Starting VM..."); - let vm_name = format!("sanity-test-{}", 
network); + let (vm_name, _, _, _) = common::unique_names(&format!("sanity-{}", network)); let (mut child, fcvm_pid) = common::spawn_fcvm(&[ "podman", "run", diff --git a/tests/test_signal_cleanup.rs b/tests/test_signal_cleanup.rs index 6bb62676..beb6930f 100644 --- a/tests/test_signal_cleanup.rs +++ b/tests/test_signal_cleanup.rs @@ -52,12 +52,6 @@ fn send_signal(pid: u32, signal: &str) -> Result<()> { /// Test that SIGINT properly kills the VM and cleans up firecracker #[test] fn test_sigint_kills_firecracker() -> Result<()> { - // This test requires root for bridged networking - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_sigint_kills_firecracker: requires root"); - return Ok(()); - } - println!("\ntest_sigint_kills_firecracker"); // Get initial firecracker count @@ -76,12 +70,13 @@ fn test_sigint_kills_firecracker() -> Result<()> { // Start fcvm in background let fcvm_path = common::find_fcvm_binary()?; + let vm_name = format!("signal-int-{}", std::process::id()); let mut fcvm = Command::new(&fcvm_path) .args([ "podman", "run", "--name", - "signal-test", + &vm_name, "--network", "bridged", "nginx:alpine", @@ -210,22 +205,17 @@ fn test_sigint_kills_firecracker() -> Result<()> { /// Test that SIGTERM properly kills the VM and cleans up firecracker #[test] fn test_sigterm_kills_firecracker() -> Result<()> { - // This test requires root for bridged networking - if !nix::unistd::geteuid().is_root() { - eprintln!("Skipping test_sigterm_kills_firecracker: requires root"); - return Ok(()); - } - println!("\ntest_sigterm_kills_firecracker"); // Start fcvm in background let fcvm_path = common::find_fcvm_binary()?; + let vm_name = format!("signal-term-{}", std::process::id()); let mut fcvm = Command::new(&fcvm_path) .args([ "podman", "run", "--name", - "signal-test-term", + &vm_name, "--network", "bridged", "nginx:alpine", diff --git a/tests/test_snapshot_clone.rs b/tests/test_snapshot_clone.rs index 6f8716f6..6d6d5a9b 100644 --- 
a/tests/test_snapshot_clone.rs +++ b/tests/test_snapshot_clone.rs @@ -17,12 +17,14 @@ use tokio::sync::Mutex; /// Full snapshot/clone workflow test with rootless networking (10 clones) #[tokio::test] async fn test_snapshot_clone_rootless_10() -> Result<()> { + common::require_non_root("test_snapshot_clone_rootless_10")?; snapshot_clone_test_impl("rootless", 10).await } /// Stress test with 100 clones using rootless networking #[tokio::test] async fn test_snapshot_clone_stress_100() -> Result<()> { + common::require_non_root("test_snapshot_clone_stress_100")?; snapshot_clone_test_impl("rootless", 100).await } @@ -36,8 +38,7 @@ struct CloneResult { } async fn snapshot_clone_test_impl(network: &str, num_clones: usize) -> Result<()> { - let snapshot_name = format!("test-snapshot-{}", network); - let baseline_name = format!("baseline-{}", network); + let (baseline_name, _, snapshot_name, _) = common::unique_names(&format!("snap-{}", network)); let test_start = Instant::now(); println!("\n╔═══════════════════════════════════════════════════════════════╗"); @@ -145,7 +146,7 @@ async fn snapshot_clone_test_impl(network: &str, num_clones: usize) -> Result<() let mut spawn_handles = Vec::new(); for i in 0..num_clones { - let clone_name = format!("clone-{}-{}", network, i); + let clone_name = format!("{}-{}", baseline_name.replace("-base-", "-clone-"), i); let network = network.to_string(); let results = Arc::clone(&results); let clone_pids = Arc::clone(&clone_pids); @@ -191,7 +192,7 @@ async fn snapshot_clone_test_impl(network: &str, num_clones: usize) -> Result<() }; results.lock().await.push(CloneResult { - name: clone_name, + name: clone_name.clone(), pid: clone_pid, spawn_time_ms: spawn_ms, health_time_secs: health_time, @@ -200,7 +201,7 @@ async fn snapshot_clone_test_impl(network: &str, num_clones: usize) -> Result<() } Err(e) => { results.lock().await.push(CloneResult { - name: clone_name, + name: clone_name.clone(), pid: 0, spawn_time_ms: 
spawn_start.elapsed().as_secs_f64() * 1000.0, health_time_secs: None, @@ -378,8 +379,7 @@ async fn snapshot_clone_test_impl(network: &str, num_clones: usize) -> Result<() /// isolation, Firecracker would try to bind to the same socket path stored in vmstate.bin. #[tokio::test] async fn test_clone_while_baseline_running() -> Result<()> { - let snapshot_name = "test-clone-running"; - let baseline_name = "baseline-running"; + let (baseline_name, clone_name, snapshot_name, _) = common::unique_names("running"); println!("\n╔═══════════════════════════════════════════════════════════════╗"); println!("║ Clone While Baseline Running Test ║"); @@ -394,12 +394,12 @@ async fn test_clone_while_baseline_running() -> Result<()> { "podman", "run", "--name", - baseline_name, + &baseline_name, "--network", "bridged", common::TEST_IMAGE, ], - baseline_name, + &baseline_name, ) .await .context("spawning baseline VM")?; @@ -417,7 +417,7 @@ async fn test_clone_while_baseline_running() -> Result<()> { "--pid", &baseline_pid.to_string(), "--tag", - snapshot_name, + &snapshot_name, ]) .output() .await @@ -437,19 +437,18 @@ async fn test_clone_while_baseline_running() -> Result<()> { // Step 4: Start memory server println!("\nStep 4: Starting memory server..."); let (_serve_child, serve_pid) = - common::spawn_fcvm_with_logs(&["snapshot", "serve", snapshot_name], "uffd-server") + common::spawn_fcvm_with_logs(&["snapshot", "serve", &snapshot_name], "uffd-server") .await .context("spawning memory server")?; // Wait for serve to be ready (poll for socket) - common::poll_serve_ready(snapshot_name, serve_pid, 30).await?; + common::poll_serve_ready(&snapshot_name, serve_pid, 30).await?; println!(" ✓ Memory server ready (PID: {})", serve_pid); // Step 5: Clone WHILE baseline is still running (this is the key test!) 
println!("\nStep 5: Spawning clone while baseline is STILL RUNNING..."); println!(" (This tests vsock socket isolation via mount namespace)"); - let clone_name = "clone-running"; let serve_pid_str = serve_pid.to_string(); let (_clone_child, clone_pid) = common::spawn_fcvm_with_logs( &[ @@ -458,11 +457,11 @@ async fn test_clone_while_baseline_running() -> Result<()> { "--pid", &serve_pid_str, "--name", - clone_name, + &clone_name, "--network", "bridged", ], - clone_name, + &clone_name, ) .await .context("spawning clone while baseline running")?; @@ -533,12 +532,13 @@ async fn test_clone_internet_bridged() -> Result<()> { /// Test that clones can reach the internet in rootless mode #[tokio::test] async fn test_clone_internet_rootless() -> Result<()> { + common::require_non_root("test_clone_internet_rootless")?; clone_internet_test_impl("rootless").await } async fn clone_internet_test_impl(network: &str) -> Result<()> { - let snapshot_name = format!("test-internet-{}", network); - let baseline_name = format!("baseline-internet-{}", network); + let (baseline_name, clone_name, snapshot_name, _) = + common::unique_names(&format!("inet-{}", network)); println!("\n╔═══════════════════════════════════════════════════════════════╗"); println!( @@ -608,7 +608,6 @@ async fn clone_internet_test_impl(network: &str) -> Result<()> { // Step 4: Spawn clone println!("\nStep 4: Spawning clone..."); - let clone_name = format!("clone-internet-{}", network); let serve_pid_str = serve_pid.to_string(); let (_clone_child, clone_pid) = common::spawn_fcvm_with_logs( &[ @@ -762,6 +761,429 @@ async fn test_clone_http(fcvm_path: &std::path::Path, clone_pid: u32) -> Result< } } +/// Test port forwarding on clones with bridged networking +/// +/// Verifies that --publish correctly forwards ports to cloned VMs. +/// This tests the full port forwarding path: host → iptables DNAT → clone VM → nginx. 
+#[tokio::test] +async fn test_clone_port_forward_bridged() -> Result<()> { + let (baseline_name, clone_name, snapshot_name, _) = common::unique_names("pf-bridged"); + + println!("\n╔═══════════════════════════════════════════════════════════════╗"); + println!("║ Clone Port Forwarding Test (bridged) ║"); + println!("╚═══════════════════════════════════════════════════════════════╝\n"); + + let fcvm_path = common::find_fcvm_binary()?; + + // Step 1: Start baseline VM with nginx + println!("Step 1: Starting baseline VM with nginx..."); + let (_baseline_child, baseline_pid) = common::spawn_fcvm_with_logs( + &[ + "podman", + "run", + "--name", + &baseline_name, + "--network", + "bridged", + common::TEST_IMAGE, + ], + &baseline_name, + ) + .await + .context("spawning baseline VM")?; + + println!(" Waiting for baseline VM to become healthy..."); + common::poll_health_by_pid(baseline_pid, 60).await?; + println!(" ✓ Baseline VM healthy (PID: {})", baseline_pid); + + // Step 2: Create snapshot + println!("\nStep 2: Creating snapshot..."); + let output = tokio::process::Command::new(&fcvm_path) + .args([ + "snapshot", + "create", + "--pid", + &baseline_pid.to_string(), + "--tag", + &snapshot_name, + ]) + .output() + .await + .context("running snapshot create")?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!("Snapshot creation failed: {}", stderr); + } + println!(" ✓ Snapshot created"); + + // Kill baseline - we only need the snapshot for clones + common::kill_process(baseline_pid).await; + println!(" Killed baseline VM (only need snapshot)"); + + // Step 3: Start memory server + println!("\nStep 3: Starting memory server..."); + let (_serve_child, serve_pid) = + common::spawn_fcvm_with_logs(&["snapshot", "serve", &snapshot_name], "uffd-server") + .await + .context("spawning memory server")?; + + // Wait for serve to be ready (poll for socket) + common::poll_serve_ready(&snapshot_name, serve_pid, 30).await?; + 
println!(" ✓ Memory server ready (PID: {})", serve_pid); + + // Step 4: Spawn clone WITH port forwarding + println!("\nStep 4: Spawning clone with --publish 19080:80..."); + let serve_pid_str = serve_pid.to_string(); + let (_clone_child, clone_pid) = common::spawn_fcvm_with_logs( + &[ + "snapshot", + "run", + "--pid", + &serve_pid_str, + "--name", + &clone_name, + "--network", + "bridged", + "--publish", + "19080:80", + ], + &clone_name, + ) + .await + .context("spawning clone with port forward")?; + + // Wait for clone to become healthy + println!(" Waiting for clone to become healthy..."); + common::poll_health_by_pid(clone_pid, 60).await?; + println!(" ✓ Clone is healthy (PID: {})", clone_pid); + + // Step 5: Test port forwarding + println!("\nStep 5: Testing port forwarding..."); + + // Get clone's guest IP from state + let output = tokio::process::Command::new(&fcvm_path) + .args(["ls", "--json", "--pid", &clone_pid.to_string()]) + .output() + .await + .context("getting clone state")?; + + let stdout = String::from_utf8_lossy(&output.stdout); + let guest_ip: String = serde_json::from_str::>(&stdout) + .ok() + .and_then(|v| v.first().cloned()) + .and_then(|v| { + v.get("config")? + .get("network")? + .get("guest_ip")? 
+ .as_str() + .map(|s| s.to_string()) + }) + .unwrap_or_default(); + + println!(" Clone guest IP: {}", guest_ip); + + // Test 1: Direct access to guest IP + println!(" Testing direct access to guest..."); + let direct_result = tokio::process::Command::new("curl") + .args(["-s", "--max-time", "10", &format!("http://{}:80", guest_ip)]) + .output() + .await; + + let direct_works = direct_result + .map(|o| o.status.success() && !o.stdout.is_empty()) + .unwrap_or(false); + println!( + " Direct access: {}", + if direct_works { "✓ OK" } else { "✗ FAIL" } + ); + + // Test 2: Access via host's primary IP and forwarded port + let host_ip = tokio::process::Command::new("hostname") + .arg("-I") + .output() + .await + .ok() + .and_then(|o| String::from_utf8(o.stdout).ok()) + .and_then(|s| s.split_whitespace().next().map(|ip| ip.to_string())) + .unwrap_or_else(|| "127.0.0.1".to_string()); + + println!(" Testing access via host IP {}:19080...", host_ip); + let forward_result = tokio::process::Command::new("curl") + .args([ + "-s", + "--max-time", + "10", + &format!("http://{}:19080", host_ip), + ]) + .output() + .await; + + let forward_works = forward_result + .map(|o| o.status.success() && !o.stdout.is_empty()) + .unwrap_or(false); + println!( + " Port forward (host IP): {}", + if forward_works { "✓ OK" } else { "✗ FAIL" } + ); + + // Test 3: Access via localhost + println!(" Testing access via localhost:19080..."); + let localhost_result = tokio::process::Command::new("curl") + .args(["-s", "--max-time", "10", "http://127.0.0.1:19080"]) + .output() + .await; + + let localhost_works = localhost_result + .map(|o| o.status.success() && !o.stdout.is_empty()) + .unwrap_or(false); + println!( + " Localhost access: {}", + if localhost_works { + "✓ OK" + } else { + "✗ FAIL" + } + ); + + // Cleanup + println!("\nCleaning up..."); + common::kill_process(clone_pid).await; + println!(" Killed clone"); + common::kill_process(serve_pid).await; + println!(" Killed memory server"); + + // 
Results + println!("\n╔═══════════════════════════════════════════════════════════════╗"); + println!("║ RESULTS ║"); + println!("╠═══════════════════════════════════════════════════════════════╣"); + println!( + "║ Direct access to guest: {} ║", + if direct_works { + "✓ PASSED" + } else { + "✗ FAILED" + } + ); + println!( + "║ Port forward (host IP): {} ║", + if forward_works { + "✓ PASSED" + } else { + "✗ FAILED" + } + ); + println!( + "║ Localhost port forward: {} ║", + if localhost_works { + "✓ PASSED" + } else { + "✗ FAILED" + } + ); + println!("╚═══════════════════════════════════════════════════════════════╝"); + + // All port forwarding methods must work + if direct_works && forward_works && localhost_works { + println!("\n✅ CLONE PORT FORWARDING TEST PASSED!"); + Ok(()) + } else { + anyhow::bail!( + "Clone port forwarding test failed: direct={}, forward={}, localhost={}", + direct_works, + forward_works, + localhost_works + ) + } +} + +/// Test port forwarding on clones with rootless networking +/// +/// This is the key test - rootless clones with port forwarding. +/// Port forwarding is done via slirp4netns API, accessing via unique loopback IP. 
+#[tokio::test] +async fn test_clone_port_forward_rootless() -> Result<()> { + common::require_non_root("test_clone_port_forward_rootless")?; + + let (baseline_name, clone_name, snapshot_name, _) = common::unique_names("pf-rootless"); + + println!("\n╔═══════════════════════════════════════════════════════════════╗"); + println!("║ Clone Port Forwarding Test (rootless) ║"); + println!("╚═══════════════════════════════════════════════════════════════╝\n"); + + let fcvm_path = common::find_fcvm_binary()?; + + // Step 1: Start baseline VM with nginx (rootless) + println!("Step 1: Starting baseline VM with nginx (rootless)..."); + let (_baseline_child, baseline_pid) = common::spawn_fcvm_with_logs( + &[ + "podman", + "run", + "--name", + &baseline_name, + "--network", + "rootless", + common::TEST_IMAGE, + ], + &baseline_name, + ) + .await + .context("spawning baseline VM")?; + + println!(" Waiting for baseline VM to become healthy..."); + common::poll_health_by_pid(baseline_pid, 90).await?; + println!(" ✓ Baseline VM healthy (PID: {})", baseline_pid); + + // Step 2: Create snapshot + println!("\nStep 2: Creating snapshot..."); + let output = tokio::process::Command::new(&fcvm_path) + .args([ + "snapshot", + "create", + "--pid", + &baseline_pid.to_string(), + "--tag", + &snapshot_name, + ]) + .output() + .await + .context("running snapshot create")?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!("Snapshot creation failed: {}", stderr); + } + println!(" ✓ Snapshot created"); + + // Kill baseline - we only need the snapshot for clones + common::kill_process(baseline_pid).await; + println!(" Killed baseline VM (only need snapshot)"); + + // Step 3: Start memory server + println!("\nStep 3: Starting memory server..."); + let (_serve_child, serve_pid) = + common::spawn_fcvm_with_logs(&["snapshot", "serve", &snapshot_name], "uffd-server") + .await + .context("spawning memory server")?; + + // Wait for serve to be 
ready (poll for socket) + common::poll_serve_ready(&snapshot_name, serve_pid, 30).await?; + println!(" ✓ Memory server ready (PID: {})", serve_pid); + + // Step 4: Spawn clone WITH port forwarding (rootless) + // Use port 8080 (unprivileged) since rootless can't bind to 80 + println!("\nStep 4: Spawning clone with --publish 8080:80 (rootless)..."); + let serve_pid_str = serve_pid.to_string(); + let (_clone_child, clone_pid) = common::spawn_fcvm_with_logs( + &[ + "snapshot", + "run", + "--pid", + &serve_pid_str, + "--name", + &clone_name, + "--network", + "rootless", + "--publish", + "8080:80", + ], + &clone_name, + ) + .await + .context("spawning clone with port forward")?; + + // Wait for clone to become healthy + println!(" Waiting for clone to become healthy..."); + common::poll_health_by_pid(clone_pid, 60).await?; + println!(" ✓ Clone is healthy (PID: {})", clone_pid); + + // Step 5: Test port forwarding via loopback IP + println!("\nStep 5: Testing port forwarding..."); + + // Get clone's loopback IP from state (rootless uses 127.x.y.z) + let output = tokio::process::Command::new(&fcvm_path) + .args(["ls", "--json", "--pid", &clone_pid.to_string()]) + .output() + .await + .context("getting clone state")?; + + let stdout = String::from_utf8_lossy(&output.stdout); + let loopback_ip: String = serde_json::from_str::>(&stdout) + .ok() + .and_then(|v| v.first().cloned()) + .and_then(|v| { + v.get("config")? + .get("network")? + .get("loopback_ip")? 
+ .as_str() + .map(|s| s.to_string()) + }) + .unwrap_or_default(); + + println!(" Clone loopback IP: {}", loopback_ip); + + // Test: Access via loopback IP and forwarded port + println!(" Testing access via loopback {}:8080...", loopback_ip); + let loopback_result = tokio::process::Command::new("curl") + .args([ + "-s", + "--max-time", + "10", + &format!("http://{}:8080", loopback_ip), + ]) + .output() + .await; + + let loopback_works = loopback_result + .as_ref() + .map(|o| o.status.success() && !o.stdout.is_empty()) + .unwrap_or(false); + + if let Ok(ref out) = loopback_result { + if loopback_works { + println!(" Loopback access: ✓ OK"); + let response = String::from_utf8_lossy(&out.stdout); + println!( + " Response: {} bytes (nginx welcome page)", + response.len() + ); + } else { + println!(" Loopback access: ✗ FAIL"); + println!(" stderr: {}", String::from_utf8_lossy(&out.stderr)); + } + } else { + println!(" Loopback access: ✗ FAIL (request error)"); + } + + // Cleanup + println!("\nCleaning up..."); + common::kill_process(clone_pid).await; + println!(" Killed clone"); + common::kill_process(serve_pid).await; + println!(" Killed memory server"); + + // Results + println!("\n╔═══════════════════════════════════════════════════════════════╗"); + println!("║ RESULTS ║"); + println!("╠═══════════════════════════════════════════════════════════════╣"); + println!( + "║ Loopback port forward: {} ║", + if loopback_works { + "✓ PASSED" + } else { + "✗ FAILED" + } + ); + println!("╚═══════════════════════════════════════════════════════════════╝"); + + if loopback_works { + println!("\n✅ ROOTLESS CLONE PORT FORWARDING TEST PASSED!"); + Ok(()) + } else { + anyhow::bail!("Rootless clone port forwarding test failed") + } +} + /// Test snapshot run --exec with bridged networking #[tokio::test] async fn test_snapshot_run_exec_bridged() -> Result<()> { @@ -771,13 +1193,13 @@ async fn test_snapshot_run_exec_bridged() -> Result<()> { /// Test snapshot run --exec with rootless 
networking #[tokio::test] async fn test_snapshot_run_exec_rootless() -> Result<()> { + common::require_non_root("test_snapshot_run_exec_rootless")?; snapshot_run_exec_test_impl("rootless").await } /// Implementation of snapshot run --exec test async fn snapshot_run_exec_test_impl(network: &str) -> Result<()> { - let snapshot_name = format!("test-exec-{}", network); - let baseline_name = format!("baseline-exec-{}", network); + let (baseline_name, _, snapshot_name, _) = common::unique_names(&format!("exec-{}", network)); println!("\n╔═══════════════════════════════════════════════════════════════╗"); println!(