From d24ef7acb0d0f36c828677aa58f706950ea824ba Mon Sep 17 00:00:00 2001 From: ejc3 Date: Mon, 5 Jan 2026 19:25:21 +0000 Subject: [PATCH 1/6] Add TTY and interactive mode support for podman run and exec Implement full PTY support matching docker/podman -i and -t flags: - `-i`: Keep stdin open (forward user input) - `-t`: Allocate pseudo-TTY (colors, line editing, vim) - `-it`: Both (interactive shell) Host side (src/commands/tty.rs): - Raw terminal mode via tcsetattr/cfmakeraw - Binary exec_proto framing over vsock - Async reader/writer tasks for stdin/stdout Guest side (fc-agent/src/tty.rs): - PTY allocation via openpty() - Fork child with PTY as controlling terminal - Relay between vsock and PTY master CLI changes: - Add -i/-t flags to `fcvm podman run` - Change exec `name` from positional to `--name` flag - Fix argument conflict between --pid and trailing command Documentation: - Add TTY architecture section to DESIGN.md - Add Interactive Mode section to README.md - Update CLI examples to use --name flag Tested: - 4 TTY tests pass (tty, interactive, parallel stress) - 6 exec tests pass - Manual verification of README examples --- .claude/CLAUDE.md | 17 +- Cargo.toml | 2 +- DESIGN.md | 222 ++++++---- Makefile | 9 +- README.md | 918 ++++++++++++++++++++--------------------- fc-agent/src/main.rs | 386 +++++------------ fc-agent/src/tty.rs | 409 ++++++++++++++++++ src/cli/args.rs | 38 +- src/commands/common.rs | 5 +- src/commands/exec.rs | 156 +------ src/commands/mod.rs | 1 + src/commands/podman.rs | 107 ++++- src/commands/tty.rs | 277 +++++++++++++ src/utils.rs | 2 + tests/common/mod.rs | 2 +- tests/test_exec.rs | 575 +++++++++++++++++++++++++- tests/test_kvm.rs | 2 +- 17 files changed, 2093 insertions(+), 1035 deletions(-) create mode 100644 fc-agent/src/tty.rs create mode 100644 src/commands/tty.rs diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 2f4b0207..467d6259 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -499,13 +499,22 @@ See `Containerfile.libfuse-remap` and `Containerfile.pjdfstest` for examples. **CRITICAL: VM commands BLOCK the terminal.** You MUST use Claude's `run_in_background: true` feature. +**PREFER NON-ROOT TESTING**: Run tests without sudo when possible. Rootless networking mode (`--network rootless`, the default) doesn't require sudo. Only use `sudo` for: +- `--network bridged` tests +- Operations that explicitly need root (iptables, privileged containers) + +The ubuntu user has KVM access (`kvm` group), so `fcvm podman run` works without sudo in rootless mode. + ```bash -# WRONG - This blocks forever, wastes context, and times out -sudo fcvm podman run --name test nginx:alpine +# PREFERRED - Rootless mode (no sudo needed, use run_in_background: true) +./target/release/fcvm podman run --name test alpine:latest 2>&1 | tee /tmp/vm.log +# Defaults to --network rootless +# Get PID from state and use exec: +ls -t /mnt/fcvm-btrfs/state/*.json | head -1 | xargs cat | jq -r '.pid' +./target/release/fcvm exec --pid -- hostname -# CORRECT - Run VM in background, then use exec to test +# ONLY WHEN NEEDED - Bridged mode (requires sudo) sudo ./target/release/fcvm podman run --name test --network bridged nginx:alpine 2>&1 | tee /tmp/vm.log -# Use run_in_background: true in Bash tool call # Then sleep and check logs: sleep 30 grep healthy /tmp/vm.log diff --git a/Cargo.toml b/Cargo.toml index 1c58e6d8..7902d16f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,7 +44,7 @@ serde_json = "1" sha2 = "0.10" hex = "0.4" toml = "0.8" -tokio = { version = "1", features = ["rt-multi-thread", "macros", "process", "fs", "signal", "io-util", "sync", "time"] } +tokio = { version = "1", features = ["rt-multi-thread", "macros", "process", "fs", "signal", "io-util", "io-std", "sync", "time"] } reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } which = "6" uuid = { version = "1", features = ["v4", "serde"] } diff --git a/DESIGN.md b/DESIGN.md index 6cb6915a..3117e33c 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -43,6 +43,8 @@ 2. **`fcvm exec` Command** - Execute commands in running VMs - Supports running in guest OS or inside container (`-c` flag) + - Interactive mode with stdin forwarding (`-i` flag) + - TTY allocation for terminal apps (`-t` flag) 3. **`fcvm snapshot` Commands** - `fcvm snapshot create`: Create snapshot from running VM @@ -907,120 +909,168 @@ The guest is configured to support rootless Podman: --- -## CLI Interface - -### Commands +## TTY & Interactive Mode -#### `fcvm setup` +fcvm provides full interactive terminal support for both `podman run -it` and `exec -it`, matching docker/podman semantics. -**Purpose**: Download kernel and create rootfs (first-time setup). +### Architecture -**Usage**: -```bash -fcvm setup +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ Host Process (fcvm) │ +│ ┌──────────────┐ ┌──────────────────┐ ┌──────────────────────┐│ +│ │ User Terminal│────►│ Raw Mode Handler │────►│ exec_proto Encoder ││ +│ │ (stdin/out) │◄────│ (tcsetattr) │◄────│ (binary framing) ││ +│ └──────────────┘ └──────────────────┘ └──────────┬───────────┘│ +│ │ │ +└───────────────────────────────────────────────────────────┼────────────┘ + │ vsock + ▼ +┌───────────────────────────────────────────────────────────────────────┐ +│ Guest (fc-agent) │ │ +│ ┌──────────────────┐ ┌──────────────┐ ┌──────────▼─────────┐ │ +│ │ exec_proto │────►│ PTY Master │────►│ Container Process │ │ +│ │ Decoder │◄────│ (openpty) │◄────│ (sh, vim, etc.) │ │ +│ └──────────────────┘ └──────────────┘ └────────────────────┘ │ +└───────────────────────────────────────────────────────────────────────┘ ``` -**What it does:** -1. Downloads Kata kernel (~15MB, cached by URL hash) -2. Downloads packages via `podman run ubuntu:noble` with `apt-get install --download-only` -3. Creates Layer 2 rootfs (~10GB): boots VM, installs packages, writes config -4. Verifies setup by checking `/etc/fcvm-setup-complete` marker file -5. Creates fc-agent initrd (embeds statically-linked fc-agent binary) - -Takes 5-10 minutes on first run. Subsequent runs are instant (cached by content hash). +### Flags -**Note**: Must be run before `fcvm podman run` with bridged networking. For rootless mode, you can use `--setup` flag on `fcvm podman run` instead. +| Flag | Meaning | Implementation | +|------|---------|----------------| +| `-i` | Keep stdin open | Host reads stdin, sends via STDIN messages | +| `-t` | Allocate PTY | Guest allocates PTY master/slave pair | +| `-it` | Both | Interactive shell with full terminal support | +| neither | Plain exec | Pipes for stdin/stdout, no PTY | -#### `fcvm podman run` +### Wire Protocol (exec_proto) -**Purpose**: Launch a container in a new Firecracker VM. +Binary framed protocol over vsock for efficient transport of terminal data: -**Usage**: -```bash -fcvm podman run --name [OPTIONS] +``` +┌─────────┬─────────┬──────────────────┐ +│ Type(1) │ Len(4) │ Payload(N) │ +└─────────┴─────────┴──────────────────┘ ``` -**Arguments**: -- `IMAGE` - Container image (e.g., `nginx:alpine`, `ghcr.io/org/app:v1.0`) +**Message Types**: +- `DATA (0x00)`: Output from command (stdout/stderr) +- `STDIN (0x01)`: Input from user terminal +- `EXIT (0x02)`: Command exit code (4 bytes, big-endian i32) +- `ERROR (0x03)`: Error message string -**Options**: +**Why binary framing?** +- Handles escape sequences (Ctrl+C = 0x03, Ctrl+D = 0x04) +- Preserves all bytes without escaping +- Efficient for high-throughput terminal output + +### Host-Side Implementation (`src/commands/tty.rs`) + +```rust +// 1. Set terminal to raw mode +let original = tcgetattr(stdin)?; +let mut raw = original.clone(); +cfmakeraw(&mut raw); +tcsetattr(stdin, TCSANOW, &raw)?; + +// 2. Spawn reader/writer tasks +tokio::spawn(async move { + // Reader: terminal stdin → vsock + loop { + let n = stdin.read(&mut buf)?; + exec_proto::write_stdin(&mut vsock, &buf[..n])?; + } +}); + +tokio::spawn(async move { + // Writer: vsock → terminal stdout + loop { + match exec_proto::Message::read_from(&mut vsock)? { + Message::Data(data) => stdout.write_all(&data)?, + Message::Exit(code) => return code, + _ => {} + } + } +}); ``` ---name VM name (required) ---cpu vCPU count (default: 2) ---mem Memory in MiB (default: 2048) ---network Network mode: rootless|bridged (default: rootless) ---map Volume mount via FUSE (can specify multiple) ---env Environment variable (can specify multiple) ---cmd Container command override ---publish Port publish (can specify multiple) ---balloon Memory balloon target ---health-check HTTP health check URL ---privileged Run container in privileged mode ---setup Run setup if kernel/rootfs missing (rootless only) + +### Guest-Side Implementation (`fc-agent/src/tty.rs`) + +```rust +// 1. Allocate PTY +let (master, slave) = openpty()?; + +// 2. Fork child process +match fork() { + 0 => { + // Child: setup PTY as controlling terminal + setsid(); + ioctl(slave, TIOCSCTTY, 0); + dup2(slave, STDIN); + dup2(slave, STDOUT); + dup2(slave, STDERR); + execvp(command, args); + } + pid => { + // Parent: relay between vsock and PTY master + // Reader thread: PTY master → vsock (DATA messages) + // Writer thread: vsock (STDIN messages) → PTY master + } +} ``` -**Examples**: -```bash -# Simple nginx (bridged networking, requires sudo) -sudo fcvm podman run --name my-nginx nginx:alpine +### Supported Features -# Rootless mode (no sudo required) -fcvm podman run --name my-nginx --network rootless nginx:alpine +- **Escape sequences**: Colors (ANSI), cursor movement, screen clearing +- **Control characters**: Ctrl+C (SIGINT), Ctrl+D (EOF), Ctrl+Z (SIGTSTP) +- **Line editing**: Arrow keys, backspace, history (shell-dependent) +- **Full-screen apps**: vim, htop, less, nano, tmux -# With port forwarding -sudo fcvm podman run --name web --publish 8080:80 nginx:alpine +### Limitations -# With volumes and environment -sudo fcvm podman run \ - --name db \ - --map /host/data:/data \ - --env DB_HOST=localhost \ - postgres:15 +- **Window resize (SIGWINCH)**: Not implemented. Terminal size is fixed at session start. +- **Job control**: Background/foreground (`bg`, `fg`) work within the container, but signals are not forwarded to the host. -# With health check -sudo fcvm podman run \ - --name web \ - --health-check http://localhost/health \ - nginx:alpine +--- -# High CPU/memory with balloon -sudo fcvm podman run \ - --name ml \ - --cpu 8 \ - --mem 8192 \ - --balloon 4096 \ - ml-training:latest -``` +## CLI Interface -#### `fcvm exec` +> **Full CLI documentation with examples**: See [README.md](README.md#cli-reference) -**Purpose**: Execute a command in a running VM. +### Command Summary -**Usage**: -```bash -fcvm exec --pid [OPTIONS] -- [ARGS...] -``` +| Command | Purpose | +|---------|---------| +| `fcvm setup` | Download kernel, create rootfs (first-time setup, ~5-10 min) | +| `fcvm podman run` | Launch container in Firecracker VM | +| `fcvm exec` | Execute command in running VM/container | +| `fcvm ls` | List running VMs | +| `fcvm snapshot create` | Create snapshot from running VM | +| `fcvm snapshot serve` | Start UFFD memory server for cloning | +| `fcvm snapshot run` | Spawn clone from memory server | +| `fcvm snapshots` | List available snapshots | -**Options**: -``` ---pid PID of the fcvm process managing the VM (required) --c, --container Run command inside the container (not just guest OS) -``` +### Key CLI Design Decisions -**Examples**: -```bash -# Run command in guest OS -sudo fcvm exec --pid 12345 -- ls -la / +1. **Trailing arguments**: Both `podman run` and `exec` support trailing args after `--`: + ```bash + fcvm podman run --name test alpine:latest echo "hello" + fcvm exec --name test -it -- sh -c "ls -la" + ``` -# Run command inside container -sudo fcvm exec --pid 12345 -c -- curl -s http://localhost/health +2. **`--name` vs `--pid`**: VM identification uses named flags (not positional): + ```bash + fcvm exec --name my-vm -- hostname # By name + fcvm exec --pid 12345 -- hostname # By PID + ``` -# Check egress connectivity from guest -sudo fcvm exec --pid 12345 -- curl -s ifconfig.me +3. **Interactive flags** (`-i`, `-t`): Match docker/podman semantics: + - `-i`: Keep stdin open + - `-t`: Allocate PTY + - `-it`: Both (interactive shell) -# Check egress connectivity from container -sudo fcvm exec --pid 12345 -c -- wget -q -O - http://ifconfig.me -``` +4. **Network modes**: `--network rootless` (default, no sudo) or `--network bridged` (sudo required) #### `fcvm snapshot create` diff --git a/Makefile b/Makefile index 93a8397d..d3b6c113 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,13 @@ else NEXTEST_IGNORED := endif +# Disable retries when FILTER is set (debugging specific tests) +ifdef FILTER +NEXTEST_RETRIES := +else +NEXTEST_RETRIES := --retries 2 +endif + # Default log level: fcvm debug, suppress FUSE spam # Override with: RUST_LOG=debug make test-root TEST_LOG ?= fcvm=debug,health-monitor=info,fuser=warn,fuse_backend_rs=warn,passthrough=warn @@ -199,7 +206,7 @@ _test-root: FCVM_DATA_DIR=$(ROOT_DATA_DIR) \ CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER='sudo -E' \ CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER='sudo -E' \ - $(NEXTEST) $(NEXTEST_CAPTURE) $(NEXTEST_IGNORED) --retries 2 --features privileged-tests $(FILTER) + $(NEXTEST) $(NEXTEST_CAPTURE) $(NEXTEST_IGNORED) $(NEXTEST_RETRIES) --features privileged-tests $(FILTER) # Host targets (with setup, check-disk first to fail fast if disk is full) test-unit: show-notes check-disk build _test-unit diff --git a/README.md b/README.md index efc52b8f..a5054f82 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,7 @@ A Rust implementation that launches Firecracker microVMs to run Podman container > - Port forwarding for both regular VMs and clones > - FUSE-based host directory mapping via fuse-pipe > - Container exit code forwarding - -**fcvm builds on Firecracker** to provide a container-native experience while preserving the security isolation of hardware virtualization. +> - Interactive shell support (`-it`) with full TTY (vim, editors, colors) --- @@ -34,6 +33,44 @@ A Rust implementation that launches Firecracker microVMs to run Podman container --- +## Test Requirements + +**Container Testing (Recommended)** - All dependencies bundled: +```bash +make container-test # All tests in container (just needs podman + /dev/kvm) +``` + +See [CLAUDE.md](.claude/CLAUDE.md#makefile-targets) for all Makefile targets. + +**Native Testing** - Additional dependencies required: + +| Category | Packages | +|----------|----------| +| FUSE | fuse3, libfuse3-dev | +| pjdfstest build | autoconf, automake, libtool | +| pjdfstest runtime | perl | +| bindgen (userfaultfd-sys) | libclang-dev, clang | +| VM tests | iproute2, iptables, slirp4netns | +| Rootfs build | qemu-utils, e2fsprogs | +| User namespaces | uidmap (for newuidmap/newgidmap) | + +**pjdfstest Setup** (for POSIX compliance tests): +```bash +git clone --depth 1 https://github.com/pjd/pjdfstest /tmp/pjdfstest-check +cd /tmp/pjdfstest-check && autoreconf -ifs && ./configure && make +``` + +**Ubuntu/Debian Install**: +```bash +sudo apt-get update && sudo apt-get install -y \ + fuse3 libfuse3-dev \ + autoconf automake libtool perl \ + libclang-dev clang \ + iproute2 iptables slirp4netns \ + qemu-utils e2fsprogs \ + uidmap +``` + **Complete prerequisites**: See [`Containerfile`](Containerfile) for the full list of dependencies used in CI. This includes additional packages for kernel builds, container runtime, and testing. Running fcvm inside a VM (nested virtualization) is experimental. **Host system configuration**: @@ -65,100 +102,103 @@ sudo iptables -P FORWARD ACCEPT ## Quick Start +### Build ```bash -git clone https://github.com/ejc3/fcvm -cd fcvm -make build - -# First-time setup (downloads kernel + builds rootfs, ~5 min) -# Use sudo if btrfs storage needs to be created; skip sudo if already mounted -sudo ./target/release/fcvm setup - -# Run a container (rootless - no sudo needed) -./target/release/fcvm podman run --name test nginx:alpine - -# Exec into that container -./target/release/fcvm exec --name test -- cat /etc/os-release +# Build host CLI and guest agent +cargo build --release --workspace +``` -# One-shot run a command in the container -./target/release/fcvm podman run --name test nginx:alpine -- cat /etc/os-release +### Setup (First Time) +```bash +# Create btrfs filesystem +make setup-btrfs -# Or with bridged networking (requires sudo) -sudo ./target/release/fcvm podman run --name test --network bridged nginx:alpine +# Download kernel and create rootfs (takes 5-10 minutes first time) +fcvm setup ``` -That's it! See [Examples](#examples) for port forwarding, volumes, and more. - ---- +**What `fcvm setup` does:** +1. Downloads Kata kernel (~15MB, cached by URL hash) +2. Downloads packages via `podman run ubuntu:noble` (ensures correct Ubuntu 24.04 versions) +3. Creates Layer 2 rootfs (~10GB): boots VM, installs packages, writes config files +4. Verifies setup completed successfully (checks marker file) +5. Creates fc-agent initrd -## Why fcvm? +Subsequent runs are instant - everything is cached by content hash. -[Firecracker](https://github.com/firecracker-microvm/firecracker) is the VMM that powers AWS Lambda and Fargate. It provides secure, multi-tenant microVMs with minimal overhead. However, using Firecracker directly requires significant setup: +**Alternative: Auto-setup on first run (rootless only)** +```bash +# Skip explicit setup - does it automatically on first run +fcvm podman run --name web1 --network rootless --setup nginx:alpine +``` +The `--setup` flag triggers setup if kernel/rootfs are missing. Only works with `--network rootless` to avoid file ownership issues when running as root. -| Raw Firecracker | fcvm | -|-----------------|------| -| Download kernel + rootfs from S3, patch SSH keys | `fcvm setup` - one command | -| Configure VM via curl to Unix socket API | `fcvm podman run` - Docker/Podman syntax | -| Manual snapshot/restore orchestration | Instant cloning with UFFD + btrfs CoW (`fcvm snapshot`) | -| Manual TAP device, iptables, NAT setup | Automatic networking (bridged or rootless) | -| SSH into guest for access | `fcvm exec` via vsock (no SSH needed) | -| No container runtime integration | Native OCI container support via Podman | -| No host filesystem access | FUSE (`--map`), NFS (`--nfs`), block devices (`--disk`) | -| No port forwarding | `--publish` with automatic NAT rules | -| ~100 lines of bash per VM | Single command | +### Run a Container +```bash +# Run a one-shot command (clean output, just like docker run) +fcvm podman run --name test alpine:latest echo "hello world" +# Output: hello world ---- +# Run a service (uses rootless mode by default, no sudo needed) +fcvm podman run --name web1 public.ecr.aws/nginx/nginx:alpine -## Examples +# With port forwarding (8080 on host -> 80 in guest) +# Note: In rootless mode, use the assigned loopback IP (e.g., curl 127.0.0.2:8080) +# In bridged mode, use the veth host IP (see fcvm ls --json for the IP) +fcvm podman run --name web1 --publish 8080:80 public.ecr.aws/nginx/nginx:alpine -```bash -# Run a service (rootless mode, no sudo) -./target/release/fcvm podman run --name web1 public.ecr.aws/nginx/nginx:alpine +# With host directory mapping (via fuse-pipe) +fcvm podman run --name web1 --map /host/data:/data public.ecr.aws/nginx/nginx:alpine -# Port forwarding (8080 on host -> 80 in container) -./target/release/fcvm podman run --name web1 --publish 8080:80 public.ecr.aws/nginx/nginx:alpine +# Custom resources +fcvm podman run --name web1 --cpu 4 --mem 4096 public.ecr.aws/nginx/nginx:alpine -# Use this to find the IP it's bound to: -./target/release/fcvm ls # --json to view as json +# Bridged mode (requires sudo, uses iptables) +sudo fcvm podman run --name web1 --network bridged public.ecr.aws/nginx/nginx:alpine -# Host directory mapping (using fuse) -./target/release/fcvm podman run --name web1 --map /host/data:/data public.ecr.aws/nginx/nginx:alpine +# Interactive container (-it like docker/podman) +fcvm podman run --name shell -it alpine:latest sh # Interactive shell +fcvm podman run --name vim -it alpine:latest vi # Run vim (full TTY support) -# Custom resources -./target/release/fcvm podman run --name web1 --cpu 4 --mem 4096 public.ecr.aws/nginx/nginx:alpine +# List running VMs (sudo needed to read VM state files) +sudo fcvm ls +sudo fcvm ls --json # JSON output +sudo fcvm ls --pid 12345 # Filter by PID -# Bridged mode (requires sudo) -sudo ./target/release/fcvm podman run --name web1 --network bridged public.ecr.aws/nginx/nginx:alpine +# Execute commands (mirrors podman exec, sudo needed) +sudo fcvm exec --name web1 -- cat /etc/os-release # Run in container (default) +sudo fcvm exec --name web1 --vm -- hostname # Run in VM +sudo fcvm exec --pid 12345 -- hostname # By PID (from fcvm ls) -# List VMs and execute commands -sudo ./target/release/fcvm ls -sudo ./target/release/fcvm exec web1 -- cat /etc/os-release -sudo ./target/release/fcvm exec web1 -- bash # Interactive shell +# Interactive shell/TTY (like docker/podman -it) +sudo fcvm exec --name web1 -it -- sh # Interactive shell in container +sudo fcvm exec --name web1 -it --vm -- bash # Interactive shell in VM +sudo fcvm exec --name web1 -t -- ls -la --color=always # TTY for colors, no stdin ``` ### Snapshot & Clone Workflow ```bash # 1. Start baseline VM (using bridged, or omit --network for rootless) -sudo ./target/release/fcvm podman run --name baseline --network bridged public.ecr.aws/nginx/nginx:alpine +sudo fcvm podman run --name baseline --network bridged public.ecr.aws/nginx/nginx:alpine # 2. Create snapshot (pauses VM briefly) -sudo ./target/release/fcvm snapshot create baseline --tag nginx-warm +sudo fcvm snapshot create baseline --tag nginx-warm # 3. Start UFFD memory server (serves pages on-demand) -sudo ./target/release/fcvm snapshot serve nginx-warm +sudo fcvm snapshot serve nginx-warm # 4. Clone from snapshot (~3ms startup) -sudo ./target/release/fcvm snapshot run --pid --name clone1 --network bridged -sudo ./target/release/fcvm snapshot run --pid --name clone2 --network bridged +sudo fcvm snapshot run --pid --name clone1 --network bridged +sudo fcvm snapshot run --pid --name clone2 --network bridged # 5. Clone with port forwarding (each clone can have unique ports) -sudo ./target/release/fcvm snapshot run --pid --name web1 --network bridged --publish 8081:80 -sudo ./target/release/fcvm snapshot run --pid --name web2 --network bridged --publish 8082:80 +sudo fcvm snapshot run --pid --name web1 --network bridged --publish 8081:80 +sudo fcvm snapshot run --pid --name web2 --network bridged --publish 8082:80 # Get the host IP from fcvm ls --json, then curl it: -curl $(sudo ./target/release/fcvm ls --json | jq -r '.[] | select(.name=="web1") | .config.network.host_ip'):8081 +# curl $(sudo fcvm ls --json | jq -r '.[] | select(.name=="web1") | .config.network.host_ip'):8081 # 6. Clone and execute command (auto-cleans up after) -sudo ./target/release/fcvm snapshot run --pid --network bridged --exec "curl localhost" +sudo fcvm snapshot run --pid --network bridged --exec "curl localhost" ``` --- @@ -269,182 +309,400 @@ sudo fcvm podman run --name full \ --- -## How It Works - -### What `fcvm setup` does +## Interactive Mode & TTY -1. **Creates btrfs storage** at configured path (60GB loopback, auto-mounted) -2. **Downloads Kata kernel** (~15MB, cached by URL hash) -3. **Downloads packages** via `podman run ubuntu:noble` (ensures correct versions) -4. **Creates Layer 2 rootfs** (~10GB): boots VM, installs packages, configures services -5. **Creates fc-agent initrd** for guest agent injection +fcvm supports full interactive terminal sessions, matching docker/podman's `-i` and `-t` flags: -Everything is cached by content hash - subsequent runs are instant. +| Flag | Meaning | Use Case | +|------|---------|----------| +| `-i` | Keep stdin open | Pipe data to container | +| `-t` | Allocate pseudo-TTY | Colors, line editing | +| `-it` | Both | Interactive shell | -**Alternative: Manual btrfs setup (no sudo for fcvm)** +### Interactive Shell Examples -If you prefer to create the loopback yourself: ```bash -truncate -s 60G /mnt/fcvm-btrfs.img -sudo mkfs.btrfs /mnt/fcvm-btrfs.img -sudo mount -o loop /mnt/fcvm-btrfs.img /mnt/fcvm-btrfs -sudo chown $USER:$USER /mnt/fcvm-btrfs +# Run interactive shell in container +fcvm podman run --name shell -it alpine:latest sh -# Then setup without sudo (btrfs already exists) -fcvm setup -``` +# Run vim (full TTY - arrow keys, escape sequences work) +fcvm podman run --name editor -it alpine:latest vi /tmp/test.txt -Or just point `assets_dir` in your config to any existing btrfs path. +# Run shell in existing VM +sudo fcvm exec --name web1 -it -- sh -**Alternative: Auto-setup on first run (rootless only)** -```bash -fcvm podman run --name web1 --network rootless --setup nginx:alpine +# Pipe data (use -i without -t) +echo "hello" | fcvm podman run --name pipe -i alpine:latest cat ``` -### Networking Modes - -| Mode | Flag | Requires sudo | Port forwarding | -|------|------|---------------|-----------------| -| Rootless | `--network rootless` (default) | No | Unique loopback IP per VM | -| Bridged | `--network bridged` | Yes | iptables DNAT | +### How It Works -In rootless mode, each VM gets a unique loopback IP (127.0.0.2, 127.0.0.3, ...). Use `fcvm ls --json` to find a VM's IP. +1. **Host side**: Sets terminal to raw mode, captures all input +2. **Protocol**: Binary framed protocol over vsock (handles escape sequences, control chars) +3. **Guest side**: Allocates PTY, connects container stdin/stdout ---- +**Supported**: +- Escape sequences (colors, cursor movement) +- Control characters (Ctrl+C, Ctrl+D, Ctrl+Z) +- Line editing in shells +- Full-screen apps (vim, htop, less) -## Test Requirements +**Not yet implemented**: +- Window resize (SIGWINCH) - terminal size is fixed at session start -**Container Testing (Recommended)** - All dependencies bundled: -```bash -make container-test # All tests in container (just needs podman + /dev/kvm) -``` +--- -See [CLAUDE.md](.claude/CLAUDE.md#makefile-targets) for all Makefile targets. +## Nested Virtualization -**Native Testing** - Additional dependencies required: +> ⚠️ **Experimental Feature**: Nested virtualization (L2+) is experimental. While basic functionality works, there are known stability issues under high I/O load. See [Known Issues](#known-issues-nested) below. -| Category | Packages | -|----------|----------| -| FUSE | fuse3, libfuse3-dev | -| pjdfstest build | autoconf, automake, libtool | -| pjdfstest runtime | perl | -| bindgen (userfaultfd-sys) | libclang-dev, clang | -| VM tests | iproute2, iptables, slirp4netns | -| Rootfs build | qemu-utils, e2fsprogs | -| User namespaces | uidmap (for newuidmap/newgidmap) | +fcvm supports running VMs inside VMs using ARM64 FEAT_NV2 nested virtualization. Currently **one level of nesting works**: Host → L1 VM with full KVM support. -**pjdfstest Setup** (for POSIX compliance tests): -```bash -git clone --depth 1 https://github.com/pjd/pjdfstest /tmp/pjdfstest-check -cd /tmp/pjdfstest-check && autoreconf -ifs && ./configure && make ``` - -**Ubuntu/Debian Install**: -```bash -sudo apt-get update && sudo apt-get install -y \ - fuse3 libfuse3-dev \ - autoconf automake libtool perl \ - libclang-dev clang \ - iproute2 iptables slirp4netns \ - qemu-utils e2fsprogs \ - uidmap +┌─────────────────────────────────────────────────────────┐ +│ Host (bare metal c7g.metal) │ +│ ┌───────────────────────────────────────────────────┐ │ +│ │ L1 VM (fcvm + nested kernel profile) │ │ +│ │ ┌─────────────────────────────────────────────┐ │ │ +│ │ │ L2 VM (fcvm inside L1) │ │ │ +│ │ │ - Runs containers │ │ │ +│ │ │ - Full VM isolation │ │ │ +│ │ └─────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────┘ ``` ---- - -## Project Structure +**What Works**: Host → L1 → L2 nesting is fully functional. The `arm64.nv2` kernel boot parameter enables recursive KVM (`KVM_CAP_ARM_EL2=1`). -``` -fcvm/ -├── src/ # Host CLI (fcvm binary) -├── fc-agent/ # Guest agent (runs inside VM) -├── fuse-pipe/ # FUSE passthrough library -└── tests/ # Integration tests (16 files) -``` +**Limitation**: L3+ nesting (L1 → L2 → L3...) is blocked by FUSE-over-FUSE latency. Each nesting level adds ~3-5 seconds per filesystem request due to the multi-hop FUSE chain. See `.claude/CLAUDE.md` for technical details. -See [DESIGN.md](DESIGN.md#directory-structure) for detailed structure. +### Requirements ---- +| Requirement | Details | +|-------------|---------| +| **Hardware** | ARM64 with FEAT_NV2 (Graviton3+: c7g.metal, c7gn.metal, r7g.metal) | +| **Host kernel** | 6.18+ with `kvm-arm.mode=nested` boot parameter | +| **Nested kernel** | Pre-built from releases or `fcvm setup --kernel-profile nested --build-kernels` | +| **Firecracker** | Fork with NV2 support (configured via kernel profile) | -## CLI Reference +### Setting Up an EC2 Instance for Nested Virtualization -Run `fcvm --help` or `fcvm --help` for full options. +**Step 1: Launch a metal instance** -### Commands +```bash +# Must be a metal instance for FEAT_NV2 hardware support +# Recommended: c7g.metal, m7g.metal, r7g.metal (Graviton3) +aws ec2 run-instances \ + --instance-type c7g.metal \ + --image-id ami-0xyz... # Ubuntu 24.04 ARM64 +``` -| Command | Description | -|---------|-------------| -| `fcvm setup` | Download kernel (~15MB) and create rootfs (~10GB). Takes 5-10 min first run | -| `fcvm podman run` | Run container in Firecracker VM | -| `fcvm exec` | Execute command in running VM/container | -| `fcvm ls` | List running VMs (`--json` for JSON output) | -| `fcvm snapshot create` | Create snapshot from running VM | -| `fcvm snapshot serve` | Start UFFD memory server for cloning | -| `fcvm snapshot run` | Spawn clone from memory server | -| `fcvm snapshots` | List available snapshots | +**Step 2: Install fcvm and set up host kernel** -See [DESIGN.md](DESIGN.md#commands) for full option reference. +```bash +# Install fcvm (or build from source) +cargo install fcvm -### Key Options +# Download nested kernel profile and install as host kernel +# This also configures GRUB with kvm-arm.mode=nested +sudo fcvm setup --kernel-profile nested --install-host-kernel -**`fcvm podman run`** - Essential options: -``` ---name VM name (required) ---network rootless (default) or bridged (needs sudo) ---publish Port forward host:guest (e.g., 8080:80) ---map Volume mount host:guest (optional :ro for read-only) ---env Environment variable ---setup Auto-setup if kernel/rootfs missing (rootless only) +# Reboot into the new kernel +sudo reboot ``` -**`fcvm exec`** - Execute in VM/container: +**Step 3: Verify nested KVM is enabled** + ```bash -sudo fcvm exec my-vm -- cat /etc/os-release # In container -sudo fcvm exec my-vm --vm -- curl -s ifconfig.me # In guest OS -sudo fcvm exec my-vm -- bash # Interactive shell +# Check kernel version +uname -r # Should show 6.18-nested + +# Check nested mode is enabled +cat /sys/module/kvm/parameters/mode # Should show "nested" + +# Verify KVM works +ls -la /dev/kvm ``` ---- +
+Manual kernel build (alternative) -## Network Modes +If you prefer to build the host kernel manually: -| Mode | Flag | Root | Notes | -|------|------|------|-------| -| Rootless | `--network rootless` (default) | No | slirp4netns, no root needed | -| Bridged | `--network bridged` | Yes | iptables NAT | +```bash +# Install build dependencies +sudo apt-get update +sudo apt-get install -y build-essential flex bison bc libelf-dev libssl-dev -See [DESIGN.md](DESIGN.md#networking) for architecture details. +# Download kernel source +cd /tmp +wget https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.18.2.tar.xz +tar xf linux-6.18.2.tar.xz +cd linux-6.18.2 ---- +# Configure for ARM64 with KVM +make defconfig +./scripts/config --enable VIRTUALIZATION +./scripts/config --enable KVM +./scripts/config --enable CONFIG_FUSE_FS -## Container Behavior +# Build and install (~10-20 minutes on metal) +make -j$(nproc) +sudo make modules_install +sudo make install -- **Exit codes**: Container exit code forwarded to host via vsock -- **Logs**: Container stdout goes to host stdout, stderr to host stderr (clean output for scripting) -- **Health**: Default uses vsock ready signal; optional `--health-check` for HTTP +# Configure GRUB +sudo sed -i 's/GRUB_CMDLINE_LINUX_DEFAULT="/GRUB_CMDLINE_LINUX_DEFAULT="kvm-arm.mode=nested /' /etc/default/grub +sudo update-grub +sudo reboot +``` -See [DESIGN.md](DESIGN.md#guest-agent) for details. +
---- +### Getting the Nested Kernel -## Environment Variables +> **Note**: If you followed "Setting Up an EC2 Instance" above, the kernel is already downloaded. This section is for users who already have a host with nested KVM enabled. -| Variable | Default | Description | -|----------|---------|-------------| -| `FCVM_BASE_DIR` | `/mnt/fcvm-btrfs` | Base directory for all data | -| `RUST_LOG` | `warn` | Logging level (quiet by default; use `info` or `debug` for verbose) | +```bash +# Download pre-built kernel from GitHub releases (~20MB) +fcvm setup --kernel-profile nested ---- +# Kernel will be at /mnt/fcvm-btrfs/kernels/vmlinux-nested-6.18-aarch64-*.bin +``` -## Testing +Or build locally (takes 10-20 minutes): +```bash +fcvm setup --kernel-profile nested --build-kernels +``` -### CI Summary +The nested kernel (6.18) includes: +- **CONFIG_KVM=y** - KVM hypervisor for nested virtualization +- **EL2 support** - ARM Exception Level 2 (hypervisor mode) +- **MMFR4 patch** - Enables `arm64.nv2` boot param for NV2 capability +- **FUSE** - For volume mounts between host and guest +- **Networking** - TUN/VETH/netfilter for bridged networking in nested VMs -Every CI run exercises the full stack: +### Running Nested VMs -| Metric | Count | -|--------|-------| -| **Total Tests** | 9,290 | +**Step 1: Start outer VM with nested kernel profile** +```bash +# Uses nested kernel profile from rootfs-config.toml +sudo fcvm podman run \ + --name outer-vm \ + --network bridged \ + --kernel-profile nested \ + --privileged \ + --map /mnt/fcvm-btrfs:/mnt/fcvm-btrfs \ + --map /path/to/fcvm/binary:/opt/fcvm \ + nginx:alpine +``` + +**Step 2: Verify nested KVM works** +```bash +# Check guest sees HYP mode +fcvm exec --pid --vm -- dmesg | grep -i kvm +# Should show: "kvm [1]: VHE mode initialized successfully" + +# Verify /dev/kvm is accessible +fcvm exec --pid --vm -- ls -la /dev/kvm +``` + +**Step 3: Run inner VM** +```bash +# Inside outer VM (via exec or SSH) +cd /mnt/fcvm-btrfs +/opt/fcvm/fcvm podman run --name inner-vm --network bridged alpine:latest echo "Hello from nested VM!" +``` + +### How It Works + +1. **FCVM_NV2=1** environment variable (auto-set when `--kernel` is used) triggers fcvm to pass `--enable-nv2` to Firecracker +2. **HAS_EL2 + HAS_EL2_E2H0** vCPU features are enabled + - HAS_EL2 (bit 7): Enables virtual EL2 for guest + - HAS_EL2_E2H0 (bit 8): Forces nVHE mode (avoids timer trap storm) +3. **vCPU boots at EL2h** so guest kernel's `is_hyp_mode_available()` returns true +4. **EL2 registers initialized**: HCR_EL2, CNTHCTL_EL2, VMPIDR_EL2, VPIDR_EL2 +5. Guest kernel initializes KVM: "CPU: All CPU(s) started at EL2" +6. Nested fcvm creates VMs using the guest's KVM + +### Testing Nested Virtualization + +```bash +# Run nested virtualization tests +make test-root FILTER=kvm + +# Tests: +# - test_kvm_available_in_vm: Verifies /dev/kvm works in guest with nested profile +# - test_nested_run_fcvm_inside_vm: Full test of running fcvm inside fcvm +# - test_nested_l2: Full L1→L2 nesting with benchmarks at each level +``` + +### Nested Performance Benchmarks + +Performance at each nesting level (measured on c7g.metal, ARM64 Graviton3): + +| Metric | L1 (Host→VM) | L2 (VM→VM) | Overhead | +|--------|-------------|------------|----------| +| **Egress (curl)** | ✓ | ✓ | — | +| **Local Write** (10MB sync) | 4ms | 16ms | 4x | +| **Local Read** (10MB) | 2ms | 14ms | 7x | +| **FUSE Write** (10MB sync) | 83ms | 295ms | 3.6x | +| **FUSE Read** (10MB) | 45ms | 226ms | 5x | +| **FUSE Stat** (per-op) | 1.1ms | 5.3ms | 4.8x | +| **Copy TO FUSE** (100MB) | 1078ms (92 MB/s) | 7789ms (12 MB/s) | **7.2x** | +| **Copy FROM FUSE** (100MB) | 398ms (250 MB/s) | 2227ms (44 MB/s) | **5.6x** | +| **Memory Used** | 399MB | 341MB | — | + +**Key observations:** +- **~5-7x FUSE overhead** at L2 due to FUSE-over-FUSE chaining (L2 → L1 → Host) +- **Large copies** show sustained throughput: 92 MB/s at L1, 12 MB/s at L2 (write) / 44 MB/s (read) +- **Local disk** overhead is lower (~4-7x) since it only traverses the virtio block device +- **Memory** is similar at each level (~350-400MB for the nested container image) + +**Why L3+ is blocked:** Each additional nesting level adds another FUSE hop. At L3, a single stat() would take ~25ms (5x × 5x = 25x overhead), making container startup take 10+ minutes. + +#### Network Performance (iperf3) + +Egress/ingress throughput measured with iperf3 (3-second tests, various block sizes and parallelism): + +| Direction | Block Size | Streams | L1 | L2 | Overhead | +|-----------|------------|---------|----|----|----------| +| **Egress** (VM→Host) | 128K | 1 | 42.4 Gbps | 11.0 Gbps | 3.9x | +| | 128K | 4 | 38.0 Gbps | 12.8 Gbps | 3.0x | +| | 1M | 1 | 43.1 Gbps | 9.0 Gbps | 4.8x | +| | 1M | 8 | 33.1 Gbps | 12.3 Gbps | 2.7x | +| **Ingress** (Host→VM) | 128K | 1 | 48.7 Gbps | 8.4 Gbps | 5.8x | +| | 128K | 4 | 44.3 Gbps | 8.6 Gbps | 5.2x | +| | 1M | 1 | 53.4 Gbps | 11.7 Gbps | 4.6x | +| | 1M | 8 | 43.0 Gbps | 10.4 Gbps | 4.1x | + +**Network observations:** +- **L1 achieves 40-53 Gbps** - excellent virtio-net performance +- **L2 achieves 8-13 Gbps** - ~4-5x overhead from double NAT chain +- **Single stream often outperforms parallel** - likely virtio queue contention +- **Egress slightly faster than ingress at L2** - asymmetric NAT path + +### Limitations + +- ARM64 only (x86_64 nested virt uses different mechanism) +- Requires bare-metal instance (c7g.metal) or host with nested virt enabled +- L3+ nesting blocked by FUSE-over-FUSE latency (~5x per level) + +### L2 Cache Coherency Fix + +**Background**: Under NV2 nested virtualization, L2 FUSE writes could corrupt when using large packet sizes (~1MB). The root cause was missing cache synchronization at nested guest exit - L2's writes to the virtio ring weren't visible to L1's mmap reads. + +**Solution**: A kernel patch adds a DSB SY (Data Synchronization Barrier) in `kvm_nested_sync_hwstate()` to ensure L2's writes are visible to L1 before returning from the nested guest exit handler. + +The patch is at `kernel/patches/nv2-vsock-cache-sync.patch` and is automatically applied when building the nested kernel. + +**Test**: 100MB file copies through FUSE-over-FUSE complete successfully with unbounded max_write: +```bash +make test-root FILTER=nested_l2_with_large +``` + +### Known Issues (Nested) {#known-issues-nested} + +- **L3+ nesting**: Blocked by FUSE-over-FUSE latency (~5x per level). Each additional nesting level adds 3-5 seconds per filesystem request. +- **Nested tests disabled**: L2/L3 nested tests are currently disabled in CI due to timing sensitivity and flakiness under NV2. The tests pass individually but are slow (~5 min each) and occasionally timeout. Run manually with `make test-root FILTER=nested` if needed. + +--- + +## Project Structure + +``` +fcvm/ +├── src/ # Host CLI (fcvm binary) +├── fc-agent/ # Guest agent (runs inside VM) +├── fuse-pipe/ # FUSE passthrough library +└── tests/ # Integration tests (16 files) +``` + +See [DESIGN.md](DESIGN.md#directory-structure) for detailed structure. + +--- + +## CLI Reference + +Run `fcvm --help` or `fcvm --help` for full options. + +### Commands + +| Command | Description | +|---------|-------------| +| `fcvm setup` | Download kernel (~15MB) and create rootfs (~10GB). Takes 5-10 min first run | +| `fcvm podman run` | Run container in Firecracker VM | +| `fcvm exec` | Execute command in running VM/container | +| `fcvm ls` | List running VMs (`--json` for JSON output) | +| `fcvm snapshot create` | Create snapshot from running VM | +| `fcvm snapshot serve` | Start UFFD memory server for cloning | +| `fcvm snapshot run` | Spawn clone from memory server | +| `fcvm snapshots` | List available snapshots | + +See [DESIGN.md](DESIGN.md#cli-interface) for architecture and design decisions. + +### Key Options + +**`fcvm podman run`** - Essential options: +``` +--name VM name (required) +--network rootless (default) or bridged (needs sudo) +--publish Port forward host:guest (e.g., 8080:80) +--map Volume mount host:guest (optional :ro for read-only) +--env Environment variable +-i, --interactive Keep stdin open (for piping input) +-t, --tty Allocate pseudo-TTY (for vim, colors, etc.) +--setup Auto-setup if kernel/rootfs missing (rootless only) +``` + +**`fcvm exec`** - Execute in VM/container: +```bash +sudo fcvm exec --name my-vm -- cat /etc/os-release # In container +sudo fcvm exec --name my-vm --vm -- curl -s ifconfig.me # In guest OS +sudo fcvm exec --name my-vm -it -- bash # Interactive shell +``` + +--- + +## Network Modes + +| Mode | Flag | Root | Notes | +|------|------|------|-------| +| Rootless | `--network rootless` (default) | No | slirp4netns, no root needed | +| Bridged | `--network bridged` | Yes | iptables NAT | + +See [DESIGN.md](DESIGN.md#networking) for architecture details. + +--- + +## Container Behavior + +- **Exit codes**: Container exit code forwarded to host via vsock +- **Logs**: Container stdout goes to host stdout, stderr to host stderr (clean output for scripting) +- **Health**: Default uses vsock ready signal; optional `--health-check` for HTTP + +See [DESIGN.md](DESIGN.md#guest-agent) for details. + +--- + +## Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `FCVM_BASE_DIR` | `/mnt/fcvm-btrfs` | Base directory for all data | +| `RUST_LOG` | `warn` | Logging level (quiet by default; use `info` or `debug` for verbose) | + +--- + +## Testing + +### CI Summary + +Every CI run exercises the full stack: + +| Metric | Count | +|--------|-------| +| **Total Tests** | 9,290 | | **Nextest Functions** | 501 | | **POSIX Compliance (pjdfstest)** | 8,789 | | **VMs Spawned** | 331 (92 base + 239 clones) | @@ -552,11 +810,12 @@ sudo fusermount3 -u /tmp/fuse-*-mount* ## Data Layout -All data stored under the configured `assets_dir` (default `/mnt/fcvm-btrfs/`) which requires btrfs for CoW reflinks. See [DESIGN.md](DESIGN.md#data-directory) for details. +All data stored under `/mnt/fcvm-btrfs/` (btrfs for CoW reflinks). See [DESIGN.md](DESIGN.md#data-directory) for details. ```bash -# Setup storage, kernel, and rootfs (creates btrfs automatically if needed) -sudo fcvm setup +# Setup btrfs (done automatically by make setup-btrfs) +make setup-btrfs +make setup-fcvm # Download kernel, create rootfs ``` --- @@ -647,7 +906,7 @@ boot_args = "quiet" | Field | Required | Description | |-------|----------|-------------| | `kernel_version` | Yes | Kernel version (e.g., "6.18.3") | -| `kernel_repo` | Yes | GitHub repo for releases (e.g., "ejc3/fcvm") | +| `kernel_repo` | Yes | GitHub repo for releases (e.g., "ejc3/firepod") | | `build_inputs` | Yes | Files to hash for kernel SHA (supports globs) | | `kernel_config` | No | Kernel .config file path | | `patches_dir` | No | Directory containing kernel patches | @@ -743,281 +1002,6 @@ PRs are automatically reviewed by Claude. Reviews are blocking if critical issue Reviews check for security issues, bugs, and breaking changes. Issues prefixed with `BLOCKING:` will fail the status check. -### Auto-Fix CI Failures - -When CI fails, Claude automatically attempts to diagnose and fix the issue. - -| Scenario | Behavior | -|----------|----------| -| **PR from org member** | Creates fix PR on top of the failing branch | -| **Main branch failure** | Creates fix PR targeting main | -| **External contributor PR** | Skipped (no auto-fix) | - -The auto-fix workflow: -1. Downloads failed job logs -2. Runs Claude Code to diagnose and fix -3. Creates a stacked fix PR -4. Monitors CI on the fix PR (30 min timeout) -5. Updates the original PR with status and links - -Fix branches use the naming convention `claude/fix-{runId}` and are excluded from auto-fix to prevent infinite loops. - ---- - -## Nested Virtualization - -> ⚠️ **Experimental Feature**: Nested virtualization (L2+) is experimental. While basic functionality works, there are known stability issues under high I/O load. See [Known Issues](#known-issues-nested) below. - -fcvm supports running VMs inside VMs using ARM64 FEAT_NV2 nested virtualization. Currently **one level of nesting works**: Host → L1 VM with full KVM support. - -``` -┌─────────────────────────────────────────────────────────┐ -│ Host (bare metal c7g.metal) │ -│ ┌───────────────────────────────────────────────────┐ │ -│ │ L1 VM (fcvm + nested kernel profile) │ │ -│ │ ┌─────────────────────────────────────────────┐ │ │ -│ │ │ L2 VM (fcvm inside L1) │ │ │ -│ │ │ - Runs containers │ │ │ -│ │ │ - Full VM isolation │ │ │ -│ │ └─────────────────────────────────────────────┘ │ │ -│ └───────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────┘ -``` - -**What Works**: Host → L1 → L2 nesting is fully functional. The `arm64.nv2` kernel boot parameter enables recursive KVM (`KVM_CAP_ARM_EL2=1`). - -**Limitation**: L3+ nesting (L1 → L2 → L3...) is blocked by FUSE-over-FUSE latency. Each nesting level adds ~3-5 seconds per filesystem request due to the multi-hop FUSE chain. See `.claude/CLAUDE.md` for technical details. - -### Requirements - -| Requirement | Details | -|-------------|---------| -| **Hardware** | ARM64 with FEAT_NV2 (Graviton3+: c7g.metal, c7gn.metal, r7g.metal) | -| **Host kernel** | 6.18+ with `kvm-arm.mode=nested` boot parameter | -| **Nested kernel** | Pre-built from releases or `fcvm setup --kernel-profile nested --build-kernels` | -| **Firecracker** | Fork with NV2 support (configured via kernel profile) | - -### Setting Up an EC2 Instance for Nested Virtualization - -**Step 1: Launch a metal instance** - -```bash -# Must be a metal instance for FEAT_NV2 hardware support -# Recommended: c7g.metal, m7g.metal, r7g.metal (Graviton3) -aws ec2 run-instances \ - --instance-type c7g.metal \ - --image-id ami-0xyz... # Ubuntu 24.04 ARM64 -``` - -**Step 2: Install fcvm and set up host kernel** - -```bash -# Install fcvm (or build from source) -cargo install fcvm - -# Download nested kernel profile and install as host kernel -# This also configures GRUB with kvm-arm.mode=nested -sudo fcvm setup --kernel-profile nested --install-host-kernel - -# Reboot into the new kernel -sudo reboot -``` - -**Step 3: Verify nested KVM is enabled** - -```bash -# Check kernel version -uname -r # Should show 6.18-nested - -# Check nested mode is enabled -cat /sys/module/kvm/parameters/mode # Should show "nested" - -# Verify KVM works -ls -la /dev/kvm -``` - -
-Manual kernel build (alternative) - -If you prefer to build the host kernel manually: - -```bash -# Install build dependencies -sudo apt-get update -sudo apt-get install -y build-essential flex bison bc libelf-dev libssl-dev - -# Download kernel source -cd /tmp -wget https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.18.2.tar.xz -tar xf linux-6.18.2.tar.xz -cd linux-6.18.2 - -# Configure for ARM64 with KVM -make defconfig -./scripts/config --enable VIRTUALIZATION -./scripts/config --enable KVM -./scripts/config --enable CONFIG_FUSE_FS - -# Build and install (~10-20 minutes on metal) -make -j$(nproc) -sudo make modules_install -sudo make install - -# Configure GRUB -sudo sed -i 's/GRUB_CMDLINE_LINUX_DEFAULT="/GRUB_CMDLINE_LINUX_DEFAULT="kvm-arm.mode=nested /' /etc/default/grub -sudo update-grub -sudo reboot -``` - -
- -### Getting the Nested Kernel - -> **Note**: If you followed "Setting Up an EC2 Instance" above, the kernel is already downloaded. This section is for users who already have a host with nested KVM enabled. - -```bash -# Download pre-built kernel from GitHub releases (~20MB) -fcvm setup --kernel-profile nested - -# Kernel will be at /mnt/fcvm-btrfs/kernels/vmlinux-nested-6.18-aarch64-*.bin -``` - -Or build locally (takes 10-20 minutes): -```bash -fcvm setup --kernel-profile nested --build-kernels -``` - -The nested kernel (6.18) includes: -- **CONFIG_KVM=y** - KVM hypervisor for nested virtualization -- **EL2 support** - ARM Exception Level 2 (hypervisor mode) -- **MMFR4 patch** - Enables `arm64.nv2` boot param for NV2 capability -- **FUSE** - For volume mounts between host and guest -- **Networking** - TUN/VETH/netfilter for bridged networking in nested VMs - -### Running Nested VMs - -**Step 1: Start outer VM with nested kernel profile** -```bash -# Uses nested kernel profile from rootfs-config.toml -sudo fcvm podman run \ - --name outer-vm \ - --network bridged \ - --kernel-profile nested \ - --privileged \ - --map /mnt/fcvm-btrfs:/mnt/fcvm-btrfs \ - --map /path/to/fcvm/binary:/opt/fcvm \ - nginx:alpine -``` - -**Step 2: Verify nested KVM works** -```bash -# Check guest sees HYP mode -fcvm exec --pid --vm -- dmesg | grep -i kvm -# Should show: "kvm [1]: VHE mode initialized successfully" - -# Verify /dev/kvm is accessible -fcvm exec --pid --vm -- ls -la /dev/kvm -``` - -**Step 3: Run inner VM** -```bash -# Inside outer VM (via exec or SSH) -cd /mnt/fcvm-btrfs -/opt/fcvm/fcvm podman run --name inner-vm --network bridged alpine:latest echo "Hello from nested VM!" -``` - -### How It Works - -1. **FCVM_NV2=1** environment variable (auto-set when `--kernel` is used) triggers fcvm to pass `--enable-nv2` to Firecracker -2. **HAS_EL2 + HAS_EL2_E2H0** vCPU features are enabled - - HAS_EL2 (bit 7): Enables virtual EL2 for guest - - HAS_EL2_E2H0 (bit 8): Forces nVHE mode (avoids timer trap storm) -3. **vCPU boots at EL2h** so guest kernel's `is_hyp_mode_available()` returns true -4. **EL2 registers initialized**: HCR_EL2, CNTHCTL_EL2, VMPIDR_EL2, VPIDR_EL2 -5. Guest kernel initializes KVM: "CPU: All CPU(s) started at EL2" -6. Nested fcvm creates VMs using the guest's KVM - -### Testing Nested Virtualization - -```bash -# Run nested virtualization tests -make test-root FILTER=kvm - -# Tests: -# - test_kvm_available_in_vm: Verifies /dev/kvm works in guest with nested profile -# - test_nested_run_fcvm_inside_vm: Full test of running fcvm inside fcvm -# - test_nested_l2: Full L1→L2 nesting with benchmarks at each level -``` - -### Nested Performance Benchmarks - -Performance at each nesting level (measured on c7g.metal, ARM64 Graviton3): - -| Metric | L1 (Host→VM) | L2 (VM→VM) | Overhead | -|--------|-------------|------------|----------| -| **Egress (curl)** | ✓ | ✓ | — | -| **Local Write** (10MB sync) | 4ms | 16ms | 4x | -| **Local Read** (10MB) | 2ms | 14ms | 7x | -| **FUSE Write** (10MB sync) | 83ms | 295ms | 3.6x | -| **FUSE Read** (10MB) | 45ms | 226ms | 5x | -| **FUSE Stat** (per-op) | 1.1ms | 5.3ms | 4.8x | -| **Copy TO FUSE** (100MB) | 1078ms (92 MB/s) | 7789ms (12 MB/s) | **7.2x** | -| **Copy FROM FUSE** (100MB) | 398ms (250 MB/s) | 2227ms (44 MB/s) | **5.6x** | -| **Memory Used** | 399MB | 341MB | — | - -**Key observations:** -- **~5-7x FUSE overhead** at L2 due to FUSE-over-FUSE chaining (L2 → L1 → Host) -- **Large copies** show sustained throughput: 92 MB/s at L1, 12 MB/s at L2 (write) / 44 MB/s (read) -- **Local disk** overhead is lower (~4-7x) since it only traverses the virtio block device -- **Memory** is similar at each level (~350-400MB for the nested container image) - -**Why L3+ is blocked:** Each additional nesting level adds another FUSE hop. At L3, a single stat() would take ~25ms (5x × 5x = 25x overhead), making container startup take 10+ minutes. - -#### Network Performance (iperf3) - -Egress/ingress throughput measured with iperf3 (3-second tests, various block sizes and parallelism): - -| Direction | Block Size | Streams | L1 | L2 | Overhead | -|-----------|------------|---------|----|----|----------| -| **Egress** (VM→Host) | 128K | 1 | 42.4 Gbps | 11.0 Gbps | 3.9x | -| | 128K | 4 | 38.0 Gbps | 12.8 Gbps | 3.0x | -| | 1M | 1 | 43.1 Gbps | 9.0 Gbps | 4.8x | -| | 1M | 8 | 33.1 Gbps | 12.3 Gbps | 2.7x | -| **Ingress** (Host→VM) | 128K | 1 | 48.7 Gbps | 8.4 Gbps | 5.8x | -| | 128K | 4 | 44.3 Gbps | 8.6 Gbps | 5.2x | -| | 1M | 1 | 53.4 Gbps | 11.7 Gbps | 4.6x | -| | 1M | 8 | 43.0 Gbps | 10.4 Gbps | 4.1x | - -**Network observations:** -- **L1 achieves 40-53 Gbps** - excellent virtio-net performance -- **L2 achieves 8-13 Gbps** - ~4-5x overhead from double NAT chain -- **Single stream often outperforms parallel** - likely virtio queue contention -- **Egress slightly faster than ingress at L2** - asymmetric NAT path - -### Limitations - -- ARM64 only (x86_64 nested virt uses different mechanism) -- Requires bare-metal instance (c7g.metal) or host with nested virt enabled -- L3+ nesting blocked by FUSE-over-FUSE latency (~5x per level) - -### L2 Cache Coherency Fix - -**Background**: Under NV2 nested virtualization, L2 FUSE writes could corrupt when using large packet sizes (~1MB). The root cause was missing cache synchronization at nested guest exit - L2's writes to the virtio ring weren't visible to L1's mmap reads. - -**Solution**: A kernel patch adds a DSB SY (Data Synchronization Barrier) in `kvm_nested_sync_hwstate()` to ensure L2's writes are visible to L1 before returning from the nested guest exit handler. - -The patch is at `kernel/patches/nv2-vsock-cache-sync.patch` and is automatically applied when building the nested kernel. - -**Test**: 100MB file copies through FUSE-over-FUSE complete successfully with unbounded max_write: -```bash -make test-root FILTER=nested_l2_with_large -``` - -### Known Issues (Nested) {#known-issues-nested} - -- **L3+ nesting**: Blocked by FUSE-over-FUSE latency (~5x per level). Each additional nesting level adds 3-5 seconds per filesystem request. -- **Nested tests disabled**: L2/L3 nested tests are currently disabled in CI due to timing sensitivity and flakiness under NV2. The tests pass individually but are slow (~5 min each) and occasionally timeout. Run manually with `make test-root FILTER=nested` if needed. - --- ## License diff --git a/fc-agent/src/main.rs b/fc-agent/src/main.rs index 7d89a5d0..f6114546 100644 --- a/fc-agent/src/main.rs +++ b/fc-agent/src/main.rs @@ -1,4 +1,5 @@ mod fuse; +mod tty; use anyhow::{Context, Result}; use fs2::FileExt; @@ -35,6 +36,12 @@ struct Plan { /// Run container in privileged mode (allows mknod, device access, etc.) #[serde(default)] privileged: bool, + /// Keep STDIN open even if not attached + #[serde(default)] + interactive: bool, + /// Allocate a pseudo-TTY + #[serde(default)] + tty: bool, } /// Volume mount configuration from MMDS @@ -617,7 +624,7 @@ const STATUS_VSOCK_PORT: u32 = 4999; /// Exec server port for running commands from host const EXEC_VSOCK_PORT: u32 = 4998; -/// Container output streaming port +/// Container output streaming port (line-based protocol) const OUTPUT_VSOCK_PORT: u32 = 4997; /// Host CID for vsock (always 2) @@ -831,285 +838,32 @@ fn handle_exec_connection_blocking(fd: i32) { // Use framed protocol for TTY or interactive modes // JSON line protocol only for plain non-interactive if request.tty || request.interactive { - handle_exec_framed(fd, &request); - } else { - handle_exec_pipe(fd, &request); - } -} - -/// Handle exec with binary framing protocol -/// -/// Uses the binary framing protocol (exec_proto) to cleanly separate -/// control messages (exit code) from raw terminal/pipe data. -/// -/// - If `tty=true`: allocates PTY for terminal emulation -/// - If `tty=false`: uses pipes for stdin/stdout/stderr -fn handle_exec_framed(fd: i32, request: &ExecRequest) { - use std::io::Read; - use std::os::unix::io::FromRawFd; - - // Wrap vsock fd in File for clean I/O - let mut vsock = unsafe { std::fs::File::from_raw_fd(fd) }; - - // For TTY mode, allocate a PTY - // For non-TTY, we'll use pipes (set up after fork) - let (master_fd, slave_fd) = if request.tty { - let mut master: libc::c_int = 0; - let mut slave: libc::c_int = 0; - - let result = unsafe { - libc::openpty( - &mut master, - &mut slave, - std::ptr::null_mut(), - std::ptr::null_mut(), - std::ptr::null_mut(), - ) - }; - - if result != 0 { - let _ = exec_proto::write_error(&mut vsock, "Failed to allocate PTY"); - return; - } - (master, slave) - } else { - (-1, -1) // Will use pipes instead - }; - - // For non-TTY mode, create pipes - let (stdin_read, stdin_write, stdout_read, stdout_write) = if !request.tty { - let mut stdin_pipe = [0i32; 2]; - let mut stdout_pipe = [0i32; 2]; - unsafe { - if libc::pipe(stdin_pipe.as_mut_ptr()) != 0 { - let _ = exec_proto::write_error(&mut vsock, "Failed to create stdin pipe"); - return; - } - if libc::pipe(stdout_pipe.as_mut_ptr()) != 0 { - libc::close(stdin_pipe[0]); - libc::close(stdin_pipe[1]); - let _ = exec_proto::write_error(&mut vsock, "Failed to create stdout pipe"); - return; - } - } - (stdin_pipe[0], stdin_pipe[1], stdout_pipe[0], stdout_pipe[1]) - } else { - (-1, -1, -1, -1) - }; - - // Fork - let pid = unsafe { libc::fork() }; - - if pid < 0 { - let _ = exec_proto::write_error(&mut vsock, "Failed to fork"); - if request.tty { - unsafe { - libc::close(master_fd); - libc::close(slave_fd); - } - } else { - unsafe { - libc::close(stdin_read); - libc::close(stdin_write); - libc::close(stdout_read); - libc::close(stdout_write); - } - } - return; - } - - if pid == 0 { - // Child process - don't let File destructor close fd - std::mem::forget(vsock); - - if request.tty { - unsafe { - // Create new session and set controlling terminal - libc::setsid(); - libc::ioctl(slave_fd, libc::TIOCSCTTY as _, 0); - - // Redirect stdin/stdout/stderr to PTY slave - libc::dup2(slave_fd, 0); - libc::dup2(slave_fd, 1); - libc::dup2(slave_fd, 2); - - // Close fds we don't need - if slave_fd > 2 { - libc::close(slave_fd); - } - libc::close(master_fd); - } - } else { - unsafe { - // Redirect stdin/stdout/stderr to pipes - libc::dup2(stdin_read, 0); - libc::dup2(stdout_write, 1); - libc::dup2(stdout_write, 2); // stderr also to stdout pipe - - // Close unused pipe ends - libc::close(stdin_read); - libc::close(stdin_write); - libc::close(stdout_read); - libc::close(stdout_write); - } - } - - unsafe { libc::close(fd) }; - - // Build and exec command - if request.in_container { - let mut args: Vec = vec![ - std::ffi::CString::new("podman").unwrap(), - std::ffi::CString::new("exec").unwrap(), - ]; - // Pass -i and -t separately, matching podman's semantics + // Build command for exec + let command = if request.in_container { + let mut cmd = vec!["podman".to_string(), "exec".to_string()]; + // Pass -i and -t to podman exec if request.interactive { - args.push(std::ffi::CString::new("-i").unwrap()); + cmd.push("-i".to_string()); } if request.tty { - args.push(std::ffi::CString::new("-t").unwrap()); - } - args.push(std::ffi::CString::new("--latest").unwrap()); - for arg in &request.command { - args.push(std::ffi::CString::new(arg.as_str()).unwrap()); - } - let arg_ptrs: Vec<*const libc::c_char> = args - .iter() - .map(|s| s.as_ptr()) - .chain(std::iter::once(std::ptr::null())) - .collect(); - unsafe { - libc::execvp(args[0].as_ptr(), arg_ptrs.as_ptr()); + cmd.push("-t".to_string()); } + cmd.push("--latest".to_string()); + cmd.extend(request.command.iter().cloned()); + cmd } else { - let prog = std::ffi::CString::new(request.command[0].as_str()).unwrap(); - let args: Vec = request - .command - .iter() - .map(|s| std::ffi::CString::new(s.as_str()).unwrap()) - .collect(); - let arg_ptrs: Vec<*const libc::c_char> = args - .iter() - .map(|s| s.as_ptr()) - .chain(std::iter::once(std::ptr::null())) - .collect(); - unsafe { - libc::execvp(prog.as_ptr(), arg_ptrs.as_ptr()); - } - } - - unsafe { libc::_exit(127) }; - } - - // Parent process - close child ends of pipes/pty - if request.tty { - unsafe { libc::close(slave_fd) }; - } else { - unsafe { - libc::close(stdin_read); - libc::close(stdout_write); - } - } - - // Wrap master fd or stdout pipe in File for reading output - let output_reader = if request.tty { - unsafe { std::fs::File::from_raw_fd(master_fd) } - } else { - unsafe { std::fs::File::from_raw_fd(stdout_read) } - }; - - // For non-TTY, wrap stdin pipe for writing - let stdin_writer = if !request.tty { - Some(unsafe { std::fs::File::from_raw_fd(stdin_write) }) - } else { - None - }; - - // Use output_reader as the "pty_master" equivalent - let pty_master = output_reader; - - // Only forward stdin if interactive mode is enabled - let writer_thread = if request.interactive { - let vsock_for_writer = vsock.try_clone().expect("clone vsock"); - // For TTY mode, write to PTY; for non-TTY, write to stdin pipe - let stdin_target: std::fs::File = if request.tty { - pty_master.try_clone().expect("clone pty") - } else { - stdin_writer.expect("stdin_writer should exist for interactive non-TTY") + request.command.clone() }; - // Thread: read STDIN messages from vsock, write to PTY/stdin pipe - Some(std::thread::spawn(move || { - use std::io::Write; - let mut vsock = vsock_for_writer; - let mut target = stdin_target; - - loop { - match exec_proto::Message::read_from(&mut vsock) { - Ok(exec_proto::Message::Stdin(data)) => { - if target.write_all(&data).is_err() { - break; - } - if target.flush().is_err() { - break; - } - } - Ok(exec_proto::Message::Exit(_)) | Ok(exec_proto::Message::Error(_)) => break, - Ok(_) => {} // Ignore unexpected message types - Err(_) => break, - } - } - // Drop target to close stdin pipe, signaling EOF to child - drop(target); - })) + // Use unified TTY handler + // NOTE: Don't call std::process::exit() here - that would kill the entire fc-agent! + // run_with_pty_fd already sends the exit code via exec_proto and closes the fd. + // We just return and let the spawn_blocking task end naturally. + let _exit_code = tty::run_with_pty_fd(fd, &command, request.tty, request.interactive); + return; } else { - // Drop stdin_writer if not interactive (closes pipe) - drop(stdin_writer); - None - }; - - // Thread: read from PTY, write DATA messages to vsock - let reader_thread = std::thread::spawn(move || { - let mut vsock = vsock; - let mut pty = pty_master; - let mut buf = [0u8; 4096]; - - loop { - match pty.read(&mut buf) { - Ok(0) => break, // EOF - Ok(n) => { - if exec_proto::write_data(&mut vsock, &buf[..n]).is_err() { - break; - } - } - Err(_) => break, - } - } - - // Return vsock for exit message - vsock - }); - - // Wait for child to exit - let mut status: libc::c_int = 0; - unsafe { - libc::waitpid(pid, &mut status, 0); + handle_exec_pipe(fd, &request); } - - let exit_code = if libc::WIFEXITED(status) { - libc::WEXITSTATUS(status) - } else { - 1 - }; - - // Wait for reader (it will get EOF when PTY closes) - let mut vsock = reader_thread.join().expect("reader thread"); - - // Writer may be blocked on read - abort it by dropping - drop(writer_thread); - - // Send exit code - let _ = exec_proto::write_exit(&mut vsock, exit_code); } /// Handle exec in pipe mode (non-TTY) @@ -2092,69 +1846,119 @@ async fn run_agent() -> Result<()> { eprintln!("[fc-agent] launching container: {}", image_ref); - // Build Podman command - let mut cmd = Command::new("podman"); - cmd.arg("run") - .arg("--rm") - .arg("--network=host") - // Raise ulimit for containers running parallel tests - .arg("--ulimit") - .arg("nofile=65536:65536"); + // Build Podman args (used for both TTY and non-TTY modes) + let mut podman_args = vec![ + "podman".to_string(), + "run".to_string(), + "--rm".to_string(), + "--network=host".to_string(), + "--ulimit".to_string(), + "nofile=65536:65536".to_string(), + ]; // Privileged mode: allows mknod, device access, etc. for POSIX compliance tests if plan.privileged { eprintln!("[fc-agent] privileged mode enabled"); - cmd.arg("--device-cgroup-rule=b *:* rwm") // Allow block device nodes - .arg("--device-cgroup-rule=c *:* rwm") // Allow char device nodes - .arg("--privileged"); + podman_args.push("--device-cgroup-rule=b *:* rwm".to_string()); + podman_args.push("--device-cgroup-rule=c *:* rwm".to_string()); + podman_args.push("--privileged".to_string()); + } + + // Interactive/TTY modes + if plan.interactive { + podman_args.push("-i".to_string()); + } + if plan.tty { + podman_args.push("-t".to_string()); } // Add environment variables for (key, val) in &plan.env { - cmd.arg("-e").arg(format!("{}={}", key, val)); + podman_args.push("-e".to_string()); + podman_args.push(format!("{}={}", key, val)); } // Add FUSE-mounted volumes as bind mounts to container - // The FUSE mount is already at guest_path, so we bind it to same path in container for vol in &plan.volumes { let mount_spec = if vol.read_only { format!("{}:{}:ro", vol.guest_path, vol.guest_path) } else { format!("{}:{}", vol.guest_path, vol.guest_path) }; - cmd.arg("-v").arg(mount_spec); + podman_args.push("-v".to_string()); + podman_args.push(mount_spec); } // Add extra disk mounts as bind mounts to container - // Disks are already mounted at mount_path, so we bind to same path in container for disk in &plan.extra_disks { let mount_spec = if disk.read_only { format!("{}:{}:ro", disk.mount_path, disk.mount_path) } else { format!("{}:{}", disk.mount_path, disk.mount_path) }; - cmd.arg("-v").arg(mount_spec); + podman_args.push("-v".to_string()); + podman_args.push(mount_spec); } // Add NFS mounts as bind mounts to container - // NFS shares are already mounted at mount_path, so we bind to same path in container for share in &plan.nfs_mounts { let mount_spec = if share.read_only { format!("{}:{}:ro", share.mount_path, share.mount_path) } else { format!("{}:{}", share.mount_path, share.mount_path) }; - cmd.arg("-v").arg(mount_spec); + podman_args.push("-v".to_string()); + podman_args.push(mount_spec); } // Image (either oci-archive:/path or image name from registry) - cmd.arg(&image_ref); + podman_args.push(image_ref.clone()); // Command override if let Some(cmd_args) = &plan.cmd { - cmd.args(cmd_args); + podman_args.extend(cmd_args.iter().cloned()); } + // TTY mode: use PTY and binary protocol (blocking, not async) + if plan.tty { + eprintln!("[fc-agent] TTY mode enabled, using PTY"); + + // Notify host that container is starting + notify_container_started(); + + // Run container with TTY (blocks until container exits) + let exit_code = tty::run_with_pty(&podman_args, plan.tty, plan.interactive); + + // Notify host of container exit + notify_container_exit(exit_code); + + // Unmount FUSE volumes before shutdown + if !mounted_fuse_paths.is_empty() { + eprintln!( + "[fc-agent] unmounting {} FUSE volume(s) before shutdown", + mounted_fuse_paths.len() + ); + for path in &mounted_fuse_paths { + eprintln!("[fc-agent] unmounting FUSE volume at {}", path); + let _ = std::process::Command::new("umount") + .arg("-l") + .arg(path) + .output(); + } + } + + // Power off the VM + eprintln!("[fc-agent] powering off VM"); + let _ = std::process::Command::new("poweroff").arg("-f").spawn(); + + // Exit with container's exit code + std::process::exit(exit_code); + } + + // Non-TTY mode: use async I/O with line-based protocol + let mut cmd = Command::new(&podman_args[0]); + cmd.args(&podman_args[1..]); + // Spawn container with piped stdin/stdout/stderr for bidirectional I/O cmd.stdin(Stdio::piped()); cmd.stdout(Stdio::piped()); diff --git a/fc-agent/src/tty.rs b/fc-agent/src/tty.rs new file mode 100644 index 00000000..6027df73 --- /dev/null +++ b/fc-agent/src/tty.rs @@ -0,0 +1,409 @@ +//! Unified TTY handling for fc-agent. +//! +//! This module provides a single implementation for running commands with PTY support, +//! used by both `podman run -it` and `exec -it` paths. + +use std::io::{Read, Write}; +use std::os::unix::io::FromRawFd; + +/// Vsock port for TTY I/O (used by podman run -it) +pub const TTY_VSOCK_PORT: u32 = 4996; + +/// Host CID for vsock connections +const HOST_CID: u32 = 2; + +/// Run a command with PTY, connecting to host first. +/// +/// Used by `podman run -it` where fc-agent initiates the connection. +pub fn run_with_pty(command: &[String], tty: bool, interactive: bool) -> i32 { + if command.is_empty() { + eprintln!("[fc-agent] tty: empty command"); + return 1; + } + + // Connect to host via vsock + let vsock_fd = match connect_vsock(TTY_VSOCK_PORT) { + Ok(fd) => fd, + Err(e) => { + eprintln!("[fc-agent] tty: failed to connect vsock: {}", e); + return 1; + } + }; + + run_with_pty_fd(vsock_fd, command, tty, interactive) +} + +/// Run a command with PTY using a pre-connected fd. +/// +/// Used by `exec -it` where the host has already connected to fc-agent. +/// Also called by `run_with_pty` after connecting. +pub fn run_with_pty_fd(vsock_fd: i32, command: &[String], tty: bool, interactive: bool) -> i32 { + // Allocate PTY or pipes + let (master_fd, slave_fd, stdin_read, stdin_write, stdout_read, stdout_write) = if tty { + match allocate_pty() { + Ok((m, s)) => (m, s, -1, -1, -1, -1), + Err(e) => { + eprintln!("[fc-agent] tty: failed to allocate PTY: {}", e); + unsafe { libc::close(vsock_fd) }; + return 1; + } + } + } else { + match allocate_pipes() { + Ok((sr, sw, or, ow)) => (-1, -1, sr, sw, or, ow), + Err(e) => { + eprintln!("[fc-agent] tty: failed to allocate pipes: {}", e); + unsafe { libc::close(vsock_fd) }; + return 1; + } + } + }; + + // Fork + let pid = unsafe { libc::fork() }; + if pid < 0 { + eprintln!("[fc-agent] tty: fork failed"); + cleanup_fds( + tty, + master_fd, + slave_fd, + stdin_read, + stdin_write, + stdout_read, + stdout_write, + ); + unsafe { libc::close(vsock_fd) }; + return 1; + } + + if pid == 0 { + // Child process + unsafe { libc::close(vsock_fd) }; + + if tty { + setup_child_pty(slave_fd, master_fd); + } else { + setup_child_pipes(stdin_read, stdin_write, stdout_read, stdout_write); + } + + exec_command(command); + // exec_command never returns on success + eprintln!("[fc-agent] tty: exec failed"); + unsafe { libc::_exit(127) }; + } + + // Parent process + if tty { + unsafe { libc::close(slave_fd) }; + } else { + unsafe { + libc::close(stdin_read); + libc::close(stdout_write); + } + } + + // Create File wrappers for I/O + let mut vsock = unsafe { std::fs::File::from_raw_fd(vsock_fd) }; + + let output_fd = if tty { master_fd } else { stdout_read }; + + // Spawn writer thread (only if interactive) + let writer_thread = if interactive { + let vsock_clone = match vsock.try_clone() { + Ok(f) => f, + Err(e) => { + eprintln!("[fc-agent] tty: failed to clone vsock: {}", e); + unsafe { + libc::kill(pid, libc::SIGKILL); + libc::waitpid(pid, std::ptr::null_mut(), 0); + } + return 1; + } + }; + + // For TTY, duplicate the master fd so we can use it for writing + // while the main thread uses it for reading + let input_file = if tty { + let dup_fd = unsafe { libc::dup(master_fd) }; + if dup_fd < 0 { + eprintln!("[fc-agent] tty: failed to dup master fd"); + unsafe { + libc::kill(pid, libc::SIGKILL); + libc::waitpid(pid, std::ptr::null_mut(), 0); + } + return 1; + } + unsafe { std::fs::File::from_raw_fd(dup_fd) } + } else { + unsafe { std::fs::File::from_raw_fd(stdin_write) } + }; + + Some(std::thread::spawn(move || { + writer_loop(vsock_clone, input_file); + })) + } else { + // Close stdin pipe if not interactive + if !tty && stdin_write >= 0 { + unsafe { libc::close(stdin_write) }; + } + None + }; + + // Reader loop in main thread + let output_file = unsafe { std::fs::File::from_raw_fd(output_fd) }; + let exit_code = reader_loop(output_file, &mut vsock, pid); + + // Send exit message and flush to ensure it's sent before we exit + let _ = exec_proto::write_exit(&mut vsock, exit_code); + let _ = vsock.flush(); + + // Wait for writer thread + if let Some(handle) = writer_thread { + let _ = handle.join(); + } + + // Drop vsock explicitly to ensure close is sent before process exits + drop(vsock); + + exit_code +} + +/// Connect to host via vsock +fn connect_vsock(port: u32) -> Result { + let fd = unsafe { libc::socket(libc::AF_VSOCK, libc::SOCK_STREAM, 0) }; + if fd < 0 { + return Err("socket creation failed".to_string()); + } + + let addr = libc::sockaddr_vm { + svm_family: libc::AF_VSOCK as u16, + svm_reserved1: 0, + svm_port: port, + svm_cid: HOST_CID, + svm_zero: [0u8; 4], + }; + + let result = unsafe { + libc::connect( + fd, + &addr as *const libc::sockaddr_vm as *const libc::sockaddr, + std::mem::size_of::() as u32, + ) + }; + + if result < 0 { + unsafe { libc::close(fd) }; + return Err(format!( + "connect failed: {}", + std::io::Error::last_os_error() + )); + } + + Ok(fd) +} + +/// Allocate PTY pair +fn allocate_pty() -> Result<(i32, i32), String> { + let mut master: libc::c_int = 0; + let mut slave: libc::c_int = 0; + + let result = unsafe { + libc::openpty( + &mut master, + &mut slave, + std::ptr::null_mut(), + std::ptr::null_mut(), + std::ptr::null_mut(), + ) + }; + + if result != 0 { + return Err("openpty failed".to_string()); + } + + Ok((master, slave)) +} + +/// Allocate stdin/stdout pipes +fn allocate_pipes() -> Result<(i32, i32, i32, i32), String> { + let mut stdin_pipe = [0i32; 2]; + let mut stdout_pipe = [0i32; 2]; + + if unsafe { libc::pipe(stdin_pipe.as_mut_ptr()) } != 0 { + return Err("stdin pipe failed".to_string()); + } + + if unsafe { libc::pipe(stdout_pipe.as_mut_ptr()) } != 0 { + unsafe { + libc::close(stdin_pipe[0]); + libc::close(stdin_pipe[1]); + } + return Err("stdout pipe failed".to_string()); + } + + // stdin_pipe[0] = read end (child reads from this) + // stdin_pipe[1] = write end (parent writes to this) + // stdout_pipe[0] = read end (parent reads from this) + // stdout_pipe[1] = write end (child writes to this) + Ok((stdin_pipe[0], stdin_pipe[1], stdout_pipe[0], stdout_pipe[1])) +} + +/// Clean up file descriptors on error +fn cleanup_fds( + tty: bool, + master_fd: i32, + slave_fd: i32, + stdin_read: i32, + stdin_write: i32, + stdout_read: i32, + stdout_write: i32, +) { + unsafe { + if tty { + if master_fd >= 0 { + libc::close(master_fd); + } + if slave_fd >= 0 { + libc::close(slave_fd); + } + } else { + if stdin_read >= 0 { + libc::close(stdin_read); + } + if stdin_write >= 0 { + libc::close(stdin_write); + } + if stdout_read >= 0 { + libc::close(stdout_read); + } + if stdout_write >= 0 { + libc::close(stdout_write); + } + } + } +} + +/// Set up child process for PTY +fn setup_child_pty(slave_fd: i32, master_fd: i32) { + unsafe { + // Create new session and set controlling terminal + libc::setsid(); + libc::ioctl(slave_fd, libc::TIOCSCTTY as _, 0); + + // Redirect stdin/stdout/stderr to PTY slave + libc::dup2(slave_fd, 0); + libc::dup2(slave_fd, 1); + libc::dup2(slave_fd, 2); + + // Close original fds + if slave_fd > 2 { + libc::close(slave_fd); + } + libc::close(master_fd); + } +} + +/// Set up child process for pipes +fn setup_child_pipes(stdin_read: i32, stdin_write: i32, stdout_read: i32, stdout_write: i32) { + unsafe { + // Redirect stdin to read end of stdin pipe + libc::dup2(stdin_read, 0); + // Redirect stdout/stderr to write end of stdout pipe + libc::dup2(stdout_write, 1); + libc::dup2(stdout_write, 2); + + // Close original pipe fds + libc::close(stdin_read); + libc::close(stdin_write); + libc::close(stdout_read); + libc::close(stdout_write); + } +} + +/// Exec the command +fn exec_command(command: &[String]) { + use std::ffi::CString; + + let prog = match CString::new(command[0].as_str()) { + Ok(s) => s, + Err(_) => return, + }; + + let args: Vec = command + .iter() + .filter_map(|s| CString::new(s.as_str()).ok()) + .collect(); + + let arg_ptrs: Vec<*const libc::c_char> = args + .iter() + .map(|s| s.as_ptr()) + .chain(std::iter::once(std::ptr::null())) + .collect(); + + unsafe { + libc::execvp(prog.as_ptr(), arg_ptrs.as_ptr()); + } +} + +/// Writer loop: read STDIN messages from vsock, write to PTY/pipe +fn writer_loop(mut vsock: std::fs::File, mut target: std::fs::File) { + loop { + match exec_proto::Message::read_from(&mut vsock) { + Ok(exec_proto::Message::Stdin(data)) => { + if target.write_all(&data).is_err() { + break; + } + if target.flush().is_err() { + break; + } + } + Ok(exec_proto::Message::Exit(_)) | Ok(exec_proto::Message::Error(_)) => { + break; + } + Ok(_) => { + // Unexpected message type, ignore + } + Err(_) => { + break; + } + } + } + // Drop target to close pipe/PTY, signaling EOF to child +} + +/// Reader loop: read from PTY/pipe, send DATA messages via exec_proto +fn reader_loop(mut source: std::fs::File, vsock: &mut std::fs::File, child_pid: i32) -> i32 { + let mut buf = [0u8; 4096]; + + loop { + match source.read(&mut buf) { + Ok(0) => { + break; + } + Ok(n) => { + if exec_proto::write_data(vsock, &buf[..n]).is_err() { + break; + } + } + Err(e) => { + if e.kind() != std::io::ErrorKind::Interrupted { + break; + } + } + } + } + + // Wait for child and get exit code + let mut status: libc::c_int = 0; + unsafe { + libc::waitpid(child_pid, &mut status, 0); + } + + if libc::WIFEXITED(status) { + libc::WEXITSTATUS(status) + } else if libc::WIFSIGNALED(status) { + 128 + libc::WTERMSIG(status) + } else { + 1 + } +} diff --git a/src/cli/args.rs b/src/cli/args.rs index c851e392..27bf3869 100644 --- a/src/cli/args.rs +++ b/src/cli/args.rs @@ -97,14 +97,6 @@ pub enum PodmanCommands { #[derive(Args, Debug)] pub struct RunArgs { - /// Container image (e.g., nginx:alpine or localhost/myimage) - pub image: String, - - /// Command and arguments to run in container (alternative to --cmd) - /// Example: fcvm podman run --name foo --network bridged alpine:latest sh -c "echo hello" - #[arg(trailing_var_arg = true, allow_hyphen_values = true)] - pub command_args: Vec, - /// VM name (required) #[arg(long)] pub name: String, @@ -177,6 +169,14 @@ pub struct RunArgs { #[arg(long)] pub privileged: bool, + /// Keep STDIN open even if not attached + #[arg(short, long)] + pub interactive: bool, + + /// Allocate a pseudo-TTY + #[arg(short, long)] + pub tty: bool, + /// Debug fc-agent with strace (output to /tmp/fc-agent.strace in guest) /// Useful for diagnosing fc-agent startup issues #[arg(long)] @@ -201,6 +201,14 @@ pub struct RunArgs { /// Example: --vsock-dir /tmp/myvm creates /tmp/myvm/vsock.sock #[arg(long)] pub vsock_dir: Option, + + /// Container image (e.g., nginx:alpine or localhost/myimage) + pub image: String, + + /// Command and arguments to run in container (alternative to --cmd) + /// Example: fcvm podman run --name foo --network bridged alpine:latest sh -c "echo hello" + #[arg(trailing_var_arg = true, allow_hyphen_values = true)] + pub command_args: Vec, } // ============================================================================ @@ -305,10 +313,6 @@ pub struct LsArgs { #[derive(Args, Debug)] pub struct ExecArgs { - /// VM name to exec into (mutually exclusive with --pid) - #[arg(conflicts_with = "pid")] - pub name: Option, - /// VM PID to exec into (mutually exclusive with name) #[arg(long, conflicts_with = "name")] pub pid: Option, @@ -317,6 +321,10 @@ pub struct ExecArgs { #[arg(long)] pub vm: bool, + /// Execute inside container (default, mutually exclusive with --vm) + #[arg(short, long)] + pub container: bool, + /// Keep STDIN open even if not attached #[arg(short, long)] pub interactive: bool, @@ -329,7 +337,11 @@ pub struct ExecArgs { #[arg(short, long)] pub quiet: bool, + /// VM name to exec into (mutually exclusive with --pid) + #[arg(long, conflicts_with = "pid")] + pub name: Option, + /// Command and arguments to execute - #[arg(last = true, required = true)] + #[arg(trailing_var_arg = true, allow_hyphen_values = true, required = true)] pub command: Vec, } diff --git a/src/commands/common.rs b/src/commands/common.rs index 9a54101d..fde30302 100644 --- a/src/commands/common.rs +++ b/src/commands/common.rs @@ -21,9 +21,12 @@ pub const VSOCK_VOLUME_PORT_BASE: u32 = 5000; /// Vsock port for status channel (fc-agent notifies when container starts) pub const VSOCK_STATUS_PORT: u32 = 4999; -/// Vsock port for container output streaming (bidirectional) +/// Vsock port for container output streaming (bidirectional, line-based) pub const VSOCK_OUTPUT_PORT: u32 = 4997; +/// Vsock port for TTY container I/O (binary exec_proto) +pub const VSOCK_TTY_PORT: u32 = 4996; + /// Minimum required Firecracker version for network_overrides support const MIN_FIRECRACKER_VERSION: (u32, u32, u32) = (1, 13, 1); diff --git a/src/commands/exec.rs b/src/commands/exec.rs index 9b96b8c9..775cf46a 100644 --- a/src/commands/exec.rs +++ b/src/commands/exec.rs @@ -12,7 +12,6 @@ use crate::cli::ExecArgs; use crate::paths; use crate::state::StateManager; use anyhow::{bail, Context, Result}; -use exec_proto; use std::io::{BufRead, BufReader, Read, Write}; use std::os::unix::net::UnixStream; use std::path::Path; @@ -231,7 +230,11 @@ pub async fn cmd_exec(args: ExecArgs) -> Result<()> { // Use binary framing for any mode needing TTY or stdin forwarding // JSON line mode only for plain non-interactive commands if tty || interactive { - run_framed_mode(stream, tty, interactive) + let exit_code = super::tty::run_tty_session_connected(stream, tty, interactive)?; + if exit_code != 0 { + std::process::exit(exit_code); + } + Ok(()) } else { run_line_mode(stream) } @@ -282,155 +285,6 @@ fn run_line_mode(stream: UnixStream) -> Result<()> { Ok(()) } -/// Run with binary framing protocol (for TTY and/or interactive modes) -/// -/// Uses the binary framing protocol (exec_proto) to cleanly separate -/// control messages (exit code) from raw terminal data. -/// -/// - `tty`: If true, set terminal to raw mode for proper TTY handling -/// - `interactive`: If true, forward stdin to the remote command -fn run_framed_mode(stream: UnixStream, tty: bool, interactive: bool) -> Result<()> { - use std::io::{stdin, stdout}; - use std::os::unix::io::AsRawFd; - use std::sync::atomic::{AtomicBool, Ordering}; - use std::sync::Arc; - - let stdin_fd = stdin().as_raw_fd(); - let mut orig_termios: Option = None; - - // Only set up raw terminal mode if TTY requested - if tty { - let is_tty = unsafe { libc::isatty(stdin_fd) == 1 }; - if !is_tty { - bail!("TTY mode requires a terminal. Use without -t for non-interactive mode."); - } - - // Save original terminal settings - let mut termios: libc::termios = unsafe { std::mem::zeroed() }; - if unsafe { libc::tcgetattr(stdin_fd, &mut termios) } != 0 { - bail!("Failed to get terminal attributes"); - } - orig_termios = Some(termios); - - // Set raw mode - let mut raw_termios = termios; - unsafe { - libc::cfmakeraw(&mut raw_termios); - } - if unsafe { libc::tcsetattr(stdin_fd, libc::TCSANOW, &raw_termios) } != 0 { - bail!("Failed to set raw terminal mode"); - } - } - - // Flag to track completion - let done = Arc::new(AtomicBool::new(false)); - let done_clone = done.clone(); - - // Clone stream for reader/writer - let read_stream = stream.try_clone().context("cloning stream for reader")?; - let mut write_stream = stream; - - // Spawn thread to read framed messages from socket and write to stdout - let reader_done = done.clone(); - let reader_thread = std::thread::spawn(move || { - let mut stdout = stdout().lock(); - let mut read_stream = read_stream; - - loop { - if reader_done.load(Ordering::Relaxed) { - break; - } - - // Read framed message using binary protocol - match exec_proto::Message::read_from(&mut read_stream) { - Ok(exec_proto::Message::Data(data)) => { - // Write raw PTY data to stdout - let _ = stdout.write_all(&data); - let _ = stdout.flush(); - } - Ok(exec_proto::Message::Exit(code)) => { - // Process exited, return the exit code - reader_done.store(true, Ordering::Relaxed); - return Some(code); - } - Ok(exec_proto::Message::Error(msg)) => { - // Error from guest - let _ = writeln!(stdout, "\r\nError: {}\r", msg); - let _ = stdout.flush(); - reader_done.store(true, Ordering::Relaxed); - return Some(1); - } - Ok(exec_proto::Message::Stdin(_)) => { - // Should not receive STDIN from guest, ignore - } - Err(e) => { - if e.kind() == std::io::ErrorKind::UnexpectedEof { - // Connection closed - break; - } - // Other error, log and exit - eprintln!("\r\nProtocol error: {}\r", e); - break; - } - } - } - None - }); - - // Only spawn writer thread if interactive mode (stdin forwarding) - let writer_thread = if interactive { - let writer_done = done.clone(); - Some(std::thread::spawn(move || { - let stdin = stdin(); - let mut stdin = stdin.lock(); - let mut buf = [0u8; 1024]; - - loop { - if writer_done.load(Ordering::Relaxed) { - break; - } - - match stdin.read(&mut buf) { - Ok(0) => break, // EOF - Ok(n) => { - // Send stdin data as framed STDIN message - if exec_proto::write_stdin(&mut write_stream, &buf[..n]).is_err() { - break; - } - } - Err(_) => break, - } - } - })) - } else { - // Not interactive - just drop the write_stream - drop(write_stream); - None - }; - - // Wait for reader to finish (it detects exit) - let exit_code = reader_thread.join().ok().flatten().unwrap_or(0); - - // Signal writer to stop - done_clone.store(true, Ordering::Relaxed); - - // Restore terminal if we set raw mode - if let Some(termios) = orig_termios { - unsafe { - libc::tcsetattr(stdin_fd, libc::TCSANOW, &termios); - } - } - - // Don't wait for writer - it may be blocked on stdin - drop(writer_thread); - - if exit_code != 0 { - std::process::exit(exit_code); - } - - Ok(()) -} - /// Request sent to fc-agent exec server #[derive(serde::Serialize, serde::Deserialize)] pub struct ExecRequest { diff --git a/src/commands/mod.rs b/src/commands/mod.rs index 617ca2fa..2ef47a6f 100644 --- a/src/commands/mod.rs +++ b/src/commands/mod.rs @@ -6,6 +6,7 @@ pub mod podman; pub mod setup; pub mod snapshot; pub mod snapshots; +pub mod tty; // Re-export command functions pub use completions::cmd_completions; diff --git a/src/commands/podman.rs b/src/commands/podman.rs index 25fa6718..2d07af4c 100644 --- a/src/commands/podman.rs +++ b/src/commands/podman.rs @@ -54,7 +54,7 @@ impl VolumeMapping { } } -use super::common::{VSOCK_OUTPUT_PORT, VSOCK_STATUS_PORT, VSOCK_VOLUME_PORT_BASE}; +use super::common::{VSOCK_OUTPUT_PORT, VSOCK_STATUS_PORT, VSOCK_TTY_PORT, VSOCK_VOLUME_PORT_BASE}; /// Create an ext4 disk image from a directory's contents. /// Returns the path to the created image. @@ -335,8 +335,14 @@ async fn run_status_listener( /// Guest → Host: "stdout:content" or "stderr:content" /// Host → Guest: "stdin:content" (written to container stdin) /// +/// If `interactive` is true, forwards host stdin to container. +/// /// Returns collected output lines as Vec<(stream, line)>. -async fn run_output_listener(socket_path: &str, vm_id: &str) -> Result> { +async fn run_output_listener( + socket_path: &str, + vm_id: &str, + interactive: bool, +) -> Result> { use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader}; use tokio::net::UnixListener; @@ -374,10 +380,41 @@ async fn run_output_listener(socket_path: &str, vm_id: &str) -> Result break, // EOF + Ok(_) => { + // Forward to container: "stdin:content\n" + let msg = format!("stdin:{}", line.trim_end()); + let mut w = writer.lock().await; + if w.write_all(msg.as_bytes()).await.is_err() { + break; + } + if w.write_all(b"\n").await.is_err() { + break; + } + } + Err(_) => break, + } + } + })) + } else { + None + }; + // Read lines until connection closes loop { line_buf.clear(); @@ -406,7 +443,8 @@ async fn run_output_listener(socket_path: &str, vm_id: &str) -> Result { @@ -421,6 +459,11 @@ async fn run_output_listener(socket_path: &str, vm_id: &str) -> Result Result<()> { }) }; - // Start bidirectional output listener for container stdout/stderr - // Port 4997 receives JSON lines: {"stream":"stdout|stderr","line":"..."} + // Start I/O listener for container stdin/stdout/stderr + // TTY mode: use binary exec_proto on port 4996 (blocking, raw terminal) + // Non-TTY mode: use line-based protocol on port 4997 (async) + let tty_mode = args.tty; + let interactive = args.interactive; + let tty_socket_path = format!("{}_{}", vsock_socket_path.display(), VSOCK_TTY_PORT); let output_socket_path = format!("{}_{}", vsock_socket_path.display(), VSOCK_OUTPUT_PORT); - let _output_handle = { + + // For TTY mode, we spawn a blocking thread that handles the TTY I/O + // This must be set up BEFORE VM starts so we're ready to accept connection + let tty_handle = if tty_mode { + let socket_path = tty_socket_path.clone(); + Some(std::thread::spawn(move || { + super::tty::run_tty_session(&socket_path, true, interactive) + })) + } else { + None + }; + + // For non-TTY mode, use async output listener + let _output_handle = if !tty_mode { let socket_path = output_socket_path.clone(); let vm_id_clone = vm_id.clone(); - tokio::spawn(async move { - match run_output_listener(&socket_path, &vm_id_clone).await { + Some(tokio::spawn(async move { + match run_output_listener(&socket_path, &vm_id_clone, interactive).await { Ok(lines) => lines, Err(e) => { tracing::warn!("Output listener error: {}", e); Vec::new() } } - }) + })) + } else { + None }; // Run the main VM setup in a helper to ensure cleanup on error @@ -848,6 +910,8 @@ async fn cmd_podman_run(args: RunArgs) -> Result<()> { let mut sigint = signal(SignalKind::interrupt())?; // Wait for signal or VM exit + // For TTY mode, we get exit code from the TTY listener thread + // For non-TTY mode, we read it from the file written by status listener let container_exit_code: Option; tokio::select! { _ = sigterm.recv() => { @@ -860,12 +924,21 @@ async fn cmd_podman_run(args: RunArgs) -> Result<()> { } status = vm_manager.wait() => { info!(status = ?status, "VM exited"); - // Read container exit code from file written by status listener - let exit_file = data_dir.join("container-exit"); - container_exit_code = std::fs::read_to_string(&exit_file) - .ok() - .and_then(|s| s.trim().parse::().ok()); - info!(container_exit_code = ?container_exit_code, "container exit code"); + if let Some(handle) = tty_handle { + // TTY mode: get exit code from TTY listener + container_exit_code = handle + .join() + .ok() + .and_then(|r| r.ok()); + info!(container_exit_code = ?container_exit_code, "TTY container exit code"); + } else { + // Non-TTY mode: read container exit code from file written by status listener + let exit_file = data_dir.join("container-exit"); + container_exit_code = std::fs::read_to_string(&exit_file) + .ok() + .and_then(|s| s.trim().parse::().ok()); + info!(container_exit_code = ?container_exit_code, "container exit code"); + } } } @@ -1812,6 +1885,8 @@ async fn run_vm_setup( "nfs_mounts": nfs_mounts, "image_archive": image_archive_name.map(|name| format!("/tmp/fcvm-image/{}", name)), "privileged": args.privileged, + "interactive": args.interactive, + "tty": args.tty, }, "host-time": chrono::Utc::now().timestamp().to_string(), } diff --git a/src/commands/tty.rs b/src/commands/tty.rs new file mode 100644 index 00000000..c0b8c480 --- /dev/null +++ b/src/commands/tty.rs @@ -0,0 +1,277 @@ +//! Unified TTY handling for fcvm host side. +//! +//! This module provides a single implementation for running TTY sessions, +//! used by both `podman run -it` and `exec -it` paths. + +use std::io::Write; +use std::os::unix::io::AsRawFd; +use std::os::unix::net::{UnixListener, UnixStream}; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + +use anyhow::{bail, Context, Result}; +use tracing::{debug, info}; + +/// Run a TTY session by listening on a Unix socket. +/// +/// Used by `podman run -it` where the host listens and guest connects. +/// +/// 1. Binds and accepts connection on Unix socket +/// 2. Delegates to `run_tty_session_connected()` for I/O handling +/// 3. Cleans up socket on exit +pub fn run_tty_session(socket_path: &str, tty: bool, interactive: bool) -> Result { + // Remove stale socket if it exists + let _ = std::fs::remove_file(socket_path); + + let listener = + UnixListener::bind(socket_path).with_context(|| format!("binding to {}", socket_path))?; + + info!(socket = %socket_path, tty, interactive, "TTY session started"); + + // Accept connection from guest (blocking) + listener + .set_nonblocking(false) + .context("setting listener to blocking")?; + let (stream, _) = listener.accept().context("accepting connection")?; + + debug!("TTY connection established"); + + // Run the session with the connected stream + let result = run_tty_session_connected(stream, tty, interactive); + + // Clean up socket + let _ = std::fs::remove_file(socket_path); + + result +} + +/// Run a TTY session with a pre-connected stream. +/// +/// Used by `exec -it` where the host has already connected to the guest. +/// +/// 1. Sets terminal to raw mode if `tty=true` +/// 2. Spawns reader thread (socket -> stdout) +/// 3. Spawns writer thread if `interactive=true` (stdin -> socket) +/// 4. Returns exit code from remote command +pub fn run_tty_session_connected(stream: UnixStream, tty: bool, interactive: bool) -> Result { + // Set up raw terminal mode if TTY requested + let stdin_fd = std::io::stdin().as_raw_fd(); + + // Debug: check if stdin is non-blocking + let stdin_flags = unsafe { libc::fcntl(stdin_fd, libc::F_GETFL) }; + let is_nonblocking = (stdin_flags & libc::O_NONBLOCK) != 0; + debug!( + "run_tty_session_connected: stdin_fd={}, flags=0x{:x}, O_NONBLOCK={}", + stdin_fd, stdin_flags, is_nonblocking + ); + let orig_termios = if tty { + setup_raw_terminal(stdin_fd)? + } else { + None + }; + + // Flag to track completion + let done = Arc::new(AtomicBool::new(false)); + + // Clone stream for reader/writer + let read_stream = stream.try_clone().context("cloning stream for reader")?; + let mut write_stream = stream; + + // Spawn reader thread: socket -> stdout + let reader_done = done.clone(); + let reader_thread = std::thread::spawn(move || reader_loop(read_stream, reader_done)); + + // Spawn writer thread if interactive: stdin -> socket + let writer_thread = if interactive { + let writer_done = done.clone(); + Some(std::thread::spawn(move || { + writer_loop(&mut write_stream, writer_done); + })) + } else { + drop(write_stream); + None + }; + + // Wait for reader to finish and get exit code + let exit_code = reader_thread.join().ok().flatten().unwrap_or(0); + + // Signal writer to stop + done.store(true, Ordering::Relaxed); + + // Restore terminal if we set raw mode + if let Some(termios) = orig_termios { + restore_terminal(stdin_fd, termios); + } + + // Clean up writer thread handle + drop(writer_thread); + + Ok(exit_code) +} + +/// Set up raw terminal mode +fn setup_raw_terminal(stdin_fd: i32) -> Result> { + let is_tty = unsafe { libc::isatty(stdin_fd) == 1 }; + if !is_tty { + bail!("TTY mode requires a terminal. Use without -t for non-interactive mode."); + } + + // Ensure stdin is in blocking mode (tokio may have set it non-blocking) + unsafe { + let flags = libc::fcntl(stdin_fd, libc::F_GETFL); + if flags != -1 && (flags & libc::O_NONBLOCK) != 0 { + libc::fcntl(stdin_fd, libc::F_SETFL, flags & !libc::O_NONBLOCK); + debug!("setup_raw_terminal: cleared O_NONBLOCK from stdin"); + } + } + + // Save original terminal settings + let mut termios: libc::termios = unsafe { std::mem::zeroed() }; + if unsafe { libc::tcgetattr(stdin_fd, &mut termios) } != 0 { + bail!("Failed to get terminal attributes"); + } + let orig = termios; + + // Set raw mode + unsafe { + libc::cfmakeraw(&mut termios); + } + if unsafe { libc::tcsetattr(stdin_fd, libc::TCSANOW, &termios) } != 0 { + bail!("Failed to set raw terminal mode"); + } + + Ok(Some(orig)) +} + +/// Restore terminal to original settings +fn restore_terminal(stdin_fd: i32, termios: libc::termios) { + unsafe { + libc::tcsetattr(stdin_fd, libc::TCSANOW, &termios); + } +} + +/// Reader loop: read framed messages from socket, write to stdout +fn reader_loop(mut stream: std::os::unix::net::UnixStream, done: Arc) -> Option { + let mut stdout = std::io::stdout().lock(); + let mut total_read: usize = 0; + + loop { + if done.load(Ordering::Relaxed) { + debug!("reader_loop: done flag set, exiting"); + break; + } + + match exec_proto::Message::read_from(&mut stream) { + Ok(exec_proto::Message::Data(data)) => { + total_read += data.len(); + debug!("reader_loop: received {} bytes (total {})", data.len(), total_read); + let _ = stdout.write_all(&data); + let _ = stdout.flush(); + } + Ok(exec_proto::Message::Exit(code)) => { + debug!("reader_loop: received Exit({})", code); + done.store(true, Ordering::Relaxed); + return Some(code); + } + Ok(exec_proto::Message::Error(msg)) => { + debug!("reader_loop: received Error({})", msg); + let _ = writeln!(stdout, "\r\nError: {}\r", msg); + let _ = stdout.flush(); + done.store(true, Ordering::Relaxed); + return Some(1); + } + Ok(exec_proto::Message::Stdin(_)) => { + // Should not receive STDIN from guest, ignore + } + Err(e) => { + if e.kind() == std::io::ErrorKind::UnexpectedEof { + debug!("reader_loop: EOF"); + break; + } + debug!("reader_loop: error: {}", e); + eprintln!("\r\nProtocol error: {}\r", e); + break; + } + } + } + None +} + +/// Writer loop: read from stdin, send STDIN messages to socket +fn writer_loop(stream: &mut std::os::unix::net::UnixStream, done: Arc) { + let stdin_fd = std::io::stdin().as_raw_fd(); + let mut buf = [0u8; 1024]; + let mut total_written = 0usize; + + debug!( + "writer_loop: starting, stdin_fd={}, waiting for stdin data", + stdin_fd + ); + + // Use poll-based loop to avoid blocking forever and see when data arrives + let mut poll_count = 0; + let mut last_log_time = std::time::Instant::now(); + + loop { + if done.load(Ordering::Relaxed) { + debug!("writer_loop: done flag set, exiting"); + break; + } + + // Poll for 1 second + let mut pollfd = libc::pollfd { + fd: stdin_fd, + events: libc::POLLIN, + revents: 0, + }; + let poll_result = unsafe { libc::poll(&mut pollfd, 1, 1000) }; + poll_count += 1; + + // Log every 5 seconds + if last_log_time.elapsed() > std::time::Duration::from_secs(5) { + debug!( + "writer_loop: poll #{}, result={}, revents=0x{:x}", + poll_count, poll_result, pollfd.revents + ); + last_log_time = std::time::Instant::now(); + } + + if poll_result < 0 { + let errno = std::io::Error::last_os_error(); + debug!("writer_loop: poll error: {}", errno); + break; + } else if poll_result == 0 { + // Timeout, no data yet + continue; + } + + // Data available! Read it + debug!( + "writer_loop: DATA AVAILABLE! poll_result={}, revents=0x{:x}", + poll_result, pollfd.revents + ); + + let n = unsafe { libc::read(stdin_fd, buf.as_mut_ptr() as *mut libc::c_void, buf.len()) }; + + if n < 0 { + let errno = std::io::Error::last_os_error(); + debug!("writer_loop: stdin read error: {}", errno); + break; + } else if n == 0 { + debug!("writer_loop: EOF on stdin"); + break; + } else { + let n = n as usize; + total_written += n; + debug!( + "writer_loop: read {} bytes from stdin (total {})", + n, total_written + ); + if exec_proto::write_stdin(stream, &buf[..n]).is_err() { + debug!("writer_loop: write_stdin failed"); + break; + } + } + } + debug!("writer_loop: exiting, total written: {} bytes", total_written); +} diff --git a/src/utils.rs b/src/utils.rs index 1e50933e..ff411499 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -119,6 +119,8 @@ pub fn spawn_streaming( where F: Fn(&str, bool) + Send + Sync + Clone + 'static, { + // Stdin is null to prevent child from competing with parent for terminal input + cmd.stdin(Stdio::null()); cmd.stdout(Stdio::piped()); cmd.stderr(Stdio::piped()); diff --git a/tests/common/mod.rs b/tests/common/mod.rs index ee2fef02..7c6f343a 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -745,7 +745,7 @@ pub async fn exec_in_container(pid: u32, cmd: &[&str]) -> anyhow::Result let script = cmd.join(" "); let output = tokio::process::Command::new(&fcvm_path) - .args(["exec", "--pid", &pid.to_string(), "--", "sh", "-c", &script]) + .args(["exec", "--pid", &pid.to_string(), "sh", "-c", &script]) .output() .await?; diff --git a/tests/test_exec.rs b/tests/test_exec.rs index ee68f23c..91e7b900 100644 --- a/tests/test_exec.rs +++ b/tests/test_exec.rs @@ -360,7 +360,7 @@ async fn exec_test_impl(network: &str) -> Result<()> { { let pid_str = fcvm_pid.to_string(); let mut child = tokio::process::Command::new(&fcvm_path) - .args(["exec", "--pid", &pid_str, "--vm", "-i", "--", "head", "-1"]) + .args(["exec", "--pid", &pid_str, "--vm", "-i", "head", "-1"]) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .stderr(Stdio::piped()) @@ -391,7 +391,7 @@ async fn exec_test_impl(network: &str) -> Result<()> { { let pid_str = fcvm_pid.to_string(); let mut child = tokio::process::Command::new(&fcvm_path) - .args(["exec", "--pid", &pid_str, "-i", "--", "head", "-1"]) + .args(["exec", "--pid", &pid_str, "-i", "head", "-1"]) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .stderr(Stdio::piped()) @@ -483,6 +483,117 @@ async fn exec_test_impl(network: &str) -> Result<()> { ); println!(" ✓ -t without -i correctly ignores stdin (container)"); + // ====================================================================== + // Tests 25-28: Ctrl-C/Ctrl-D/Ctrl-Z via PTY (proper signal path) + // These tests send actual control characters through the PTY to verify + // the signal handling works correctly through the terminal layer. + // ====================================================================== + + // Test 25: Ctrl-C (0x03) via PTY interrupts sleep (VM) + // This tests the proper PTY signal path: we write 0x03 to PTY master, + // which gets interpreted by the PTY slave's line discipline as SIGINT + println!("\nTest 25: Ctrl-C (0x03) via PTY interrupts command (VM -it)"); + let (exit_code, _, output) = run_exec_with_pty_interrupt( + &fcvm_path, + fcvm_pid, + true, // in_vm + &["sh", "-c", "trap 'echo CAUGHT_SIGINT; exit 130' INT; echo READY; sleep 999"], + "READY", + 0x03, // Ctrl-C + ) + .await?; + println!(" exit code: {}", exit_code); + println!(" output: {:?}", output); + assert!( + output.contains("READY"), + "should see READY before interrupt, got: {:?}", + output + ); + assert!( + output.contains("CAUGHT_SIGINT"), + "trap should catch SIGINT, got: {:?}", + output + ); + assert_eq!(exit_code, 130, "exit code should be 130 (128 + SIGINT)"); + println!(" ✓ Ctrl-C via PTY works for VM exec"); + + // Test 26: Ctrl-C (0x03) via PTY interrupts command (container -it) + println!("\nTest 26: Ctrl-C (0x03) via PTY interrupts command (container -it)"); + let (exit_code, _, output) = run_exec_with_pty_interrupt( + &fcvm_path, + fcvm_pid, + false, // container + &["sh", "-c", "trap 'echo CAUGHT_SIGINT; exit 130' INT; echo READY; sleep 999"], + "READY", + 0x03, // Ctrl-C + ) + .await?; + println!(" exit code: {}", exit_code); + println!(" output: {:?}", output); + assert!( + output.contains("READY"), + "should see READY before interrupt, got: {:?}", + output + ); + assert!( + output.contains("CAUGHT_SIGINT"), + "trap should catch SIGINT, got: {:?}", + output + ); + assert_eq!(exit_code, 130, "exit code should be 130 (128 + SIGINT)"); + println!(" ✓ Ctrl-C via PTY works for container exec"); + + // Test 27: Ctrl-D (0x04) via PTY sends EOF (VM -it) + // Ctrl-D should close stdin, causing the process to see EOF + println!("\nTest 27: Ctrl-D (0x04) via PTY sends EOF (VM -it)"); + let (exit_code, _, output) = run_exec_with_pty_interrupt( + &fcvm_path, + fcvm_pid, + true, // in_vm + &["sh", "-c", "echo READY; cat; echo GOT_EOF"], + "READY", + 0x04, // Ctrl-D + ) + .await?; + println!(" exit code: {}", exit_code); + println!(" output: {:?}", output); + assert!( + output.contains("READY"), + "should see READY before EOF, got: {:?}", + output + ); + assert!( + output.contains("GOT_EOF"), + "cat should get EOF and script should continue, got: {:?}", + output + ); + println!(" ✓ Ctrl-D via PTY works for VM exec"); + + // Test 28: Ctrl-D (0x04) via PTY sends EOF (container -it) + println!("\nTest 28: Ctrl-D (0x04) via PTY sends EOF (container -it)"); + let (exit_code, _, output) = run_exec_with_pty_interrupt( + &fcvm_path, + fcvm_pid, + false, // container + &["sh", "-c", "echo READY; cat; echo GOT_EOF"], + "READY", + 0x04, // Ctrl-D + ) + .await?; + println!(" exit code: {}", exit_code); + println!(" output: {:?}", output); + assert!( + output.contains("READY"), + "should see READY before EOF, got: {:?}", + output + ); + assert!( + output.contains("GOT_EOF"), + "cat should get EOF and script should continue, got: {:?}", + output + ); + println!(" ✓ Ctrl-D via PTY works for container exec"); + // Cleanup println!("\nCleaning up..."); common::kill_process(fcvm_pid).await; @@ -870,6 +981,158 @@ async fn run_exec_with_pty( } } +/// Run fcvm exec with PTY and send a control character after seeing expected output +/// +/// This tests the proper PTY signal path: +/// - Ctrl-C (0x03) should be converted to SIGINT by PTY layer +/// - Ctrl-D (0x04) should be converted to EOF +/// - Ctrl-Z (0x1a) should be converted to SIGTSTP +/// +/// Uses -it mode (interactive + tty) so stdin is forwarded. +async fn run_exec_with_pty_interrupt( + fcvm_path: &std::path::Path, + pid: u32, + in_vm: bool, + cmd: &[&str], + wait_for: &str, + control_char: u8, +) -> Result<(i32, Duration, String)> { + use nix::pty::openpty; + use nix::unistd::{close, dup2, fork, ForkResult}; + use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd}; + + let pid_str = pid.to_string(); + let start = std::time::Instant::now(); + + // Allocate a PTY pair + let pty = openpty(None, None).context("opening PTY")?; + + // Configure PTY slave before forking + unsafe { + let slave_fd = pty.slave.as_raw_fd(); + let mut termios: libc::termios = std::mem::zeroed(); + if libc::tcgetattr(slave_fd, &mut termios) == 0 { + // Disable echo but KEEP ISIG enabled so control characters work + termios.c_lflag &= !(libc::ECHO | libc::ICANON); + // ISIG must be enabled for Ctrl-C → SIGINT conversion in PTY layer + libc::tcsetattr(slave_fd, libc::TCSANOW, &termios); + } + } + + let master_fd = pty.master.into_raw_fd(); + let slave_fd = pty.slave.into_raw_fd(); + + match unsafe { fork() }.context("forking")? { + ForkResult::Child => { + // Child: set up PTY slave as stdin/stdout/stderr + unsafe { + libc::setsid(); + libc::ioctl(slave_fd, libc::TIOCSCTTY as _, 0); + dup2(slave_fd, 0).ok(); + dup2(slave_fd, 1).ok(); + dup2(slave_fd, 2).ok(); + if slave_fd > 2 { + close(slave_fd).ok(); + } + close(master_fd).ok(); + } + + // Build command + use std::ffi::CString; + let prog = CString::new(fcvm_path.to_str().unwrap()).unwrap(); + let mut args: Vec = vec![ + CString::new(fcvm_path.to_str().unwrap()).unwrap(), + CString::new("exec").unwrap(), + CString::new("--pid").unwrap(), + CString::new(pid_str.as_str()).unwrap(), + CString::new("-it").unwrap(), // Always use -it for control char tests + ]; + if in_vm { + args.push(CString::new("--vm").unwrap()); + } + args.push(CString::new("--").unwrap()); + for c in cmd { + args.push(CString::new(*c).unwrap()); + } + + #[allow(unreachable_code)] + { + nix::unistd::execvp(&prog, &args).expect("execvp failed"); + std::process::exit(1); + } + } + ForkResult::Parent { child } => { + close(slave_fd).ok(); + let mut master = unsafe { std::fs::File::from_raw_fd(master_fd) }; + + // Set non-blocking + unsafe { + let flags = libc::fcntl(master_fd, libc::F_GETFL); + libc::fcntl(master_fd, libc::F_SETFL, flags | libc::O_NONBLOCK); + } + + let mut output = Vec::new(); + let mut buf = [0u8; 4096]; + let deadline = std::time::Instant::now() + Duration::from_secs(30); + let mut sent_control = false; + + loop { + if std::time::Instant::now() > deadline { + nix::sys::signal::kill(child, nix::sys::signal::Signal::SIGKILL).ok(); + anyhow::bail!("timeout waiting for command"); + } + + use std::io::Read; + match master.read(&mut buf) { + Ok(0) => break, + Ok(n) => { + output.extend_from_slice(&buf[..n]); + + // Check if we should send the control character + if !sent_control { + let output_str = String::from_utf8_lossy(&output); + if output_str.contains(wait_for) { + // Small delay to ensure command is in expected state + std::thread::sleep(Duration::from_millis(100)); + + // Send the control character through PTY + use std::io::Write; + master.write_all(&[control_char]).ok(); + master.flush().ok(); + sent_control = true; + } + } + } + Err(e) if e.kind() == std::io::ErrorKind::WouldBlock => { + // Check if child exited + match nix::sys::wait::waitpid( + child, + Some(nix::sys::wait::WaitPidFlag::WNOHANG), + ) { + Ok(nix::sys::wait::WaitStatus::Exited(_, _)) => break, + Ok(nix::sys::wait::WaitStatus::Signaled(_, _, _)) => break, + _ => std::thread::sleep(Duration::from_millis(50)), + } + } + Err(_) => break, + } + } + + // Wait for child + let status = nix::sys::wait::waitpid(child, None).context("waiting for child")?; + let exit_code = match status { + nix::sys::wait::WaitStatus::Exited(_, code) => code, + _ => -1, + }; + + let duration = start.elapsed(); + let output_str = String::from_utf8_lossy(&output).to_string(); + + Ok((exit_code, duration, output_str)) + } + } +} + /// Stress test: Run 100 parallel TTY execs to verify no race conditions /// /// This test verifies that the TTY stdin fix works under heavy parallel load. @@ -1010,3 +1273,311 @@ async fn test_exec_parallel_tty_stress() -> Result<()> { println!("✓ ALL {} PARALLEL TTY EXECS PASSED!", TOTAL_EXECS); Ok(()) } + +// ============================================================================ +// Tests for `fcvm podman run -it` (not exec, but container run with -i/-t) +// ============================================================================ + +/// Test `fcvm podman run -t` allocates TTY for container +/// +/// Uses `tty` command to verify TTY is allocated when -t flag is used. +#[tokio::test] +async fn test_podman_run_tty() -> Result<()> { + use nix::pty::openpty; + use nix::unistd::{close, dup2, fork, ForkResult}; + use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd}; + use std::time::Duration; + + println!("\nTest: fcvm podman run -t (TTY allocation)"); + println!("=========================================="); + + let fcvm_path = common::find_fcvm_binary()?; + let (vm_name, _, _, _) = common::unique_names("run-tty"); + + // Allocate PTY for test harness (since we don't have a real terminal) + let pty = openpty(None, None).context("opening PTY")?; + + // Disable echo before forking + unsafe { + let slave_fd = pty.slave.as_raw_fd(); + let mut termios: libc::termios = std::mem::zeroed(); + if libc::tcgetattr(slave_fd, &mut termios) == 0 { + termios.c_lflag &= !(libc::ECHO | libc::ICANON | libc::ISIG); + libc::tcsetattr(slave_fd, libc::TCSANOW, &termios); + } + } + + let master_fd = pty.master.into_raw_fd(); + let slave_fd = pty.slave.into_raw_fd(); + + // Fork to run fcvm with PTY + match unsafe { fork() }.context("forking")? { + ForkResult::Child => { + // Child: set up PTY and exec fcvm + unsafe { + libc::setsid(); + libc::ioctl(slave_fd, libc::TIOCSCTTY as _, 0); + dup2(slave_fd, 0).ok(); + dup2(slave_fd, 1).ok(); + dup2(slave_fd, 2).ok(); + if slave_fd > 2 { + close(slave_fd).ok(); + } + close(master_fd).ok(); + } + + // Exec fcvm podman run -t ... tty + use std::ffi::CString; + let prog = CString::new(fcvm_path.to_str().unwrap()).unwrap(); + let args = vec![ + CString::new(fcvm_path.to_str().unwrap()).unwrap(), + CString::new("podman").unwrap(), + CString::new("run").unwrap(), + CString::new("--name").unwrap(), + CString::new(vm_name.as_str()).unwrap(), + CString::new("-t").unwrap(), + CString::new("alpine:latest").unwrap(), + CString::new("tty").unwrap(), + ]; + #[allow(unreachable_code)] + { + nix::unistd::execvp(&prog, &args).expect("execvp failed"); + std::process::exit(1); + } + } + ForkResult::Parent { child } => { + close(slave_fd).ok(); + + let mut master = unsafe { std::fs::File::from_raw_fd(master_fd) }; + + // Read output with timeout + let mut output = Vec::new(); + let mut buf = [0u8; 4096]; + + // Set non-blocking + unsafe { + let flags = libc::fcntl(master_fd, libc::F_GETFL); + libc::fcntl(master_fd, libc::F_SETFL, flags | libc::O_NONBLOCK); + } + + let deadline = std::time::Instant::now() + Duration::from_secs(120); + loop { + if std::time::Instant::now() > deadline { + nix::sys::signal::kill(child, nix::sys::signal::Signal::SIGKILL).ok(); + break; + } + + use std::io::Read; + match master.read(&mut buf) { + Ok(0) => break, + Ok(n) => output.extend_from_slice(&buf[..n]), + Err(e) if e.kind() == std::io::ErrorKind::WouldBlock => { + match nix::sys::wait::waitpid( + child, + Some(nix::sys::wait::WaitPidFlag::WNOHANG), + ) { + Ok(nix::sys::wait::WaitStatus::Exited(_, _)) => break, + Ok(nix::sys::wait::WaitStatus::Signaled(_, _, _)) => break, + _ => std::thread::sleep(Duration::from_millis(50)), + } + } + Err(_) => break, + } + } + + // Wait for child + let _ = nix::sys::wait::waitpid(child, None); + + let output_str = String::from_utf8_lossy(&output); + println!(" Output: {:?}", output_str); + + // Should see /dev/pts/X + assert!( + output_str.contains("/dev/pts"), + "With -t flag, container should have TTY (/dev/pts), got: {:?}", + output_str + ); + println!("✓ fcvm podman run -t correctly allocates TTY"); + } + } + + Ok(()) +} + +/// Test `fcvm podman run -it` for interactive container with TTY +/// +/// Uses `head -1` to verify stdin is forwarded when -i flag is used. +#[tokio::test] +async fn test_podman_run_interactive_tty() -> Result<()> { + use nix::pty::openpty; + use nix::unistd::{close, dup2, fork, ForkResult}; + use std::io::Write; + use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd}; + use std::time::Duration; + + println!("\nTest: fcvm podman run -it (interactive + TTY)"); + println!("=============================================="); + + let fcvm_path = common::find_fcvm_binary()?; + let (vm_name, _, _, _) = common::unique_names("run-it"); + + // Allocate PTY for test harness + let pty = openpty(None, None).context("opening PTY")?; + + // Disable echo before forking + unsafe { + let slave_fd = pty.slave.as_raw_fd(); + let mut termios: libc::termios = std::mem::zeroed(); + if libc::tcgetattr(slave_fd, &mut termios) == 0 { + termios.c_lflag &= !(libc::ECHO | libc::ICANON | libc::ISIG); + libc::tcsetattr(slave_fd, libc::TCSANOW, &termios); + } + } + + let master_fd = pty.master.into_raw_fd(); + let slave_fd = pty.slave.into_raw_fd(); + + // Fork to run fcvm with PTY + match unsafe { fork() }.context("forking")? { + ForkResult::Child => { + // Child: set up PTY and exec fcvm + unsafe { + libc::setsid(); + libc::ioctl(slave_fd, libc::TIOCSCTTY as _, 0); + dup2(slave_fd, 0).ok(); + dup2(slave_fd, 1).ok(); + dup2(slave_fd, 2).ok(); + if slave_fd > 2 { + close(slave_fd).ok(); + } + close(master_fd).ok(); + } + + // Exec fcvm podman run -it ... head -1 + use std::ffi::CString; + let prog = CString::new(fcvm_path.to_str().unwrap()).unwrap(); + let args = vec![ + CString::new(fcvm_path.to_str().unwrap()).unwrap(), + CString::new("podman").unwrap(), + CString::new("run").unwrap(), + CString::new("--name").unwrap(), + CString::new(vm_name.as_str()).unwrap(), + CString::new("-it").unwrap(), + CString::new("alpine:latest").unwrap(), + CString::new("head").unwrap(), + CString::new("-1").unwrap(), + ]; + #[allow(unreachable_code)] + { + nix::unistd::execvp(&prog, &args).expect("execvp failed"); + unreachable!(); + } + } + ForkResult::Parent { child } => { + close(slave_fd).ok(); + + let mut master = unsafe { std::fs::File::from_raw_fd(master_fd) }; + + // Wait for container to be ready + std::thread::sleep(Duration::from_secs(10)); + + // Write input to container + master + .write_all(b"hello-from-stdin\n") + .context("writing stdin")?; + master.flush()?; + + // Read output with timeout + let mut output = Vec::new(); + let mut buf = [0u8; 4096]; + + // Set non-blocking + unsafe { + let flags = libc::fcntl(master_fd, libc::F_GETFL); + libc::fcntl(master_fd, libc::F_SETFL, flags | libc::O_NONBLOCK); + } + + let deadline = std::time::Instant::now() + Duration::from_secs(120); + loop { + if std::time::Instant::now() > deadline { + nix::sys::signal::kill(child, nix::sys::signal::Signal::SIGKILL).ok(); + break; + } + + use std::io::Read; + match master.read(&mut buf) { + Ok(0) => break, + Ok(n) => output.extend_from_slice(&buf[..n]), + Err(e) if e.kind() == std::io::ErrorKind::WouldBlock => { + match nix::sys::wait::waitpid( + child, + Some(nix::sys::wait::WaitPidFlag::WNOHANG), + ) { + Ok(nix::sys::wait::WaitStatus::Exited(_, _)) => break, + Ok(nix::sys::wait::WaitStatus::Signaled(_, _, _)) => break, + _ => std::thread::sleep(Duration::from_millis(50)), + } + } + Err(_) => break, + } + } + + // Wait for child + let _ = nix::sys::wait::waitpid(child, None); + + let output_str = String::from_utf8_lossy(&output); + println!(" Output: {:?}", output_str); + + // Should see our input echoed back + assert!( + output_str.contains("hello-from-stdin"), + "With -it flags, stdin should be forwarded, got: {:?}", + output_str + ); + println!("✓ fcvm podman run -it correctly forwards stdin"); + } + } + + Ok(()) +} + +/// Test `fcvm podman run` without -t flag (no TTY) +/// +/// Verifies that without -t, container does NOT have a TTY. +#[tokio::test] +async fn test_podman_run_no_tty() -> Result<()> { + use std::time::Duration; + + println!("\nTest: fcvm podman run (no -t = no TTY)"); + println!("======================================"); + + let fcvm_path = common::find_fcvm_binary()?; + let (vm_name, _, _, _) = common::unique_names("run-no-tty"); + + // Run without -t, check tty command output + let output = tokio::time::timeout( + Duration::from_secs(120), + tokio::process::Command::new(&fcvm_path) + .args(["podman", "run", "--name", &vm_name, "alpine:latest", "tty"]) + .output(), + ) + .await + .context("timeout")? + .context("running fcvm")?; + + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + let combined = format!("{}{}", stdout, stderr); + + println!(" Output: {:?}", combined.trim()); + + // Without -t, should say "not a tty" + assert!( + combined.contains("not a tty") || combined.contains("not a terminal"), + "Without -t flag, container should NOT have TTY, got: {:?}", + combined + ); + println!("✓ fcvm podman run (no -t) correctly has no TTY"); + + Ok(()) +} diff --git a/tests/test_kvm.rs b/tests/test_kvm.rs index 474bf482..fd642641 100644 --- a/tests/test_kvm.rs +++ b/tests/test_kvm.rs @@ -318,7 +318,7 @@ async fn test_nested_run_fcvm_inside_vm() -> Result<()> { // First, check kernel config and dmesg for KVM-related messages let debug_output = tokio::process::Command::new(&fcvm_path) .args([ - "exec", "--pid", &outer_pid.to_string(), "--vm", "--", + "exec", "--pid", &outer_pid.to_string(), "--vm", "sh", "-c", r#" echo "=== Kernel config (KVM/VIRTUALIZATION) ===" zcat /proc/config.gz 2>/dev/null | grep -E "^CONFIG_(KVM|VIRTUALIZATION)" || echo "config.gz not available" From 84a33741203d58441b92e8bb388bd2a863ab39f9 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Mon, 5 Jan 2026 19:30:34 +0000 Subject: [PATCH 2/6] Add guidance to fetch before investigating branches --- .claude/CLAUDE.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 467d6259..b78c8c1b 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -27,6 +27,12 @@ gh pr view 2 --json baseRefName ## UNDERSTAND BRANCH CHAINS +**ALWAYS fetch before investigating branches:** +```bash +git fetch origin +``` +Branches may already be merged on remote. Don't waste time on stale local state. + **Run before starting work, committing, or opening PRs:** ```bash From 0b46783817b10b3e0cb937cca0003e92295ead15 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Mon, 5 Jan 2026 19:35:08 +0000 Subject: [PATCH 3/6] Fix clippy needless_return and cargo fmt issues - Remove needless return statement in fc-agent exec TTY handler - Fix formatting in tty.rs and test_exec.rs --- fc-agent/src/main.rs | 3 +-- src/commands/tty.rs | 11 +++++++++-- tests/test_exec.rs | 14 +++++++++++--- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/fc-agent/src/main.rs b/fc-agent/src/main.rs index f6114546..4ab712d5 100644 --- a/fc-agent/src/main.rs +++ b/fc-agent/src/main.rs @@ -858,9 +858,8 @@ fn handle_exec_connection_blocking(fd: i32) { // Use unified TTY handler // NOTE: Don't call std::process::exit() here - that would kill the entire fc-agent! // run_with_pty_fd already sends the exit code via exec_proto and closes the fd. - // We just return and let the spawn_blocking task end naturally. + // We just let the spawn_blocking task end naturally. let _exit_code = tty::run_with_pty_fd(fd, &command, request.tty, request.interactive); - return; } else { handle_exec_pipe(fd, &request); } diff --git a/src/commands/tty.rs b/src/commands/tty.rs index c0b8c480..b0ab5d75 100644 --- a/src/commands/tty.rs +++ b/src/commands/tty.rs @@ -164,7 +164,11 @@ fn reader_loop(mut stream: std::os::unix::net::UnixStream, done: Arc match exec_proto::Message::read_from(&mut stream) { Ok(exec_proto::Message::Data(data)) => { total_read += data.len(); - debug!("reader_loop: received {} bytes (total {})", data.len(), total_read); + debug!( + "reader_loop: received {} bytes (total {})", + data.len(), + total_read + ); let _ = stdout.write_all(&data); let _ = stdout.flush(); } @@ -273,5 +277,8 @@ fn writer_loop(stream: &mut std::os::unix::net::UnixStream, done: Arc Result<()> { let (exit_code, _, output) = run_exec_with_pty_interrupt( &fcvm_path, fcvm_pid, - true, // in_vm - &["sh", "-c", "trap 'echo CAUGHT_SIGINT; exit 130' INT; echo READY; sleep 999"], + true, // in_vm + &[ + "sh", + "-c", + "trap 'echo CAUGHT_SIGINT; exit 130' INT; echo READY; sleep 999", + ], "READY", 0x03, // Ctrl-C ) @@ -523,7 +527,11 @@ async fn exec_test_impl(network: &str) -> Result<()> { &fcvm_path, fcvm_pid, false, // container - &["sh", "-c", "trap 'echo CAUGHT_SIGINT; exit 130' INT; echo READY; sleep 999"], + &[ + "sh", + "-c", + "trap 'echo CAUGHT_SIGINT; exit 130' INT; echo READY; sleep 999", + ], "READY", 0x03, // Ctrl-C ) From 46ee4e31a2dc5c836b3c7ddab180c43316586955 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Mon, 5 Jan 2026 19:59:06 +0000 Subject: [PATCH 4/6] Address TTY review feedback: signal handling, resource leaks, DoS Fixes from Claude review (PR #108 comment): Host-side (src/commands/tty.rs): - Add signal handlers (SIGTERM, SIGQUIT, SIGHUP) to restore terminal on abnormal exit - prevents terminal corruption - Join writer thread properly instead of drop() - prevents zombie threads - Log warning on tcsetattr failure instead of silent ignore - Return exit code 1 on protocol errors/EOF instead of 0 Guest-side (fc-agent/src/tty.rs): - Fix FD leaks on vsock.try_clone() failure (now closes master_fd or stdout_read/stdin_write depending on mode) - Fix FD leak on dup() failure (now closes master_fd) - Check waitpid return value and log error on failure Protocol (exec-proto/src/lib.rs): - Reduce max message size from 16MB to 1MB (sufficient for TTY) - Read payload progressively in 8KB chunks instead of allocating full buffer upfront - prevents memory exhaustion on disconnect Tested: cargo fmt --check && cargo clippy --all-targets -D warnings --- exec-proto/src/lib.rs | 26 ++++++++++---- fc-agent/src/tty.rs | 21 +++++++++-- src/commands/tty.rs | 84 ++++++++++++++++++++++++++++++++++++++----- 3 files changed, 114 insertions(+), 17 deletions(-) diff --git a/exec-proto/src/lib.rs b/exec-proto/src/lib.rs index a62e337b..cd33397f 100644 --- a/exec-proto/src/lib.rs +++ b/exec-proto/src/lib.rs @@ -100,17 +100,31 @@ impl Message { reader.read_exact(&mut len_buf)?; let len = u32::from_be_bytes(len_buf) as usize; - // Sanity check: limit message size to 16MB - if len > 16 * 1024 * 1024 { + // Sanity check: limit message size to 1MB (plenty for TTY data) + // This prevents DoS via large length values + const MAX_MESSAGE_SIZE: usize = 1024 * 1024; + if len > MAX_MESSAGE_SIZE { return Err(io::Error::new( io::ErrorKind::InvalidData, - format!("message too large: {} bytes", len), + format!( + "message too large: {} bytes (max {})", + len, MAX_MESSAGE_SIZE + ), )); } - // Read payload - let mut payload = vec![0u8; len]; - reader.read_exact(&mut payload)?; + // Read payload progressively to avoid allocating large buffers upfront + // This prevents memory exhaustion if sender disconnects mid-transfer + let mut payload = Vec::with_capacity(len.min(64 * 1024)); // Start with at most 64KB + let mut remaining = len; + let mut chunk = [0u8; 8192]; // Read in 8KB chunks + + while remaining > 0 { + let to_read = remaining.min(chunk.len()); + reader.read_exact(&mut chunk[..to_read])?; + payload.extend_from_slice(&chunk[..to_read]); + remaining -= to_read; + } // Parse based on type match msg_type { diff --git a/fc-agent/src/tty.rs b/fc-agent/src/tty.rs index 6027df73..f0f65ad9 100644 --- a/fc-agent/src/tty.rs +++ b/fc-agent/src/tty.rs @@ -113,6 +113,15 @@ pub fn run_with_pty_fd(vsock_fd: i32, command: &[String], tty: bool, interactive Ok(f) => f, Err(e) => { eprintln!("[fc-agent] tty: failed to clone vsock: {}", e); + // Clean up FDs before returning + if tty { + unsafe { libc::close(master_fd) }; + } else { + unsafe { + libc::close(stdout_read); + libc::close(stdin_write); + } + } unsafe { libc::kill(pid, libc::SIGKILL); libc::waitpid(pid, std::ptr::null_mut(), 0); @@ -127,6 +136,8 @@ pub fn run_with_pty_fd(vsock_fd: i32, command: &[String], tty: bool, interactive let dup_fd = unsafe { libc::dup(master_fd) }; if dup_fd < 0 { eprintln!("[fc-agent] tty: failed to dup master fd"); + // Clean up master_fd before returning + unsafe { libc::close(master_fd) }; unsafe { libc::kill(pid, libc::SIGKILL); libc::waitpid(pid, std::ptr::null_mut(), 0); @@ -395,8 +406,14 @@ fn reader_loop(mut source: std::fs::File, vsock: &mut std::fs::File, child_pid: // Wait for child and get exit code let mut status: libc::c_int = 0; - unsafe { - libc::waitpid(child_pid, &mut status, 0); + let ret = unsafe { libc::waitpid(child_pid, &mut status, 0) }; + + if ret < 0 { + eprintln!( + "[fc-agent] tty: waitpid failed: {}", + std::io::Error::last_os_error() + ); + return 1; } if libc::WIFEXITED(status) { diff --git a/src/commands/tty.rs b/src/commands/tty.rs index b0ab5d75..0ec41f7c 100644 --- a/src/commands/tty.rs +++ b/src/commands/tty.rs @@ -7,10 +7,49 @@ use std::io::Write; use std::os::unix::io::AsRawFd; use std::os::unix::net::{UnixListener, UnixStream}; use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use anyhow::{bail, Context, Result}; -use tracing::{debug, info}; +use tracing::{debug, info, warn}; + +/// Global storage for terminal restoration on signal +/// Stores (stdin_fd, original_termios) when raw mode is active +static ORIG_TERMIOS: Mutex> = Mutex::new(None); + +/// Global flag to track if signal handlers are installed +static SIGNAL_HANDLERS_INSTALLED: AtomicBool = AtomicBool::new(false); + +/// Install signal handlers for terminal restoration +fn install_signal_handlers() { + if SIGNAL_HANDLERS_INSTALLED.swap(true, Ordering::SeqCst) { + return; // Already installed + } + + // Handler that restores terminal and re-raises the signal + extern "C" fn signal_handler(sig: libc::c_int) { + // Restore terminal if we have saved state + if let Ok(guard) = ORIG_TERMIOS.lock() { + if let Some((fd, termios)) = *guard { + unsafe { + libc::tcsetattr(fd, libc::TCSANOW, &termios); + } + } + } + // Re-raise the signal with default handler + unsafe { + libc::signal(sig, libc::SIG_DFL); + libc::raise(sig); + } + } + + unsafe { + // Install handlers for common termination signals + libc::signal(libc::SIGTERM, signal_handler as libc::sighandler_t); + libc::signal(libc::SIGQUIT, signal_handler as libc::sighandler_t); + libc::signal(libc::SIGHUP, signal_handler as libc::sighandler_t); + // Note: SIGINT (Ctrl+C) is passed through to guest in raw mode + } +} /// Run a TTY session by listening on a Unix socket. /// @@ -103,8 +142,10 @@ pub fn run_tty_session_connected(stream: UnixStream, tty: bool, interactive: boo restore_terminal(stdin_fd, termios); } - // Clean up writer thread handle - drop(writer_thread); + // Join writer thread to avoid zombie threads + if let Some(handle) = writer_thread { + let _ = handle.join(); + } Ok(exit_code) } @@ -116,6 +157,9 @@ fn setup_raw_terminal(stdin_fd: i32) -> Result> { bail!("TTY mode requires a terminal. Use without -t for non-interactive mode."); } + // Install signal handlers for terminal restoration before modifying terminal + install_signal_handlers(); + // Ensure stdin is in blocking mode (tokio may have set it non-blocking) unsafe { let flags = libc::fcntl(stdin_fd, libc::F_GETFL); @@ -132,11 +176,20 @@ fn setup_raw_terminal(stdin_fd: i32) -> Result> { } let orig = termios; + // Store in global for signal handler access + if let Ok(mut guard) = ORIG_TERMIOS.lock() { + *guard = Some((stdin_fd, orig)); + } + // Set raw mode unsafe { libc::cfmakeraw(&mut termios); } if unsafe { libc::tcsetattr(stdin_fd, libc::TCSANOW, &termios) } != 0 { + // Clear global on failure + if let Ok(mut guard) = ORIG_TERMIOS.lock() { + *guard = None; + } bail!("Failed to set raw terminal mode"); } @@ -145,8 +198,17 @@ fn setup_raw_terminal(stdin_fd: i32) -> Result> { /// Restore terminal to original settings fn restore_terminal(stdin_fd: i32, termios: libc::termios) { - unsafe { - libc::tcsetattr(stdin_fd, libc::TCSANOW, &termios); + // Clear global first (signal handler won't need to restore anymore) + if let Ok(mut guard) = ORIG_TERMIOS.lock() { + *guard = None; + } + + let ret = unsafe { libc::tcsetattr(stdin_fd, libc::TCSANOW, &termios) }; + if ret != 0 { + warn!( + "Failed to restore terminal settings: {}", + std::io::Error::last_os_error() + ); } } @@ -190,15 +252,19 @@ fn reader_loop(mut stream: std::os::unix::net::UnixStream, done: Arc Err(e) => { if e.kind() == std::io::ErrorKind::UnexpectedEof { debug!("reader_loop: EOF"); - break; + // EOF without Exit message - return error + done.store(true, Ordering::Relaxed); + return Some(1); } debug!("reader_loop: error: {}", e); eprintln!("\r\nProtocol error: {}\r", e); - break; + done.store(true, Ordering::Relaxed); + return Some(1); } } } - None + // Should not reach here normally - return error if we do + Some(1) } /// Writer loop: read from stdin, send STDIN messages to socket From 7242d63c251f23ab73d6417b595c150147a2c186 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Mon, 5 Jan 2026 20:03:52 +0000 Subject: [PATCH 5/6] Fix signal handler async-signal-safety issues Address review feedback from second Claude review: 1. Replace Mutex with static mut + atomic flag - Mutex::lock() is not async-signal-safe per POSIX - Could deadlock if signal arrives while mutex is held - Now uses static mut ORIG_FD/ORIG_TERMIOS_STORAGE with TERMIOS_SAVED atomic flag for synchronization - Uses std::ptr::addr_of!() to avoid reference to mutable static 2. Replace signal() with sigaction() - signal() has undefined behavior in multithreaded programs - sigaction() provides well-defined POSIX semantics - Use SA_RESETHAND flag for auto-reset to SIG_DFL 3. Improve reader_loop comment - Clarify when post-loop code is reached (done flag set externally) - Add debug log for this case Tested: cargo fmt --check && cargo clippy --all-targets -D warnings --- src/commands/tty.rs | 71 ++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/src/commands/tty.rs b/src/commands/tty.rs index 0ec41f7c..7d9e0650 100644 --- a/src/commands/tty.rs +++ b/src/commands/tty.rs @@ -7,14 +7,18 @@ use std::io::Write; use std::os::unix::io::AsRawFd; use std::os::unix::net::{UnixListener, UnixStream}; use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use anyhow::{bail, Context, Result}; use tracing::{debug, info, warn}; -/// Global storage for terminal restoration on signal -/// Stores (stdin_fd, original_termios) when raw mode is active -static ORIG_TERMIOS: Mutex> = Mutex::new(None); +/// Global storage for terminal restoration on signal (async-signal-safe) +/// Using static mut with atomic flag instead of Mutex to be signal-safe. +/// SAFETY: Only written while TERMIOS_SAVED is false, only read in signal handler +/// after TERMIOS_SAVED is true. The main code path ensures proper synchronization. +static mut ORIG_FD: i32 = -1; +static mut ORIG_TERMIOS_STORAGE: libc::termios = unsafe { std::mem::zeroed() }; +static TERMIOS_SAVED: AtomicBool = AtomicBool::new(false); /// Global flag to track if signal handlers are installed static SIGNAL_HANDLERS_INSTALLED: AtomicBool = AtomicBool::new(false); @@ -25,28 +29,34 @@ fn install_signal_handlers() { return; // Already installed } - // Handler that restores terminal and re-raises the signal - extern "C" fn signal_handler(sig: libc::c_int) { + // Handler that restores terminal - must be async-signal-safe + // Only calls tcsetattr (async-signal-safe per POSIX) + extern "C" fn signal_handler(_sig: libc::c_int) { // Restore terminal if we have saved state - if let Ok(guard) = ORIG_TERMIOS.lock() { - if let Some((fd, termios)) = *guard { - unsafe { - libc::tcsetattr(fd, libc::TCSANOW, &termios); - } + // Using Relaxed is fine - we just need to see a consistent value + if TERMIOS_SAVED.load(Ordering::Relaxed) { + unsafe { + // Use raw pointer to avoid creating a reference to mutable static + libc::tcsetattr( + ORIG_FD, + libc::TCSANOW, + std::ptr::addr_of!(ORIG_TERMIOS_STORAGE), + ); } } - // Re-raise the signal with default handler - unsafe { - libc::signal(sig, libc::SIG_DFL); - libc::raise(sig); - } + // SA_RESETHAND auto-resets handler to SIG_DFL, so signal will terminate process } unsafe { - // Install handlers for common termination signals - libc::signal(libc::SIGTERM, signal_handler as libc::sighandler_t); - libc::signal(libc::SIGQUIT, signal_handler as libc::sighandler_t); - libc::signal(libc::SIGHUP, signal_handler as libc::sighandler_t); + // Use sigaction() instead of signal() for well-defined behavior + // SA_RESETHAND: auto-reset to SIG_DFL after first invocation (no need to re-raise) + let mut sa: libc::sigaction = std::mem::zeroed(); + sa.sa_sigaction = signal_handler as usize; + sa.sa_flags = libc::SA_RESETHAND; + + libc::sigaction(libc::SIGTERM, &sa, std::ptr::null_mut()); + libc::sigaction(libc::SIGQUIT, &sa, std::ptr::null_mut()); + libc::sigaction(libc::SIGHUP, &sa, std::ptr::null_mut()); // Note: SIGINT (Ctrl+C) is passed through to guest in raw mode } } @@ -176,10 +186,14 @@ fn setup_raw_terminal(stdin_fd: i32) -> Result> { } let orig = termios; - // Store in global for signal handler access - if let Ok(mut guard) = ORIG_TERMIOS.lock() { - *guard = Some((stdin_fd, orig)); + // Store in global for signal handler access (async-signal-safe approach) + // SAFETY: We only write while TERMIOS_SAVED is false, ensuring no concurrent read + unsafe { + ORIG_FD = stdin_fd; + ORIG_TERMIOS_STORAGE = orig; } + // Memory barrier: ensure writes above are visible before setting flag + TERMIOS_SAVED.store(true, Ordering::Release); // Set raw mode unsafe { @@ -187,9 +201,7 @@ fn setup_raw_terminal(stdin_fd: i32) -> Result> { } if unsafe { libc::tcsetattr(stdin_fd, libc::TCSANOW, &termios) } != 0 { // Clear global on failure - if let Ok(mut guard) = ORIG_TERMIOS.lock() { - *guard = None; - } + TERMIOS_SAVED.store(false, Ordering::Release); bail!("Failed to set raw terminal mode"); } @@ -199,9 +211,7 @@ fn setup_raw_terminal(stdin_fd: i32) -> Result> { /// Restore terminal to original settings fn restore_terminal(stdin_fd: i32, termios: libc::termios) { // Clear global first (signal handler won't need to restore anymore) - if let Ok(mut guard) = ORIG_TERMIOS.lock() { - *guard = None; - } + TERMIOS_SAVED.store(false, Ordering::Release); let ret = unsafe { libc::tcsetattr(stdin_fd, libc::TCSANOW, &termios) }; if ret != 0 { @@ -263,7 +273,8 @@ fn reader_loop(mut stream: std::os::unix::net::UnixStream, done: Arc } } } - // Should not reach here normally - return error if we do + // Reached when done flag set externally (e.g., writer thread error) without Exit message + debug!("reader_loop: exiting due to done flag without Exit message"); Some(1) } From 34e2f75410760e29713a47af6c1956cd9db760d1 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Mon, 5 Jan 2026 20:07:14 +0000 Subject: [PATCH 6/6] Refactor TTY signal handling to use nix crate Replace raw libc calls with nix crate for cleaner, more idiomatic Rust: - Use nix::sys::signal::{sigaction, SigAction, SigHandler, Signal} instead of raw libc::sigaction - Use nix::sys::termios::{Termios, tcgetattr, tcsetattr, cfmakeraw} instead of raw libc termios calls - Use nix::unistd::isatty instead of libc::isatty - Use nix::fcntl for O_NONBLOCK handling The signal handler still uses unsafe for accessing static mut, but the rest of the code is now using safe Rust wrappers. The async-signal-safety properties are preserved - tcsetattr is still async-signal-safe per POSIX. Tested: cargo fmt --check && cargo clippy --all-targets -D warnings --- src/commands/tty.rs | 112 +++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 54 deletions(-) diff --git a/src/commands/tty.rs b/src/commands/tty.rs index 7d9e0650..94f633d0 100644 --- a/src/commands/tty.rs +++ b/src/commands/tty.rs @@ -4,20 +4,22 @@ //! used by both `podman run -it` and `exec -it` paths. use std::io::Write; -use std::os::unix::io::AsRawFd; +use std::os::unix::io::{AsRawFd, BorrowedFd}; use std::os::unix::net::{UnixListener, UnixStream}; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicI32, Ordering}; use std::sync::Arc; use anyhow::{bail, Context, Result}; +use nix::sys::signal::{sigaction, SaFlags, SigAction, SigHandler, SigSet, Signal}; +use nix::sys::termios::{self, SetArg, Termios}; use tracing::{debug, info, warn}; /// Global storage for terminal restoration on signal (async-signal-safe) -/// Using static mut with atomic flag instead of Mutex to be signal-safe. -/// SAFETY: Only written while TERMIOS_SAVED is false, only read in signal handler -/// after TERMIOS_SAVED is true. The main code path ensures proper synchronization. -static mut ORIG_FD: i32 = -1; -static mut ORIG_TERMIOS_STORAGE: libc::termios = unsafe { std::mem::zeroed() }; +/// Using atomics and static mut with careful synchronization to be signal-safe. +/// SAFETY: ORIG_TERMIOS is only written while TERMIOS_SAVED is false, +/// only read in signal handler after TERMIOS_SAVED is true. +static ORIG_FD: AtomicI32 = AtomicI32::new(-1); +static mut ORIG_TERMIOS: Option = None; static TERMIOS_SAVED: AtomicBool = AtomicBool::new(false); /// Global flag to track if signal handlers are installed @@ -30,35 +32,39 @@ fn install_signal_handlers() { } // Handler that restores terminal - must be async-signal-safe - // Only calls tcsetattr (async-signal-safe per POSIX) extern "C" fn signal_handler(_sig: libc::c_int) { - // Restore terminal if we have saved state - // Using Relaxed is fine - we just need to see a consistent value - if TERMIOS_SAVED.load(Ordering::Relaxed) { - unsafe { - // Use raw pointer to avoid creating a reference to mutable static - libc::tcsetattr( - ORIG_FD, - libc::TCSANOW, - std::ptr::addr_of!(ORIG_TERMIOS_STORAGE), - ); + if TERMIOS_SAVED.load(Ordering::Acquire) { + let fd = ORIG_FD.load(Ordering::Acquire); + if fd >= 0 { + // SAFETY: We only read ORIG_TERMIOS after TERMIOS_SAVED is true, + // and it's only written before TERMIOS_SAVED becomes true. + unsafe { + if let Some(ref termios) = ORIG_TERMIOS { + // tcsetattr is async-signal-safe per POSIX + let _ = termios::tcsetattr( + BorrowedFd::borrow_raw(fd), + SetArg::TCSANOW, + termios, + ); + } + } } } // SA_RESETHAND auto-resets handler to SIG_DFL, so signal will terminate process } + // Use sigaction with SA_RESETHAND for well-defined behavior + let handler = SigHandler::Handler(signal_handler); + let action = SigAction::new(handler, SaFlags::SA_RESETHAND, SigSet::empty()); + + // Install handlers for termination signals + // SAFETY: signal_handler is async-signal-safe unsafe { - // Use sigaction() instead of signal() for well-defined behavior - // SA_RESETHAND: auto-reset to SIG_DFL after first invocation (no need to re-raise) - let mut sa: libc::sigaction = std::mem::zeroed(); - sa.sa_sigaction = signal_handler as usize; - sa.sa_flags = libc::SA_RESETHAND; - - libc::sigaction(libc::SIGTERM, &sa, std::ptr::null_mut()); - libc::sigaction(libc::SIGQUIT, &sa, std::ptr::null_mut()); - libc::sigaction(libc::SIGHUP, &sa, std::ptr::null_mut()); - // Note: SIGINT (Ctrl+C) is passed through to guest in raw mode + let _ = sigaction(Signal::SIGTERM, &action); + let _ = sigaction(Signal::SIGQUIT, &action); + let _ = sigaction(Signal::SIGHUP, &action); } + // Note: SIGINT (Ctrl+C) is passed through to guest in raw mode } /// Run a TTY session by listening on a Unix socket. @@ -161,9 +167,11 @@ pub fn run_tty_session_connected(stream: UnixStream, tty: bool, interactive: boo } /// Set up raw terminal mode -fn setup_raw_terminal(stdin_fd: i32) -> Result> { - let is_tty = unsafe { libc::isatty(stdin_fd) == 1 }; - if !is_tty { +fn setup_raw_terminal(stdin_fd: i32) -> Result> { + // SAFETY: stdin_fd is valid for the duration of this call + let fd = unsafe { BorrowedFd::borrow_raw(stdin_fd) }; + + if !nix::unistd::isatty(stdin_fd).unwrap_or(false) { bail!("TTY mode requires a terminal. Use without -t for non-interactive mode."); } @@ -171,54 +179,50 @@ fn setup_raw_terminal(stdin_fd: i32) -> Result> { install_signal_handlers(); // Ensure stdin is in blocking mode (tokio may have set it non-blocking) - unsafe { - let flags = libc::fcntl(stdin_fd, libc::F_GETFL); - if flags != -1 && (flags & libc::O_NONBLOCK) != 0 { - libc::fcntl(stdin_fd, libc::F_SETFL, flags & !libc::O_NONBLOCK); + if let Ok(flags) = nix::fcntl::fcntl(stdin_fd, nix::fcntl::FcntlArg::F_GETFL) { + let oflags = nix::fcntl::OFlag::from_bits_truncate(flags); + if oflags.contains(nix::fcntl::OFlag::O_NONBLOCK) { + let new_flags = oflags & !nix::fcntl::OFlag::O_NONBLOCK; + let _ = nix::fcntl::fcntl(stdin_fd, nix::fcntl::FcntlArg::F_SETFL(new_flags)); debug!("setup_raw_terminal: cleared O_NONBLOCK from stdin"); } } // Save original terminal settings - let mut termios: libc::termios = unsafe { std::mem::zeroed() }; - if unsafe { libc::tcgetattr(stdin_fd, &mut termios) } != 0 { - bail!("Failed to get terminal attributes"); - } - let orig = termios; + let orig = termios::tcgetattr(fd).context("Failed to get terminal attributes")?; // Store in global for signal handler access (async-signal-safe approach) // SAFETY: We only write while TERMIOS_SAVED is false, ensuring no concurrent read + ORIG_FD.store(stdin_fd, Ordering::Release); unsafe { - ORIG_FD = stdin_fd; - ORIG_TERMIOS_STORAGE = orig; + ORIG_TERMIOS = Some(orig.clone()); } // Memory barrier: ensure writes above are visible before setting flag TERMIOS_SAVED.store(true, Ordering::Release); // Set raw mode - unsafe { - libc::cfmakeraw(&mut termios); - } - if unsafe { libc::tcsetattr(stdin_fd, libc::TCSANOW, &termios) } != 0 { + let mut raw = orig.clone(); + termios::cfmakeraw(&mut raw); + + if let Err(e) = termios::tcsetattr(fd, SetArg::TCSANOW, &raw) { // Clear global on failure TERMIOS_SAVED.store(false, Ordering::Release); - bail!("Failed to set raw terminal mode"); + bail!("Failed to set raw terminal mode: {}", e); } Ok(Some(orig)) } /// Restore terminal to original settings -fn restore_terminal(stdin_fd: i32, termios: libc::termios) { +fn restore_terminal(stdin_fd: i32, orig_termios: Termios) { // Clear global first (signal handler won't need to restore anymore) TERMIOS_SAVED.store(false, Ordering::Release); - let ret = unsafe { libc::tcsetattr(stdin_fd, libc::TCSANOW, &termios) }; - if ret != 0 { - warn!( - "Failed to restore terminal settings: {}", - std::io::Error::last_os_error() - ); + // SAFETY: stdin_fd is valid for the duration of this call + let fd = unsafe { BorrowedFd::borrow_raw(stdin_fd) }; + + if let Err(e) = termios::tcsetattr(fd, SetArg::TCSANOW, &orig_termios) { + warn!("Failed to restore terminal settings: {}", e); } }