From d09b2d8a3bb2d81040d6ff18ff7ac02479a1fcec Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Tue, 23 Dec 2025 19:51:26 +0000 Subject: [PATCH 01/13] plan --- ...ec2-style_full_vm_support_caf1dc20.plan.md | 219 +++++++++ .../systemd_vm_with_go_init_c0a9c010.plan.md | 436 ++++++++++++++++++ 2 files changed, 655 insertions(+) create mode 100644 .cursor/plans/ec2-style_full_vm_support_caf1dc20.plan.md create mode 100644 .cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md diff --git a/.cursor/plans/ec2-style_full_vm_support_caf1dc20.plan.md b/.cursor/plans/ec2-style_full_vm_support_caf1dc20.plan.md new file mode 100644 index 0000000..959df3a --- /dev/null +++ b/.cursor/plans/ec2-style_full_vm_support_caf1dc20.plan.md @@ -0,0 +1,219 @@ +--- +name: EC2-Style Full VM Support +overview: Add a new "full VM" mode alongside the existing container-exec mode, allowing users to boot ISOs/cloud images or systemd-based OCI images with proper PID 1 (systemd), while still running the hypeman guest-agent for exec/shell capabilities. +todos: + - id: phase1-mode-flag + content: Add --mode full flag to CLI and mode field to API + status: pending + - id: phase1-config-disk + content: Add FULL_VM_MODE to config disk generation + status: pending + - id: phase1-init-script + content: Modify init script with pivot_root path for systemd handoff + status: pending + - id: phase1-agent-service + content: Create systemd service unit for guest-agent injection + status: pending + - id: phase2-uefi-firmware + content: Download and bundle hypervisor-fw for UEFI boot + status: pending + - id: phase2-cloud-image + content: Add cloud image download and conversion support + status: pending + - id: phase2-cloudinit-disk + content: Implement cloud-init NoCloud datasource disk generation + status: pending + - id: phase3-imds + content: Implement EC2-compatible IMDS in guest-agent + status: pending +--- + +# EC2-Style Full VM Support for Hypeman + +## Current Architecture + +```mermaid +flowchart LR + subgraph boot [Current Boot Chain] + CH[Cloud Hypervisor] --> Kernel[Direct Kernel Boot] + Kernel --> Initrd[Custom Initrd] + Initrd --> InitScript[Shell Init PID 1] + InitScript --> GuestAgent[guest-agent] + InitScript --> Entrypoint[Container Entrypoint] + end +``` + +Your current approach is optimized for container-like workloads: + +- **Direct kernel boot** with custom initrd (no bootloader) +- **Shell script as PID 1** that runs entrypoint + guest-agent +- **OCI images as rootfs** via overlay filesystem + +## Proposed Architecture: Two VM Modes + +```mermaid +flowchart TB + subgraph modes [Instance Mode Selection] + Create[hypeman run] --> ModeCheck{--mode?} + ModeCheck -->|exec default| ExecMode[Exec Mode] + ModeCheck -->|full| FullMode[Full VM Mode] + end + + subgraph exec [Exec Mode - Current] + ExecMode --> DirectBoot[Direct Kernel Boot] + DirectBoot --> CustomInitrd[Custom Initrd] + CustomInitrd --> ShellPID1[Shell PID 1] + ShellPID1 --> AgentBG[guest-agent background] + ShellPID1 --> AppFG[Entrypoint foreground] + end + + subgraph full [Full VM Mode - New] + FullMode --> BootType{Boot Source?} + BootType -->|cloud image| CloudImg[Cloud Image] + BootType -->|systemd OCI| SystemdOCI[Systemd OCI Image] + CloudImg --> UEFI[UEFI Boot] + SystemdOCI --> DirectBoot2[Direct Kernel Boot] + DirectBoot2 --> PivotRoot[pivot_root to rootfs] + PivotRoot --> Systemd[systemd PID 1] + UEFI --> Systemd + Systemd --> AgentService[guest-agent.service] + end +``` + + + +## How EC2 Does It + +1. 
**AMI = Bootable EBS Volume**: Contains bootloader (GRUB), kernel, initramfs, full OS with systemd +2. **IMDS (Instance Metadata Service)**: HTTP endpoint at `169.254.169.254` provides instance config, credentials, user-data +3. **cloud-init**: Runs on boot to configure networking, SSH keys, run user-data scripts +4. **SSM Agent / SSH**: Enables remote access post-boot + +## Recommended Approach + +### Option A: UEFI Boot with Cloud Images (Recommended for ISOs/Cloud Images) + +For booting Ubuntu/Debian cloud images or custom ISOs: + +1. **Use Cloud Hypervisor's UEFI firmware** (`hypervisor-fw`) +2. **Boot from disk image** (qcow2/raw) containing full OS with systemd +3. **cloud-init for configuration**: Create a cloud-init NoCloud datasource disk with: + +- Network configuration +- SSH keys +- Guest-agent installation/startup + +4. **Guest-agent as systemd service**: Install and enable via cloud-init +```bash +# Example: Convert cloud image for Cloud Hypervisor +wget https://cloud-images.ubuntu.com/noble/current/noble-server-cloudimg-amd64.img +qemu-img convert -p -f qcow2 -O raw noble-server-cloudimg-amd64.img noble.raw + +# Boot with UEFI +cloud-hypervisor \ + --kernel hypervisor-fw \ + --disk path=noble.raw path=cloud-init.img \ + --cpus boot=4 --memory size=2G \ + --net tap=tap0,mac=... +``` + + + + +### Option B: Direct Kernel Boot with Systemd Handoff (Recommended for Systemd OCI Images) + +For OCI images that contain systemd (e.g., `systemd/systemd` base images): + +1. **Keep current direct kernel boot** (fast, no bootloader) +2. **Modify init script** to detect `FULL_VM_MODE=1` in config +3. **Use `pivot_root` + `exec /sbin/init`** instead of running entrypoint +4. **Pre-install guest-agent** as systemd service in the image or inject via overlay +```1:294:hypeman/lib/system/init_script.go +// Extend init_script.go with full-vm mode: + +# After overlay setup and network config, add: +if [ "${FULL_VM_MODE:-0}" = "1" ]; then + # Full VM mode: hand off to systemd + echo "overlay-init: full-vm mode - switching to systemd" + + # Install guest-agent as systemd service + mkdir -p /overlay/newroot/etc/systemd/system + cat > /overlay/newroot/etc/systemd/system/hypeman-agent.service << 'EOF' +[Unit] +Description=Hypeman Guest Agent +After=network.target + +[Service] +ExecStart=/usr/local/bin/guest-agent +Restart=always +RestartSec=5 + +[Install] +WantedBy=multi-user.target +EOF + + ln -sf /etc/systemd/system/hypeman-agent.service \ + /overlay/newroot/etc/systemd/system/multi-user.target.wants/hypeman-agent.service + + # pivot_root and exec systemd + cd /overlay/newroot + pivot_root . oldroot + exec /sbin/init +fi +``` + + + + +### Option C: Hybrid Metadata Service (EC2-Compatible) + +For maximum EC2 compatibility: + +1. **Implement IMDS-compatible endpoint** in guest-agent at `169.254.169.254` +2. **Serve instance metadata, user-data, credentials** via this endpoint +3. 
**Works with standard cloud-init** out of the box

## Recommended Implementation Path

| Feature | Complexity | EC2-Feel | Notes |
|---------|------------|----------|-------|
| Option B (systemd OCI) | Low | Medium | Extends current init script, minimal changes |
| Option A (UEFI + cloud images) | Medium | High | Requires UEFI firmware, cloud-init disk generation |
| Option C (IMDS) | Medium | Very High | Makes cloud-init "just work" |

### Suggested Phased Approach

**Phase 1: Systemd OCI Support (Option B)**

- Add `--mode full` flag to `hypeman run`
- Add `FULL_VM_MODE` to config disk generation
- Modify init script to pivot_root + exec systemd
- Inject guest-agent.service into overlay

**Phase 2: Cloud Image Support (Option A)**

- Add `--boot-image` flag for raw/qcow2 images
- Download and bundle UEFI firmware (hypervisor-fw)
- Generate cloud-init NoCloud datasource disks
- Package guest-agent as installable .deb/.rpm

**Phase 3: IMDS Compatibility (Option C)**

- Add HTTP server to guest-agent listening on 169.254.169.254:80
- Implement EC2 IMDS v1/v2 compatible endpoints
- Remove need for custom cloud-init config

## Processing ISOs into Bootable Images

For ISOs, you have two paths:

1. **One-time conversion** (recommended): Install the ISO to a disk image using QEMU/virt-install, then use the resulting disk image as a template
   ```bash
   # Create empty disk
   qemu-img create -f raw base.raw 20G

   # Install from ISO (interactive or via preseed/kickstart)
   qemu-system-x86_64 -enable-kvm -m 2G \
     -cdrom ubuntu-22.04.iso \
     -drive file=base.raw,format=raw \
     -boot d

   # After installation, use base.raw as template

   ```
\ No newline at end of file
diff --git a/.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md b/.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md
new file mode 100644
index 0000000..c71db68
--- /dev/null
+++ b/.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md
@@ -0,0 +1,436 @@
+---
+name: Systemd VM with Go Init
+overview: Support systemd-based OCI images via auto-detection or --init-mode flag, using a Go-based init binary with structured logging to hypeman operations log, performing pivot_root for systemd handoff.
+todos: + - id: go-init-scaffold + content: Create lib/system/init/ package with main.go entry point + status: pending + - id: go-init-mount + content: Implement mount.go for proc/sys/dev/overlay mounting + status: pending + - id: go-init-config + content: Implement config.go to parse config disk + status: pending + - id: go-init-network + content: Implement network.go for network configuration + status: pending + - id: go-init-drivers + content: Implement drivers.go for GPU driver loading + status: pending + - id: go-init-volumes + content: Implement volumes.go for volume mounting + status: pending + - id: go-init-exec + content: Implement mode_exec.go with current behavior + status: pending + - id: go-init-systemd + content: Implement mode_systemd.go with pivot_root and service injection + status: pending + - id: go-init-logger + content: Implement logger.go writing to hypeman operations log + status: pending + - id: initrd-build + content: Modify initrd.go to compile and include init binary + status: pending + - id: systemd-detect + content: Create lib/images/systemd.go with IsSystemdImage() + status: pending + - id: config-disk-mode + content: Add INIT_MODE to configdisk.go + status: pending + - id: api-init-mode + content: Add init_mode field to openapi.yaml and instance types + status: pending + - id: cli-init-mode + content: Add --init-mode flag to hypeman run command + status: pending + - id: agent-location + content: Change guest-agent copy location to /opt/hypeman/ + status: pending + - id: dialer-resilience + content: Add error handling for missing agent in client.go + status: pending + - id: test-dockerfile + content: Create integration/testdata/systemd/Dockerfile + status: pending + - id: e2e-tests + content: Create integration/systemd_test.go with build/push/test flow + status: pending +--- + +# Systemd VM Support with Go-based Init + +## Architecture + +```mermaid +flowchart TB + subgraph initrd [Initrd Contents] + GoInit[init Go binary] + BusyBox[busybox utilities] + GuestAgent[guest-agent binary] + end + + subgraph boot [Boot Flow] + Kernel[Kernel Boot] --> GoInit + GoInit --> MountFS[Mount proc/sys/dev] + MountFS --> Overlay[Setup overlay rootfs] + Overlay --> Config[Read config disk] + Config --> Network[Configure network] + Network --> Drivers[Load GPU drivers if needed] + Drivers --> Volumes[Mount volumes] + Volumes --> CopyAgent[Copy agent to /opt/hypeman/] + CopyAgent --> ModeCheck{init_mode?} + ModeCheck -->|exec| ExecMode[Exec Mode] + ModeCheck -->|systemd| SystemdMode[Systemd Mode] + end + + subgraph execpath [Exec Mode] + ExecMode --> StartAgentBG[Start guest-agent in background] + StartAgentBG --> RunEntrypoint[Run entrypoint as child] + RunEntrypoint --> WaitPID[Wait for entrypoint exit] + end + + subgraph systemdpath [Systemd Mode] + SystemdMode --> InjectService[Inject guest-agent.service] + InjectService --> PivotRoot[pivot_root to overlay/newroot] + PivotRoot --> ExecInit["exec /sbin/init (systemd)"] + ExecInit --> SystemdPID1[Systemd manages everything] + end +``` + + + +## Shared vs Mode-Specific Behavior + +| Step | Exec Mode | Systemd Mode | + +|------|-----------|--------------| + +| Mount proc/sys/dev | Shared | Shared | + +| Mount rootfs overlay | Shared | Shared | + +| Read config disk | Shared | Shared | + +| Configure network | Init configures it | Init configures it (before pivot) | + +| Load GPU drivers | Shared | Shared | + +| Mount volumes | Shared | Shared | + +| Copy guest-agent | To `/opt/hypeman/` | To `/opt/hypeman/` | + +| Start 
guest-agent | Background process | Systemd service | + +| PID 1 | Go init binary | Systemd | + +| App lifecycle | Managed by init | Managed by systemd | + +## Go-based Init Binary + +Package structure at `lib/system/init/`: + +```javascript +lib/system/init/ + main.go # Entry point, orchestrates boot + mount.go # Mount operations (proc, sys, dev, overlay) + config.go # Parse config disk + network.go # Network configuration + drivers.go # GPU driver loading + volumes.go # Volume mounting + mode_exec.go # Exec mode: run entrypoint + mode_systemd.go # Systemd mode: pivot_root + exec init + logger.go # Structured logging to hypeman operations log +``` + + + +### Logger Design + +Logs to the hypeman operations log (same format as existing hypeman logs): + +```go +// lib/system/init/logger.go +package main + +type Logger struct { + file *os.File // Writes to /logs/hypeman.log via config disk path +} + +type LogEntry struct { + Time string `json:"time"` + Level string `json:"level"` + Phase string `json:"phase"` + Message string `json:"msg"` + Error string `json:"error,omitempty"` +} + +func (l *Logger) Info(phase, msg string) { + l.log("INFO", phase, msg, "") +} + +func (l *Logger) Error(phase, msg string, err error) { + errStr := "" + if err != nil { + errStr = err.Error() + } + l.log("ERROR", phase, msg, errStr) +} + +// Output: structured JSON to hypeman operations log +// {"time":"2024-12-23T10:15:30Z","level":"INFO","phase":"overlay","msg":"mounted rootfs"} +``` + + + +### Main Orchestration + +```go +// lib/system/init/main.go +package main + +func main() { + log := NewLogger() + log.Info("boot", "init starting") + + if err := mountEssentials(log); err != nil { + log.Error("mount", "failed", err) + dropToShell() + } + + if err := setupOverlay(log); err != nil { + log.Error("overlay", "failed", err) + dropToShell() + } + + cfg, err := readConfig(log) + if err != nil { + log.Error("config", "failed", err) + dropToShell() + } + + if cfg.NetworkEnabled { + if err := configureNetwork(log, cfg); err != nil { + log.Error("network", "failed", err) + } + } + + if cfg.HasGPU { + if err := loadGPUDrivers(log); err != nil { + log.Error("gpu", "failed", err) + } + } + + if err := mountVolumes(log, cfg); err != nil { + log.Error("volumes", "failed", err) + } + + if err := copyGuestAgent(log); err != nil { + log.Error("agent", "failed to copy", err) + } + + if cfg.InitMode == "systemd" { + log.Info("mode", "entering systemd mode") + runSystemdMode(log, cfg) + } else { + log.Info("mode", "entering exec mode") + runExecMode(log, cfg) + } +} +``` + + + +### Systemd Mode + +```go +// lib/system/init/mode_systemd.go +package main + +import ( + "os" + "syscall" +) + +func runSystemdMode(log *Logger, cfg *Config) { + const newroot = "/overlay/newroot" + + log.Info("systemd", "injecting hypeman-agent.service") + if err := injectAgentService(newroot); err != nil { + log.Error("systemd", "failed to inject service", err) + } + + oldroot := newroot + "/oldroot" + os.MkdirAll(oldroot, 0755) + + if err := os.Chdir(newroot); err != nil { + log.Error("systemd", "chdir failed", err) + dropToShell() + } + + log.Info("systemd", "executing pivot_root") + if err := syscall.PivotRoot(".", "oldroot"); err != nil { + log.Error("systemd", "pivot_root failed", err) + dropToShell() + } + + os.Chdir("/") + + log.Info("systemd", "exec /sbin/init") + syscall.Exec("/sbin/init", []string{"/sbin/init"}, os.Environ()) + + log.Error("systemd", "exec failed", nil) + dropToShell() +} + +func injectAgentService(newroot string) error { + 
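	// Write a minimal unit file for the agent, then enable it by symlinking it
	// into multi-user.target.wants (the same effect as `systemctl enable`).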
serviceContent := `[Unit] +Description=Hypeman Guest Agent +After=network.target + +[Service] +Type=simple +ExecStart=/opt/hypeman/guest-agent +Restart=always +RestartSec=3 + +[Install] +WantedBy=multi-user.target +` + serviceDir := newroot + "/etc/systemd/system" + wantsDir := serviceDir + "/multi-user.target.wants" + os.MkdirAll(wantsDir, 0755) + + servicePath := serviceDir + "/hypeman-agent.service" + if err := os.WriteFile(servicePath, []byte(serviceContent), 0644); err != nil { + return err + } + return os.Symlink(servicePath, wantsDir+"/hypeman-agent.service") +} +``` + + + +## Detection Logic + +```go +// lib/images/systemd.go +package images + +import "strings" + +func IsSystemdImage(entrypoint, cmd []string) bool { + effective := append(entrypoint, cmd...) + if len(effective) == 0 { + return false + } + + first := effective[0] + systemdPaths := []string{ + "/sbin/init", + "/lib/systemd/systemd", + "/usr/lib/systemd/systemd", + } + for _, p := range systemdPaths { + if first == p { + return true + } + } + if strings.HasSuffix(first, "/init") { + return true + } + return false +} +``` + + + +## CLI and API + +```bash +# Auto-detect from image CMD (default) +hypeman run my-systemd-image + +# Force systemd mode +hypeman run --init-mode=systemd my-image + +# Force exec mode on a systemd image +hypeman run --init-mode=exec my-systemd-image +``` + + + +## E2E Test + +Custom Dockerfile in repository at `integration/testdata/systemd/Dockerfile`: + +```dockerfile +FROM ubuntu:22.04 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y \ + systemd \ + systemd-sysv \ + dbus \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Remove unnecessary systemd units +RUN rm -f /lib/systemd/system/multi-user.target.wants/* \ + /etc/systemd/system/*.wants/* \ + /lib/systemd/system/local-fs.target.wants/* \ + /lib/systemd/system/sockets.target.wants/*udev* \ + /lib/systemd/system/sockets.target.wants/*initctl* + +VOLUME ["/sys/fs/cgroup"] +CMD ["/lib/systemd/systemd"] +``` + +Test flow: + +1. Build image with `docker build` +2. Push to hypeman via OCI import +3. Run instance (should auto-detect systemd mode) +4. Verify systemd is PID 1 +5. 
Verify guest-agent.service is active +```go +// integration/systemd_test.go + +func TestSystemdMode(t *testing.T) { + // Build and push test image + buildAndPushTestImage(t, "integration/testdata/systemd", "test-systemd:latest") + + // Create instance (auto-detects systemd mode) + inst := createInstance(t, "test-systemd:latest", "") + defer deleteInstance(t, inst.Id) + + time.Sleep(10 * time.Second) + + // Verify systemd is PID 1 + result := execInVM(t, inst, "cat", "/proc/1/comm") + assert.Equal(t, "systemd", strings.TrimSpace(result.Stdout)) + + // Verify agent service is running + result = execInVM(t, inst, "systemctl", "is-active", "hypeman-agent") + assert.Equal(t, "active", strings.TrimSpace(result.Stdout)) + + // Verify agent location + result = execInVM(t, inst, "test", "-x", "/opt/hypeman/guest-agent") + assert.Equal(t, 0, result.ExitCode) +} + +func TestInitModeOverride(t *testing.T) { + buildAndPushTestImage(t, "integration/testdata/systemd", "test-systemd:latest") + + // Force exec mode + inst := createInstance(t, "test-systemd:latest", "exec") + defer deleteInstance(t, inst.Id) + + time.Sleep(5 * time.Second) + + // PID 1 should be our init, not systemd + result := execInVM(t, inst, "cat", "/proc/1/comm") + assert.NotEqual(t, "systemd", strings.TrimSpace(result.Stdout)) +} +``` \ No newline at end of file From bb45dcb843162e92822fa545ee9fc2467c704a07 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Wed, 24 Dec 2025 03:18:19 +0000 Subject: [PATCH 02/13] feat: add systemd mode for EC2-like VMs Replace shell-based init script with Go binary that supports two modes: ## Exec Mode (existing behavior) - Go init runs as PID 1 - Starts guest-agent in background - Runs container entrypoint as child process - Used for standard Docker images (nginx, python, etc.) ## Systemd Mode (new) - Auto-detected when image CMD is /sbin/init or /lib/systemd/systemd - Go init sets up rootfs, then chroots and execs systemd - Systemd becomes PID 1 and manages the full system - guest-agent runs as a systemd service (hypeman-agent.service) - Enables EC2-like experience: ssh, systemctl, journalctl all work ## Key changes: - lib/system/init/: New Go-based init binary with modular boot phases - lib/images/systemd.go: IsSystemdImage() auto-detection from CMD - lib/instances/configdisk.go: Passes INIT_MODE to guest - lib/system/init/init.sh: Shell wrapper to mount /proc /sys /dev before Go runtime (Go requires these during initialization) - integration/systemd_test.go: Full E2E test verifying: - systemd is PID 1 - hypeman-agent.service is active - journalctl works for viewing logs ## Boot flow: 1. Kernel loads initrd with busybox + Go init + guest-agent 2. init.sh mounts /proc, /sys, /dev (Go runtime needs these) 3. init.sh execs Go init binary 4. Go init mounts overlay rootfs, configures network, copies agent 5. 
Based on INIT_MODE: exec mode (run entrypoint) or systemd mode (chroot + exec /sbin/init) --- ...ec2-style_full_vm_support_caf1dc20.plan.md | 219 ------------- .../systemd_vm_with_go_init_c0a9c010.plan.md | 230 ++++++-------- .gitignore | 2 +- Makefile | 21 +- integration/systemd_test.go | 223 +++++++++++++ lib/guest/client.go | 32 +- lib/images/systemd.go | 43 +++ lib/images/systemd_test.go | 91 ++++++ lib/instances/configdisk.go | 12 +- lib/system/README.md | 65 ++-- lib/system/guest_agent_binary.go | 1 - lib/system/init/config.go | 194 ++++++++++++ lib/system/init/drivers.go | 190 +++++++++++ lib/system/init/init.sh | 17 + lib/system/init/logger.go | 71 +++++ lib/system/init/main.go | 95 ++++++ lib/system/init/mode_exec.go | 100 ++++++ lib/system/init/mode_systemd.go | 90 ++++++ lib/system/init/mount.go | 210 +++++++++++++ lib/system/init/network.go | 58 ++++ lib/system/init/volumes.go | 109 +++++++ lib/system/init_binary.go | 11 + lib/system/init_script.go | 294 ------------------ lib/system/init_wrapper.go | 6 + lib/system/initrd.go | 35 ++- lib/system/manager_test.go | 17 +- 26 files changed, 1739 insertions(+), 697 deletions(-) delete mode 100644 .cursor/plans/ec2-style_full_vm_support_caf1dc20.plan.md create mode 100644 integration/systemd_test.go create mode 100644 lib/images/systemd.go create mode 100644 lib/images/systemd_test.go create mode 100644 lib/system/init/config.go create mode 100644 lib/system/init/drivers.go create mode 100644 lib/system/init/init.sh create mode 100644 lib/system/init/logger.go create mode 100644 lib/system/init/main.go create mode 100644 lib/system/init/mode_exec.go create mode 100644 lib/system/init/mode_systemd.go create mode 100644 lib/system/init/mount.go create mode 100644 lib/system/init/network.go create mode 100644 lib/system/init/volumes.go create mode 100644 lib/system/init_binary.go delete mode 100644 lib/system/init_script.go create mode 100644 lib/system/init_wrapper.go diff --git a/.cursor/plans/ec2-style_full_vm_support_caf1dc20.plan.md b/.cursor/plans/ec2-style_full_vm_support_caf1dc20.plan.md deleted file mode 100644 index 959df3a..0000000 --- a/.cursor/plans/ec2-style_full_vm_support_caf1dc20.plan.md +++ /dev/null @@ -1,219 +0,0 @@ ---- -name: EC2-Style Full VM Support -overview: Add a new "full VM" mode alongside the existing container-exec mode, allowing users to boot ISOs/cloud images or systemd-based OCI images with proper PID 1 (systemd), while still running the hypeman guest-agent for exec/shell capabilities. 
-todos: - - id: phase1-mode-flag - content: Add --mode full flag to CLI and mode field to API - status: pending - - id: phase1-config-disk - content: Add FULL_VM_MODE to config disk generation - status: pending - - id: phase1-init-script - content: Modify init script with pivot_root path for systemd handoff - status: pending - - id: phase1-agent-service - content: Create systemd service unit for guest-agent injection - status: pending - - id: phase2-uefi-firmware - content: Download and bundle hypervisor-fw for UEFI boot - status: pending - - id: phase2-cloud-image - content: Add cloud image download and conversion support - status: pending - - id: phase2-cloudinit-disk - content: Implement cloud-init NoCloud datasource disk generation - status: pending - - id: phase3-imds - content: Implement EC2-compatible IMDS in guest-agent - status: pending ---- - -# EC2-Style Full VM Support for Hypeman - -## Current Architecture - -```mermaid -flowchart LR - subgraph boot [Current Boot Chain] - CH[Cloud Hypervisor] --> Kernel[Direct Kernel Boot] - Kernel --> Initrd[Custom Initrd] - Initrd --> InitScript[Shell Init PID 1] - InitScript --> GuestAgent[guest-agent] - InitScript --> Entrypoint[Container Entrypoint] - end -``` - -Your current approach is optimized for container-like workloads: - -- **Direct kernel boot** with custom initrd (no bootloader) -- **Shell script as PID 1** that runs entrypoint + guest-agent -- **OCI images as rootfs** via overlay filesystem - -## Proposed Architecture: Two VM Modes - -```mermaid -flowchart TB - subgraph modes [Instance Mode Selection] - Create[hypeman run] --> ModeCheck{--mode?} - ModeCheck -->|exec default| ExecMode[Exec Mode] - ModeCheck -->|full| FullMode[Full VM Mode] - end - - subgraph exec [Exec Mode - Current] - ExecMode --> DirectBoot[Direct Kernel Boot] - DirectBoot --> CustomInitrd[Custom Initrd] - CustomInitrd --> ShellPID1[Shell PID 1] - ShellPID1 --> AgentBG[guest-agent background] - ShellPID1 --> AppFG[Entrypoint foreground] - end - - subgraph full [Full VM Mode - New] - FullMode --> BootType{Boot Source?} - BootType -->|cloud image| CloudImg[Cloud Image] - BootType -->|systemd OCI| SystemdOCI[Systemd OCI Image] - CloudImg --> UEFI[UEFI Boot] - SystemdOCI --> DirectBoot2[Direct Kernel Boot] - DirectBoot2 --> PivotRoot[pivot_root to rootfs] - PivotRoot --> Systemd[systemd PID 1] - UEFI --> Systemd - Systemd --> AgentService[guest-agent.service] - end -``` - - - -## How EC2 Does It - -1. **AMI = Bootable EBS Volume**: Contains bootloader (GRUB), kernel, initramfs, full OS with systemd -2. **IMDS (Instance Metadata Service)**: HTTP endpoint at `169.254.169.254` provides instance config, credentials, user-data -3. **cloud-init**: Runs on boot to configure networking, SSH keys, run user-data scripts -4. **SSM Agent / SSH**: Enables remote access post-boot - -## Recommended Approach - -### Option A: UEFI Boot with Cloud Images (Recommended for ISOs/Cloud Images) - -For booting Ubuntu/Debian cloud images or custom ISOs: - -1. **Use Cloud Hypervisor's UEFI firmware** (`hypervisor-fw`) -2. **Boot from disk image** (qcow2/raw) containing full OS with systemd -3. **cloud-init for configuration**: Create a cloud-init NoCloud datasource disk with: - -- Network configuration -- SSH keys -- Guest-agent installation/startup - -4. 
**Guest-agent as systemd service**: Install and enable via cloud-init -```bash -# Example: Convert cloud image for Cloud Hypervisor -wget https://cloud-images.ubuntu.com/noble/current/noble-server-cloudimg-amd64.img -qemu-img convert -p -f qcow2 -O raw noble-server-cloudimg-amd64.img noble.raw - -# Boot with UEFI -cloud-hypervisor \ - --kernel hypervisor-fw \ - --disk path=noble.raw path=cloud-init.img \ - --cpus boot=4 --memory size=2G \ - --net tap=tap0,mac=... -``` - - - - -### Option B: Direct Kernel Boot with Systemd Handoff (Recommended for Systemd OCI Images) - -For OCI images that contain systemd (e.g., `systemd/systemd` base images): - -1. **Keep current direct kernel boot** (fast, no bootloader) -2. **Modify init script** to detect `FULL_VM_MODE=1` in config -3. **Use `pivot_root` + `exec /sbin/init`** instead of running entrypoint -4. **Pre-install guest-agent** as systemd service in the image or inject via overlay -```1:294:hypeman/lib/system/init_script.go -// Extend init_script.go with full-vm mode: - -# After overlay setup and network config, add: -if [ "${FULL_VM_MODE:-0}" = "1" ]; then - # Full VM mode: hand off to systemd - echo "overlay-init: full-vm mode - switching to systemd" - - # Install guest-agent as systemd service - mkdir -p /overlay/newroot/etc/systemd/system - cat > /overlay/newroot/etc/systemd/system/hypeman-agent.service << 'EOF' -[Unit] -Description=Hypeman Guest Agent -After=network.target - -[Service] -ExecStart=/usr/local/bin/guest-agent -Restart=always -RestartSec=5 - -[Install] -WantedBy=multi-user.target -EOF - - ln -sf /etc/systemd/system/hypeman-agent.service \ - /overlay/newroot/etc/systemd/system/multi-user.target.wants/hypeman-agent.service - - # pivot_root and exec systemd - cd /overlay/newroot - pivot_root . oldroot - exec /sbin/init -fi -``` - - - - -### Option C: Hybrid Metadata Service (EC2-Compatible) - -For maximum EC2 compatibility: - -1. **Implement IMDS-compatible endpoint** in guest-agent at `169.254.169.254` -2. **Serve instance metadata, user-data, credentials** via this endpoint -3. **Works with standard cloud-init** out of the box - -## Recommended Implementation Path - -| Feature | Complexity | EC2-Feel | Notes ||---------|------------|----------|-------|| Option B (systemd OCI) | Low | Medium | Extends current init script, minimal changes || Option A (UEFI + cloud images) | Medium | High | Requires UEFI firmware, cloud-init disk generation || Option C (IMDS) | Medium | Very High | Makes cloud-init "just work" | - -### Suggested Phased Approach - -**Phase 1: Systemd OCI Support (Option B)** - -- Add `--mode full` flag to `hypeman run` -- Add `FULL_VM_MODE` to config disk generation -- Modify init script to pivot_root + exec systemd -- Inject guest-agent.service into overlay - -**Phase 2: Cloud Image Support (Option A)** - -- Add `--boot-image` flag for raw/qcow2 images -- Download and bundle UEFI firmware (hypervisor-fw) -- Generate cloud-init NoCloud datasource disks -- Package guest-agent as installable .deb/.rpm - -**Phase 3: IMDS Compatibility (Option C)** - -- Add HTTP server to guest-agent listening on 169.254.169.254:80 -- Implement EC2 IMDS v1/v2 compatible endpoints -- Remove need for custom cloud-init config - -## Processing ISOs into Bootable Images - -For ISOs, you have two paths: - -1. 
**One-time conversion** (recommended): Install the ISO to a disk image using QEMU/virt-install, then use the resulting disk image as a template - ```bash - # Create empty disk - qemu-img create -f raw base.raw 20G - - # Install from ISO (interactive or via preseed/kickstart) - qemu-system-x86_64 -enable-kvm -m 2G \ - -cdrom ubuntu-22.04.iso \ - -drive file=base.raw,format=raw \ - -boot d - - # After installation, use base.raw as template - - ``` \ No newline at end of file diff --git a/.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md b/.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md index c71db68..f018b2d 100644 --- a/.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md +++ b/.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md @@ -1,61 +1,55 @@ --- name: Systemd VM with Go Init -overview: Support systemd-based OCI images via auto-detection or --init-mode flag, using a Go-based init binary with structured logging to hypeman operations log, performing pivot_root for systemd handoff. +overview: Support systemd-based OCI images via auto-detection from image CMD, using a Go-based init binary with human-readable logging, performing chroot for systemd handoff. todos: - id: go-init-scaffold content: Create lib/system/init/ package with main.go entry point - status: pending + status: completed - id: go-init-mount content: Implement mount.go for proc/sys/dev/overlay mounting - status: pending + status: completed - id: go-init-config content: Implement config.go to parse config disk - status: pending + status: completed - id: go-init-network content: Implement network.go for network configuration - status: pending + status: completed - id: go-init-drivers content: Implement drivers.go for GPU driver loading - status: pending + status: completed - id: go-init-volumes content: Implement volumes.go for volume mounting - status: pending + status: completed - id: go-init-exec content: Implement mode_exec.go with current behavior - status: pending + status: completed - id: go-init-systemd - content: Implement mode_systemd.go with pivot_root and service injection - status: pending + content: Implement mode_systemd.go with chroot and service injection + status: completed - id: go-init-logger - content: Implement logger.go writing to hypeman operations log - status: pending + content: Implement logger.go with human-readable format + status: completed - id: initrd-build content: Modify initrd.go to compile and include init binary - status: pending + status: completed - id: systemd-detect - content: Create lib/images/systemd.go with IsSystemdImage() - status: pending + content: Create lib/images/systemd.go with IsSystemdImage() based on CMD + status: completed - id: config-disk-mode - content: Add INIT_MODE to configdisk.go - status: pending - - id: api-init-mode - content: Add init_mode field to openapi.yaml and instance types - status: pending - - id: cli-init-mode - content: Add --init-mode flag to hypeman run command - status: pending + content: Add INIT_MODE to configdisk.go based on CMD detection + status: completed - id: agent-location content: Change guest-agent copy location to /opt/hypeman/ - status: pending + status: completed - id: dialer-resilience content: Add error handling for missing agent in client.go - status: pending + status: completed - id: test-dockerfile content: Create integration/testdata/systemd/Dockerfile - status: pending + status: completed - id: e2e-tests content: Create integration/systemd_test.go with build/push/test flow - status: pending + status: completed 
--- # Systemd VM Support with Go-based Init @@ -79,7 +73,7 @@ flowchart TB Network --> Drivers[Load GPU drivers if needed] Drivers --> Volumes[Mount volumes] Volumes --> CopyAgent[Copy agent to /opt/hypeman/] - CopyAgent --> ModeCheck{init_mode?} + CopyAgent --> ModeCheck{init_mode from CMD?} ModeCheck -->|exec| ExecMode[Exec Mode] ModeCheck -->|systemd| SystemdMode[Systemd Mode] end @@ -92,39 +86,62 @@ flowchart TB subgraph systemdpath [Systemd Mode] SystemdMode --> InjectService[Inject guest-agent.service] - InjectService --> PivotRoot[pivot_root to overlay/newroot] - PivotRoot --> ExecInit["exec /sbin/init (systemd)"] + InjectService --> Chroot[chroot to overlay/newroot] + Chroot --> ExecInit["exec /sbin/init (systemd)"] ExecInit --> SystemdPID1[Systemd manages everything] end ``` - - ## Shared vs Mode-Specific Behavior -| Step | Exec Mode | Systemd Mode | +| Step | Exec Mode | Systemd Mode ||------|-----------|--------------|| Mount proc/sys/dev | Shared | Shared || Mount rootfs overlay | Shared | Shared || Read config disk | Shared | Shared || Configure network | Init configures it | Init configures it (before pivot) || Load GPU drivers | Shared | Shared || Mount volumes | Shared | Shared || Copy guest-agent | To `/opt/hypeman/` | To `/opt/hypeman/` || Start guest-agent | Background process | Systemd service || PID 1 | Go init binary | Systemd || App lifecycle | Managed by init | Managed by systemd | -|------|-----------|--------------| +## Logging Behavior -| Mount proc/sys/dev | Shared | Shared | +### `hypeman logs` Output by Mode -| Mount rootfs overlay | Shared | Shared | +| Log Source | Exec Mode | Systemd Mode ||------------|-----------|--------------|| `--source app` (default) | Entrypoint stdout/stderr | Systemd boot messages + console output || `--source hypeman` | Init phases + operations | Init phases + operations (until pivot_root) || `--source vmm` | Cloud Hypervisor logs | Cloud Hypervisor logs |In systemd mode, after pivot_root: -| Read config disk | Shared | Shared | +- Serial console (app.log) shows systemd boot progress and any services writing to console +- To view individual service logs, use: `hypeman exec journalctl -u ` +- To view guest-agent logs: `hypeman exec journalctl -u hypeman-agent` -| Configure network | Init configures it | Init configures it (before pivot) | +### Init Logger Format -| Load GPU drivers | Shared | Shared | +Human-readable format for `hypeman logs --source hypeman`: -| Mount volumes | Shared | Shared | +```go +// lib/system/init/logger.go +package main -| Copy guest-agent | To `/opt/hypeman/` | To `/opt/hypeman/` | +type Logger struct { + file *os.File +} -| Start guest-agent | Background process | Systemd service | +func (l *Logger) Info(phase, msg string) { + // Format: 2024-12-23T10:15:30Z [INFO] [overlay] mounted rootfs from /dev/vda + fmt.Fprintf(l.file, "%s [INFO] [%s] %s\n", + time.Now().UTC().Format(time.RFC3339), phase, msg) +} -| PID 1 | Go init binary | Systemd | +func (l *Logger) Error(phase, msg string, err error) { + if err != nil { + fmt.Fprintf(l.file, "%s [ERROR] [%s] %s: %v\n", + time.Now().UTC().Format(time.RFC3339), phase, msg, err) + } else { + fmt.Fprintf(l.file, "%s [ERROR] [%s] %s\n", + time.Now().UTC().Format(time.RFC3339), phase, msg) + } +} -| App lifecycle | Managed by init | Managed by systemd | +// Example output: +// 2024-12-23T10:15:30Z [INFO] [boot] init starting +// 2024-12-23T10:15:30Z [INFO] [mount] mounted proc/sys/dev +// 2024-12-23T10:15:31Z [INFO] [overlay] mounted rootfs from /dev/vda +// 
2024-12-23T10:15:31Z [INFO] [network] configured eth0 with 10.0.0.2/24 +// 2024-12-23T10:15:32Z [INFO] [systemd] performing pivot_root +// 2024-12-23T10:15:32Z [INFO] [systemd] exec /sbin/init +``` ## Go-based Init Binary @@ -140,49 +157,9 @@ lib/system/init/ volumes.go # Volume mounting mode_exec.go # Exec mode: run entrypoint mode_systemd.go # Systemd mode: pivot_root + exec init - logger.go # Structured logging to hypeman operations log + logger.go # Human-readable logging to hypeman operations log ``` - - -### Logger Design - -Logs to the hypeman operations log (same format as existing hypeman logs): - -```go -// lib/system/init/logger.go -package main - -type Logger struct { - file *os.File // Writes to /logs/hypeman.log via config disk path -} - -type LogEntry struct { - Time string `json:"time"` - Level string `json:"level"` - Phase string `json:"phase"` - Message string `json:"msg"` - Error string `json:"error,omitempty"` -} - -func (l *Logger) Info(phase, msg string) { - l.log("INFO", phase, msg, "") -} - -func (l *Logger) Error(phase, msg string, err error) { - errStr := "" - if err != nil { - errStr = err.Error() - } - l.log("ERROR", phase, msg, errStr) -} - -// Output: structured JSON to hypeman operations log -// {"time":"2024-12-23T10:15:30Z","level":"INFO","phase":"overlay","msg":"mounted rootfs"} -``` - - - ### Main Orchestration ```go @@ -239,8 +216,6 @@ func main() { } ``` - - ### Systemd Mode ```go @@ -260,17 +235,10 @@ func runSystemdMode(log *Logger, cfg *Config) { log.Error("systemd", "failed to inject service", err) } - oldroot := newroot + "/oldroot" - os.MkdirAll(oldroot, 0755) - - if err := os.Chdir(newroot); err != nil { - log.Error("systemd", "chdir failed", err) - dropToShell() - } - - log.Info("systemd", "executing pivot_root") - if err := syscall.PivotRoot(".", "oldroot"); err != nil { - log.Error("systemd", "pivot_root failed", err) + // Use chroot instead of pivot_root (more reliable in VM environment) + log.Info("systemd", "executing chroot") + if err := syscall.Chroot(newroot); err != nil { + log.Error("systemd", "chroot failed", err) dropToShell() } @@ -309,23 +277,29 @@ WantedBy=multi-user.target } ``` - - ## Detection Logic +Auto-detect systemd mode by inspecting the image's CMD. No override flag - if CMD is a systemd init, always use systemd mode. + ```go // lib/images/systemd.go package images import "strings" +// IsSystemdImage checks if the image's CMD indicates it wants systemd as init. +// Detection is based on the effective command (entrypoint + cmd), not whether +// systemd is installed in the image. func IsSystemdImage(entrypoint, cmd []string) bool { + // Combine to get the actual command that will run effective := append(entrypoint, cmd...) if len(effective) == 0 { return false } first := effective[0] + + // Match specific systemd/init paths systemdPaths := []string{ "/sbin/init", "/lib/systemd/systemd", @@ -336,30 +310,16 @@ func IsSystemdImage(entrypoint, cmd []string) bool { return true } } + + // Match any path ending in /init (e.g., /usr/sbin/init) if strings.HasSuffix(first, "/init") { return true } + return false } ``` - - -## CLI and API - -```bash -# Auto-detect from image CMD (default) -hypeman run my-systemd-image - -# Force systemd mode -hypeman run --init-mode=systemd my-image - -# Force exec mode on a systemd image -hypeman run --init-mode=exec my-systemd-image -``` - - - ## E2E Test Custom Dockerfile in repository at `integration/testdata/systemd/Dockerfile`: @@ -391,9 +351,10 @@ Test flow: 1. 
Build image with `docker build` 2. Push to hypeman via OCI import -3. Run instance (should auto-detect systemd mode) +3. Run instance (auto-detects systemd mode from CMD) 4. Verify systemd is PID 1 5. Verify guest-agent.service is active +6. Verify `hypeman logs` shows systemd boot messages ```go // integration/systemd_test.go @@ -401,8 +362,8 @@ func TestSystemdMode(t *testing.T) { // Build and push test image buildAndPushTestImage(t, "integration/testdata/systemd", "test-systemd:latest") - // Create instance (auto-detects systemd mode) - inst := createInstance(t, "test-systemd:latest", "") + // Create instance (auto-detects systemd mode from CMD) + inst := createInstance(t, "test-systemd:latest") defer deleteInstance(t, inst.Id) time.Sleep(10 * time.Second) @@ -418,19 +379,28 @@ func TestSystemdMode(t *testing.T) { // Verify agent location result = execInVM(t, inst, "test", "-x", "/opt/hypeman/guest-agent") assert.Equal(t, 0, result.ExitCode) + + // Verify can view agent logs via journalctl + result = execInVM(t, inst, "journalctl", "-u", "hypeman-agent", "--no-pager") + assert.Equal(t, 0, result.ExitCode) } -func TestInitModeOverride(t *testing.T) { - buildAndPushTestImage(t, "integration/testdata/systemd", "test-systemd:latest") - - // Force exec mode - inst := createInstance(t, "test-systemd:latest", "exec") +func TestExecModeUnchanged(t *testing.T) { + // Regular container image should still work as before + inst := createInstance(t, "nginx:alpine") defer deleteInstance(t, inst.Id) - time.Sleep(5 * time.Second) + time.Sleep(3 * time.Second) - // PID 1 should be our init, not systemd - result := execInVM(t, inst, "cat", "/proc/1/comm") - assert.NotEqual(t, "systemd", strings.TrimSpace(result.Stdout)) + // Nginx should be running + result := execInVM(t, inst, "pgrep", "nginx") + assert.Equal(t, 0, result.ExitCode) + + // PID 1 is init binary (not systemd) + result = execInVM(t, inst, "cat", "/proc/1/comm") + assert.Equal(t, "init", strings.TrimSpace(result.Stdout)) } -``` \ No newline at end of file +``` + + +## Files to Modify/Create \ No newline at end of file diff --git a/.gitignore b/.gitignore index e76a3b0..b2b815d 100644 --- a/.gitignore +++ b/.gitignore @@ -21,7 +21,7 @@ cloud-hypervisor cloud-hypervisor/** lib/system/exec_agent/exec-agent lib/system/guest_agent/guest-agent -lib/system/guest_agent/guest_agent +lib/system/init/init # Envoy binaries lib/ingress/binaries/** diff --git a/Makefile b/Makefile index 4925096..74c53fa 100644 --- a/Makefile +++ b/Makefile @@ -165,26 +165,36 @@ ensure-caddy-binaries: fi # Build guest-agent (guest binary) into its own directory for embedding -lib/system/guest_agent/guest-agent: lib/system/guest_agent/main.go +lib/system/guest_agent/guest-agent: lib/system/guest_agent/*.go @echo "Building guest-agent..." cd lib/system/guest_agent && CGO_ENABLED=0 go build -ldflags="-s -w" -o guest-agent . +# Build init binary (runs as PID 1 in guest VM) for embedding +# Uses static linking for portability across different guest environments +lib/system/init/init: lib/system/init/*.go + @echo "Building init binary..." + cd lib/system/init && CGO_ENABLED=0 go build -ldflags="-s -w" -o init . 
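+# The initrd content hash includes this binary, so changes under lib/system/init/ trigger an initrd rebuild on next startup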
+ +# Build all embedded binaries +.PHONY: build-embedded +build-embedded: lib/system/guest_agent/guest-agent lib/system/init/init + # Build the binary -build: ensure-ch-binaries ensure-caddy-binaries lib/system/guest_agent/guest-agent | $(BIN_DIR) +build: ensure-ch-binaries ensure-caddy-binaries build-embedded | $(BIN_DIR) go build -tags containers_image_openpgp -o $(BIN_DIR)/hypeman ./cmd/api # Build all binaries build-all: build # Run in development mode with hot reload -dev: ensure-ch-binaries ensure-caddy-binaries lib/system/guest_agent/guest-agent $(AIR) +dev: ensure-ch-binaries ensure-caddy-binaries build-embedded $(AIR) @rm -f ./tmp/main $(AIR) -c .air.toml # Run tests (as root for network capabilities, enables caching and parallelism) # Usage: make test - runs all tests # make test TEST=TestCreateInstanceWithNetwork - runs specific test -test: ensure-ch-binaries ensure-caddy-binaries lib/system/guest_agent/guest-agent +test: ensure-ch-binaries ensure-caddy-binaries build-embedded @if [ -n "$(TEST)" ]; then \ echo "Running specific test: $(TEST)"; \ sudo env "PATH=$$PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" go test -tags containers_image_openpgp -run=$(TEST) -v -timeout=180s ./...; \ @@ -203,8 +213,9 @@ clean: rm -rf lib/vmm/binaries/cloud-hypervisor/ rm -rf lib/ingress/binaries/ rm -f lib/system/guest_agent/guest-agent + rm -f lib/system/init/init # Prepare for release build (called by GoReleaser) # Downloads all embedded binaries and builds embedded components -release-prep: download-ch-binaries build-caddy-binaries lib/system/guest_agent/guest-agent +release-prep: download-ch-binaries build-caddy-binaries build-embedded go mod tidy diff --git a/integration/systemd_test.go b/integration/systemd_test.go new file mode 100644 index 0000000..760620c --- /dev/null +++ b/integration/systemd_test.go @@ -0,0 +1,223 @@ +package integration + +import ( + "bytes" + "context" + "os" + "strings" + "testing" + "time" + + "github.com/onkernel/hypeman/cmd/api/config" + "github.com/onkernel/hypeman/lib/devices" + "github.com/onkernel/hypeman/lib/guest" + "github.com/onkernel/hypeman/lib/hypervisor" + "github.com/onkernel/hypeman/lib/images" + "github.com/onkernel/hypeman/lib/instances" + "github.com/onkernel/hypeman/lib/network" + "github.com/onkernel/hypeman/lib/paths" + "github.com/onkernel/hypeman/lib/system" + "github.com/onkernel/hypeman/lib/volumes" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestSystemdMode verifies that hypeman correctly detects and runs +// systemd-based images with systemd as PID 1. +// +// This test uses the jrei/systemd-ubuntu image from Docker Hub which runs +// systemd as its CMD. 
The test verifies that hypeman auto-detects this and: +// - Uses systemd mode (chroot to container rootfs) +// - Starts systemd as PID 1 +// - Injects and starts the hypeman-agent.service +func TestSystemdMode(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + // Skip if KVM is not available + if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) { + t.Skip("/dev/kvm not available") + } + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + // Set up test environment + tmpDir := t.TempDir() + p := paths.New(tmpDir) + + cfg := &config.Config{ + DataDir: tmpDir, + BridgeName: "vmbr0", + SubnetCIDR: "10.100.0.0/16", + DNSServer: "1.1.1.1", + } + + // Create managers + imageManager, err := images.NewManager(p, 1, nil) + require.NoError(t, err) + + systemManager := system.NewManager(p) + networkManager := network.NewManager(p, cfg, nil) + deviceManager := devices.NewManager(p) + volumeManager := volumes.NewManager(p, 0, nil) + + limits := instances.ResourceLimits{ + MaxOverlaySize: 100 * 1024 * 1024 * 1024, + MaxVcpusPerInstance: 0, + MaxMemoryPerInstance: 0, + MaxTotalVcpus: 0, + MaxTotalMemory: 0, + } + + instanceManager := instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil) + + // Cleanup any orphaned instances + t.Cleanup(func() { + instanceManager.DeleteInstance(ctx, "systemd-test") + }) + + imageName := "docker.io/jrei/systemd-ubuntu:22.04" + + // Pull the systemd image + t.Log("Pulling systemd image:", imageName) + _, err = imageManager.CreateImage(ctx, images.CreateImageRequest{ + Name: imageName, + }) + require.NoError(t, err) + + // Wait for image to be ready + t.Log("Waiting for image build...") + var img *images.Image + for i := 0; i < 120; i++ { + img, err = imageManager.GetImage(ctx, imageName) + if err == nil && img.Status == images.StatusReady { + break + } + time.Sleep(1 * time.Second) + } + require.Equal(t, images.StatusReady, img.Status, "image should be ready") + + // Verify systemd detection + t.Run("IsSystemdImage", func(t *testing.T) { + isSystemd := images.IsSystemdImage(img.Entrypoint, img.Cmd) + assert.True(t, isSystemd, "image should be detected as systemd, entrypoint=%v cmd=%v", img.Entrypoint, img.Cmd) + }) + + // Ensure system files (kernel, initrd) + t.Log("Ensuring system files...") + err = systemManager.EnsureSystemFiles(ctx) + require.NoError(t, err) + + // Create the systemd instance + t.Log("Creating systemd instance...") + inst, err := instanceManager.CreateInstance(ctx, instances.CreateInstanceRequest{ + Name: "systemd-test", + Image: imageName, + Size: 2 * 1024 * 1024 * 1024, // 2GB + HotplugSize: 512 * 1024 * 1024, + OverlaySize: 1024 * 1024 * 1024, + Vcpus: 2, + NetworkEnabled: false, // No network needed for this test + }) + require.NoError(t, err) + t.Logf("Instance created: %s", inst.Id) + + // Wait for guest agent to be ready + t.Log("Waiting for guest agent...") + err = waitForGuestAgent(ctx, instanceManager, inst.Id, 60*time.Second) + require.NoError(t, err, "guest agent should be ready") + + // Test: Verify systemd is PID 1 + t.Run("SystemdIsPID1", func(t *testing.T) { + output, exitCode, err := execInInstance(ctx, inst, "cat", "/proc/1/comm") + require.NoError(t, err, "exec should work") + require.Equal(t, 0, exitCode, "command should succeed") + + pid1Name := strings.TrimSpace(output) + assert.Equal(t, "systemd", pid1Name, "PID 1 should be systemd") + t.Logf("PID 1 is: %s", pid1Name) + }) + 
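+	// The subtests below exec through the guest agent over vsock, so they only
+	// pass if the agent (run here by the injected hypeman-agent.service) is responding.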
+ // Test: Verify guest-agent binary exists + t.Run("GuestAgentExists", func(t *testing.T) { + output, exitCode, err := execInInstance(ctx, inst, "test", "-x", "/opt/hypeman/guest-agent") + require.NoError(t, err, "exec should work") + assert.Equal(t, 0, exitCode, "guest-agent binary should exist at /opt/hypeman/guest-agent, output: %s", output) + }) + + // Test: Verify hypeman-agent.service is active + t.Run("AgentServiceActive", func(t *testing.T) { + output, exitCode, err := execInInstance(ctx, inst, "systemctl", "is-active", "hypeman-agent") + require.NoError(t, err, "exec should work") + status := strings.TrimSpace(output) + assert.Equal(t, 0, exitCode, "hypeman-agent service should be active, status: %s", status) + assert.Equal(t, "active", status, "service status should be 'active'") + t.Logf("hypeman-agent service status: %s", status) + }) + + // Test: Verify we can view agent logs via journalctl + t.Run("AgentLogsAccessible", func(t *testing.T) { + output, exitCode, err := execInInstance(ctx, inst, "journalctl", "-u", "hypeman-agent", "--no-pager", "-n", "5") + require.NoError(t, err, "exec should work") + assert.Equal(t, 0, exitCode, "journalctl should succeed") + t.Logf("Agent logs (last 5 lines):\n%s", output) + }) + + t.Log("All systemd mode tests passed!") +} + +// waitForGuestAgent polls until the guest agent is ready +func waitForGuestAgent(ctx context.Context, mgr instances.Manager, instanceID string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + inst, err := mgr.GetInstance(ctx, instanceID) + if err != nil { + time.Sleep(500 * time.Millisecond) + continue + } + + // Try to connect to the guest agent + dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID) + if err != nil { + time.Sleep(500 * time.Millisecond) + continue + } + + // Try a simple exec to verify agent is responding + var stdout bytes.Buffer + _, err = guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{ + Command: []string{"echo", "ready"}, + Stdout: &stdout, + TTY: false, + }) + if err == nil { + return nil + } + + time.Sleep(500 * time.Millisecond) + } + return context.DeadlineExceeded +} + +// execInInstance executes a command in the instance +func execInInstance(ctx context.Context, inst *instances.Instance, command ...string) (string, int, error) { + dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID) + if err != nil { + return "", -1, err + } + + var stdout, stderr bytes.Buffer + exit, err := guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{ + Command: command, + Stdout: &stdout, + Stderr: &stderr, + TTY: false, + }) + if err != nil { + return stderr.String(), -1, err + } + + return stdout.String(), exit.Code, nil +} diff --git a/lib/guest/client.go b/lib/guest/client.go index dc7a7fd..48361a2 100644 --- a/lib/guest/client.go +++ b/lib/guest/client.go @@ -2,6 +2,7 @@ package guest import ( "context" + "errors" "fmt" "io" "io/fs" @@ -25,6 +26,31 @@ const ( vsockGuestPort = 2222 ) +// AgentConnectionError indicates the guest agent is not responding. 
+// This can happen if: +// - The VM is still booting +// - The guest agent was stopped or deleted +// - The VM is in systemd mode and the agent service failed to start +type AgentConnectionError struct { + Err error +} + +func (e *AgentConnectionError) Error() string { + return fmt.Sprintf("guest agent not responding (it may have been stopped, deleted, or the VM is still booting): %v", e.Err) +} + +func (e *AgentConnectionError) Unwrap() error { + return e.Err +} + +// IsAgentConnectionError checks if an error is due to the guest agent not responding. +func IsAgentConnectionError(err error) bool { + var agentErr *AgentConnectionError + return err != nil && (strings.Contains(err.Error(), "guest agent not responding") || + strings.Contains(err.Error(), "connection refused") || + errors.As(err, &agentErr)) +} + // connPool manages reusable gRPC connections per vsock dialer key // This avoids the overhead and potential issues of rapidly creating/closing connections var connPool = struct { @@ -59,7 +85,11 @@ func GetOrCreateConn(ctx context.Context, dialer hypervisor.VsockDialer) (*grpc. // Create new connection using the VsockDialer conn, err := grpc.Dial("passthrough:///vsock", grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { - return dialer.DialVsock(ctx, vsockGuestPort) + netConn, err := dialer.DialVsock(ctx, vsockGuestPort) + if err != nil { + return nil, &AgentConnectionError{Err: err} + } + return netConn, nil }), grpc.WithTransportCredentials(insecure.NewCredentials()), ) diff --git a/lib/images/systemd.go b/lib/images/systemd.go new file mode 100644 index 0000000..661477f --- /dev/null +++ b/lib/images/systemd.go @@ -0,0 +1,43 @@ +package images + +import "strings" + +// IsSystemdImage checks if the image's CMD indicates it wants systemd as init. +// Detection is based on the effective command (entrypoint + cmd), not whether +// systemd is installed in the image. +// +// Returns true if the image's command is: +// - /sbin/init +// - /lib/systemd/systemd +// - /usr/lib/systemd/systemd +// - Any path ending in /init +func IsSystemdImage(entrypoint, cmd []string) bool { + // Combine to get the actual command that will run + effective := append(entrypoint, cmd...) 
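+	// Only the first element (argv[0]) of the combined command is inspected below.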
+ if len(effective) == 0 { + return false + } + + first := effective[0] + + // Match specific systemd/init paths + systemdPaths := []string{ + "/sbin/init", + "/lib/systemd/systemd", + "/usr/lib/systemd/systemd", + } + for _, p := range systemdPaths { + if first == p { + return true + } + } + + // Match any absolute path ending in /init (e.g., /usr/sbin/init) + // Only match absolute paths to avoid false positives like "./init" + if strings.HasPrefix(first, "/") && strings.HasSuffix(first, "/init") { + return true + } + + return false +} + diff --git a/lib/images/systemd_test.go b/lib/images/systemd_test.go new file mode 100644 index 0000000..8b55587 --- /dev/null +++ b/lib/images/systemd_test.go @@ -0,0 +1,91 @@ +package images + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIsSystemdImage(t *testing.T) { + tests := []struct { + name string + entrypoint []string + cmd []string + expected bool + }{ + { + name: "empty entrypoint and cmd", + entrypoint: nil, + cmd: nil, + expected: false, + }, + { + name: "/sbin/init as cmd", + entrypoint: nil, + cmd: []string{"/sbin/init"}, + expected: true, + }, + { + name: "/lib/systemd/systemd as cmd", + entrypoint: nil, + cmd: []string{"/lib/systemd/systemd"}, + expected: true, + }, + { + name: "/usr/lib/systemd/systemd as cmd", + entrypoint: nil, + cmd: []string{"/usr/lib/systemd/systemd"}, + expected: true, + }, + { + name: "path ending in /init", + entrypoint: nil, + cmd: []string{"/usr/sbin/init"}, + expected: true, + }, + { + name: "regular command (nginx)", + entrypoint: []string{"nginx"}, + cmd: []string{"-g", "daemon off;"}, + expected: false, + }, + { + name: "regular command (python)", + entrypoint: []string{"/usr/bin/python3"}, + cmd: []string{"app.py"}, + expected: false, + }, + { + name: "entrypoint with systemd", + entrypoint: []string{"/lib/systemd/systemd"}, + cmd: nil, + expected: true, + }, + { + name: "entrypoint with init", + entrypoint: []string{"/sbin/init"}, + cmd: nil, + expected: true, + }, + { + name: "shell script named init should not match", + entrypoint: nil, + cmd: []string{"./init"}, + expected: false, + }, + { + name: "bash command should not match", + entrypoint: nil, + cmd: []string{"/bin/bash", "-c", "init"}, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := IsSystemdImage(tt.entrypoint, tt.cmd) + assert.Equal(t, tt.expected, result) + }) + } +} + diff --git a/lib/instances/configdisk.go b/lib/instances/configdisk.go index fb9305a..ba53ef7 100644 --- a/lib/instances/configdisk.go +++ b/lib/instances/configdisk.go @@ -151,6 +151,15 @@ GUEST_DNS="%s" volumeSection = volumeLines.String() } + // Determine init mode based on image CMD + // If the image's command is /sbin/init or /lib/systemd/systemd, use systemd mode + initModeSection := "" + if images.IsSystemdImage(imageInfo.Entrypoint, imageInfo.Cmd) { + initModeSection = "\n# Init mode (auto-detected from image CMD)\nINIT_MODE=\"systemd\"\n" + } else { + initModeSection = "\n# Init mode\nINIT_MODE=\"exec\"\n" + } + // Generate script as a readable template block // ENTRYPOINT and CMD contain shell-quoted arrays that will be eval'd in init script := fmt.Sprintf(`#!/bin/sh @@ -162,7 +171,7 @@ CMD="%s" WORKDIR=%s # Environment variables -%s%s%s%s`, +%s%s%s%s%s`, inst.Id, entrypoint, cmd, @@ -171,6 +180,7 @@ WORKDIR=%s networkSection, volumeSection, gpuSection, + initModeSection, ) return script diff --git a/lib/system/README.md b/lib/system/README.md index 68629f0..59e8ce2 100644 
--- a/lib/system/README.md +++ b/lib/system/README.md @@ -5,10 +5,11 @@ Manages versioned kernel and initrd files for Cloud Hypervisor VMs. ## Features - **Automatic Downloads**: Kernel downloaded from Cloud Hypervisor releases on first use -- **Automatic Build**: Initrd built from busybox + custom init script +- **Automatic Build**: Initrd built from Alpine base + Go init binary + guest-agent - **Versioned**: Side-by-side support for multiple kernel/initrd versions - **Zero Docker**: Uses OCI directly (reuses image manager infrastructure) - **Zero Image Modifications**: All init logic in initrd, OCI images used as-is +- **Dual Mode Support**: Exec mode (container-like) and systemd mode (full VM) ## Architecture @@ -54,15 +55,27 @@ Instance B (running): kernel v6.12.9, initrd v1.0.0 Both work independently ``` -## Init Script Consolidation +## Go Init Binary -All init logic moved from app rootfs to initrd: +The init binary (`lib/system/init/`) is a Go program that runs as PID 1 in the guest VM. +It replaces the previous shell-based init script with cleaner logic and structured logging. **Initrd handles:** - ✅ Mount overlay filesystem - ✅ Mount and source config disk - ✅ Network configuration (if enabled) -- ✅ Execute container entrypoint +- ✅ Load GPU drivers (if GPU attached) +- ✅ Mount volumes +- ✅ Auto-detect systemd images from CMD +- ✅ Execute container entrypoint (exec mode) +- ✅ Hand off to systemd via pivot_root (systemd mode) + +**Two boot modes:** +- **Exec mode** (default): Init binary is PID 1, runs entrypoint as child process +- **Systemd mode** (auto-detected): Uses pivot_root to hand off to systemd as PID 1 + +**Systemd detection:** If image CMD is `/sbin/init`, `/lib/systemd/systemd`, or similar, +hypeman automatically uses systemd mode. **Result:** OCI images require **zero modifications** - no `/init` script needed! @@ -107,9 +120,11 @@ Example URLs: ## Initrd Build Process -1. **Pull busybox** (using image manager's OCI client) -2. **Inject init script** (comprehensive, handles all init logic) -3. **Package as cpio.gz** (initramfs format) +1. **Pull Alpine base** (using image manager's OCI client) +2. **Add guest-agent binary** (embedded, runs in guest for exec/shell) +3. **Add init binary** (embedded Go binary, runs as PID 1) +4. **Add NVIDIA modules** (optional, for GPU passthrough) +5. **Package as cpio.gz** (initramfs format) **Build tools required:** `find`, `cpio`, `gzip` (standard Unix tools) @@ -136,21 +151,18 @@ var KernelDownloadURLs = map[KernelVersion]map[string]string{ var DefaultKernelVersion = KernelV6_12_10 ``` -### New Initrd Version +### Updating the Init Binary -```go -// lib/system/versions.go +The init binary is in `lib/system/init/`. After making changes: -const ( - InitrdV1_1_0 InitrdVersion = "v1.1.0" // Add constant -) +1. Build the init binary (statically linked for Alpine): + ```bash + make build-init + ``` -// lib/system/init_script.go -// Update GenerateInitScript() if init logic changes +2. The binary is embedded via `lib/system/init_binary.go` -// Update default -var DefaultInitrdVersion = InitrdV1_1_0 -``` +3. The initrd hash includes the binary, so it will auto-rebuild on next startup ## Testing @@ -167,7 +179,22 @@ go test ./lib/system/... 
| File | Size | Purpose | |------|------|---------| | kernel/*/vmlinux | ~70MB | Cloud Hypervisor optimized kernel | -| initrd/*/initrd | ~1-2MB | Busybox + comprehensive init script | +| initrd/*/initrd | ~5-10MB | Alpine base + Go init binary + guest-agent | Files downloaded/built once per version, reused for all instances using that version. +## Init Binary Package Structure + +``` +lib/system/init/ + main.go # Entry point, orchestrates boot + mount.go # Mount operations (proc, sys, dev, overlay) + config.go # Parse config disk + network.go # Network configuration + drivers.go # GPU driver loading + volumes.go # Volume mounting + mode_exec.go # Exec mode: run entrypoint + mode_systemd.go # Systemd mode: pivot_root + exec init + logger.go # Human-readable logging to hypeman operations log +``` + diff --git a/lib/system/guest_agent_binary.go b/lib/system/guest_agent_binary.go index 78a5b7b..57d6972 100644 --- a/lib/system/guest_agent_binary.go +++ b/lib/system/guest_agent_binary.go @@ -6,4 +6,3 @@ import _ "embed" // This is built by the Makefile before the main binary is compiled //go:embed guest_agent/guest-agent var GuestAgentBinary []byte - diff --git a/lib/system/init/config.go b/lib/system/init/config.go new file mode 100644 index 0000000..a6ef1e0 --- /dev/null +++ b/lib/system/init/config.go @@ -0,0 +1,194 @@ +package main + +import ( + "bufio" + "fmt" + "os" + "os/exec" + "strings" +) + +// Config holds the parsed configuration from the config disk. +type Config struct { + // Container execution parameters + Entrypoint string + Cmd string + Workdir string + + // Environment variables + Env map[string]string + + // Network configuration + NetworkEnabled bool + GuestIP string + GuestCIDR string + GuestGW string + GuestDNS string + + // GPU passthrough + HasGPU bool + + // Volume mounts (format: "device:path:mode[:overlay_device] ...") + VolumeMounts []VolumeMount + + // Init mode: "exec" (default) or "systemd" + InitMode string +} + +// VolumeMount represents a volume mount configuration. +type VolumeMount struct { + Device string + Path string + Mode string // "ro", "rw", or "overlay" + OverlayDevice string // Only used for overlay mode +} + +// readConfig mounts and reads the config disk, parsing the shell configuration. +func readConfig(log *Logger) (*Config, error) { + const configMount = "/mnt/config" + const configFile = "/mnt/config/config.sh" + + // Create mount point + if err := os.MkdirAll(configMount, 0755); err != nil { + return nil, fmt.Errorf("mkdir config mount: %w", err) + } + + // Mount config disk (/dev/vdc) read-only + cmd := exec.Command("/bin/mount", "-o", "ro", "/dev/vdc", configMount) + if output, err := cmd.CombinedOutput(); err != nil { + return nil, fmt.Errorf("mount config disk: %s: %s", err, output) + } + log.Info("config", "mounted config disk") + + // Read and parse config.sh + cfg, err := parseConfigFile(configFile) + if err != nil { + return nil, fmt.Errorf("parse config: %w", err) + } + log.Info("config", "parsed configuration") + + return cfg, nil +} + +// parseConfigFile parses a shell-style configuration file. +// It handles simple KEY=VALUE and KEY="VALUE" assignments. 
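+// Quoted values (single or double) are unquoted, and unrecognized keys are collected as container environment variables.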
+func parseConfigFile(path string) (*Config, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + cfg := &Config{ + Env: make(map[string]string), + InitMode: "exec", // Default to exec mode + } + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + + // Skip comments and empty lines + if line == "" || strings.HasPrefix(line, "#") { + continue + } + + // Handle export statements + if strings.HasPrefix(line, "export ") { + line = strings.TrimPrefix(line, "export ") + } + + // Parse KEY=VALUE + parts := strings.SplitN(line, "=", 2) + if len(parts) != 2 { + continue + } + + key := strings.TrimSpace(parts[0]) + value := unquote(strings.TrimSpace(parts[1])) + + switch key { + case "ENTRYPOINT": + cfg.Entrypoint = value + case "CMD": + cfg.Cmd = value + case "WORKDIR": + cfg.Workdir = value + case "GUEST_IP": + cfg.GuestIP = value + cfg.NetworkEnabled = true + case "GUEST_CIDR": + cfg.GuestCIDR = value + case "GUEST_GW": + cfg.GuestGW = value + case "GUEST_DNS": + cfg.GuestDNS = value + case "HAS_GPU": + cfg.HasGPU = value == "1" + case "VOLUME_MOUNTS": + cfg.VolumeMounts = parseVolumeMounts(value) + case "INIT_MODE": + cfg.InitMode = value + default: + // Treat as environment variable + cfg.Env[key] = value + } + } + + if err := scanner.Err(); err != nil { + return nil, err + } + + return cfg, nil +} + +// parseVolumeMounts parses the VOLUME_MOUNTS string. +// Format: "device:path:mode[:overlay_device] device:path:mode ..." +func parseVolumeMounts(s string) []VolumeMount { + if s == "" { + return nil + } + + var mounts []VolumeMount + for _, vol := range strings.Fields(s) { + parts := strings.Split(vol, ":") + if len(parts) < 3 { + continue + } + + mount := VolumeMount{ + Device: parts[0], + Path: parts[1], + Mode: parts[2], + } + + if len(parts) >= 4 { + mount.OverlayDevice = parts[3] + } + + mounts = append(mounts, mount) + } + + return mounts +} + +// unquote removes surrounding quotes from a string. +// Handles both single and double quotes. +func unquote(s string) string { + if len(s) < 2 { + return s + } + + // Handle double quotes + if s[0] == '"' && s[len(s)-1] == '"' { + return s[1 : len(s)-1] + } + + // Handle single quotes + if s[0] == '\'' && s[len(s)-1] == '\'' { + return s[1 : len(s)-1] + } + + return s +} + diff --git a/lib/system/init/drivers.go b/lib/system/init/drivers.go new file mode 100644 index 0000000..935e0f9 --- /dev/null +++ b/lib/system/init/drivers.go @@ -0,0 +1,190 @@ +package main + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" +) + +// loadGPUDrivers loads NVIDIA kernel modules for GPU passthrough. 
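+// Modules are loaded via insmod in dependency order (nvidia, nvidia-uvm, nvidia-modeset, nvidia-drm);
+// individual module load failures are logged but do not abort boot.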
+func loadGPUDrivers(log *Logger) error { + log.Info("gpu", "loading NVIDIA kernel modules") + + // Find kernel version directory + modules, err := os.ReadDir("/lib/modules") + if err != nil { + return fmt.Errorf("read /lib/modules: %w", err) + } + + if len(modules) == 0 { + return fmt.Errorf("no kernel modules found") + } + + kver := modules[0].Name() + gpuDir := filepath.Join("/lib/modules", kver, "kernel/drivers/gpu") + + if _, err := os.Stat(gpuDir); err != nil { + return fmt.Errorf("GPU modules not found for kernel %s", kver) + } + + // Load modules in order (dependencies first) + moduleOrder := []string{ + "nvidia.ko", + "nvidia-uvm.ko", + "nvidia-modeset.ko", + "nvidia-drm.ko", + } + + for _, mod := range moduleOrder { + modPath := filepath.Join(gpuDir, mod) + if _, err := os.Stat(modPath); err != nil { + log.Error("gpu", fmt.Sprintf("%s not found", mod), nil) + continue + } + + args := []string{modPath} + // nvidia-drm needs modeset=1 + if mod == "nvidia-drm.ko" { + args = append(args, "modeset=1") + } + + cmd := exec.Command("/sbin/insmod", args...) + if output, err := cmd.CombinedOutput(); err != nil { + log.Error("gpu", fmt.Sprintf("insmod %s failed", mod), fmt.Errorf("%s", output)) + } + } + + log.Info("gpu", fmt.Sprintf("loaded NVIDIA modules for kernel %s", kver)) + + // Create device nodes using nvidia-modprobe if available + if err := createNvidiaDevices(log); err != nil { + log.Error("gpu", "failed to create device nodes", err) + } + + // Inject NVIDIA userspace driver libraries into container rootfs + if err := injectNvidiaLibraries(log); err != nil { + log.Error("gpu", "failed to inject driver libraries", err) + } + + return nil +} + +// createNvidiaDevices creates NVIDIA device nodes. +func createNvidiaDevices(log *Logger) error { + // Try nvidia-modprobe first (the official NVIDIA utility) + if _, err := os.Stat("/usr/bin/nvidia-modprobe"); err == nil { + log.Info("gpu", "running nvidia-modprobe to create device nodes") + + cmd := exec.Command("/usr/bin/nvidia-modprobe") + cmd.CombinedOutput() + + cmd = exec.Command("/usr/bin/nvidia-modprobe", "-u", "-c=0") + cmd.CombinedOutput() + + return nil + } + + // Fallback: Manual device node creation + log.Info("gpu", "nvidia-modprobe not found, creating device nodes manually") + + // Read major numbers from /proc/devices + data, err := os.ReadFile("/proc/devices") + if err != nil { + return err + } + + lines := strings.Split(string(data), "\n") + var nvidiaMajor, uvmMajor string + + for _, line := range lines { + fields := strings.Fields(line) + if len(fields) >= 2 { + if fields[1] == "nvidia-frontend" || fields[1] == "nvidia" { + nvidiaMajor = fields[0] + } else if fields[1] == "nvidia-uvm" { + uvmMajor = fields[0] + } + } + } + + if nvidiaMajor != "" { + exec.Command("/bin/mknod", "-m", "666", "/dev/nvidiactl", "c", nvidiaMajor, "255").Run() + exec.Command("/bin/mknod", "-m", "666", "/dev/nvidia0", "c", nvidiaMajor, "0").Run() + log.Info("gpu", fmt.Sprintf("created /dev/nvidiactl and /dev/nvidia0 (major %s)", nvidiaMajor)) + } + + if uvmMajor != "" { + exec.Command("/bin/mknod", "-m", "666", "/dev/nvidia-uvm", "c", uvmMajor, "0").Run() + exec.Command("/bin/mknod", "-m", "666", "/dev/nvidia-uvm-tools", "c", uvmMajor, "1").Run() + log.Info("gpu", fmt.Sprintf("created /dev/nvidia-uvm* (major %s)", uvmMajor)) + } + + return nil +} + +// injectNvidiaLibraries injects NVIDIA userspace driver libraries into the container rootfs. +// This allows containers to use standard CUDA images without bundled drivers. 
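+// Libraries are copied from /usr/lib/nvidia in the initrd into the container's arch-specific
+// /usr/lib path, standard .so symlinks are created, and the ldconfig cache is refreshed.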
+func injectNvidiaLibraries(log *Logger) error { + srcDir := "/usr/lib/nvidia" + if _, err := os.Stat(srcDir); err != nil { + return nil // No driver libraries to inject + } + + log.Info("gpu", "injecting NVIDIA driver libraries into container") + + // Determine library path based on architecture + var libDst string + if runtime.GOARCH == "arm64" { + libDst = "/overlay/newroot/usr/lib/aarch64-linux-gnu" + } else { + libDst = "/overlay/newroot/usr/lib/x86_64-linux-gnu" + } + binDst := "/overlay/newroot/usr/bin" + + if err := os.MkdirAll(libDst, 0755); err != nil { + return err + } + if err := os.MkdirAll(binDst, 0755); err != nil { + return err + } + + // Copy all driver libraries + libs, _ := filepath.Glob(filepath.Join(srcDir, "*.so.*")) + for _, lib := range libs { + libname := filepath.Base(lib) + data, err := os.ReadFile(lib) + if err != nil { + continue + } + os.WriteFile(filepath.Join(libDst, libname), data, 0755) + + // Create standard symlinks + base := strings.Split(libname, ".so.")[0] + os.Symlink(libname, filepath.Join(libDst, base+".so.1")) + os.Symlink(base+".so.1", filepath.Join(libDst, base+".so")) + } + + // Copy nvidia-smi and nvidia-modprobe binaries + for _, bin := range []string{"nvidia-smi", "nvidia-modprobe"} { + srcPath := filepath.Join("/usr/bin", bin) + if data, err := os.ReadFile(srcPath); err == nil { + os.WriteFile(filepath.Join(binDst, bin), data, 0755) + } + } + + // Update ldconfig cache + exec.Command("/usr/sbin/chroot", "/overlay/newroot", "ldconfig").Run() + + // Read driver version + version := "unknown" + if data, err := os.ReadFile(filepath.Join(srcDir, "version")); err == nil { + version = strings.TrimSpace(string(data)) + } + + log.Info("gpu", fmt.Sprintf("injected NVIDIA driver libraries (version: %s)", version)) + return nil +} + diff --git a/lib/system/init/init.sh b/lib/system/init/init.sh new file mode 100644 index 0000000..06b739c --- /dev/null +++ b/lib/system/init/init.sh @@ -0,0 +1,17 @@ +#!/bin/sh +# Minimal init wrapper that sets up environment before running Go init +# The Go runtime needs /proc and /dev to exist during initialization +# +# This pattern is used by other Go-based init systems: +# - u-root (github.com/u-root/u-root) - uses assembly stub for early mount +# - LinuxKit (github.com/linuxkit/linuxkit) - similar shell wrapper approach +# - gokrazy (github.com/gokrazy/gokrazy) - mounts filesystems before Go starts + +# Mount essential filesystems BEFORE running Go binary +mkdir -p /proc /sys /dev +mount -t proc proc /proc +mount -t sysfs sysfs /sys +mount -t devtmpfs devtmpfs /dev + +# Now exec the Go init binary (it will take over as PID 1) +exec /init.bin "$@" diff --git a/lib/system/init/logger.go b/lib/system/init/logger.go new file mode 100644 index 0000000..6d0a521 --- /dev/null +++ b/lib/system/init/logger.go @@ -0,0 +1,71 @@ +package main + +import ( + "fmt" + "os" + "time" +) + +// Logger provides human-readable structured logging for the init process. +// Logs are written to serial console. +type Logger struct { + console *os.File +} + +// NewLogger creates a new logger that writes to serial console. 
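+// It tries /dev/ttyAMA0 (ARM64) first, then /dev/ttyS0 (x86_64), and falls back to stdout if neither exists.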
+func NewLogger() *Logger { + l := &Logger{} + + // Open serial console for output + // ttyS0 for x86_64, ttyAMA0 for ARM64 (PL011 UART) + if f, err := os.OpenFile("/dev/ttyAMA0", os.O_WRONLY, 0); err == nil { + l.console = f + } else if f, err := os.OpenFile("/dev/ttyS0", os.O_WRONLY, 0); err == nil { + l.console = f + } else { + // Fallback to stdout + l.console = os.Stdout + } + return l +} + +// SetConsole sets the serial console for output. +func (l *Logger) SetConsole(path string) { + if f, err := os.OpenFile(path, os.O_WRONLY, 0); err == nil { + l.console = f + } +} + +// Info logs an informational message. +// Format: 2024-12-23T10:15:30Z [INFO] [phase] message +func (l *Logger) Info(phase, msg string) { + ts := time.Now().UTC().Format(time.RFC3339) + line := fmt.Sprintf("%s [INFO] [%s] %s\n", ts, phase, msg) + l.write(line) +} + +// Error logs an error message. +// Format: 2024-12-23T10:15:30Z [ERROR] [phase] message: error +func (l *Logger) Error(phase, msg string, err error) { + ts := time.Now().UTC().Format(time.RFC3339) + var line string + if err != nil { + line = fmt.Sprintf("%s [ERROR] [%s] %s: %v\n", ts, phase, msg, err) + } else { + line = fmt.Sprintf("%s [ERROR] [%s] %s\n", ts, phase, msg) + } + l.write(line) +} + +// Infof logs a formatted informational message. +func (l *Logger) Infof(phase, format string, args ...interface{}) { + msg := fmt.Sprintf(format, args...) + l.Info(phase, msg) +} + +// write outputs a log line to serial console. +func (l *Logger) write(line string) { + if l.console != nil { + l.console.WriteString(line) + } +} diff --git a/lib/system/init/main.go b/lib/system/init/main.go new file mode 100644 index 0000000..ca25e42 --- /dev/null +++ b/lib/system/init/main.go @@ -0,0 +1,95 @@ +// Package main implements the hypeman init binary that runs as PID 1 in guest VMs. +// +// This binary replaces the shell-based init script with a Go program that provides: +// - Human-readable structured logging +// - Clean separation of boot phases +// - Support for both exec mode (container-like) and systemd mode (full VM) +// +// Note: This binary is called by init.sh wrapper which mounts /proc, /sys, /dev +// before the Go runtime starts (Go requires these during initialization). 
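+// Boot proceeds in numbered phases; a failure in a critical phase drops to an interactive debug shell.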
+package main + +import ( + "os" + "os/exec" +) + +func main() { + log := NewLogger() + log.Info("boot", "init starting") + + // Phase 1: Mount additional filesystems (proc/sys/dev already mounted by init.sh) + if err := mountEssentials(log); err != nil { + log.Error("mount", "failed to mount essentials", err) + dropToShell() + } + + // Phase 2: Setup overlay rootfs + if err := setupOverlay(log); err != nil { + log.Error("overlay", "failed to setup overlay", err) + dropToShell() + } + + // Phase 3: Read and parse config + cfg, err := readConfig(log) + if err != nil { + log.Error("config", "failed to read config", err) + dropToShell() + } + + // Phase 4: Configure network (shared between modes) + if cfg.NetworkEnabled { + if err := configureNetwork(log, cfg); err != nil { + log.Error("network", "failed to configure network", err) + // Continue anyway - network isn't always required + } + } + + // Phase 5: Load GPU drivers if needed + if cfg.HasGPU { + if err := loadGPUDrivers(log); err != nil { + log.Error("gpu", "failed to load GPU drivers", err) + // Continue anyway + } + } + + // Phase 6: Mount volumes + if len(cfg.VolumeMounts) > 0 { + if err := mountVolumes(log, cfg); err != nil { + log.Error("volumes", "failed to mount volumes", err) + // Continue anyway + } + } + + // Phase 7: Bind mount filesystems to new root + if err := bindMountsToNewRoot(log); err != nil { + log.Error("bind", "failed to bind mounts", err) + dropToShell() + } + + // Phase 8: Copy guest-agent to target location + if err := copyGuestAgent(log); err != nil { + log.Error("agent", "failed to copy guest-agent", err) + // Continue anyway - exec will still work, just no remote access + } + + // Phase 9: Mode-specific execution + if cfg.InitMode == "systemd" { + log.Info("mode", "entering systemd mode") + runSystemdMode(log, cfg) + } else { + log.Info("mode", "entering exec mode") + runExecMode(log, cfg) + } +} + +// dropToShell drops to an interactive shell for debugging when boot fails +func dropToShell() { + cmd := exec.Command("/bin/sh", "-i") + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + cmd.Run() + os.Exit(1) +} + diff --git a/lib/system/init/mode_exec.go b/lib/system/init/mode_exec.go new file mode 100644 index 0000000..c7152ca --- /dev/null +++ b/lib/system/init/mode_exec.go @@ -0,0 +1,100 @@ +package main + +import ( + "fmt" + "os" + "os/exec" + "syscall" +) + +// runExecMode runs the container in exec mode (default). 
+// This is the Docker-like behavior where: +// - The init binary remains PID 1 +// - Guest-agent runs as a background process +// - The container entrypoint runs as a child process +// - When the entrypoint exits, the VM exits +func runExecMode(log *Logger, cfg *Config) { + const newroot = "/overlay/newroot" + + // Set up environment + os.Setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin") + os.Setenv("HOME", "/root") + + // Start guest-agent in background inside the container namespace + log.Info("exec", "starting guest-agent in background") + agentCmd := exec.Command("/usr/sbin/chroot", newroot, "/opt/hypeman/guest-agent") + agentCmd.Stdout = os.Stdout + agentCmd.Stderr = os.Stderr + if err := agentCmd.Start(); err != nil { + log.Error("exec", "failed to start guest-agent", err) + } + + // Build the entrypoint command + workdir := cfg.Workdir + if workdir == "" { + workdir = "/" + } + + entrypoint := cfg.Entrypoint + cmd := cfg.Cmd + + log.Info("exec", fmt.Sprintf("workdir=%s entrypoint=%s cmd=%s", workdir, entrypoint, cmd)) + + // Construct the shell command to run + // ENTRYPOINT and CMD are shell-safe quoted strings from config.sh + shellCmd := fmt.Sprintf("cd %s && exec %s %s", workdir, entrypoint, cmd) + + log.Info("exec", "launching entrypoint") + + // Run the entrypoint + appCmd := exec.Command("/usr/sbin/chroot", newroot, "/bin/sh", "-c", shellCmd) + appCmd.Stdin = os.Stdin + appCmd.Stdout = os.Stdout + appCmd.Stderr = os.Stderr + + // Set up environment for the app + appCmd.Env = buildEnv(cfg.Env) + + if err := appCmd.Start(); err != nil { + log.Error("exec", "failed to start entrypoint", err) + dropToShell() + } + + log.Info("exec", fmt.Sprintf("container app started (PID %d)", appCmd.Process.Pid)) + + // Wait for app to exit + err := appCmd.Wait() + exitCode := 0 + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + exitCode = exitErr.ExitCode() + } + } + + log.Info("exec", fmt.Sprintf("app exited with code %d", exitCode)) + + // Wait for guest-agent (keeps init alive, prevents kernel panic) + // The guest-agent runs forever, so this effectively keeps the VM alive + // until it's explicitly terminated + if agentCmd.Process != nil { + agentCmd.Wait() + } + + // Exit with the app's exit code + syscall.Exit(exitCode) +} + +// buildEnv constructs environment variables from the config. +func buildEnv(env map[string]string) []string { + result := []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "HOME=/root", + } + + for k, v := range env { + result = append(result, fmt.Sprintf("%s=%s", k, v)) + } + + return result +} + diff --git a/lib/system/init/mode_systemd.go b/lib/system/init/mode_systemd.go new file mode 100644 index 0000000..6f042a1 --- /dev/null +++ b/lib/system/init/mode_systemd.go @@ -0,0 +1,90 @@ +package main + +import ( + "os" + "syscall" +) + +// runSystemdMode hands off control to systemd. +// This is used when the image's CMD is /sbin/init or /lib/systemd/systemd. +// The init binary: +// 1. Injects the hypeman-agent.service unit +// 2. Uses chroot to switch to the container rootfs +// 3. 
Execs /sbin/init (systemd) which becomes the new PID 1 +func runSystemdMode(log *Logger, cfg *Config) { + const newroot = "/overlay/newroot" + + // Inject hypeman-agent.service + log.Info("systemd", "injecting hypeman-agent.service") + if err := injectAgentService(newroot); err != nil { + log.Error("systemd", "failed to inject service", err) + // Continue anyway - VM will work, just without agent + } + + // Change root to the new filesystem using chroot + log.Info("systemd", "executing chroot") + if err := syscall.Chroot(newroot); err != nil { + log.Error("systemd", "chroot failed", err) + dropToShell() + } + + // Change to new root directory + if err := os.Chdir("/"); err != nil { + log.Error("systemd", "chdir / failed", err) + dropToShell() + } + + // Exec systemd - this replaces the current process + log.Info("systemd", "exec /sbin/init") + + // syscall.Exec replaces the current process with the new one + // /sbin/init is typically a symlink to /lib/systemd/systemd + err := syscall.Exec("/sbin/init", []string{"/sbin/init"}, os.Environ()) + if err != nil { + log.Error("systemd", "exec /sbin/init failed", err) + dropToShell() + } +} + +// injectAgentService creates the systemd service unit for the hypeman guest-agent. +func injectAgentService(newroot string) error { + serviceContent := `[Unit] +Description=Hypeman Guest Agent +After=network.target +Wants=network.target + +[Service] +Type=simple +ExecStart=/opt/hypeman/guest-agent +Restart=always +RestartSec=3 +StandardOutput=journal +StandardError=journal + +[Install] +WantedBy=multi-user.target +` + + serviceDir := newroot + "/etc/systemd/system" + wantsDir := serviceDir + "/multi-user.target.wants" + + // Create directories + if err := os.MkdirAll(serviceDir, 0755); err != nil { + return err + } + if err := os.MkdirAll(wantsDir, 0755); err != nil { + return err + } + + // Write service file + servicePath := serviceDir + "/hypeman-agent.service" + if err := os.WriteFile(servicePath, []byte(serviceContent), 0644); err != nil { + return err + } + + // Enable the service by creating a symlink in wants directory + symlinkPath := wantsDir + "/hypeman-agent.service" + // Use relative path for the symlink + return os.Symlink("../hypeman-agent.service", symlinkPath) +} + diff --git a/lib/system/init/mount.go b/lib/system/init/mount.go new file mode 100644 index 0000000..3dcee32 --- /dev/null +++ b/lib/system/init/mount.go @@ -0,0 +1,210 @@ +package main + +import ( + "fmt" + "os" + "os/exec" + "syscall" + "time" +) + +// mountEssentials mounts additional filesystems needed for boot. +// Note: /proc, /sys, /dev are already mounted by the init.sh wrapper script +// before the Go binary runs (the Go runtime needs them during initialization). 
+// This function mounts: +// - /dev/pts (pseudo-terminals) +// - /dev/shm (shared memory) +func mountEssentials(log *Logger) error { + // Create mount points for pts and shm (proc/sys/dev already exist from wrapper) + for _, dir := range []string{"/dev/pts", "/dev/shm"} { + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("mkdir %s: %w", dir, err) + } + } + + // Mount devpts for PTY support (needed for guest-agent and interactive shells) + if err := syscall.Mount("devpts", "/dev/pts", "devpts", 0, ""); err != nil { + return fmt.Errorf("mount /dev/pts: %w", err) + } + + // Set permissions on /dev/shm + if err := os.Chmod("/dev/shm", 01777); err != nil { + return fmt.Errorf("chmod /dev/shm: %w", err) + } + + log.Info("mount", "mounted devpts/shm") + + // Set up serial console now that /dev is mounted + // ttyS0 for x86_64, ttyAMA0 for ARM64 (PL011 UART) + if _, err := os.Stat("/dev/ttyAMA0"); err == nil { + log.SetConsole("/dev/ttyAMA0") + redirectToConsole("/dev/ttyAMA0") + } else if _, err := os.Stat("/dev/ttyS0"); err == nil { + log.SetConsole("/dev/ttyS0") + redirectToConsole("/dev/ttyS0") + } + + log.Info("mount", "redirected to serial console") + + return nil +} + +// setupOverlay sets up the overlay filesystem: +// - /dev/vda: readonly rootfs (ext4) +// - /dev/vdb: writable overlay disk (ext4) +// - /overlay/newroot: merged overlay filesystem +func setupOverlay(log *Logger) error { + // Wait for block devices to be ready + time.Sleep(500 * time.Millisecond) + + // Create mount points + for _, dir := range []string{"/lower", "/overlay"} { + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("mkdir %s: %w", dir, err) + } + } + + // Mount readonly rootfs from /dev/vda (ext4 filesystem) + if err := mount("/dev/vda", "/lower", "ext4", "ro"); err != nil { + return fmt.Errorf("mount rootfs: %w", err) + } + log.Info("overlay", "mounted rootfs from /dev/vda") + + // Mount writable overlay disk from /dev/vdb + if err := mount("/dev/vdb", "/overlay", "ext4", ""); err != nil { + return fmt.Errorf("mount overlay disk: %w", err) + } + + // Create overlay directories + for _, dir := range []string{"/overlay/upper", "/overlay/work", "/overlay/newroot"} { + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("mkdir %s: %w", dir, err) + } + } + log.Info("overlay", "mounted overlay disk from /dev/vdb") + + // Create overlay filesystem + if err := mountOverlay("/lower", "/overlay/upper", "/overlay/work", "/overlay/newroot"); err != nil { + return fmt.Errorf("mount overlay: %w", err) + } + log.Info("overlay", "created overlay filesystem") + + return nil +} + +// bindMountsToNewRoot bind-mounts essential filesystems to the new root. +// Uses bind mounts instead of move so that the original /dev remains populated +// for processes running in the initrd namespace. 
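+// It also creates /dev/fd, /dev/stdin, /dev/stdout, and /dev/stderr symlinks inside the new root
+// so that process substitution works in the container.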
+func bindMountsToNewRoot(log *Logger) error { + newroot := "/overlay/newroot" + + // Create mount points in new root + for _, dir := range []string{"proc", "sys", "dev", "dev/pts"} { + if err := os.MkdirAll(newroot+"/"+dir, 0755); err != nil { + return fmt.Errorf("mkdir %s: %w", dir, err) + } + } + + // Bind mount filesystems + mounts := []struct{ src, dst string }{ + {"/proc", newroot + "/proc"}, + {"/sys", newroot + "/sys"}, + {"/dev", newroot + "/dev"}, + {"/dev/pts", newroot + "/dev/pts"}, + } + + for _, m := range mounts { + if err := bindMount(m.src, m.dst); err != nil { + return fmt.Errorf("bind mount %s: %w", m.src, err) + } + } + + log.Info("bind", "bound mounts to new root") + + // Set up /dev symlinks for process substitution inside the container + symlinks := []struct{ target, link string }{ + {"/proc/self/fd", newroot + "/dev/fd"}, + {"/proc/self/fd/0", newroot + "/dev/stdin"}, + {"/proc/self/fd/1", newroot + "/dev/stdout"}, + {"/proc/self/fd/2", newroot + "/dev/stderr"}, + } + + for _, s := range symlinks { + os.Remove(s.link) // Remove if exists + os.Symlink(s.target, s.link) + } + + return nil +} + +// mount executes a mount command +func mount(source, target, fstype, options string) error { + args := []string{"-t", fstype} + if options != "" { + args = append(args, "-o", options) + } + args = append(args, source, target) + + cmd := exec.Command("/bin/mount", args...) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("%s: %s", err, output) + } + return nil +} + +// mountOverlay creates an overlay filesystem +func mountOverlay(lower, upper, work, target string) error { + options := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lower, upper, work) + cmd := exec.Command("/bin/mount", "-t", "overlay", "-o", options, "overlay", target) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("%s: %s", err, output) + } + return nil +} + +// bindMount performs a bind mount +func bindMount(source, target string) error { + cmd := exec.Command("/bin/mount", "--bind", source, target) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("%s: %s", err, output) + } + return nil +} + +// redirectToConsole redirects stdout/stderr to the serial console +func redirectToConsole(device string) { + f, err := os.OpenFile(device, os.O_WRONLY, 0) + if err != nil { + return + } + os.Stdout = f + os.Stderr = f +} + +// copyGuestAgent copies the guest-agent binary to the target location in the new root. +func copyGuestAgent(log *Logger) error { + const ( + src = "/usr/local/bin/guest-agent" + dst = "/overlay/newroot/opt/hypeman/guest-agent" + ) + + // Create target directory + if err := os.MkdirAll("/overlay/newroot/opt/hypeman", 0755); err != nil { + return fmt.Errorf("mkdir: %w", err) + } + + // Read source binary + data, err := os.ReadFile(src) + if err != nil { + return fmt.Errorf("read source: %w", err) + } + + // Write to destination + if err := os.WriteFile(dst, data, 0755); err != nil { + return fmt.Errorf("write destination: %w", err) + } + + log.Info("agent", "copied guest-agent to /opt/hypeman/") + return nil +} + diff --git a/lib/system/init/network.go b/lib/system/init/network.go new file mode 100644 index 0000000..d9d3d00 --- /dev/null +++ b/lib/system/init/network.go @@ -0,0 +1,58 @@ +package main + +import ( + "fmt" + "os" + "os/exec" +) + +// configureNetwork sets up networking in the guest VM. +// This is done from the initrd before pivot_root so it works for both exec and systemd modes. 
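+// It brings up lo and eth0, assigns the configured static IP, adds the default route, and writes
+// resolv.conf into the new root.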
+func configureNetwork(log *Logger, cfg *Config) error { + // Bring up loopback interface + if err := runIP("link", "set", "lo", "up"); err != nil { + return fmt.Errorf("bring up lo: %w", err) + } + + // Add IP address to eth0 + addr := fmt.Sprintf("%s/%s", cfg.GuestIP, cfg.GuestCIDR) + if err := runIP("addr", "add", addr, "dev", "eth0"); err != nil { + return fmt.Errorf("add IP address: %w", err) + } + + // Bring up eth0 + if err := runIP("link", "set", "eth0", "up"); err != nil { + return fmt.Errorf("bring up eth0: %w", err) + } + + // Add default route + if err := runIP("route", "add", "default", "via", cfg.GuestGW); err != nil { + return fmt.Errorf("add default route: %w", err) + } + + // Configure DNS in the new root + resolvConf := fmt.Sprintf("nameserver %s\n", cfg.GuestDNS) + resolvPath := "/overlay/newroot/etc/resolv.conf" + + // Ensure /etc exists + if err := os.MkdirAll("/overlay/newroot/etc", 0755); err != nil { + return fmt.Errorf("mkdir /etc: %w", err) + } + + if err := os.WriteFile(resolvPath, []byte(resolvConf), 0644); err != nil { + return fmt.Errorf("write resolv.conf: %w", err) + } + + log.Info("network", fmt.Sprintf("configured eth0 with %s", addr)) + return nil +} + +// runIP executes an 'ip' command with the given arguments. +func runIP(args ...string) error { + cmd := exec.Command("/sbin/ip", args...) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("%s: %s", err, output) + } + return nil +} + diff --git a/lib/system/init/volumes.go b/lib/system/init/volumes.go new file mode 100644 index 0000000..2a52393 --- /dev/null +++ b/lib/system/init/volumes.go @@ -0,0 +1,109 @@ +package main + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" +) + +// mountVolumes mounts attached volumes according to the configuration. +// Supports three modes: ro (read-only), rw (read-write), and overlay. +func mountVolumes(log *Logger, cfg *Config) error { + log.Info("volumes", "mounting volumes") + + for _, vol := range cfg.VolumeMounts { + mountPath := filepath.Join("/overlay/newroot", vol.Path) + + // Create mount point + if err := os.MkdirAll(mountPath, 0755); err != nil { + log.Error("volumes", fmt.Sprintf("mkdir %s failed", vol.Path), err) + continue + } + + switch vol.Mode { + case "overlay": + if err := mountVolumeOverlay(log, vol, mountPath); err != nil { + log.Error("volumes", fmt.Sprintf("mount overlay %s failed", vol.Path), err) + } + case "ro": + if err := mountVolumeReadOnly(log, vol, mountPath); err != nil { + log.Error("volumes", fmt.Sprintf("mount ro %s failed", vol.Path), err) + } + default: // "rw" + if err := mountVolumeReadWrite(log, vol, mountPath); err != nil { + log.Error("volumes", fmt.Sprintf("mount rw %s failed", vol.Path), err) + } + } + } + + return nil +} + +// mountVolumeOverlay mounts a volume in overlay mode. +// Uses the base device as read-only lower layer and overlay device for writable upper layer. 
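+// The base device is mounted ro,noload as the lower layer, while the overlay device provides the
+// upper and work directories for the per-instance writable layer.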
+func mountVolumeOverlay(log *Logger, vol VolumeMount, mountPath string) error { + baseName := filepath.Base(vol.Path) + baseMount := fmt.Sprintf("/mnt/vol-base-%s", baseName) + overlayMount := fmt.Sprintf("/mnt/vol-overlay-%s", baseName) + + // Create mount points + if err := os.MkdirAll(baseMount, 0755); err != nil { + return err + } + if err := os.MkdirAll(overlayMount, 0755); err != nil { + return err + } + + // Mount base volume read-only (noload to skip journal recovery) + cmd := exec.Command("/bin/mount", "-t", "ext4", "-o", "ro,noload", vol.Device, baseMount) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("mount base: %s: %s", err, output) + } + + // Mount overlay disk (writable) + cmd = exec.Command("/bin/mount", "-t", "ext4", vol.OverlayDevice, overlayMount) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("mount overlay disk: %s: %s", err, output) + } + + // Create overlay directories + upperDir := filepath.Join(overlayMount, "upper") + workDir := filepath.Join(overlayMount, "work") + os.MkdirAll(upperDir, 0755) + os.MkdirAll(workDir, 0755) + + // Create overlayfs + options := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", baseMount, upperDir, workDir) + cmd = exec.Command("/bin/mount", "-t", "overlay", "-o", options, "overlay", mountPath) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("mount overlay: %s: %s", err, output) + } + + log.Info("volumes", fmt.Sprintf("mounted %s at %s (overlay via %s)", vol.Device, vol.Path, vol.OverlayDevice)) + return nil +} + +// mountVolumeReadOnly mounts a volume in read-only mode. +func mountVolumeReadOnly(log *Logger, vol VolumeMount, mountPath string) error { + // Use noload to skip journal recovery for multi-attach safety + cmd := exec.Command("/bin/mount", "-t", "ext4", "-o", "ro,noload", vol.Device, mountPath) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("%s: %s", err, output) + } + + log.Info("volumes", fmt.Sprintf("mounted %s at %s (ro)", vol.Device, vol.Path)) + return nil +} + +// mountVolumeReadWrite mounts a volume in read-write mode. +func mountVolumeReadWrite(log *Logger, vol VolumeMount, mountPath string) error { + cmd := exec.Command("/bin/mount", "-t", "ext4", vol.Device, mountPath) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("%s: %s", err, output) + } + + log.Info("volumes", fmt.Sprintf("mounted %s at %s (rw)", vol.Device, vol.Path)) + return nil +} + diff --git a/lib/system/init_binary.go b/lib/system/init_binary.go new file mode 100644 index 0000000..ad378a6 --- /dev/null +++ b/lib/system/init_binary.go @@ -0,0 +1,11 @@ +package system + +import _ "embed" + +// InitBinary contains the embedded init binary for guest VMs. +// This is built by the Makefile before the main binary is compiled. +// The init binary is a statically-linked Go program that runs as PID 1 in the guest VM. +// It matches the architecture of the host (VMs run on the same arch as the host). +// +//go:embed init/init +var InitBinary []byte diff --git a/lib/system/init_script.go b/lib/system/init_script.go deleted file mode 100644 index 9e9b397..0000000 --- a/lib/system/init_script.go +++ /dev/null @@ -1,294 +0,0 @@ -package system - -// GenerateInitScript returns the comprehensive init script for initrd -// This consolidates ALL init logic - no modifications to OCI images needed -// -// The script: -// 1. Mounts essential filesystems (proc, sys, dev) -// 2. 
Sets up overlay filesystem (lowerdir=rootfs, upperdir=overlay disk) -// 3. Mounts and sources config disk (/dev/vdc) -// 4. Loads NVIDIA kernel modules (if HAS_GPU=1 in config.sh) -// 5. Configures networking (if enabled) -// 6. Executes container entrypoint -// -// GPU support: When HAS_GPU=1 is set in the instance's config.sh, the init script -// will load NVIDIA kernel modules before launching the container entrypoint. -func GenerateInitScript() string { - return `#!/bin/sh -set -xe - -echo "overlay-init: START" > /dev/kmsg - -# Create mount points -mkdir -p /proc /sys /dev - -# Mount essential filesystems -# devtmpfs handles /dev population (null, zero, vsock, etc.) automatically -mount -t proc none /proc -mount -t sysfs none /sys -mount -t devtmpfs none /dev - -# Setup PTY support (needed for guest-agent and interactive shells) -mkdir -p /dev/pts /dev/shm -mount -t devpts devpts /dev/pts -chmod 1777 /dev/shm - -echo "overlay-init: mounted proc/sys/dev" > /dev/kmsg - -# Redirect all output to serial console -# ttyS0 for x86_64, ttyAMA0 for ARM64 (PL011 UART) -if [ -e /dev/ttyAMA0 ]; then - exec >/dev/ttyAMA0 2>&1 -else - exec >/dev/ttyS0 2>&1 -fi - -echo "overlay-init: redirected to serial console" - -# Wait for block devices to be ready -sleep 0.5 - -# Mount readonly rootfs from /dev/vda (ext4 filesystem) -mkdir -p /lower -mount -t ext4 -o ro /dev/vda /lower -echo "overlay-init: mounted rootfs from /dev/vda" - -# Mount writable overlay disk from /dev/vdb -mkdir -p /overlay -mount -t ext4 /dev/vdb /overlay -mkdir -p /overlay/upper /overlay/work /overlay/newroot -echo "overlay-init: mounted overlay disk from /dev/vdb" - -# Create overlay filesystem -mount -t overlay \ - -o lowerdir=/lower,upperdir=/overlay/upper,workdir=/overlay/work \ - overlay /overlay/newroot -echo "overlay-init: created overlay filesystem" - -# Mount config disk (/dev/vdc) -mkdir -p /mnt/config -mount -o ro /dev/vdc /mnt/config -echo "overlay-init: mounted config disk" - -# Source configuration -if [ -f /mnt/config/config.sh ]; then - . /mnt/config/config.sh - echo "overlay-init: sourced config" -else - echo "overlay-init: ERROR - config.sh not found!" - /bin/sh -i - exit 1 -fi - -# Load NVIDIA kernel modules for GPU passthrough (if HAS_GPU=1) -if [ "${HAS_GPU:-0}" = "1" ]; then - echo "overlay-init: loading NVIDIA kernel modules for GPU passthrough" - if [ -d /lib/modules ]; then - # Find the kernel version directory - KVER=$(ls /lib/modules/ 2>/dev/null | head -1) - if [ -n "$KVER" ] && [ -d "/lib/modules/$KVER/kernel/drivers/gpu" ]; then - # Load modules in order (dependencies first) - insmod /lib/modules/$KVER/kernel/drivers/gpu/nvidia.ko 2>&1 || echo "overlay-init: nvidia.ko load failed" - insmod /lib/modules/$KVER/kernel/drivers/gpu/nvidia-uvm.ko 2>&1 || echo "overlay-init: nvidia-uvm.ko load failed" - insmod /lib/modules/$KVER/kernel/drivers/gpu/nvidia-modeset.ko 2>&1 || echo "overlay-init: nvidia-modeset.ko load failed" - insmod /lib/modules/$KVER/kernel/drivers/gpu/nvidia-drm.ko modeset=1 2>&1 || echo "overlay-init: nvidia-drm.ko load failed" - echo "overlay-init: NVIDIA modules loaded for kernel $KVER" - - # Use nvidia-modprobe to create device nodes with correct major/minor numbers. - # nvidia-modprobe is the official NVIDIA utility that: - # 1. Loads kernel modules if needed (already done above) - # 2. Creates /dev/nvidiactl and /dev/nvidia0 with correct permissions - # 3. 
Creates /dev/nvidia-uvm and /dev/nvidia-uvm-tools - if [ -x /usr/bin/nvidia-modprobe ]; then - echo "overlay-init: running nvidia-modprobe to create device nodes" - /usr/bin/nvidia-modprobe 2>&1 || echo "overlay-init: nvidia-modprobe failed" - /usr/bin/nvidia-modprobe -u -c=0 2>&1 || echo "overlay-init: nvidia-modprobe -u failed" - echo "overlay-init: nvidia-modprobe completed" - ls -la /dev/nvidia* 2>/dev/null || true - else - echo "overlay-init: nvidia-modprobe not found, falling back to manual mknod" - # Fallback: Manual device node creation - NVIDIA_MAJOR=$(awk '/nvidia-frontend|^[0-9]+ nvidia$/ {print $1}' /proc/devices 2>/dev/null | head -1) - NVIDIA_UVM_MAJOR=$(awk '/nvidia-uvm/ {print $1}' /proc/devices 2>/dev/null) - - if [ -n "$NVIDIA_MAJOR" ]; then - mknod -m 666 /dev/nvidiactl c $NVIDIA_MAJOR 255 - mknod -m 666 /dev/nvidia0 c $NVIDIA_MAJOR 0 - echo "overlay-init: created /dev/nvidiactl and /dev/nvidia0 (major $NVIDIA_MAJOR)" - fi - - if [ -n "$NVIDIA_UVM_MAJOR" ]; then - mknod -m 666 /dev/nvidia-uvm c $NVIDIA_UVM_MAJOR 0 - mknod -m 666 /dev/nvidia-uvm-tools c $NVIDIA_UVM_MAJOR 1 - echo "overlay-init: created /dev/nvidia-uvm* (major $NVIDIA_UVM_MAJOR)" - fi - fi - else - echo "overlay-init: NVIDIA modules not found in /lib/modules/$KVER" - fi - else - echo "overlay-init: /lib/modules not found, skipping NVIDIA module loading" - fi - - # Inject NVIDIA userspace driver libraries into container rootfs - # This allows containers to use standard CUDA images without bundled drivers - # See lib/devices/GPU.md for documentation - if [ -d /usr/lib/nvidia ]; then - echo "overlay-init: injecting NVIDIA driver libraries into container" - - DRIVER_VERSION=$(cat /usr/lib/nvidia/version 2>/dev/null || echo "unknown") - # Determine library path based on architecture - if [ "$(uname -m)" = "aarch64" ]; then - LIB_DST="/overlay/newroot/usr/lib/aarch64-linux-gnu" - else - LIB_DST="/overlay/newroot/usr/lib/x86_64-linux-gnu" - fi - BIN_DST="/overlay/newroot/usr/bin" - - mkdir -p "$LIB_DST" "$BIN_DST" - - # Copy all driver libraries and create symlinks - for lib in /usr/lib/nvidia/*.so.*; do - if [ -f "$lib" ]; then - libname=$(basename "$lib") - cp "$lib" "$LIB_DST/" - - # Create standard symlinks: libfoo.so.VERSION -> libfoo.so.1 -> libfoo.so - base=$(echo "$libname" | sed 's/\.so\..*//') - ln -sf "$libname" "$LIB_DST/${base}.so.1" 2>/dev/null || true - ln -sf "${base}.so.1" "$LIB_DST/${base}.so" 2>/dev/null || true - fi - done - - # Copy nvidia-smi and nvidia-modprobe binaries - for bin in nvidia-smi nvidia-modprobe; do - if [ -x /usr/bin/$bin ]; then - cp /usr/bin/$bin "$BIN_DST/" - fi - done - - # Update ldconfig cache so applications can find the libraries - chroot /overlay/newroot ldconfig 2>/dev/null || true - - echo "overlay-init: NVIDIA driver libraries injected (version: $DRIVER_VERSION)" - fi -fi - -# Mount attached volumes (from config: VOLUME_MOUNTS="device:path:mode[:overlay_device] ...") -# Modes: ro (read-only), rw (read-write), overlay (base ro + per-instance overlay) -if [ -n "${VOLUME_MOUNTS:-}" ]; then - echo "overlay-init: mounting volumes" - for vol in $VOLUME_MOUNTS; do - device=$(echo "$vol" | cut -d: -f1) - path=$(echo "$vol" | cut -d: -f2) - mode=$(echo "$vol" | cut -d: -f3) - - # Create mount point in overlay - mkdir -p "/overlay/newroot${path}" - - if [ "$mode" = "overlay" ]; then - # Overlay mode: mount base read-only, create overlayfs with per-instance writable layer - overlay_device=$(echo "$vol" | cut -d: -f4) - - # Create temp mount points for base and overlay disk. 
- # These persist for the lifetime of the VM but are NOT leaked - they exist inside - # the ephemeral guest rootfs (which is itself an overlayfs) and are destroyed - # when the VM terminates along with all guest state. - base_mount="/mnt/vol-base-$(basename "$path")" - overlay_mount="/mnt/vol-overlay-$(basename "$path")" - mkdir -p "$base_mount" "$overlay_mount" - - # Mount base volume read-only (noload to skip journal recovery) - mount -t ext4 -o ro,noload "$device" "$base_mount" - - # Mount overlay disk (writable) - mount -t ext4 "$overlay_device" "$overlay_mount" - mkdir -p "$overlay_mount/upper" "$overlay_mount/work" - - # Create overlayfs combining base (lower) and instance overlay (upper) - mount -t overlay \ - -o "lowerdir=$base_mount,upperdir=$overlay_mount/upper,workdir=$overlay_mount/work" \ - overlay "/overlay/newroot${path}" - - echo "overlay-init: mounted volume $device at $path (overlay via $overlay_device)" - elif [ "$mode" = "ro" ]; then - # Read-only mount (noload to skip journal recovery for multi-attach safety) - mount -t ext4 -o ro,noload "$device" "/overlay/newroot${path}" - echo "overlay-init: mounted volume $device at $path (ro)" - else - # Read-write mount - mount -t ext4 "$device" "/overlay/newroot${path}" - echo "overlay-init: mounted volume $device at $path (rw)" - fi - done -fi - -# Prepare new root mount points -# We use bind mounts instead of move so that the original /dev remains populated -# for processes running in the initrd namespace (like guest-agent). -mkdir -p /overlay/newroot/proc -mkdir -p /overlay/newroot/sys -mkdir -p /overlay/newroot/dev -mkdir -p /overlay/newroot/dev/pts - -mount --bind /proc /overlay/newroot/proc -mount --bind /sys /overlay/newroot/sys -mount --bind /dev /overlay/newroot/dev -mount --bind /dev/pts /overlay/newroot/dev/pts - -echo "overlay-init: bound mounts to new root" - -# Set up /dev symlinks for process substitution inside the container -chroot /overlay/newroot ln -sf /proc/self/fd /dev/fd 2>/dev/null || true -chroot /overlay/newroot ln -sf /proc/self/fd/0 /dev/stdin 2>/dev/null || true -chroot /overlay/newroot ln -sf /proc/self/fd/1 /dev/stdout 2>/dev/null || true -chroot /overlay/newroot ln -sf /proc/self/fd/2 /dev/stderr 2>/dev/null || true - -# Configure network from initrd (using busybox ip, not container's ip) -# Network interfaces are shared, so we can configure them from here -if [ -n "${GUEST_IP:-}" ]; then - echo "overlay-init: configuring network" - ip link set lo up - ip addr add ${GUEST_IP}/${GUEST_CIDR} dev eth0 - ip link set eth0 up - ip route add default via ${GUEST_GW} - echo "nameserver ${GUEST_DNS}" > /overlay/newroot/etc/resolv.conf - echo "overlay-init: network configured - IP: ${GUEST_IP}/${GUEST_CIDR}" -fi - -# Set PATH for initrd tools -export PATH='/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin' -export HOME='/root' - -# Copy guest-agent into container rootfs and start it in container namespace -# This way the PTY and shell run in the same namespace, fixing signal handling -echo "overlay-init: copying guest-agent to container" -mkdir -p /overlay/newroot/usr/local/bin -cp /usr/local/bin/guest-agent /overlay/newroot/usr/local/bin/guest-agent - -# Start vsock guest agent inside the container namespace -echo "overlay-init: starting guest agent in container namespace" -chroot /overlay/newroot /usr/local/bin/guest-agent & - -echo "overlay-init: launching entrypoint" -echo "overlay-init: workdir=${WORKDIR:-/} entrypoint=${ENTRYPOINT} cmd=${CMD}" - -set +e - -# Construct the command string 
carefully -# ENTRYPOINT and CMD are shell-safe quoted strings from config.sh -eval "chroot /overlay/newroot /bin/sh -c \"cd ${WORKDIR:-/} && exec ${ENTRYPOINT} ${CMD}\"" & -APP_PID=$! - -echo "overlay-init: container app started (PID $APP_PID)" - -# Wait for app to exit -wait $APP_PID -APP_EXIT=$? - -echo "overlay-init: app exited with code $APP_EXIT" - -# Wait for all background jobs (guest-agent runs forever, keeping init alive) -# This prevents kernel panic from killing init (PID 1) -wait` -} diff --git a/lib/system/init_wrapper.go b/lib/system/init_wrapper.go new file mode 100644 index 0000000..16056f2 --- /dev/null +++ b/lib/system/init_wrapper.go @@ -0,0 +1,6 @@ +package system + +import _ "embed" + +//go:embed init/init.sh +var InitWrapper []byte diff --git a/lib/system/initrd.go b/lib/system/initrd.go index 3048b75..567247d 100644 --- a/lib/system/initrd.go +++ b/lib/system/initrd.go @@ -20,7 +20,7 @@ import ( const alpineBaseImage = "alpine:3.22" -// buildInitrd builds initrd from Alpine base + embedded guest-agent + generated init script +// buildInitrd builds initrd from Alpine base + embedded guest-agent + embedded init binary func (m *manager) buildInitrd(ctx context.Context, arch string) (string, error) { // Create temp directory for building tempDir, err := os.MkdirTemp("", "hypeman-initrd-*") @@ -67,11 +67,17 @@ func (m *manager) buildInitrd(ctx context.Context, arch string) (string, error) log.InfoContext(ctx, "skipping NVIDIA modules", "error", err) } - // Write generated init script - initScript := GenerateInitScript() - initPath := filepath.Join(rootfsDir, "init") - if err := os.WriteFile(initPath, []byte(initScript), 0755); err != nil { - return "", fmt.Errorf("write init script: %w", err) + // Write shell wrapper as /init (sets up /proc, /sys, /dev before Go runtime) + // The Go runtime needs these filesystems during initialization + initWrapperPath := filepath.Join(rootfsDir, "init") + if err := os.WriteFile(initWrapperPath, InitWrapper, 0755); err != nil { + return "", fmt.Errorf("write init wrapper: %w", err) + } + + // Write Go init binary as /init.bin (called by wrapper after setup) + initBinPath := filepath.Join(rootfsDir, "init.bin") + if err := os.WriteFile(initBinPath, InitBinary, 0755); err != nil { + return "", fmt.Errorf("write init binary: %w", err) } // Generate timestamp for this build @@ -89,7 +95,7 @@ func (m *manager) buildInitrd(ctx context.Context, arch string) (string, error) // Store hash for staleness detection hashPath := filepath.Join(filepath.Dir(outputPath), ".hash") - currentHash := computeInitrdHash() + currentHash := computeInitrdHash(arch) if err := os.WriteFile(hashPath, []byte(currentHash), 0644); err != nil { return "", fmt.Errorf("write hash file: %w", err) } @@ -117,7 +123,7 @@ func (m *manager) ensureInitrd(ctx context.Context) (string, error) { initrdPath := m.paths.SystemInitrdTimestamp(target, arch) if _, err := os.Stat(initrdPath); err == nil { // File exists, check if it's stale by comparing embedded binary hash - if !m.isInitrdStale(initrdPath) { + if !m.isInitrdStale(initrdPath, arch) { return initrdPath, nil } } @@ -133,7 +139,7 @@ func (m *manager) ensureInitrd(ctx context.Context) (string, error) { } // isInitrdStale checks if the initrd needs rebuilding by comparing hashes -func (m *manager) isInitrdStale(initrdPath string) bool { +func (m *manager) isInitrdStale(initrdPath, arch string) bool { // Read stored hash hashPath := filepath.Join(filepath.Dir(initrdPath), ".hash") storedHash, err := os.ReadFile(hashPath) @@ 
-143,22 +149,23 @@ func (m *manager) isInitrdStale(initrdPath string) bool { } // Compare with current hash - currentHash := computeInitrdHash() + currentHash := computeInitrdHash(arch) return string(storedHash) != currentHash } -// computeInitrdHash computes a hash of the embedded binary, init script, and NVIDIA assets -func computeInitrdHash() string { +// computeInitrdHash computes a hash of the embedded binaries and NVIDIA assets for a specific architecture +func computeInitrdHash(arch string) string { h := sha256.New() h.Write(GuestAgentBinary) - h.Write([]byte(GenerateInitScript())) + h.Write(InitBinary) + h.Write(InitWrapper) // Include NVIDIA driver version in hash so initrd is rebuilt when driver changes if ver, ok := NvidiaDriverVersion[DefaultKernelVersion]; ok { h.Write([]byte(ver)) } // Include driver libs URL so initrd is rebuilt when the libs tarball changes if archURLs, ok := NvidiaDriverLibURLs[DefaultKernelVersion]; ok { - if url, ok := archURLs["x86_64"]; ok { + if url, ok := archURLs[arch]; ok { h.Write([]byte(url)) } } diff --git a/lib/system/manager_test.go b/lib/system/manager_test.go index ce32df1..ea91ae6 100644 --- a/lib/system/manager_test.go +++ b/lib/system/manager_test.go @@ -58,16 +58,9 @@ func TestEnsureSystemFiles(t *testing.T) { require.NoError(t, err) } -func TestInitScriptGeneration(t *testing.T) { - script := GenerateInitScript() - - // Verify script contains essential components - assert.Contains(t, script, "#!/bin/sh") - assert.Contains(t, script, "mount -t overlay") - assert.Contains(t, script, "/dev/vda") // rootfs disk - assert.Contains(t, script, "/dev/vdb") // overlay disk - assert.Contains(t, script, "/dev/vdc") // config disk - assert.Contains(t, script, "guest-agent") // vsock guest agent - assert.Contains(t, script, "${ENTRYPOINT}") - assert.Contains(t, script, "wait $APP_PID") // Supervisor pattern +func TestInitBinaryEmbedded(t *testing.T) { + // Verify the init binary is embedded and has reasonable size + // The Go init binary should be at least 1MB when statically linked + assert.NotEmpty(t, InitBinary, "init binary should be embedded") + assert.Greater(t, len(InitBinary), 100000, "init binary should be at least 100KB") } From 1d67894ed1e8e495439fe5bd3c5f88f8f6c9d0b3 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Wed, 24 Dec 2025 12:11:08 +0000 Subject: [PATCH 03/13] tweaks --- .../systemd_vm_with_go_init_c0a9c010.plan.md | 16 ++++- Makefile | 3 +- cmd/api/api/exec.go | 29 +++++---- cmd/api/api/exec_test.go | 34 ++++------- integration/systemd_test.go | 44 ++++++-------- lib/guest/client.go | 60 +++++++++++++------ lib/system/init/mode_exec.go | 1 - lib/system/init/network.go | 1 - 8 files changed, 101 insertions(+), 87 deletions(-) diff --git a/.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md b/.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md index f018b2d..f379daa 100644 --- a/.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md +++ b/.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md @@ -92,6 +92,8 @@ flowchart TB end ``` + + ## Shared vs Mode-Specific Behavior | Step | Exec Mode | Systemd Mode ||------|-----------|--------------|| Mount proc/sys/dev | Shared | Shared || Mount rootfs overlay | Shared | Shared || Read config disk | Shared | Shared || Configure network | Init configures it | Init configures it (before pivot) || Load GPU drivers | Shared | Shared || Mount volumes | Shared | Shared || Copy guest-agent | To `/opt/hypeman/` | To `/opt/hypeman/` || Start guest-agent | Background process | Systemd service 
|| PID 1 | Go init binary | Systemd || App lifecycle | Managed by init | Managed by systemd | @@ -143,6 +145,8 @@ func (l *Logger) Error(phase, msg string, err error) { // 2024-12-23T10:15:32Z [INFO] [systemd] exec /sbin/init ``` + + ## Go-based Init Binary Package structure at `lib/system/init/`: @@ -160,6 +164,8 @@ lib/system/init/ logger.go # Human-readable logging to hypeman operations log ``` + + ### Main Orchestration ```go @@ -216,6 +222,8 @@ func main() { } ``` + + ### Systemd Mode ```go @@ -277,6 +285,8 @@ WantedBy=multi-user.target } ``` + + ## Detection Logic Auto-detect systemd mode by inspecting the image's CMD. No override flag - if CMD is a systemd init, always use systemd mode. @@ -320,6 +330,8 @@ func IsSystemdImage(entrypoint, cmd []string) bool { } ``` + + ## E2E Test Custom Dockerfile in repository at `integration/testdata/systemd/Dockerfile`: @@ -400,7 +412,7 @@ func TestExecModeUnchanged(t *testing.T) { result = execInVM(t, inst, "cat", "/proc/1/comm") assert.Equal(t, "init", strings.TrimSpace(result.Stdout)) } -``` -## Files to Modify/Create \ No newline at end of file + +``` \ No newline at end of file diff --git a/Makefile b/Makefile index 74c53fa..ec121bb 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ SHELL := /bin/bash -.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build test install-tools gen-jwt download-ch-binaries download-ch-spec ensure-ch-binaries build-caddy-binaries build-caddy ensure-caddy-binaries build-preview-cli release-prep clean +.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build test install-tools gen-jwt download-ch-binaries download-ch-spec ensure-ch-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean build-embedded # Directory where local binaries will be installed BIN_DIR ?= $(CURDIR)/bin @@ -176,7 +176,6 @@ lib/system/init/init: lib/system/init/*.go cd lib/system/init && CGO_ENABLED=0 go build -ldflags="-s -w" -o init . 
# Build all embedded binaries -.PHONY: build-embedded build-embedded: lib/system/guest_agent/guest-agent lib/system/init/init # Build the binary diff --git a/cmd/api/api/exec.go b/cmd/api/api/exec.go index c06b5a7..f09b3e7 100644 --- a/cmd/api/api/exec.go +++ b/cmd/api/api/exec.go @@ -29,11 +29,12 @@ var upgrader = websocket.Upgrader{ // ExecRequest represents the JSON body for exec requests type ExecRequest struct { - Command []string `json:"command"` - TTY bool `json:"tty"` - Env map[string]string `json:"env,omitempty"` - Cwd string `json:"cwd,omitempty"` - Timeout int32 `json:"timeout,omitempty"` // seconds + Command []string `json:"command"` + TTY bool `json:"tty"` + Env map[string]string `json:"env,omitempty"` + Cwd string `json:"cwd,omitempty"` + Timeout int32 `json:"timeout,omitempty"` // seconds + WaitForAgent int32 `json:"wait_for_agent,omitempty"` // seconds to wait for guest agent to be ready } // ExecHandler handles exec requests via WebSocket for bidirectional streaming @@ -106,6 +107,7 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) { "tty", execReq.TTY, "cwd", execReq.Cwd, "timeout", execReq.Timeout, + "wait_for_agent", execReq.WaitForAgent, ) // Create WebSocket read/writer wrapper @@ -122,14 +124,15 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) { // Execute via vsock exit, err := guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{ - Command: execReq.Command, - Stdin: wsConn, - Stdout: wsConn, - Stderr: wsConn, - TTY: execReq.TTY, - Env: execReq.Env, - Cwd: execReq.Cwd, - Timeout: execReq.Timeout, + Command: execReq.Command, + Stdin: wsConn, + Stdout: wsConn, + Stderr: wsConn, + TTY: execReq.TTY, + Env: execReq.Env, + Cwd: execReq.Cwd, + Timeout: execReq.Timeout, + WaitForAgent: time.Duration(execReq.WaitForAgent) * time.Second, }) duration := time.Since(startTime) diff --git a/cmd/api/api/exec_test.go b/cmd/api/api/exec_test.go index d4a88a6..baa8423 100644 --- a/cmd/api/api/exec_test.go +++ b/cmd/api/api/exec_test.go @@ -115,38 +115,24 @@ func TestExecInstanceNonTTY(t *testing.T) { t.Logf("vsock socket exists: %s", actualInst.VsockSocket) } - // Wait for exec agent to be ready (retry a few times) - var exit *guest.ExitStatus + // Wait for exec agent to be ready using WaitForAgent var stdout, stderr outputBuffer - var execErr error dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID) require.NoError(t, err) t.Log("Testing exec command: whoami") - maxRetries := 10 - for i := 0; i < maxRetries; i++ { - stdout = outputBuffer{} - stderr = outputBuffer{} - - exit, execErr = guest.ExecIntoInstance(ctx(), dialer, guest.ExecOptions{ - Command: []string{"/bin/sh", "-c", "whoami"}, - Stdin: nil, - Stdout: &stdout, - Stderr: &stderr, - TTY: false, - }) - - if execErr == nil { - break - } - - t.Logf("Exec attempt %d/%d failed, retrying: %v", i+1, maxRetries, execErr) - time.Sleep(1 * time.Second) - } + exit, execErr := guest.ExecIntoInstance(ctx(), dialer, guest.ExecOptions{ + Command: []string{"/bin/sh", "-c", "whoami"}, + Stdin: nil, + Stdout: &stdout, + Stderr: &stderr, + TTY: false, + WaitForAgent: 10 * time.Second, // Wait up to 10s for guest agent to be ready + }) // Assert exec worked - require.NoError(t, execErr, "exec should succeed after retries") + require.NoError(t, execErr, "exec should succeed") require.NotNil(t, exit, "exit status should be returned") require.Equal(t, 0, exit.Code, "whoami should exit with code 0") diff --git 
a/integration/systemd_test.go b/integration/systemd_test.go index 760620c..16ba863 100644 --- a/integration/systemd_test.go +++ b/integration/systemd_test.go @@ -170,35 +170,25 @@ func TestSystemdMode(t *testing.T) { // waitForGuestAgent polls until the guest agent is ready func waitForGuestAgent(ctx context.Context, mgr instances.Manager, instanceID string, timeout time.Duration) error { - deadline := time.Now().Add(timeout) - for time.Now().Before(deadline) { - inst, err := mgr.GetInstance(ctx, instanceID) - if err != nil { - time.Sleep(500 * time.Millisecond) - continue - } - - // Try to connect to the guest agent - dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID) - if err != nil { - time.Sleep(500 * time.Millisecond) - continue - } - - // Try a simple exec to verify agent is responding - var stdout bytes.Buffer - _, err = guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{ - Command: []string{"echo", "ready"}, - Stdout: &stdout, - TTY: false, - }) - if err == nil { - return nil - } + inst, err := mgr.GetInstance(ctx, instanceID) + if err != nil { + return err + } - time.Sleep(500 * time.Millisecond) + dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID) + if err != nil { + return err } - return context.DeadlineExceeded + + // Use WaitForAgent to wait for the agent to be ready + var stdout bytes.Buffer + _, err = guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{ + Command: []string{"echo", "ready"}, + Stdout: &stdout, + TTY: false, + WaitForAgent: timeout, + }) + return err } // execInInstance executes a command in the instance diff --git a/lib/guest/client.go b/lib/guest/client.go index 48361a2..819a659 100644 --- a/lib/guest/client.go +++ b/lib/guest/client.go @@ -30,7 +30,6 @@ const ( // This can happen if: // - The VM is still booting // - The guest agent was stopped or deleted -// - The VM is in systemd mode and the agent service failed to start type AgentConnectionError struct { Err error } @@ -43,14 +42,6 @@ func (e *AgentConnectionError) Unwrap() error { return e.Err } -// IsAgentConnectionError checks if an error is due to the guest agent not responding. -func IsAgentConnectionError(err error) bool { - var agentErr *AgentConnectionError - return err != nil && (strings.Contains(err.Error(), "guest agent not responding") || - strings.Contains(err.Error(), "connection refused") || - errors.As(err, &agentErr)) -} - // connPool manages reusable gRPC connections per vsock dialer key // This avoids the overhead and potential issues of rapidly creating/closing connections var connPool = struct { @@ -122,19 +113,54 @@ type ExitStatus struct { // ExecOptions configures command execution type ExecOptions struct { - Command []string - Stdin io.Reader - Stdout io.Writer - Stderr io.Writer - TTY bool - Env map[string]string // Environment variables - Cwd string // Working directory (optional) - Timeout int32 // Execution timeout in seconds (0 = no timeout) + Command []string + Stdin io.Reader + Stdout io.Writer + Stderr io.Writer + TTY bool + Env map[string]string // Environment variables + Cwd string // Working directory (optional) + Timeout int32 // Execution timeout in seconds (0 = no timeout) + WaitForAgent time.Duration // Max time to wait for agent to be ready (0 = no wait, fail immediately) } // ExecIntoInstance executes command in instance via vsock using gRPC. // The dialer is a hypervisor-specific VsockDialer that knows how to connect to the guest. 
+// If WaitForAgent is set, it will retry on AgentConnectionError until the timeout. func ExecIntoInstance(ctx context.Context, dialer hypervisor.VsockDialer, opts ExecOptions) (*ExitStatus, error) { + // If no wait requested, execute immediately + if opts.WaitForAgent == 0 { + return execIntoInstanceOnce(ctx, dialer, opts) + } + + deadline := time.Now().Add(opts.WaitForAgent) + + for { + exit, err := execIntoInstanceOnce(ctx, dialer, opts) + + // Success or non-connection error - return immediately + var connErr *AgentConnectionError + if err == nil || !errors.As(err, &connErr) { + return exit, err + } + + // Connection error - check if we should retry + if time.Now().After(deadline) { + return nil, err + } + + // Wait before retrying, but respect context cancellation + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(500 * time.Millisecond): + // Continue to retry + } + } +} + +// execIntoInstanceOnce executes command in instance via vsock using gRPC (single attempt). +func execIntoInstanceOnce(ctx context.Context, dialer hypervisor.VsockDialer, opts ExecOptions) (*ExitStatus, error) { start := time.Now() var bytesSent int64 diff --git a/lib/system/init/mode_exec.go b/lib/system/init/mode_exec.go index c7152ca..e00d2f0 100644 --- a/lib/system/init/mode_exec.go +++ b/lib/system/init/mode_exec.go @@ -97,4 +97,3 @@ func buildEnv(env map[string]string) []string { return result } - diff --git a/lib/system/init/network.go b/lib/system/init/network.go index d9d3d00..61c29a6 100644 --- a/lib/system/init/network.go +++ b/lib/system/init/network.go @@ -55,4 +55,3 @@ func runIP(args ...string) error { } return nil } - From f0d8608ec2eef859e9c845cbe5ad20ca8a289903 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Wed, 24 Dec 2025 13:46:23 +0000 Subject: [PATCH 04/13] tweaks --- Makefile | 2 - cmd/api/api/exec_test.go | 1 - lib/system/README.md | 109 +++++++++++++---------------------- lib/system/init/mode_exec.go | 21 +++++-- 4 files changed, 56 insertions(+), 77 deletions(-) diff --git a/Makefile b/Makefile index ec121bb..4436c51 100644 --- a/Makefile +++ b/Makefile @@ -170,12 +170,10 @@ lib/system/guest_agent/guest-agent: lib/system/guest_agent/*.go cd lib/system/guest_agent && CGO_ENABLED=0 go build -ldflags="-s -w" -o guest-agent . # Build init binary (runs as PID 1 in guest VM) for embedding -# Uses static linking for portability across different guest environments lib/system/init/init: lib/system/init/*.go @echo "Building init binary..." cd lib/system/init && CGO_ENABLED=0 go build -ldflags="-s -w" -o init . -# Build all embedded binaries build-embedded: lib/system/guest_agent/guest-agent lib/system/init/init # Build the binary diff --git a/cmd/api/api/exec_test.go b/cmd/api/api/exec_test.go index baa8423..51e97b0 100644 --- a/cmd/api/api/exec_test.go +++ b/cmd/api/api/exec_test.go @@ -115,7 +115,6 @@ func TestExecInstanceNonTTY(t *testing.T) { t.Logf("vsock socket exists: %s", actualInst.VsockSocket) } - // Wait for exec agent to be ready using WaitForAgent var stdout, stderr outputBuffer dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID) diff --git a/lib/system/README.md b/lib/system/README.md index 59e8ce2..29111fe 100644 --- a/lib/system/README.md +++ b/lib/system/README.md @@ -4,9 +4,9 @@ Manages versioned kernel and initrd files for Cloud Hypervisor VMs. 
## Features -- **Automatic Downloads**: Kernel downloaded from Cloud Hypervisor releases on first use +- **Automatic Downloads**: Kernel downloaded from onkernel/linux releases on first use - **Automatic Build**: Initrd built from Alpine base + Go init binary + guest-agent -- **Versioned**: Side-by-side support for multiple kernel/initrd versions +- **Versioned**: Side-by-side support for multiple kernel versions - **Zero Docker**: Uses OCI directly (reuses image manager infrastructure) - **Zero Image Modifications**: All init logic in initrd, OCI images used as-is - **Dual Mode Support**: Exec mode (container-like) and systemd mode (full VM) @@ -18,27 +18,27 @@ Manages versioned kernel and initrd files for Cloud Hypervisor VMs. ``` {dataDir}/system/ ├── kernel/ -│ ├── ch-v6.12.8/ +│ ├── ch-6.12.8-kernel-1-202511182/ │ │ ├── x86_64/vmlinux (~70MB) │ │ └── aarch64/Image (~70MB) -│ └── ch-v6.12.9/ -│ └── ... (future version) +│ └── ch-6.12.8-kernel-1.2-20251213/ +│ └── ... (newer version) ├── initrd/ -│ ├── v1.0.0/ -│ │ ├── x86_64/initrd (~1-2MB) -│ │ └── aarch64/initrd (~1-2MB) -│ └── v1.1.0/ -│ └── ... (when init script changes) -└── oci-cache/ (shared with images manager) - └── blobs/sha256/ (busybox layers cached) +│ ├── 1734567890/ (timestamp-based) +│ │ ├── x86_64/initrd (~5-10MB) +│ │ └── aarch64/initrd +│ ├── x86_64/latest -> 1734567890 (symlink to latest) +│ └── aarch64/latest -> 1734567890 +└── oci-cache/ (shared with images manager) + └── blobs/sha256/ (Alpine layers cached) ``` ### Versioning Rules **Snapshots require exact matches:** ``` -Standby: kernel v6.12.9, initrd v1.0.0, CH v49.0 -Restore: kernel v6.12.9, initrd v1.0.0, CH v49.0 (MUST match) +Standby: kernel ch-6.12.8-kernel-1.2-20251213, CH v49.0 +Restore: kernel ch-6.12.8-kernel-1.2-20251213, CH v49.0 (MUST match) ``` **Maintenance upgrades (shutdown → boot):** @@ -50,8 +50,8 @@ Restore: kernel v6.12.9, initrd v1.0.0, CH v49.0 (MUST match) **Multi-version support:** ``` -Instance A (standby): kernel v6.12.8, initrd v1.0.0 -Instance B (running): kernel v6.12.9, initrd v1.0.0 +Instance A (standby): kernel ch-6.12.8-kernel-1-202511182 +Instance B (running): kernel ch-6.12.8-kernel-1.2-20251213 Both work independently ``` @@ -66,67 +66,36 @@ It replaces the previous shell-based init script with cleaner logic and structur - ✅ Network configuration (if enabled) - ✅ Load GPU drivers (if GPU attached) - ✅ Mount volumes -- ✅ Auto-detect systemd images from CMD - ✅ Execute container entrypoint (exec mode) -- ✅ Hand off to systemd via pivot_root (systemd mode) +- ✅ Hand off to systemd via chroot + exec (systemd mode) **Two boot modes:** -- **Exec mode** (default): Init binary is PID 1, runs entrypoint as child process -- **Systemd mode** (auto-detected): Uses pivot_root to hand off to systemd as PID 1 +- **Exec mode** (default): Init chroots to container rootfs, runs entrypoint as child process, then waits on guest-agent to keep VM alive +- **Systemd mode** (auto-detected on host): Init chroots to container rootfs, then execs /sbin/init so systemd becomes PID 1 -**Systemd detection:** If image CMD is `/sbin/init`, `/lib/systemd/systemd`, or similar, -hypeman automatically uses systemd mode. +**Systemd detection:** Host-side detection in `lib/images/systemd.go` checks if image CMD is +`/sbin/init`, `/lib/systemd/systemd`, or similar. The detected mode is passed to the initrd +via `INIT_MODE` in the config disk. **Result:** OCI images require **zero modifications** - no `/init` script needed! 
-## Usage - -### Application Startup - -```go -// cmd/api/main.go -systemMgr := system.NewManager(dataDir) - -// Ensure files exist (download/build if needed) -err := systemMgr.EnsureSystemFiles(ctx) - -// Files are ready, instances can be created -``` - -### Instance Creation - -```go -// Instances manager uses system manager automatically -inst, err := instanceManager.CreateInstance(ctx, req) -// Uses default kernel/initrd versions -// Versions stored in instance metadata for restore compatibility -``` - -### Get File Paths - -```go -kernelPath, _ := systemMgr.GetKernelPath(system.KernelV6_12_9) -initrdPath, _ := systemMgr.GetInitrdPath(system.InitrdV1_0_0) -``` - ## Kernel Sources -Kernels downloaded from Cloud Hypervisor releases: -- https://github.com/cloud-hypervisor/linux/releases +Kernels downloaded from onkernel/linux releases (Cloud Hypervisor-optimized fork): +- https://github.com/onkernel/linux/releases Example URLs: -- x86_64: `https://github.com/cloud-hypervisor/linux/releases/download/ch-v6.12.9/vmlinux-x86_64` -- aarch64: `https://github.com/cloud-hypervisor/linux/releases/download/ch-v6.12.9/Image-aarch64` +- x86_64: `https://github.com/onkernel/linux/releases/download/ch-6.12.8-kernel-1.2-20251213/vmlinux-x86_64` +- aarch64: `https://github.com/onkernel/linux/releases/download/ch-6.12.8-kernel-1.2-20251213/Image-arm64` ## Initrd Build Process 1. **Pull Alpine base** (using image manager's OCI client) 2. **Add guest-agent binary** (embedded, runs in guest for exec/shell) -3. **Add init binary** (embedded Go binary, runs as PID 1) -4. **Add NVIDIA modules** (optional, for GPU passthrough) -5. **Package as cpio.gz** (initramfs format) - -**Build tools required:** `find`, `cpio`, `gzip` (standard Unix tools) +3. **Add init.sh wrapper** (mounts /proc, /sys, /dev before Go runtime) +4. **Add init binary** (embedded Go binary, runs as PID 1) +5. **Add NVIDIA modules** (optional, for GPU passthrough) +6. **Package as cpio** (initramfs format, pure Go - no shell tools required) ## Adding New Versions @@ -136,19 +105,19 @@ Example URLs: // lib/system/versions.go const ( - KernelV6_12_10 KernelVersion = "ch-v6.12.10" // Add constant + Kernel_20251220 KernelVersion = "ch-6.12.8-kernel-1.3-20251220" // Add constant ) var KernelDownloadURLs = map[KernelVersion]map[string]string{ // ... existing ... 
- KernelV6_12_10: { - "x86_64": "https://github.com/cloud-hypervisor/linux/releases/download/ch-v6.12.10/vmlinux-x86_64", - "aarch64": "https://github.com/cloud-hypervisor/linux/releases/download/ch-v6.12.10/Image-aarch64", + Kernel_20251220: { + "x86_64": "https://github.com/onkernel/linux/releases/download/ch-6.12.8-kernel-1.3-20251220/vmlinux-x86_64", + "aarch64": "https://github.com/onkernel/linux/releases/download/ch-6.12.8-kernel-1.3-20251220/Image-arm64", }, } // Update default if needed -var DefaultKernelVersion = KernelV6_12_10 +var DefaultKernelVersion = Kernel_20251220 ``` ### Updating the Init Binary @@ -188,13 +157,13 @@ Files downloaded/built once per version, reused for all instances using that ver ``` lib/system/init/ main.go # Entry point, orchestrates boot - mount.go # Mount operations (proc, sys, dev, overlay) + init.sh # Shell wrapper (mounts /proc, /sys, /dev before Go runtime) + mount.go # Mount operations (overlay, bind mounts) config.go # Parse config disk network.go # Network configuration drivers.go # GPU driver loading volumes.go # Volume mounting - mode_exec.go # Exec mode: run entrypoint - mode_systemd.go # Systemd mode: pivot_root + exec init + mode_exec.go # Exec mode: chroot, run entrypoint, wait on guest-agent + mode_systemd.go # Systemd mode: chroot + exec /sbin/init logger.go # Human-readable logging to hypeman operations log ``` - diff --git a/lib/system/init/mode_exec.go b/lib/system/init/mode_exec.go index e00d2f0..52ab8d2 100644 --- a/lib/system/init/mode_exec.go +++ b/lib/system/init/mode_exec.go @@ -12,17 +12,30 @@ import ( // - The init binary remains PID 1 // - Guest-agent runs as a background process // - The container entrypoint runs as a child process -// - When the entrypoint exits, the VM exits +// - After entrypoint exits, guest-agent keeps VM alive func runExecMode(log *Logger, cfg *Config) { const newroot = "/overlay/newroot" + // Change root to the new filesystem using chroot (consistent with systemd mode) + log.Info("exec", "executing chroot") + if err := syscall.Chroot(newroot); err != nil { + log.Error("exec", "chroot failed", err) + dropToShell() + } + + // Change to new root directory + if err := os.Chdir("/"); err != nil { + log.Error("exec", "chdir / failed", err) + dropToShell() + } + // Set up environment os.Setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin") os.Setenv("HOME", "/root") - // Start guest-agent in background inside the container namespace + // Start guest-agent in background log.Info("exec", "starting guest-agent in background") - agentCmd := exec.Command("/usr/sbin/chroot", newroot, "/opt/hypeman/guest-agent") + agentCmd := exec.Command("/opt/hypeman/guest-agent") agentCmd.Stdout = os.Stdout agentCmd.Stderr = os.Stderr if err := agentCmd.Start(); err != nil { @@ -47,7 +60,7 @@ func runExecMode(log *Logger, cfg *Config) { log.Info("exec", "launching entrypoint") // Run the entrypoint - appCmd := exec.Command("/usr/sbin/chroot", newroot, "/bin/sh", "-c", shellCmd) + appCmd := exec.Command("/bin/sh", "-c", shellCmd) appCmd.Stdin = os.Stdin appCmd.Stdout = os.Stdout appCmd.Stderr = os.Stderr From c7349e0c75c480e9a56cd4555b5eb6490dd92c63 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Wed, 24 Dec 2025 14:04:55 +0000 Subject: [PATCH 05/13] Fix test --- cmd/api/api/exec_test.go | 2 +- lib/system/init/mode_exec.go | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cmd/api/api/exec_test.go b/cmd/api/api/exec_test.go index 51e97b0..016fbea 100644 --- 
a/cmd/api/api/exec_test.go +++ b/cmd/api/api/exec_test.go @@ -236,7 +236,7 @@ func TestExecWithDebianMinimal(t *testing.T) { // Verify the app exited but VM is still usable (key behavior this test validates) logs = collectTestLogs(t, svc, inst.Id, 200) - assert.Contains(t, logs, "overlay-init: app exited with code", "App should have exited") + assert.Contains(t, logs, "[exec] app exited with code", "App should have exited") // Test exec commands work even though the main app (bash) has exited dialer2, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID) diff --git a/lib/system/init/mode_exec.go b/lib/system/init/mode_exec.go index 52ab8d2..77866c1 100644 --- a/lib/system/init/mode_exec.go +++ b/lib/system/init/mode_exec.go @@ -59,9 +59,11 @@ func runExecMode(log *Logger, cfg *Config) { log.Info("exec", "launching entrypoint") - // Run the entrypoint + // Run the entrypoint without stdin (defaults to /dev/null). + // This matches the old shell script behavior where the app ran in background with & + // and couldn't read from stdin. Interactive shells like bash will see EOF and exit. + // Users interact with the VM via guest-agent exec, not the entrypoint's stdin. appCmd := exec.Command("/bin/sh", "-c", shellCmd) - appCmd.Stdin = os.Stdin appCmd.Stdout = os.Stdout appCmd.Stderr = os.Stderr From a021f6c4c9efe823a0a92b9b78c5a8f91be61751 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Wed, 24 Dec 2025 14:24:05 +0000 Subject: [PATCH 06/13] fix test --- lib/guest/client.go | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/lib/guest/client.go b/lib/guest/client.go index 819a659..f214b42 100644 --- a/lib/guest/client.go +++ b/lib/guest/client.go @@ -18,7 +18,9 @@ import ( securejoin "github.com/cyphar/filepath-securejoin" "github.com/onkernel/hypeman/lib/hypervisor" "google.golang.org/grpc" + "google.golang.org/grpc/codes" "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/status" ) const ( @@ -126,7 +128,7 @@ type ExecOptions struct { // ExecIntoInstance executes command in instance via vsock using gRPC. // The dialer is a hypervisor-specific VsockDialer that knows how to connect to the guest. -// If WaitForAgent is set, it will retry on AgentConnectionError until the timeout. +// If WaitForAgent is set, it will retry on connection errors until the timeout. func ExecIntoInstance(ctx context.Context, dialer hypervisor.VsockDialer, opts ExecOptions) (*ExitStatus, error) { // If no wait requested, execute immediately if opts.WaitForAgent == 0 { @@ -138,9 +140,13 @@ func ExecIntoInstance(ctx context.Context, dialer hypervisor.VsockDialer, opts E for { exit, err := execIntoInstanceOnce(ctx, dialer, opts) - // Success or non-connection error - return immediately - var connErr *AgentConnectionError - if err == nil || !errors.As(err, &connErr) { + // Success - return immediately + if err == nil { + return exit, err + } + + // Check if this is a retryable connection error + if !isRetryableConnectionError(err) { return exit, err } @@ -159,6 +165,25 @@ func ExecIntoInstance(ctx context.Context, dialer hypervisor.VsockDialer, opts E } } +// isRetryableConnectionError returns true if the error indicates the guest agent +// is not yet ready and we should retry connecting. 
+func isRetryableConnectionError(err error) bool { + // Check for our custom AgentConnectionError + var connErr *AgentConnectionError + if errors.As(err, &connErr) { + return true + } + + // Check for gRPC Unavailable errors (agent not yet listening) + if s, ok := status.FromError(err); ok { + if s.Code() == codes.Unavailable { + return true + } + } + + return false +} + // execIntoInstanceOnce executes command in instance via vsock using gRPC (single attempt). func execIntoInstanceOnce(ctx context.Context, dialer hypervisor.VsockDialer, opts ExecOptions) (*ExitStatus, error) { start := time.Now() From d79a403b14f2d6b3c2ccc34b2931b4189a67e79b Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Wed, 24 Dec 2025 14:33:13 +0000 Subject: [PATCH 07/13] rename AgentConnectionError -> AgentVsockDialError --- lib/guest/client.go | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/lib/guest/client.go b/lib/guest/client.go index f214b42..9615128 100644 --- a/lib/guest/client.go +++ b/lib/guest/client.go @@ -28,19 +28,17 @@ const ( vsockGuestPort = 2222 ) -// AgentConnectionError indicates the guest agent is not responding. -// This can happen if: -// - The VM is still booting -// - The guest agent was stopped or deleted -type AgentConnectionError struct { +// AgentVSockDialError indicates the vsock dial to the guest agent failed. +// This typically means the VM is still booting or the agent hasn't started yet. +type AgentVSockDialError struct { Err error } -func (e *AgentConnectionError) Error() string { - return fmt.Sprintf("guest agent not responding (it may have been stopped, deleted, or the VM is still booting): %v", e.Err) +func (e *AgentVSockDialError) Error() string { + return fmt.Sprintf("vsock dial failed (VM may still be booting): %v", e.Err) } -func (e *AgentConnectionError) Unwrap() error { +func (e *AgentVSockDialError) Unwrap() error { return e.Err } @@ -80,7 +78,7 @@ func GetOrCreateConn(ctx context.Context, dialer hypervisor.VsockDialer) (*grpc. grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { netConn, err := dialer.DialVsock(ctx, vsockGuestPort) if err != nil { - return nil, &AgentConnectionError{Err: err} + return nil, &AgentVSockDialError{Err: err} } return netConn, nil }), @@ -168,9 +166,9 @@ func ExecIntoInstance(ctx context.Context, dialer hypervisor.VsockDialer, opts E // isRetryableConnectionError returns true if the error indicates the guest agent // is not yet ready and we should retry connecting. func isRetryableConnectionError(err error) bool { - // Check for our custom AgentConnectionError - var connErr *AgentConnectionError - if errors.As(err, &connErr) { + // Check for vsock dial errors + var dialErr *AgentVSockDialError + if errors.As(err, &dialErr) { return true } From 126c1112bccd2a3b758dcea463e5039c61ea892d Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Fri, 26 Dec 2025 15:22:15 -0500 Subject: [PATCH 08/13] feat: add shared vmconfig package for host-to-guest config schema Defines Config and VolumeMount types in a shared package that both the host (configdisk.go) and guest init binary can import, eliminating duplication. 
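A minimal sketch of the intended round-trip, assuming only the vmconfig.Config
fields introduced in this patch (the host marshals config.json onto the config
disk; the guest init binary unmarshals the same bytes at boot):

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/onkernel/hypeman/lib/vmconfig"
)

func main() {
	// Host side (configdisk.go): build the config and serialize it to config.json.
	hostCfg := vmconfig.Config{
		Entrypoint: []string{"/bin/sh", "-c"},
		Cmd:        []string{"echo hello"},
		Workdir:    "/",
		Env:        map[string]string{"PATH": "/usr/bin:/bin"},
		InitMode:   "exec",
	}
	data, err := json.MarshalIndent(hostCfg, "", "  ")
	if err != nil {
		panic(err)
	}

	// Guest side (lib/system/init): deserialize back into the shared struct.
	// Both sides compile against one definition, so a field added on the host
	// cannot silently go missing in the guest parser.
	var guestCfg vmconfig.Config
	if err := json.Unmarshal(data, &guestCfg); err != nil {
		panic(err)
	}
	fmt.Println(guestCfg.InitMode, guestCfg.Entrypoint)
}
```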
--- lib/vmconfig/README.md | 25 +++++++++++++++++++++++++ lib/vmconfig/config.go | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 lib/vmconfig/README.md create mode 100644 lib/vmconfig/config.go diff --git a/lib/vmconfig/README.md b/lib/vmconfig/README.md new file mode 100644 index 0000000..69da537 --- /dev/null +++ b/lib/vmconfig/README.md @@ -0,0 +1,25 @@ +# VM Config + +Shared configuration schema for host-to-guest communication. + +## Purpose + +This package defines the `Config` struct that is: +- Serialized as JSON by the host when creating a VM's config disk +- Deserialized by the guest init binary during boot + +By defining this in a shared package, the host and guest code stay in sync without duplication. + +## Usage + +The host writes this config to `/config.json` on the config disk (attached as `/dev/vdc`). +The guest init binary mounts this disk and reads the JSON configuration. + +## Fields + +- **Entrypoint/Cmd/Workdir**: Container execution parameters from the OCI image +- **Env**: Environment variables (merged from image + instance overrides) +- **Network**: Guest IP, gateway, DNS configuration +- **GPU**: Whether GPU passthrough is enabled +- **VolumeMounts**: Block devices to mount inside the guest +- **InitMode**: Either "exec" (container-like) or "systemd" (full VM) diff --git a/lib/vmconfig/config.go b/lib/vmconfig/config.go new file mode 100644 index 0000000..db7c554 --- /dev/null +++ b/lib/vmconfig/config.go @@ -0,0 +1,39 @@ +// Package vmconfig defines the configuration schema passed from host to guest VM. +package vmconfig + +// Config is the configuration passed to the guest init binary via config.json. +// This struct is serialized by the host (lib/instances/configdisk.go) and +// deserialized by the guest init binary (lib/system/init). +type Config struct { + // Container execution parameters + Entrypoint []string `json:"entrypoint"` + Cmd []string `json:"cmd"` + Workdir string `json:"workdir"` + + // Environment variables + Env map[string]string `json:"env"` + + // Network configuration + NetworkEnabled bool `json:"network_enabled"` + GuestIP string `json:"guest_ip,omitempty"` + GuestCIDR int `json:"guest_cidr,omitempty"` + GuestGW string `json:"guest_gw,omitempty"` + GuestDNS string `json:"guest_dns,omitempty"` + + // GPU passthrough + HasGPU bool `json:"has_gpu"` + + // Volume mounts + VolumeMounts []VolumeMount `json:"volume_mounts,omitempty"` + + // Init mode: "exec" (default) or "systemd" + InitMode string `json:"init_mode"` +} + +// VolumeMount represents a volume mount configuration. 
+type VolumeMount struct { + Device string `json:"device"` + Path string `json:"path"` + Mode string `json:"mode"` // "ro", "rw", or "overlay" + OverlayDevice string `json:"overlay_device,omitempty"` +} From 5d805cca1673f7f0457a18506a123e2e3d6ff147 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Fri, 26 Dec 2025 15:22:22 -0500 Subject: [PATCH 09/13] refactor: use shared vmconfig package in host and guest code - configdisk.go now uses vmconfig.Config instead of local GuestConfig - init binary now imports vmconfig instead of duplicating types - Also adds logging to dropToShell() for better debugging - mode_systemd.go now runs user's CMD instead of hardcoding /sbin/init --- lib/instances/configdisk.go | 199 ++++++++------------------------ lib/system/init/config.go | 182 +++-------------------------- lib/system/init/main.go | 2 + lib/system/init/mode_exec.go | 33 +++++- lib/system/init/mode_systemd.go | 21 +++- lib/system/init/network.go | 4 +- lib/system/init/volumes.go | 10 +- 7 files changed, 122 insertions(+), 329 deletions(-) diff --git a/lib/instances/configdisk.go b/lib/instances/configdisk.go index ba53ef7..c0e4e5b 100644 --- a/lib/instances/configdisk.go +++ b/lib/instances/configdisk.go @@ -12,12 +12,11 @@ import ( "github.com/onkernel/hypeman/lib/devices" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/network" + "github.com/onkernel/hypeman/lib/vmconfig" ) -// createConfigDisk generates an erofs disk with instance configuration -// The disk contains: -// - /config.sh - Shell script sourced by init -// - /metadata.json - JSON metadata for programmatic access +// createConfigDisk generates an ext4 disk with instance configuration. +// The disk contains /config.json read by the guest init binary. func (m *manager) createConfigDisk(ctx context.Context, inst *Instance, imageInfo *images.Image, netConfig *network.NetworkConfig) error { // Create temporary directory for config files tmpDir, err := os.MkdirTemp("", "hypeman-config-*") @@ -26,37 +25,19 @@ func (m *manager) createConfigDisk(ctx context.Context, inst *Instance, imageInf } defer os.RemoveAll(tmpDir) - // Generate config.sh - configScript := m.generateConfigScript(ctx, inst, imageInfo, netConfig) - configPath := filepath.Join(tmpDir, "config.sh") - if err := os.WriteFile(configPath, []byte(configScript), 0644); err != nil { - return fmt.Errorf("write config.sh: %w", err) - } - - // Generate metadata.json - configMeta := map[string]interface{}{ - "instance_id": inst.Id, - "instance_name": inst.Name, - "image": inst.Image, - "entrypoint": imageInfo.Entrypoint, - "cmd": imageInfo.Cmd, - "workdir": imageInfo.WorkingDir, - "env": mergeEnv(imageInfo.Env, inst.Env), - } - metaData, err := json.MarshalIndent(configMeta, "", " ") + // Generate config.json + cfg := m.buildGuestConfig(ctx, inst, imageInfo, netConfig) + configData, err := json.MarshalIndent(cfg, "", " ") if err != nil { - return fmt.Errorf("marshal metadata: %w", err) + return fmt.Errorf("marshal config: %w", err) } - metaPath := filepath.Join(tmpDir, "metadata.json") - if err := os.WriteFile(metaPath, metaData, 0644); err != nil { - return fmt.Errorf("write metadata.json: %w", err) + configPath := filepath.Join(tmpDir, "config.json") + if err := os.WriteFile(configPath, configData, 0644); err != nil { + return fmt.Errorf("write config.json: %w", err) } // Create ext4 disk with config files - // Use ext4 for now (can switch to erofs when kernel supports it) diskPath := m.paths.InstanceConfigDisk(inst.Id) - - // Calculate size (config files are 
tiny, use 1MB minimum) _, err = images.ExportRootfs(tmpDir, diskPath, images.FormatExt4) if err != nil { return fmt.Errorf("create config disk: %w", err) @@ -65,128 +46,71 @@ func (m *manager) createConfigDisk(ctx context.Context, inst *Instance, imageInf return nil } -// generateConfigScript creates the shell script that will be sourced by init -func (m *manager) generateConfigScript(ctx context.Context, inst *Instance, imageInfo *images.Image, netConfig *network.NetworkConfig) string { - // Prepare entrypoint value - entrypoint := "" - if len(imageInfo.Entrypoint) > 0 { - entrypoint = shellQuoteArray(imageInfo.Entrypoint) +// buildGuestConfig creates the vmconfig.Config struct for the guest init binary. +func (m *manager) buildGuestConfig(ctx context.Context, inst *Instance, imageInfo *images.Image, netConfig *network.NetworkConfig) *vmconfig.Config { + cfg := &vmconfig.Config{ + Entrypoint: imageInfo.Entrypoint, + Cmd: imageInfo.Cmd, + Workdir: imageInfo.WorkingDir, + Env: mergeEnv(imageInfo.Env, inst.Env), + InitMode: "exec", } - // Prepare cmd value - cmd := "" - if len(imageInfo.Cmd) > 0 { - cmd = shellQuoteArray(imageInfo.Cmd) + if cfg.Workdir == "" { + cfg.Workdir = "/" } - // Prepare workdir value - workdir := shellQuote("/") - if imageInfo.WorkingDir != "" { - workdir = shellQuote(imageInfo.WorkingDir) - } - - // Build environment variable exports - var envLines strings.Builder - mergedEnv := mergeEnv(imageInfo.Env, inst.Env) - for key, value := range mergedEnv { - envLines.WriteString(fmt.Sprintf("export %s=%s\n", key, shellQuote(value))) - } - - // Build network configuration section - // Use netConfig directly instead of trying to derive it (VM hasn't started yet) - networkSection := "" + // Network configuration if inst.NetworkEnabled && netConfig != nil { - // Convert netmask to CIDR prefix length for ip command - cidr := netmaskToCIDR(netConfig.Netmask) - networkSection = fmt.Sprintf(` -# Network configuration -GUEST_IP="%s" -GUEST_CIDR="%d" -GUEST_GW="%s" -GUEST_DNS="%s" -`, netConfig.IP, cidr, netConfig.Gateway, netConfig.DNS) + cfg.NetworkEnabled = true + cfg.GuestIP = netConfig.IP + cfg.GuestCIDR = netmaskToCIDR(netConfig.Netmask) + cfg.GuestGW = netConfig.Gateway + cfg.GuestDNS = netConfig.DNS } - // GPU passthrough configuration - // Only set HAS_GPU=1 if at least one attached device is actually a GPU - gpuSection := "" + // GPU passthrough - check if any attached device is a GPU for _, deviceID := range inst.Devices { device, err := m.deviceManager.GetDevice(ctx, deviceID) if err == nil && device.Type == devices.DeviceTypeGPU { - gpuSection = "\n# GPU passthrough\nHAS_GPU=1\n" + cfg.HasGPU = true break } } - // Build volume mounts section + // Volume mounts // Volumes are attached as /dev/vdd, /dev/vde, etc. 
(after vda=rootfs, vdb=overlay, vdc=config) - // For overlay volumes, two devices are used: base + overlay disk - // Format: device:path:mode[:overlay_device] - volumeSection := "" - if len(inst.Volumes) > 0 { - var volumeLines strings.Builder - volumeLines.WriteString("\n# Volume mounts (device:path:mode[:overlay_device])\n") - volumeLines.WriteString("VOLUME_MOUNTS=\"") - deviceIdx := 0 // Track device index (starts at 'd' = vdd) - for i, vol := range inst.Volumes { - device := fmt.Sprintf("/dev/vd%c", 'd'+deviceIdx) - if i > 0 { - volumeLines.WriteString(" ") - } - if vol.Overlay { - // Overlay mode: base device + overlay device - overlayDevice := fmt.Sprintf("/dev/vd%c", 'd'+deviceIdx+1) - volumeLines.WriteString(fmt.Sprintf("%s:%s:overlay:%s", device, vol.MountPath, overlayDevice)) - deviceIdx += 2 // Overlay uses 2 devices + deviceIdx := 0 + for _, vol := range inst.Volumes { + device := fmt.Sprintf("/dev/vd%c", 'd'+deviceIdx) + mount := vmconfig.VolumeMount{ + Device: device, + Path: vol.MountPath, + } + if vol.Overlay { + mount.Mode = "overlay" + mount.OverlayDevice = fmt.Sprintf("/dev/vd%c", 'd'+deviceIdx+1) + deviceIdx += 2 + } else { + if vol.Readonly { + mount.Mode = "ro" } else { - mode := "rw" - if vol.Readonly { - mode = "ro" - } - volumeLines.WriteString(fmt.Sprintf("%s:%s:%s", device, vol.MountPath, mode)) - deviceIdx++ // Regular volume uses 1 device + mount.Mode = "rw" } + deviceIdx++ } - volumeLines.WriteString("\"\n") - volumeSection = volumeLines.String() + cfg.VolumeMounts = append(cfg.VolumeMounts, mount) } // Determine init mode based on image CMD - // If the image's command is /sbin/init or /lib/systemd/systemd, use systemd mode - initModeSection := "" if images.IsSystemdImage(imageInfo.Entrypoint, imageInfo.Cmd) { - initModeSection = "\n# Init mode (auto-detected from image CMD)\nINIT_MODE=\"systemd\"\n" - } else { - initModeSection = "\n# Init mode\nINIT_MODE=\"exec\"\n" + cfg.InitMode = "systemd" } - // Generate script as a readable template block - // ENTRYPOINT and CMD contain shell-quoted arrays that will be eval'd in init - script := fmt.Sprintf(`#!/bin/sh -# Generated config for instance: %s - -# Container execution parameters -ENTRYPOINT="%s" -CMD="%s" -WORKDIR=%s - -# Environment variables -%s%s%s%s%s`, - inst.Id, - entrypoint, - cmd, - workdir, - envLines.String(), - networkSection, - volumeSection, - gpuSection, - initModeSection, - ) - - return script + return cfg } -// mergeEnv merges image environment variables with instance overrides +// mergeEnv merges image environment variables with instance overrides. 
func mergeEnv(imageEnv map[string]string, instEnv map[string]string) map[string]string { result := make(map[string]string) @@ -203,30 +127,7 @@ func mergeEnv(imageEnv map[string]string, instEnv map[string]string) map[string] return result } -// shellQuote quotes a string for safe use in shell scripts -func shellQuote(s string) string { - // Simple quoting: wrap in single quotes and escape single quotes - s = strings.ReplaceAll(s, "'", "'\\''") - return "'" + s + "'" -} - -// shellQuoteArray quotes each element of an array for safe shell evaluation -// Returns a string that when assigned to a variable and later eval'd, will be properly split -func shellQuoteArray(arr []string) string { - if len(arr) == 0 { - return "" - } - - quoted := make([]string, len(arr)) - for i, s := range arr { - quoted[i] = shellQuote(s) - } - - // Join with spaces and return as-is (will be eval'd later in init script) - return strings.Join(quoted, " ") -} - -// netmaskToCIDR converts dotted decimal netmask to CIDR prefix length +// netmaskToCIDR converts dotted decimal netmask to CIDR prefix length. // e.g., "255.255.255.0" -> 24, "255.255.0.0" -> 16 func netmaskToCIDR(netmask string) int { parts := strings.Split(netmask, ".") diff --git a/lib/system/init/config.go b/lib/system/init/config.go index a6ef1e0..9a0bde0 100644 --- a/lib/system/init/config.go +++ b/lib/system/init/config.go @@ -1,52 +1,18 @@ package main import ( - "bufio" + "encoding/json" "fmt" "os" "os/exec" - "strings" -) - -// Config holds the parsed configuration from the config disk. -type Config struct { - // Container execution parameters - Entrypoint string - Cmd string - Workdir string - - // Environment variables - Env map[string]string - - // Network configuration - NetworkEnabled bool - GuestIP string - GuestCIDR string - GuestGW string - GuestDNS string - - // GPU passthrough - HasGPU bool - - // Volume mounts (format: "device:path:mode[:overlay_device] ...") - VolumeMounts []VolumeMount - // Init mode: "exec" (default) or "systemd" - InitMode string -} - -// VolumeMount represents a volume mount configuration. -type VolumeMount struct { - Device string - Path string - Mode string // "ro", "rw", or "overlay" - OverlayDevice string // Only used for overlay mode -} + "github.com/onkernel/hypeman/lib/vmconfig" +) -// readConfig mounts and reads the config disk, parsing the shell configuration. -func readConfig(log *Logger) (*Config, error) { +// readConfig mounts and reads the config disk, parsing the JSON configuration. +func readConfig(log *Logger) (*vmconfig.Config, error) { const configMount = "/mnt/config" - const configFile = "/mnt/config/config.sh" + const configFile = "/mnt/config/config.json" // Create mount point if err := os.MkdirAll(configMount, 0755); err != nil { @@ -60,135 +26,25 @@ func readConfig(log *Logger) (*Config, error) { } log.Info("config", "mounted config disk") - // Read and parse config.sh - cfg, err := parseConfigFile(configFile) + // Read and parse config.json + data, err := os.ReadFile(configFile) if err != nil { - return nil, fmt.Errorf("parse config: %w", err) + return nil, fmt.Errorf("read config file: %w", err) } - log.Info("config", "parsed configuration") - return cfg, nil -} - -// parseConfigFile parses a shell-style configuration file. -// It handles simple KEY=VALUE and KEY="VALUE" assignments. 
-func parseConfigFile(path string) (*Config, error) { - f, err := os.Open(path) - if err != nil { - return nil, err + var cfg vmconfig.Config + if err := json.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("parse config json: %w", err) } - defer f.Close() - cfg := &Config{ - Env: make(map[string]string), - InitMode: "exec", // Default to exec mode + // Set defaults + if cfg.InitMode == "" { + cfg.InitMode = "exec" } - - scanner := bufio.NewScanner(f) - for scanner.Scan() { - line := strings.TrimSpace(scanner.Text()) - - // Skip comments and empty lines - if line == "" || strings.HasPrefix(line, "#") { - continue - } - - // Handle export statements - if strings.HasPrefix(line, "export ") { - line = strings.TrimPrefix(line, "export ") - } - - // Parse KEY=VALUE - parts := strings.SplitN(line, "=", 2) - if len(parts) != 2 { - continue - } - - key := strings.TrimSpace(parts[0]) - value := unquote(strings.TrimSpace(parts[1])) - - switch key { - case "ENTRYPOINT": - cfg.Entrypoint = value - case "CMD": - cfg.Cmd = value - case "WORKDIR": - cfg.Workdir = value - case "GUEST_IP": - cfg.GuestIP = value - cfg.NetworkEnabled = true - case "GUEST_CIDR": - cfg.GuestCIDR = value - case "GUEST_GW": - cfg.GuestGW = value - case "GUEST_DNS": - cfg.GuestDNS = value - case "HAS_GPU": - cfg.HasGPU = value == "1" - case "VOLUME_MOUNTS": - cfg.VolumeMounts = parseVolumeMounts(value) - case "INIT_MODE": - cfg.InitMode = value - default: - // Treat as environment variable - cfg.Env[key] = value - } + if cfg.Env == nil { + cfg.Env = make(map[string]string) } - if err := scanner.Err(); err != nil { - return nil, err - } - - return cfg, nil -} - -// parseVolumeMounts parses the VOLUME_MOUNTS string. -// Format: "device:path:mode[:overlay_device] device:path:mode ..." -func parseVolumeMounts(s string) []VolumeMount { - if s == "" { - return nil - } - - var mounts []VolumeMount - for _, vol := range strings.Fields(s) { - parts := strings.Split(vol, ":") - if len(parts) < 3 { - continue - } - - mount := VolumeMount{ - Device: parts[0], - Path: parts[1], - Mode: parts[2], - } - - if len(parts) >= 4 { - mount.OverlayDevice = parts[3] - } - - mounts = append(mounts, mount) - } - - return mounts -} - -// unquote removes surrounding quotes from a string. -// Handles both single and double quotes. -func unquote(s string) string { - if len(s) < 2 { - return s - } - - // Handle double quotes - if s[0] == '"' && s[len(s)-1] == '"' { - return s[1 : len(s)-1] - } - - // Handle single quotes - if s[0] == '\'' && s[len(s)-1] == '\'' { - return s[1 : len(s)-1] - } - - return s + log.Info("config", "parsed configuration") + return &cfg, nil } - diff --git a/lib/system/init/main.go b/lib/system/init/main.go index ca25e42..a98c47f 100644 --- a/lib/system/init/main.go +++ b/lib/system/init/main.go @@ -10,6 +10,7 @@ package main import ( + "fmt" "os" "os/exec" ) @@ -85,6 +86,7 @@ func main() { // dropToShell drops to an interactive shell for debugging when boot fails func dropToShell() { + fmt.Fprintln(os.Stderr, "FATAL: dropping to shell for debugging") cmd := exec.Command("/bin/sh", "-i") cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout diff --git a/lib/system/init/mode_exec.go b/lib/system/init/mode_exec.go index 77866c1..f0293c9 100644 --- a/lib/system/init/mode_exec.go +++ b/lib/system/init/mode_exec.go @@ -4,7 +4,10 @@ import ( "fmt" "os" "os/exec" + "strings" "syscall" + + "github.com/onkernel/hypeman/lib/vmconfig" ) // runExecMode runs the container in exec mode (default). 
@@ -13,7 +16,7 @@ import ( // - Guest-agent runs as a background process // - The container entrypoint runs as a child process // - After entrypoint exits, guest-agent keeps VM alive -func runExecMode(log *Logger, cfg *Config) { +func runExecMode(log *Logger, cfg *vmconfig.Config) { const newroot = "/overlay/newroot" // Change root to the new filesystem using chroot (consistent with systemd mode) @@ -48,14 +51,14 @@ func runExecMode(log *Logger, cfg *Config) { workdir = "/" } - entrypoint := cfg.Entrypoint - cmd := cfg.Cmd + // Shell-quote the entrypoint and cmd arrays for safe execution + entrypoint := shellQuoteArgs(cfg.Entrypoint) + cmd := shellQuoteArgs(cfg.Cmd) - log.Info("exec", fmt.Sprintf("workdir=%s entrypoint=%s cmd=%s", workdir, entrypoint, cmd)) + log.Info("exec", fmt.Sprintf("workdir=%s entrypoint=%v cmd=%v", workdir, cfg.Entrypoint, cfg.Cmd)) // Construct the shell command to run - // ENTRYPOINT and CMD are shell-safe quoted strings from config.sh - shellCmd := fmt.Sprintf("cd %s && exec %s %s", workdir, entrypoint, cmd) + shellCmd := fmt.Sprintf("cd %s && exec %s %s", shellQuote(workdir), entrypoint, cmd) log.Info("exec", "launching entrypoint") @@ -112,3 +115,21 @@ func buildEnv(env map[string]string) []string { return result } + +// shellQuote quotes a string for safe use in shell commands. +func shellQuote(s string) string { + // Use single quotes and escape embedded single quotes + return "'" + strings.ReplaceAll(s, "'", "'\\''") + "'" +} + +// shellQuoteArgs quotes each argument and joins them with spaces. +func shellQuoteArgs(args []string) string { + if len(args) == 0 { + return "" + } + quoted := make([]string, len(args)) + for i, arg := range args { + quoted[i] = shellQuote(arg) + } + return strings.Join(quoted, " ") +} diff --git a/lib/system/init/mode_systemd.go b/lib/system/init/mode_systemd.go index 6f042a1..cd1227f 100644 --- a/lib/system/init/mode_systemd.go +++ b/lib/system/init/mode_systemd.go @@ -1,8 +1,11 @@ package main import ( + "fmt" "os" "syscall" + + "github.com/onkernel/hypeman/lib/vmconfig" ) // runSystemdMode hands off control to systemd. @@ -10,8 +13,8 @@ import ( // The init binary: // 1. Injects the hypeman-agent.service unit // 2. Uses chroot to switch to the container rootfs -// 3. Execs /sbin/init (systemd) which becomes the new PID 1 -func runSystemdMode(log *Logger, cfg *Config) { +// 3. Execs the image's entrypoint/cmd (systemd) which becomes the new PID 1 +func runSystemdMode(log *Logger, cfg *vmconfig.Config) { const newroot = "/overlay/newroot" // Inject hypeman-agent.service @@ -34,14 +37,20 @@ func runSystemdMode(log *Logger, cfg *Config) { dropToShell() } + // Build effective command from entrypoint + cmd + argv := append(cfg.Entrypoint, cfg.Cmd...) 
+ if len(argv) == 0 { + // Fallback to /sbin/init if no command specified + argv = []string{"/sbin/init"} + } + // Exec systemd - this replaces the current process - log.Info("systemd", "exec /sbin/init") + log.Info("systemd", fmt.Sprintf("exec %v", argv)) // syscall.Exec replaces the current process with the new one - // /sbin/init is typically a symlink to /lib/systemd/systemd - err := syscall.Exec("/sbin/init", []string{"/sbin/init"}, os.Environ()) + err := syscall.Exec(argv[0], argv, os.Environ()) if err != nil { - log.Error("systemd", "exec /sbin/init failed", err) + log.Error("systemd", fmt.Sprintf("exec %s failed", argv[0]), err) dropToShell() } } diff --git a/lib/system/init/network.go b/lib/system/init/network.go index 61c29a6..37640ab 100644 --- a/lib/system/init/network.go +++ b/lib/system/init/network.go @@ -4,11 +4,13 @@ import ( "fmt" "os" "os/exec" + + "github.com/onkernel/hypeman/lib/vmconfig" ) // configureNetwork sets up networking in the guest VM. // This is done from the initrd before pivot_root so it works for both exec and systemd modes. -func configureNetwork(log *Logger, cfg *Config) error { +func configureNetwork(log *Logger, cfg *vmconfig.Config) error { // Bring up loopback interface if err := runIP("link", "set", "lo", "up"); err != nil { return fmt.Errorf("bring up lo: %w", err) diff --git a/lib/system/init/volumes.go b/lib/system/init/volumes.go index 2a52393..f485b89 100644 --- a/lib/system/init/volumes.go +++ b/lib/system/init/volumes.go @@ -5,11 +5,13 @@ import ( "os" "os/exec" "path/filepath" + + "github.com/onkernel/hypeman/lib/vmconfig" ) // mountVolumes mounts attached volumes according to the configuration. // Supports three modes: ro (read-only), rw (read-write), and overlay. -func mountVolumes(log *Logger, cfg *Config) error { +func mountVolumes(log *Logger, cfg *vmconfig.Config) error { log.Info("volumes", "mounting volumes") for _, vol := range cfg.VolumeMounts { @@ -42,7 +44,7 @@ func mountVolumes(log *Logger, cfg *Config) error { // mountVolumeOverlay mounts a volume in overlay mode. // Uses the base device as read-only lower layer and overlay device for writable upper layer. -func mountVolumeOverlay(log *Logger, vol VolumeMount, mountPath string) error { +func mountVolumeOverlay(log *Logger, vol vmconfig.VolumeMount, mountPath string) error { baseName := filepath.Base(vol.Path) baseMount := fmt.Sprintf("/mnt/vol-base-%s", baseName) overlayMount := fmt.Sprintf("/mnt/vol-overlay-%s", baseName) @@ -85,7 +87,7 @@ func mountVolumeOverlay(log *Logger, vol VolumeMount, mountPath string) error { } // mountVolumeReadOnly mounts a volume in read-only mode. -func mountVolumeReadOnly(log *Logger, vol VolumeMount, mountPath string) error { +func mountVolumeReadOnly(log *Logger, vol vmconfig.VolumeMount, mountPath string) error { // Use noload to skip journal recovery for multi-attach safety cmd := exec.Command("/bin/mount", "-t", "ext4", "-o", "ro,noload", vol.Device, mountPath) if output, err := cmd.CombinedOutput(); err != nil { @@ -97,7 +99,7 @@ func mountVolumeReadOnly(log *Logger, vol VolumeMount, mountPath string) error { } // mountVolumeReadWrite mounts a volume in read-write mode. 
-func mountVolumeReadWrite(log *Logger, vol VolumeMount, mountPath string) error { +func mountVolumeReadWrite(log *Logger, vol vmconfig.VolumeMount, mountPath string) error { cmd := exec.Command("/bin/mount", "-t", "ext4", vol.Device, mountPath) if output, err := cmd.CombinedOutput(); err != nil { return fmt.Errorf("%s: %s", err, output) From 824d6e0fbc3d4ab9a25e4ac38045c2cf59fd7012 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Fri, 26 Dec 2025 15:22:28 -0500 Subject: [PATCH 10/13] fix: remove overly-broad /init suffix detection in IsSystemdImage Per review feedback, matching any path ending in /init is too aggressive since many entrypoint scripts are named 'init'. Now only matches explicit systemd paths: /sbin/init, /lib/systemd/systemd, /usr/lib/systemd/systemd --- lib/images/systemd.go | 20 ++++++++------------ lib/images/systemd_test.go | 4 ++-- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/lib/images/systemd.go b/lib/images/systemd.go index 661477f..b03c619 100644 --- a/lib/images/systemd.go +++ b/lib/images/systemd.go @@ -1,7 +1,5 @@ package images -import "strings" - // IsSystemdImage checks if the image's CMD indicates it wants systemd as init. // Detection is based on the effective command (entrypoint + cmd), not whether // systemd is installed in the image. @@ -10,17 +8,21 @@ import "strings" // - /sbin/init // - /lib/systemd/systemd // - /usr/lib/systemd/systemd -// - Any path ending in /init func IsSystemdImage(entrypoint, cmd []string) bool { - // Combine to get the actual command that will run - effective := append(entrypoint, cmd...) + // Combine to get the actual command that will run. + // Create a new slice to avoid corrupting caller's backing array. + effective := make([]string, 0, len(entrypoint)+len(cmd)) + effective = append(effective, entrypoint...) + effective = append(effective, cmd...) if len(effective) == 0 { return false } first := effective[0] - // Match specific systemd/init paths + // Match specific systemd/init paths only. + // We intentionally don't match generic */init paths since many entrypoint + // scripts are named "init" and would be false positives. 
systemdPaths := []string{ "/sbin/init", "/lib/systemd/systemd", @@ -32,12 +34,6 @@ func IsSystemdImage(entrypoint, cmd []string) bool { } } - // Match any absolute path ending in /init (e.g., /usr/sbin/init) - // Only match absolute paths to avoid false positives like "./init" - if strings.HasPrefix(first, "/") && strings.HasSuffix(first, "/init") { - return true - } - return false } diff --git a/lib/images/systemd_test.go b/lib/images/systemd_test.go index 8b55587..3428d63 100644 --- a/lib/images/systemd_test.go +++ b/lib/images/systemd_test.go @@ -38,10 +38,10 @@ func TestIsSystemdImage(t *testing.T) { expected: true, }, { - name: "path ending in /init", + name: "path ending in /init should not match (too broad)", entrypoint: nil, cmd: []string{"/usr/sbin/init"}, - expected: true, + expected: false, }, { name: "regular command (nginx)", From 4ee56783878b69afba325c9b1b71c66815d26e60 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Fri, 26 Dec 2025 15:22:33 -0500 Subject: [PATCH 11/13] chore: remove plan file before merge --- .../systemd_vm_with_go_init_c0a9c010.plan.md | 418 ------------------ 1 file changed, 418 deletions(-) delete mode 100644 .cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md diff --git a/.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md b/.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md deleted file mode 100644 index f379daa..0000000 --- a/.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md +++ /dev/null @@ -1,418 +0,0 @@ ---- -name: Systemd VM with Go Init -overview: Support systemd-based OCI images via auto-detection from image CMD, using a Go-based init binary with human-readable logging, performing chroot for systemd handoff. -todos: - - id: go-init-scaffold - content: Create lib/system/init/ package with main.go entry point - status: completed - - id: go-init-mount - content: Implement mount.go for proc/sys/dev/overlay mounting - status: completed - - id: go-init-config - content: Implement config.go to parse config disk - status: completed - - id: go-init-network - content: Implement network.go for network configuration - status: completed - - id: go-init-drivers - content: Implement drivers.go for GPU driver loading - status: completed - - id: go-init-volumes - content: Implement volumes.go for volume mounting - status: completed - - id: go-init-exec - content: Implement mode_exec.go with current behavior - status: completed - - id: go-init-systemd - content: Implement mode_systemd.go with chroot and service injection - status: completed - - id: go-init-logger - content: Implement logger.go with human-readable format - status: completed - - id: initrd-build - content: Modify initrd.go to compile and include init binary - status: completed - - id: systemd-detect - content: Create lib/images/systemd.go with IsSystemdImage() based on CMD - status: completed - - id: config-disk-mode - content: Add INIT_MODE to configdisk.go based on CMD detection - status: completed - - id: agent-location - content: Change guest-agent copy location to /opt/hypeman/ - status: completed - - id: dialer-resilience - content: Add error handling for missing agent in client.go - status: completed - - id: test-dockerfile - content: Create integration/testdata/systemd/Dockerfile - status: completed - - id: e2e-tests - content: Create integration/systemd_test.go with build/push/test flow - status: completed ---- - -# Systemd VM Support with Go-based Init - -## Architecture - -```mermaid -flowchart TB - subgraph initrd [Initrd Contents] - GoInit[init Go binary] - 
BusyBox[busybox utilities] - GuestAgent[guest-agent binary] - end - - subgraph boot [Boot Flow] - Kernel[Kernel Boot] --> GoInit - GoInit --> MountFS[Mount proc/sys/dev] - MountFS --> Overlay[Setup overlay rootfs] - Overlay --> Config[Read config disk] - Config --> Network[Configure network] - Network --> Drivers[Load GPU drivers if needed] - Drivers --> Volumes[Mount volumes] - Volumes --> CopyAgent[Copy agent to /opt/hypeman/] - CopyAgent --> ModeCheck{init_mode from CMD?} - ModeCheck -->|exec| ExecMode[Exec Mode] - ModeCheck -->|systemd| SystemdMode[Systemd Mode] - end - - subgraph execpath [Exec Mode] - ExecMode --> StartAgentBG[Start guest-agent in background] - StartAgentBG --> RunEntrypoint[Run entrypoint as child] - RunEntrypoint --> WaitPID[Wait for entrypoint exit] - end - - subgraph systemdpath [Systemd Mode] - SystemdMode --> InjectService[Inject guest-agent.service] - InjectService --> Chroot[chroot to overlay/newroot] - Chroot --> ExecInit["exec /sbin/init (systemd)"] - ExecInit --> SystemdPID1[Systemd manages everything] - end -``` - - - -## Shared vs Mode-Specific Behavior - -| Step | Exec Mode | Systemd Mode ||------|-----------|--------------|| Mount proc/sys/dev | Shared | Shared || Mount rootfs overlay | Shared | Shared || Read config disk | Shared | Shared || Configure network | Init configures it | Init configures it (before pivot) || Load GPU drivers | Shared | Shared || Mount volumes | Shared | Shared || Copy guest-agent | To `/opt/hypeman/` | To `/opt/hypeman/` || Start guest-agent | Background process | Systemd service || PID 1 | Go init binary | Systemd || App lifecycle | Managed by init | Managed by systemd | - -## Logging Behavior - -### `hypeman logs` Output by Mode - -| Log Source | Exec Mode | Systemd Mode ||------------|-----------|--------------|| `--source app` (default) | Entrypoint stdout/stderr | Systemd boot messages + console output || `--source hypeman` | Init phases + operations | Init phases + operations (until pivot_root) || `--source vmm` | Cloud Hypervisor logs | Cloud Hypervisor logs |In systemd mode, after pivot_root: - -- Serial console (app.log) shows systemd boot progress and any services writing to console -- To view individual service logs, use: `hypeman exec journalctl -u ` -- To view guest-agent logs: `hypeman exec journalctl -u hypeman-agent` - -### Init Logger Format - -Human-readable format for `hypeman logs --source hypeman`: - -```go -// lib/system/init/logger.go -package main - -type Logger struct { - file *os.File -} - -func (l *Logger) Info(phase, msg string) { - // Format: 2024-12-23T10:15:30Z [INFO] [overlay] mounted rootfs from /dev/vda - fmt.Fprintf(l.file, "%s [INFO] [%s] %s\n", - time.Now().UTC().Format(time.RFC3339), phase, msg) -} - -func (l *Logger) Error(phase, msg string, err error) { - if err != nil { - fmt.Fprintf(l.file, "%s [ERROR] [%s] %s: %v\n", - time.Now().UTC().Format(time.RFC3339), phase, msg, err) - } else { - fmt.Fprintf(l.file, "%s [ERROR] [%s] %s\n", - time.Now().UTC().Format(time.RFC3339), phase, msg) - } -} - -// Example output: -// 2024-12-23T10:15:30Z [INFO] [boot] init starting -// 2024-12-23T10:15:30Z [INFO] [mount] mounted proc/sys/dev -// 2024-12-23T10:15:31Z [INFO] [overlay] mounted rootfs from /dev/vda -// 2024-12-23T10:15:31Z [INFO] [network] configured eth0 with 10.0.0.2/24 -// 2024-12-23T10:15:32Z [INFO] [systemd] performing pivot_root -// 2024-12-23T10:15:32Z [INFO] [systemd] exec /sbin/init -``` - - - -## Go-based Init Binary - -Package structure at `lib/system/init/`: - -```javascript 
-lib/system/init/ - main.go # Entry point, orchestrates boot - mount.go # Mount operations (proc, sys, dev, overlay) - config.go # Parse config disk - network.go # Network configuration - drivers.go # GPU driver loading - volumes.go # Volume mounting - mode_exec.go # Exec mode: run entrypoint - mode_systemd.go # Systemd mode: pivot_root + exec init - logger.go # Human-readable logging to hypeman operations log -``` - - - -### Main Orchestration - -```go -// lib/system/init/main.go -package main - -func main() { - log := NewLogger() - log.Info("boot", "init starting") - - if err := mountEssentials(log); err != nil { - log.Error("mount", "failed", err) - dropToShell() - } - - if err := setupOverlay(log); err != nil { - log.Error("overlay", "failed", err) - dropToShell() - } - - cfg, err := readConfig(log) - if err != nil { - log.Error("config", "failed", err) - dropToShell() - } - - if cfg.NetworkEnabled { - if err := configureNetwork(log, cfg); err != nil { - log.Error("network", "failed", err) - } - } - - if cfg.HasGPU { - if err := loadGPUDrivers(log); err != nil { - log.Error("gpu", "failed", err) - } - } - - if err := mountVolumes(log, cfg); err != nil { - log.Error("volumes", "failed", err) - } - - if err := copyGuestAgent(log); err != nil { - log.Error("agent", "failed to copy", err) - } - - if cfg.InitMode == "systemd" { - log.Info("mode", "entering systemd mode") - runSystemdMode(log, cfg) - } else { - log.Info("mode", "entering exec mode") - runExecMode(log, cfg) - } -} -``` - - - -### Systemd Mode - -```go -// lib/system/init/mode_systemd.go -package main - -import ( - "os" - "syscall" -) - -func runSystemdMode(log *Logger, cfg *Config) { - const newroot = "/overlay/newroot" - - log.Info("systemd", "injecting hypeman-agent.service") - if err := injectAgentService(newroot); err != nil { - log.Error("systemd", "failed to inject service", err) - } - - // Use chroot instead of pivot_root (more reliable in VM environment) - log.Info("systemd", "executing chroot") - if err := syscall.Chroot(newroot); err != nil { - log.Error("systemd", "chroot failed", err) - dropToShell() - } - - os.Chdir("/") - - log.Info("systemd", "exec /sbin/init") - syscall.Exec("/sbin/init", []string{"/sbin/init"}, os.Environ()) - - log.Error("systemd", "exec failed", nil) - dropToShell() -} - -func injectAgentService(newroot string) error { - serviceContent := `[Unit] -Description=Hypeman Guest Agent -After=network.target - -[Service] -Type=simple -ExecStart=/opt/hypeman/guest-agent -Restart=always -RestartSec=3 - -[Install] -WantedBy=multi-user.target -` - serviceDir := newroot + "/etc/systemd/system" - wantsDir := serviceDir + "/multi-user.target.wants" - os.MkdirAll(wantsDir, 0755) - - servicePath := serviceDir + "/hypeman-agent.service" - if err := os.WriteFile(servicePath, []byte(serviceContent), 0644); err != nil { - return err - } - return os.Symlink(servicePath, wantsDir+"/hypeman-agent.service") -} -``` - - - -## Detection Logic - -Auto-detect systemd mode by inspecting the image's CMD. No override flag - if CMD is a systemd init, always use systemd mode. - -```go -// lib/images/systemd.go -package images - -import "strings" - -// IsSystemdImage checks if the image's CMD indicates it wants systemd as init. -// Detection is based on the effective command (entrypoint + cmd), not whether -// systemd is installed in the image. -func IsSystemdImage(entrypoint, cmd []string) bool { - // Combine to get the actual command that will run - effective := append(entrypoint, cmd...) 
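	// Note: append may reuse entrypoint's backing array when it has spare
	// capacity, letting cmd's elements overwrite a slice the caller still
	// references. A defensive variant (not in the plan above) would copy first:
	//   effective := append(append([]string(nil), entrypoint...), cmd...)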
- if len(effective) == 0 { - return false - } - - first := effective[0] - - // Match specific systemd/init paths - systemdPaths := []string{ - "/sbin/init", - "/lib/systemd/systemd", - "/usr/lib/systemd/systemd", - } - for _, p := range systemdPaths { - if first == p { - return true - } - } - - // Match any path ending in /init (e.g., /usr/sbin/init) - if strings.HasSuffix(first, "/init") { - return true - } - - return false -} -``` - - - -## E2E Test - -Custom Dockerfile in repository at `integration/testdata/systemd/Dockerfile`: - -```dockerfile -FROM ubuntu:22.04 - -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update && apt-get install -y \ - systemd \ - systemd-sysv \ - dbus \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -# Remove unnecessary systemd units -RUN rm -f /lib/systemd/system/multi-user.target.wants/* \ - /etc/systemd/system/*.wants/* \ - /lib/systemd/system/local-fs.target.wants/* \ - /lib/systemd/system/sockets.target.wants/*udev* \ - /lib/systemd/system/sockets.target.wants/*initctl* - -VOLUME ["/sys/fs/cgroup"] -CMD ["/lib/systemd/systemd"] -``` - -Test flow: - -1. Build image with `docker build` -2. Push to hypeman via OCI import -3. Run instance (auto-detects systemd mode from CMD) -4. Verify systemd is PID 1 -5. Verify guest-agent.service is active -6. Verify `hypeman logs` shows systemd boot messages -```go -// integration/systemd_test.go - -func TestSystemdMode(t *testing.T) { - // Build and push test image - buildAndPushTestImage(t, "integration/testdata/systemd", "test-systemd:latest") - - // Create instance (auto-detects systemd mode from CMD) - inst := createInstance(t, "test-systemd:latest") - defer deleteInstance(t, inst.Id) - - time.Sleep(10 * time.Second) - - // Verify systemd is PID 1 - result := execInVM(t, inst, "cat", "/proc/1/comm") - assert.Equal(t, "systemd", strings.TrimSpace(result.Stdout)) - - // Verify agent service is running - result = execInVM(t, inst, "systemctl", "is-active", "hypeman-agent") - assert.Equal(t, "active", strings.TrimSpace(result.Stdout)) - - // Verify agent location - result = execInVM(t, inst, "test", "-x", "/opt/hypeman/guest-agent") - assert.Equal(t, 0, result.ExitCode) - - // Verify can view agent logs via journalctl - result = execInVM(t, inst, "journalctl", "-u", "hypeman-agent", "--no-pager") - assert.Equal(t, 0, result.ExitCode) -} - -func TestExecModeUnchanged(t *testing.T) { - // Regular container image should still work as before - inst := createInstance(t, "nginx:alpine") - defer deleteInstance(t, inst.Id) - - time.Sleep(3 * time.Second) - - // Nginx should be running - result := execInVM(t, inst, "pgrep", "nginx") - assert.Equal(t, 0, result.ExitCode) - - // PID 1 is init binary (not systemd) - result = execInVM(t, inst, "cat", "/proc/1/comm") - assert.Equal(t, "init", strings.TrimSpace(result.Stdout)) -} - - - -``` \ No newline at end of file From 8b82dfd636511ff366b3754fba250d4f2f0bf023 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Fri, 26 Dec 2025 15:32:15 -0500 Subject: [PATCH 12/13] fix: use %d format for GuestCIDR int field --- lib/system/init/network.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/system/init/network.go b/lib/system/init/network.go index 37640ab..76df825 100644 --- a/lib/system/init/network.go +++ b/lib/system/init/network.go @@ -17,7 +17,7 @@ func configureNetwork(log *Logger, cfg *vmconfig.Config) error { } // Add IP address to eth0 - addr := fmt.Sprintf("%s/%s", cfg.GuestIP, cfg.GuestCIDR) + addr := fmt.Sprintf("%s/%d", cfg.GuestIP, 
cfg.GuestCIDR) if err := runIP("addr", "add", addr, "dev", "eth0"); err != nil { return fmt.Errorf("add IP address: %w", err) } From 0dd309854ce601f64537779d3a7cd1b60b445864 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Fri, 26 Dec 2025 15:45:00 -0500 Subject: [PATCH 13/13] fix: address Bugbot feedback on env vars and volume mount collisions - buildEnv: user's env vars now take precedence over defaults (PATH, HOME) - systemd mode: pass user's env vars via buildEnv instead of os.Environ() - volumes: use device name for overlay mount points to avoid basename collisions --- lib/system/init/mode_exec.go | 16 +++++++++++----- lib/system/init/mode_systemd.go | 3 ++- lib/system/init/volumes.go | 8 +++++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/lib/system/init/mode_exec.go b/lib/system/init/mode_exec.go index f0293c9..8fd1151 100644 --- a/lib/system/init/mode_exec.go +++ b/lib/system/init/mode_exec.go @@ -103,16 +103,22 @@ func runExecMode(log *Logger, cfg *vmconfig.Config) { } // buildEnv constructs environment variables from the config. +// User-provided env vars take precedence over defaults. func buildEnv(env map[string]string) []string { - result := []string{ - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "HOME=/root", - } - + // Start with user's environment variables + result := make([]string, 0, len(env)+2) for k, v := range env { result = append(result, fmt.Sprintf("%s=%s", k, v)) } + // Add defaults only if not already set by user + if _, ok := env["PATH"]; !ok { + result = append(result, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin") + } + if _, ok := env["HOME"]; !ok { + result = append(result, "HOME=/root") + } + return result } diff --git a/lib/system/init/mode_systemd.go b/lib/system/init/mode_systemd.go index cd1227f..5e7188a 100644 --- a/lib/system/init/mode_systemd.go +++ b/lib/system/init/mode_systemd.go @@ -48,7 +48,8 @@ func runSystemdMode(log *Logger, cfg *vmconfig.Config) { log.Info("systemd", fmt.Sprintf("exec %v", argv)) // syscall.Exec replaces the current process with the new one - err := syscall.Exec(argv[0], argv, os.Environ()) + // Use buildEnv to include user's environment variables from the image/instance config + err := syscall.Exec(argv[0], argv, buildEnv(cfg.Env)) if err != nil { log.Error("systemd", fmt.Sprintf("exec %s failed", argv[0]), err) dropToShell() diff --git a/lib/system/init/volumes.go b/lib/system/init/volumes.go index f485b89..278c1db 100644 --- a/lib/system/init/volumes.go +++ b/lib/system/init/volumes.go @@ -45,9 +45,11 @@ func mountVolumes(log *Logger, cfg *vmconfig.Config) error { // mountVolumeOverlay mounts a volume in overlay mode. // Uses the base device as read-only lower layer and overlay device for writable upper layer. func mountVolumeOverlay(log *Logger, vol vmconfig.VolumeMount, mountPath string) error { - baseName := filepath.Base(vol.Path) - baseMount := fmt.Sprintf("/mnt/vol-base-%s", baseName) - overlayMount := fmt.Sprintf("/mnt/vol-overlay-%s", baseName) + // Use device name for unique mount points (e.g., "vdd" from "/dev/vdd") + // This avoids collisions when multiple volumes have the same basename + deviceName := filepath.Base(vol.Device) + baseMount := fmt.Sprintf("/mnt/vol-base-%s", deviceName) + overlayMount := fmt.Sprintf("/mnt/vol-overlay-%s", deviceName) // Create mount points if err := os.MkdirAll(baseMount, 0755); err != nil {