diff --git a/.gitignore b/.gitignore index e76a3b0a..b2b815d4 100644 --- a/.gitignore +++ b/.gitignore @@ -21,7 +21,7 @@ cloud-hypervisor cloud-hypervisor/** lib/system/exec_agent/exec-agent lib/system/guest_agent/guest-agent -lib/system/guest_agent/guest_agent +lib/system/init/init # Envoy binaries lib/ingress/binaries/** diff --git a/Makefile b/Makefile index 4925096f..4436c515 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ SHELL := /bin/bash -.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build test install-tools gen-jwt download-ch-binaries download-ch-spec ensure-ch-binaries build-caddy-binaries build-caddy ensure-caddy-binaries build-preview-cli release-prep clean +.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build test install-tools gen-jwt download-ch-binaries download-ch-spec ensure-ch-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean build-embedded # Directory where local binaries will be installed BIN_DIR ?= $(CURDIR)/bin @@ -165,26 +165,33 @@ ensure-caddy-binaries: fi # Build guest-agent (guest binary) into its own directory for embedding -lib/system/guest_agent/guest-agent: lib/system/guest_agent/main.go +lib/system/guest_agent/guest-agent: lib/system/guest_agent/*.go @echo "Building guest-agent..." cd lib/system/guest_agent && CGO_ENABLED=0 go build -ldflags="-s -w" -o guest-agent . +# Build init binary (runs as PID 1 in guest VM) for embedding +lib/system/init/init: lib/system/init/*.go + @echo "Building init binary..." + cd lib/system/init && CGO_ENABLED=0 go build -ldflags="-s -w" -o init . 
+ +build-embedded: lib/system/guest_agent/guest-agent lib/system/init/init + # Build the binary -build: ensure-ch-binaries ensure-caddy-binaries lib/system/guest_agent/guest-agent | $(BIN_DIR) +build: ensure-ch-binaries ensure-caddy-binaries build-embedded | $(BIN_DIR) go build -tags containers_image_openpgp -o $(BIN_DIR)/hypeman ./cmd/api # Build all binaries build-all: build # Run in development mode with hot reload -dev: ensure-ch-binaries ensure-caddy-binaries lib/system/guest_agent/guest-agent $(AIR) +dev: ensure-ch-binaries ensure-caddy-binaries build-embedded $(AIR) @rm -f ./tmp/main $(AIR) -c .air.toml # Run tests (as root for network capabilities, enables caching and parallelism) # Usage: make test - runs all tests # make test TEST=TestCreateInstanceWithNetwork - runs specific test -test: ensure-ch-binaries ensure-caddy-binaries lib/system/guest_agent/guest-agent +test: ensure-ch-binaries ensure-caddy-binaries build-embedded @if [ -n "$(TEST)" ]; then \ echo "Running specific test: $(TEST)"; \ sudo env "PATH=$$PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" go test -tags containers_image_openpgp -run=$(TEST) -v -timeout=180s ./...; \ @@ -203,8 +210,9 @@ clean: rm -rf lib/vmm/binaries/cloud-hypervisor/ rm -rf lib/ingress/binaries/ rm -f lib/system/guest_agent/guest-agent + rm -f lib/system/init/init # Prepare for release build (called by GoReleaser) # Downloads all embedded binaries and builds embedded components -release-prep: download-ch-binaries build-caddy-binaries lib/system/guest_agent/guest-agent +release-prep: download-ch-binaries build-caddy-binaries build-embedded go mod tidy diff --git a/cmd/api/api/exec.go b/cmd/api/api/exec.go index c06b5a7a..f09b3e79 100644 --- a/cmd/api/api/exec.go +++ b/cmd/api/api/exec.go @@ -29,11 +29,12 @@ var upgrader = websocket.Upgrader{ // ExecRequest represents the JSON body for exec requests type ExecRequest struct { - Command []string `json:"command"` - TTY bool `json:"tty"` - Env map[string]string 
`json:"env,omitempty"` - Cwd string `json:"cwd,omitempty"` - Timeout int32 `json:"timeout,omitempty"` // seconds + Command []string `json:"command"` + TTY bool `json:"tty"` + Env map[string]string `json:"env,omitempty"` + Cwd string `json:"cwd,omitempty"` + Timeout int32 `json:"timeout,omitempty"` // seconds + WaitForAgent int32 `json:"wait_for_agent,omitempty"` // seconds to wait for guest agent to be ready } // ExecHandler handles exec requests via WebSocket for bidirectional streaming @@ -106,6 +107,7 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) { "tty", execReq.TTY, "cwd", execReq.Cwd, "timeout", execReq.Timeout, + "wait_for_agent", execReq.WaitForAgent, ) // Create WebSocket read/writer wrapper @@ -122,14 +124,15 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) { // Execute via vsock exit, err := guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{ - Command: execReq.Command, - Stdin: wsConn, - Stdout: wsConn, - Stderr: wsConn, - TTY: execReq.TTY, - Env: execReq.Env, - Cwd: execReq.Cwd, - Timeout: execReq.Timeout, + Command: execReq.Command, + Stdin: wsConn, + Stdout: wsConn, + Stderr: wsConn, + TTY: execReq.TTY, + Env: execReq.Env, + Cwd: execReq.Cwd, + Timeout: execReq.Timeout, + WaitForAgent: time.Duration(execReq.WaitForAgent) * time.Second, }) duration := time.Since(startTime) diff --git a/cmd/api/api/exec_test.go b/cmd/api/api/exec_test.go index d4a88a61..016fbea2 100644 --- a/cmd/api/api/exec_test.go +++ b/cmd/api/api/exec_test.go @@ -115,38 +115,23 @@ func TestExecInstanceNonTTY(t *testing.T) { t.Logf("vsock socket exists: %s", actualInst.VsockSocket) } - // Wait for exec agent to be ready (retry a few times) - var exit *guest.ExitStatus var stdout, stderr outputBuffer - var execErr error dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID) require.NoError(t, err) t.Log("Testing exec command: whoami") - maxRetries := 10 - for i := 0; 
i < maxRetries; i++ {
-		stdout = outputBuffer{}
-		stderr = outputBuffer{}
-
-		exit, execErr = guest.ExecIntoInstance(ctx(), dialer, guest.ExecOptions{
-			Command: []string{"/bin/sh", "-c", "whoami"},
-			Stdin:   nil,
-			Stdout:  &stdout,
-			Stderr:  &stderr,
-			TTY:     false,
-		})
-
-		if execErr == nil {
-			break
-		}
-
-		t.Logf("Exec attempt %d/%d failed, retrying: %v", i+1, maxRetries, execErr)
-		time.Sleep(1 * time.Second)
-	}
+	exit, execErr := guest.ExecIntoInstance(ctx(), dialer, guest.ExecOptions{
+		Command:      []string{"/bin/sh", "-c", "whoami"},
+		Stdin:        nil,
+		Stdout:       &stdout,
+		Stderr:       &stderr,
+		TTY:          false,
+		WaitForAgent: 10 * time.Second, // Wait up to 10s for guest agent to be ready
+	})
 
 	// Assert exec worked
-	require.NoError(t, execErr, "exec should succeed after retries")
+	require.NoError(t, execErr, "exec should succeed")
 	require.NotNil(t, exit, "exit status should be returned")
 	require.Equal(t, 0, exit.Code, "whoami should exit with code 0")
 
@@ -251,7 +236,7 @@ func TestExecWithDebianMinimal(t *testing.T) {
 
 	// Verify the app exited but VM is still usable (key behavior this test validates)
 	logs = collectTestLogs(t, svc, inst.Id, 200)
-	assert.Contains(t, logs, "overlay-init: app exited with code", "App should have exited")
+	assert.Contains(t, logs, "[exec] app exited with code", "App should have exited")
 
 	// Test exec commands work even though the main app (bash) has exited
 	dialer2, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID)
diff --git a/integration/systemd_test.go b/integration/systemd_test.go
new file mode 100644
index 00000000..16ba8638
--- /dev/null
+++ b/integration/systemd_test.go
@@ -0,0 +1,224 @@
+package integration
+
+import (
+	"bytes"
+	"context"
+	"os"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/onkernel/hypeman/cmd/api/config"
+	"github.com/onkernel/hypeman/lib/devices"
+	"github.com/onkernel/hypeman/lib/guest"
+	"github.com/onkernel/hypeman/lib/hypervisor"
+	"github.com/onkernel/hypeman/lib/images"
+	"github.com/onkernel/hypeman/lib/instances"
+	"github.com/onkernel/hypeman/lib/network"
+	"github.com/onkernel/hypeman/lib/paths"
+	"github.com/onkernel/hypeman/lib/system"
+	"github.com/onkernel/hypeman/lib/volumes"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestSystemdMode verifies that hypeman correctly detects and runs
+// systemd-based images with systemd as PID 1.
+//
+// This test uses the jrei/systemd-ubuntu image from Docker Hub which runs
+// systemd as its CMD. The test verifies that hypeman auto-detects this and:
+//   - Uses systemd mode (chroot to container rootfs)
+//   - Starts systemd as PID 1
+//   - Injects and starts the hypeman-agent.service
+func TestSystemdMode(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping integration test in short mode")
+	}
+
+	// Skip if KVM is not available
+	if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) {
+		t.Skip("/dev/kvm not available")
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	defer cancel()
+
+	// Set up test environment
+	tmpDir := t.TempDir()
+	p := paths.New(tmpDir)
+
+	cfg := &config.Config{
+		DataDir:    tmpDir,
+		BridgeName: "vmbr0",
+		SubnetCIDR: "10.100.0.0/16",
+		DNSServer:  "1.1.1.1",
+	}
+
+	// Create managers
+	imageManager, err := images.NewManager(p, 1, nil)
+	require.NoError(t, err)
+
+	systemManager := system.NewManager(p)
+	networkManager := network.NewManager(p, cfg, nil)
+	deviceManager := devices.NewManager(p)
+	volumeManager := volumes.NewManager(p, 0, nil)
+
+	limits := instances.ResourceLimits{
+		MaxOverlaySize:       100 * 1024 * 1024 * 1024,
+		MaxVcpusPerInstance:  0,
+		MaxMemoryPerInstance: 0,
+		MaxTotalVcpus:        0,
+		MaxTotalMemory:       0,
+	}
+
+	instanceManager := instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil)
+
+	// Cleanup any orphaned instances. Cleanup callbacks run after the deferred
+	// cancel above has fired, so ctx is already cancelled by then; delete with
+	// a fresh, independently bounded context instead.
+	t.Cleanup(func() {
+		cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 30*time.Second)
+		defer cleanupCancel()
+		instanceManager.DeleteInstance(cleanupCtx, "systemd-test")
+	})
+
+	imageName := "docker.io/jrei/systemd-ubuntu:22.04"
+
+	// Pull the systemd image
+	t.Log("Pulling systemd image:", imageName)
+	_, err = imageManager.CreateImage(ctx, images.CreateImageRequest{
+		Name: imageName,
+	})
+	require.NoError(t, err)
+
+	// Wait for image to be ready
+	t.Log("Waiting for image build...")
+	var img *images.Image
+	for i := 0; i < 120; i++ {
+		img, err = imageManager.GetImage(ctx, imageName)
+		if err == nil && img != nil && img.Status == images.StatusReady {
+			break
+		}
+		time.Sleep(1 * time.Second)
+	}
+	// Fail with a clear message instead of a nil dereference if the image never appeared.
+	require.NoError(t, err, "image lookup should succeed")
+	require.NotNil(t, img, "image should exist")
+	require.Equal(t, images.StatusReady, img.Status, "image should be ready")
+
+	// Verify systemd detection
+	t.Run("IsSystemdImage", func(t *testing.T) {
+		isSystemd := images.IsSystemdImage(img.Entrypoint, img.Cmd)
+		assert.True(t, isSystemd, "image should be detected as systemd, entrypoint=%v cmd=%v", img.Entrypoint, img.Cmd)
+	})
+
+	// Ensure system files (kernel, initrd)
+	t.Log("Ensuring system files...")
+	err = systemManager.EnsureSystemFiles(ctx)
+	require.NoError(t, err)
+
+	// Create the systemd instance
+	t.Log("Creating systemd instance...")
+	inst, err := instanceManager.CreateInstance(ctx, instances.CreateInstanceRequest{
+		Name:           "systemd-test",
+		Image:          imageName,
+		Size:           2 * 1024 * 1024 * 1024, // 2GB
+		HotplugSize:    512 * 1024 * 1024,
+		OverlaySize:    1024 * 1024 * 1024,
+		Vcpus:          2,
+		NetworkEnabled: false, // No network needed for this test
+	})
+	require.NoError(t, err)
+	t.Logf("Instance created: %s", inst.Id)
+
+	// Wait for guest agent to be ready
+	t.Log("Waiting for guest agent...")
+	err = waitForGuestAgent(ctx, instanceManager, inst.Id, 60*time.Second)
+	require.NoError(t, err, "guest agent should be ready")
+
+	// Test: Verify systemd is PID 1
+	t.Run("SystemdIsPID1", func(t *testing.T) {
+		output, exitCode, err := execInInstance(ctx, inst, "cat", "/proc/1/comm")
+		require.NoError(t, err, "exec should work")
+		require.Equal(t, 0, exitCode, "command should succeed")
+
+		pid1Name := strings.TrimSpace(output)
+		assert.Equal(t, "systemd", pid1Name, "PID 1 should be systemd")
+		t.Logf("PID 1 is: %s", pid1Name)
+	})
+
+	// Test: Verify guest-agent binary exists
+	t.Run("GuestAgentExists", func(t *testing.T) {
+		output, exitCode, err := execInInstance(ctx, inst, "test", "-x", "/opt/hypeman/guest-agent")
+		require.NoError(t, err, "exec should work")
+		assert.Equal(t, 0, exitCode, "guest-agent binary should exist at /opt/hypeman/guest-agent, output: %s", output)
+	})
+
+	// Test: Verify hypeman-agent.service is active
+	t.Run("AgentServiceActive", func(t *testing.T) {
+		output, exitCode, err := execInInstance(ctx, inst, "systemctl", "is-active", "hypeman-agent")
+		require.NoError(t, err, "exec should work")
+		status := strings.TrimSpace(output)
+		assert.Equal(t, 0, exitCode, "hypeman-agent service should be active, status: %s", status)
+		assert.Equal(t, "active", status, "service status should be 'active'")
+		t.Logf("hypeman-agent service status: %s", status)
+	})
+
+	// Test: Verify we can view agent logs via journalctl
+	t.Run("AgentLogsAccessible", func(t *testing.T) {
+		output, exitCode, err := execInInstance(ctx, inst, "journalctl", "-u", "hypeman-agent", "--no-pager", "-n", "5")
+		require.NoError(t, err, "exec should work")
+		assert.Equal(t, 0, exitCode, "journalctl should succeed")
+		t.Logf("Agent logs (last 5 lines):\n%s", output)
+	})
+
+	t.Log("All systemd mode tests passed!")
+}
+
+// waitForGuestAgent blocks until a trivial command ("echo ready") succeeds via
+// the instance's guest agent, letting guest.ExecIntoInstance retry connection
+// errors for up to timeout (ExecOptions.WaitForAgent).
+func waitForGuestAgent(ctx context.Context, mgr instances.Manager, instanceID string, timeout time.Duration) error {
+	inst, err := mgr.GetInstance(ctx, instanceID)
+	if err != nil {
+		return err
+	}
+
+	dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID)
+	if err != nil {
+		return err
+	}
+
+	// Use WaitForAgent to wait for the agent to be ready
+	var stdout bytes.Buffer
+	_, err = guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{
+		Command:      []string{"echo", "ready"},
+		Stdout:       &stdout,
+		TTY:          false,
+		WaitForAgent: timeout,
+	})
+	return err
+}
+
+// execInInstance runs command in the instance (single attempt, no agent wait)
+// and returns its stdout and exit code. If the exec itself fails, it returns
+// whatever was captured on stderr, an exit code of -1 and the error.
+func execInInstance(ctx context.Context, inst *instances.Instance, command ...string) (string, int, error) {
+	dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID)
+	if err != nil {
+		return "", -1, err
+	}
+
+	var stdout, stderr bytes.Buffer
+	exit, err := guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{
+		Command: command,
+		Stdout:  &stdout,
+		Stderr:  &stderr,
+		TTY:     false,
+	})
+	if err != nil {
+		return stderr.String(), -1, err
+	}
+
+	return stdout.String(), exit.Code, nil
+}
diff --git a/lib/guest/client.go b/lib/guest/client.go
index dc7a7fd3..96151280 100644
--- a/lib/guest/client.go
+++ b/lib/guest/client.go
@@ -2,6 +2,7 @@ package guest
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"io"
 	"io/fs"
@@ -17,7 +18,9 @@ import (
 	securejoin "github.com/cyphar/filepath-securejoin"
 	"github.com/onkernel/hypeman/lib/hypervisor"
 	"google.golang.org/grpc"
+	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/credentials/insecure"
+	"google.golang.org/grpc/status"
 )
 
 const (
@@ -25,6 +28,20 @@ const (
 	vsockGuestPort = 2222
 )
 
+// AgentVSockDialError indicates the vsock dial to the guest agent failed.
+// This typically means the VM is still booting or the agent hasn't started yet.
+type AgentVSockDialError struct {
+	Err error
+}
+
+func (e *AgentVSockDialError) Error() string {
+	return fmt.Sprintf("vsock dial failed (VM may still be booting): %v", e.Err)
+}
+
+func (e *AgentVSockDialError) Unwrap() error {
+	return e.Err
+}
+
 // connPool manages reusable gRPC connections per vsock dialer key
 // This avoids the overhead and potential issues of rapidly creating/closing connections
 var connPool = struct {
@@ -59,7 +76,11 @@ func GetOrCreateConn(ctx context.Context, dialer hypervisor.VsockDialer) (*grpc.
// Create new connection using the VsockDialer
 	conn, err := grpc.Dial("passthrough:///vsock",
 		grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
-			return dialer.DialVsock(ctx, vsockGuestPort)
+			netConn, err := dialer.DialVsock(ctx, vsockGuestPort)
+			if err != nil {
+				return nil, &AgentVSockDialError{Err: err}
+			}
+			return netConn, nil
 		}),
 		grpc.WithTransportCredentials(insecure.NewCredentials()),
 	)
@@ -92,19 +113,93 @@ type ExitStatus struct {
 
 // ExecOptions configures command execution
 type ExecOptions struct {
-	Command []string
-	Stdin   io.Reader
-	Stdout  io.Writer
-	Stderr  io.Writer
-	TTY     bool
-	Env     map[string]string // Environment variables
-	Cwd     string            // Working directory (optional)
-	Timeout int32             // Execution timeout in seconds (0 = no timeout)
+	Command      []string
+	Stdin        io.Reader
+	Stdout       io.Writer
+	Stderr       io.Writer
+	TTY          bool
+	Env          map[string]string // Environment variables
+	Cwd          string            // Working directory (optional)
+	Timeout      int32             // Execution timeout in seconds (0 = no timeout)
+	WaitForAgent time.Duration     // Max time to wait for agent to be ready (<= 0 = single attempt, no retry)
 }
 
 // ExecIntoInstance executes command in instance via vsock using gRPC.
 // The dialer is a hypervisor-specific VsockDialer that knows how to connect to the guest.
+// If opts.WaitForAgent is positive, attempts that fail with a connection-level
+// error (vsock dial failure or gRPC Unavailable) are retried roughly every
+// 500ms until WaitForAgent has elapsed, after which the last connection error
+// is returned. Any other error is returned immediately. A zero or negative
+// WaitForAgent means a single attempt with no retry.
+//
+// NOTE(review): each retry reuses the same opts.Stdin reader; this presumably
+// relies on no input being consumed before the agent connection is up — confirm
+// against execIntoInstanceOnce.
 func ExecIntoInstance(ctx context.Context, dialer hypervisor.VsockDialer, opts ExecOptions) (*ExitStatus, error) {
+	// Pause between attempts while the agent is still coming up.
+	const agentRetryInterval = 500 * time.Millisecond
+
+	// No wait requested (or a nonsensical negative one): single attempt.
+	if opts.WaitForAgent <= 0 {
+		return execIntoInstanceOnce(ctx, dialer, opts)
+	}
+
+	deadline := time.Now().Add(opts.WaitForAgent)
+
+	for {
+		exit, err := execIntoInstanceOnce(ctx, dialer, opts)
+
+		// Success, or a failure that retrying cannot fix: hand it straight back.
+		if err == nil || !isRetryableConnectionError(err) {
+			return exit, err
+		}
+
+		// Connection error: give up once the wait budget is spent.
+		remaining := time.Until(deadline)
+		if remaining <= 0 {
+			return nil, err
+		}
+
+		// Never sleep past the deadline, so the total wait stays within
+		// WaitForAgent (plus the duration of the final attempt).
+		pause := agentRetryInterval
+		if remaining < pause {
+			pause = remaining
+		}
+
+		// Wait before retrying, but respect context cancellation.
+		timer := time.NewTimer(pause)
+		select {
+		case <-ctx.Done():
+			timer.Stop()
+			return nil, ctx.Err()
+		case <-timer.C:
+			// Continue to retry
+		}
+	}
+}
+
+// isRetryableConnectionError returns true if the error indicates the guest agent
+// is not yet ready and we should retry connecting.
+func isRetryableConnectionError(err error) bool {
+	// Check for vsock dial errors
+	var dialErr *AgentVSockDialError
+	if errors.As(err, &dialErr) {
+		return true
+	}
+
+	// Check for gRPC Unavailable errors (agent not yet listening)
+	if s, ok := status.FromError(err); ok {
+		if s.Code() == codes.Unavailable {
+			return true
+		}
+	}
+
+	return false
+}
+
+// execIntoInstanceOnce executes command in instance via vsock using gRPC (single attempt).
+func execIntoInstanceOnce(ctx context.Context, dialer hypervisor.VsockDialer, opts ExecOptions) (*ExitStatus, error) {
 	start := time.Now()
 	var bytesSent int64
 
diff --git a/lib/images/systemd.go b/lib/images/systemd.go
new file mode 100644
index 00000000..b03c6191
--- /dev/null
+++ b/lib/images/systemd.go
@@ -0,0 +1,39 @@
+package images
+
+// IsSystemdImage checks if the image's CMD indicates it wants systemd as init.
+// Detection is based on the effective command (entrypoint + cmd), not whether
+// systemd is installed in the image.
+//
+// Returns true if the image's command is:
+//   - /sbin/init
+//   - /lib/systemd/systemd
+//   - /usr/lib/systemd/systemd
+//
+// Only the first element of the effective command is inspected, so a non-empty
+// entrypoint always takes precedence over cmd, and wrapper entrypoints that
+// merely exec an init binary are not detected.
+func IsSystemdImage(entrypoint, cmd []string) bool {
+	// The effective command is entrypoint followed by cmd, so its first
+	// element is entrypoint[0] when an entrypoint is set and cmd[0] otherwise.
+	// Reading it directly avoids building a combined slice.
+	var first string
+	switch {
+	case len(entrypoint) > 0:
+		first = entrypoint[0]
+	case len(cmd) > 0:
+		first = cmd[0]
+	default:
+		return false
+	}
+
+	// Match specific systemd/init paths only.
+	// We intentionally don't match generic */init paths since many entrypoint
+	// scripts are named "init" and would be false positives.
+	switch first {
+	case "/sbin/init", "/lib/systemd/systemd", "/usr/lib/systemd/systemd":
+		return true
+	default:
+		return false
+	}
+}
+
diff --git a/lib/images/systemd_test.go b/lib/images/systemd_test.go
new file mode 100644
index 00000000..3428d639
--- /dev/null
+++ b/lib/images/systemd_test.go
@@ -0,0 +1,103 @@
+package images
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestIsSystemdImage(t *testing.T) {
+	tests := []struct {
+		name       string
+		entrypoint []string
+		cmd        []string
+		expected   bool
+	}{
+		{
+			name:       "empty entrypoint and cmd",
+			entrypoint: nil,
+			cmd:        nil,
+			expected:   false,
+		},
+		{
+			name:       "/sbin/init as cmd",
+			entrypoint: nil,
+			cmd:        []string{"/sbin/init"},
+			expected:   true,
+		},
+		{
+			name:       "/lib/systemd/systemd as cmd",
+			entrypoint: nil,
+			cmd:        []string{"/lib/systemd/systemd"},
+			expected:   true,
+		},
+		{
+			name:       "/usr/lib/systemd/systemd as cmd",
+			entrypoint: nil,
+			cmd:        []string{"/usr/lib/systemd/systemd"},
+			expected:   true,
+		},
+		{
+			name:       "path ending in /init should not match (too broad)",
+			entrypoint: nil,
+			cmd:        []string{"/usr/sbin/init"},
+			expected:   false,
+		},
+		{
+			name:       "regular command (nginx)",
+			entrypoint: []string{"nginx"},
+			cmd:        []string{"-g", "daemon off;"},
+			expected:   false,
+		},
+		{
+			name:       "regular command (python)",
+			entrypoint: []string{"/usr/bin/python3"},
+			cmd:        []string{"app.py"},
+			expected:   false,
+		},
+		{
+			name:       "entrypoint with systemd",
+			entrypoint: []string{"/lib/systemd/systemd"},
+			cmd:        nil,
+			expected:   true,
+		},
+		{
+			name:       "entrypoint with init",
+			entrypoint: []string{"/sbin/init"},
+			cmd:        nil,
+			expected:   true,
+		},
+		{
+			name:       "shell script named init should not match",
+			entrypoint: nil,
+			cmd:        []string{"./init"},
+			expected:   false,
+		},
+		{
+			name:       "bash command should not match",
+			entrypoint: nil,
+			cmd:        []string{"/bin/bash", "-c", "init"},
+			expected:   false,
+		},
+		{
+			name:       "systemd entrypoint with extra cmd args",
+			entrypoint: []string{"/sbin/init"},
+			cmd:        []string{"--log-level=debug"},
+			expected:   true,
+		},
+		{
+			name:       "non-systemd entrypoint takes precedence over systemd cmd",
+			entrypoint: []string{"/usr/bin/tini", "--"},
+			cmd:        []string{"/sbin/init"},
+			expected:   false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := IsSystemdImage(tt.entrypoint, tt.cmd)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}
+
diff --git a/lib/instances/configdisk.go b/lib/instances/configdisk.go
index fb9305ac..c0e4e5b7 100644
--- a/lib/instances/configdisk.go
+++ b/lib/instances/configdisk.go
@@ -12,12 +12,11 @@ import (
 	"github.com/onkernel/hypeman/lib/devices"
 	"github.com/onkernel/hypeman/lib/images"
 	"github.com/onkernel/hypeman/lib/network"
+	"github.com/onkernel/hypeman/lib/vmconfig"
 )
 
-// createConfigDisk generates an erofs disk with instance configuration
-// The disk contains:
-// - /config.sh - Shell script sourced by init
-// - /metadata.json - JSON metadata for programmatic access
+// createConfigDisk generates an ext4 disk with instance configuration.
+// The disk contains /config.json read by the guest init binary.
func (m *manager) createConfigDisk(ctx context.Context, inst *Instance, imageInfo *images.Image, netConfig *network.NetworkConfig) error { // Create temporary directory for config files tmpDir, err := os.MkdirTemp("", "hypeman-config-*") @@ -26,37 +25,19 @@ func (m *manager) createConfigDisk(ctx context.Context, inst *Instance, imageInf } defer os.RemoveAll(tmpDir) - // Generate config.sh - configScript := m.generateConfigScript(ctx, inst, imageInfo, netConfig) - configPath := filepath.Join(tmpDir, "config.sh") - if err := os.WriteFile(configPath, []byte(configScript), 0644); err != nil { - return fmt.Errorf("write config.sh: %w", err) - } - - // Generate metadata.json - configMeta := map[string]interface{}{ - "instance_id": inst.Id, - "instance_name": inst.Name, - "image": inst.Image, - "entrypoint": imageInfo.Entrypoint, - "cmd": imageInfo.Cmd, - "workdir": imageInfo.WorkingDir, - "env": mergeEnv(imageInfo.Env, inst.Env), - } - metaData, err := json.MarshalIndent(configMeta, "", " ") + // Generate config.json + cfg := m.buildGuestConfig(ctx, inst, imageInfo, netConfig) + configData, err := json.MarshalIndent(cfg, "", " ") if err != nil { - return fmt.Errorf("marshal metadata: %w", err) + return fmt.Errorf("marshal config: %w", err) } - metaPath := filepath.Join(tmpDir, "metadata.json") - if err := os.WriteFile(metaPath, metaData, 0644); err != nil { - return fmt.Errorf("write metadata.json: %w", err) + configPath := filepath.Join(tmpDir, "config.json") + if err := os.WriteFile(configPath, configData, 0644); err != nil { + return fmt.Errorf("write config.json: %w", err) } // Create ext4 disk with config files - // Use ext4 for now (can switch to erofs when kernel supports it) diskPath := m.paths.InstanceConfigDisk(inst.Id) - - // Calculate size (config files are tiny, use 1MB minimum) _, err = images.ExportRootfs(tmpDir, diskPath, images.FormatExt4) if err != nil { return fmt.Errorf("create config disk: %w", err) @@ -65,118 +46,71 @@ func (m *manager) 
createConfigDisk(ctx context.Context, inst *Instance, imageInf return nil } -// generateConfigScript creates the shell script that will be sourced by init -func (m *manager) generateConfigScript(ctx context.Context, inst *Instance, imageInfo *images.Image, netConfig *network.NetworkConfig) string { - // Prepare entrypoint value - entrypoint := "" - if len(imageInfo.Entrypoint) > 0 { - entrypoint = shellQuoteArray(imageInfo.Entrypoint) - } - - // Prepare cmd value - cmd := "" - if len(imageInfo.Cmd) > 0 { - cmd = shellQuoteArray(imageInfo.Cmd) - } - - // Prepare workdir value - workdir := shellQuote("/") - if imageInfo.WorkingDir != "" { - workdir = shellQuote(imageInfo.WorkingDir) +// buildGuestConfig creates the vmconfig.Config struct for the guest init binary. +func (m *manager) buildGuestConfig(ctx context.Context, inst *Instance, imageInfo *images.Image, netConfig *network.NetworkConfig) *vmconfig.Config { + cfg := &vmconfig.Config{ + Entrypoint: imageInfo.Entrypoint, + Cmd: imageInfo.Cmd, + Workdir: imageInfo.WorkingDir, + Env: mergeEnv(imageInfo.Env, inst.Env), + InitMode: "exec", } - // Build environment variable exports - var envLines strings.Builder - mergedEnv := mergeEnv(imageInfo.Env, inst.Env) - for key, value := range mergedEnv { - envLines.WriteString(fmt.Sprintf("export %s=%s\n", key, shellQuote(value))) + if cfg.Workdir == "" { + cfg.Workdir = "/" } - // Build network configuration section - // Use netConfig directly instead of trying to derive it (VM hasn't started yet) - networkSection := "" + // Network configuration if inst.NetworkEnabled && netConfig != nil { - // Convert netmask to CIDR prefix length for ip command - cidr := netmaskToCIDR(netConfig.Netmask) - networkSection = fmt.Sprintf(` -# Network configuration -GUEST_IP="%s" -GUEST_CIDR="%d" -GUEST_GW="%s" -GUEST_DNS="%s" -`, netConfig.IP, cidr, netConfig.Gateway, netConfig.DNS) + cfg.NetworkEnabled = true + cfg.GuestIP = netConfig.IP + cfg.GuestCIDR = netmaskToCIDR(netConfig.Netmask) + 
cfg.GuestGW = netConfig.Gateway + cfg.GuestDNS = netConfig.DNS } - // GPU passthrough configuration - // Only set HAS_GPU=1 if at least one attached device is actually a GPU - gpuSection := "" + // GPU passthrough - check if any attached device is a GPU for _, deviceID := range inst.Devices { device, err := m.deviceManager.GetDevice(ctx, deviceID) if err == nil && device.Type == devices.DeviceTypeGPU { - gpuSection = "\n# GPU passthrough\nHAS_GPU=1\n" + cfg.HasGPU = true break } } - // Build volume mounts section + // Volume mounts // Volumes are attached as /dev/vdd, /dev/vde, etc. (after vda=rootfs, vdb=overlay, vdc=config) - // For overlay volumes, two devices are used: base + overlay disk - // Format: device:path:mode[:overlay_device] - volumeSection := "" - if len(inst.Volumes) > 0 { - var volumeLines strings.Builder - volumeLines.WriteString("\n# Volume mounts (device:path:mode[:overlay_device])\n") - volumeLines.WriteString("VOLUME_MOUNTS=\"") - deviceIdx := 0 // Track device index (starts at 'd' = vdd) - for i, vol := range inst.Volumes { - device := fmt.Sprintf("/dev/vd%c", 'd'+deviceIdx) - if i > 0 { - volumeLines.WriteString(" ") - } - if vol.Overlay { - // Overlay mode: base device + overlay device - overlayDevice := fmt.Sprintf("/dev/vd%c", 'd'+deviceIdx+1) - volumeLines.WriteString(fmt.Sprintf("%s:%s:overlay:%s", device, vol.MountPath, overlayDevice)) - deviceIdx += 2 // Overlay uses 2 devices + deviceIdx := 0 + for _, vol := range inst.Volumes { + device := fmt.Sprintf("/dev/vd%c", 'd'+deviceIdx) + mount := vmconfig.VolumeMount{ + Device: device, + Path: vol.MountPath, + } + if vol.Overlay { + mount.Mode = "overlay" + mount.OverlayDevice = fmt.Sprintf("/dev/vd%c", 'd'+deviceIdx+1) + deviceIdx += 2 + } else { + if vol.Readonly { + mount.Mode = "ro" } else { - mode := "rw" - if vol.Readonly { - mode = "ro" - } - volumeLines.WriteString(fmt.Sprintf("%s:%s:%s", device, vol.MountPath, mode)) - deviceIdx++ // Regular volume uses 1 device + mount.Mode = 
"rw" } + deviceIdx++ } - volumeLines.WriteString("\"\n") - volumeSection = volumeLines.String() + cfg.VolumeMounts = append(cfg.VolumeMounts, mount) } - // Generate script as a readable template block - // ENTRYPOINT and CMD contain shell-quoted arrays that will be eval'd in init - script := fmt.Sprintf(`#!/bin/sh -# Generated config for instance: %s - -# Container execution parameters -ENTRYPOINT="%s" -CMD="%s" -WORKDIR=%s - -# Environment variables -%s%s%s%s`, - inst.Id, - entrypoint, - cmd, - workdir, - envLines.String(), - networkSection, - volumeSection, - gpuSection, - ) - - return script + // Determine init mode based on image CMD + if images.IsSystemdImage(imageInfo.Entrypoint, imageInfo.Cmd) { + cfg.InitMode = "systemd" + } + + return cfg } -// mergeEnv merges image environment variables with instance overrides +// mergeEnv merges image environment variables with instance overrides. func mergeEnv(imageEnv map[string]string, instEnv map[string]string) map[string]string { result := make(map[string]string) @@ -193,30 +127,7 @@ func mergeEnv(imageEnv map[string]string, instEnv map[string]string) map[string] return result } -// shellQuote quotes a string for safe use in shell scripts -func shellQuote(s string) string { - // Simple quoting: wrap in single quotes and escape single quotes - s = strings.ReplaceAll(s, "'", "'\\''") - return "'" + s + "'" -} - -// shellQuoteArray quotes each element of an array for safe shell evaluation -// Returns a string that when assigned to a variable and later eval'd, will be properly split -func shellQuoteArray(arr []string) string { - if len(arr) == 0 { - return "" - } - - quoted := make([]string, len(arr)) - for i, s := range arr { - quoted[i] = shellQuote(s) - } - - // Join with spaces and return as-is (will be eval'd later in init script) - return strings.Join(quoted, " ") -} - -// netmaskToCIDR converts dotted decimal netmask to CIDR prefix length +// netmaskToCIDR converts dotted decimal netmask to CIDR prefix length. 
// e.g., "255.255.255.0" -> 24, "255.255.0.0" -> 16 func netmaskToCIDR(netmask string) int { parts := strings.Split(netmask, ".") diff --git a/lib/system/README.md b/lib/system/README.md index 68629f03..29111fe0 100644 --- a/lib/system/README.md +++ b/lib/system/README.md @@ -4,11 +4,12 @@ Manages versioned kernel and initrd files for Cloud Hypervisor VMs. ## Features -- **Automatic Downloads**: Kernel downloaded from Cloud Hypervisor releases on first use -- **Automatic Build**: Initrd built from busybox + custom init script -- **Versioned**: Side-by-side support for multiple kernel/initrd versions +- **Automatic Downloads**: Kernel downloaded from onkernel/linux releases on first use +- **Automatic Build**: Initrd built from Alpine base + Go init binary + guest-agent +- **Versioned**: Side-by-side support for multiple kernel versions - **Zero Docker**: Uses OCI directly (reuses image manager infrastructure) - **Zero Image Modifications**: All init logic in initrd, OCI images used as-is +- **Dual Mode Support**: Exec mode (container-like) and systemd mode (full VM) ## Architecture @@ -17,27 +18,27 @@ Manages versioned kernel and initrd files for Cloud Hypervisor VMs. ``` {dataDir}/system/ ├── kernel/ -│ ├── ch-v6.12.8/ +│ ├── ch-6.12.8-kernel-1-202511182/ │ │ ├── x86_64/vmlinux (~70MB) │ │ └── aarch64/Image (~70MB) -│ └── ch-v6.12.9/ -│ └── ... (future version) +│ └── ch-6.12.8-kernel-1.2-20251213/ +│ └── ... (newer version) ├── initrd/ -│ ├── v1.0.0/ -│ │ ├── x86_64/initrd (~1-2MB) -│ │ └── aarch64/initrd (~1-2MB) -│ └── v1.1.0/ -│ └── ... 
(when init script changes) -└── oci-cache/ (shared with images manager) - └── blobs/sha256/ (busybox layers cached) +│ ├── 1734567890/ (timestamp-based) +│ │ ├── x86_64/initrd (~5-10MB) +│ │ └── aarch64/initrd +│ ├── x86_64/latest -> 1734567890 (symlink to latest) +│ └── aarch64/latest -> 1734567890 +└── oci-cache/ (shared with images manager) + └── blobs/sha256/ (Alpine layers cached) ``` ### Versioning Rules **Snapshots require exact matches:** ``` -Standby: kernel v6.12.9, initrd v1.0.0, CH v49.0 -Restore: kernel v6.12.9, initrd v1.0.0, CH v49.0 (MUST match) +Standby: kernel ch-6.12.8-kernel-1.2-20251213, CH v49.0 +Restore: kernel ch-6.12.8-kernel-1.2-20251213, CH v49.0 (MUST match) ``` **Maintenance upgrades (shutdown → boot):** @@ -49,69 +50,52 @@ Restore: kernel v6.12.9, initrd v1.0.0, CH v49.0 (MUST match) **Multi-version support:** ``` -Instance A (standby): kernel v6.12.8, initrd v1.0.0 -Instance B (running): kernel v6.12.9, initrd v1.0.0 +Instance A (standby): kernel ch-6.12.8-kernel-1-202511182 +Instance B (running): kernel ch-6.12.8-kernel-1.2-20251213 Both work independently ``` -## Init Script Consolidation +## Go Init Binary -All init logic moved from app rootfs to initrd: +The init binary (`lib/system/init/`) is a Go program that runs as PID 1 in the guest VM. +It replaces the previous shell-based init script with cleaner logic and structured logging. **Initrd handles:** - ✅ Mount overlay filesystem - ✅ Mount and source config disk - ✅ Network configuration (if enabled) -- ✅ Execute container entrypoint +- ✅ Load GPU drivers (if GPU attached) +- ✅ Mount volumes +- ✅ Execute container entrypoint (exec mode) +- ✅ Hand off to systemd via chroot + exec (systemd mode) -**Result:** OCI images require **zero modifications** - no `/init` script needed! 
- -## Usage - -### Application Startup - -```go -// cmd/api/main.go -systemMgr := system.NewManager(dataDir) - -// Ensure files exist (download/build if needed) -err := systemMgr.EnsureSystemFiles(ctx) - -// Files are ready, instances can be created -``` - -### Instance Creation - -```go -// Instances manager uses system manager automatically -inst, err := instanceManager.CreateInstance(ctx, req) -// Uses default kernel/initrd versions -// Versions stored in instance metadata for restore compatibility -``` +**Two boot modes:** +- **Exec mode** (default): Init chroots to container rootfs, runs entrypoint as child process, then waits on guest-agent to keep VM alive +- **Systemd mode** (auto-detected on host): Init chroots to container rootfs, then execs /sbin/init so systemd becomes PID 1 -### Get File Paths +**Systemd detection:** Host-side detection in `lib/images/systemd.go` checks if image CMD is +`/sbin/init`, `/lib/systemd/systemd`, or similar. The detected mode is passed to the initrd +via `INIT_MODE` in the config disk. -```go -kernelPath, _ := systemMgr.GetKernelPath(system.KernelV6_12_9) -initrdPath, _ := systemMgr.GetInitrdPath(system.InitrdV1_0_0) -``` +**Result:** OCI images require **zero modifications** - no `/init` script needed! ## Kernel Sources -Kernels downloaded from Cloud Hypervisor releases: -- https://github.com/cloud-hypervisor/linux/releases +Kernels downloaded from onkernel/linux releases (Cloud Hypervisor-optimized fork): +- https://github.com/onkernel/linux/releases Example URLs: -- x86_64: `https://github.com/cloud-hypervisor/linux/releases/download/ch-v6.12.9/vmlinux-x86_64` -- aarch64: `https://github.com/cloud-hypervisor/linux/releases/download/ch-v6.12.9/Image-aarch64` +- x86_64: `https://github.com/onkernel/linux/releases/download/ch-6.12.8-kernel-1.2-20251213/vmlinux-x86_64` +- aarch64: `https://github.com/onkernel/linux/releases/download/ch-6.12.8-kernel-1.2-20251213/Image-arm64` ## Initrd Build Process -1. 
**Pull busybox** (using image manager's OCI client) -2. **Inject init script** (comprehensive, handles all init logic) -3. **Package as cpio.gz** (initramfs format) - -**Build tools required:** `find`, `cpio`, `gzip` (standard Unix tools) +1. **Pull Alpine base** (using image manager's OCI client) +2. **Add guest-agent binary** (embedded, runs in guest for exec/shell) +3. **Add init.sh wrapper** (mounts /proc, /sys, /dev before Go runtime) +4. **Add init binary** (embedded Go binary, runs as PID 1) +5. **Add NVIDIA modules** (optional, for GPU passthrough) +6. **Package as cpio** (initramfs format, pure Go - no shell tools required) ## Adding New Versions @@ -121,36 +105,33 @@ Example URLs: // lib/system/versions.go const ( - KernelV6_12_10 KernelVersion = "ch-v6.12.10" // Add constant + Kernel_20251220 KernelVersion = "ch-6.12.8-kernel-1.3-20251220" // Add constant ) var KernelDownloadURLs = map[KernelVersion]map[string]string{ // ... existing ... - KernelV6_12_10: { - "x86_64": "https://github.com/cloud-hypervisor/linux/releases/download/ch-v6.12.10/vmlinux-x86_64", - "aarch64": "https://github.com/cloud-hypervisor/linux/releases/download/ch-v6.12.10/Image-aarch64", + Kernel_20251220: { + "x86_64": "https://github.com/onkernel/linux/releases/download/ch-6.12.8-kernel-1.3-20251220/vmlinux-x86_64", + "aarch64": "https://github.com/onkernel/linux/releases/download/ch-6.12.8-kernel-1.3-20251220/Image-arm64", }, } // Update default if needed -var DefaultKernelVersion = KernelV6_12_10 +var DefaultKernelVersion = Kernel_20251220 ``` -### New Initrd Version +### Updating the Init Binary -```go -// lib/system/versions.go +The init binary is in `lib/system/init/`. After making changes: -const ( - InitrdV1_1_0 InitrdVersion = "v1.1.0" // Add constant -) +1. Build the init binary (statically linked for Alpine): + ```bash + make build-embedded + ``` -// lib/system/init_script.go -// Update GenerateInitScript() if init logic changes +2. 
The binary is embedded via `lib/system/init_binary.go` -// Update default -var DefaultInitrdVersion = InitrdV1_1_0 -``` +3. The initrd hash includes the binary, so it will auto-rebuild on next startup ## Testing @@ -167,7 +148,22 @@ go test ./lib/system/... | File | Size | Purpose | |------|------|---------| | kernel/*/vmlinux | ~70MB | Cloud Hypervisor optimized kernel | -| initrd/*/initrd | ~1-2MB | Busybox + comprehensive init script | +| initrd/*/initrd | ~5-10MB | Alpine base + Go init binary + guest-agent | Files downloaded/built once per version, reused for all instances using that version. +## Init Binary Package Structure + +``` +lib/system/init/ + main.go # Entry point, orchestrates boot + init.sh # Shell wrapper (mounts /proc, /sys, /dev before Go runtime) + mount.go # Mount operations (overlay, bind mounts) + config.go # Parse config disk + network.go # Network configuration + drivers.go # GPU driver loading + volumes.go # Volume mounting + mode_exec.go # Exec mode: chroot, run entrypoint, wait on guest-agent + mode_systemd.go # Systemd mode: chroot + exec /sbin/init + logger.go # Human-readable logging to hypeman operations log +``` diff --git a/lib/system/guest_agent_binary.go b/lib/system/guest_agent_binary.go index 78a5b7b3..57d69722 100644 --- a/lib/system/guest_agent_binary.go +++ b/lib/system/guest_agent_binary.go @@ -6,4 +6,3 @@ import _ "embed" // This is built by the Makefile before the main binary is compiled //go:embed guest_agent/guest-agent var GuestAgentBinary []byte - diff --git a/lib/system/init/config.go b/lib/system/init/config.go new file mode 100644 index 00000000..9a0bde04 --- /dev/null +++ b/lib/system/init/config.go @@ -0,0 +1,50 @@ +package main + +import ( + "encoding/json" + "fmt" + "os" + "os/exec" + + "github.com/onkernel/hypeman/lib/vmconfig" +) + +// readConfig mounts and reads the config disk, parsing the JSON configuration. 
+func readConfig(log *Logger) (*vmconfig.Config, error) { + const configMount = "/mnt/config" + const configFile = "/mnt/config/config.json" + + // Create mount point + if err := os.MkdirAll(configMount, 0755); err != nil { + return nil, fmt.Errorf("mkdir config mount: %w", err) + } + + // Mount config disk (/dev/vdc) read-only + cmd := exec.Command("/bin/mount", "-o", "ro", "/dev/vdc", configMount) + if output, err := cmd.CombinedOutput(); err != nil { + return nil, fmt.Errorf("mount config disk: %s: %s", err, output) + } + log.Info("config", "mounted config disk") + + // Read and parse config.json + data, err := os.ReadFile(configFile) + if err != nil { + return nil, fmt.Errorf("read config file: %w", err) + } + + var cfg vmconfig.Config + if err := json.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("parse config json: %w", err) + } + + // Set defaults + if cfg.InitMode == "" { + cfg.InitMode = "exec" + } + if cfg.Env == nil { + cfg.Env = make(map[string]string) + } + + log.Info("config", "parsed configuration") + return &cfg, nil +} diff --git a/lib/system/init/drivers.go b/lib/system/init/drivers.go new file mode 100644 index 00000000..935e0f9e --- /dev/null +++ b/lib/system/init/drivers.go @@ -0,0 +1,190 @@ +package main + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" +) + +// loadGPUDrivers loads NVIDIA kernel modules for GPU passthrough. 
+func loadGPUDrivers(log *Logger) error { + log.Info("gpu", "loading NVIDIA kernel modules") + + // Find kernel version directory + modules, err := os.ReadDir("/lib/modules") + if err != nil { + return fmt.Errorf("read /lib/modules: %w", err) + } + + if len(modules) == 0 { + return fmt.Errorf("no kernel modules found") + } + + kver := modules[0].Name() + gpuDir := filepath.Join("/lib/modules", kver, "kernel/drivers/gpu") + + if _, err := os.Stat(gpuDir); err != nil { + return fmt.Errorf("GPU modules not found for kernel %s", kver) + } + + // Load modules in order (dependencies first) + moduleOrder := []string{ + "nvidia.ko", + "nvidia-uvm.ko", + "nvidia-modeset.ko", + "nvidia-drm.ko", + } + + for _, mod := range moduleOrder { + modPath := filepath.Join(gpuDir, mod) + if _, err := os.Stat(modPath); err != nil { + log.Error("gpu", fmt.Sprintf("%s not found", mod), nil) + continue + } + + args := []string{modPath} + // nvidia-drm needs modeset=1 + if mod == "nvidia-drm.ko" { + args = append(args, "modeset=1") + } + + cmd := exec.Command("/sbin/insmod", args...) + if output, err := cmd.CombinedOutput(); err != nil { + log.Error("gpu", fmt.Sprintf("insmod %s failed", mod), fmt.Errorf("%s", output)) + } + } + + log.Info("gpu", fmt.Sprintf("loaded NVIDIA modules for kernel %s", kver)) + + // Create device nodes using nvidia-modprobe if available + if err := createNvidiaDevices(log); err != nil { + log.Error("gpu", "failed to create device nodes", err) + } + + // Inject NVIDIA userspace driver libraries into container rootfs + if err := injectNvidiaLibraries(log); err != nil { + log.Error("gpu", "failed to inject driver libraries", err) + } + + return nil +} + +// createNvidiaDevices creates NVIDIA device nodes. 
+func createNvidiaDevices(log *Logger) error { + // Try nvidia-modprobe first (the official NVIDIA utility) + if _, err := os.Stat("/usr/bin/nvidia-modprobe"); err == nil { + log.Info("gpu", "running nvidia-modprobe to create device nodes") + + cmd := exec.Command("/usr/bin/nvidia-modprobe") + cmd.CombinedOutput() + + cmd = exec.Command("/usr/bin/nvidia-modprobe", "-u", "-c=0") + cmd.CombinedOutput() + + return nil + } + + // Fallback: Manual device node creation + log.Info("gpu", "nvidia-modprobe not found, creating device nodes manually") + + // Read major numbers from /proc/devices + data, err := os.ReadFile("/proc/devices") + if err != nil { + return err + } + + lines := strings.Split(string(data), "\n") + var nvidiaMajor, uvmMajor string + + for _, line := range lines { + fields := strings.Fields(line) + if len(fields) >= 2 { + if fields[1] == "nvidia-frontend" || fields[1] == "nvidia" { + nvidiaMajor = fields[0] + } else if fields[1] == "nvidia-uvm" { + uvmMajor = fields[0] + } + } + } + + if nvidiaMajor != "" { + exec.Command("/bin/mknod", "-m", "666", "/dev/nvidiactl", "c", nvidiaMajor, "255").Run() + exec.Command("/bin/mknod", "-m", "666", "/dev/nvidia0", "c", nvidiaMajor, "0").Run() + log.Info("gpu", fmt.Sprintf("created /dev/nvidiactl and /dev/nvidia0 (major %s)", nvidiaMajor)) + } + + if uvmMajor != "" { + exec.Command("/bin/mknod", "-m", "666", "/dev/nvidia-uvm", "c", uvmMajor, "0").Run() + exec.Command("/bin/mknod", "-m", "666", "/dev/nvidia-uvm-tools", "c", uvmMajor, "1").Run() + log.Info("gpu", fmt.Sprintf("created /dev/nvidia-uvm* (major %s)", uvmMajor)) + } + + return nil +} + +// injectNvidiaLibraries injects NVIDIA userspace driver libraries into the container rootfs. +// This allows containers to use standard CUDA images without bundled drivers. 
+func injectNvidiaLibraries(log *Logger) error { + srcDir := "/usr/lib/nvidia" + if _, err := os.Stat(srcDir); err != nil { + return nil // No driver libraries to inject + } + + log.Info("gpu", "injecting NVIDIA driver libraries into container") + + // Determine library path based on architecture + var libDst string + if runtime.GOARCH == "arm64" { + libDst = "/overlay/newroot/usr/lib/aarch64-linux-gnu" + } else { + libDst = "/overlay/newroot/usr/lib/x86_64-linux-gnu" + } + binDst := "/overlay/newroot/usr/bin" + + if err := os.MkdirAll(libDst, 0755); err != nil { + return err + } + if err := os.MkdirAll(binDst, 0755); err != nil { + return err + } + + // Copy all driver libraries + libs, _ := filepath.Glob(filepath.Join(srcDir, "*.so.*")) + for _, lib := range libs { + libname := filepath.Base(lib) + data, err := os.ReadFile(lib) + if err != nil { + continue + } + os.WriteFile(filepath.Join(libDst, libname), data, 0755) + + // Create standard symlinks + base := strings.Split(libname, ".so.")[0] + os.Symlink(libname, filepath.Join(libDst, base+".so.1")) + os.Symlink(base+".so.1", filepath.Join(libDst, base+".so")) + } + + // Copy nvidia-smi and nvidia-modprobe binaries + for _, bin := range []string{"nvidia-smi", "nvidia-modprobe"} { + srcPath := filepath.Join("/usr/bin", bin) + if data, err := os.ReadFile(srcPath); err == nil { + os.WriteFile(filepath.Join(binDst, bin), data, 0755) + } + } + + // Update ldconfig cache + exec.Command("/usr/sbin/chroot", "/overlay/newroot", "ldconfig").Run() + + // Read driver version + version := "unknown" + if data, err := os.ReadFile(filepath.Join(srcDir, "version")); err == nil { + version = strings.TrimSpace(string(data)) + } + + log.Info("gpu", fmt.Sprintf("injected NVIDIA driver libraries (version: %s)", version)) + return nil +} + diff --git a/lib/system/init/init.sh b/lib/system/init/init.sh new file mode 100644 index 00000000..06b739cd --- /dev/null +++ b/lib/system/init/init.sh @@ -0,0 +1,17 @@ +#!/bin/sh +# Minimal init 
// Logger provides human-readable structured logging for the init process.
// Log lines are written to the serial console, or to stdout when no serial
// device can be opened.
type Logger struct {
	console *os.File
	// ownsConsole records that console was opened by this Logger and must be
	// closed when it is replaced. It is false for the os.Stdout fallback.
	ownsConsole bool
}

// NewLogger creates a logger that writes to the serial console.
//
// It tries /dev/ttyAMA0 (ARM64, PL011 UART) first, then /dev/ttyS0 (x86_64),
// and falls back to os.Stdout when neither can be opened for writing.
func NewLogger() *Logger {
	for _, tty := range []string{"/dev/ttyAMA0", "/dev/ttyS0"} {
		if f, err := os.OpenFile(tty, os.O_WRONLY, 0); err == nil {
			return &Logger{console: f, ownsConsole: true}
		}
	}
	// Fallback to stdout.
	return &Logger{console: os.Stdout}
}

// SetConsole switches output to the device at path.
//
// If path cannot be opened the current console is kept. When the switch
// succeeds, a console handle previously opened by the Logger is closed so the
// descriptor is not leaked.
func (l *Logger) SetConsole(path string) {
	f, err := os.OpenFile(path, os.O_WRONLY, 0)
	if err != nil {
		return
	}

	prev, owned := l.console, l.ownsConsole
	l.console, l.ownsConsole = f, true
	if owned && prev != nil {
		prev.Close()
	}
}

// Info logs an informational message.
// Format: 2024-12-23T10:15:30Z [INFO] [phase] message
func (l *Logger) Info(phase, msg string) {
	ts := time.Now().UTC().Format(time.RFC3339)
	l.write(fmt.Sprintf("%s [INFO] [%s] %s\n", ts, phase, msg))
}

// Error logs an error message. The ": error" suffix is omitted when err is nil.
// Format: 2024-12-23T10:15:30Z [ERROR] [phase] message: error
func (l *Logger) Error(phase, msg string, err error) {
	ts := time.Now().UTC().Format(time.RFC3339)
	if err == nil {
		l.write(fmt.Sprintf("%s [ERROR] [%s] %s\n", ts, phase, msg))
		return
	}
	l.write(fmt.Sprintf("%s [ERROR] [%s] %s: %v\n", ts, phase, msg, err))
}

// Infof logs a formatted informational message; format and args follow
// fmt.Sprintf.
func (l *Logger) Infof(phase, format string, args ...interface{}) {
	l.Info(phase, fmt.Sprintf(format, args...))
}

// write outputs a log line to the console. Write errors are ignored because
// there is nowhere else to report them.
func (l *Logger) write(line string) {
	if l.console != nil {
		l.console.WriteString(line)
	}
}
+package main + +import ( + "fmt" + "os" + "os/exec" +) + +func main() { + log := NewLogger() + log.Info("boot", "init starting") + + // Phase 1: Mount additional filesystems (proc/sys/dev already mounted by init.sh) + if err := mountEssentials(log); err != nil { + log.Error("mount", "failed to mount essentials", err) + dropToShell() + } + + // Phase 2: Setup overlay rootfs + if err := setupOverlay(log); err != nil { + log.Error("overlay", "failed to setup overlay", err) + dropToShell() + } + + // Phase 3: Read and parse config + cfg, err := readConfig(log) + if err != nil { + log.Error("config", "failed to read config", err) + dropToShell() + } + + // Phase 4: Configure network (shared between modes) + if cfg.NetworkEnabled { + if err := configureNetwork(log, cfg); err != nil { + log.Error("network", "failed to configure network", err) + // Continue anyway - network isn't always required + } + } + + // Phase 5: Load GPU drivers if needed + if cfg.HasGPU { + if err := loadGPUDrivers(log); err != nil { + log.Error("gpu", "failed to load GPU drivers", err) + // Continue anyway + } + } + + // Phase 6: Mount volumes + if len(cfg.VolumeMounts) > 0 { + if err := mountVolumes(log, cfg); err != nil { + log.Error("volumes", "failed to mount volumes", err) + // Continue anyway + } + } + + // Phase 7: Bind mount filesystems to new root + if err := bindMountsToNewRoot(log); err != nil { + log.Error("bind", "failed to bind mounts", err) + dropToShell() + } + + // Phase 8: Copy guest-agent to target location + if err := copyGuestAgent(log); err != nil { + log.Error("agent", "failed to copy guest-agent", err) + // Continue anyway - exec will still work, just no remote access + } + + // Phase 9: Mode-specific execution + if cfg.InitMode == "systemd" { + log.Info("mode", "entering systemd mode") + runSystemdMode(log, cfg) + } else { + log.Info("mode", "entering exec mode") + runExecMode(log, cfg) + } +} + +// dropToShell drops to an interactive shell for debugging when boot fails 
+func dropToShell() { + fmt.Fprintln(os.Stderr, "FATAL: dropping to shell for debugging") + cmd := exec.Command("/bin/sh", "-i") + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + cmd.Run() + os.Exit(1) +} + diff --git a/lib/system/init/mode_exec.go b/lib/system/init/mode_exec.go new file mode 100644 index 00000000..8fd11511 --- /dev/null +++ b/lib/system/init/mode_exec.go @@ -0,0 +1,141 @@ +package main + +import ( + "fmt" + "os" + "os/exec" + "strings" + "syscall" + + "github.com/onkernel/hypeman/lib/vmconfig" +) + +// runExecMode runs the container in exec mode (default). +// This is the Docker-like behavior where: +// - The init binary remains PID 1 +// - Guest-agent runs as a background process +// - The container entrypoint runs as a child process +// - After entrypoint exits, guest-agent keeps VM alive +func runExecMode(log *Logger, cfg *vmconfig.Config) { + const newroot = "/overlay/newroot" + + // Change root to the new filesystem using chroot (consistent with systemd mode) + log.Info("exec", "executing chroot") + if err := syscall.Chroot(newroot); err != nil { + log.Error("exec", "chroot failed", err) + dropToShell() + } + + // Change to new root directory + if err := os.Chdir("/"); err != nil { + log.Error("exec", "chdir / failed", err) + dropToShell() + } + + // Set up environment + os.Setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin") + os.Setenv("HOME", "/root") + + // Start guest-agent in background + log.Info("exec", "starting guest-agent in background") + agentCmd := exec.Command("/opt/hypeman/guest-agent") + agentCmd.Stdout = os.Stdout + agentCmd.Stderr = os.Stderr + if err := agentCmd.Start(); err != nil { + log.Error("exec", "failed to start guest-agent", err) + } + + // Build the entrypoint command + workdir := cfg.Workdir + if workdir == "" { + workdir = "/" + } + + // Shell-quote the entrypoint and cmd arrays for safe execution + entrypoint := shellQuoteArgs(cfg.Entrypoint) + cmd := 
shellQuoteArgs(cfg.Cmd) + + log.Info("exec", fmt.Sprintf("workdir=%s entrypoint=%v cmd=%v", workdir, cfg.Entrypoint, cfg.Cmd)) + + // Construct the shell command to run + shellCmd := fmt.Sprintf("cd %s && exec %s %s", shellQuote(workdir), entrypoint, cmd) + + log.Info("exec", "launching entrypoint") + + // Run the entrypoint without stdin (defaults to /dev/null). + // This matches the old shell script behavior where the app ran in background with & + // and couldn't read from stdin. Interactive shells like bash will see EOF and exit. + // Users interact with the VM via guest-agent exec, not the entrypoint's stdin. + appCmd := exec.Command("/bin/sh", "-c", shellCmd) + appCmd.Stdout = os.Stdout + appCmd.Stderr = os.Stderr + + // Set up environment for the app + appCmd.Env = buildEnv(cfg.Env) + + if err := appCmd.Start(); err != nil { + log.Error("exec", "failed to start entrypoint", err) + dropToShell() + } + + log.Info("exec", fmt.Sprintf("container app started (PID %d)", appCmd.Process.Pid)) + + // Wait for app to exit + err := appCmd.Wait() + exitCode := 0 + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + exitCode = exitErr.ExitCode() + } + } + + log.Info("exec", fmt.Sprintf("app exited with code %d", exitCode)) + + // Wait for guest-agent (keeps init alive, prevents kernel panic) + // The guest-agent runs forever, so this effectively keeps the VM alive + // until it's explicitly terminated + if agentCmd.Process != nil { + agentCmd.Wait() + } + + // Exit with the app's exit code + syscall.Exit(exitCode) +} + +// buildEnv constructs environment variables from the config. +// User-provided env vars take precedence over defaults. 
// buildEnv turns the configured environment map into a "KEY=value" list.
//
// Every entry of env is emitted (in map iteration order), followed by default
// PATH and HOME entries for whichever of the two env does not define. Values
// supplied in env therefore take precedence over the defaults.
func buildEnv(env map[string]string) []string {
	const (
		defaultPath = "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
		defaultHome = "HOME=/root"
	)

	out := make([]string, 0, len(env)+2)
	for name, value := range env {
		out = append(out, fmt.Sprintf("%s=%s", name, value))
	}

	if _, set := env["PATH"]; !set {
		out = append(out, defaultPath)
	}
	if _, set := env["HOME"]; !set {
		out = append(out, defaultHome)
	}
	return out
}

// shellQuote wraps s in single quotes for safe use in a shell command. Each
// embedded single quote is rewritten as '\'' (close quote, escaped quote,
// reopen quote).
func shellQuote(s string) string {
	pieces := strings.Split(s, "'")
	return "'" + strings.Join(pieces, `'\''`) + "'"
}

// shellQuoteArgs shell-quotes each argument and joins the results with single
// spaces. An empty or nil list yields the empty string.
func shellQuoteArgs(args []string) string {
	var b strings.Builder
	for i, arg := range args {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(shellQuote(arg))
	}
	return b.String()
}
Execs the image's entrypoint/cmd (systemd) which becomes the new PID 1 +func runSystemdMode(log *Logger, cfg *vmconfig.Config) { + const newroot = "/overlay/newroot" + + // Inject hypeman-agent.service + log.Info("systemd", "injecting hypeman-agent.service") + if err := injectAgentService(newroot); err != nil { + log.Error("systemd", "failed to inject service", err) + // Continue anyway - VM will work, just without agent + } + + // Change root to the new filesystem using chroot + log.Info("systemd", "executing chroot") + if err := syscall.Chroot(newroot); err != nil { + log.Error("systemd", "chroot failed", err) + dropToShell() + } + + // Change to new root directory + if err := os.Chdir("/"); err != nil { + log.Error("systemd", "chdir / failed", err) + dropToShell() + } + + // Build effective command from entrypoint + cmd + argv := append(cfg.Entrypoint, cfg.Cmd...) + if len(argv) == 0 { + // Fallback to /sbin/init if no command specified + argv = []string{"/sbin/init"} + } + + // Exec systemd - this replaces the current process + log.Info("systemd", fmt.Sprintf("exec %v", argv)) + + // syscall.Exec replaces the current process with the new one + // Use buildEnv to include user's environment variables from the image/instance config + err := syscall.Exec(argv[0], argv, buildEnv(cfg.Env)) + if err != nil { + log.Error("systemd", fmt.Sprintf("exec %s failed", argv[0]), err) + dropToShell() + } +} + +// injectAgentService creates the systemd service unit for the hypeman guest-agent. 
// injectAgentService installs and enables the systemd unit for the hypeman
// guest-agent inside the root filesystem at newroot.
//
// It writes etc/systemd/system/hypeman-agent.service and enables it with a
// relative symlink in etc/systemd/system/multi-user.target.wants.
//
// The call is idempotent. The unit file is overwritten and an existing enable
// link is replaced, so running it against a root that already contains the
// link (for example one left in the writable overlay by an earlier boot) does
// not fail with "file exists".
//
// It returns the first error from creating the directories, writing the unit,
// removing a stale link, or creating the link.
func injectAgentService(newroot string) error {
	serviceContent := `[Unit]
Description=Hypeman Guest Agent
After=network.target
Wants=network.target

[Service]
Type=simple
ExecStart=/opt/hypeman/guest-agent
Restart=always
RestartSec=3
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=multi-user.target
`

	serviceDir := newroot + "/etc/systemd/system"
	wantsDir := serviceDir + "/multi-user.target.wants"

	// MkdirAll on wantsDir would also create serviceDir, but both calls are
	// kept so a failure names the directory that could not be created.
	if err := os.MkdirAll(serviceDir, 0755); err != nil {
		return err
	}
	if err := os.MkdirAll(wantsDir, 0755); err != nil {
		return err
	}

	// Write service file.
	servicePath := serviceDir + "/hypeman-agent.service"
	if err := os.WriteFile(servicePath, []byte(serviceContent), 0644); err != nil {
		return err
	}

	// Enable the service with a relative symlink in the wants directory.
	// os.Symlink refuses to overwrite, so drop any stale link first.
	symlinkPath := wantsDir + "/hypeman-agent.service"
	if err := os.Remove(symlinkPath); err != nil && !os.IsNotExist(err) {
		return err
	}
	return os.Symlink("../hypeman-agent.service", symlinkPath)
}
+// This function mounts: +// - /dev/pts (pseudo-terminals) +// - /dev/shm (shared memory) +func mountEssentials(log *Logger) error { + // Create mount points for pts and shm (proc/sys/dev already exist from wrapper) + for _, dir := range []string{"/dev/pts", "/dev/shm"} { + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("mkdir %s: %w", dir, err) + } + } + + // Mount devpts for PTY support (needed for guest-agent and interactive shells) + if err := syscall.Mount("devpts", "/dev/pts", "devpts", 0, ""); err != nil { + return fmt.Errorf("mount /dev/pts: %w", err) + } + + // Set permissions on /dev/shm + if err := os.Chmod("/dev/shm", 01777); err != nil { + return fmt.Errorf("chmod /dev/shm: %w", err) + } + + log.Info("mount", "mounted devpts/shm") + + // Set up serial console now that /dev is mounted + // ttyS0 for x86_64, ttyAMA0 for ARM64 (PL011 UART) + if _, err := os.Stat("/dev/ttyAMA0"); err == nil { + log.SetConsole("/dev/ttyAMA0") + redirectToConsole("/dev/ttyAMA0") + } else if _, err := os.Stat("/dev/ttyS0"); err == nil { + log.SetConsole("/dev/ttyS0") + redirectToConsole("/dev/ttyS0") + } + + log.Info("mount", "redirected to serial console") + + return nil +} + +// setupOverlay sets up the overlay filesystem: +// - /dev/vda: readonly rootfs (ext4) +// - /dev/vdb: writable overlay disk (ext4) +// - /overlay/newroot: merged overlay filesystem +func setupOverlay(log *Logger) error { + // Wait for block devices to be ready + time.Sleep(500 * time.Millisecond) + + // Create mount points + for _, dir := range []string{"/lower", "/overlay"} { + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("mkdir %s: %w", dir, err) + } + } + + // Mount readonly rootfs from /dev/vda (ext4 filesystem) + if err := mount("/dev/vda", "/lower", "ext4", "ro"); err != nil { + return fmt.Errorf("mount rootfs: %w", err) + } + log.Info("overlay", "mounted rootfs from /dev/vda") + + // Mount writable overlay disk from /dev/vdb + if err := 
mount("/dev/vdb", "/overlay", "ext4", ""); err != nil { + return fmt.Errorf("mount overlay disk: %w", err) + } + + // Create overlay directories + for _, dir := range []string{"/overlay/upper", "/overlay/work", "/overlay/newroot"} { + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("mkdir %s: %w", dir, err) + } + } + log.Info("overlay", "mounted overlay disk from /dev/vdb") + + // Create overlay filesystem + if err := mountOverlay("/lower", "/overlay/upper", "/overlay/work", "/overlay/newroot"); err != nil { + return fmt.Errorf("mount overlay: %w", err) + } + log.Info("overlay", "created overlay filesystem") + + return nil +} + +// bindMountsToNewRoot bind-mounts essential filesystems to the new root. +// Uses bind mounts instead of move so that the original /dev remains populated +// for processes running in the initrd namespace. +func bindMountsToNewRoot(log *Logger) error { + newroot := "/overlay/newroot" + + // Create mount points in new root + for _, dir := range []string{"proc", "sys", "dev", "dev/pts"} { + if err := os.MkdirAll(newroot+"/"+dir, 0755); err != nil { + return fmt.Errorf("mkdir %s: %w", dir, err) + } + } + + // Bind mount filesystems + mounts := []struct{ src, dst string }{ + {"/proc", newroot + "/proc"}, + {"/sys", newroot + "/sys"}, + {"/dev", newroot + "/dev"}, + {"/dev/pts", newroot + "/dev/pts"}, + } + + for _, m := range mounts { + if err := bindMount(m.src, m.dst); err != nil { + return fmt.Errorf("bind mount %s: %w", m.src, err) + } + } + + log.Info("bind", "bound mounts to new root") + + // Set up /dev symlinks for process substitution inside the container + symlinks := []struct{ target, link string }{ + {"/proc/self/fd", newroot + "/dev/fd"}, + {"/proc/self/fd/0", newroot + "/dev/stdin"}, + {"/proc/self/fd/1", newroot + "/dev/stdout"}, + {"/proc/self/fd/2", newroot + "/dev/stderr"}, + } + + for _, s := range symlinks { + os.Remove(s.link) // Remove if exists + os.Symlink(s.target, s.link) + } + + return nil +} + +// 
mount executes a mount command
+// by running /bin/mount -t fstype [-o options] source target.
+// The -o flag is omitted when options is empty. On failure the returned error
+// wraps the exec error and carries mount's combined output.
+func mount(source, target, fstype, options string) error {
+	args := []string{"-t", fstype}
+	if options != "" {
+		args = append(args, "-o", options)
+	}
+	return runMount(append(args, source, target)...)
+}
+
+// mountOverlay creates an overlay filesystem
+// at target from the given lower, upper and work directories.
+func mountOverlay(lower, upper, work, target string) error {
+	options := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lower, upper, work)
+	return runMount("-t", "overlay", "-o", options, "overlay", target)
+}
+
+// bindMount performs a bind mount
+// of source onto target.
+func bindMount(source, target string) error {
+	return runMount("--bind", source, target)
+}
+
+// runMount runs /bin/mount with args. On failure it returns the exec error
+// wrapped with %w (so callers can inspect it with errors.As) followed by the
+// command's combined stdout/stderr.
+func runMount(args ...string) error {
+	if output, err := exec.Command("/bin/mount", args...).CombinedOutput(); err != nil {
+		return fmt.Errorf("%w: %s", err, output)
+	}
+	return nil
+}
+
+// redirectToConsole redirects stdout/stderr to the serial console
+// by pointing os.Stdout and os.Stderr at device, opened write-only.
+// If the device cannot be opened the function returns silently and both
+// streams are left unchanged. The file is never closed here because it
+// stays installed as the process's stdout/stderr.
+func redirectToConsole(device string) {
+	f, err := os.OpenFile(device, os.O_WRONLY, 0)
+	if err != nil {
+		return
+	}
+	os.Stdout = f
+	os.Stderr = f
+}
+
+// copyGuestAgent copies the guest-agent binary to the target location in the new root.
+func copyGuestAgent(log *Logger) error {
+	const (
+		agentInInitrd = "/usr/local/bin/guest-agent"
+		targetDir     = "/overlay/newroot/opt/hypeman"
+		agentInRoot   = "/overlay/newroot/opt/hypeman/guest-agent"
+	)
+
+	// The destination directory may not exist in the image yet.
+	err := os.MkdirAll(targetDir, 0755)
+	if err != nil {
+		return fmt.Errorf("mkdir: %w", err)
+	}
+
+	// Load the whole binary from the initrd, then write it out executable.
+	binary, err := os.ReadFile(agentInInitrd)
+	if err != nil {
+		return fmt.Errorf("read source: %w", err)
+	}
+
+	err = os.WriteFile(agentInRoot, binary, 0755)
+	if err != nil {
+		return fmt.Errorf("write destination: %w", err)
+	}
+
+	log.Info("agent", "copied guest-agent to /opt/hypeman/")
+	return nil
+}
+
diff --git a/lib/system/init/network.go b/lib/system/init/network.go
new file mode 100644
index 00000000..76df8254
--- /dev/null
+++ b/lib/system/init/network.go
@@ -0,0 +1,59 @@
+package main
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+
+	"github.com/onkernel/hypeman/lib/vmconfig"
+)
+
+// configureNetwork sets up networking in the guest VM.
+// This is done from the initrd before pivot_root so it works for both exec and systemd modes.
+func configureNetwork(log *Logger, cfg *vmconfig.Config) error {
+	// Bring up loopback interface
+	if err := runIP("link", "set", "lo", "up"); err != nil {
+		return fmt.Errorf("bring up lo: %w", err)
+	}
+
+	// Add IP address to eth0 (GuestCIDR is the prefix length, e.g. 24)
+	addr := fmt.Sprintf("%s/%d", cfg.GuestIP, cfg.GuestCIDR)
+	if err := runIP("addr", "add", addr, "dev", "eth0"); err != nil {
+		return fmt.Errorf("add IP address: %w", err)
+	}
+
+	// Bring up eth0
+	if err := runIP("link", "set", "eth0", "up"); err != nil {
+		return fmt.Errorf("bring up eth0: %w", err)
+	}
+
+	// Add default route
+	if err := runIP("route", "add", "default", "via", cfg.GuestGW); err != nil {
+		return fmt.Errorf("add default route: %w", err)
+	}
+
+	// Configure DNS in the new root
+	resolvConf := fmt.Sprintf("nameserver %s\n", cfg.GuestDNS)
+	resolvPath := "/overlay/newroot/etc/resolv.conf"
+
+	// Ensure /etc exists
+	if err := os.MkdirAll("/overlay/newroot/etc", 0755); err != nil {
+		return fmt.Errorf("mkdir /etc: %w", err)
+	}
+
+	// Some images ship /etc/resolv.conf as a symlink (for example into /run).
+	// os.WriteFile follows symlinks, and at this point (before pivot_root) the
+	// target either does not exist yet or resolves outside the new root, so
+	// replace the link with a regular file instead of writing through it.
+	if fi, err := os.Lstat(resolvPath); err == nil && fi.Mode()&os.ModeSymlink != 0 {
+		if err := os.Remove(resolvPath); err != nil {
+			return fmt.Errorf("remove resolv.conf symlink: %w", err)
+		}
+	}
+
+	if err := os.WriteFile(resolvPath, []byte(resolvConf), 0644); err != nil {
+		return fmt.Errorf("write resolv.conf: %w", err)
+	}
+
+	log.Info("network", fmt.Sprintf("configured eth0 with %s", addr))
+	return nil
+}
+
+// runIP executes an 'ip' command with the given arguments.
+// On failure the returned error wraps the exec error (so callers can use
+// errors.As on it) followed by the command's combined stdout/stderr.
+func runIP(args ...string) error {
+	cmd := exec.Command("/sbin/ip", args...)
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("%w: %s", err, output)
+	}
+	return nil
+}
diff --git a/lib/system/init/volumes.go b/lib/system/init/volumes.go
new file mode 100644
index 00000000..278c1dbd
--- /dev/null
+++ b/lib/system/init/volumes.go
@@ -0,0 +1,113 @@
+package main
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+
+	"github.com/onkernel/hypeman/lib/vmconfig"
+)
+
+// mountVolumes mounts attached volumes according to the configuration.
+// Supports three modes: ro (read-only), rw (read-write), and overlay.
+func mountVolumes(log *Logger, cfg *vmconfig.Config) error {
+	log.Info("volumes", "mounting volumes")
+
+	for _, vol := range cfg.VolumeMounts {
+		target := filepath.Join("/overlay/newroot", vol.Path)
+
+		// The mount point must exist inside the new root; skip this volume if
+		// it cannot be created, but keep going with the others.
+		if mkErr := os.MkdirAll(target, 0755); mkErr != nil {
+			log.Error("volumes", fmt.Sprintf("mkdir %s failed", vol.Path), mkErr)
+			continue
+		}
+
+		// Select the mounter and its failure message for this mode. Any mode
+		// other than "overlay" or "ro" falls back to read-write.
+		failFmt, mounter := "mount rw %s failed", mountVolumeReadWrite
+		switch vol.Mode {
+		case "overlay":
+			failFmt, mounter = "mount overlay %s failed", mountVolumeOverlay
+		case "ro":
+			failFmt, mounter = "mount ro %s failed", mountVolumeReadOnly
+		}
+
+		// Mount failures are logged, never returned: one bad volume does not
+		// stop the remaining mounts.
+		if mountErr := mounter(log, vol, target); mountErr != nil {
+			log.Error("volumes", fmt.Sprintf(failFmt, vol.Path), mountErr)
+		}
+	}
+
+	return nil
+}
+
+// mountVolumeOverlay mounts a volume in overlay mode.
+// Uses the base device as read-only lower layer and overlay device for writable upper layer.
+func mountVolumeOverlay(log *Logger, vol vmconfig.VolumeMount, mountPath string) error {
+	// Overlay mode needs a writable device for the upper layer; fail early with
+	// a clear message instead of letting mount fail on an empty device path.
+	if vol.OverlayDevice == "" {
+		return fmt.Errorf("overlay mode for %s requires an overlay device", vol.Device)
+	}
+
+	// Use device name for unique mount points (e.g., "vdd" from "/dev/vdd")
+	// This avoids collisions when multiple volumes have the same basename
+	deviceName := filepath.Base(vol.Device)
+	baseMount := fmt.Sprintf("/mnt/vol-base-%s", deviceName)
+	overlayMount := fmt.Sprintf("/mnt/vol-overlay-%s", deviceName)
+
+	// Create mount points
+	for _, dir := range []string{baseMount, overlayMount} {
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			return fmt.Errorf("mkdir %s: %w", dir, err)
+		}
+	}
+
+	// Mount base volume read-only (noload to skip journal recovery)
+	cmd := exec.Command("/bin/mount", "-t", "ext4", "-o", "ro,noload", vol.Device, baseMount)
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("mount base: %w: %s", err, output)
+	}
+
+	// Mount overlay disk (writable)
+	cmd = exec.Command("/bin/mount", "-t", "ext4", vol.OverlayDevice, overlayMount)
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("mount overlay disk: %w: %s", err, output)
+	}
+
+	// Create overlay directories on the writable disk. Report a failure here
+	// directly rather than letting the overlay mount below fail on a missing
+	// upperdir/workdir.
+	upperDir := filepath.Join(overlayMount, "upper")
+	workDir := filepath.Join(overlayMount, "work")
+	for _, dir := range []string{upperDir, workDir} {
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			return fmt.Errorf("mkdir %s: %w", dir, err)
+		}
+	}
+
+	// Create overlayfs
+	options := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", baseMount, upperDir, workDir)
+	cmd = exec.Command("/bin/mount", "-t", "overlay", "-o", options, "overlay", mountPath)
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("mount overlay: %w: %s", err, output)
+	}
+
+	log.Info("volumes", fmt.Sprintf("mounted %s at %s (overlay via %s)", vol.Device, vol.Path, vol.OverlayDevice))
+	return nil
+}
+
+// mountVolumeReadOnly mounts a volume in read-only mode.
+func mountVolumeReadOnly(log *Logger, vol vmconfig.VolumeMount, mountPath string) error {
+	// Use noload to skip journal recovery for multi-attach safety
+	cmd := exec.Command("/bin/mount", "-t", "ext4", "-o", "ro,noload", vol.Device, mountPath)
+	if output, err := cmd.CombinedOutput(); err != nil {
+		// Wrap the exec error so callers can inspect it with errors.As.
+		return fmt.Errorf("%w: %s", err, output)
+	}
+
+	log.Info("volumes", fmt.Sprintf("mounted %s at %s (ro)", vol.Device, vol.Path))
+	return nil
+}
+
+// mountVolumeReadWrite mounts a volume in read-write mode.
+// It mounts vol.Device as ext4 at mountPath; on failure the returned error
+// wraps the exec error and carries mount's combined output.
+func mountVolumeReadWrite(log *Logger, vol vmconfig.VolumeMount, mountPath string) error {
+	cmd := exec.Command("/bin/mount", "-t", "ext4", vol.Device, mountPath)
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("%w: %s", err, output)
+	}
+
+	log.Info("volumes", fmt.Sprintf("mounted %s at %s (rw)", vol.Device, vol.Path))
+	return nil
+}
+
diff --git a/lib/system/init_binary.go b/lib/system/init_binary.go
new file mode 100644
index 00000000..ad378a67
--- /dev/null
+++ b/lib/system/init_binary.go
@@ -0,0 +1,11 @@
+package system
+
+import _ "embed"
+
+// InitBinary contains the embedded init binary for guest VMs.
+// This is built by the Makefile before the main binary is compiled.
+// The init binary is a statically-linked Go program that runs as PID 1 in the guest VM.
+// It matches the architecture of the host (VMs run on the same arch as the host).
+//
+//go:embed init/init
+var InitBinary []byte
diff --git a/lib/system/init_script.go b/lib/system/init_script.go
deleted file mode 100644
index 9e9b397a..00000000
--- a/lib/system/init_script.go
+++ /dev/null
@@ -1,294 +0,0 @@
-package system
-
-// GenerateInitScript returns the comprehensive init script for initrd
-// This consolidates ALL init logic - no modifications to OCI images needed
-//
-// The script:
-// 1. Mounts essential filesystems (proc, sys, dev)
-// 2. Sets up overlay filesystem (lowerdir=rootfs, upperdir=overlay disk)
-// 3. 
Mounts and sources config disk (/dev/vdc) -// 4. Loads NVIDIA kernel modules (if HAS_GPU=1 in config.sh) -// 5. Configures networking (if enabled) -// 6. Executes container entrypoint -// -// GPU support: When HAS_GPU=1 is set in the instance's config.sh, the init script -// will load NVIDIA kernel modules before launching the container entrypoint. -func GenerateInitScript() string { - return `#!/bin/sh -set -xe - -echo "overlay-init: START" > /dev/kmsg - -# Create mount points -mkdir -p /proc /sys /dev - -# Mount essential filesystems -# devtmpfs handles /dev population (null, zero, vsock, etc.) automatically -mount -t proc none /proc -mount -t sysfs none /sys -mount -t devtmpfs none /dev - -# Setup PTY support (needed for guest-agent and interactive shells) -mkdir -p /dev/pts /dev/shm -mount -t devpts devpts /dev/pts -chmod 1777 /dev/shm - -echo "overlay-init: mounted proc/sys/dev" > /dev/kmsg - -# Redirect all output to serial console -# ttyS0 for x86_64, ttyAMA0 for ARM64 (PL011 UART) -if [ -e /dev/ttyAMA0 ]; then - exec >/dev/ttyAMA0 2>&1 -else - exec >/dev/ttyS0 2>&1 -fi - -echo "overlay-init: redirected to serial console" - -# Wait for block devices to be ready -sleep 0.5 - -# Mount readonly rootfs from /dev/vda (ext4 filesystem) -mkdir -p /lower -mount -t ext4 -o ro /dev/vda /lower -echo "overlay-init: mounted rootfs from /dev/vda" - -# Mount writable overlay disk from /dev/vdb -mkdir -p /overlay -mount -t ext4 /dev/vdb /overlay -mkdir -p /overlay/upper /overlay/work /overlay/newroot -echo "overlay-init: mounted overlay disk from /dev/vdb" - -# Create overlay filesystem -mount -t overlay \ - -o lowerdir=/lower,upperdir=/overlay/upper,workdir=/overlay/work \ - overlay /overlay/newroot -echo "overlay-init: created overlay filesystem" - -# Mount config disk (/dev/vdc) -mkdir -p /mnt/config -mount -o ro /dev/vdc /mnt/config -echo "overlay-init: mounted config disk" - -# Source configuration -if [ -f /mnt/config/config.sh ]; then - . 
/mnt/config/config.sh - echo "overlay-init: sourced config" -else - echo "overlay-init: ERROR - config.sh not found!" - /bin/sh -i - exit 1 -fi - -# Load NVIDIA kernel modules for GPU passthrough (if HAS_GPU=1) -if [ "${HAS_GPU:-0}" = "1" ]; then - echo "overlay-init: loading NVIDIA kernel modules for GPU passthrough" - if [ -d /lib/modules ]; then - # Find the kernel version directory - KVER=$(ls /lib/modules/ 2>/dev/null | head -1) - if [ -n "$KVER" ] && [ -d "/lib/modules/$KVER/kernel/drivers/gpu" ]; then - # Load modules in order (dependencies first) - insmod /lib/modules/$KVER/kernel/drivers/gpu/nvidia.ko 2>&1 || echo "overlay-init: nvidia.ko load failed" - insmod /lib/modules/$KVER/kernel/drivers/gpu/nvidia-uvm.ko 2>&1 || echo "overlay-init: nvidia-uvm.ko load failed" - insmod /lib/modules/$KVER/kernel/drivers/gpu/nvidia-modeset.ko 2>&1 || echo "overlay-init: nvidia-modeset.ko load failed" - insmod /lib/modules/$KVER/kernel/drivers/gpu/nvidia-drm.ko modeset=1 2>&1 || echo "overlay-init: nvidia-drm.ko load failed" - echo "overlay-init: NVIDIA modules loaded for kernel $KVER" - - # Use nvidia-modprobe to create device nodes with correct major/minor numbers. - # nvidia-modprobe is the official NVIDIA utility that: - # 1. Loads kernel modules if needed (already done above) - # 2. Creates /dev/nvidiactl and /dev/nvidia0 with correct permissions - # 3. 
Creates /dev/nvidia-uvm and /dev/nvidia-uvm-tools - if [ -x /usr/bin/nvidia-modprobe ]; then - echo "overlay-init: running nvidia-modprobe to create device nodes" - /usr/bin/nvidia-modprobe 2>&1 || echo "overlay-init: nvidia-modprobe failed" - /usr/bin/nvidia-modprobe -u -c=0 2>&1 || echo "overlay-init: nvidia-modprobe -u failed" - echo "overlay-init: nvidia-modprobe completed" - ls -la /dev/nvidia* 2>/dev/null || true - else - echo "overlay-init: nvidia-modprobe not found, falling back to manual mknod" - # Fallback: Manual device node creation - NVIDIA_MAJOR=$(awk '/nvidia-frontend|^[0-9]+ nvidia$/ {print $1}' /proc/devices 2>/dev/null | head -1) - NVIDIA_UVM_MAJOR=$(awk '/nvidia-uvm/ {print $1}' /proc/devices 2>/dev/null) - - if [ -n "$NVIDIA_MAJOR" ]; then - mknod -m 666 /dev/nvidiactl c $NVIDIA_MAJOR 255 - mknod -m 666 /dev/nvidia0 c $NVIDIA_MAJOR 0 - echo "overlay-init: created /dev/nvidiactl and /dev/nvidia0 (major $NVIDIA_MAJOR)" - fi - - if [ -n "$NVIDIA_UVM_MAJOR" ]; then - mknod -m 666 /dev/nvidia-uvm c $NVIDIA_UVM_MAJOR 0 - mknod -m 666 /dev/nvidia-uvm-tools c $NVIDIA_UVM_MAJOR 1 - echo "overlay-init: created /dev/nvidia-uvm* (major $NVIDIA_UVM_MAJOR)" - fi - fi - else - echo "overlay-init: NVIDIA modules not found in /lib/modules/$KVER" - fi - else - echo "overlay-init: /lib/modules not found, skipping NVIDIA module loading" - fi - - # Inject NVIDIA userspace driver libraries into container rootfs - # This allows containers to use standard CUDA images without bundled drivers - # See lib/devices/GPU.md for documentation - if [ -d /usr/lib/nvidia ]; then - echo "overlay-init: injecting NVIDIA driver libraries into container" - - DRIVER_VERSION=$(cat /usr/lib/nvidia/version 2>/dev/null || echo "unknown") - # Determine library path based on architecture - if [ "$(uname -m)" = "aarch64" ]; then - LIB_DST="/overlay/newroot/usr/lib/aarch64-linux-gnu" - else - LIB_DST="/overlay/newroot/usr/lib/x86_64-linux-gnu" - fi - BIN_DST="/overlay/newroot/usr/bin" - - 
mkdir -p "$LIB_DST" "$BIN_DST" - - # Copy all driver libraries and create symlinks - for lib in /usr/lib/nvidia/*.so.*; do - if [ -f "$lib" ]; then - libname=$(basename "$lib") - cp "$lib" "$LIB_DST/" - - # Create standard symlinks: libfoo.so.VERSION -> libfoo.so.1 -> libfoo.so - base=$(echo "$libname" | sed 's/\.so\..*//') - ln -sf "$libname" "$LIB_DST/${base}.so.1" 2>/dev/null || true - ln -sf "${base}.so.1" "$LIB_DST/${base}.so" 2>/dev/null || true - fi - done - - # Copy nvidia-smi and nvidia-modprobe binaries - for bin in nvidia-smi nvidia-modprobe; do - if [ -x /usr/bin/$bin ]; then - cp /usr/bin/$bin "$BIN_DST/" - fi - done - - # Update ldconfig cache so applications can find the libraries - chroot /overlay/newroot ldconfig 2>/dev/null || true - - echo "overlay-init: NVIDIA driver libraries injected (version: $DRIVER_VERSION)" - fi -fi - -# Mount attached volumes (from config: VOLUME_MOUNTS="device:path:mode[:overlay_device] ...") -# Modes: ro (read-only), rw (read-write), overlay (base ro + per-instance overlay) -if [ -n "${VOLUME_MOUNTS:-}" ]; then - echo "overlay-init: mounting volumes" - for vol in $VOLUME_MOUNTS; do - device=$(echo "$vol" | cut -d: -f1) - path=$(echo "$vol" | cut -d: -f2) - mode=$(echo "$vol" | cut -d: -f3) - - # Create mount point in overlay - mkdir -p "/overlay/newroot${path}" - - if [ "$mode" = "overlay" ]; then - # Overlay mode: mount base read-only, create overlayfs with per-instance writable layer - overlay_device=$(echo "$vol" | cut -d: -f4) - - # Create temp mount points for base and overlay disk. - # These persist for the lifetime of the VM but are NOT leaked - they exist inside - # the ephemeral guest rootfs (which is itself an overlayfs) and are destroyed - # when the VM terminates along with all guest state. 
- base_mount="/mnt/vol-base-$(basename "$path")" - overlay_mount="/mnt/vol-overlay-$(basename "$path")" - mkdir -p "$base_mount" "$overlay_mount" - - # Mount base volume read-only (noload to skip journal recovery) - mount -t ext4 -o ro,noload "$device" "$base_mount" - - # Mount overlay disk (writable) - mount -t ext4 "$overlay_device" "$overlay_mount" - mkdir -p "$overlay_mount/upper" "$overlay_mount/work" - - # Create overlayfs combining base (lower) and instance overlay (upper) - mount -t overlay \ - -o "lowerdir=$base_mount,upperdir=$overlay_mount/upper,workdir=$overlay_mount/work" \ - overlay "/overlay/newroot${path}" - - echo "overlay-init: mounted volume $device at $path (overlay via $overlay_device)" - elif [ "$mode" = "ro" ]; then - # Read-only mount (noload to skip journal recovery for multi-attach safety) - mount -t ext4 -o ro,noload "$device" "/overlay/newroot${path}" - echo "overlay-init: mounted volume $device at $path (ro)" - else - # Read-write mount - mount -t ext4 "$device" "/overlay/newroot${path}" - echo "overlay-init: mounted volume $device at $path (rw)" - fi - done -fi - -# Prepare new root mount points -# We use bind mounts instead of move so that the original /dev remains populated -# for processes running in the initrd namespace (like guest-agent). 
-mkdir -p /overlay/newroot/proc -mkdir -p /overlay/newroot/sys -mkdir -p /overlay/newroot/dev -mkdir -p /overlay/newroot/dev/pts - -mount --bind /proc /overlay/newroot/proc -mount --bind /sys /overlay/newroot/sys -mount --bind /dev /overlay/newroot/dev -mount --bind /dev/pts /overlay/newroot/dev/pts - -echo "overlay-init: bound mounts to new root" - -# Set up /dev symlinks for process substitution inside the container -chroot /overlay/newroot ln -sf /proc/self/fd /dev/fd 2>/dev/null || true -chroot /overlay/newroot ln -sf /proc/self/fd/0 /dev/stdin 2>/dev/null || true -chroot /overlay/newroot ln -sf /proc/self/fd/1 /dev/stdout 2>/dev/null || true -chroot /overlay/newroot ln -sf /proc/self/fd/2 /dev/stderr 2>/dev/null || true - -# Configure network from initrd (using busybox ip, not container's ip) -# Network interfaces are shared, so we can configure them from here -if [ -n "${GUEST_IP:-}" ]; then - echo "overlay-init: configuring network" - ip link set lo up - ip addr add ${GUEST_IP}/${GUEST_CIDR} dev eth0 - ip link set eth0 up - ip route add default via ${GUEST_GW} - echo "nameserver ${GUEST_DNS}" > /overlay/newroot/etc/resolv.conf - echo "overlay-init: network configured - IP: ${GUEST_IP}/${GUEST_CIDR}" -fi - -# Set PATH for initrd tools -export PATH='/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin' -export HOME='/root' - -# Copy guest-agent into container rootfs and start it in container namespace -# This way the PTY and shell run in the same namespace, fixing signal handling -echo "overlay-init: copying guest-agent to container" -mkdir -p /overlay/newroot/usr/local/bin -cp /usr/local/bin/guest-agent /overlay/newroot/usr/local/bin/guest-agent - -# Start vsock guest agent inside the container namespace -echo "overlay-init: starting guest agent in container namespace" -chroot /overlay/newroot /usr/local/bin/guest-agent & - -echo "overlay-init: launching entrypoint" -echo "overlay-init: workdir=${WORKDIR:-/} entrypoint=${ENTRYPOINT} cmd=${CMD}" - -set 
+e - -# Construct the command string carefully -# ENTRYPOINT and CMD are shell-safe quoted strings from config.sh -eval "chroot /overlay/newroot /bin/sh -c \"cd ${WORKDIR:-/} && exec ${ENTRYPOINT} ${CMD}\"" & -APP_PID=$! - -echo "overlay-init: container app started (PID $APP_PID)" - -# Wait for app to exit -wait $APP_PID -APP_EXIT=$? - -echo "overlay-init: app exited with code $APP_EXIT" - -# Wait for all background jobs (guest-agent runs forever, keeping init alive) -# This prevents kernel panic from killing init (PID 1) -wait` -} diff --git a/lib/system/init_wrapper.go b/lib/system/init_wrapper.go new file mode 100644 index 00000000..16056f2d --- /dev/null +++ b/lib/system/init_wrapper.go @@ -0,0 +1,6 @@ +package system + +import _ "embed" + +//go:embed init/init.sh +var InitWrapper []byte diff --git a/lib/system/initrd.go b/lib/system/initrd.go index 3048b75b..567247d2 100644 --- a/lib/system/initrd.go +++ b/lib/system/initrd.go @@ -20,7 +20,7 @@ import ( const alpineBaseImage = "alpine:3.22" -// buildInitrd builds initrd from Alpine base + embedded guest-agent + generated init script +// buildInitrd builds initrd from Alpine base + embedded guest-agent + embedded init binary func (m *manager) buildInitrd(ctx context.Context, arch string) (string, error) { // Create temp directory for building tempDir, err := os.MkdirTemp("", "hypeman-initrd-*") @@ -67,11 +67,17 @@ func (m *manager) buildInitrd(ctx context.Context, arch string) (string, error) log.InfoContext(ctx, "skipping NVIDIA modules", "error", err) } - // Write generated init script - initScript := GenerateInitScript() - initPath := filepath.Join(rootfsDir, "init") - if err := os.WriteFile(initPath, []byte(initScript), 0755); err != nil { - return "", fmt.Errorf("write init script: %w", err) + // Write shell wrapper as /init (sets up /proc, /sys, /dev before Go runtime) + // The Go runtime needs these filesystems during initialization + initWrapperPath := filepath.Join(rootfsDir, "init") + if err := 
os.WriteFile(initWrapperPath, InitWrapper, 0755); err != nil { + return "", fmt.Errorf("write init wrapper: %w", err) + } + + // Write Go init binary as /init.bin (called by wrapper after setup) + initBinPath := filepath.Join(rootfsDir, "init.bin") + if err := os.WriteFile(initBinPath, InitBinary, 0755); err != nil { + return "", fmt.Errorf("write init binary: %w", err) } // Generate timestamp for this build @@ -89,7 +95,7 @@ func (m *manager) buildInitrd(ctx context.Context, arch string) (string, error) // Store hash for staleness detection hashPath := filepath.Join(filepath.Dir(outputPath), ".hash") - currentHash := computeInitrdHash() + currentHash := computeInitrdHash(arch) if err := os.WriteFile(hashPath, []byte(currentHash), 0644); err != nil { return "", fmt.Errorf("write hash file: %w", err) } @@ -117,7 +123,7 @@ func (m *manager) ensureInitrd(ctx context.Context) (string, error) { initrdPath := m.paths.SystemInitrdTimestamp(target, arch) if _, err := os.Stat(initrdPath); err == nil { // File exists, check if it's stale by comparing embedded binary hash - if !m.isInitrdStale(initrdPath) { + if !m.isInitrdStale(initrdPath, arch) { return initrdPath, nil } } @@ -133,7 +139,7 @@ func (m *manager) ensureInitrd(ctx context.Context) (string, error) { } // isInitrdStale checks if the initrd needs rebuilding by comparing hashes -func (m *manager) isInitrdStale(initrdPath string) bool { +func (m *manager) isInitrdStale(initrdPath, arch string) bool { // Read stored hash hashPath := filepath.Join(filepath.Dir(initrdPath), ".hash") storedHash, err := os.ReadFile(hashPath) @@ -143,22 +149,23 @@ func (m *manager) isInitrdStale(initrdPath string) bool { } // Compare with current hash - currentHash := computeInitrdHash() + currentHash := computeInitrdHash(arch) return string(storedHash) != currentHash } -// computeInitrdHash computes a hash of the embedded binary, init script, and NVIDIA assets -func computeInitrdHash() string { +// computeInitrdHash computes a hash of 
the embedded binaries and NVIDIA assets for a specific architecture
+func computeInitrdHash(arch string) string {
 	h := sha256.New()
 	h.Write(GuestAgentBinary)
-	h.Write([]byte(GenerateInitScript()))
+	h.Write(InitBinary)
+	h.Write(InitWrapper)
 	// Include NVIDIA driver version in hash so initrd is rebuilt when driver changes
 	if ver, ok := NvidiaDriverVersion[DefaultKernelVersion]; ok {
 		h.Write([]byte(ver))
 	}
 	// Include driver libs URL so initrd is rebuilt when the libs tarball changes
 	if archURLs, ok := NvidiaDriverLibURLs[DefaultKernelVersion]; ok {
-		if url, ok := archURLs["x86_64"]; ok {
+		if url, ok := archURLs[arch]; ok {
 			h.Write([]byte(url))
 		}
 	}
diff --git a/lib/system/manager_test.go b/lib/system/manager_test.go
index ce32df12..ea91ae6b 100644
--- a/lib/system/manager_test.go
+++ b/lib/system/manager_test.go
@@ -58,16 +58,9 @@ func TestEnsureSystemFiles(t *testing.T) {
 	require.NoError(t, err)
 }
 
-func TestInitScriptGeneration(t *testing.T) {
-	script := GenerateInitScript()
-
-	// Verify script contains essential components
-	assert.Contains(t, script, "#!/bin/sh")
-	assert.Contains(t, script, "mount -t overlay")
-	assert.Contains(t, script, "/dev/vda") // rootfs disk
-	assert.Contains(t, script, "/dev/vdb") // overlay disk
-	assert.Contains(t, script, "/dev/vdc") // config disk
-	assert.Contains(t, script, "guest-agent") // vsock guest agent
-	assert.Contains(t, script, "${ENTRYPOINT}")
-	assert.Contains(t, script, "wait $APP_PID") // Supervisor pattern
+func TestInitBinaryEmbedded(t *testing.T) {
+	// Verify the init binary is embedded and has a plausible size.
+	// A statically linked Go binary is expected to be around 1MB or more; the check below only enforces a conservative 100KB floor.
+	assert.NotEmpty(t, InitBinary, "init binary should be embedded")
+	assert.Greater(t, len(InitBinary), 100000, "init binary should be at least 100KB")
 }
diff --git a/lib/vmconfig/README.md b/lib/vmconfig/README.md
new file mode 100644
index 00000000..69da5370
--- /dev/null
+++ b/lib/vmconfig/README.md
@@ -0,0 +1,25 @@
+# VM Config
+
+Shared configuration schema for host-to-guest communication. + +## Purpose + +This package defines the `Config` struct that is: +- Serialized as JSON by the host when creating a VM's config disk +- Deserialized by the guest init binary during boot + +By defining this in a shared package, the host and guest code stay in sync without duplication. + +## Usage + +The host writes this config to `/config.json` on the config disk (attached as `/dev/vdc`). +The guest init binary mounts this disk and reads the JSON configuration. + +## Fields + +- **Entrypoint/Cmd/Workdir**: Container execution parameters from the OCI image +- **Env**: Environment variables (merged from image + instance overrides) +- **Network**: Guest IP, gateway, DNS configuration +- **GPU**: Whether GPU passthrough is enabled +- **VolumeMounts**: Block devices to mount inside the guest +- **InitMode**: Either "exec" (container-like) or "systemd" (full VM) diff --git a/lib/vmconfig/config.go b/lib/vmconfig/config.go new file mode 100644 index 00000000..db7c554f --- /dev/null +++ b/lib/vmconfig/config.go @@ -0,0 +1,39 @@ +// Package vmconfig defines the configuration schema passed from host to guest VM. +package vmconfig + +// Config is the configuration passed to the guest init binary via config.json. +// This struct is serialized by the host (lib/instances/configdisk.go) and +// deserialized by the guest init binary (lib/system/init). 
+type Config struct {
+	// Container execution parameters
+	// (per the package README these come from the OCI image).
+	Entrypoint []string `json:"entrypoint"`
+	Cmd        []string `json:"cmd"`
+	Workdir    string   `json:"workdir"`
+
+	// Environment variables
+	// (per the package README: merged from image + instance overrides).
+	Env map[string]string `json:"env"`
+
+	// Network configuration
+	// NetworkEnabled presumably gates guest network setup; verify against the init caller.
+	NetworkEnabled bool `json:"network_enabled"`
+	// GuestIP and GuestCIDR are combined as "<ip>/<cidr>" and assigned to eth0,
+	// so GuestCIDR is a prefix length (e.g. 24), not a netmask.
+	GuestIP   string `json:"guest_ip,omitempty"`
+	GuestCIDR int    `json:"guest_cidr,omitempty"`
+	// GuestGW is used as the gateway of the guest's default route.
+	GuestGW string `json:"guest_gw,omitempty"`
+	// GuestDNS is written as the single nameserver entry in the guest's /etc/resolv.conf.
+	GuestDNS string `json:"guest_dns,omitempty"`
+
+	// GPU passthrough
+	HasGPU bool `json:"has_gpu"`
+
+	// Volume mounts
+	VolumeMounts []VolumeMount `json:"volume_mounts,omitempty"`
+
+	// Init mode: "exec" (default) or "systemd"
+	InitMode string `json:"init_mode"`
+}
+
+// VolumeMount represents a volume mount configuration.
+type VolumeMount struct {
+	// Device is the block device to mount (e.g. "/dev/vdd"); the init binary mounts it as ext4.
+	Device string `json:"device"`
+	// Path is the mount point as seen inside the guest; init creates it under the new root.
+	Path string `json:"path"`
+	// Mode selects how init mounts the volume; any value other than "overlay" or "ro" is mounted read-write.
+	Mode string `json:"mode"` // "ro", "rw", or "overlay"
+	// OverlayDevice is the writable ext4 device that holds the upper/work directories in overlay mode.
+	OverlayDevice string `json:"overlay_device,omitempty"`
+}