From 4bff8232e95a384157b6c4fc759708129145b826 Mon Sep 17 00:00:00 2001 From: Harsh Rawat Date: Mon, 6 Apr 2026 10:40:13 +0530 Subject: [PATCH] [shimV2] adds the container spec builder Adds the new `internal/builder/container` package for container resource planning and spec construction. - adds core container builder types and resource planning logic - adds LCOW-specific implementations for devices, layers, mounts, reserve, and specs - adds package documentation and generated mocks used by tests - adds comprehensive unit tests across core and LCOW components Signed-off-by: Harsh Rawat --- internal/builder/container/lcow/devices.go | 68 +++ .../builder/container/lcow/devices_test.go | 265 ++++++++ internal/builder/container/lcow/doc.go | 39 ++ internal/builder/container/lcow/layers.go | 175 ++++++ .../builder/container/lcow/layers_test.go | 569 ++++++++++++++++++ internal/builder/container/lcow/mounts.go | 301 +++++++++ .../builder/container/lcow/mounts_test.go | 191 ++++++ internal/builder/container/lcow/reserve.go | 102 ++++ .../builder/container/lcow/reserve_test.go | 287 +++++++++ internal/builder/container/lcow/specs.go | 121 ++++ internal/builder/container/lcow/specs_test.go | 369 ++++++++++++ .../builder/container/mocks/mock_types.go | 184 ++++++ internal/builder/container/resource_plan.go | 74 +++ .../builder/container/resource_plan_test.go | 299 +++++++++ internal/builder/container/types.go | 50 ++ 15 files changed, 3094 insertions(+) create mode 100644 internal/builder/container/lcow/devices.go create mode 100644 internal/builder/container/lcow/devices_test.go create mode 100644 internal/builder/container/lcow/doc.go create mode 100644 internal/builder/container/lcow/layers.go create mode 100644 internal/builder/container/lcow/layers_test.go create mode 100644 internal/builder/container/lcow/mounts.go create mode 100644 internal/builder/container/lcow/mounts_test.go create mode 100644 internal/builder/container/lcow/reserve.go create mode 100644 
internal/builder/container/lcow/reserve_test.go create mode 100644 internal/builder/container/lcow/specs.go create mode 100644 internal/builder/container/lcow/specs_test.go create mode 100644 internal/builder/container/mocks/mock_types.go create mode 100644 internal/builder/container/resource_plan.go create mode 100644 internal/builder/container/resource_plan_test.go create mode 100644 internal/builder/container/types.go diff --git a/internal/builder/container/lcow/devices.go b/internal/builder/container/lcow/devices.go new file mode 100644 index 0000000000..3874782438 --- /dev/null +++ b/internal/builder/container/lcow/devices.go @@ -0,0 +1,68 @@ +//go:build windows && lcow + +package lcow + +import ( + "context" + "fmt" + + "github.com/Microsoft/hcsshim/internal/builder/container" + "github.com/Microsoft/hcsshim/internal/controller/device/vpci" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/logfields" + + "github.com/Microsoft/go-winio/pkg/guid" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" +) + +// reserveAndUpdateDevices reserves vPCI devices on the host and updates each +// device's ID in the spec to the resulting VMBus channel GUID. +// +// On partial failure the successfully reserved IDs are still returned so the +// caller's top-level cleanup can release them. +func reserveAndUpdateDevices( + ctx context.Context, + vpciReserver container.VPCIReserver, + specDevs []specs.WindowsDevice, +) ([]guid.GUID, error) { + log.G(ctx).WithField("devices", log.Format(ctx, specDevs)).Trace("reserving vPCI devices") + + var reservations []guid.GUID + + for deviceIdx := range specDevs { + device := &specDevs[deviceIdx] + + // Validate that the device type is supported before attempting reservation. 
+ if !vpci.IsValidDeviceType(device.IDType) { + return reservations, fmt.Errorf("reserve device %s: unsupported type %s", device.ID, device.IDType) + } + + // Parse the device path into a PCI ID and optional virtual function index. + pciID, virtualFunctionIndex := vpci.GetDeviceInfoFromPath(device.ID) + + // Reserve the device on the host and obtain the VMBus channel GUID. + vmBusGUID, err := vpciReserver.Reserve(ctx, vpci.Device{ + DeviceInstanceID: pciID, + VirtualFunctionIndex: virtualFunctionIndex, + }) + if err != nil { + return reservations, fmt.Errorf("reserve device %s: %w", device.ID, err) + } + + log.G(ctx).WithFields(logrus.Fields{ + logfields.DeviceID: pciID, + logfields.VFIndex: virtualFunctionIndex, + logfields.VMBusGUID: vmBusGUID.String(), + }).Trace("reserved vPCI device") + + // Update the spec entry so GCS references the VMBus GUID + // instead of the original device path. + device.ID = vmBusGUID.String() + reservations = append(reservations, vmBusGUID) + } + + log.G(ctx).Debug("all vPCI devices reserved successfully") + + return reservations, nil +} diff --git a/internal/builder/container/lcow/devices_test.go b/internal/builder/container/lcow/devices_test.go new file mode 100644 index 0000000000..d915e25695 --- /dev/null +++ b/internal/builder/container/lcow/devices_test.go @@ -0,0 +1,265 @@ +//go:build windows && lcow + +package lcow + +import ( + "errors" + "testing" + + "github.com/Microsoft/go-winio/pkg/guid" + "go.uber.org/mock/gomock" + + "github.com/Microsoft/hcsshim/internal/builder/container/mocks" + "github.com/Microsoft/hcsshim/internal/controller/device/vpci" + "github.com/opencontainers/runtime-spec/specs-go" +) + +// ───────────────────────────────────────────────────────────────────────────── +// Test helpers +// ───────────────────────────────────────────────────────────────────────────── + +// newGUID generates a random GUID and fails the test on error. 
+func newGUID(t *testing.T) guid.GUID { + t.Helper() + id, err := guid.NewV4() + if err != nil { + t.Fatalf("failed to generate GUID: %v", err) + } + return id +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveAndUpdateDevices — empty device list +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveDevices_EmptyList verifies that an empty device slice produces +// no reservations and no error. +func TestReserveDevices_EmptyList(t *testing.T) { + t.Parallel() + vpciReserver := mocks.NewMockVPCIReserver(gomock.NewController(t)) + + reservations, err := reserveAndUpdateDevices(t.Context(), vpciReserver, nil) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(reservations) != 0 { + t.Errorf("expected 0 reservations, got %d", len(reservations)) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveAndUpdateDevices — single valid device +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveDevices_SingleDevice verifies that a single valid device is +// reserved and its spec ID is rewritten to the VMBus GUID. 
+func TestReserveDevices_SingleDevice(t *testing.T) { + t.Parallel() + vpciReserver := mocks.NewMockVPCIReserver(gomock.NewController(t)) + + vmBusGUID := newGUID(t) + devicePath := `PCI\VEN_1234&DEV_5678` + + vpciReserver.EXPECT().Reserve(gomock.Any(), vpci.Device{ + DeviceInstanceID: devicePath, + VirtualFunctionIndex: 0, + }).Return(vmBusGUID, nil) + + specDevs := []specs.WindowsDevice{ + {ID: devicePath, IDType: vpci.DeviceIDType}, + } + + reservations, err := reserveAndUpdateDevices(t.Context(), vpciReserver, specDevs) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(reservations) != 1 { + t.Fatalf("expected 1 reservation, got %d", len(reservations)) + } + if reservations[0] != vmBusGUID { + t.Errorf("expected reservation GUID %s, got %s", vmBusGUID, reservations[0]) + } + if specDevs[0].ID != vmBusGUID.String() { + t.Errorf("expected spec device ID rewritten to %s, got %s", vmBusGUID, specDevs[0].ID) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveAndUpdateDevices — device with virtual function index +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveDevices_WithVirtualFunctionIndex verifies that a device path +// containing a trailing VF index (e.g. "DEVICE_ID/2") is parsed into the +// correct DeviceInstanceID and VirtualFunctionIndex. 
+func TestReserveDevices_WithVirtualFunctionIndex(t *testing.T) { + t.Parallel() + vpciReserver := mocks.NewMockVPCIReserver(gomock.NewController(t)) + + vmBusGUID := newGUID(t) + devicePath := `PCI\VEN_1234&DEV_5678` + + vpciReserver.EXPECT().Reserve(gomock.Any(), vpci.Device{ + DeviceInstanceID: devicePath, + VirtualFunctionIndex: 3, + }).Return(vmBusGUID, nil) + + specDevs := []specs.WindowsDevice{ + {ID: devicePath + `/3`, IDType: vpci.DeviceIDType}, + } + + reservations, err := reserveAndUpdateDevices(t.Context(), vpciReserver, specDevs) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(reservations) != 1 { + t.Fatalf("expected 1 reservation, got %d", len(reservations)) + } + if specDevs[0].ID != vmBusGUID.String() { + t.Errorf("expected spec device ID rewritten to %s, got %s", vmBusGUID, specDevs[0].ID) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveAndUpdateDevices — multiple devices in order +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveDevices_MultipleDevices verifies that multiple devices are +// reserved in order and all spec IDs are rewritten. 
+func TestReserveDevices_MultipleDevices(t *testing.T) { + t.Parallel() + vpciReserver := mocks.NewMockVPCIReserver(gomock.NewController(t)) + + guid1, guid2 := newGUID(t), newGUID(t) + path1, path2 := `PCI\DEV_A`, `PCI\DEV_B` + + gomock.InOrder( + vpciReserver.EXPECT().Reserve(gomock.Any(), vpci.Device{ + DeviceInstanceID: path1, + }).Return(guid1, nil), + vpciReserver.EXPECT().Reserve(gomock.Any(), vpci.Device{ + DeviceInstanceID: path2, + }).Return(guid2, nil), + ) + + specDevs := []specs.WindowsDevice{ + {ID: path1, IDType: vpci.DeviceIDTypeLegacy}, + {ID: path2, IDType: vpci.GpuDeviceIDType}, + } + + reservations, err := reserveAndUpdateDevices(t.Context(), vpciReserver, specDevs) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(reservations) != 2 { + t.Fatalf("expected 2 reservations, got %d", len(reservations)) + } + if reservations[0] != guid1 || reservations[1] != guid2 { + t.Errorf("unexpected reservation GUIDs: %v", reservations) + } + if specDevs[0].ID != guid1.String() || specDevs[1].ID != guid2.String() { + t.Errorf("spec device IDs not rewritten: %q, %q", specDevs[0].ID, specDevs[1].ID) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveAndUpdateDevices — unsupported device type +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveDevices_UnsupportedType verifies that an unsupported device type +// returns an error without calling Reserve. +func TestReserveDevices_UnsupportedType(t *testing.T) { + t.Parallel() + vpciReserver := mocks.NewMockVPCIReserver(gomock.NewController(t)) + + // No Reserve expectations — Reserve must not be called. 
+ specDevs := []specs.WindowsDevice{ + {ID: `PCI\DEV_X`, IDType: "unsupported-type"}, + } + + _, err := reserveAndUpdateDevices(t.Context(), vpciReserver, specDevs) + if err == nil { + t.Fatal("expected error for unsupported device type") + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveAndUpdateDevices — reserve failure returns partial results +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveDevices_ReserveFailure verifies that when Reserve fails on the +// second device, the first reservation is still returned for cleanup. +func TestReserveDevices_ReserveFailure(t *testing.T) { + t.Parallel() + vpciReserver := mocks.NewMockVPCIReserver(gomock.NewController(t)) + + guid1 := newGUID(t) + path1, path2 := `PCI\DEV_A`, `PCI\DEV_B` + + gomock.InOrder( + vpciReserver.EXPECT().Reserve(gomock.Any(), vpci.Device{ + DeviceInstanceID: path1, + }).Return(guid1, nil), + vpciReserver.EXPECT().Reserve(gomock.Any(), vpci.Device{ + DeviceInstanceID: path2, + }).Return(guid.GUID{}, errors.New("reservation failed")), + ) + + specDevs := []specs.WindowsDevice{ + {ID: path1, IDType: vpci.DeviceIDType}, + {ID: path2, IDType: vpci.DeviceIDType}, + } + + reservations, err := reserveAndUpdateDevices(t.Context(), vpciReserver, specDevs) + if err == nil { + t.Fatal("expected error from Reserve failure") + } + // The first successful reservation must still be returned. 
+ if len(reservations) != 1 { + t.Fatalf("expected 1 partial reservation, got %d", len(reservations)) + } + if reservations[0] != guid1 { + t.Errorf("expected partial reservation GUID %s, got %s", guid1, reservations[0]) + } + if specDevs[0].ID != guid1.String() { + t.Errorf("expected first device ID rewritten to %s, got %s", guid1, specDevs[0].ID) + } + if specDevs[1].ID != path2 { + t.Errorf("expected failing device ID to remain %s, got %s", path2, specDevs[1].ID) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveAndUpdateDevices — unsupported type after valid device +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveDevices_UnsupportedTypeAfterValid verifies that an unsupported +// type on the second device returns the first successful reservation. +func TestReserveDevices_UnsupportedTypeAfterValid(t *testing.T) { + t.Parallel() + vpciReserver := mocks.NewMockVPCIReserver(gomock.NewController(t)) + + guid1 := newGUID(t) + + vpciReserver.EXPECT().Reserve(gomock.Any(), vpci.Device{ + DeviceInstanceID: `PCI\DEV_A`, + }).Return(guid1, nil) + + specDevs := []specs.WindowsDevice{ + {ID: `PCI\DEV_A`, IDType: vpci.DeviceIDType}, + {ID: `PCI\DEV_B`, IDType: "bad-type"}, + } + + reservations, err := reserveAndUpdateDevices(t.Context(), vpciReserver, specDevs) + if err == nil { + t.Fatal("expected error for unsupported device type") + } + if len(reservations) != 1 { + t.Fatalf("expected 1 partial reservation, got %d", len(reservations)) + } + if specDevs[0].ID != guid1.String() { + t.Errorf("expected first device ID rewritten to %s, got %s", guid1, specDevs[0].ID) + } + if specDevs[1].ID != `PCI\DEV_B` { + t.Errorf("expected unsupported device ID to remain %s, got %s", `PCI\DEV_B`, specDevs[1].ID) + } +} diff --git a/internal/builder/container/lcow/doc.go b/internal/builder/container/lcow/doc.go new file mode 100644 index 0000000000..58a704306a --- /dev/null +++ 
b/internal/builder/container/lcow/doc.go @@ -0,0 +1,39 @@ +//go:build windows && lcow + +// Package lcow prepares everything needed to create a Linux container inside a +// utility VM. The [container.Controller] drives it in two phases: +// +// 1. Resource reservation — [ReserveAll] orchestrates [parseAndReserveLayers], +// [reserveAndUpdateMounts], and [reserveAndUpdateDevices] to claim host-side +// SCSI, Plan9, and vPCI resources. It rewrites the OCI spec in place so that +// mount sources and device IDs reference their guest-visible paths. +// Each sub-function returns partial results on error so that a single +// deferred [container.ResourcePlan.Release] in ReserveAll cleans up every +// reservation that was successfully made — no per-function rollback needed. +// +// 2. Spec generation — [GenerateSpecs] produces a sanitized copy of the OCI +// spec suitable for the Linux GCS, stripping unsupported fields and +// applying safe defaults. +// +// The resulting [container.ResourcePlan] and spec are handed back to the +// controller, which commits them to the VM and sends the final container +// document to GCS for container creation. Because reservations are tracked as +// individual IDs (not blanket closers), the controller can selectively release +// or transfer each resource during live migration save/restore. +// +// The controller's Create method drives the overall flow: +// +// // 1. Reserve resources (layers, mounts, devices) and rewrite the spec. +// reservations := lcow.ReserveAll(ctx, scsiReserver, plan9Reserver, vpciReserver, spec, cfg) +// +// // 2. Generate the sanitized OCI spec for the GCS. +// doc := generateContainerDocument(spec, reservations) // calls lcow.GenerateSpecs +// +// // 3. Allocate (attach/mount) the reserved resources into the VM. +// allocateContainerResources(reservations) +// +// // 4. Send the document to the GCS to create the container. 
+// guestMgr.CreateContainer(doc) +// +// [container.Controller]: github.com/Microsoft/hcsshim/internal/controller/container +package lcow diff --git a/internal/builder/container/lcow/layers.go b/internal/builder/container/lcow/layers.go new file mode 100644 index 0000000000..a2352340ed --- /dev/null +++ b/internal/builder/container/lcow/layers.go @@ -0,0 +1,175 @@ +//go:build windows && lcow + +package lcow + +import ( + "context" + "fmt" + + "github.com/Microsoft/hcsshim/internal/builder/container" + "github.com/Microsoft/hcsshim/internal/controller/device/scsi/disk" + scsiMount "github.com/Microsoft/hcsshim/internal/controller/device/scsi/mount" + "github.com/Microsoft/hcsshim/internal/guestpath" + "github.com/Microsoft/hcsshim/internal/layers" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/logfields" + "github.com/Microsoft/hcsshim/internal/ospath" + "github.com/Microsoft/hcsshim/internal/wclayer" + + "github.com/Microsoft/go-winio/pkg/fs" + "github.com/Microsoft/go-winio/pkg/guid" + containerdtypes "github.com/containerd/containerd/api/types" + "github.com/sirupsen/logrus" +) + +// parseAndReserveLayers parses the container rootfs and layer folders into LCOW layers, +// then reserves SCSI slots for each read-only layer and the scratch layer. +// +// On partial failure the successfully reserved layers are still returned so the +// caller's top-level cleanup can release them. +func parseAndReserveLayers( + ctx context.Context, + vmID string, + podID string, + containerID string, + layerFolders []string, + rootfs []*containerdtypes.Mount, + isScratchEncryptionEnabled bool, + scsiReserver container.SCSIReserver, +) (*container.SCSILayerPlan, error) { + + log.G(ctx).WithFields(logrus.Fields{ + logfields.ContainerID: containerID, + logfields.PodID: podID, + }).Trace("parsing and reserving LCOW layers") + + // Parse the rootfs mounts and layer folders into the canonical LCOW layer format. 
+ lcowLayers, err := layers.ParseLCOWLayers(rootfs, layerFolders) + if err != nil { + return nil, fmt.Errorf("parse lcow layers: %w", err) + } + + // Create a layer plan to return. + layerPlan := &container.SCSILayerPlan{} + + // Reserve a SCSI slot for each read-only layer. + // The ordering is intentionally preserved so that Live Migration can + // perform an exact replacement at the destination. + for _, readonlyLayer := range lcowLayers.Layers { + reservationID, layerGuestPath, err := reserveReadonlyLayer(ctx, scsiReserver, readonlyLayer) + if err != nil { + return layerPlan, fmt.Errorf("reserve readonly layer %s: %w", readonlyLayer.VHDPath, err) + } + + layerPlan.ROLayers = append(layerPlan.ROLayers, container.MountReservation{ + ID: reservationID, + GuestPath: layerGuestPath, + }) + } + + // Reserve a SCSI slot for the writable scratch layer. + scratchReservationID, scratchMountPath, err := reserveScratchLayer( + ctx, scsiReserver, vmID, lcowLayers.ScratchVHDPath, isScratchEncryptionEnabled, + ) + if err != nil { + return layerPlan, fmt.Errorf("reserve scratch layer %s: %w", lcowLayers.ScratchVHDPath, err) + } + + // When sharing a scratch disk across multiple containers, derive a unique + // sub-path per container to prevent upper/work directory collisions. + scratchGuestPath := ospath.Join("linux", scratchMountPath, "scratch", podID, containerID) + rootfsPath := ospath.Join("linux", guestpath.LCOWV2RootPrefixInVM, podID, containerID, guestpath.RootfsPath) + + layerPlan.Scratch = container.MountReservation{ + ID: scratchReservationID, + GuestPath: scratchGuestPath, + } + layerPlan.RootfsGuestPath = rootfsPath + + log.G(ctx).WithField("Plan", log.Format(ctx, layerPlan)).Trace("all LCOW layers reserved successfully") + return layerPlan, nil +} + +// reserveReadonlyLayer resolves the host path for a read-only layer and +// reserves a SCSI slot for it. 
+func reserveReadonlyLayer( + ctx context.Context, + scsiReserver container.SCSIReserver, + layer *layers.LCOWLayer, +) (guid.GUID, string, error) { + // Read-only layers come from the containerd snapshotter with broad read + // permissions (typically via GrantVmGroupAccess), so no per-VM access + // grant is needed here. + + hostPath, err := fs.ResolvePath(layer.VHDPath) + if err != nil { + return guid.GUID{}, "", fmt.Errorf("resolve symlinks for layer %s: %w", layer.VHDPath, err) + } + + reservationID, guestPath, err := scsiReserver.Reserve( + ctx, + disk.Config{ + HostPath: hostPath, + ReadOnly: true, + Type: disk.TypeVirtualDisk, + }, + scsiMount.Config{ + Partition: layer.Partition, + ReadOnly: true, + Options: []string{"ro"}, + }, + ) + if err != nil { + return guid.GUID{}, "", fmt.Errorf("reserve scsi slot for layer %s: %w", layer.VHDPath, err) + } + + return reservationID, guestPath, nil +} + +// reserveScratchLayer resolves the host path for the scratch VHD, grants VM +// access, and reserves a SCSI slot for it. +func reserveScratchLayer( + ctx context.Context, + scsiReserver container.SCSIReserver, + vmID string, + scratchVHDPath string, + isScratchEncryptionEnabled bool, +) (guid.GUID, string, error) { + // The scratch path may be a symlink to a shared sandbox.vhdx from another + // container (e.g. the sandbox container). Resolve it before granting access. + hostPath, err := fs.ResolvePath(scratchVHDPath) + if err != nil { + return guid.GUID{}, "", fmt.Errorf("resolve symlinks for scratch %s: %w", scratchVHDPath, err) + } + + // Unlike read-only layers, the scratch VHD requires explicit per-VM access. + if err = wclayer.GrantVmAccess(ctx, vmID, hostPath); err != nil { + return guid.GUID{}, "", fmt.Errorf("grant vm access to scratch %s: %w", hostPath, err) + } + + // Encrypted scratch disks use xfs; all others default to ext4. 
+ fileSystem := "ext4" + if isScratchEncryptionEnabled { + fileSystem = "xfs" + } + + reservationID, guestPath, err := scsiReserver.Reserve( + ctx, + disk.Config{ + HostPath: hostPath, + ReadOnly: false, + Type: disk.TypeVirtualDisk, + }, + scsiMount.Config{ + Encrypted: isScratchEncryptionEnabled, + EnsureFilesystem: true, + ReadOnly: false, + Filesystem: fileSystem, + }, + ) + if err != nil { + return guid.GUID{}, "", fmt.Errorf("reserve scsi slot for scratch %s: %w", scratchVHDPath, err) + } + + return reservationID, guestPath, nil +} diff --git a/internal/builder/container/lcow/layers_test.go b/internal/builder/container/lcow/layers_test.go new file mode 100644 index 0000000000..9f0a6e2f69 --- /dev/null +++ b/internal/builder/container/lcow/layers_test.go @@ -0,0 +1,569 @@ +//go:build windows && lcow + +package lcow + +import ( + "context" + "errors" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/Microsoft/go-winio/pkg/guid" + "go.uber.org/mock/gomock" + + "github.com/Microsoft/hcsshim/internal/builder/container/mocks" + "github.com/Microsoft/hcsshim/internal/controller/device/scsi/disk" + scsiMount "github.com/Microsoft/hcsshim/internal/controller/device/scsi/mount" + "github.com/Microsoft/hcsshim/internal/layers" +) + +// ───────────────────────────────────────────────────────────────────────────── +// Test helpers +// ───────────────────────────────────────────────────────────────────────────── + +// createTempDirWithFile creates a temporary directory containing a single file +// with the given name. The directory path is returned. 
+func createTempDirWithFile(t *testing.T, fileName string) string { + t.Helper() + dir := t.TempDir() + f, err := os.Create(filepath.Join(dir, fileName)) + if err != nil { + t.Fatalf("failed to create %s: %v", fileName, err) + } + _ = f.Close() + return dir +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveReadonlyLayer — success +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveReadonlyLayer_Success verifies that a valid layer file is resolved +// and reserved with the correct SCSI disk and mount configuration. +func TestReserveReadonlyLayer_Success(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + layerDir := createTempDirWithFile(t, "layer.vhd") + vhdPath := filepath.Join(layerDir, "layer.vhd") + + reservationID := newGUID(t) + expectedGuestPath := "/dev/sda" + + // Verify the mount config is passed exactly: read-only, no partition, "ro" option. 
+ scsi.EXPECT().Reserve( + gomock.Any(), + gomock.AssignableToTypeOf(disk.Config{}), + scsiMount.Config{ + ReadOnly: true, + Options: []string{"ro"}, + }, + ).DoAndReturn(func(_ context.Context, diskCfg disk.Config, _ scsiMount.Config) (guid.GUID, string, error) { + if diskCfg.HostPath == "" { + t.Error("expected non-empty host path in disk config") + } + if !diskCfg.ReadOnly { + t.Error("expected disk config ReadOnly to be true") + } + if diskCfg.Type != disk.TypeVirtualDisk { + t.Errorf("expected disk type %s, got %s", disk.TypeVirtualDisk, diskCfg.Type) + } + return reservationID, expectedGuestPath, nil + }) + + layer := &layers.LCOWLayer{VHDPath: vhdPath} + gotID, gotPath, err := reserveReadonlyLayer(t.Context(), scsi, layer) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if gotID != reservationID { + t.Errorf("expected reservation ID %s, got %s", reservationID, gotID) + } + if gotPath != expectedGuestPath { + t.Errorf("expected guest path %q, got %q", expectedGuestPath, gotPath) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveReadonlyLayer — with partition +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveReadonlyLayer_WithPartition verifies that the partition index from +// the LCOWLayer is forwarded to the SCSI mount configuration. 
+func TestReserveReadonlyLayer_WithPartition(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + layerDir := createTempDirWithFile(t, "layer.vhd") + vhdPath := filepath.Join(layerDir, "layer.vhd") + + reservationID := newGUID(t) + + scsi.EXPECT().Reserve( + gomock.Any(), + gomock.Any(), + scsiMount.Config{ + Partition: 3, + ReadOnly: true, + Options: []string{"ro"}, + }, + ).Return(reservationID, "/dev/sda3", nil) + + layer := &layers.LCOWLayer{VHDPath: vhdPath, Partition: 3} + gotID, _, err := reserveReadonlyLayer(t.Context(), scsi, layer) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if gotID != reservationID { + t.Errorf("expected reservation ID %s, got %s", reservationID, gotID) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveReadonlyLayer — non-existent path +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveReadonlyLayer_BadPath verifies that a non-existent VHD path +// returns an error from fs.ResolvePath without calling Reserve. +func TestReserveReadonlyLayer_BadPath(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + // No Reserve expectations — Reserve must not be called. + layer := &layers.LCOWLayer{VHDPath: `C:\nonexistent\path\layer.vhd`} + _, _, err := reserveReadonlyLayer(t.Context(), scsi, layer) + if err == nil { + t.Fatal("expected error for non-existent VHD path") + } + if !strings.Contains(err.Error(), "resolve symlinks") { + t.Errorf("expected error about resolving symlinks, got: %v", err) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveReadonlyLayer — reserve failure +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveReadonlyLayer_ReserveFailure verifies that an error from +// scsiReserver.Reserve is propagated with wrapping context. 
+func TestReserveReadonlyLayer_ReserveFailure(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + layerDir := createTempDirWithFile(t, "layer.vhd") + vhdPath := filepath.Join(layerDir, "layer.vhd") + + scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(guid.GUID{}, "", errors.New("no free scsi slots")) + + layer := &layers.LCOWLayer{VHDPath: vhdPath} + _, _, err := reserveReadonlyLayer(t.Context(), scsi, layer) + if err == nil { + t.Fatal("expected error from Reserve failure") + } + if !strings.Contains(err.Error(), "reserve scsi slot") { + t.Errorf("expected wrapped error about reserving scsi slot, got: %v", err) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveScratchLayer — non-existent path +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveScratchLayer_BadPath verifies that a non-existent scratch VHD +// path returns an error from fs.ResolvePath without calling GrantVmAccess +// or Reserve. +func TestReserveScratchLayer_BadPath(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + // No Reserve expectations. 
+ _, _, err := reserveScratchLayer(t.Context(), scsi, "test-vm", `C:\nonexistent\sandbox.vhdx`, false) + if err == nil { + t.Fatal("expected error for non-existent scratch VHD path") + } + if !strings.Contains(err.Error(), "resolve symlinks") { + t.Errorf("expected error about resolving symlinks, got: %v", err) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveScratchLayer — success with encryption disabled +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveScratchLayer_Success verifies that reserveScratchLayer resolves +// the scratch VHD path, grants VM access, and reserves a SCSI slot with the +// correct ext4 filesystem configuration when encryption is disabled. +func TestReserveScratchLayer_Success(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + scratchDir := createTempDirWithFile(t, "sandbox.vhdx") + scratchPath := filepath.Join(scratchDir, "sandbox.vhdx") + + reservationID := newGUID(t) + expectedGuestPath := "/dev/sdb" + + scsi.EXPECT().Reserve( + gomock.Any(), + gomock.AssignableToTypeOf(disk.Config{}), + scsiMount.Config{ + EnsureFilesystem: true, + Filesystem: "ext4", + }, + ).DoAndReturn(func(_ context.Context, diskCfg disk.Config, _ scsiMount.Config) (guid.GUID, string, error) { + if diskCfg.HostPath == "" { + t.Error("expected non-empty host path in disk config") + } + if diskCfg.ReadOnly { + t.Error("expected ReadOnly=false for scratch") + } + if diskCfg.Type != disk.TypeVirtualDisk { + t.Errorf("expected disk type %s, got %s", disk.TypeVirtualDisk, diskCfg.Type) + } + return reservationID, expectedGuestPath, nil + }) + + gotID, gotPath, err := reserveScratchLayer(t.Context(), scsi, "test-vm", scratchPath, false) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if gotID != reservationID { + t.Errorf("expected reservation ID %s, got %s", reservationID, gotID) + } + if gotPath != 
expectedGuestPath { + t.Errorf("expected guest path %q, got %q", expectedGuestPath, gotPath) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveScratchLayer — encryption enabled uses xfs +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveScratchLayer_EncryptionUsesXFS verifies that when scratch +// encryption is enabled, the filesystem is set to xfs and the Encrypted +// flag is propagated. +func TestReserveScratchLayer_EncryptionUsesXFS(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + scratchDir := createTempDirWithFile(t, "sandbox.vhdx") + scratchPath := filepath.Join(scratchDir, "sandbox.vhdx") + + reservationID := newGUID(t) + + scsi.EXPECT().Reserve( + gomock.Any(), + gomock.Any(), + scsiMount.Config{ + Encrypted: true, + EnsureFilesystem: true, + Filesystem: "xfs", + }, + ).Return(reservationID, "/dev/sdb", nil) + + _, _, err := reserveScratchLayer(t.Context(), scsi, "test-vm", scratchPath, true) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// reserveScratchLayer — reserve failure +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveScratchLayer_ReserveFailure verifies that an error from +// scsiReserver.Reserve for the scratch layer is propagated with context. +func TestReserveScratchLayer_ReserveFailure(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + scratchDir := createTempDirWithFile(t, "sandbox.vhdx") + scratchPath := filepath.Join(scratchDir, "sandbox.vhdx") + + scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). 
+ Return(guid.GUID{}, "", errors.New("no free scsi slots")) + + _, _, err := reserveScratchLayer(t.Context(), scsi, "test-vm", scratchPath, false) + if err == nil { + t.Fatal("expected error from Reserve failure") + } + if !strings.Contains(err.Error(), "reserve scsi slot for scratch") { + t.Errorf("expected wrapped error about reserving scsi slot, got: %v", err) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// parseAndReserveLayers — invalid input +// ───────────────────────────────────────────────────────────────────────────── + +// TestParseAndReserveLayers_InvalidInput verifies that bad input to +// layers.ParseLCOWLayers (both nil rootfs and empty layerFolders) returns +// an error without making any reservations. +func TestParseAndReserveLayers_InvalidInput(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + // No Reserve expectations — parsing should fail before any reservation. + plan, err := parseAndReserveLayers(t.Context(), "vm-1", "pod-1", "ctr-1", nil, nil, false, scsi) + if err == nil { + t.Fatal("expected error for nil rootfs and empty layer folders") + } + if plan != nil { + t.Errorf("expected nil plan on parse failure, got %+v", plan) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// parseAndReserveLayers — single layer folder (insufficient) +// ───────────────────────────────────────────────────────────────────────────── + +// TestParseAndReserveLayers_SingleLayerFolder verifies that providing only one +// layer folder (scratch without any parent) causes ParseLCOWLayers to fail +// with a precondition error. No reservations should be made. 
+func TestParseAndReserveLayers_SingleLayerFolder(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + scratchDir := createTempDirWithFile(t, "sandbox.vhdx") + layerFolders := []string{scratchDir} + + // No Reserve expectations — parsing should fail before any reservation. + plan, err := parseAndReserveLayers(t.Context(), "test-vm", "pod-1", "ctr-1", layerFolders, nil, false, scsi) + if err == nil { + t.Fatal("expected error for single layer folder (no parent layers)") + } + if plan != nil { + t.Errorf("expected nil plan on parse failure, got %+v", plan) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// parseAndReserveLayers — first RO layer reserve failure +// ───────────────────────────────────────────────────────────────────────────── + +// TestParseAndReserveLayers_FirstROLayerFails verifies that when the first +// read-only layer reservation fails, the error is returned with an empty +// partial plan (no successfully reserved layers to clean up). +func TestParseAndReserveLayers_FirstROLayerFails(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + layerDir := createTempDirWithFile(t, "layer.vhd") + scratchDir := createTempDirWithFile(t, "sandbox.vhdx") + layerFolders := []string{layerDir, scratchDir} + + // The single read-only layer reservation fails. + scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). 
+ Return(guid.GUID{}, "", errors.New("scsi bus full")) + + plan, err := parseAndReserveLayers(t.Context(), "vm-1", "pod-1", "ctr-1", layerFolders, nil, false, scsi) + if err == nil { + t.Fatal("expected error from RO layer reservation failure") + } + if plan == nil { + t.Fatal("expected non-nil plan on partial failure") + } + if len(plan.ROLayers) != 0 { + t.Errorf("expected 0 RO layers (none succeeded), got %d", len(plan.ROLayers)) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// parseAndReserveLayers — second RO layer fails, partial result +// ───────────────────────────────────────────────────────────────────────────── + +// TestParseAndReserveLayers_SecondROLayerFails verifies that when the second +// read-only layer reservation fails, the first successful reservation is still +// returned in the partial plan for cleanup. +func TestParseAndReserveLayers_SecondROLayerFails(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + layerDir1 := createTempDirWithFile(t, "layer.vhd") + layerDir2 := createTempDirWithFile(t, "layer.vhd") + scratchDir := createTempDirWithFile(t, "sandbox.vhdx") + + // layerFolders: [parent1, parent2, scratch] + layerFolders := []string{layerDir1, layerDir2, scratchDir} + + reservationID1 := newGUID(t) + + gomock.InOrder( + // First read-only layer succeeds. + scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(reservationID1, "/layer1", nil), + // Second read-only layer fails. + scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(guid.GUID{}, "", errors.New("scsi bus full")), + ) + + plan, err := parseAndReserveLayers(t.Context(), "vm-1", "pod-1", "ctr-1", layerFolders, nil, false, scsi) + if err == nil { + t.Fatal("expected error from second RO layer reservation failure") + } + // The first successfully reserved layer must be in the plan for cleanup. 
+ if plan == nil { + t.Fatal("expected non-nil plan with partial results") + } + if len(plan.ROLayers) != 1 { + t.Fatalf("expected 1 partial RO layer, got %d", len(plan.ROLayers)) + } + if plan.ROLayers[0].ID != reservationID1 { + t.Errorf("expected partial RO layer ID %s, got %s", reservationID1, plan.ROLayers[0].ID) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// parseAndReserveLayers — RO layers succeed, scratch Reserve fails +// ───────────────────────────────────────────────────────────────────────────── + +// TestParseAndReserveLayers_ScratchReserveFailure verifies that when all +// read-only layers are successfully reserved but the scratch layer Reserve +// fails, the partial plan containing the RO layers is returned. +func TestParseAndReserveLayers_ScratchReserveFailure(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + layerDir := createTempDirWithFile(t, "layer.vhd") + scratchDir := createTempDirWithFile(t, "sandbox.vhdx") + + // layerFolders: [parent, scratch] + layerFolders := []string{layerDir, scratchDir} + + reservationID := newGUID(t) + + gomock.InOrder( + // Read-only layer Reserve succeeds. + scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(reservationID, "/layer1", nil), + // Scratch layer Reserve fails. + scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(guid.GUID{}, "", errors.New("no free scsi slots")), + ) + + plan, err := parseAndReserveLayers(t.Context(), "test-vm", "pod-1", "ctr-1", layerFolders, nil, false, scsi) + if err == nil { + t.Fatal("expected error from scratch reservation failure") + } + // The RO layer reservation must be in the plan for cleanup. 
+ if plan == nil { + t.Fatal("expected non-nil plan with partial results") + } + if len(plan.ROLayers) != 1 { + t.Fatalf("expected 1 RO layer in partial plan, got %d", len(plan.ROLayers)) + } + if plan.ROLayers[0].ID != reservationID { + t.Errorf("expected RO layer ID %s, got %s", reservationID, plan.ROLayers[0].ID) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// parseAndReserveLayers — full success with guest path verification +// ───────────────────────────────────────────────────────────────────────────── + +// TestParseAndReserveLayers_FullSuccess verifies the complete success path +// including correct guest path construction for scratch and rootfs. +func TestParseAndReserveLayers_FullSuccess(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + layerDir := createTempDirWithFile(t, "layer.vhd") + scratchDir := createTempDirWithFile(t, "sandbox.vhdx") + layerFolders := []string{layerDir, scratchDir} + + roReservationID := newGUID(t) + scratchReservationID := newGUID(t) + + gomock.InOrder( + // Read-only layer Reserve succeeds. + scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(roReservationID, "/layer1", nil), + // Scratch Reserve succeeds. + scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(scratchReservationID, "/scratch/mount", nil), + ) + + plan, err := parseAndReserveLayers(t.Context(), "test-vm", "pod-1", "ctr-1", layerFolders, nil, false, scsi) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if plan == nil { + t.Fatal("expected non-nil plan") + } + + // Verify RO layer reservation. 
+ if len(plan.ROLayers) != 1 { + t.Fatalf("expected 1 RO layer, got %d", len(plan.ROLayers)) + } + if plan.ROLayers[0].ID != roReservationID { + t.Errorf("expected RO layer ID %s, got %s", roReservationID, plan.ROLayers[0].ID) + } + + // Verify scratch reservation and guest path: + // ospath.Join("linux", scratchMountPath, "scratch", podID, containerID) + if plan.Scratch.ID != scratchReservationID { + t.Errorf("expected scratch ID %s, got %s", scratchReservationID, plan.Scratch.ID) + } + expectedScratchGuestPath := "/scratch/mount/scratch/pod-1/ctr-1" + if plan.Scratch.GuestPath != expectedScratchGuestPath { + t.Errorf("expected scratch guest path %q, got %q", expectedScratchGuestPath, plan.Scratch.GuestPath) + } + + // Verify rootfs guest path: + // ospath.Join("linux", LCOWV2RootPrefixInVM, podID, containerID, RootfsPath) + expectedRootfsPath := "/run/gcs/pods/pod-1/ctr-1/rootfs" + if plan.RootfsGuestPath != expectedRootfsPath { + t.Errorf("expected rootfs guest path %q, got %q", expectedRootfsPath, plan.RootfsGuestPath) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// parseAndReserveLayers — full success with multiple RO layers +// ───────────────────────────────────────────────────────────────────────────── + +// TestParseAndReserveLayers_MultipleROLayers verifies that multiple read-only +// layers are reserved in order and all appear in the plan. 
+func TestParseAndReserveLayers_MultipleROLayers(t *testing.T) { + t.Parallel() + scsi := mocks.NewMockSCSIReserver(gomock.NewController(t)) + + layerDir1 := createTempDirWithFile(t, "layer.vhd") + layerDir2 := createTempDirWithFile(t, "layer.vhd") + layerDir3 := createTempDirWithFile(t, "layer.vhd") + scratchDir := createTempDirWithFile(t, "sandbox.vhdx") + + layerFolders := []string{layerDir1, layerDir2, layerDir3, scratchDir} + + roID1 := newGUID(t) + roID2 := newGUID(t) + roID3 := newGUID(t) + scratchID := newGUID(t) + + gomock.InOrder( + scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(roID1, "/layer1", nil), + scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(roID2, "/layer2", nil), + scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(roID3, "/layer3", nil), + scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(scratchID, "/scratch", nil), + ) + + plan, err := parseAndReserveLayers(t.Context(), "test-vm", "pod-1", "ctr-1", layerFolders, nil, false, scsi) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(plan.ROLayers) != 3 { + t.Fatalf("expected 3 RO layers, got %d", len(plan.ROLayers)) + } + if plan.ROLayers[0].ID != roID1 || plan.ROLayers[1].ID != roID2 || plan.ROLayers[2].ID != roID3 { + t.Errorf("RO layer IDs not in expected order: %v", plan.ROLayers) + } + if plan.Scratch.ID != scratchID { + t.Errorf("expected scratch ID %s, got %s", scratchID, plan.Scratch.ID) + } +} diff --git a/internal/builder/container/lcow/mounts.go b/internal/builder/container/lcow/mounts.go new file mode 100644 index 0000000000..347a206149 --- /dev/null +++ b/internal/builder/container/lcow/mounts.go @@ -0,0 +1,301 @@ +//go:build windows && lcow + +package lcow + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/Microsoft/hcsshim/internal/builder/container" + plan9Mount 
"github.com/Microsoft/hcsshim/internal/controller/device/plan9/mount" + "github.com/Microsoft/hcsshim/internal/controller/device/plan9/share" + "github.com/Microsoft/hcsshim/internal/controller/device/scsi/disk" + scsiMount "github.com/Microsoft/hcsshim/internal/controller/device/scsi/mount" + "github.com/Microsoft/hcsshim/internal/guestpath" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/wclayer" + + "github.com/Microsoft/go-winio/pkg/fs" + "github.com/Microsoft/go-winio/pkg/guid" + "github.com/opencontainers/runtime-spec/specs-go" +) + +// Mount type constants. +const ( + // mountTypeBind is a regular host-directory bind mount served via a Plan9 share. + mountTypeBind = "bind" + + // mountTypePhysicalDisk hot-adds a physical pass-through disk via the SCSI controller. + mountTypePhysicalDisk = "physical-disk" + + // mountTypeVirtualDisk hot-adds a VHD or VHDX via the SCSI controller. + mountTypeVirtualDisk = "virtual-disk" + + // mountTypeExtensibleVirtualDisk hot-adds an extensible virtual disk via the SCSI controller. + mountTypeExtensibleVirtualDisk = "extensible-virtual-disk" + + // mountTypeNone signals that the mount is a disk-backed device mount whose + // filesystem will be resolved when the guest actually mounts the device. + mountTypeNone = "none" +) + +// reserveAndUpdateMounts reserves host-side resources for each OCI mount and +// rewrites mount sources to their guest-visible paths. +// +// On partial failure the successfully reserved IDs are still returned so the +// caller's top-level cleanup can release them. 
+func reserveAndUpdateMounts(
+	ctx context.Context,
+	vmID string,
+	scsiReserver container.SCSIReserver,
+	plan9Reserver container.Plan9Reserver,
+	mounts []specs.Mount,
+) ([]guid.GUID, []guid.GUID, error) {
+	var scsiReservations, plan9Reservations []guid.GUID
+
+	for idx := range mounts {
+		log.G(ctx).WithField("mount", log.Format(ctx, mounts[idx])).Trace("processing OCI mount")
+		mount := &mounts[idx]
+
+		// Validate that every mount has the minimum required fields.
+		if mount.Destination == "" || mount.Source == "" {
+			return scsiReservations, plan9Reservations, fmt.Errorf("invalid mount: both source and destination are required: %+v", mount)
+		}
+
+		// A "ro" option (matched case-insensitively) marks the mount read-only.
+		isReadOnly := isReadOnlyMount(mount)
+
+		// Dispatch to a mount-type-specific handler; handlers take the mount
+		// pointer because they rewrite Source/Type in place on success.
+		switch mount.Type {
+		case mountTypeVirtualDisk, mountTypePhysicalDisk, mountTypeExtensibleVirtualDisk:
+			reservationID, err := reserveSCSIMount(ctx, vmID, scsiReserver, mount, isReadOnly)
+			if err != nil {
+				return scsiReservations, plan9Reservations, err
+			}
+
+			scsiReservations = append(scsiReservations, reservationID)
+
+		case mountTypeBind:
+			// Hugepages mounts are backed by a pre-existing mount inside the UVM;
+			// only validate the path format and move on.
+			if strings.HasPrefix(mount.Source, guestpath.HugePagesMountPrefix) {
+				if err := validateHugePageMount(mount.Source); err != nil {
+					return scsiReservations, plan9Reservations, err
+				}
+				continue
+			}
+
+			// Guest-internal paths (sandbox://, sandbox-tmp://, uvm://) resolve
+			// entirely inside the UVM and require no host-side reservation.
+			if isGuestInternalPath(mount.Source) {
+				continue
+			}
+
+			// All remaining bind mounts are host directories served via Plan9 for Linux guests.
+			reservationID, err := reservePlan9Mount(ctx, plan9Reserver, mount, isReadOnly)
+			if err != nil {
+				return scsiReservations, plan9Reservations, err
+			}
+			plan9Reservations = append(plan9Reservations, reservationID)
+
+		default:
+			// Unknown mount types (e.g. tmpfs, devpts, proc) are passed through
+			// to the guest without host-side resource reservation.
+		}
+	}
+
+	log.G(ctx).Debug("all OCI mounts reserved successfully")
+	return scsiReservations, plan9Reservations, nil
+}
+
+// --- SCSI mounts (virtual-disk / physical-disk / extensible-virtual-disk) ---
+
+// reserveSCSIMount resolves the host path, grants VM access where needed, and
+// reserves a SCSI slot; mount.Source and mount.Type are rewritten in place.
+func reserveSCSIMount(
+	ctx context.Context,
+	vmID string,
+	scsiReserver container.SCSIReserver,
+	mount *specs.Mount,
+	isReadOnly bool,
+) (guid.GUID, error) {
+	// Build the disk config based on mount type. Each branch resolves the
+	// host path and sets the appropriate disk type.
+	var diskConfig disk.Config
+	switch mount.Type {
+	case mountTypeVirtualDisk, mountTypePhysicalDisk:
+		// Resolve any symlinks to get the real host path for the disk.
+		hostPath, err := fs.ResolvePath(mount.Source)
+		if err != nil {
+			return guid.GUID{}, fmt.Errorf("resolve symlinks for mount source %s: %w", mount.Source, err)
+		}
+
+		// The VM needs explicit access to the disk before it can be attached.
+		if err = wclayer.GrantVmAccess(ctx, vmID, hostPath); err != nil {
+			return guid.GUID{}, fmt.Errorf("grant vm access to %s: %w", hostPath, err)
+		}
+
+		// Physical disks use pass-through; everything else is a virtual disk.
+		diskType := disk.TypeVirtualDisk
+		if mount.Type == mountTypePhysicalDisk {
+			diskType = disk.TypePassThru
+		}
+
+		// Create the final disk config.
+		diskConfig = disk.Config{HostPath: hostPath, ReadOnly: isReadOnly, Type: diskType}
+
+	case mountTypeExtensibleVirtualDisk:
+		// EVD paths encode the provider type in the source URI.
+		evdType, sourcePath, err := parseExtensibleVirtualDiskPath(mount.Source)
+		if err != nil {
+			return guid.GUID{}, fmt.Errorf("parse extensible virtual disk path: %w", err)
+		}
+
+		// Resolve any symlinks to get the real host path for the disk.
+		hostPath, err := fs.ResolvePath(sourcePath)
+		if err != nil {
+			return guid.GUID{}, fmt.Errorf("resolve symlinks for mount source %s: %w", sourcePath, err)
+		}
+
+		// Create the final disk config. NOTE: unlike the branch above, no
+		// GrantVmAccess is performed here — presumably the EVD provider
+		// handles access itself; confirm with the provider contract.
+		diskConfig = disk.Config{HostPath: hostPath, ReadOnly: isReadOnly, Type: disk.TypeExtensibleVirtualDisk, EVDType: evdType}
+
+	default:
+		return guid.GUID{}, fmt.Errorf("unsupported scsi mount type %q", mount.Type)
+	}
+
+	// Check if this was a block dev mount.
+	isBlockDev := strings.HasPrefix(mount.Destination, guestpath.BlockDevMountPrefix)
+
+	// Reserve the mount.
+	reservationID, guestPath, err := scsiReserver.Reserve(
+		ctx,
+		diskConfig,
+		scsiMount.Config{
+			ReadOnly: isReadOnly,
+			Options:  mount.Options,
+			BlockDev: isBlockDev,
+		},
+	)
+	if err != nil {
+		return guid.GUID{}, fmt.Errorf("reserve scsi mount for %+v: %w", mount, err)
+	}
+
+	// Rewrite the mount source to the guest-visible path and clear the type
+	// so the guest resolves the filesystem. Block-device mounts retain bind type.
+	mount.Source = guestPath
+	mount.Type = mountTypeNone
+	if isBlockDev {
+		mount.Type = mountTypeBind
+	}
+
+	return reservationID, nil
+}
+
+// parseExtensibleVirtualDiskPath extracts the EVD type and source path from an
+// extensible virtual disk host path with the format "evd://<evd-type>/<source-path>".
+func parseExtensibleVirtualDiskPath(hostPath string) (evdType, sourcePath string, err error) { + const evdPrefix = "evd://" + + if !strings.HasPrefix(hostPath, evdPrefix) { + return "", "", fmt.Errorf("invalid extensible virtual disk path %q: missing %q prefix", hostPath, evdPrefix) + } + + trimmedPath := strings.TrimPrefix(hostPath, evdPrefix) + separatorIdx := strings.Index(trimmedPath, "/") + if separatorIdx <= 0 { + return "", "", fmt.Errorf("invalid extensible virtual disk path %q: expected format %s/", hostPath, evdPrefix) + } + + return trimmedPath[:separatorIdx], trimmedPath[separatorIdx+1:], nil +} + +// --- Bind mounts (Plan9 shares, hugepages, guest-internal paths) --- + +// validateHugePageMount checks that a hugepages mount source has the expected +// format and a supported page size. +func validateHugePageMount(source string) error { + // Expected format: "hugepages:///" + hugePageSubDirs := strings.Split(strings.TrimPrefix(source, guestpath.HugePagesMountPrefix), "/") + if len(hugePageSubDirs) < 2 { + return fmt.Errorf( + "invalid hugepages mount path %s: expected format %s/", + source, + guestpath.HugePagesMountPrefix, + ) + } + + // Only 2M (megabyte) hugepages are currently supported. + if hugePageSubDirs[0] != "2M" { + return fmt.Errorf("unsupported hugepage size %s: only 2M is supported", hugePageSubDirs[0]) + } + + return nil +} + +// reservePlan9Mount reserves a Plan9 share for a host-backed bind mount, +// restricting to a single file when the source is not a directory. +func reservePlan9Mount( + ctx context.Context, + plan9Reserver container.Plan9Reserver, + mount *specs.Mount, + isReadOnly bool, +) (guid.GUID, error) { + // Ensure that mount source exists. 
+ fileInfo, err := os.Stat(mount.Source) + if err != nil { + return guid.GUID{}, fmt.Errorf("stat bind mount source %s: %w", mount.Source, err) + } + + shareConfig := share.Config{ + HostPath: mount.Source, + ReadOnly: isReadOnly, + } + + // For single-file mounts, share the containing directory but restrict + // access to the specific file. + if !fileInfo.IsDir() { + hostDir, fileName := filepath.Split(mount.Source) + shareConfig.HostPath = hostDir + shareConfig.Restrict = true + shareConfig.AllowedNames = append(shareConfig.AllowedNames, fileName) + } + + // Reserve the plan9 share. + reservationID, guestPath, err := plan9Reserver.Reserve(ctx, shareConfig, plan9Mount.Config{ReadOnly: isReadOnly}) + if err != nil { + return guid.GUID{}, fmt.Errorf("reserve plan9 share for mount %+v: %w", mount, err) + } + + // Rewrite the mount source to the guest-visible path. + mount.Source = guestPath + return reservationID, nil +} + +// --- Helpers --- + +// isReadOnlyMount returns true if the mount options contain the "ro" flag. +func isReadOnlyMount(mount *specs.Mount) bool { + for _, option := range mount.Options { + if strings.EqualFold(option, "ro") { + return true + } + } + return false +} + +// isGuestInternalPath reports whether the path uses a UVM-internal prefix +// that resolves inside the guest. +func isGuestInternalPath(path string) bool { + // Mounts that map to a path in UVM are specified with a 'sandbox://', 'sandbox-tmp://', or 'uvm://' prefix. 
+ // examples: + // - sandbox:///a/dirInUvm destination:/b/dirInContainer + // - sandbox-tmp:///a/dirInUvm destination:/b/dirInContainer + // - uvm:///a/dirInUvm destination:/b/dirInContainer + return strings.HasPrefix(path, guestpath.SandboxMountPrefix) || + strings.HasPrefix(path, guestpath.SandboxTmpfsMountPrefix) || + strings.HasPrefix(path, guestpath.UVMMountPrefix) +} diff --git a/internal/builder/container/lcow/mounts_test.go b/internal/builder/container/lcow/mounts_test.go new file mode 100644 index 0000000000..87fed5aca6 --- /dev/null +++ b/internal/builder/container/lcow/mounts_test.go @@ -0,0 +1,191 @@ +//go:build windows && lcow + +package lcow + +import ( + "testing" + + "github.com/opencontainers/runtime-spec/specs-go" +) + +// ───────────────────────────────────────────────────────────────────────────── +// parseExtensibleVirtualDiskPath +// ───────────────────────────────────────────────────────────────────────────── + +// TestParseExtensibleVirtualDiskPath verifies parsing of EVD URIs into +// provider type and source path, including error cases. 
+func TestParseExtensibleVirtualDiskPath(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + input string + wantType string + wantSourcePath string + wantErr bool + }{ + { + name: "valid path", + input: "evd://mytype/some/path/disk.vhdx", + wantType: "mytype", + wantSourcePath: "some/path/disk.vhdx", + }, + { + name: "missing evd:// prefix", + input: "notevd://type/path", + wantErr: true, + }, + { + name: "no type/path separator", + input: "evd://typeonly", + wantErr: true, + }, + { + name: "empty type", + input: "evd:///path", + wantErr: true, + }, + { + name: "empty source path is accepted", + input: "evd://mytype/", + wantType: "mytype", + wantSourcePath: "", + }, + { + name: "source path can start with slash", + input: "evd://mytype//disk.vhdx", + wantType: "mytype", + wantSourcePath: "/disk.vhdx", + }, + { + name: "empty input", + input: "", + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + evdType, sourcePath, err := parseExtensibleVirtualDiskPath(tt.input) + + if tt.wantErr { + if err == nil { + t.Fatal("expected error, got nil") + } + return + } + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if evdType != tt.wantType { + t.Errorf("expected EVD type %q, got %q", tt.wantType, evdType) + } + if sourcePath != tt.wantSourcePath { + t.Errorf("expected source path %q, got %q", tt.wantSourcePath, sourcePath) + } + }) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// validateHugePageMount +// ───────────────────────────────────────────────────────────────────────────── + +// TestValidateHugePageMount verifies validation of hugepages mount sources, +// including supported/unsupported sizes and malformed paths. 
+func TestValidateHugePageMount(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name    string
+		source  string
+		wantErr bool
+	}{
+		{name: "valid 2M", source: "hugepages://2M/location", wantErr: false},
+		{name: "valid 2M nested location", source: "hugepages://2M/a/b/c", wantErr: false},
+		{name: "unsupported size 1G", source: "hugepages://1G/location", wantErr: true}, // only 2M pages are supported today
+		{name: "missing subdirectories", source: "hugepages://", wantErr: true},        // no <pagesize>/<path> after the prefix; NOTE(review): "hugepages://2M/" (empty path) currently passes — confirm intent
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			err := validateHugePageMount(tt.source)
+			if tt.wantErr && err == nil {
+				t.Fatal("expected error, got nil")
+			}
+			if !tt.wantErr && err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+		})
+	}
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// isReadOnlyMount
+// ─────────────────────────────────────────────────────────────────────────────
+
+// TestIsReadOnlyMount verifies detection of the "ro" mount option across
+// various option slices and casing.
+func TestIsReadOnlyMount(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name     string
+		options  []string
+		expected bool
+	}{
+		{name: "has ro option", options: []string{"rw", "ro"}, expected: true},
+		{name: "case insensitive RO", options: []string{"RO"}, expected: true}, // matched via strings.EqualFold
+		{name: "no ro option", options: []string{"rw", "noatime"}, expected: false},
+		{name: "empty options", options: nil, expected: false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			mount := &specs.Mount{Options: tt.options}
+			if got := isReadOnlyMount(mount); got != tt.expected {
+				t.Errorf("isReadOnlyMount(%v) = %v, want %v", tt.options, got, tt.expected)
+			}
+		})
+	}
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// isGuestInternalPath
+// ─────────────────────────────────────────────────────────────────────────────
+
+// TestIsGuestInternalPath verifies that known guest-internal prefixes are
+// detected and regular paths are not.
+func TestIsGuestInternalPath(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + path string + expected bool + }{ + {name: "sandbox prefix", path: "sandbox:///a/path", expected: true}, + {name: "sandbox-tmp prefix", path: "sandbox-tmp:///a/path", expected: true}, + {name: "uvm prefix", path: "uvm:///a/path", expected: true}, + {name: "regular host path", path: `/host/data`, expected: false}, + {name: "hugepages prefix", path: "hugepages://2M/loc", expected: false}, + {name: "empty string", path: "", expected: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + if got := isGuestInternalPath(tt.path); got != tt.expected { + t.Errorf("isGuestInternalPath(%q) = %v, want %v", tt.path, got, tt.expected) + } + }) + } +} diff --git a/internal/builder/container/lcow/reserve.go b/internal/builder/container/lcow/reserve.go new file mode 100644 index 0000000000..1b03c53985 --- /dev/null +++ b/internal/builder/container/lcow/reserve.go @@ -0,0 +1,102 @@ +//go:build windows && lcow + +package lcow + +import ( + "context" + "fmt" + + "github.com/Microsoft/hcsshim/internal/builder/container" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/logfields" + containerdtypes "github.com/containerd/containerd/api/types" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" +) + +// ReserveConfig holds all inputs needed by [ReserveAll]. +type ReserveConfig struct { + VMID string + PodID string + ContainerID string + Rootfs []*containerdtypes.Mount + ScratchEncrypted bool +} + +// ReserveAll orchestrates reservation of all host-side resources for an LCOW +// container and rewrites the OCI spec in place so mount sources, device IDs, +// and root paths reference their guest-visible equivalents. +// +// On partial failure every previously successful reservation is released +// before the error is returned. 
On success the caller receives a
+// [container.ResourcePlan] whose fields are individually trackable IDs
+// suitable for save/restore during live migration.
+func ReserveAll(
+	ctx context.Context,
+	scsiReserver container.SCSIReserver,
+	plan9Reserver container.Plan9Reserver,
+	vpciReserver container.VPCIReserver,
+	spec *specs.Spec,
+	cfg *ReserveConfig,
+) (_ *container.ResourcePlan, err error) {
+	// spec.Windows carries the LCOW layer folders read by Phase 1 below, so
+	// validate it up front; a nil check only in Phase 3 would come too late.
+	if spec.Windows == nil {
+		return nil, fmt.Errorf("spec.Windows with layer folders is required for LCOW container %s", cfg.ContainerID)
+	}
+
+	log.G(ctx).WithFields(logrus.Fields{
+		logfields.ContainerID: cfg.ContainerID,
+		logfields.PodID:       cfg.PodID,
+		logfields.UVMID:       cfg.VMID,
+	}).Debug("reserving all host-side resources for LCOW container")
+
+	// Build the plan incrementally; each sub-function returns whatever it
+	// reserved even on error, so the single deferred Release cleans up everything.
+	plan := &container.ResourcePlan{}
+	defer func() {
+		if err != nil {
+			log.G(ctx).WithError(err).Warn("reservation failed, releasing partially reserved resources")
+			plan.Release(ctx, scsiReserver, plan9Reserver, vpciReserver)
+		}
+	}()
+
+	// Phase 1: Reserve SCSI slots for read-only layers and the scratch layer.
+	log.G(ctx).Debug("phase 1: reserving layers")
+	plan.SCSILayers, err = parseAndReserveLayers(
+		ctx,
+		cfg.VMID,
+		cfg.PodID,
+		cfg.ContainerID,
+		spec.Windows.LayerFolders,
+		cfg.Rootfs,
+		cfg.ScratchEncrypted,
+		scsiReserver,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("reserve layers: %w", err)
+	}
+
+	// Set the container root path from the layer plan.
+	if spec.Root == nil {
+		spec.Root = &specs.Root{}
+	}
+	spec.Root.Path = plan.SCSILayers.RootfsGuestPath
+
+	// Phase 2: Reserve SCSI and Plan9 resources for OCI mounts.
+	log.G(ctx).Debug("phase 2: reserving mounts")
+	plan.SCSI, plan.Plan9, err = reserveAndUpdateMounts(ctx, cfg.VMID, scsiReserver, plan9Reserver, spec.Mounts)
+	if err != nil {
+		return nil, fmt.Errorf("reserve mounts: %w", err)
+	}
+
+	// Phase 3: Reserve vPCI devices (spec.Windows was validated non-nil above).
+	log.G(ctx).Debug("phase 3: reserving vPCI devices")
+	plan.Devices, err = reserveAndUpdateDevices(ctx, vpciReserver, spec.Windows.Devices)
+	if err != nil {
+		return nil, fmt.Errorf("reserve devices: %w", err)
+	}
+
+	log.G(ctx).Info("all host-side resources reserved successfully for LCOW container")
+	return plan, nil
+}
diff --git a/internal/builder/container/lcow/reserve_test.go b/internal/builder/container/lcow/reserve_test.go
new file mode 100644
index 0000000000..ea95dd2447
--- /dev/null
+++ b/internal/builder/container/lcow/reserve_test.go
@@ -0,0 +1,287 @@
+//go:build windows && lcow
+
+package lcow
+
+import (
+	"errors"
+	"testing"
+
+	"github.com/Microsoft/hcsshim/internal/builder/container/mocks"
+
+	"github.com/Microsoft/go-winio/pkg/guid"
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"go.uber.org/mock/gomock"
+)
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Test helpers
+// ─────────────────────────────────────────────────────────────────────────────
+
+// reserveTestController bundles mock reservers for ReserveAll tests.
+type reserveTestController struct {
+	scsi *mocks.MockSCSIReserver
+	p9   *mocks.MockPlan9Reserver
+	vpci *mocks.MockVPCIReserver
+}
+
+// newReserveTestController creates a parallel-safe reserveTestController.
+func newReserveTestController(t *testing.T) *reserveTestController { + t.Helper() + t.Parallel() + + ctrl := gomock.NewController(t) + return &reserveTestController{ + scsi: mocks.NewMockSCSIReserver(ctrl), + p9: mocks.NewMockPlan9Reserver(ctrl), + vpci: mocks.NewMockVPCIReserver(ctrl), + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// ReserveAll — first RO layer fails, no cleanup needed +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveAll_FirstROLayerFails verifies that when the only read-only layer +// reservation fails (nothing was successfully reserved), the deferred cleanup +// runs without making any UnmapFromGuest calls. +func TestReserveAll_FirstROLayerFails(t *testing.T) { + tc := newReserveTestController(t) + + layerDir := createTempDirWithFile(t, "layer.vhd") + scratchDir := createTempDirWithFile(t, "sandbox.vhdx") + + spec := &specs.Spec{ + Windows: &specs.Windows{ + LayerFolders: []string{layerDir, scratchDir}, + }, + } + cfg := &ReserveConfig{ + VMID: "vm-1", + PodID: "pod-1", + ContainerID: "ctr-1", + } + + // First (and only) RO layer fails — no successful reservation to clean up. + tc.scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). 
+ Return(guid.GUID{}, "", errors.New("scsi bus full")) + + plan, err := ReserveAll(t.Context(), tc.scsi, tc.p9, tc.vpci, spec, cfg) + if err == nil { + t.Fatal("expected error from RO layer reservation failure") + } + if plan != nil { + t.Errorf("expected nil plan on failure, got %+v", plan) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// ReserveAll — partial RO layer failure triggers cleanup +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveAll_ROLayerFailureTriggersCleanup verifies that when the second +// read-only layer reservation fails, the first successfully reserved layer +// is released via the deferred cleanup. +func TestReserveAll_ROLayerFailureTriggersCleanup(t *testing.T) { + tc := newReserveTestController(t) + + layerDir1 := createTempDirWithFile(t, "layer.vhd") + layerDir2 := createTempDirWithFile(t, "layer.vhd") + scratchDir := createTempDirWithFile(t, "sandbox.vhdx") + + spec := &specs.Spec{ + Windows: &specs.Windows{ + LayerFolders: []string{layerDir1, layerDir2, scratchDir}, + }, + } + cfg := &ReserveConfig{ + VMID: "vm-1", + PodID: "pod-1", + ContainerID: "ctr-1", + } + + reservationID := newGUID(t) + + gomock.InOrder( + // Phase 1: First RO layer reserved successfully. + tc.scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(reservationID, "/layer1", nil), + // Phase 1: Second RO layer fails. + tc.scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(guid.GUID{}, "", errors.New("scsi bus full")), + // Cleanup (defer): first RO layer released. 
+ tc.scsi.EXPECT().UnmapFromGuest(gomock.Any(), reservationID).Return(nil), + ) + + plan, err := ReserveAll(t.Context(), tc.scsi, tc.p9, tc.vpci, spec, cfg) + if err == nil { + t.Fatal("expected error from RO layer reservation failure") + } + if plan != nil { + t.Errorf("expected nil plan on failure, got %+v", plan) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// ReserveAll — scratch Reserve failure triggers cleanup of RO layers +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveAll_ScratchFailureReleasesROLayers verifies that when all +// read-only layers are reserved but the scratch layer Reserve fails, +// the deferred cleanup releases every RO layer. +func TestReserveAll_ScratchFailureReleasesROLayers(t *testing.T) { + tc := newReserveTestController(t) + + layerDir := createTempDirWithFile(t, "layer.vhd") + scratchDir := createTempDirWithFile(t, "sandbox.vhdx") + + spec := &specs.Spec{ + Windows: &specs.Windows{ + LayerFolders: []string{layerDir, scratchDir}, + }, + } + cfg := &ReserveConfig{ + VMID: "test-vm", + PodID: "pod-1", + ContainerID: "ctr-1", + } + + reservationID := newGUID(t) + + gomock.InOrder( + // Phase 1: Single RO layer reserved successfully. + tc.scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(reservationID, "/layer1", nil), + // Phase 1: Scratch layer Reserve fails. + tc.scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(guid.GUID{}, "", errors.New("no free scsi slots")), + // Cleanup (defer): RO layer released after scratch reservation failure. 
+ tc.scsi.EXPECT().UnmapFromGuest(gomock.Any(), reservationID).Return(nil), + ) + + plan, err := ReserveAll(t.Context(), tc.scsi, tc.p9, tc.vpci, spec, cfg) + if err == nil { + t.Fatal("expected error from scratch layer reservation failure") + } + if plan != nil { + t.Errorf("expected nil plan on failure, got %+v", plan) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// ReserveAll — full success sets spec root path +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveAll_FullSuccess verifies the complete success path: all layers +// are reserved, the spec root path is set, and the returned plan contains +// the expected reservations. +func TestReserveAll_FullSuccess(t *testing.T) { + tc := newReserveTestController(t) + + layerDir := createTempDirWithFile(t, "layer.vhd") + scratchDir := createTempDirWithFile(t, "sandbox.vhdx") + + spec := &specs.Spec{ + Windows: &specs.Windows{ + LayerFolders: []string{layerDir, scratchDir}, + }, + } + cfg := &ReserveConfig{ + VMID: "test-vm", + PodID: "pod-1", + ContainerID: "ctr-1", + } + + roReservationID := newGUID(t) + scratchReservationID := newGUID(t) + + gomock.InOrder( + // Phase 1: RO layer. + tc.scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(roReservationID, "/layer1", nil), + // Phase 1: Scratch. + tc.scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(scratchReservationID, "/scratch/mount", nil), + ) + // Phase 2: No mounts (spec.Mounts is nil). + // Phase 3: No devices (spec.Windows.Devices is nil). + + plan, err := ReserveAll(t.Context(), tc.scsi, tc.p9, tc.vpci, spec, cfg) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if plan == nil { + t.Fatal("expected non-nil plan") + } + + // Verify spec root path was set (auto-created since spec.Root was nil). 
+ expectedRootfs := "/run/gcs/pods/pod-1/ctr-1/rootfs" + if spec.Root == nil { + t.Fatal("expected spec.Root to be auto-created") + } + if spec.Root.Path != expectedRootfs { + t.Errorf("expected spec.Root.Path = %q, got %q", expectedRootfs, spec.Root.Path) + } + + // Verify the plan references. + if plan.SCSILayers == nil { + t.Fatal("expected non-nil SCSILayers in plan") + } + if len(plan.SCSILayers.ROLayers) != 1 { + t.Fatalf("expected 1 RO layer, got %d", len(plan.SCSILayers.ROLayers)) + } + if plan.SCSILayers.ROLayers[0].ID != roReservationID { + t.Errorf("expected RO layer ID %s, got %s", roReservationID, plan.SCSILayers.ROLayers[0].ID) + } + if plan.SCSILayers.Scratch.ID != scratchReservationID { + t.Errorf("expected scratch ID %s, got %s", scratchReservationID, plan.SCSILayers.Scratch.ID) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// ReserveAll — existing spec.Root is preserved +// ───────────────────────────────────────────────────────────────────────────── + +// TestReserveAll_ExistingSpecRoot verifies that when spec.Root is already +// non-nil, only the Path field is overwritten with the layer plan's rootfs. +func TestReserveAll_ExistingSpecRoot(t *testing.T) { + tc := newReserveTestController(t) + + layerDir := createTempDirWithFile(t, "layer.vhd") + scratchDir := createTempDirWithFile(t, "sandbox.vhdx") + + spec := &specs.Spec{ + Root: &specs.Root{ + Path: "/old/rootfs", + Readonly: true, + }, + Windows: &specs.Windows{ + LayerFolders: []string{layerDir, scratchDir}, + }, + } + cfg := &ReserveConfig{ + VMID: "test-vm", + PodID: "pod-1", + ContainerID: "ctr-1", + } + + gomock.InOrder( + tc.scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). + Return(newGUID(t), "/layer1", nil), + tc.scsi.EXPECT().Reserve(gomock.Any(), gomock.Any(), gomock.Any()). 
Return(newGUID(t), "/scratch", nil),
+	)
+
+	_, err := ReserveAll(t.Context(), tc.scsi, tc.p9, tc.vpci, spec, cfg)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	expectedRootfs := "/run/gcs/pods/pod-1/ctr-1/rootfs"
+	if spec.Root.Path != expectedRootfs {
+		t.Errorf("expected spec.Root.Path = %q, got %q", expectedRootfs, spec.Root.Path)
+	}
+	// The Readonly field should remain unchanged.
+	if !spec.Root.Readonly {
+		t.Error("expected spec.Root.Readonly to remain true")
+	}
+}
diff --git a/internal/builder/container/lcow/specs.go b/internal/builder/container/lcow/specs.go
new file mode 100644
index 0000000000..9a400d4c5c
--- /dev/null
+++ b/internal/builder/container/lcow/specs.go
@@ -0,0 +1,121 @@
+//go:build windows && lcow
+
+package lcow
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+
+	"github.com/Microsoft/hcsshim/internal/oci"
+	"github.com/Microsoft/hcsshim/pkg/annotations"
+
+	"github.com/opencontainers/runtime-spec/specs-go"
+)
+
+// GenerateSpecs produces a deep copy of oci specs tailored for an LCOW container,
+// stripping unsupported fields and applying safe defaults.
+func GenerateSpecs(ctx context.Context, origSpec *specs.Spec) (*specs.Spec, error) {
+	if origSpec.Linux == nil {
+		return nil, fmt.Errorf("linux section must be present for lcow container")
+	}
+
+	// Create a deep copy of the original OCI specs.
+	spec, err := deepCopySpec(origSpec)
+	if err != nil {
+		return nil, err
+	}
+
+	// Preserve only the network namespace and assigned devices from the Windows section. NOTE(review): extractWindowsFields is given origSpec (not the deep copy), and it copies the Devices slice header as-is — so any later in-place rewrite of device IDs in the returned spec (e.g. reserveAndUpdateDevices) will also mutate the caller's original spec. Confirm this aliasing is intended, or clone the slice.
+	spec.Windows = extractWindowsFields(origSpec)
+
+	// Hooks are not supported in the guest; they should be executed on the host.
+	spec.Hooks = nil
+
+	// Sanitize and apply safe defaults for Linux fields.
+	sanitizeLinuxCPUDefaults(spec)
+
+	// Clear any unsupported Linux resources from the spec.
+	clearUnsupportedLinuxResources(spec)
+
+	// Disable seccomp filtering for privileged containers.
+	isPrivileged := oci.ParseAnnotationsBool(ctx, spec.Annotations, annotations.LCOWPrivileged, false)
+	if isPrivileged {
+		spec.Linux.Seccomp = nil
+	}
+
+	return spec, nil
+}
+
+// deepCopySpec performs a deep copy of the spec via JSON round-trip so mutations
+// do not affect the caller's copy.
+func deepCopySpec(origSpec *specs.Spec) (*specs.Spec, error) {
+	specJSON, err := json.Marshal(origSpec)
+	if err != nil {
+		return nil, fmt.Errorf("marshal spec: %w", err)
+	}
+	spec := &specs.Spec{}
+	if err := json.Unmarshal(specJSON, spec); err != nil {
+		return nil, fmt.Errorf("unmarshal spec: %w", err)
+	}
+	return spec, nil
+}
+
+// extractWindowsFields extracts only the Windows fields relevant for LCOW:
+// the network namespace and any assigned vPCI / GPU devices.
+func extractWindowsFields(origSpec *specs.Spec) *specs.Windows {
+	if origSpec.Windows == nil {
+		return nil
+	}
+
+	var windows *specs.Windows
+
+	// Preserve the network namespace so the container joins the correct HNS network. An empty namespace string is treated the same as no network section.
+	if origSpec.Windows.Network != nil && origSpec.Windows.Network.NetworkNamespace != "" {
+		windows = &specs.Windows{
+			Network: &specs.WindowsNetwork{
+				NetworkNamespace: origSpec.Windows.Network.NetworkNamespace,
+			},
+		}
+	}
+
+	// Carry over any assigned vPCI / GPU devices. NOTE(review): only the slice header is copied — the elements remain shared with origSpec.
+	if origSpec.Windows.Devices != nil {
+		if windows == nil {
+			windows = &specs.Windows{}
+		}
+		windows.Devices = origSpec.Windows.Devices
+	}
+
+	return windows
+}
+
+// sanitizeLinuxCPUDefaults applies safe CPU defaults when the values are explicitly zeroed.
+func sanitizeLinuxCPUDefaults(spec *specs.Spec) {
+	if spec.Linux.Resources == nil || spec.Linux.Resources.CPU == nil {
+		return
+	}
+
+	cpuResources := spec.Linux.Resources.CPU
+	if cpuResources.Period != nil && *cpuResources.Period == 0 {
+		*cpuResources.Period = 100000 // Default CFS period in microseconds; a zero period is invalid.
+	}
+	if cpuResources.Quota != nil && *cpuResources.Quota == 0 {
+		*cpuResources.Quota = -1 // Unlimited CPU quota.
+ } +} + +// clearUnsupportedLinuxResources removes resource types the GCS does not support +// or manages on its own. +func clearUnsupportedLinuxResources(spec *specs.Spec) { + // GCS controls the cgroup hierarchy internally. + spec.Linux.CgroupsPath = "" + + if spec.Linux.Resources != nil { + spec.Linux.Resources.Devices = nil + spec.Linux.Resources.Pids = nil + spec.Linux.Resources.BlockIO = nil + spec.Linux.Resources.HugepageLimits = nil + spec.Linux.Resources.Network = nil + } +} diff --git a/internal/builder/container/lcow/specs_test.go b/internal/builder/container/lcow/specs_test.go new file mode 100644 index 0000000000..86492a3d85 --- /dev/null +++ b/internal/builder/container/lcow/specs_test.go @@ -0,0 +1,369 @@ +//go:build windows && lcow + +package lcow + +import ( + "testing" + + "github.com/Microsoft/hcsshim/pkg/annotations" + "github.com/opencontainers/runtime-spec/specs-go" +) + +// ───────────────────────────────────────────────────────────────────────────── +// GenerateSpecs — nil Linux section +// ───────────────────────────────────────────────────────────────────────────── + +// TestGenerateSpecs_NilLinux verifies that a spec without a Linux section +// returns an error. +func TestGenerateSpecs_NilLinux(t *testing.T) { + t.Parallel() + + _, err := GenerateSpecs(t.Context(), &specs.Spec{}) + if err == nil { + t.Fatal("expected error for nil Linux section") + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// GenerateSpecs — hooks are stripped +// ───────────────────────────────────────────────────────────────────────────── + +// TestGenerateSpecs_HooksStripped verifies that OCI hooks are removed from the +// generated spec. 
+func TestGenerateSpecs_HooksStripped(t *testing.T) { + t.Parallel() + + origSpec := &specs.Spec{ + Linux: &specs.Linux{}, + Hooks: &specs.Hooks{ + CreateRuntime: []specs.Hook{{Path: "/bin/prehook"}}, + }, + } + + spec, err := GenerateSpecs(t.Context(), origSpec) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if spec.Hooks != nil { + t.Error("expected hooks to be nil") + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// GenerateSpecs — deep copy does not mutate original +// ───────────────────────────────────────────────────────────────────────────── + +// TestGenerateSpecs_DeepCopy verifies that the returned spec is a deep copy +// and mutations to it do not affect the original. +func TestGenerateSpecs_DeepCopy(t *testing.T) { + t.Parallel() + + origSpec := &specs.Spec{ + Linux: &specs.Linux{}, + Process: &specs.Process{Args: []string{"sh"}}, + } + + spec, err := GenerateSpecs(t.Context(), origSpec) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Mutate the returned spec. + spec.Process.Args = append(spec.Process.Args, "-c", "exit") + + // The original must remain unchanged. + if len(origSpec.Process.Args) != 1 { + t.Errorf("original spec mutated: args = %v", origSpec.Process.Args) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// GenerateSpecs — privileged container clears seccomp +// ───────────────────────────────────────────────────────────────────────────── + +// TestGenerateSpecs_PrivilegedClearsSeccomp verifies that when the privileged +// annotation is set, the seccomp profile is cleared. 
+func TestGenerateSpecs_PrivilegedClearsSeccomp(t *testing.T) { + t.Parallel() + + origSpec := &specs.Spec{ + Linux: &specs.Linux{ + Seccomp: &specs.LinuxSeccomp{DefaultAction: specs.ActErrno}, + }, + Annotations: map[string]string{ + annotations.LCOWPrivileged: "true", + }, + } + + spec, err := GenerateSpecs(t.Context(), origSpec) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if spec.Linux.Seccomp != nil { + t.Error("expected seccomp to be nil for privileged container") + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// GenerateSpecs — non-privileged preserves seccomp +// ───────────────────────────────────────────────────────────────────────────── + +// TestGenerateSpecs_NonPrivilegedPreservesSeccomp verifies that the seccomp +// profile is preserved when the container is not privileged. +func TestGenerateSpecs_NonPrivilegedPreservesSeccomp(t *testing.T) { + t.Parallel() + + origSpec := &specs.Spec{ + Linux: &specs.Linux{ + Seccomp: &specs.LinuxSeccomp{DefaultAction: specs.ActErrno}, + }, + } + + spec, err := GenerateSpecs(t.Context(), origSpec) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if spec.Linux.Seccomp == nil { + t.Error("expected seccomp to be preserved for non-privileged container") + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// extractWindowsFields +// ───────────────────────────────────────────────────────────────────────────── + +// TestExtractWindowsFields verifies that only the network namespace and +// assigned devices are preserved from the Windows section. 
+func TestExtractWindowsFields(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + input *specs.Spec + wantNil bool + wantNamespace string + wantDeviceIDs []string + }{ + { + name: "nil Windows", + input: &specs.Spec{}, + wantNil: true, + }, + { + name: "network namespace only", + input: &specs.Spec{ + Windows: &specs.Windows{ + Network: &specs.WindowsNetwork{NetworkNamespace: "ns-123"}, + }, + }, + wantNamespace: "ns-123", + }, + { + name: "devices only", + input: &specs.Spec{ + Windows: &specs.Windows{ + Devices: []specs.WindowsDevice{{ID: "dev-1"}}, + }, + }, + wantDeviceIDs: []string{"dev-1"}, + }, + { + name: "both network and devices", + input: &specs.Spec{ + Windows: &specs.Windows{ + Network: &specs.WindowsNetwork{NetworkNamespace: "ns-456"}, + Devices: []specs.WindowsDevice{{ID: "dev-2"}}, + }, + }, + wantNamespace: "ns-456", + wantDeviceIDs: []string{"dev-2"}, + }, + { + name: "empty network namespace", + input: &specs.Spec{ + Windows: &specs.Windows{ + Network: &specs.WindowsNetwork{NetworkNamespace: ""}, + }, + }, + wantNil: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := extractWindowsFields(tt.input) + + if tt.wantNil { + if result != nil { + t.Fatalf("expected nil, got %+v", result) + } + return + } + + if result == nil { + t.Fatal("expected non-nil Windows") + } + + // Validate network namespace. + if tt.wantNamespace != "" { + if result.Network == nil || result.Network.NetworkNamespace != tt.wantNamespace { + t.Errorf("expected namespace %q, got %+v", tt.wantNamespace, result.Network) + } + } else if result.Network != nil { + t.Errorf("expected nil network, got %+v", result.Network) + } + + // Validate devices. 
+ if len(tt.wantDeviceIDs) != len(result.Devices) { + t.Fatalf("expected %d devices, got %d", len(tt.wantDeviceIDs), len(result.Devices)) + } + for i, wantID := range tt.wantDeviceIDs { + if result.Devices[i].ID != wantID { + t.Errorf("device[%d]: expected ID %q, got %q", i, wantID, result.Devices[i].ID) + } + } + }) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// sanitizeLinuxCPUDefaults +// ───────────────────────────────────────────────────────────────────────────── + +// TestSanitizeLinuxCPUDefaults verifies that zero-valued CPU period and quota +// are replaced with safe defaults, while non-zero values and nil fields are +// left unchanged. +func TestSanitizeLinuxCPUDefaults(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + resources *specs.LinuxResources + wantPeriod *uint64 + wantQuota *int64 + }{ + { + name: "nil Resources", + resources: nil, + }, + { + name: "nil CPU", + resources: &specs.LinuxResources{}, + }, + { + name: "zero Period and Quota get defaults", + resources: func() *specs.LinuxResources { + var period uint64 + var quota int64 + return &specs.LinuxResources{ + CPU: &specs.LinuxCPU{Period: &period, Quota: "a}, + } + }(), + wantPeriod: func() *uint64 { v := uint64(100000); return &v }(), + wantQuota: func() *int64 { v := int64(-1); return &v }(), + }, + { + name: "non-zero values unchanged", + resources: func() *specs.LinuxResources { + period := uint64(50000) + quota := int64(25000) + return &specs.LinuxResources{ + CPU: &specs.LinuxCPU{Period: &period, Quota: "a}, + } + }(), + wantPeriod: func() *uint64 { v := uint64(50000); return &v }(), + wantQuota: func() *int64 { v := int64(25000); return &v }(), + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + spec := &specs.Spec{Linux: &specs.Linux{Resources: tt.resources}} + + sanitizeLinuxCPUDefaults(spec) + + // For nil Resources or nil CPU, just verify no panic. 
+ if tt.wantPeriod == nil && tt.wantQuota == nil { + return + } + + cpu := spec.Linux.Resources.CPU + if tt.wantPeriod != nil && *cpu.Period != *tt.wantPeriod { + t.Errorf("expected Period %d, got %d", *tt.wantPeriod, *cpu.Period) + } + if tt.wantQuota != nil && *cpu.Quota != *tt.wantQuota { + t.Errorf("expected Quota %d, got %d", *tt.wantQuota, *cpu.Quota) + } + }) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// clearUnsupportedLinuxResources +// ───────────────────────────────────────────────────────────────────────────── + +// TestClearUnsupportedResources verifies that unsupported resource types are +// cleared and the cgroups path is emptied. +func TestClearUnsupportedResources(t *testing.T) { + t.Parallel() + + shares := uint64(512) + + spec := &specs.Spec{ + Linux: &specs.Linux{ + CgroupsPath: "/sys/fs/cgroup/test", + Resources: &specs.LinuxResources{ + Devices: []specs.LinuxDeviceCgroup{{Allow: true}}, + Pids: &specs.LinuxPids{Limit: 100}, + BlockIO: &specs.LinuxBlockIO{}, + HugepageLimits: []specs.LinuxHugepageLimit{{Pagesize: "2MB", Limit: 1024}}, + Network: &specs.LinuxNetwork{}, + // CPU and Memory should be preserved (not cleared). + CPU: &specs.LinuxCPU{Shares: &shares}, + Memory: &specs.LinuxMemory{}, + }, + }, + } + + clearUnsupportedLinuxResources(spec) + + if spec.Linux.CgroupsPath != "" { + t.Errorf("expected empty cgroups path, got %q", spec.Linux.CgroupsPath) + } + if spec.Linux.Resources.Devices != nil { + t.Error("expected Devices to be nil") + } + if spec.Linux.Resources.Pids != nil { + t.Error("expected Pids to be nil") + } + if spec.Linux.Resources.BlockIO != nil { + t.Error("expected BlockIO to be nil") + } + if spec.Linux.Resources.HugepageLimits != nil { + t.Error("expected HugepageLimits to be nil") + } + if spec.Linux.Resources.Network != nil { + t.Error("expected Network to be nil") + } + // CPU and Memory must survive. 
+ if spec.Linux.Resources.CPU == nil { + t.Error("expected CPU to be preserved") + } + if spec.Linux.Resources.Memory == nil { + t.Error("expected Memory to be preserved") + } +} + +// TestClearUnsupportedResources_NilResources verifies that nil Resources does +// not panic and the cgroups path is still cleared. +func TestClearUnsupportedResources_NilResources(t *testing.T) { + t.Parallel() + + spec := &specs.Spec{Linux: &specs.Linux{CgroupsPath: "/test"}} + + clearUnsupportedLinuxResources(spec) + + if spec.Linux.CgroupsPath != "" { + t.Errorf("expected empty cgroups path, got %q", spec.Linux.CgroupsPath) + } +} diff --git a/internal/builder/container/mocks/mock_types.go b/internal/builder/container/mocks/mock_types.go new file mode 100644 index 0000000000..fc15c448a9 --- /dev/null +++ b/internal/builder/container/mocks/mock_types.go @@ -0,0 +1,184 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: types.go +// +// Generated by this command: +// +// mockgen -source types.go -package mocks -destination mocks/mock_types.go +// + +// Package mocks is a generated GoMock package. +package mocks + +import ( + context "context" + reflect "reflect" + + guid "github.com/Microsoft/go-winio/pkg/guid" + mount "github.com/Microsoft/hcsshim/internal/controller/device/plan9/mount" + share "github.com/Microsoft/hcsshim/internal/controller/device/plan9/share" + disk "github.com/Microsoft/hcsshim/internal/controller/device/scsi/disk" + mount0 "github.com/Microsoft/hcsshim/internal/controller/device/scsi/mount" + vpci "github.com/Microsoft/hcsshim/internal/controller/device/vpci" + gomock "go.uber.org/mock/gomock" +) + +// MockSCSIReserver is a mock of SCSIReserver interface. +type MockSCSIReserver struct { + ctrl *gomock.Controller + recorder *MockSCSIReserverMockRecorder + isgomock struct{} +} + +// MockSCSIReserverMockRecorder is the mock recorder for MockSCSIReserver. 
+type MockSCSIReserverMockRecorder struct { + mock *MockSCSIReserver +} + +// NewMockSCSIReserver creates a new mock instance. +func NewMockSCSIReserver(ctrl *gomock.Controller) *MockSCSIReserver { + mock := &MockSCSIReserver{ctrl: ctrl} + mock.recorder = &MockSCSIReserverMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockSCSIReserver) EXPECT() *MockSCSIReserverMockRecorder { + return m.recorder +} + +// Reserve mocks base method. +func (m *MockSCSIReserver) Reserve(ctx context.Context, diskConfig disk.Config, mountConfig mount0.Config) (guid.GUID, string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Reserve", ctx, diskConfig, mountConfig) + ret0, _ := ret[0].(guid.GUID) + ret1, _ := ret[1].(string) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 +} + +// Reserve indicates an expected call of Reserve. +func (mr *MockSCSIReserverMockRecorder) Reserve(ctx, diskConfig, mountConfig any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Reserve", reflect.TypeOf((*MockSCSIReserver)(nil).Reserve), ctx, diskConfig, mountConfig) +} + +// UnmapFromGuest mocks base method. +func (m *MockSCSIReserver) UnmapFromGuest(ctx context.Context, reservation guid.GUID) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "UnmapFromGuest", ctx, reservation) + ret0, _ := ret[0].(error) + return ret0 +} + +// UnmapFromGuest indicates an expected call of UnmapFromGuest. +func (mr *MockSCSIReserverMockRecorder) UnmapFromGuest(ctx, reservation any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnmapFromGuest", reflect.TypeOf((*MockSCSIReserver)(nil).UnmapFromGuest), ctx, reservation) +} + +// MockPlan9Reserver is a mock of Plan9Reserver interface. 
+type MockPlan9Reserver struct { + ctrl *gomock.Controller + recorder *MockPlan9ReserverMockRecorder + isgomock struct{} +} + +// MockPlan9ReserverMockRecorder is the mock recorder for MockPlan9Reserver. +type MockPlan9ReserverMockRecorder struct { + mock *MockPlan9Reserver +} + +// NewMockPlan9Reserver creates a new mock instance. +func NewMockPlan9Reserver(ctrl *gomock.Controller) *MockPlan9Reserver { + mock := &MockPlan9Reserver{ctrl: ctrl} + mock.recorder = &MockPlan9ReserverMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockPlan9Reserver) EXPECT() *MockPlan9ReserverMockRecorder { + return m.recorder +} + +// Reserve mocks base method. +func (m *MockPlan9Reserver) Reserve(ctx context.Context, shareConfig share.Config, mountConfig mount.Config) (guid.GUID, string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Reserve", ctx, shareConfig, mountConfig) + ret0, _ := ret[0].(guid.GUID) + ret1, _ := ret[1].(string) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 +} + +// Reserve indicates an expected call of Reserve. +func (mr *MockPlan9ReserverMockRecorder) Reserve(ctx, shareConfig, mountConfig any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Reserve", reflect.TypeOf((*MockPlan9Reserver)(nil).Reserve), ctx, shareConfig, mountConfig) +} + +// UnmapFromGuest mocks base method. +func (m *MockPlan9Reserver) UnmapFromGuest(ctx context.Context, reservation guid.GUID) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "UnmapFromGuest", ctx, reservation) + ret0, _ := ret[0].(error) + return ret0 +} + +// UnmapFromGuest indicates an expected call of UnmapFromGuest. 
+func (mr *MockPlan9ReserverMockRecorder) UnmapFromGuest(ctx, reservation any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnmapFromGuest", reflect.TypeOf((*MockPlan9Reserver)(nil).UnmapFromGuest), ctx, reservation) +} + +// MockVPCIReserver is a mock of VPCIReserver interface. +type MockVPCIReserver struct { + ctrl *gomock.Controller + recorder *MockVPCIReserverMockRecorder + isgomock struct{} +} + +// MockVPCIReserverMockRecorder is the mock recorder for MockVPCIReserver. +type MockVPCIReserverMockRecorder struct { + mock *MockVPCIReserver +} + +// NewMockVPCIReserver creates a new mock instance. +func NewMockVPCIReserver(ctrl *gomock.Controller) *MockVPCIReserver { + mock := &MockVPCIReserver{ctrl: ctrl} + mock.recorder = &MockVPCIReserverMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockVPCIReserver) EXPECT() *MockVPCIReserverMockRecorder { + return m.recorder +} + +// RemoveFromVM mocks base method. +func (m *MockVPCIReserver) RemoveFromVM(ctx context.Context, vmBusGUID guid.GUID) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RemoveFromVM", ctx, vmBusGUID) + ret0, _ := ret[0].(error) + return ret0 +} + +// RemoveFromVM indicates an expected call of RemoveFromVM. +func (mr *MockVPCIReserverMockRecorder) RemoveFromVM(ctx, vmBusGUID any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemoveFromVM", reflect.TypeOf((*MockVPCIReserver)(nil).RemoveFromVM), ctx, vmBusGUID) +} + +// Reserve mocks base method. +func (m *MockVPCIReserver) Reserve(ctx context.Context, device vpci.Device) (guid.GUID, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Reserve", ctx, device) + ret0, _ := ret[0].(guid.GUID) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Reserve indicates an expected call of Reserve. 
func (mr *MockVPCIReserverMockRecorder) Reserve(ctx, device any) *gomock.Call {
+	mr.mock.ctrl.T.Helper()
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Reserve", reflect.TypeOf((*MockVPCIReserver)(nil).Reserve), ctx, device)
+}
diff --git a/internal/builder/container/resource_plan.go b/internal/builder/container/resource_plan.go
new file mode 100644
index 0000000000..249934392d
--- /dev/null
+++ b/internal/builder/container/resource_plan.go
@@ -0,0 +1,74 @@
+//go:build windows
+
+package container
+
+import (
+	"context"
+
+	"github.com/Microsoft/go-winio/pkg/guid"
+	"github.com/Microsoft/hcsshim/internal/log"
+)
+
+// ResourcePlan holds all host-side resource reservations for a container.
+type ResourcePlan struct {
+	// SCSILayers holds the read-only and scratch layer SCSI reservations.
+	SCSILayers *SCSILayerPlan
+	// SCSI holds non-layer SCSI mount reservations (virtual-disk, physical-disk, EVD).
+	SCSI []guid.GUID
+	// Plan9 holds Plan9 share reservations for bind mounts.
+	Plan9 []guid.GUID
+	// Devices holds vPCI / GPU device reservations.
+	Devices []guid.GUID
+}
+
+// Release undoes every reservation held by the plan in reverse allocation order.
+// Errors are logged but do not stop the remaining cleanup. NOTE(review): the body actually releases Plan9 → vPCI → non-layer SCSI → scratch → RO layers, which is not a strict reverse of the reservation phases (layers, then mounts, then devices) — update the comment or the order, and confirm no ordering constraint exists between Plan9 shares and vPCI devices. Safe on a nil receiver.
+func (rp *ResourcePlan) Release(ctx context.Context, scsi SCSIReserver, plan9 Plan9Reserver, vpci VPCIReserver) {
+	if rp == nil {
+		return
+	}
+
+	log.G(ctx).Debug("releasing all resource reservations")
+
+	// Plan9 shares.
+	for _, reservationID := range rp.Plan9 {
+		if err := plan9.UnmapFromGuest(ctx, reservationID); err != nil {
+			log.G(ctx).WithError(err).Error("failed to release plan9 share reservation")
+		}
+	}
+
+	// VPCI devices.
+	for _, deviceID := range rp.Devices {
+		if err := vpci.RemoveFromVM(ctx, deviceID); err != nil {
+			log.G(ctx).WithError(err).Error("failed to release vpci device reservation")
+		}
+	}
+
+	// SCSI mounts (non-layer).
+	for _, reservationID := range rp.SCSI {
+		if err := scsi.UnmapFromGuest(ctx, reservationID); err != nil {
+			log.G(ctx).WithError(err).Error("failed to release scsi mount reservation")
+		}
+	}
+
+	// If there were no layer reservations, then we can return.
+	if rp.SCSILayers == nil {
+		return
+	}
+
+	// Scratch layer — zero GUID means it was never reserved. (ROLayers needs no such guard: the slice only ever contains successful reservations.)
+	if rp.SCSILayers.Scratch.ID != (guid.GUID{}) {
+		if err := scsi.UnmapFromGuest(ctx, rp.SCSILayers.Scratch.ID); err != nil {
+			log.G(ctx).WithError(err).Error("failed to release scratch layer reservation")
+		}
+	}
+
+	// Read-only layers.
+	for _, layer := range rp.SCSILayers.ROLayers {
+		if err := scsi.UnmapFromGuest(ctx, layer.ID); err != nil {
+			log.G(ctx).WithError(err).Error("failed to release read-only layer reservation")
+		}
+	}
+
+	log.G(ctx).Debug("all resource reservations released")
+}
diff --git a/internal/builder/container/resource_plan_test.go b/internal/builder/container/resource_plan_test.go
new file mode 100644
index 0000000000..20195e12a5
--- /dev/null
+++ b/internal/builder/container/resource_plan_test.go
@@ -0,0 +1,299 @@
+//go:build windows
+
+package container
+
+import (
+	"errors"
+	"testing"
+
+	"github.com/Microsoft/go-winio/pkg/guid"
+	"go.uber.org/mock/gomock"
+
+	"github.com/Microsoft/hcsshim/internal/builder/container/mocks"
+)
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Test helpers
+// ─────────────────────────────────────────────────────────────────────────────
+
+// testController bundles the mock reservers used by every Release test.
+type testController struct {
+	scsi *mocks.MockSCSIReserver
+	p9   *mocks.MockPlan9Reserver
+	vpci *mocks.MockVPCIReserver
+}
+
+// newTestController creates a parallel-safe testController with fresh mocks.
+func newTestController(t *testing.T) *testController {
+	t.Helper()
+	t.Parallel() // NOTE: side effect — every test that builds a controller runs in parallel.
+
+	ctrl := gomock.NewController(t) // go.uber.org/mock auto-verifies expectations via t.Cleanup.
+	return &testController{
+		scsi: mocks.NewMockSCSIReserver(ctrl),
+		p9:   mocks.NewMockPlan9Reserver(ctrl),
+		vpci: mocks.NewMockVPCIReserver(ctrl),
+	}
+}
+
+// newGUID is a test helper that generates a random GUID and fails the test on error.
+func newGUID(t *testing.T) guid.GUID {
+	t.Helper()
+	id, err := guid.NewV4()
+	if err != nil {
+		t.Fatalf("failed to generate GUID: %v", err)
+	}
+	return id
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Release — nil receiver
+// ─────────────────────────────────────────────────────────────────────────────
+
+// TestRelease_NilPlan verifies that Release on a nil *ResourcePlan is a no-op
+// and does not panic.
+func TestRelease_NilPlan(t *testing.T) {
+	tc := newTestController(t)
+
+	// No expectations — nothing should be called.
+	var rp *ResourcePlan
+	rp.Release(t.Context(), tc.scsi, tc.p9, tc.vpci)
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Release — empty plan
+// ─────────────────────────────────────────────────────────────────────────────
+
+// TestRelease_EmptyPlan verifies that Release on an empty (but non-nil)
+// ResourcePlan is a no-op and does not call any reserver methods.
+func TestRelease_EmptyPlan(t *testing.T) {
+	tc := newTestController(t)
+
+	(&ResourcePlan{}).Release(t.Context(), tc.scsi, tc.p9, tc.vpci)
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Release — Plan9 shares only
+// ─────────────────────────────────────────────────────────────────────────────
+
+// TestRelease_Plan9SharesOnly verifies that Release unmaps all Plan9 shares
+// when no other resource types are present.
+func TestRelease_Plan9SharesOnly(t *testing.T) {
+	tc := newTestController(t)
+
+	shareA, shareB := newGUID(t), newGUID(t)
+
+	gomock.InOrder(
+		tc.p9.EXPECT().UnmapFromGuest(gomock.Any(), shareA).Return(nil),
+		tc.p9.EXPECT().UnmapFromGuest(gomock.Any(), shareB).Return(nil),
+	)
+
+	plan := &ResourcePlan{
+		Plan9: []guid.GUID{shareA, shareB},
+	}
+	plan.Release(t.Context(), tc.scsi, tc.p9, tc.vpci)
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Release — VPCI devices only
+// ─────────────────────────────────────────────────────────────────────────────
+
+// TestRelease_VPCIDevicesOnly verifies that every reserved vPCI device is
+// removed from the VM when the plan holds devices and nothing else.
+func TestRelease_VPCIDevicesOnly(t *testing.T) {
+	tc := newTestController(t)
+
+	devA, devB := newGUID(t), newGUID(t)
+
+	gomock.InOrder(
+		tc.vpci.EXPECT().RemoveFromVM(gomock.Any(), devA).Return(nil),
+		tc.vpci.EXPECT().RemoveFromVM(gomock.Any(), devB).Return(nil),
+	)
+
+	plan := &ResourcePlan{
+		Devices: []guid.GUID{devA, devB},
+	}
+	plan.Release(t.Context(), tc.scsi, tc.p9, tc.vpci)
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Release — SCSI mounts only (non-layer)
+// ─────────────────────────────────────────────────────────────────────────────
+
+// TestRelease_SCSIMountsOnly verifies that Release unmaps all non-layer SCSI
+// mounts when no other resource types are present.
+func TestRelease_SCSIMountsOnly(t *testing.T) {
+	tc := newTestController(t)
+
+	mountA, mountB := newGUID(t), newGUID(t)
+
+	gomock.InOrder(
+		tc.scsi.EXPECT().UnmapFromGuest(gomock.Any(), mountA).Return(nil),
+		tc.scsi.EXPECT().UnmapFromGuest(gomock.Any(), mountB).Return(nil),
+	)
+
+	plan := &ResourcePlan{
+		SCSI: []guid.GUID{mountA, mountB},
+	}
+	plan.Release(t.Context(), tc.scsi, tc.p9, tc.vpci)
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Release — SCSI layers only (scratch + read-only)
+// ─────────────────────────────────────────────────────────────────────────────
+
+// TestRelease_SCSILayersOnly verifies that the scratch layer and every
+// read-only layer are unmapped when the plan carries only SCSILayers.
+func TestRelease_SCSILayersOnly(t *testing.T) {
+	tc := newTestController(t)
+
+	scratch := newGUID(t)
+	layerA, layerB := newGUID(t), newGUID(t)
+
+	gomock.InOrder(
+		// The scratch layer goes first.
+		tc.scsi.EXPECT().UnmapFromGuest(gomock.Any(), scratch).Return(nil),
+		// Then each read-only layer in turn.
+		tc.scsi.EXPECT().UnmapFromGuest(gomock.Any(), layerA).Return(nil),
+		tc.scsi.EXPECT().UnmapFromGuest(gomock.Any(), layerB).Return(nil),
+	)
+
+	plan := &ResourcePlan{
+		SCSILayers: &SCSILayerPlan{
+			Scratch: MountReservation{ID: scratch, GuestPath: "/scratch"},
+			ROLayers: []MountReservation{
+				{ID: layerA, GuestPath: "/layer1"},
+				{ID: layerB, GuestPath: "/layer2"},
+			},
+		},
+	}
+	plan.Release(t.Context(), tc.scsi, tc.p9, tc.vpci)
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Release — SCSI layers with zero-GUID scratch (never reserved)
+// ─────────────────────────────────────────────────────────────────────────────
+
+// TestRelease_SCSILayers_ZeroScratch verifies that Release skips the scratch
+// layer when its GUID is the zero value (never reserved) but still releases
+// read-only layers.
+func TestRelease_SCSILayers_ZeroScratch(t *testing.T) {
+	tc := newTestController(t)
+
+	roID := newGUID(t)
+
+	// Only the read-only layer should be released — scratch is zero GUID.
+	tc.scsi.EXPECT().UnmapFromGuest(gomock.Any(), roID).Return(nil)
+
+	rp := &ResourcePlan{
+		SCSILayers: &SCSILayerPlan{
+			Scratch: MountReservation{}, // zero GUID — never reserved.
+			ROLayers: []MountReservation{
+				{ID: roID, GuestPath: "/layer1"},
+			},
+		},
+	}
+	rp.Release(t.Context(), tc.scsi, tc.p9, tc.vpci)
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Release — full plan (all resource types)
+// ─────────────────────────────────────────────────────────────────────────────
+
+// TestRelease_FullPlan verifies that Release cleans up every resource type
+// (Plan9, vPCI, non-layer SCSI, scratch, and read-only layers) in a single call.
+func TestRelease_FullPlan(t *testing.T) {
+	tc := newTestController(t)
+
+	plan9ID := newGUID(t)
+	vpciID := newGUID(t)
+	scsiMountID := newGUID(t)
+	scratchID := newGUID(t)
+	roID := newGUID(t)
+
+	gomock.InOrder( // InOrder also pins Release's cleanup sequence, not just the call set.
+		// Plan9 share.
+		tc.p9.EXPECT().UnmapFromGuest(gomock.Any(), plan9ID).Return(nil),
+		// vPCI device.
+		tc.vpci.EXPECT().RemoveFromVM(gomock.Any(), vpciID).Return(nil),
+		// Non-layer SCSI mount.
+		tc.scsi.EXPECT().UnmapFromGuest(gomock.Any(), scsiMountID).Return(nil),
+		// Scratch layer.
+		tc.scsi.EXPECT().UnmapFromGuest(gomock.Any(), scratchID).Return(nil),
+		// Read-only layer.
+		tc.scsi.EXPECT().UnmapFromGuest(gomock.Any(), roID).Return(nil),
+	)
+
+	rp := &ResourcePlan{
+		Plan9:   []guid.GUID{plan9ID},
+		Devices: []guid.GUID{vpciID},
+		SCSI:    []guid.GUID{scsiMountID},
+		SCSILayers: &SCSILayerPlan{
+			Scratch:  MountReservation{ID: scratchID, GuestPath: "/scratch"},
+			ROLayers: []MountReservation{{ID: roID, GuestPath: "/layer1"}},
+		},
+	}
+	rp.Release(t.Context(), tc.scsi, tc.p9, tc.vpci)
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Release — errors are logged but do not stop cleanup
+// ─────────────────────────────────────────────────────────────────────────────
+
+// TestRelease_ErrorsDoNotStopCleanup verifies that when one reserver returns an
+// error, Release continues releasing all remaining resources rather than aborting.
+func TestRelease_ErrorsDoNotStopCleanup(t *testing.T) {
+	tc := newTestController(t)
+
+	plan9ID := newGUID(t)
+	vpciID := newGUID(t)
+	scsiMountID := newGUID(t)
+	scratchID := newGUID(t)
+	roID := newGUID(t)
+
+	errRelease := errors.New("release failed")
+
+	gomock.InOrder( // every release attempt fails except the last — all must still be made.
+		// Plan9 fails, but cleanup must continue.
+		tc.p9.EXPECT().UnmapFromGuest(gomock.Any(), plan9ID).Return(errRelease),
+		// vPCI fails, but cleanup must continue.
+		tc.vpci.EXPECT().RemoveFromVM(gomock.Any(), vpciID).Return(errRelease),
+		// Non-layer SCSI fails, but cleanup must continue.
+		tc.scsi.EXPECT().UnmapFromGuest(gomock.Any(), scsiMountID).Return(errRelease),
+		// Scratch fails, but cleanup must continue.
+		tc.scsi.EXPECT().UnmapFromGuest(gomock.Any(), scratchID).Return(errRelease),
+		// Read-only layer must still be released.
+		tc.scsi.EXPECT().UnmapFromGuest(gomock.Any(), roID).Return(nil),
+	)
+
+	rp := &ResourcePlan{
+		Plan9:   []guid.GUID{plan9ID},
+		Devices: []guid.GUID{vpciID},
+		SCSI:    []guid.GUID{scsiMountID},
+		SCSILayers: &SCSILayerPlan{
+			Scratch:  MountReservation{ID: scratchID, GuestPath: "/scratch"},
+			ROLayers: []MountReservation{{ID: roID, GuestPath: "/layer1"}},
+		},
+	}
+	rp.Release(t.Context(), tc.scsi, tc.p9, tc.vpci)
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Release — SCSILayers with only zero-GUID scratch and empty ROLayers
+// ─────────────────────────────────────────────────────────────────────────────
+
+// TestRelease_SCSILayers_ZeroScratch_EmptyROLayers verifies that Release handles
+// a non-nil SCSILayerPlan where the scratch is zero GUID and the ROLayers slice
+// is empty — nothing should be released for layers.
+func TestRelease_SCSILayers_ZeroScratch_EmptyROLayers(t *testing.T) {
+	tc := newTestController(t)
+
+	// No expectations — nothing should be called on any mock.
+	rp := &ResourcePlan{
+		SCSILayers: &SCSILayerPlan{
+			Scratch:  MountReservation{}, // zero GUID.
+			ROLayers: nil,
+		},
+	}
+	rp.Release(t.Context(), tc.scsi, tc.p9, tc.vpci)
+}
diff --git a/internal/builder/container/types.go b/internal/builder/container/types.go
new file mode 100644
index 0000000000..796dc1777d
--- /dev/null
+++ b/internal/builder/container/types.go
@@ -0,0 +1,50 @@
+//go:build windows
+
+package container
+
+import (
+	"context"
+
+	plan9Mount "github.com/Microsoft/hcsshim/internal/controller/device/plan9/mount"
+	"github.com/Microsoft/hcsshim/internal/controller/device/plan9/share"
+	"github.com/Microsoft/hcsshim/internal/controller/device/scsi/disk"
+	scsiMount "github.com/Microsoft/hcsshim/internal/controller/device/scsi/mount"
+	"github.com/Microsoft/hcsshim/internal/controller/device/vpci"
+
+	"github.com/Microsoft/go-winio/pkg/guid"
+)
+
+// MountReservation pairs a device mount reservation ID with its resolved guest path.
+type MountReservation struct {
+	ID        guid.GUID
+	GuestPath string
+}
+
+// SCSILayerPlan holds the reservations and guest paths for a container's
+// read-only layers, scratch layer, and rootfs, mounted via SCSI.
+type SCSILayerPlan struct {
+	// ROLayers holds the read-only layer reservations in overlay order.
+	ROLayers []MountReservation
+	// Scratch is the writable scratch layer reservation. A zero-GUID ID means it was never reserved.
+	Scratch MountReservation
+	// RootfsGuestPath is the guest path where the container rootfs will be mounted.
+	RootfsGuestPath string
+}
+
+// SCSIReserver reserves a SCSI disk and returns the reservation ID and guest path.
+type SCSIReserver interface {
+	Reserve(ctx context.Context, diskConfig disk.Config, mountConfig scsiMount.Config) (guid.GUID, string, error)
+	UnmapFromGuest(ctx context.Context, reservation guid.GUID) error
+}
+
+// Plan9Reserver reserves a Plan9 share and returns the reservation ID and guest path.
+type Plan9Reserver interface {
+	Reserve(ctx context.Context, shareConfig share.Config, mountConfig plan9Mount.Config) (guid.GUID, string, error) // returns the reservation ID and resolved guest path.
+	UnmapFromGuest(ctx context.Context, reservation guid.GUID) error // releases a reservation previously returned by Reserve.
+}
+
+// VPCIReserver reserves a vPCI device and returns the VMBus channel GUID.
+type VPCIReserver interface {
+	Reserve(ctx context.Context, device vpci.Device) (guid.GUID, error) // returns the VMBus channel GUID for the attached device.
+	RemoveFromVM(ctx context.Context, vmBusGUID guid.GUID) error // detaches the device identified by the GUID returned from Reserve.
+}