From 32802d942754ca494883f83dc334aea42bbb9dd1 Mon Sep 17 00:00:00 2001 From: Shreyansh Sancheti Date: Mon, 4 May 2026 16:52:33 +0530 Subject: [PATCH 1/2] =?UTF-8?q?guest:=20unify=20pod=20model=20=E2=80=94=20?= =?UTF-8?q?replace=20VirtualPod=20with=20single=20pod=20abstraction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the separate VirtualPod tracking (dedicated type, exported methods, parent cgroup manager, reverse-lookup map) with a unified uvmPod type and a single pods map on Host. All pod types (V1 sandbox, virtual pod, V2 shim) now go through the same code path: - createPodInUVM allocates a cgroup under /pods/{sandboxID} - RemoveContainer handles cleanup uniformly Cgroup hierarchy changes from: /containers/{id} (V1 sandbox) /containers/virtual-pods/{virtualPodID} (virtual pod) to: /pods/{sandboxID} (all pod types) /pods/{sandboxID}/{containerID} (workload containers) Signed-off-by: Shreyansh Jain Signed-off-by: Shreyansh Sancheti --- cmd/gcs/main.go | 59 +-- internal/guest/runtime/hcsv2/container.go | 3 + .../runtime/hcsv2/container_stats_test.go | 13 - .../guest/runtime/hcsv2/sandbox_container.go | 12 +- .../runtime/hcsv2/standalone_container.go | 8 +- internal/guest/runtime/hcsv2/uvm.go | 370 +++++------------- .../guest/runtime/hcsv2/workload_container.go | 13 +- test/gcs/cgroup_test.go | 4 +- 8 files changed, 132 insertions(+), 350 deletions(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index c21ea167fa..a4a65fd0f5 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -10,7 +10,6 @@ import ( "os" "os/exec" "path/filepath" - "strings" "syscall" "time" @@ -80,12 +79,7 @@ func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, thre } count++ - var msg string - if strings.HasPrefix(cgName, "/virtual-pods") { - msg = "memory usage for virtual pods cgroup exceeded threshold" - } else { - msg = "memory usage for cgroup exceeded threshold" - } + msg := "memory usage for cgroup exceeded threshold" cgVersion := "v2" if isV1 { @@ -328,7 +322,7 @@ func main() { // Setup the UVM cgroups to protect against a workload taking all available // memory and causing the GCS to malfunction we create cgroups: gcs, - // containers, and virtual-pods for multi-pod support. + // containers, and pods for pod support. // // Write 1 to memory.use_hierarchy on the root cgroup to enable hierarchy @@ -340,8 +334,7 @@ func main() { } } - // The containers cgroup is limited only by {Totalram - 75 MB - // (reservation)}. + // The pods cgroup is limited only by {Totalram - 75 MB (reservation)}. // // The gcs cgroup is not limited but an event will get logged if memory // usage exceeds 50 MB. @@ -349,28 +342,16 @@ func main() { if err := syscall.Sysinfo(&sinfo); err != nil { logrus.WithError(err).Fatal("failed to get sys info") } - containersLimit := int64(sinfo.Totalram - *rootMemReserveBytes) - containersControl, err := cgroup.NewManager("/containers", &oci.LinuxResources{ - Memory: &oci.LinuxMemory{ - Limit: &containersLimit, - }, - }) - if err != nil { - logrus.WithError(err).Fatal("failed to create containers cgroup") - } - defer containersControl.Delete() //nolint:errcheck - - // Create virtual-pods cgroup hierarchy for multi-pod support - // This will be the parent for all virtual pod cgroups: /containers/virtual-pods/{virtualSandboxID} - virtualPodsControl, err := cgroup.NewManager("/containers/virtual-pods", &oci.LinuxResources{ + podsLimit := int64(sinfo.Totalram - *rootMemReserveBytes) + podsControl, err := cgroup.NewManager("/pods", &oci.LinuxResources{ Memory: &oci.LinuxMemory{ - Limit: &containersLimit, // Share the same limit as containers + Limit: &podsLimit, }, }) if err != nil { - logrus.WithError(err).Fatal("failed to create containers/virtual-pods cgroup") + logrus.WithError(err).Fatal("failed to create pods cgroup") } - defer virtualPodsControl.Delete() //nolint:errcheck + defer podsControl.Delete() //nolint:errcheck gcsControl, err := cgroup.NewManager("/gcs", &oci.LinuxResources{}) if err != nil { @@ -388,10 +369,6 @@ func main() { } h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter) - // Initialize virtual pod support in the host - if err := h.InitializeVirtualPodSupport(virtualPodsControl); err != nil { - logrus.WithError(err).Warn("Virtual pod support initialization failed") - } // During live migration the VM is frozen and only wakes up when the host // shim is ready, so the vsock port should be immediately available. We @@ -406,20 +383,13 @@ func main() { gefdFile := os.NewFile(gefd, "gefd") defer gefdFile.Close() - oom, err := containersControl.OOMEventFD() - if err != nil { - logrus.WithError(err).Fatal("failed to retrieve the container cgroups oom eventfd") - } - oomFile := os.NewFile(oom, "cefd") - defer oomFile.Close() - - // Setup OOM monitoring for virtual-pods cgroup - virtualPodsOom, err := virtualPodsControl.OOMEventFD() + // Setup OOM monitoring for pods cgroup + podsOom, err := podsControl.OOMEventFD() if err != nil { - logrus.WithError(err).Fatal("failed to retrieve the virtual-pods cgroups oom eventfd") + logrus.WithError(err).Fatal("failed to retrieve the pods cgroups oom eventfd") } - virtualPodsOomFile := os.NewFile(virtualPodsOom, "vp-oomfd") - defer virtualPodsOomFile.Close() + podsOomFile := os.NewFile(podsOom, "pods-oomfd") + defer podsOomFile.Close() // time synchronization service if !(*disableTimeSync) { @@ -429,8 +399,7 @@ func main() { } go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl) - go readMemoryEvents(startTime, oomFile, "/containers", containersLimit, containersControl) - go readMemoryEvents(startTime, virtualPodsOomFile, "/containers/virtual-pods", containersLimit, virtualPodsControl) + go readMemoryEvents(startTime, podsOomFile, "/pods", podsLimit, podsControl) mux := bridge.NewBridgeMux() b := bridge.New(mux, *v4) diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go index 0ce930dd65..36907e112b 100644 --- a/internal/guest/runtime/hcsv2/container.go +++ b/internal/guest/runtime/hcsv2/container.go @@ -44,7 +44,10 @@ const ( ) type Container struct { + // id is the unique container identifier. id string + // sandboxID is the pod this container belongs to. For sandbox containers, sandboxID == id. + sandboxID string vsock transport.Transport logPath string // path to [logFile]. diff --git a/internal/guest/runtime/hcsv2/container_stats_test.go b/internal/guest/runtime/hcsv2/container_stats_test.go index c1986ca2b1..fa44a4451a 100644 --- a/internal/guest/runtime/hcsv2/container_stats_test.go +++ b/internal/guest/runtime/hcsv2/container_stats_test.go @@ -494,16 +494,3 @@ func TestConvertV2StatsToV1_NilInput(t *testing.T) { t.Error("ConvertV2StatsToV1(nil) should return empty metrics with all nil fields") } } - -func TestHost_InitializeVirtualPodSupport_ErrorCases(t *testing.T) { - host := &Host{} - - // Test with nil input - err := host.InitializeVirtualPodSupport(nil) - if err == nil { - t.Error("Expected error for nil input") - } - if err != nil && err.Error() != "no valid cgroup manager provided for virtual pod support" { - t.Errorf("Unexpected error message: %s", err.Error()) - } -} diff --git a/internal/guest/runtime/hcsv2/sandbox_container.go b/internal/guest/runtime/hcsv2/sandbox_container.go index 12590f4dc0..308c349afc 100644 --- a/internal/guest/runtime/hcsv2/sandbox_container.go +++ b/internal/guest/runtime/hcsv2/sandbox_container.go @@ -119,14 +119,12 @@ func setupSandboxContainerSpec(ctx context.Context, id, sandboxRoot string, spec // also has a concept of a sandbox/shm file when the IPC NamespaceMode != // NODE. - // Set cgroup path - check if this is a virtual pod - if virtualSandboxID != "" { - // Virtual pod sandbox gets its own cgroup under /containers/virtual-pods using the virtual pod ID - spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID - } else { - // Traditional sandbox goes under /containers - spec.Linux.CgroupsPath = "/containers/" + id + // Set cgroup path under the pod. + sandboxID := id + if vpID := spec.Annotations[annotations.VirtualPodID]; vpID != "" { + sandboxID = vpID } + spec.Linux.CgroupsPath = "/pods/" + sandboxID // Clear the windows section as we dont want to forward to runc spec.Windows = nil diff --git a/internal/guest/runtime/hcsv2/standalone_container.go b/internal/guest/runtime/hcsv2/standalone_container.go index 768adda344..742a3716cd 100644 --- a/internal/guest/runtime/hcsv2/standalone_container.go +++ b/internal/guest/runtime/hcsv2/standalone_container.go @@ -15,7 +15,6 @@ import ( "github.com/Microsoft/hcsshim/internal/guest/network" specGuest "github.com/Microsoft/hcsshim/internal/guest/spec" "github.com/Microsoft/hcsshim/internal/oc" - "github.com/Microsoft/hcsshim/pkg/annotations" ) func getStandaloneHostnamePath(rootDir string) string { @@ -119,12 +118,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id, rootDir string, spec } // Set cgroup path - virtualSandboxID := spec.Annotations[annotations.VirtualPodID] - if virtualSandboxID != "" { - spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID + "/" + id - } else { - spec.Linux.CgroupsPath = "/containers/" + id - } + spec.Linux.CgroupsPath = "/pods/" + id // Clear the windows section as we dont want to forward to runc spec.Windows = nil diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index bbec0b7564..9ebfe91286 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -73,32 +73,26 @@ func checkValidContainerID(id string, idType string) error { return errors.Errorf("invalid %s id: %s (must match %s)", idType, id, validContainerIDRegex.String()) } -// VirtualPod represents a virtual pod that shares a UVM/Sandbox with other pods -type VirtualPod struct { - VirtualSandboxID string - MasterSandboxID string - NetworkNamespace string - CgroupPath string - CgroupControl cgroup.Manager // Unified cgroup manager (v1 or v2) - Containers map[string]bool // containerID -> exists - CreatedAt time.Time +// uvmPod tracks pod-level state within the UVM. +type uvmPod struct { + sandboxID string + networkNamespace string + cgroupPath string + cgroupControl cgroup.Manager + containers map[string]bool } // Host is the structure tracking all UVM host state including all containers // and processes. type Host struct { + // containersMutex guards both containers and pods maps. containersMutex sync.Mutex containers map[string]*Container + pods map[string]*uvmPod externalProcessesMutex sync.Mutex externalProcesses map[int]*externalProcess - // Virtual pod support for multi-pod scenarios - virtualPodsMutex sync.Mutex - virtualPods map[string]*VirtualPod // virtualSandboxID -> VirtualPod - containerToVirtualPod map[string]string // containerID -> virtualSandboxID - virtualPodsCgroupManager cgroup.Manager // Parent cgroup for all virtual pods - // sandboxRoots maps sandboxID to the resolved sandbox root directory. // Populated via registerSandboxRoot during sandbox creation using // the host-provided OCIBundlePath as source of truth. @@ -126,16 +120,15 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s logWriter, ) return &Host{ - containers: make(map[string]*Container), - externalProcesses: make(map[int]*externalProcess), - virtualPods: make(map[string]*VirtualPod), - containerToVirtualPod: make(map[string]string), - sandboxRoots: make(map[string]string), - rtime: rtime, - vsock: vsock, - devNullTransport: &transport.DevNullTransport{}, - hostMounts: newHostMounts(), - securityOptions: securityPolicyOptions, + containers: make(map[string]*Container), + externalProcesses: make(map[int]*externalProcess), + pods: make(map[string]*uvmPod), + sandboxRoots: make(map[string]string), + rtime: rtime, + vsock: vsock, + devNullTransport: &transport.DevNullTransport{}, + hostMounts: newHostMounts(), + securityOptions: securityPolicyOptions, } } @@ -214,26 +207,40 @@ func (h *Host) RemoveContainer(id string) { return } - // Check if this container is part of a virtual pod - virtualPodID, isVirtualPod := c.spec.Annotations[annotations.VirtualPodID] - if isVirtualPod { - // Remove from virtual pod tracking - h.RemoveContainerFromVirtualPod(id) - // Network namespace cleanup is handled in virtual pod cleanup when last container is removed. - logrus.WithFields(logrus.Fields{ - logfields.ContainerID: id, - logfields.VirtualSandboxID: virtualPodID, - }).Info("Container removed from virtual pod") - } else { - // delete the network namespace for standalone and sandbox containers - criType, isCRI := c.spec.Annotations[annotations.KubernetesContainerType] - if !isCRI || criType == "sandbox" { - _ = RemoveNetworkNamespace(context.Background(), id) + criType, isCRI := c.spec.Annotations[annotations.KubernetesContainerType] + + // Do NOT call RemoveNetworkNamespace for virtual pod sandbox containers. + // The host-driven teardown path (TearDownNetworking → RemoveNetNS → removeNIC) + // removes adapters first and then the namespace. Calling it here would fail + // with "contains adapters" because the host hasn't removed them yet. + virtualPodID := c.spec.Annotations[annotations.VirtualPodID] + isVirtualPodSandbox := virtualPodID != "" && id == virtualPodID + if !isVirtualPodSandbox && (!isCRI || criType == "sandbox") { + if err := RemoveNetworkNamespace(context.Background(), id); err != nil { + logrus.WithError(err).WithField(logfields.ContainerID, id).Warn("failed to remove network namespace") } } delete(h.containers, id) + // Clean up pod tracking. For standalone containers, sandboxID == id but + // no pod entry exists (createPodInUVM is only called for CRI sandboxes), + // so the lookup returns false and the block is skipped. + if pod, exists := h.pods[c.sandboxID]; exists { + delete(pod.containers, id) + // When the sandbox container itself is removed, tear down the pod. + if id == c.sandboxID { + if pod.cgroupControl != nil { + if err := pod.cgroupControl.Delete(); err != nil { + logrus.WithFields(logrus.Fields{ + "sandboxID": c.sandboxID, + }).WithError(err).Warn("failed to delete pod cgroup") + } + } + delete(h.pods, c.sandboxID) + } + } + // Clean up the sandbox root mapping for sandbox containers. if c.isSandbox { h.unregisterSandboxRoot(id) @@ -377,7 +384,8 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM criType, isCRI := settings.OCISpecification.Annotations[annotations.KubernetesContainerType] // Check for virtual pod annotation - virtualPodID, isVirtualPod := settings.OCISpecification.Annotations[annotations.VirtualPodID] + virtualPodID := settings.OCISpecification.Annotations[annotations.VirtualPodID] + isVirtualPod := virtualPodID != "" if h.HasSecurityPolicy() { if err = checkValidContainerID(id, "container"); err != nil { @@ -415,6 +423,21 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM } c.setStatus(containerCreating) + // Resolve sandboxID early so all downstream code uses the correct value. + // For sandbox containers, sandboxID == id (or virtualPodID for virtual pods). + // For workload containers, sandboxID comes from the KubernetesSandboxID annotation. + // For standalone containers, sandboxID == id. + sandboxID := id + if isVirtualPod { + sandboxID = virtualPodID + } else if criType == "container" { + sid := settings.OCISpecification.Annotations[annotations.KubernetesSandboxID] + if sid != "" { + sandboxID = sid + } + } + c.sandboxID = sandboxID + if err := h.AddContainer(id, c); err != nil { return nil, err } @@ -424,50 +447,10 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM } }() - // Handle virtual pod logic - if isVirtualPod && isCRI { - logrus.WithFields(logrus.Fields{ - logfields.ContainerID: id, - logfields.VirtualSandboxID: virtualPodID, - "criType": criType, - }).Info("Processing container for virtual pod") - - if criType == "sandbox" { - // This is a virtual pod sandbox - create the virtual pod if it doesn't exist - if _, exists := h.GetVirtualPod(virtualPodID); !exists { - // Use the network namespace ID from the current container spec - // Virtual pods share the same network namespace - networkNamespace := specGuest.GetNetworkNamespaceID(settings.OCISpecification) - if networkNamespace == "" { - networkNamespace = fmt.Sprintf("virtual-pod-%s", virtualPodID) - } - - if err := h.CreateVirtualPod(ctx, virtualPodID, virtualPodID, networkNamespace, settings.OCISpecification); err != nil { - return nil, errors.Wrapf(err, "failed to create virtual pod %s", virtualPodID) - } - } - } - - // Add this container to the virtual pod - if err := h.AddContainerToVirtualPod(id, virtualPodID); err != nil { - return nil, errors.Wrapf(err, "failed to add container %s to virtual pod %s", id, virtualPodID) - } - } - - // Normally we would be doing policy checking here at the start of our - // "policy gated function". However, we can't for create container as we - // need a properly correct sandboxID which might be changed by the code - // below that determines the sandboxID. This is a bit of future proofing - // as currently for our single use case, the sandboxID is the same as the - // container id - var namespaceID string - // for sandbox container sandboxID is same as container id - sandboxID := id if isCRI { switch criType { case "sandbox": - // Capture namespaceID if any because setupSandboxContainerSpec clears the Windows section. namespaceID = specGuest.GetNetworkNamespaceID(settings.OCISpecification) // Resolve the sandbox root from OCIBundlePath. @@ -503,20 +486,23 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM if err := securitypolicy.ExtendPolicyWithNetworkingMounts(id, h.securityOptions.PolicyEnforcer, settings.OCISpecification); err != nil { return nil, err } + + if err := h.createPodInUVM(sandboxID, settings.OCISpecification, namespaceID); err != nil { + return nil, err + } case "container": - sid, ok := settings.OCISpecification.Annotations[annotations.KubernetesSandboxID] - sandboxID = sid + sid := settings.OCISpecification.Annotations[annotations.KubernetesSandboxID] if h.HasSecurityPolicy() { if err = checkValidContainerID(sid, "sandbox"); err != nil { return nil, err } } - if !ok || sid == "" { + if sid == "" { return nil, errors.Errorf("unsupported 'io.kubernetes.cri.sandbox-id': '%s'", sid) } - sandboxRoot := h.resolveSandboxRoot(sid) + sandboxRoot := h.resolveSandboxRoot(sandboxID) c.sandboxRoot = sandboxRoot - if err = setupWorkloadContainerSpec(ctx, sid, id, sandboxRoot, settings.OCISpecification, settings.OCIBundlePath); err != nil { + if err = setupWorkloadContainerSpec(ctx, sandboxID, id, sandboxRoot, settings.OCISpecification, settings.OCIBundlePath); err != nil { return nil, err } @@ -540,8 +526,15 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM default: return nil, errors.Errorf("unsupported 'io.kubernetes.cri.container-type': '%s'", criType) } + + // Register container with its pod. + h.containersMutex.Lock() + if pod, exists := h.pods[sandboxID]; exists { + pod.containers[id] = true + } + h.containersMutex.Unlock() } else { - // Capture namespaceID if any because setupStandaloneContainerSpec clears the Windows section. + // Standalone container: no pod entry is created. namespaceID = specGuest.GetNetworkNamespaceID(settings.OCISpecification) // Standalone uses OCIBundlePath directly as its root. c.sandboxRoot = settings.OCIBundlePath @@ -1489,194 +1482,41 @@ func isPrivilegedContainerCreationRequest(ctx context.Context, spec *specs.Spec) return oci.ParseAnnotationsBool(ctx, spec.Annotations, annotations.LCOWPrivileged, false) } -// Virtual Pod Management Methods - -// InitializeVirtualPodSupport sets up the parent cgroup for virtual pods -func (h *Host) InitializeVirtualPodSupport(mgr cgroup.Manager) error { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - if mgr == nil { - return errors.New("no valid cgroup manager provided for virtual pod support") - } - - h.virtualPodsCgroupManager = mgr - logrus.Info("Virtual pod support initialized") - return nil -} - -// CreateVirtualPod creates a new virtual pod with its own cgroup and network namespace -func (h *Host) CreateVirtualPod(ctx context.Context, virtualSandboxID, masterSandboxID, networkNamespace string, pSpec *specs.Spec) error { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - // Check if virtual pod already exists - if _, exists := h.virtualPods[virtualSandboxID]; exists { - return fmt.Errorf("virtual pod %s already exists", virtualSandboxID) +// createPodInUVM allocates a cgroup for a pod and registers it in the host. +func (h *Host) createPodInUVM(sid string, pSpec *specs.Spec, nsID string) error { + h.containersMutex.Lock() + if _, exists := h.pods[sid]; exists { + h.containersMutex.Unlock() + return fmt.Errorf("pod %s already exists", sid) } + h.containersMutex.Unlock() - // Extract resource limits if provided + cgroupPath := path.Join("/pods", sid) resources := &specs.LinuxResources{} if pSpec != nil && pSpec.Linux != nil && pSpec.Linux.Resources != nil { resources = pSpec.Linux.Resources - logrus.WithFields(logrus.Fields{ - logfields.VirtualSandboxID: virtualSandboxID, - }).Info("Creating virtual pod with specified resources") - } else { - logrus.WithField(logfields.VirtualSandboxID, virtualSandboxID).Info("Creating pod cgroup with default resources as none were specified") - } - - if h.virtualPodsCgroupManager == nil { - return fmt.Errorf("no virtual pod cgroup manager available") } - - cgroupPath := path.Join("/containers/virtual-pods", virtualSandboxID) cgroupControl, err := cgroup.NewManager(cgroupPath, resources) if err != nil { - return errors.Wrapf(err, "failed to create cgroup for virtual pod %s", virtualSandboxID) - } - logrus.WithField("cgroupPath", cgroupPath).Info("Created virtual pod cgroup") - - // Create virtual pod structure - virtualPod := &VirtualPod{ - VirtualSandboxID: virtualSandboxID, - MasterSandboxID: masterSandboxID, - NetworkNamespace: networkNamespace, - CgroupPath: cgroupPath, - CgroupControl: cgroupControl, - Containers: make(map[string]bool), - CreatedAt: time.Now(), + return fmt.Errorf("failed to create cgroup for pod %s: %w", sid, err) } - - h.virtualPods[virtualSandboxID] = virtualPod - - logrus.WithFields(logrus.Fields{ - logfields.VirtualSandboxID: virtualSandboxID, - "masterSandboxID": masterSandboxID, - "cgroupPath": cgroupPath, - "networkNamespace": networkNamespace, - }).Info("Virtual pod created successfully") - - return nil -} - -// CreateVirtualPodWithoutMemoryLimit creates a virtual pod without memory limits (backward compatibility) -func (h *Host) CreateVirtualPodWithoutMemoryLimit(ctx context.Context, virtualSandboxID, masterSandboxID, networkNamespace string) error { - return h.CreateVirtualPod(ctx, virtualSandboxID, masterSandboxID, networkNamespace, nil) -} - -// GetVirtualPod retrieves a virtual pod by its virtualSandboxID -func (h *Host) GetVirtualPod(virtualSandboxID string) (*VirtualPod, bool) { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - vp, exists := h.virtualPods[virtualSandboxID] - return vp, exists -} - -// AddContainerToVirtualPod associates a container with a virtual pod -func (h *Host) AddContainerToVirtualPod(containerID, virtualSandboxID string) error { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - // Check if virtual pod exists - vp, exists := h.virtualPods[virtualSandboxID] - if !exists { - return fmt.Errorf("virtual pod %s does not exist", virtualSandboxID) + h.containersMutex.Lock() + defer h.containersMutex.Unlock() + // Double-check after cgroup creation in case of a race. + if _, exists := h.pods[sid]; exists { + _ = cgroupControl.Delete() + return fmt.Errorf("pod %s already exists", sid) + } + h.pods[sid] = &uvmPod{ + sandboxID: sid, + networkNamespace: nsID, + cgroupPath: cgroupPath, + cgroupControl: cgroupControl, + containers: make(map[string]bool), } - - // Add container to virtual pod - vp.Containers[containerID] = true - h.containerToVirtualPod[containerID] = virtualSandboxID - logrus.WithFields(logrus.Fields{ - logfields.ContainerID: containerID, - logfields.VirtualSandboxID: virtualSandboxID, - }).Info("Container added to virtual pod") - + "sandboxID": sid, + "cgroupPath": cgroupPath, + }).Info("pod created in UVM") return nil } - -// RemoveContainerFromVirtualPod removes a container from a virtual pod -func (h *Host) RemoveContainerFromVirtualPod(containerID string) { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - virtualSandboxID, exists := h.containerToVirtualPod[containerID] - if !exists { - return // Container not in any virtual pod - } - - ctx, entry := log.SetEntry(context.Background(), logrus.Fields{ - logfields.VirtualSandboxID: virtualSandboxID, - logfields.ContainerID: containerID, - }) - - // Remove from virtual pod - if vp, vpExists := h.virtualPods[virtualSandboxID]; vpExists { - delete(vp.Containers, containerID) - - // If this is the sandbox container, clean up the network namespace adapters first, - // then remove the namespace. - - // Do NOT call [RemoveNetworkNamespace] here. - // When cleaning up the virtual pod's sandbox container on the host, - // [UtilityVM.TearDownNetworking] calls [UtilityVM.RemoveNetNS], - // which, via [UtilityVM.removeNIC], removes the NICs from the uVM and - // sends [guestresource.ResourceTypeNetwork] RPCs that ultimately cal [modifyNetwork] - // to remove adapters from guest tracking. - // However, that happens AFTER this function runs. - // Calling [RemoveNetworkNamespace] here would - // fail with "contains adapters" since the host hasn't removed them yet. - // - // The host-driven path handles cleanup in the correct order. - if containerID == virtualSandboxID && vp.NetworkNamespace != "" { - entry.WithField(logfields.NamespaceID, vp.NetworkNamespace).Debug("skipping virtual pod network namespace removal") - } - - // If this was the last container, cleanup the virtual pod - if len(vp.Containers) == 0 { - h.cleanupVirtualPod(ctx, virtualSandboxID) - } - } - - delete(h.containerToVirtualPod, containerID) - - entry.Info("Container removed from virtual pod") -} - -func (h *Host) cleanupVirtualPod(ctx context.Context, virtualSandboxID string) { - // logfields set on [Host.RemoveContainerFromVirtualPod] - entry := log.G(ctx) - - vp, exists := h.virtualPods[virtualSandboxID] - if !exists { - entry.Warn("attempted to cleanup non-existent virtual sandbox pod") - return // virtual pod does not exist - } - - // Delete the cgroup - if vp.CgroupControl != nil { - if err := vp.CgroupControl.Delete(); err != nil { - entry.WithError(err).Warn("failed to delete virtual pod cgroup") - } - } - - // NOTE: Do NOT call [RemoveNetworkNamespace] here. - // See comment in [Host.RemoveContainerFromVirtualPod] for more info. - - // Clean up the virtual pod root directory which contains hostname, hosts, - // resolv.conf, and logs/. These are NOT cleaned by Container.Delete() because - // they live under /run/gcs/c/virtual-pods// which is separate from the - // container's ociBundlePath (/run/gcs/c//). - if vpRootDir := specGuest.VirtualPodRootDir(virtualSandboxID); vpRootDir != "" { - if err := os.RemoveAll(vpRootDir); err != nil { - entry.WithField("path", vpRootDir).WithError(err).Warn("Failed to remove virtual pod root directory") - } else { - entry.WithField("path", vpRootDir).Debug("Removed virtual pod root directory") - } - } - - delete(h.virtualPods, virtualSandboxID) - entry.Info("Virtual pod cleaned up") -} diff --git a/internal/guest/runtime/hcsv2/workload_container.go b/internal/guest/runtime/hcsv2/workload_container.go index d118a4d35c..32a924bb1f 100644 --- a/internal/guest/runtime/hcsv2/workload_container.go +++ b/internal/guest/runtime/hcsv2/workload_container.go @@ -224,17 +224,8 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id, sandboxRoot strin } } - // Check if this is a virtual pod container - virtualPodID := spec.Annotations[annotations.VirtualPodID] - - // Set cgroup path - check if this is a virtual pod container - if virtualPodID != "" { - // Virtual pod containers go under /containers/virtual-pods/virtualPodID/containerID - spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualPodID + "/" + id - } else { - // Regular containers go under /containers - spec.Linux.CgroupsPath = "/containers/" + id - } + // Set cgroup path under the sandbox's pod cgroup. + spec.Linux.CgroupsPath = "/pods/" + sbid + "/" + id if spec.Windows != nil { // we only support Nvidia gpus right now diff --git a/test/gcs/cgroup_test.go b/test/gcs/cgroup_test.go index 72b84808bf..71ee644a01 100644 --- a/test/gcs/cgroup_test.go +++ b/test/gcs/cgroup_test.go @@ -435,8 +435,8 @@ func TestOOMEventFD_ContainerKill(t *testing.T) { // --- Set up OOMEventFD on the container's cgroup --- // - // Standalone containers get cgroup path "/containers/". - cgroupPath := "/containers/" + id + // Standalone containers get cgroup path "/pods/". + cgroupPath := "/pods/" + id // LoadManager works for both v1 and v2 — the cgroup already exists (created by runc). mgr, err := cgroup.LoadManager(cgroupPath) From 309596fa6396cbcfcfa0a6b16721fa215a170491 Mon Sep 17 00:00:00 2001 From: Shreyansh Sancheti Date: Mon, 4 May 2026 17:15:35 +0530 Subject: [PATCH 2/2] =?UTF-8?q?guest:=20address=20review=20=E2=80=94=20sin?= =?UTF-8?q?gle=20lock=20in=20createPodInUVM,=20direct=20sandboxID=20assign?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Lock containersMutex over the entire createPodInUVM method instead of the double-check pattern. - Assign sandboxID directly from annotation without intermediate sid variable in the early resolution block. Signed-off-by: Shreyansh Jain Signed-off-by: Shreyansh Sancheti --- internal/guest/runtime/hcsv2/uvm.go | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 9ebfe91286..428d6bb588 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -431,10 +431,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM if isVirtualPod { sandboxID = virtualPodID } else if criType == "container" { - sid := settings.OCISpecification.Annotations[annotations.KubernetesSandboxID] - if sid != "" { - sandboxID = sid - } + sandboxID = settings.OCISpecification.Annotations[annotations.KubernetesSandboxID] } c.sandboxID = sandboxID @@ -1484,29 +1481,23 @@ func isPrivilegedContainerCreationRequest(ctx context.Context, spec *specs.Spec) // createPodInUVM allocates a cgroup for a pod and registers it in the host. func (h *Host) createPodInUVM(sid string, pSpec *specs.Spec, nsID string) error { - h.containersMutex.Lock() - if _, exists := h.pods[sid]; exists { - h.containersMutex.Unlock() - return fmt.Errorf("pod %s already exists", sid) - } - h.containersMutex.Unlock() - cgroupPath := path.Join("/pods", sid) resources := &specs.LinuxResources{} if pSpec != nil && pSpec.Linux != nil && pSpec.Linux.Resources != nil { resources = pSpec.Linux.Resources } - cgroupControl, err := cgroup.NewManager(cgroupPath, resources) - if err != nil { - return fmt.Errorf("failed to create cgroup for pod %s: %w", sid, err) - } + h.containersMutex.Lock() defer h.containersMutex.Unlock() - // Double-check after cgroup creation in case of a race. + if _, exists := h.pods[sid]; exists { - _ = cgroupControl.Delete() return fmt.Errorf("pod %s already exists", sid) } + + cgroupControl, err := cgroup.NewManager(cgroupPath, resources) + if err != nil { + return fmt.Errorf("failed to create cgroup for pod %s: %w", sid, err) + } h.pods[sid] = &uvmPod{ sandboxID: sid, networkNamespace: nsID,