From d5995de6848e11d32a105f274689c2d692993eae Mon Sep 17 00:00:00 2001 From: Antoine Gaillard Date: Thu, 28 Aug 2025 15:23:46 +0200 Subject: [PATCH 1/3] POC: Force writable image volumes using overlay filesystem This is a proof-of-concept that makes all image volumes writable by: - Removing the readonly restriction check - Creating an overlay filesystem with: - Lower layer: Original read-only image content - Upper/Work layers: Writable directories for modifications - Setting the mount as non-readonly for containers The changes are isolated per container through separate upper/work directories. NOTE: This is a temporary POC implementation. Proper support requires: - Kubernetes API changes to support writable image volumes - Kubelet changes to pass the readonly flag correctly - Removal of the hardcoded forcing of writable mounts Changes made: - Skip readonly validation for image volumes - Create overlay filesystem for each image mount - Update cleanup logic to handle overlay directory structure - Force Readonly=false on mount spec for containers --- internal/cri/server/container_image_mount.go | 103 ++++++++++++++----- 1 file changed, 80 insertions(+), 23 deletions(-) diff --git a/internal/cri/server/container_image_mount.go b/internal/cri/server/container_image_mount.go index 1f5aa55cf38f6..d75314c7ee21a 100644 --- a/internal/cri/server/container_image_mount.go +++ b/internal/cri/server/container_image_mount.go @@ -21,6 +21,7 @@ import ( "fmt" "os" "path/filepath" + "strings" containerd "github.com/containerd/containerd/v2/client" "github.com/containerd/containerd/v2/core/leases" @@ -79,9 +80,12 @@ func (c *criService) mutateImageMount( if extraMount.GetHostPath() != "" { return fmt.Errorf("hostpath must be empty while mount image: %+v", extraMount) } - if !extraMount.GetReadonly() { - return fmt.Errorf("readonly must be true while mount image: %+v", extraMount) - } + // POC: Force all image volumes to be writable via overlay filesystem + // TODO: Remove when Kubernetes API supports writable image volumes + // Original check: + // if !extraMount.GetReadonly() { + // return fmt.Errorf("readonly must be true while mount image: %+v", extraMount) + // } ref := imageSpec.GetImage() if ref == "" { @@ -99,7 +103,12 @@ func (c *criService) mutateImageMount( // This is a digest of the manifest imageID := containerdImage.Target().Digest.Encoded() - target := c.getImageVolumeHostPath(sandboxID, imageID) + // POC: Use overlay filesystem to make image volumes writable + // Paths for overlay components + target := c.getImageVolumeHostPath(sandboxID, imageID+"-overlay") + lowerDir := c.getImageVolumeHostPath(sandboxID, imageID+"-lower") + upperDir := c.getImageVolumeHostPath(sandboxID, imageID+"-upper") + workDir := c.getImageVolumeHostPath(sandboxID, imageID+"-work") // Already mounted in another container on the same pod mounted, err := ensureImageVolumeMounted(target) @@ -108,6 +117,8 @@ func (c *criService) mutateImageMount( } if mounted { extraMount.HostPath = target + // POC: Mark mount as writable + extraMount.Readonly = false return nil } @@ -128,32 +139,67 @@ func (c *criService) mutateImageMount( chainID := identity.ChainID(diffIDs).String() s := c.client.SnapshotService(snapshotter) - mounts, err := s.Prepare(ctx, target, chainID) + + // POC: Prepare snapshot for lower layer + snapshotKey := fmt.Sprintf("%s-lower-%s", sandboxID, imageID) + mounts, err := s.Prepare(ctx, snapshotKey, chainID) if err != nil { if errdefs.IsAlreadyExists(err) { - mounts, err = s.Mounts(ctx, target) + mounts, err = s.Mounts(ctx, snapshotKey) } } if err != nil { - return fmt.Errorf("failed to prepare for image volume %q: %w", ref, err) + return fmt.Errorf("failed to prepare snapshot for image volume %q: %w", ref, err) } defer func() { if retErr != nil { - _ = s.Remove(ctx, target) + _ = s.Remove(ctx, snapshotKey) } }() - - err = os.MkdirAll(target, 0755) - if err != nil { - return fmt.Errorf("failed to create directory to image volume target path %q: %w", target, err) + + // Mount the lower layer + if err := os.MkdirAll(lowerDir, 0755); err != nil { + return fmt.Errorf("failed to create lower dir %q: %w", lowerDir, err) } - mounts = addVolatileOptionOnImageVolumeMount(mounts) - if err := mount.All(mounts, target); err != nil { - return fmt.Errorf("failed to mount image volume component %q: %w", target, err) + if err := mount.All(mounts, lowerDir); err != nil { + return fmt.Errorf("failed to mount lower layer %q: %w", lowerDir, err) } - + defer func() { + if retErr != nil { + _ = mount.UnmountAll(lowerDir, 0) + } + }() + + // Create upper and work directories for overlay + if err := os.MkdirAll(upperDir, 0755); err != nil { + return fmt.Errorf("failed to create upper dir %q: %w", upperDir, err) + } + if err := os.MkdirAll(workDir, 0755); err != nil { + return fmt.Errorf("failed to create work dir %q: %w", workDir, err) + } + if err := os.MkdirAll(target, 0755); err != nil { + return fmt.Errorf("failed to create target dir %q: %w", target, err) + } + + // Mount overlay filesystem + overlayMount := mount.Mount{ + Type: "overlay", + Source: "overlay", + Options: []string{ + fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerDir, upperDir, workDir), + }, + } + + if err := overlayMount.Mount(target); err != nil { + return fmt.Errorf("failed to mount writable overlay at %q: %w", target, err) + } + + log.G(ctx).Infof("POC: Mounted writable overlay for image volume at %s", target) + extraMount.HostPath = target + // POC: Mark mount as writable + extraMount.Readonly = false return nil } @@ -183,18 +229,29 @@ func (c *criService) cleanupImageMounts( for _, entry := range entries { target := filepath.Join(targetBase, entry.Name()) + entryName := entry.Name() + // Unmount the target err = mount.UnmountAll(target, 0) if err != nil { - return fmt.Errorf("failed to unmount image volume component %q: %w", target, err) + log.G(ctx).WithError(err).Warnf("failed to unmount image volume component %q", target) } - err = s.Remove(ctx, target) - if err != nil && !errdefs.IsNotFound(err) { - return fmt.Errorf("failed to removing snapshot: %w", err) + + // POC: Handle snapshot cleanup for overlay setup + // For lower directories, use the snapshot key format + if strings.HasSuffix(entryName, "-lower") { + imageID := strings.TrimSuffix(entryName, "-lower") + snapshotKey := fmt.Sprintf("%s-lower-%s", sandboxID, imageID) + err = s.Remove(ctx, snapshotKey) + if err != nil && !errdefs.IsNotFound(err) { + log.G(ctx).WithError(err).Debugf("failed to remove snapshot %q", snapshotKey) + } } - err = os.Remove(target) - if err != nil && !errdefs.IsNotFound(err) { - return fmt.Errorf("failed to removing mounts directory: %w", err) + + // Remove the directory + err = os.RemoveAll(target) + if err != nil && !os.IsNotExist(err) { + log.G(ctx).WithError(err).Warnf("failed to remove directory %q", target) } } From 286b388428ec04b5393c03fa61c9ef7594412ed8 Mon Sep 17 00:00:00 2001 From: Antoine Gaillard Date: Thu, 4 Sep 2025 19:28:44 +0200 Subject: [PATCH 2/3] Fix snapshot preparation and add tmpfs-backed overlay - Fix snapshot key to use lowerDir consistently - Add tmpfs mounting for upper/work directories (4x image size, 256MB-32GB) - Create overlay subdirectories within tmpfs mounts - Add tmpfs cleanup in removal logic --- .github/workflows/links.yml | 1 - internal/cri/server/container_image_mount.go | 89 +++++++++++++++++--- 2 files changed, 78 insertions(+), 12 deletions(-) diff --git a/.github/workflows/links.yml b/.github/workflows/links.yml index 07dd99204170d..386772d144000 100644 --- a/.github/workflows/links.yml +++ b/.github/workflows/links.yml @@ -14,7 +14,6 @@ permissions: jobs: check: runs-on: ubuntu-latest - if: github.repository == 'containerd/containerd' name: lychee timeout-minutes: 15 steps: diff --git a/internal/cri/server/container_image_mount.go b/internal/cri/server/container_image_mount.go index d75314c7ee21a..a52ef523fa7bc 100644 --- a/internal/cri/server/container_image_mount.go +++ b/internal/cri/server/container_image_mount.go @@ -140,24 +140,23 @@ func (c *criService) mutateImageMount( s := c.client.SnapshotService(snapshotter) - // POC: Prepare snapshot for lower layer - snapshotKey := fmt.Sprintf("%s-lower-%s", sandboxID, imageID) - mounts, err := s.Prepare(ctx, snapshotKey, chainID) + // Prepare snapshot for lower directory with lowerDir as the key + mounts, err := s.Prepare(ctx, lowerDir, chainID) if err != nil { if errdefs.IsAlreadyExists(err) { - mounts, err = s.Mounts(ctx, snapshotKey) + mounts, err = s.Mounts(ctx, lowerDir) } } if err != nil { - return fmt.Errorf("failed to prepare snapshot for image volume %q: %w", ref, err) + return fmt.Errorf("failed to prepare for image volume %q: %w", ref, err) } defer func() { if retErr != nil { - _ = s.Remove(ctx, snapshotKey) + _ = s.Remove(ctx, lowerDir) } }() - // Mount the lower layer + // Mount the snapshot to the lower layer (this puts the image content there) if err := os.MkdirAll(lowerDir, 0755); err != nil { return fmt.Errorf("failed to create lower dir %q: %w", lowerDir, err) } @@ -171,23 +170,83 @@ func (c *criService) mutateImageMount( } }() - // Create upper and work directories for overlay + // Get image size to determine tmpfs size + imageSize, err := containerdImage.Size(ctx) + if err != nil { + log.G(ctx).WithError(err).Warnf("failed to get image size, using default") + imageSize = 512 * 1024 * 1024 // Default to 512MB + } + + // Use 4x image size for tmpfs, with reasonable min/max bounds + tmpfsSize := imageSize * 4 + minSize := int64(256 * 1024 * 1024) // 256MB minimum + maxSize := int64(32 * 1024 * 1024 * 1024) // 32GB maximum + + if tmpfsSize < minSize { + tmpfsSize = minSize + } + if tmpfsSize > maxSize { + tmpfsSize = maxSize + } + + // Create and mount tmpfs for upper and work directories if err := os.MkdirAll(upperDir, 0755); err != nil { return fmt.Errorf("failed to create upper dir %q: %w", upperDir, err) } if err := os.MkdirAll(workDir, 0755); err != nil { return fmt.Errorf("failed to create work dir %q: %w", workDir, err) } + + tmpfsMount := mount.Mount{ + Type: "tmpfs", + Source: "tmpfs", + Options: []string{ + fmt.Sprintf("size=%d", tmpfsSize), + "mode=0755", + }, + } + + if err := tmpfsMount.Mount(upperDir); err != nil { + return fmt.Errorf("failed to mount tmpfs on upper dir %q: %w", upperDir, err) + } + defer func() { + if retErr != nil { + _ = mount.UnmountAll(upperDir, 0) + } + }() + + if err := tmpfsMount.Mount(workDir); err != nil { + return fmt.Errorf("failed to mount tmpfs on work dir %q: %w", workDir, err) + } + defer func() { + if retErr != nil { + _ = mount.UnmountAll(workDir, 0) + } + }() + + // Create actual overlay directories within the tmpfs mounts + // Overlay needs empty directories, not the mount points themselves + upperOverlayDir := filepath.Join(upperDir, "overlay") + workOverlayDir := filepath.Join(workDir, "overlay") + + if err := os.MkdirAll(upperOverlayDir, 0755); err != nil { + return fmt.Errorf("failed to create overlay dir in tmpfs upper %q: %w", upperOverlayDir, err) + } + if err := os.MkdirAll(workOverlayDir, 0755); err != nil { + return fmt.Errorf("failed to create overlay dir in tmpfs work %q: %w", workOverlayDir, err) + } + + log.G(ctx).Infof("POC: Mounted tmpfs overlay (size: %dMB) for image volume at %s", tmpfsSize/(1024*1024), target) if err := os.MkdirAll(target, 0755); err != nil { return fmt.Errorf("failed to create target dir %q: %w", target, err) } - // Mount overlay filesystem + // Mount overlay filesystem using subdirectories within tmpfs overlayMount := mount.Mount{ Type: "overlay", Source: "overlay", Options: []string{ - fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerDir, upperDir, workDir), + fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerDir, upperOverlayDir, workOverlayDir), }, } @@ -231,12 +290,20 @@ func (c *criService) cleanupImageMounts( target := filepath.Join(targetBase, entry.Name()) entryName := entry.Name() - // Unmount the target + // Unmount the target (overlay) err = mount.UnmountAll(target, 0) if err != nil { log.G(ctx).WithError(err).Warnf("failed to unmount image volume component %q", target) } + // Also unmount tmpfs upper and work directories + if strings.HasSuffix(entryName, "-upper") || strings.HasSuffix(entryName, "-work") { + err = mount.UnmountAll(target, 0) + if err != nil { + log.G(ctx).WithError(err).Debugf("failed to unmount tmpfs at %q", target) + } + } + // POC: Handle snapshot cleanup for overlay setup // For lower directories, use the snapshot key format if strings.HasSuffix(entryName, "-lower") { From 5508f420a9507d12a485eeadfcfde1cff4610ed9 Mon Sep 17 00:00:00 2001 From: Antoine Gaillard Date: Thu, 4 Sep 2025 20:34:38 +0200 Subject: [PATCH 3/3] Use /dev/shm for overlay upper/work dirs Working writable image volumes with in-memory overlay. Add debug logging and fix snapshot preparation. --- internal/cri/server/container_image_mount.go | 90 ++++++-------------- 1 file changed, 26 insertions(+), 64 deletions(-) diff --git a/internal/cri/server/container_image_mount.go b/internal/cri/server/container_image_mount.go index a52ef523fa7bc..5053599703cdd 100644 --- a/internal/cri/server/container_image_mount.go +++ b/internal/cri/server/container_image_mount.go @@ -34,6 +34,13 @@ import ( runtime "k8s.io/cri-api/pkg/apis/runtime/v1" ) +func dirExists(path string) bool { + if _, err := os.Stat(path); os.IsNotExist(err) { + return false + } + return true +} + func (c *criService) mutateMounts( ctx context.Context, extraMounts []*runtime.Mount, @@ -107,8 +114,9 @@ func (c *criService) mutateImageMount( // Paths for overlay components target := c.getImageVolumeHostPath(sandboxID, imageID+"-overlay") lowerDir := c.getImageVolumeHostPath(sandboxID, imageID+"-lower") - upperDir := c.getImageVolumeHostPath(sandboxID, imageID+"-upper") - workDir := c.getImageVolumeHostPath(sandboxID, imageID+"-work") + // Use /dev/shm for upper/work directories for in-memory performance + upperDir := filepath.Join("/dev/shm/containerd-image-volumes", sandboxID, imageID+"-upper") + workDir := filepath.Join("/dev/shm/containerd-image-volumes", sandboxID, imageID+"-work") // Already mounted in another container on the same pod mounted, err := ensureImageVolumeMounted(target) @@ -161,104 +169,58 @@ func (c *criService) mutateImageMount( return fmt.Errorf("failed to create lower dir %q: %w", lowerDir, err) } mounts = addVolatileOptionOnImageVolumeMount(mounts) + log.G(ctx).Infof("POC DEBUG: About to mount snapshot to lower dir %s with %d mounts", lowerDir, len(mounts)) if err := mount.All(mounts, lowerDir); err != nil { return fmt.Errorf("failed to mount lower layer %q: %w", lowerDir, err) } + log.G(ctx).Infof("POC DEBUG: Successfully mounted lower layer %s", lowerDir) defer func() { if retErr != nil { _ = mount.UnmountAll(lowerDir, 0) } }() - // Get image size to determine tmpfs size - imageSize, err := containerdImage.Size(ctx) - if err != nil { - log.G(ctx).WithError(err).Warnf("failed to get image size, using default") - imageSize = 512 * 1024 * 1024 // Default to 512MB - } - - // Use 4x image size for tmpfs, with reasonable min/max bounds - tmpfsSize := imageSize * 4 - minSize := int64(256 * 1024 * 1024) // 256MB minimum - maxSize := int64(32 * 1024 * 1024 * 1024) // 32GB maximum - - if tmpfsSize < minSize { - tmpfsSize = minSize - } - if tmpfsSize > maxSize { - tmpfsSize = maxSize - } - - // Create and mount tmpfs for upper and work directories + // Create upper and work directories in /dev/shm for in-memory performance + log.G(ctx).Infof("POC DEBUG: Creating /dev/shm directories - upper: %s, work: %s", upperDir, workDir) if err := os.MkdirAll(upperDir, 0755); err != nil { return fmt.Errorf("failed to create upper dir %q: %w", upperDir, err) } if err := os.MkdirAll(workDir, 0755); err != nil { return fmt.Errorf("failed to create work dir %q: %w", workDir, err) } - - tmpfsMount := mount.Mount{ - Type: "tmpfs", - Source: "tmpfs", - Options: []string{ - fmt.Sprintf("size=%d", tmpfsSize), - "mode=0755", - }, - } - - if err := tmpfsMount.Mount(upperDir); err != nil { - return fmt.Errorf("failed to mount tmpfs on upper dir %q: %w", upperDir, err) - } - defer func() { - if retErr != nil { - _ = mount.UnmountAll(upperDir, 0) - } - }() - - if err := tmpfsMount.Mount(workDir); err != nil { - return fmt.Errorf("failed to mount tmpfs on work dir %q: %w", workDir, err) - } + log.G(ctx).Infof("POC DEBUG: Created /dev/shm directories successfully") defer func() { if retErr != nil { - _ = mount.UnmountAll(workDir, 0) + _ = os.RemoveAll(upperDir) + _ = os.RemoveAll(workDir) } }() - - // Create actual overlay directories within the tmpfs mounts - // Overlay needs empty directories, not the mount points themselves - upperOverlayDir := filepath.Join(upperDir, "overlay") - workOverlayDir := filepath.Join(workDir, "overlay") - - if err := os.MkdirAll(upperOverlayDir, 0755); err != nil { - return fmt.Errorf("failed to create overlay dir in tmpfs upper %q: %w", upperOverlayDir, err) - } - if err := os.MkdirAll(workOverlayDir, 0755); err != nil { - return fmt.Errorf("failed to create overlay dir in tmpfs work %q: %w", workOverlayDir, err) - } - - log.G(ctx).Infof("POC: Mounted tmpfs overlay (size: %dMB) for image volume at %s", tmpfsSize/(1024*1024), target) if err := os.MkdirAll(target, 0755); err != nil { return fmt.Errorf("failed to create target dir %q: %w", target, err) } - // Mount overlay filesystem using subdirectories within tmpfs + // Mount overlay filesystem using /dev/shm directories + overlayOpts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerDir, upperDir, workDir) + log.G(ctx).Infof("POC DEBUG: Mounting overlay with opts: %s", overlayOpts) overlayMount := mount.Mount{ Type: "overlay", Source: "overlay", - Options: []string{ - fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerDir, upperOverlayDir, workOverlayDir), - }, + Options: []string{overlayOpts}, } if err := overlayMount.Mount(target); err != nil { + log.G(ctx).Errorf("POC DEBUG: Overlay mount failed - lower exists: %v, upper exists: %v, work exists: %v", + dirExists(lowerDir), dirExists(upperDir), dirExists(workDir)) return fmt.Errorf("failed to mount writable overlay at %q: %w", target, err) } - log.G(ctx).Infof("POC: Mounted writable overlay for image volume at %s", target) + log.G(ctx).Infof("POC DEBUG: Successfully mounted overlay at %s", target) extraMount.HostPath = target // POC: Mark mount as writable + log.G(ctx).Infof("POC DEBUG: Setting extraMount.Readonly = false (was %v)", extraMount.GetReadonly()) extraMount.Readonly = false + log.G(ctx).Infof("POC DEBUG: Final mount - HostPath: %s, Readonly: %v", extraMount.HostPath, extraMount.Readonly) return nil }