From 1cd0ef7a81f422af262bd500f90e928cd6f7cbcb Mon Sep 17 00:00:00 2001 From: Olli Janatuinen Date: Wed, 8 May 2019 04:27:15 +0300 Subject: [PATCH 1/4] First step to implement full garbage collector for image layers Refactored exiting logic on way that layers are first marked to be under removal so if actual removal fails they can be found from disk and cleaned up. Full garbage collector will be implemented as part of containerd migration. Signed-off-by: Olli Janatuinen (cherry picked from commit 213681b66a41d7728445e30125176834ce349c2a) Signed-off-by: Sebastiaan van Stijn --- integration/image/remove_unix_test.go | 88 +++++++++++++++++++++++++++ internal/test/daemon/daemon.go | 7 +++ internal/test/daemon/ops.go | 7 +++ internal/test/daemon/ops_unix.go | 34 +++++++++++ layer/filestore.go | 76 ++++++++++++++++++++++- layer/layer_store.go | 40 +++++++++++- 6 files changed, 247 insertions(+), 5 deletions(-) create mode 100644 integration/image/remove_unix_test.go create mode 100644 internal/test/daemon/ops_unix.go diff --git a/integration/image/remove_unix_test.go b/integration/image/remove_unix_test.go new file mode 100644 index 0000000000000..984ce5c9d72db --- /dev/null +++ b/integration/image/remove_unix_test.go @@ -0,0 +1,88 @@ +// +build !windows + +package image // import "github.com/docker/docker/integration/image" + +import ( + "context" + "io" + "io/ioutil" + "os" + "strconv" + "syscall" + "testing" + "unsafe" + + "github.com/docker/docker/api/types" + "github.com/docker/docker/internal/test/daemon" + "github.com/docker/docker/internal/test/fakecontext" + "gotest.tools/assert" + "gotest.tools/skip" +) + +// This is a regression test for #38488 +// It ensures that orphan layers can be found and cleaned up +// after unsuccessful image removal +func TestRemoveImageGarbageCollector(t *testing.T) { + // This test uses very platform specific way to prevent + // daemon for remove image layer. + skip.If(t, testEnv.DaemonInfo.OSType != "linux") + skip.If(t, os.Getenv("DOCKER_ENGINE_GOARCH") != "amd64") + + // Create daemon with overlay2 graphdriver because vfs uses disk differently + // and this test case would not work with it. + d := daemon.New(t, daemon.WithStorageDriver("overlay2"), daemon.WithImageService) + d.Start(t) + defer d.Stop(t) + + ctx := context.Background() + client := d.NewClientT(t) + i := d.ImageService() + + img := "test-garbage-collector" + + // Build a image with multiple layers + dockerfile := `FROM busybox + RUN echo echo Running... > /run.sh` + source := fakecontext.New(t, "", fakecontext.WithDockerfile(dockerfile)) + defer source.Close() + resp, err := client.ImageBuild(ctx, + source.AsTarReader(t), + types.ImageBuildOptions{ + Remove: true, + ForceRemove: true, + Tags: []string{img}, + }) + assert.NilError(t, err) + _, err = io.Copy(ioutil.Discard, resp.Body) + resp.Body.Close() + assert.NilError(t, err) + image, _, err := client.ImageInspectWithRaw(ctx, img) + assert.NilError(t, err) + + // Mark latest image layer to immutable + data := image.GraphDriver.Data + file, _ := os.Open(data["UpperDir"]) + attr := 0x00000010 + fsflags := uintptr(0x40086602) + argp := uintptr(unsafe.Pointer(&attr)) + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, file.Fd(), fsflags, argp) + assert.Equal(t, "errno 0", errno.Error()) + + // Try to remove the image, it should generate error + // but marking layer back to mutable before checking errors (so we don't break CI server) + _, err = client.ImageRemove(ctx, img, types.ImageRemoveOptions{}) + attr = 0x00000000 + argp = uintptr(unsafe.Pointer(&attr)) + _, _, errno = syscall.Syscall(syscall.SYS_IOCTL, file.Fd(), fsflags, argp) + assert.Equal(t, "errno 0", errno.Error()) + assert.ErrorContains(t, err, "permission denied") + + // Verify that layer remaining on disk + dir, _ := os.Stat(data["UpperDir"]) + assert.Equal(t, "true", strconv.FormatBool(dir.IsDir())) + + // Run imageService.Cleanup() and make sure that layer was removed from disk + i.Cleanup() + dir, err = os.Stat(data["UpperDir"]) + assert.ErrorContains(t, err, "no such file or directory") +} diff --git a/internal/test/daemon/daemon.go b/internal/test/daemon/daemon.go index 7ad377347989c..f5a55d4758e32 100644 --- a/internal/test/daemon/daemon.go +++ b/internal/test/daemon/daemon.go @@ -16,6 +16,7 @@ import ( "github.com/docker/docker/api/types" "github.com/docker/docker/api/types/events" "github.com/docker/docker/client" + "github.com/docker/docker/daemon/images" "github.com/docker/docker/internal/test" "github.com/docker/docker/internal/test/request" "github.com/docker/docker/opts" @@ -77,6 +78,7 @@ type Daemon struct { init bool dockerdBinary string log logT + imageService *images.ImageService // swarm related field swarmListenAddr string @@ -720,3 +722,8 @@ func cleanupRaftDir(t testingT, rootPath string) { } } } + +// ImageService returns the Daemon's ImageService +func (d *Daemon) ImageService() *images.ImageService { + return d.imageService +} diff --git a/internal/test/daemon/ops.go b/internal/test/daemon/ops.go index ff8cd88946f9c..a8dd283b82bda 100644 --- a/internal/test/daemon/ops.go +++ b/internal/test/daemon/ops.go @@ -63,3 +63,10 @@ func WithEnvironment(e environment.Execution) func(*Daemon) { } } } + +// WithStorageDriver sets store driver option +func WithStorageDriver(driver string) func(d *Daemon) { + return func(d *Daemon) { + d.storageDriver = driver + } +} diff --git a/internal/test/daemon/ops_unix.go b/internal/test/daemon/ops_unix.go new file mode 100644 index 0000000000000..41011e87c5c09 --- /dev/null +++ b/internal/test/daemon/ops_unix.go @@ -0,0 +1,34 @@ +// +build !windows + +package daemon + +import ( + "path/filepath" + "runtime" + + "github.com/docker/docker/daemon/images" + "github.com/docker/docker/layer" + + // register graph drivers + _ "github.com/docker/docker/daemon/graphdriver/register" + "github.com/docker/docker/pkg/idtools" +) + +// WithImageService sets imageService options +func WithImageService(d *Daemon) { + layerStores := make(map[string]layer.Store) + os := runtime.GOOS + layerStores[os], _ = layer.NewStoreFromOptions(layer.StoreOptions{ + Root: d.Root, + MetadataStorePathTemplate: filepath.Join(d.RootDir(), "image", "%s", "layerdb"), + GraphDriver: d.storageDriver, + GraphDriverOptions: nil, + IDMapping: &idtools.IdentityMapping{}, + PluginGetter: nil, + ExperimentalEnabled: false, + OS: os, + }) + d.imageService = images.NewImageService(images.ImageServiceConfig{ + LayerStores: layerStores, + }) +} diff --git a/layer/filestore.go b/layer/filestore.go index 208a0c3a859a4..37a927022a88f 100644 --- a/layer/filestore.go +++ b/layer/filestore.go @@ -305,6 +305,47 @@ func (fms *fileMetadataStore) GetMountParent(mount string) (ChainID, error) { return ChainID(dgst), nil } +func (fms *fileMetadataStore) getOrphan() ([]roLayer, error) { + var orphanLayers []roLayer + for _, algorithm := range supportedAlgorithms { + fileInfos, err := ioutil.ReadDir(filepath.Join(fms.root, string(algorithm))) + if err != nil { + if os.IsNotExist(err) { + continue + } + return nil, err + } + + for _, fi := range fileInfos { + if fi.IsDir() && strings.Contains(fi.Name(), "-removing") { + nameSplit := strings.Split(fi.Name(), "-") + dgst := digest.NewDigestFromEncoded(algorithm, nameSplit[0]) + if err := dgst.Validate(); err != nil { + logrus.Debugf("Ignoring invalid digest %s:%s", algorithm, nameSplit[0]) + } else { + chainID := ChainID(dgst) + chainFile := filepath.Join(fms.root, string(algorithm), fi.Name(), "cache-id") + contentBytes, err := ioutil.ReadFile(chainFile) + if err != nil { + logrus.WithError(err).WithField("digest", dgst).Error("cannot get cache ID") + } + cacheID := strings.TrimSpace(string(contentBytes)) + if cacheID == "" { + logrus.Errorf("invalid cache id value") + } + + l := &roLayer{ + chainID: chainID, + cacheID: cacheID, + } + orphanLayers = append(orphanLayers, *l) + } + } + } + } + return orphanLayers, nil +} + func (fms *fileMetadataStore) List() ([]ChainID, []string, error) { var ids []ChainID for _, algorithm := range supportedAlgorithms { @@ -346,8 +387,39 @@ func (fms *fileMetadataStore) List() ([]ChainID, []string, error) { return ids, mounts, nil } -func (fms *fileMetadataStore) Remove(layer ChainID) error { - return os.RemoveAll(fms.getLayerDirectory(layer)) +// Remove layerdb folder if that is marked for removal +func (fms *fileMetadataStore) Remove(layer ChainID, cache string) error { + dgst := digest.Digest(layer) + files, err := ioutil.ReadDir(filepath.Join(fms.root, string(dgst.Algorithm()))) + if err != nil { + return err + } + for _, f := range files { + if !strings.HasSuffix(f.Name(), "-removing") || !strings.HasPrefix(f.Name(), dgst.String()) { + continue + } + + // Make sure that we only remove layerdb folder which points to + // requested cacheID + dir := filepath.Join(fms.root, string(dgst.Algorithm()), f.Name()) + chainFile := filepath.Join(dir, "cache-id") + contentBytes, err := ioutil.ReadFile(chainFile) + if err != nil { + logrus.WithError(err).WithField("file", chainFile).Error("cannot get cache ID") + continue + } + cacheID := strings.TrimSpace(string(contentBytes)) + if cacheID != cache { + continue + } + logrus.Debugf("Removing folder: %s", dir) + err = os.RemoveAll(dir) + if err != nil && !os.IsNotExist(err) { + logrus.WithError(err).WithField("name", f.Name()).Error("cannot remove layer") + continue + } + } + return nil } func (fms *fileMetadataStore) RemoveMount(mount string) error { diff --git a/layer/layer_store.go b/layer/layer_store.go index 81730e9d92116..7b8c011f4c07c 100644 --- a/layer/layer_store.go +++ b/layer/layer_store.go @@ -5,6 +5,8 @@ import ( "fmt" "io" "io/ioutil" + "os" + "path/filepath" "sync" "github.com/docker/distribution" @@ -415,11 +417,24 @@ func (ls *layerStore) Map() map[ChainID]Layer { } func (ls *layerStore) deleteLayer(layer *roLayer, metadata *Metadata) error { + // Rename layer digest folder first so we detect orphan layer(s) + // if ls.driver.Remove fails + dir := ls.store.getLayerDirectory(layer.chainID) + for { + dgst := digest.Digest(layer.chainID) + tmpID := fmt.Sprintf("%s-%s-removing", dgst.Hex(), stringid.GenerateRandomID()) + dir := filepath.Join(ls.store.root, string(dgst.Algorithm()), tmpID) + err := os.Rename(ls.store.getLayerDirectory(layer.chainID), dir) + if os.IsExist(err) { + continue + } + break + } err := ls.driver.Remove(layer.cacheID) if err != nil { return err } - err = ls.store.Remove(layer.chainID) + err = os.RemoveAll(dir) if err != nil { return err } @@ -452,12 +467,14 @@ func (ls *layerStore) releaseLayer(l *roLayer) ([]Metadata, error) { if l.hasReferences() { panic("cannot delete referenced layer") } + // Remove layer from layer map first so it is not considered to exist + // when if ls.deleteLayer fails. + delete(ls.layerMap, l.chainID) + var metadata Metadata if err := ls.deleteLayer(l, &metadata); err != nil { return nil, err } - - delete(ls.layerMap, l.chainID) removed = append(removed, metadata) if l.parent == nil { @@ -743,6 +760,23 @@ func (ls *layerStore) assembleTarTo(graphID string, metadata io.ReadCloser, size } func (ls *layerStore) Cleanup() error { + orphanLayers, err := ls.store.getOrphan() + if err != nil { + logrus.Errorf("Cannot get orphan layers: %v", err) + } + logrus.Debugf("found %v orphan layers", len(orphanLayers)) + for _, orphan := range orphanLayers { + logrus.Debugf("removing orphan layer, chain ID: %v , cache ID: %v", orphan.chainID, orphan.cacheID) + err = ls.driver.Remove(orphan.cacheID) + if err != nil && !os.IsNotExist(err) { + logrus.WithError(err).WithField("cache-id", orphan.cacheID).Error("cannot remove orphan layer") + continue + } + err = ls.store.Remove(orphan.chainID, orphan.cacheID) + if err != nil { + logrus.WithError(err).WithField("chain-id", orphan.chainID).Error("cannot remove orphan layer metadata") + } + } return ls.driver.Cleanup() } From e73d951adfd92ea0f8e3d30626a98ad7efc8dd69 Mon Sep 17 00:00:00 2001 From: Vikram bir Singh Date: Tue, 3 Sep 2019 19:11:21 +0000 Subject: [PATCH 2/4] Reimplement iteration over fileInfos in getOrphan. 1. Reduce complexity due to nested if blocks by using early return/continue 2. Improve logging Changes suggested as a part of code review comments in 39748 Signed-off-by: Vikram bir Singh (cherry picked from commit ebf12dbda08633375ab12387255d3f617ee9be38) Signed-off-by: Sebastiaan van Stijn --- layer/filestore.go | 54 ++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/layer/filestore.go b/layer/filestore.go index 37a927022a88f..a1726a5a404c4 100644 --- a/layer/filestore.go +++ b/layer/filestore.go @@ -14,7 +14,7 @@ import ( "github.com/docker/distribution" "github.com/docker/docker/pkg/ioutils" - "github.com/opencontainers/go-digest" + digest "github.com/opencontainers/go-digest" "github.com/pkg/errors" "github.com/sirupsen/logrus" ) @@ -317,32 +317,40 @@ func (fms *fileMetadataStore) getOrphan() ([]roLayer, error) { } for _, fi := range fileInfos { - if fi.IsDir() && strings.Contains(fi.Name(), "-removing") { - nameSplit := strings.Split(fi.Name(), "-") - dgst := digest.NewDigestFromEncoded(algorithm, nameSplit[0]) - if err := dgst.Validate(); err != nil { - logrus.Debugf("Ignoring invalid digest %s:%s", algorithm, nameSplit[0]) - } else { - chainID := ChainID(dgst) - chainFile := filepath.Join(fms.root, string(algorithm), fi.Name(), "cache-id") - contentBytes, err := ioutil.ReadFile(chainFile) - if err != nil { - logrus.WithError(err).WithField("digest", dgst).Error("cannot get cache ID") - } - cacheID := strings.TrimSpace(string(contentBytes)) - if cacheID == "" { - logrus.Errorf("invalid cache id value") - } - - l := &roLayer{ - chainID: chainID, - cacheID: cacheID, - } - orphanLayers = append(orphanLayers, *l) + if !fi.IsDir() || !strings.HasSuffix(fi.Name(), "-removing") { + continue + } + // At this stage, fi.Name value looks like --removing + // Split on '-' to get the digest value. + nameSplit := strings.Split(fi.Name(), "-") + dgst := digest.NewDigestFromEncoded(algorithm, nameSplit[0]) + if err := dgst.Validate(); err != nil { + logrus.WithError(err).WithField("digest", string(algorithm)+":"+nameSplit[0]).Debug("ignoring invalid digest") + continue + } + + chainFile := filepath.Join(fms.root, string(algorithm), fi.Name(), "cache-id") + contentBytes, err := ioutil.ReadFile(chainFile) + if err != nil { + if !os.IsNotExist(err) { + logrus.WithError(err).WithField("digest", dgst).Error("failed to read cache ID") } + continue + } + cacheID := strings.TrimSpace(string(contentBytes)) + if cacheID == "" { + logrus.Error("invalid cache ID") + continue } + + l := &roLayer{ + chainID: ChainID(dgst), + cacheID: cacheID, + } + orphanLayers = append(orphanLayers, *l) } } + return orphanLayers, nil } From 59c7e59795b9c6fa3ef648a6ddcdb45399b7286c Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 22 Jul 2019 20:39:22 +0000 Subject: [PATCH 3/4] Add extra permission check in removal test Signed-off-by: Michael Crosby (cherry picked from commit d6cbeee470105df403b7ae716c923fd11003b67c) Signed-off-by: Sebastiaan van Stijn --- integration/image/remove_unix_test.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/integration/image/remove_unix_test.go b/integration/image/remove_unix_test.go index 984ce5c9d72db..428a312742e2c 100644 --- a/integration/image/remove_unix_test.go +++ b/integration/image/remove_unix_test.go @@ -8,6 +8,7 @@ import ( "io/ioutil" "os" "strconv" + "strings" "syscall" "testing" "unsafe" @@ -75,7 +76,11 @@ func TestRemoveImageGarbageCollector(t *testing.T) { argp = uintptr(unsafe.Pointer(&attr)) _, _, errno = syscall.Syscall(syscall.SYS_IOCTL, file.Fd(), fsflags, argp) assert.Equal(t, "errno 0", errno.Error()) - assert.ErrorContains(t, err, "permission denied") + assert.Assert(t, err != nil) + errStr := err.Error() + if !(strings.Contains(errStr, "permission denied") || strings.Contains(errStr, "operation not permitted")) { + t.Errorf("ImageRemove error not an permission error %s", errStr) + } // Verify that layer remaining on disk dir, _ := os.Stat(data["UpperDir"]) From 43abde3068b0de4be4300e238aca0f8bf8ca5578 Mon Sep 17 00:00:00 2001 From: Olli Janatuinen Date: Sat, 10 Aug 2019 14:48:47 +0300 Subject: [PATCH 4/4] Unit test for getOrphan Signed-off-by: Olli Janatuinen (cherry picked from commit 8660330173e5053e274cf12860079f132cbaa9fa) Signed-off-by: Sebastiaan van Stijn --- layer/filestore_test.go | 48 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/layer/filestore_test.go b/layer/filestore_test.go index 498379e37fe2e..878dc54e3616e 100644 --- a/layer/filestore_test.go +++ b/layer/filestore_test.go @@ -10,6 +10,7 @@ import ( "syscall" "testing" + "github.com/docker/docker/pkg/stringid" "github.com/opencontainers/go-digest" ) @@ -102,3 +103,50 @@ func TestStartTransactionFailure(t *testing.T) { t.Fatal(err) } } + +func TestGetOrphan(t *testing.T) { + fms, td, cleanup := newFileMetadataStore(t) + defer cleanup() + + layerRoot := filepath.Join(td, "sha256") + if err := os.MkdirAll(layerRoot, 0755); err != nil { + t.Fatal(err) + } + + tx, err := fms.StartTransaction() + if err != nil { + t.Fatal(err) + } + + layerid := randomLayerID(5) + err = tx.Commit(layerid) + if err != nil { + t.Fatal(err) + } + layerPath := fms.getLayerDirectory(layerid) + if err := ioutil.WriteFile(filepath.Join(layerPath, "cache-id"), []byte(stringid.GenerateRandomID()), 0644); err != nil { + t.Fatal(err) + } + + orphanLayers, err := fms.getOrphan() + if err != nil { + t.Fatal(err) + } + if len(orphanLayers) != 0 { + t.Fatalf("Expected to have zero orphan layers") + } + + layeridSplit := strings.Split(layerid.String(), ":") + newPath := filepath.Join(layerRoot, fmt.Sprintf("%s-%s-removing", layeridSplit[1], stringid.GenerateRandomID())) + err = os.Rename(layerPath, newPath) + if err != nil { + t.Fatal(err) + } + orphanLayers, err = fms.getOrphan() + if err != nil { + t.Fatal(err) + } + if len(orphanLayers) != 1 { + t.Fatalf("Expected to have one orphan layer") + } +}