From c2a5fd1bb739aa540926596741b640f00821c365 Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano
Date: Wed, 18 Nov 2020 09:30:48 +0100
Subject: [PATCH 1/4] drivers: add interface to directly write diff

Signed-off-by: Giuseppe Scrivano
---
 drivers/driver.go | 34 ++++++++++++++++
 errors.go         |   2 +
 layers.go         | 103 +++++++++++++++++++++++++++++++++++++++++++++
 store.go          |  83 ++++++++++++++++++++++++++++++++++++
 types/errors.go   |   2 +
 5 files changed, 224 insertions(+)

diff --git a/drivers/driver.go b/drivers/driver.go
index 9aa407168f..4832594ab0 100644
--- a/drivers/driver.go
+++ b/drivers/driver.go
@@ -167,6 +167,40 @@ type Driver interface {
 	LayerIDMapUpdater
 }
 
+// DriverWithDifferOutput is the result of ApplyDiffWithDiffer.
+// This API is experimental and can be changed without bumping the major version number.
+type DriverWithDifferOutput struct {
+	Differ             Differ
+	Target             string
+	Size               int64
+	UIDs               []uint32
+	GIDs               []uint32
+	UncompressedDigest digest.Digest
+	Metadata           string
+	BigData            map[string][]byte
+}
+
+// Differ defines the interface for using a custom differ.
+// This API is experimental and can be changed without bumping the major version number.
+type Differ interface {
+	ApplyDiff(dest string, options *archive.TarOptions) (DriverWithDifferOutput, error)
+}
+
+// DriverWithDiffer is the interface for direct diff access.
+// This API is experimental and can be changed without bumping the major version number.
+type DriverWithDiffer interface {
+	Driver
+	// ApplyDiffWithDiffer applies the changes using the callback function.
+	// If id is empty, then a staging directory is created.  The staging directory is guaranteed to be usable with ApplyDiffFromStagingDirectory.
+	ApplyDiffWithDiffer(id, parent string, options *ApplyDiffOpts, differ Differ) (output DriverWithDifferOutput, err error)
+	// ApplyDiffFromStagingDirectory applies the changes using the specified staging directory.
+	ApplyDiffFromStagingDirectory(id, parent, stagingDirectory string, diffOutput *DriverWithDifferOutput, options *ApplyDiffOpts) error
+	// CleanupStagingDirectory cleans up the staging directory.  It can be used to clean up the staging directory on errors.
+	CleanupStagingDirectory(stagingDirectory string) error
+	// DifferTarget gets the location where files are stored for the layer.
+	DifferTarget(id string) (string, error)
+}
+
 // Capabilities defines a list of capabilities a driver may implement.
 // These capabilities are not required; however, they do determine how a
 // graphdriver can be used.

diff --git a/errors.go b/errors.go
index 35288d87ad..5fc810b89d 100644
--- a/errors.go
+++ b/errors.go
@@ -53,4 +53,6 @@ var (
 	ErrSizeUnknown = types.ErrSizeUnknown
 	// ErrStoreIsReadOnly is returned when the caller makes a call to a read-only store that would require modifying its contents.
 	ErrStoreIsReadOnly = types.ErrStoreIsReadOnly
+	// ErrNotSupported is returned when the requested functionality is not supported.
+	ErrNotSupported = types.ErrNotSupported
 )

diff --git a/layers.go b/layers.go
index d398a3ff94..089703e240 100644
--- a/layers.go
+++ b/layers.go
@@ -247,6 +247,22 @@ type LayerStore interface {
 	// applies its changes to a specified layer.
 	ApplyDiff(to string, diff io.Reader) (int64, error)
 
+	// ApplyDiffWithDiffer applies the changes through the differ callback function.
+	// If to is the empty string, then a staging directory is created by the driver.
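+	// The caller owns the returned staging directory (DriverWithDifferOutput.Target)
+	// and must either commit it with ApplyDiffFromStagingDirectory or release it
+	// with CleanupStagingDirectory.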
+	ApplyDiffWithDiffer(to string, options *drivers.ApplyDiffOpts, differ drivers.Differ) (*drivers.DriverWithDifferOutput, error)
+
+	// CleanupStagingDirectory cleans up the staging directory.  It can be used to clean up the staging directory on errors.
+	CleanupStagingDirectory(stagingDirectory string) error
+
+	// ApplyDiffFromStagingDirectory uses stagingDirectory to create the diff.
+	ApplyDiffFromStagingDirectory(id, stagingDirectory string, diffOutput *drivers.DriverWithDifferOutput, options *drivers.ApplyDiffOpts) error
+
+	// DifferTarget gets the location where files are stored for the layer.
+	DifferTarget(id string) (string, error)
+
 	// LoadLocked wraps Load in a locked state. This means it loads the store
 	// and cleans-up invalid layers if needed.
 	LoadLocked() error
@@ -1553,6 +1569,93 @@ func (r *layerStore) ApplyDiff(to string, diff io.Reader) (size int64, err error) {
 	return size, err
 }
 
+func (r *layerStore) DifferTarget(id string) (string, error) {
+	ddriver, ok := r.driver.(drivers.DriverWithDiffer)
+	if !ok {
+		return "", ErrNotSupported
+	}
+	layer, ok := r.lookup(id)
+	if !ok {
+		return "", ErrLayerUnknown
+	}
+	return ddriver.DifferTarget(layer.ID)
+}
+
+func (r *layerStore) ApplyDiffFromStagingDirectory(id, stagingDirectory string, diffOutput *drivers.DriverWithDifferOutput, options *drivers.ApplyDiffOpts) error {
+	ddriver, ok := r.driver.(drivers.DriverWithDiffer)
+	if !ok {
+		return ErrNotSupported
+	}
+	layer, ok := r.lookup(id)
+	if !ok {
+		return ErrLayerUnknown
+	}
+	if options == nil {
+		options = &drivers.ApplyDiffOpts{
+			Mappings:   r.layerMappings(layer),
+			MountLabel: layer.MountLabel,
+		}
+	}
+	err := ddriver.ApplyDiffFromStagingDirectory(layer.ID, layer.Parent, stagingDirectory, diffOutput, options)
+	if err != nil {
+		return err
+	}
+	layer.UIDs = diffOutput.UIDs
+	layer.GIDs = diffOutput.GIDs
+	layer.UncompressedDigest = diffOutput.UncompressedDigest
+	layer.UncompressedSize = diffOutput.Size
+	layer.Metadata = diffOutput.Metadata
+	if err = r.Save(); err != nil {
+		return err
+	}
+	for k, v := range diffOutput.BigData {
+		if err := r.SetBigData(id, k, bytes.NewReader(v)); err != nil {
+			r.Delete(id)
+			return err
+		}
+	}
+	return err
+}
+
+func (r *layerStore) ApplyDiffWithDiffer(to string, options *drivers.ApplyDiffOpts, differ drivers.Differ) (*drivers.DriverWithDifferOutput, error) {
+	ddriver, ok := r.driver.(drivers.DriverWithDiffer)
+	if !ok {
+		return nil, ErrNotSupported
+	}
+
+	if to == "" {
+		output, err := ddriver.ApplyDiffWithDiffer("", "", options, differ)
+		return &output, err
+	}
+
+	layer, ok := r.lookup(to)
+	if !ok {
+		return nil, ErrLayerUnknown
+	}
+	if options == nil {
+		options = &drivers.ApplyDiffOpts{
+			Mappings:   r.layerMappings(layer),
+			MountLabel: layer.MountLabel,
+		}
+	}
+	output, err := ddriver.ApplyDiffWithDiffer(layer.ID, layer.Parent, options, differ)
+	if err != nil {
+		return nil, err
+	}
+	layer.UIDs = output.UIDs
+	layer.GIDs = output.GIDs
+	err = r.Save()
+	return &output, err
+}
+
+func (r *layerStore) CleanupStagingDirectory(stagingDirectory string) error {
+	ddriver, ok := r.driver.(drivers.DriverWithDiffer)
+	if !ok {
+		return ErrNotSupported
+	}
+	return ddriver.CleanupStagingDirectory(stagingDirectory)
+}
+
 func (r *layerStore) layersByDigestMap(m map[digest.Digest][]string, d digest.Digest) ([]Layer, error) {
 	var layers []Layer
 	for _, layerID := range m[d] {

diff --git a/store.go b/store.go
index 25767b7a2f..b67776e61c 100644
--- a/store.go
+++ b/store.go
@@ -317,6 +317,20 @@ type Store interface {
 	// }
 	ApplyDiff(to string, diff io.Reader) (int64, error)
 
+	// ApplyDiffWithDiffer applies a diff to a layer.
+	// It is the caller's responsibility to clean up the staging directory if the diff
+	// is not successfully applied with ApplyDiffFromStagingDirectory.
+	ApplyDiffWithDiffer(to string, options *drivers.ApplyDiffOpts, differ drivers.Differ) (*drivers.DriverWithDifferOutput, error)
+
+	// ApplyDiffFromStagingDirectory uses stagingDirectory to create the diff.
+	ApplyDiffFromStagingDirectory(to, stagingDirectory string, diffOutput *drivers.DriverWithDifferOutput, options *drivers.ApplyDiffOpts) error
+
+	// CleanupStagingDirectory cleans up the staging directory.  It can be used to clean up the staging directory on errors.
+	CleanupStagingDirectory(stagingDirectory string) error
+
+	// DifferTarget gets the path to the differ target.
+	DifferTarget(id string) (string, error)
+
 	// LayersByCompressedDigest returns a slice of the layers with the
 	// specified compressed digest value recorded for them.
 	LayersByCompressedDigest(d digest.Digest) ([]Layer, error)
@@ -2939,6 +2953,75 @@ func (s *store) Diff(from, to string, options *DiffOptions) (io.ReadCloser, error) {
 	return nil, ErrLayerUnknown
 }
 
+func (s *store) ApplyDiffFromStagingDirectory(to, stagingDirectory string, diffOutput *drivers.DriverWithDifferOutput, options *drivers.ApplyDiffOpts) error {
+	rlstore, err := s.LayerStore()
+	if err != nil {
+		return err
+	}
+	rlstore.Lock()
+	defer rlstore.Unlock()
+	if modified, err := rlstore.Modified(); modified || err != nil {
+		if err = rlstore.Load(); err != nil {
+			return err
+		}
+	}
+	if !rlstore.Exists(to) {
+		return ErrLayerUnknown
+	}
+	return rlstore.ApplyDiffFromStagingDirectory(to, stagingDirectory, diffOutput, options)
+}
+
+func (s *store) CleanupStagingDirectory(stagingDirectory string) error {
+	rlstore, err := s.LayerStore()
+	if err != nil {
+		return err
+	}
+	rlstore.Lock()
+	defer rlstore.Unlock()
+	if modified, err := rlstore.Modified(); modified || err != nil {
+		if err = rlstore.Load(); err != nil {
+			return err
+		}
+	}
+	return rlstore.CleanupStagingDirectory(stagingDirectory)
+}
+
+func (s *store) ApplyDiffWithDiffer(to string, options *drivers.ApplyDiffOpts, differ drivers.Differ) (*drivers.DriverWithDifferOutput, error) {
+	rlstore, err := s.LayerStore()
+	if err != nil {
+		return nil, err
+	}
+	rlstore.Lock()
+	defer rlstore.Unlock()
+	if modified, err := rlstore.Modified(); modified || err != nil {
+		if err = rlstore.Load(); err != nil {
+			return nil, err
+		}
+	}
+	if to != "" && !rlstore.Exists(to) {
+		return nil, ErrLayerUnknown
+	}
+	return rlstore.ApplyDiffWithDiffer(to, options, differ)
+}
+
+func (s *store) DifferTarget(id string) (string, error) {
+	rlstore, err := s.LayerStore()
+	if err != nil {
+		return "", err
+	}
+	rlstore.Lock()
+	defer rlstore.Unlock()
+	if modified, err := rlstore.Modified(); modified || err != nil {
+		if err = rlstore.Load(); err != nil {
+			return "", err
+		}
+	}
+	if rlstore.Exists(id) {
+		return rlstore.DifferTarget(id)
+	}
+	return "", ErrLayerUnknown
+}
+
 func (s *store) ApplyDiff(to string, diff io.Reader) (int64, error) {
 	rlstore, err := s.LayerStore()
 	if err != nil {

diff --git a/types/errors.go b/types/errors.go
index 4b923dcf69..d920d12eb5 100644
--- a/types/errors.go
+++ b/types/errors.go
@@ -53,4 +53,6 @@ var (
 	ErrSizeUnknown = errors.New("size is not known")
 	// ErrStoreIsReadOnly is returned when the caller makes a call to a read-only store that would require modifying its contents.
 	ErrStoreIsReadOnly = errors.New("called a write method on a read-only store")
+	// ErrNotSupported is returned when the requested functionality is not supported.
+	ErrNotSupported = errors.New("not supported")
 )

From c2c9ca976112eadaa89e26a2c5b894a46dc8eb64 Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano
Date: Wed, 18 Nov 2020 09:31:16 +0100
Subject: [PATCH 2/4] overlay: implement direct diff access

Signed-off-by: Giuseppe Scrivano
---
 drivers/overlay/overlay.go | 77 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/drivers/overlay/overlay.go b/drivers/overlay/overlay.go
index 60eb047bd3..0b4ec6bccf 100644
--- a/drivers/overlay/overlay.go
+++ b/drivers/overlay/overlay.go
@@ -705,6 +705,7 @@ func (d *Driver) Metadata(id string) (map[string]string, error) {
 // is being shutdown. For now, we just have to unmount the bind mounted
 // we had created.
 func (d *Driver) Cleanup() error {
+	_ = os.RemoveAll(d.getStagingDir())
 	return mount.Unmount(d.home)
 }
 
@@ -1490,6 +1491,10 @@ func (f fileGetNilCloser) Close() error {
 	return nil
 }
 
+func (d *Driver) getStagingDir() string {
+	return filepath.Join(d.home, "staging")
+}
+
 // DiffGetter returns a FileGetCloser that can read files from the directory that
 // contains files for the layer differences. Used for direct access for tar-split.
 func (d *Driver) DiffGetter(id string) (graphdriver.FileGetCloser, error) {
@@ -1500,6 +1505,78 @@ func (d *Driver) DiffGetter(id string) (graphdriver.FileGetCloser, error) {
 	return fileGetNilCloser{storage.NewPathFileGetter(p)}, nil
 }
 
+// CleanupStagingDirectory cleans up the staging directory.
+func (d *Driver) CleanupStagingDirectory(stagingDirectory string) error {
+	return os.RemoveAll(stagingDirectory)
+}
+
+// ApplyDiffWithDiffer applies the changes in the new layer using the specified differ function.
+func (d *Driver) ApplyDiffWithDiffer(id, parent string, options *graphdriver.ApplyDiffOpts, differ graphdriver.Differ) (output graphdriver.DriverWithDifferOutput, err error) {
+	var idMappings *idtools.IDMappings
+	if options != nil {
+		idMappings = options.Mappings
+	}
+	if idMappings == nil {
+		idMappings = &idtools.IDMappings{}
+	}
+
+	applyDir := ""
+
+	if id == "" {
+		err := os.MkdirAll(d.getStagingDir(), 0700)
+		if err != nil && !os.IsExist(err) {
+			return graphdriver.DriverWithDifferOutput{}, err
+		}
+		applyDir, err = ioutil.TempDir(d.getStagingDir(), "")
+		if err != nil {
+			return graphdriver.DriverWithDifferOutput{}, err
+		}
+
+	} else {
+		var err error
+		applyDir, err = d.getDiffPath(id)
+		if err != nil {
+			return graphdriver.DriverWithDifferOutput{}, err
+		}
+	}
+
+	logrus.Debugf("Applying differ in %s", applyDir)
+
+	out, err := differ.ApplyDiff(applyDir, &archive.TarOptions{
+		UIDMaps:           idMappings.UIDs(),
+		GIDMaps:           idMappings.GIDs(),
+		IgnoreChownErrors: d.options.ignoreChownErrors,
+		WhiteoutFormat:    d.getWhiteoutFormat(),
+		InUserNS:          rsystem.RunningInUserNS(),
+	})
+	out.Target = applyDir
+	return out, err
+}
+
+// ApplyDiffFromStagingDirectory applies the changes using the specified staging directory.
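+// The directory must be a direct child of the driver's staging directory, as created by
+// ApplyDiffWithDiffer with an empty id; it is renamed into place as the layer diff
+// directory, replacing any existing diff content for the layer.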
+func (d *Driver) ApplyDiffFromStagingDirectory(id, parent, stagingDirectory string, diffOutput *graphdriver.DriverWithDifferOutput, options *graphdriver.ApplyDiffOpts) error {
+	if filepath.Dir(stagingDirectory) != d.getStagingDir() {
+		return fmt.Errorf("%q is not a staging directory", stagingDirectory)
+	}
+
+	diff, err := d.getDiffPath(id)
+	if err != nil {
+		return err
+	}
+	if err := os.RemoveAll(diff); err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	return os.Rename(stagingDirectory, diff)
+}
+
+// DifferTarget gets the location where files are stored for the layer.
+func (d *Driver) DifferTarget(id string) (string, error) {
+	return d.getDiffPath(id)
+}
+
 // ApplyDiff applies the new layer into a root
 func (d *Driver) ApplyDiff(id, parent string, options graphdriver.ApplyDiffOpts) (size int64, err error) {

From 74006473780c10d903b46a55b68e49260c655dc0 Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano
Date: Tue, 4 May 2021 10:04:15 +0200
Subject: [PATCH 3/4] types: add map PullOptions to the conf file

Signed-off-by: Giuseppe Scrivano
---
 pkg/config/config.go | 4 ++++
 types/options.go     | 6 ++++++
 2 files changed, 10 insertions(+)

diff --git a/pkg/config/config.go b/pkg/config/config.go
index 2d24707226..b92af218a4 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -189,6 +189,10 @@ type OptionsConfig struct {
 
 	// MountOpt specifies extra mount options used when mounting
 	MountOpt string `toml:"mountopt"`
+
+	// PullOptions specifies options to be handed to pull managers.
+	// This API is experimental and can be changed without bumping the major version number.
+	PullOptions map[string]string `toml:"pull_options"`
 }
 
 // GetGraphDriverOptions returns the driver specific options

diff --git a/types/options.go b/types/options.go
index 223db8f00f..987ea5f26d 100644
--- a/types/options.go
+++ b/types/options.go
@@ -148,6 +148,9 @@ type StoreOptions struct {
 	AutoNsMinSize uint32 `json:"auto_userns_min_size,omitempty"`
 	// AutoNsMaxSize is the maximum size for an automatic user namespace.
 	AutoNsMaxSize uint32 `json:"auto_userns_max_size,omitempty"`
+	// PullOptions specifies options to be handed to pull managers.
+	// This API is experimental and can be changed without bumping the major version number.
+	PullOptions map[string]string `toml:"pull_options"`
 }
 
 // isRootlessDriver returns true if the given storage driver is valid for containers running as non root
@@ -362,6 +365,9 @@ func ReloadConfigurationFile(configFile string, storeOptions *StoreOptions) {
 	if config.Storage.Options.AutoUsernsMaxSize > 0 {
 		storeOptions.AutoNsMaxSize = config.Storage.Options.AutoUsernsMaxSize
 	}
+	if config.Storage.Options.PullOptions != nil {
+		storeOptions.PullOptions = config.Storage.Options.PullOptions
+	}
 
 	storeOptions.GraphDriverOptions = append(storeOptions.GraphDriverOptions, cfg.GetGraphDriverOptions(storeOptions.GraphDriverName, config.Storage.Options)...)

From a0ff89e09dcf3f345c31642c8820ed74934e06f3 Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano
Date: Mon, 30 Nov 2020 13:47:08 +0100
Subject: [PATCH 4/4] pkg/chunked: new zstd variant zstd:chunked

Add a new custom variant of zstd compression that makes it possible
to retrieve each file separately.  The idea is based on CRFS and its
stargz format for having seekable and indexable tarballs.

One disadvantage of the stargz format is that a custom file is added
to the tarball to store the layer metadata, and the metadata file is
part of the image itself.  Clients that are not aware of the stargz
format will propagate the metadata file inside the containers.

The zstd compression supports embedding additional data as part of
the stream that the zstd decompressor will ignore (a skippable
frame), so the issue above with CRFS can be solved directly within
the zstd compression format.
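For reference, a skippable frame is just the little-endian magic
0x184D2A50, a 32-bit little-endian payload length, and the payload
itself; schematically, the appendZstdSkippableFrame helper added by
this patch does the equivalent of (illustrative sketch only):

    func writeSkippableFrame(dest io.Writer, data []byte) error {
            hdr := make([]byte, 8)
            // skippable frame magic, RFC 8478 section 3.1.2
            binary.LittleEndian.PutUint32(hdr[0:4], 0x184D2A50)
            binary.LittleEndian.PutUint32(hdr[4:8], uint32(len(data)))
            if _, err := dest.Write(hdr); err != nil {
                    return err
            }
            _, err := dest.Write(data)
            return err
    }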
Besides this advantage, zstd is much faster and compresses better
than gzip, so this is a good opportunity to push the zstd format
further.

zstd compression has been supported by the OCI image spec since
August 2019: https://github.com/opencontainers/image-spec/pull/788
and by containers/image since then.

Clients that are not aware of the zstd:chunked variant won't notice
any difference when handling a blob that uses it.

Signed-off-by: Giuseppe Scrivano
---
 pkg/chunked/compression.go      | 525 ++++++++++++++++++++
 pkg/chunked/storage.go          | 909 ++++++++++++++++++++++++++++++++
 pkg/chunked/zstdchunked_test.go | 172 ++++++
 3 files changed, 1606 insertions(+)
 create mode 100644 pkg/chunked/compression.go
 create mode 100644 pkg/chunked/storage.go
 create mode 100644 pkg/chunked/zstdchunked_test.go

diff --git a/pkg/chunked/compression.go b/pkg/chunked/compression.go
new file mode 100644
index 0000000000..605be4b8f8
--- /dev/null
+++ b/pkg/chunked/compression.go
@@ -0,0 +1,525 @@
+package chunked
+
+import (
+	"bytes"
+	"encoding/base64"
+	"encoding/binary"
+	"encoding/json"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"time"
+
+	"github.com/containers/storage/pkg/ioutils"
+	"github.com/klauspost/compress/zstd"
+	digest "github.com/opencontainers/go-digest"
+	"github.com/pkg/errors"
+	"github.com/vbatts/tar-split/archive/tar"
+)
+
+type zstdTOC struct {
+	Version int                `json:"version"`
+	Entries []zstdFileMetadata `json:"entries"`
+}
+
+type zstdFileMetadata struct {
+	Type       string            `json:"type"`
+	Name       string            `json:"name"`
+	Linkname   string            `json:"linkName,omitempty"`
+	Mode       int64             `json:"mode,omitempty"`
+	Size       int64             `json:"size"`
+	UID        int               `json:"uid"`
+	GID        int               `json:"gid"`
+	ModTime    time.Time         `json:"modtime"`
+	AccessTime time.Time         `json:"accesstime"`
+	ChangeTime time.Time         `json:"changetime"`
+	Devmajor   int64             `json:"devMajor"`
+	Devminor   int64             `json:"devMinor"`
+	Xattrs     map[string]string `json:"xattrs,omitempty"`
+	Digest     string            `json:"digest,omitempty"`
+	Offset     int64             `json:"offset,omitempty"`
+	EndOffset  int64             `json:"endOffset,omitempty"`
+
+	// Currently chunking is not supported.
+ ChunkSize int64 `json:"chunkSize,omitempty"` + ChunkOffset int64 `json:"chunkOffset,omitempty"` + ChunkDigest string `json:"chunkDigest,omitempty"` +} + +const ( + TypeReg = "reg" + TypeChunk = "chunk" + TypeLink = "hardlink" + TypeChar = "char" + TypeBlock = "block" + TypeDir = "dir" + TypeFifo = "fifo" + TypeSymlink = "symlink" +) + +var tarTypes = map[byte]string{ + tar.TypeReg: TypeReg, + tar.TypeRegA: TypeReg, + tar.TypeLink: TypeLink, + tar.TypeChar: TypeChar, + tar.TypeBlock: TypeBlock, + tar.TypeDir: TypeDir, + tar.TypeFifo: TypeFifo, + tar.TypeSymlink: TypeSymlink, +} + +var typesToTar = map[string]byte{ + TypeReg: tar.TypeReg, + TypeLink: tar.TypeLink, + TypeChar: tar.TypeChar, + TypeBlock: tar.TypeBlock, + TypeDir: tar.TypeDir, + TypeFifo: tar.TypeFifo, + TypeSymlink: tar.TypeSymlink, +} + +func getType(t byte) (string, error) { + r, found := tarTypes[t] + if !found { + return "", fmt.Errorf("unknown tarball type: %v", t) + } + return r, nil +} + +func typeToTarType(t string) (byte, error) { + r, found := typesToTar[t] + if !found { + return 0, fmt.Errorf("unknown type: %v", t) + } + return r, nil +} + +const ( + manifestChecksumKey = "io.containers.zstd-chunked.manifest-checksum" + manifestInfoKey = "io.containers.zstd-chunked.manifest-position" + + // manifestTypeCRFS is a manifest file compatible with the CRFS TOC file. + manifestTypeCRFS = 1 + + // footerSizeSupported is the footer size supported by this implementation. + // Newer versions of the image format might increase this value, so reject + // any version that is not supported. + footerSizeSupported = 40 +) + +var ( + // when the zstd decoder encounters a skippable frame + 1 byte for the size, it + // will ignore it. + // https://tools.ietf.org/html/rfc8478#section-3.1.2 + skippableFrameMagic = []byte{0x50, 0x2a, 0x4d, 0x18} + + zstdChunkedFrameMagic = []byte{0x47, 0x6e, 0x55, 0x6c, 0x49, 0x6e, 0x55, 0x78} +) + +func isZstdChunkedFrameMagic(data []byte) bool { + if len(data) < 8 { + return false + } + return bytes.Equal(zstdChunkedFrameMagic, data[:8]) +} + +// readZstdChunkedManifest reads the zstd:chunked manifest from the seekable stream blobStream. The blob total size must +// be specified. +// This function uses the io.containers.zstd-chunked. annotations when specified. 
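+// The io.containers.zstd-chunked.manifest-position annotation, when present, has the
+// form "<offset>:<compressed length>:<uncompressed length>:<manifest type>", e.g.
+// "100008:572:1024:1" (illustrative values); when it is missing, the manifest position
+// is recovered from the footer at the end of the blob.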
+func readZstdChunkedManifest(blobStream ImageSourceSeekable, blobSize int64, annotations map[string]string) ([]byte, error) { + footerSize := int64(footerSizeSupported) + if blobSize <= footerSize { + return nil, errors.New("blob too small") + } + + manifestChecksumAnnotation := annotations[manifestChecksumKey] + if manifestChecksumAnnotation == "" { + return nil, fmt.Errorf("manifest checksum annotation %q not found", manifestChecksumKey) + } + + var offset, length, lengthUncompressed, manifestType uint64 + + if offsetMetadata := annotations[manifestInfoKey]; offsetMetadata != "" { + if _, err := fmt.Sscanf(offsetMetadata, "%d:%d:%d:%d", &offset, &length, &lengthUncompressed, &manifestType); err != nil { + return nil, err + } + } else { + chunk := ImageSourceChunk{ + Offset: uint64(blobSize - footerSize), + Length: uint64(footerSize), + } + parts, errs, err := blobStream.GetBlobAt([]ImageSourceChunk{chunk}) + if err != nil { + return nil, err + } + var reader io.ReadCloser + select { + case r := <-parts: + reader = r + case err := <-errs: + return nil, err + } + footer := make([]byte, footerSize) + if _, err := io.ReadFull(reader, footer); err != nil { + return nil, err + } + + offset = binary.LittleEndian.Uint64(footer[0:8]) + length = binary.LittleEndian.Uint64(footer[8:16]) + lengthUncompressed = binary.LittleEndian.Uint64(footer[16:24]) + manifestType = binary.LittleEndian.Uint64(footer[24:32]) + if !isZstdChunkedFrameMagic(footer[32:40]) { + return nil, errors.New("invalid magic number") + } + } + + if manifestType != manifestTypeCRFS { + return nil, errors.New("invalid manifest type") + } + + // set a reasonable limit + if length > (1<<20)*50 { + return nil, errors.New("manifest too big") + } + if lengthUncompressed > (1<<20)*50 { + return nil, errors.New("manifest too big") + } + + chunk := ImageSourceChunk{ + Offset: offset, + Length: length, + } + + parts, errs, err := blobStream.GetBlobAt([]ImageSourceChunk{chunk}) + if err != nil { + return nil, err + } + var reader io.ReadCloser + select { + case r := <-parts: + reader = r + case err := <-errs: + return nil, err + } + + manifest := make([]byte, length) + if _, err := io.ReadFull(reader, manifest); err != nil { + return nil, err + } + + manifestDigester := digest.Canonical.Digester() + manifestChecksum := manifestDigester.Hash() + if _, err := manifestChecksum.Write(manifest); err != nil { + return nil, err + } + + d, err := digest.Parse(manifestChecksumAnnotation) + if err != nil { + return nil, err + } + if manifestDigester.Digest() != d { + return nil, errors.New("invalid manifest checksum") + } + + decoder, err := zstd.NewReader(nil) + if err != nil { + return nil, err + } + defer decoder.Close() + + b := make([]byte, 0, lengthUncompressed) + if decoded, err := decoder.DecodeAll(manifest, b); err == nil { + return decoded, nil + } + + return manifest, nil +} + +func appendZstdSkippableFrame(dest io.Writer, data []byte) error { + if _, err := dest.Write(skippableFrameMagic); err != nil { + return err + } + + var size []byte = make([]byte, 4) + binary.LittleEndian.PutUint32(size, uint32(len(data))) + if _, err := dest.Write(size); err != nil { + return err + } + if _, err := dest.Write(data); err != nil { + return err + } + return nil +} + +func writeZstdChunkedManifest(dest io.Writer, outMetadata map[string]string, offset uint64, metadata []zstdFileMetadata, level int) error { + // 8 is the size of the zstd skippable frame header + the frame size + manifestOffset := offset + 8 + + toc := zstdTOC{ + Version: 1, + Entries: 
metadata,
+	}
+
+	// Generate the manifest
+	manifest, err := json.Marshal(toc)
+	if err != nil {
+		return err
+	}
+
+	var compressedBuffer bytes.Buffer
+	zstdWriter, err := zstdWriterWithLevel(&compressedBuffer, level)
+	if err != nil {
+		return err
+	}
+	if _, err := zstdWriter.Write(manifest); err != nil {
+		zstdWriter.Close()
+		return err
+	}
+	if err := zstdWriter.Close(); err != nil {
+		return err
+	}
+	compressedManifest := compressedBuffer.Bytes()
+
+	manifestDigester := digest.Canonical.Digester()
+	manifestChecksum := manifestDigester.Hash()
+	if _, err := manifestChecksum.Write(compressedManifest); err != nil {
+		return err
+	}
+
+	outMetadata[manifestChecksumKey] = manifestDigester.Digest().String()
+	outMetadata[manifestInfoKey] = fmt.Sprintf("%d:%d:%d:%d", manifestOffset, len(compressedManifest), len(manifest), manifestTypeCRFS)
+	if err := appendZstdSkippableFrame(dest, compressedManifest); err != nil {
+		return err
+	}
+
+	// Store the offset to the manifest and its size in LE order
+	var manifestDataLE []byte = make([]byte, footerSizeSupported)
+	binary.LittleEndian.PutUint64(manifestDataLE, manifestOffset)
+	binary.LittleEndian.PutUint64(manifestDataLE[8:], uint64(len(compressedManifest)))
+	binary.LittleEndian.PutUint64(manifestDataLE[16:], uint64(len(manifest)))
+	binary.LittleEndian.PutUint64(manifestDataLE[24:], uint64(manifestTypeCRFS))
+	copy(manifestDataLE[32:], zstdChunkedFrameMagic)
+
+	return appendZstdSkippableFrame(dest, manifestDataLE)
+}
+
+func writeZstdChunkedStream(destFile io.Writer, outMetadata map[string]string, reader io.Reader, level int) error {
+	// total written so far.  Used to retrieve partial offsets in the file
+	dest := ioutils.NewWriteCounter(destFile)
+
+	tr := tar.NewReader(reader)
+	tr.RawAccounting = true
+
+	buf := make([]byte, 4096)
+
+	zstdWriter, err := zstdWriterWithLevel(dest, level)
+	if err != nil {
+		return err
+	}
+	defer func() {
+		if zstdWriter != nil {
+			zstdWriter.Close()
+			zstdWriter.Flush()
+		}
+	}()
+
+	restartCompression := func() (int64, error) {
+		var offset int64
+		if zstdWriter != nil {
+			if err := zstdWriter.Close(); err != nil {
+				return 0, err
+			}
+			if err := zstdWriter.Flush(); err != nil {
+				return 0, err
+			}
+			offset = dest.Count
+			zstdWriter.Reset(dest)
+		}
+		return offset, nil
+	}
+
+	var metadata []zstdFileMetadata
+	for {
+		hdr, err := tr.Next()
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+			return err
+		}
+
+		rawBytes := tr.RawBytes()
+		if _, err := zstdWriter.Write(rawBytes); err != nil {
+			return err
+		}
+		payloadDigester := digest.Canonical.Digester()
+		payloadChecksum := payloadDigester.Hash()
+
+		payloadDest := io.MultiWriter(payloadChecksum, zstdWriter)
+
+		// Now handle the payload, if any
+		var startOffset, endOffset int64
+		checksum := ""
+		for {
+			read, errRead := tr.Read(buf)
+			if errRead != nil && errRead != io.EOF {
+				return errRead
+			}
+
+			// restart the compression only if there is
+			// a payload.
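+			// Closing and resetting the encoder gives each payload its own
+			// zstd frame, so a file can later be fetched and decompressed
+			// independently of the rest of the stream; startOffset and
+			// endOffset record the frame boundaries stored in the TOC.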
+			if read > 0 {
+				if startOffset == 0 {
+					startOffset, err = restartCompression()
+					if err != nil {
+						return err
+					}
+				}
+				_, err := payloadDest.Write(buf[:read])
+				if err != nil {
+					return err
+				}
+			}
+			if errRead == io.EOF {
+				if startOffset > 0 {
+					endOffset, err = restartCompression()
+					if err != nil {
+						return err
+					}
+					checksum = payloadDigester.Digest().String()
+				}
+				break
+			}
+		}
+
+		typ, err := getType(hdr.Typeflag)
+		if err != nil {
+			return err
+		}
+		xattrs := make(map[string]string)
+		for k, v := range hdr.Xattrs {
+			xattrs[k] = base64.StdEncoding.EncodeToString([]byte(v))
+		}
+		m := zstdFileMetadata{
+			Type:       typ,
+			Name:       hdr.Name,
+			Linkname:   hdr.Linkname,
+			Mode:       hdr.Mode,
+			Size:       hdr.Size,
+			UID:        hdr.Uid,
+			GID:        hdr.Gid,
+			ModTime:    hdr.ModTime,
+			AccessTime: hdr.AccessTime,
+			ChangeTime: hdr.ChangeTime,
+			Devmajor:   hdr.Devmajor,
+			Devminor:   hdr.Devminor,
+			Xattrs:     xattrs,
+			Digest:     checksum,
+			Offset:     startOffset,
+			EndOffset:  endOffset,
+
+			// ChunkSize is 0 for the last chunk
+			ChunkSize:   0,
+			ChunkOffset: 0,
+			ChunkDigest: checksum,
+		}
+		metadata = append(metadata, m)
+	}
+
+	rawBytes := tr.RawBytes()
+	if _, err := zstdWriter.Write(rawBytes); err != nil {
+		return err
+	}
+	if err := zstdWriter.Flush(); err != nil {
+		return err
+	}
+	if err := zstdWriter.Close(); err != nil {
+		return err
+	}
+	zstdWriter = nil
+
+	return writeZstdChunkedManifest(dest, outMetadata, uint64(dest.Count), metadata, level)
+}
+
+type zstdChunkedWriter struct {
+	tarSplitOut *io.PipeWriter
+	tarSplitErr chan error
+}
+
+func (w zstdChunkedWriter) Close() error {
+	err := <-w.tarSplitErr
+	if err != nil {
+		w.tarSplitOut.Close()
+		return err
+	}
+	return w.tarSplitOut.Close()
+}
+
+func (w zstdChunkedWriter) Write(p []byte) (int, error) {
+	select {
+	case err := <-w.tarSplitErr:
+		w.tarSplitOut.Close()
+		return 0, err
+	default:
+		return w.tarSplitOut.Write(p)
+	}
+}
+
+// zstdChunkedWriterWithLevel writes a zstd compressed tarball where each file is
+// compressed separately so it can be addressed separately.  Idea based on CRFS:
+// https://github.com/google/crfs
+// The difference with CRFS is that the zstd compression is used instead of gzip.
+// The reason for it is that zstd supports embedding metadata ignored by the decoder
+// as part of the compressed stream.
+// A manifest json file with all the metadata is appended at the end of the tarball
+// stream, using zstd skippable frames.
+// The final file will look like:
+// [FILE_1][FILE_2]..[FILE_N][SKIPPABLE FRAME 1][SKIPPABLE FRAME 2]
+// Where:
+// [FILE_N]: [ZSTD HEADER][TAR HEADER][PAYLOAD FILE_N][ZSTD FOOTER]
+// [SKIPPABLE FRAME 1]: [ZSTD SKIPPABLE FRAME, SIZE=MANIFEST LENGTH][MANIFEST]
+// [SKIPPABLE FRAME 2]: [ZSTD SKIPPABLE FRAME, SIZE=40][MANIFEST_OFFSET][MANIFEST_LENGTH][MANIFEST_LENGTH_UNCOMPRESSED][MANIFEST_TYPE][CHUNKED_ZSTD_MAGIC_NUMBER]
+// MANIFEST_OFFSET, MANIFEST_LENGTH, MANIFEST_LENGTH_UNCOMPRESSED and MANIFEST_TYPE are
+// 64 bits unsigned in little endian format; CHUNKED_ZSTD_MAGIC_NUMBER is 8 bytes.
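+// Worked example (illustrative numbers): if the manifest's skippable frame starts at
+// offset 100000, the 8-byte frame header means the compressed manifest itself starts
+// at offset 100008; that value, the compressed and uncompressed manifest lengths, the
+// manifest type (manifestTypeCRFS = 1) and the 8-byte magic form the 40-byte footer.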
+func zstdChunkedWriterWithLevel(out io.Writer, metadata map[string]string, level int) (io.WriteCloser, error) {
+	ch := make(chan error, 1)
+	r, w := io.Pipe()
+
+	go func() {
+		ch <- writeZstdChunkedStream(out, metadata, r, level)
+		io.Copy(ioutil.Discard, r)
+		r.Close()
+		close(ch)
+	}()
+
+	return zstdChunkedWriter{
+		tarSplitOut: w,
+		tarSplitErr: ch,
+	}, nil
+}
+
+func zstdWriterWithLevel(dest io.Writer, level int) (*zstd.Encoder, error) {
+	el := zstd.EncoderLevelFromZstd(level)
+	return zstd.NewWriter(dest, zstd.WithEncoderLevel(el))
+}
+
+// ZstdCompressor is a CompressorFunc for the zstd compression algorithm.
+func ZstdCompressor(r io.Writer, metadata map[string]string, level *int) (io.WriteCloser, error) {
+	if level == nil {
+		l := 3
+		level = &l
+	}
+
+	return zstdChunkedWriterWithLevel(r, metadata, *level)
+}

diff --git a/pkg/chunked/storage.go b/pkg/chunked/storage.go
new file mode 100644
index 0000000000..67ed50b78e
--- /dev/null
+++ b/pkg/chunked/storage.go
@@ -0,0 +1,909 @@
+package chunked
+
+import (
+	archivetar "archive/tar"
+	"context"
+	"encoding/base64"
+	"encoding/json"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"syscall"
+	"time"
+
+	storage "github.com/containers/storage"
+	graphdriver "github.com/containers/storage/drivers"
+	driversCopy "github.com/containers/storage/drivers/copy"
+	"github.com/containers/storage/pkg/archive"
+	"github.com/containers/storage/pkg/idtools"
+	"github.com/containers/storage/types"
+	"github.com/klauspost/compress/zstd"
+	digest "github.com/opencontainers/go-digest"
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+	"github.com/vbatts/tar-split/archive/tar"
+	"golang.org/x/sys/unix"
+)
+
+const (
+	maxNumberMissingChunks  = 1024
+	newFileFlags            = (unix.O_CREAT | unix.O_TRUNC | unix.O_WRONLY | unix.O_EXCL)
+	containersOverrideXattr = "user.containers.override_stat"
+	bigDataKey              = "zstd-chunked-manifest"
+)
+
+// ImageSourceChunk is a portion of a blob.
+type ImageSourceChunk struct {
+	Offset uint64
+	Length uint64
+}
+
+// ImageSourceSeekable is an image source that permits fetching chunks of the entire blob.
+type ImageSourceSeekable interface {
+	// GetBlobAt returns a stream for the specified blob.
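+	// It returns one ReadCloser per requested chunk, delivered in request order on the
+	// first channel; any error encountered is delivered on the second channel.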
+	GetBlobAt([]ImageSourceChunk) (chan io.ReadCloser, chan error, error)
+}
+
+type chunkedZstdDiffer struct {
+	stream         ImageSourceSeekable
+	manifest       []byte
+	layersMetadata map[string][]zstdFileMetadata
+	layersTarget   map[string]string
+}
+
+// ErrBadRequest is returned when the request is not valid.
+type ErrBadRequest struct {
+}
+
+func (e ErrBadRequest) Error() string {
+	return "http bad request"
+}
+
+func timeToTimespec(time time.Time) (ts unix.Timespec) {
+	if time.IsZero() {
+		// Return UTIME_OMIT special value
+		ts.Sec = 0
+		ts.Nsec = ((1 << 30) - 2)
+		return
+	}
+	return unix.NsecToTimespec(time.UnixNano())
+}
+
+func copyFileContent(src, destFile, root string, dirfd int, missingDirsMode, mode os.FileMode) (*os.File, int64, error) {
+	st, err := os.Stat(src)
+	if err != nil {
+		return nil, -1, err
+	}
+
+	copyWithFileRange, copyWithFileClone := true, true
+
+	// If the destination file already exists, we shouldn't blow it away
+	dstFile, err := openFileUnderRoot(destFile, root, dirfd, newFileFlags, mode)
+	if err != nil {
+		return nil, -1, err
+	}
+
+	err = driversCopy.CopyRegularToFile(src, dstFile, st, &copyWithFileRange, &copyWithFileClone)
+	if err != nil {
+		dstFile.Close()
+		return nil, -1, err
+	}
+	return dstFile, st.Size(), err
+}
+
+func prepareOtherLayersCache(layersMetadata map[string][]zstdFileMetadata) map[string]map[string]*zstdFileMetadata {
+	maps := make(map[string]map[string]*zstdFileMetadata)
+
+	for layerID, v := range layersMetadata {
+		r := make(map[string]*zstdFileMetadata)
+		for i := range v {
+			r[v[i].Digest] = &v[i]
+		}
+		maps[layerID] = r
+	}
+	return maps
+}
+
+func getLayersCache(store storage.Store) (map[string][]zstdFileMetadata, map[string]string, error) {
+	allLayers, err := store.Layers()
+	if err != nil {
+		return nil, nil, err
+	}
+
+	layersMetadata := make(map[string][]zstdFileMetadata)
+	layersTarget := make(map[string]string)
+	for _, r := range allLayers {
+		manifestReader, err := store.LayerBigData(r.ID, bigDataKey)
+		if err != nil {
+			continue
+		}
+		defer manifestReader.Close()
+		manifest, err := ioutil.ReadAll(manifestReader)
+		if err != nil {
+			return nil, nil, err
+		}
+		var toc zstdTOC
+		if err := json.Unmarshal(manifest, &toc); err != nil {
+			continue
+		}
+		layersMetadata[r.ID] = toc.Entries
+		target, err := store.DifferTarget(r.ID)
+		if err != nil {
+			return nil, nil, err
+		}
+		layersTarget[r.ID] = target
+	}
+
+	return layersMetadata, layersTarget, nil
+}
+
+// GetDiffer returns a differ that can be used with ApplyDiffWithDiffer.
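+// A sketch of the expected call sequence, with layerID, blobSize, annotations and iss
+// supplied by the caller (error handling omitted):
+//	differ, _ := chunked.GetDiffer(ctx, store, blobSize, annotations, iss)
+//	out, _ := store.ApplyDiffWithDiffer("", nil, differ)
+//	_ = store.ApplyDiffFromStagingDirectory(layerID, out.Target, out, nil)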
+func GetDiffer(ctx context.Context, store storage.Store, blobSize int64, annotations map[string]string, iss ImageSourceSeekable) (graphdriver.Differ, error) {
+	if _, ok := annotations[manifestChecksumKey]; ok {
+		return makeZstdChunkedDiffer(ctx, store, blobSize, annotations, iss)
+	}
+	return nil, errors.New("blob type not supported for partial retrieval")
+}
+
+func makeZstdChunkedDiffer(ctx context.Context, store storage.Store, blobSize int64, annotations map[string]string, iss ImageSourceSeekable) (*chunkedZstdDiffer, error) {
+	manifest, err := readZstdChunkedManifest(iss, blobSize, annotations)
+	if err != nil {
+		return nil, err
+	}
+	layersMetadata, layersTarget, err := getLayersCache(store)
+	if err != nil {
+		return nil, err
+	}
+
+	return &chunkedZstdDiffer{
+		stream:         iss,
+		manifest:       manifest,
+		layersMetadata: layersMetadata,
+		layersTarget:   layersTarget,
+	}, nil
+}
+
+func findFileInOtherLayers(file zstdFileMetadata, root string, dirfd int, layersMetadata map[string]map[string]*zstdFileMetadata, layersTarget map[string]string, missingDirsMode os.FileMode) (*os.File, int64, error) {
+	// this is ugly, needs to be indexed
+	for layerID, checksums := range layersMetadata {
+		m, found := checksums[file.Digest]
+		if !found {
+			continue
+		}
+
+		source, ok := layersTarget[layerID]
+		if !ok {
+			continue
+		}
+
+		srcDirfd, err := unix.Open(source, unix.O_RDONLY, 0)
+		if err != nil {
+			continue
+		}
+		defer unix.Close(srcDirfd)
+
+		srcFile, err := openFileUnderRoot(m.Name, source, srcDirfd, unix.O_RDONLY, 0)
+		if err != nil {
+			continue
+		}
+		defer srcFile.Close()
+
+		srcPath := fmt.Sprintf("/proc/self/fd/%d", srcFile.Fd())
+
+		dstFile, written, err := copyFileContent(srcPath, file.Name, root, dirfd, missingDirsMode, 0)
+		if err != nil {
+			continue
+		}
+		return dstFile, written, nil
+	}
+	return nil, 0, nil
+}
+
+func getFileDigest(f *os.File) (digest.Digest, error) {
+	digester := digest.Canonical.Digester()
+	if _, err := io.Copy(digester.Hash(), f); err != nil {
+		return "", err
+	}
+	return digester.Digest(), nil
+}
+
+// findFileOnTheHost checks whether the requested file already exists on the host and copies the file content from there if possible.
+// It currently looks only at files with the same path.  Ideally it could also detect the same content at different paths.
+func findFileOnTheHost(file zstdFileMetadata, root string, dirfd int, missingDirsMode os.FileMode) (*os.File, int64, error) {
+	sourceFile := filepath.Clean(filepath.Join("/", file.Name))
+	if !strings.HasPrefix(sourceFile, "/usr/") {
+		// limit host deduplication to files under /usr.
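+		// /usr is presumably chosen because distribution content there rarely
+		// differs from what images ship; anything copied from the host is
+		// verified below against the digest recorded in the manifest.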
+		return nil, 0, nil
+	}
+
+	st, err := os.Stat(sourceFile)
+	if err != nil || !st.Mode().IsRegular() {
+		return nil, 0, nil
+	}
+
+	if st.Size() != file.Size {
+		return nil, 0, nil
+	}
+
+	fd, err := unix.Open(sourceFile, unix.O_RDONLY|unix.O_NONBLOCK, 0)
+	if err != nil {
+		return nil, 0, nil
+	}
+
+	f := os.NewFile(uintptr(fd), "fd")
+	defer f.Close()
+
+	manifestChecksum, err := digest.Parse(file.Digest)
+	if err != nil {
+		return nil, 0, err
+	}
+
+	checksum, err := getFileDigest(f)
+	if err != nil {
+		return nil, 0, err
+	}
+
+	if checksum != manifestChecksum {
+		return nil, 0, nil
+	}
+
+	dstFile, written, err := copyFileContent(fmt.Sprintf("/proc/self/fd/%d", fd), file.Name, root, dirfd, missingDirsMode, 0)
+	if err != nil {
+		return nil, 0, nil
+	}
+
+	// calculate the checksum again to make sure the file wasn't modified while it was copied
+	if _, err := f.Seek(0, 0); err != nil {
+		return nil, 0, err
+	}
+	checksum, err = getFileDigest(f)
+	if err != nil {
+		return nil, 0, err
+	}
+	if checksum != manifestChecksum {
+		return nil, 0, nil
+	}
+	return dstFile, written, nil
+}
+
+func maybeDoIDRemap(manifest []zstdFileMetadata, options *archive.TarOptions) error {
+	if options.ChownOpts == nil && len(options.UIDMaps) == 0 && len(options.GIDMaps) == 0 {
+		return nil
+	}
+
+	idMappings := idtools.NewIDMappingsFromMaps(options.UIDMaps, options.GIDMaps)
+
+	for i := range manifest {
+		if options.ChownOpts != nil {
+			manifest[i].UID = options.ChownOpts.UID
+			manifest[i].GID = options.ChownOpts.GID
+		} else {
+			pair := idtools.IDPair{
+				UID: manifest[i].UID,
+				GID: manifest[i].GID,
+			}
+			var err error
+			manifest[i].UID, manifest[i].GID, err = idMappings.ToContainer(pair)
+			if err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+type missingFile struct {
+	File *zstdFileMetadata
+	Gap  int64
+}
+
+func (m missingFile) Length() int64 {
+	return m.File.EndOffset - m.File.Offset
+}
+
+type missingChunk struct {
+	RawChunk ImageSourceChunk
+	Files    []missingFile
+}
+
+func setFileAttrs(file *os.File, mode os.FileMode, metadata *zstdFileMetadata, options *archive.TarOptions) error {
+	if file == nil || file.Fd() < 0 {
+		return errors.Errorf("invalid file")
+	}
+	fd := int(file.Fd())
+
+	t, err := typeToTarType(metadata.Type)
+	if err != nil {
+		return err
+	}
+	if t == tar.TypeSymlink {
+		return nil
+	}
+
+	if err := unix.Fchown(fd, metadata.UID, metadata.GID); err != nil {
+		if !options.IgnoreChownErrors {
+			return err
+		}
+	}
+
+	for k, v := range metadata.Xattrs {
+		data, err := base64.StdEncoding.DecodeString(v)
+		if err != nil {
+			return err
+		}
+		if err := unix.Fsetxattr(fd, k, data, 0); err != nil {
+			return err
+		}
+	}
+
+	ts := []unix.Timespec{timeToTimespec(metadata.AccessTime), timeToTimespec(metadata.ModTime)}
+	if err := unix.UtimesNanoAt(fd, "", ts, 0); err != nil && !errors.Is(err, unix.ENOSYS) {
+		return err
+	}
+
+	if err := unix.Fchmod(fd, uint32(mode)); err != nil {
+		return err
+	}
+	return nil
+}
+
+func openFileUnderRoot(name, root string, dirfd int, flags uint64, mode os.FileMode) (*os.File, error) {
+	how := unix.OpenHow{
+		Flags:   flags,
+		Mode:    uint64(mode & 07777),
+		Resolve: unix.RESOLVE_IN_ROOT,
+	}
+
+	fd, err := unix.Openat2(dirfd, name, &how)
+	if err != nil {
+		return nil, err
+	}
+	return os.NewFile(uintptr(fd), name), nil
+}
+
+func createFileFromZstdStream(dest string, dirfd int, reader io.Reader, missingDirsMode, mode os.FileMode, metadata *zstdFileMetadata, options *archive.TarOptions) (err error) {
+	file, err := openFileUnderRoot(metadata.Name, dest, 
dirfd, newFileFlags, 0) + if err != nil { + return err + } + defer func() { + err2 := file.Close() + if err == nil { + err = err2 + } + }() + + z, err := zstd.NewReader(reader) + if err != nil { + return err + } + defer z.Close() + + digester := digest.Canonical.Digester() + checksum := digester.Hash() + _, err = z.WriteTo(io.MultiWriter(file, checksum)) + if err != nil { + return err + } + manifestChecksum, err := digest.Parse(metadata.Digest) + if err != nil { + return err + } + if digester.Digest() != manifestChecksum { + return fmt.Errorf("checksum mismatch for %q", dest) + } + return setFileAttrs(file, mode, metadata, options) +} + +func storeMissingFiles(streams chan io.ReadCloser, errs chan error, dest string, dirfd int, missingChunks []missingChunk, missingDirsMode os.FileMode, options *archive.TarOptions) error { + for mc := 0; ; mc++ { + var part io.ReadCloser + select { + case p := <-streams: + part = p + case err := <-errs: + return err + } + if part == nil { + if mc == len(missingChunks) { + break + } + return errors.Errorf("invalid stream returned %d %d", mc, len(missingChunks)) + } + if mc == len(missingChunks) { + return errors.Errorf("too many chunks returned") + } + + for _, mf := range missingChunks[mc].Files { + if mf.Gap > 0 { + limitReader := io.LimitReader(part, mf.Gap) + _, err := io.Copy(ioutil.Discard, limitReader) + if err != nil { + return err + } + continue + } + + limitReader := io.LimitReader(part, mf.Length()) + + if err := createFileFromZstdStream(dest, dirfd, limitReader, missingDirsMode, os.FileMode(mf.File.Mode), mf.File, options); err != nil { + part.Close() + return err + } + } + part.Close() + } + return nil +} + +func mergeMissingChunks(missingChunks []missingChunk, target int) []missingChunk { + if len(missingChunks) <= target { + return missingChunks + } + + getGap := func(missingChunks []missingChunk, i int) int { + prev := missingChunks[i-1].RawChunk.Offset + missingChunks[i-1].RawChunk.Length + return int(missingChunks[i].RawChunk.Offset - prev) + } + + // this implementation doesn't account for duplicates, so it could merge + // more than necessary to reach the specified target. Since target itself + // is a heuristic value, it doesn't matter. + var gaps []int + for i := 1; i < len(missingChunks); i++ { + gaps = append(gaps, getGap(missingChunks, i)) + } + sort.Ints(gaps) + + toShrink := len(missingChunks) - target + targetValue := gaps[toShrink-1] + + newMissingChunks := missingChunks[0:1] + for i := 1; i < len(missingChunks); i++ { + gap := getGap(missingChunks, i) + if gap > targetValue { + newMissingChunks = append(newMissingChunks, missingChunks[i]) + } else { + prev := &newMissingChunks[len(newMissingChunks)-1] + gapFile := missingFile{ + Gap: int64(gap), + } + prev.RawChunk.Length += uint64(gap) + missingChunks[i].RawChunk.Length + prev.Files = append(append(prev.Files, gapFile), missingChunks[i].Files...) + } + } + + return newMissingChunks +} + +func retrieveMissingFiles(input *chunkedZstdDiffer, dest string, dirfd int, missingChunks []missingChunk, missingDirsMode os.FileMode, options *archive.TarOptions) error { + var chunksToRequest []ImageSourceChunk + for _, c := range missingChunks { + chunksToRequest = append(chunksToRequest, c.RawChunk) + } + + // There are some missing files. Prepare a multirange request for the missing chunks. 
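+	// If the server rejects the multirange request (ErrBadRequest), halve the number of
+	// ranges by merging the chunks separated by the smallest gaps, then retry; below 64 ranges, give up.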
+ var streams chan io.ReadCloser + var err error + var errs chan error + for { + streams, errs, err = input.stream.GetBlobAt(chunksToRequest) + if err == nil { + break + } + + if _, ok := err.(ErrBadRequest); ok { + requested := len(missingChunks) + // If the server cannot handle at least 64 chunks in a single request, just give up. + if requested < 64 { + return err + } + + // Merge more chunks to request + missingChunks = mergeMissingChunks(missingChunks, requested/2) + continue + } + return err + } + + if err := storeMissingFiles(streams, errs, dest, dirfd, missingChunks, missingDirsMode, options); err != nil { + return err + } + return nil +} + +func safeMkdir(target string, dirfd int, mode os.FileMode, metadata *zstdFileMetadata, options *archive.TarOptions) error { + parent := filepath.Dir(metadata.Name) + base := filepath.Base(metadata.Name) + + parentFd := dirfd + if parent != "." { + parentFile, err := openFileUnderRoot(parent, target, dirfd, unix.O_DIRECTORY|unix.O_PATH|unix.O_RDONLY, 0) + if err != nil { + return err + } + defer parentFile.Close() + parentFd = int(parentFile.Fd()) + } + + if err := unix.Mkdirat(parentFd, base, uint32(mode)); err != nil { + if !os.IsExist(err) { + return err + } + } + + file, err := openFileUnderRoot(metadata.Name, target, dirfd, unix.O_RDONLY, 0) + if err != nil { + return err + } + defer file.Close() + + return setFileAttrs(file, mode, metadata, options) +} + +func safeLink(target string, dirfd int, mode os.FileMode, metadata *zstdFileMetadata, options *archive.TarOptions) error { + sourceFile, err := openFileUnderRoot(metadata.Linkname, target, dirfd, unix.O_RDONLY, 0) + if err != nil { + return err + } + defer sourceFile.Close() + + destDir, destBase := filepath.Dir(metadata.Name), filepath.Base(metadata.Name) + destDirFd := dirfd + if destDir != "." { + f, err := openFileUnderRoot(destDir, target, dirfd, unix.O_RDONLY, 0) + if err != nil { + return err + } + defer f.Close() + destDirFd = int(f.Fd()) + } + + err = unix.Linkat(int(sourceFile.Fd()), "", destDirFd, destBase, unix.AT_EMPTY_PATH) + if err != nil { + return err + } + + newFile, err := openFileUnderRoot(metadata.Name, target, dirfd, unix.O_WRONLY, 0) + if err != nil { + return err + } + defer newFile.Close() + + return setFileAttrs(newFile, mode, metadata, options) +} + +func safeSymlink(target string, dirfd int, mode os.FileMode, metadata *zstdFileMetadata, options *archive.TarOptions) error { + destDir, destBase := filepath.Dir(metadata.Name), filepath.Base(metadata.Name) + destDirFd := dirfd + if destDir != "." 
{ + f, err := openFileUnderRoot(destDir, target, dirfd, unix.O_RDONLY, 0) + if err != nil { + return err + } + defer f.Close() + destDirFd = int(f.Fd()) + } + + return unix.Symlinkat(metadata.Linkname, destDirFd, destBase) +} + +type whiteoutHandler struct { + Dirfd int + Root string +} + +func (d whiteoutHandler) Setxattr(path, name string, value []byte) error { + file, err := openFileUnderRoot(path, d.Root, d.Dirfd, unix.O_RDONLY, 0) + if err != nil { + return err + } + defer file.Close() + + return unix.Fsetxattr(int(file.Fd()), name, value, 0) +} + +func (d whiteoutHandler) Mknod(path string, mode uint32, dev int) error { + dir := filepath.Dir(path) + base := filepath.Base(path) + + dirfd := d.Dirfd + if dir != "" { + dir, err := openFileUnderRoot(dir, d.Root, d.Dirfd, unix.O_RDONLY, 0) + if err != nil { + return err + } + defer dir.Close() + + dirfd = int(dir.Fd()) + } + + return unix.Mknodat(dirfd, base, mode, dev) +} + +func checkChownErr(err error, name string, uid, gid int) error { + if errors.Is(err, syscall.EINVAL) { + return errors.Wrapf(err, "potentially insufficient UIDs or GIDs available in user namespace (requested %d:%d for %s): Check /etc/subuid and /etc/subgid", uid, gid, name) + } + return err +} + +func (d whiteoutHandler) Chown(path string, uid, gid int) error { + file, err := openFileUnderRoot(path, d.Root, d.Dirfd, unix.O_PATH, 0) + if err != nil { + return err + } + defer file.Close() + + if err := unix.Fchownat(int(file.Fd()), "", uid, gid, unix.AT_EMPTY_PATH); err != nil { + var stat unix.Stat_t + if unix.Fstat(int(file.Fd()), &stat) == nil { + if stat.Uid == uint32(uid) && stat.Gid == uint32(gid) { + return nil + } + } + return checkChownErr(err, path, uid, gid) + } + return nil +} + +type hardLinkToCreate struct { + dest string + dirfd int + mode os.FileMode + metadata *zstdFileMetadata +} + +func (d *chunkedZstdDiffer) ApplyDiff(dest string, options *archive.TarOptions) (graphdriver.DriverWithDifferOutput, error) { + bigData := map[string][]byte{ + bigDataKey: d.manifest, + } + output := graphdriver.DriverWithDifferOutput{ + Differ: d, + BigData: bigData, + } + + storeOpts, err := types.DefaultStoreOptionsAutoDetectUID() + if err != nil { + return output, err + } + + enableHostDedup := false + if value := storeOpts.PullOptions["enable_host_deduplication"]; strings.ToLower(value) == "true" { + enableHostDedup = true + } + + // Generate the manifest + var toc zstdTOC + if err := json.Unmarshal(d.manifest, &toc); err != nil { + return output, err + } + + whiteoutConverter := archive.GetWhiteoutConverter(options.WhiteoutFormat, options.WhiteoutData) + + var missingChunks []missingChunk + var mergedEntries []zstdFileMetadata + + if err := maybeDoIDRemap(toc.Entries, options); err != nil { + return output, err + } + + for _, e := range toc.Entries { + if e.Type == TypeChunk { + l := len(mergedEntries) + if l == 0 || mergedEntries[l-1].Type != TypeReg { + return output, errors.New("chunk type without a regular file") + } + mergedEntries[l-1].EndOffset = e.EndOffset + continue + } + mergedEntries = append(mergedEntries, e) + } + + if options.ForceMask != nil { + uid, gid, mode, err := archive.GetFileOwner(dest) + if err == nil { + value := fmt.Sprintf("%d:%d:0%o", uid, gid, mode) + if err := unix.Setxattr(dest, containersOverrideXattr, []byte(value), 0); err != nil { + return output, err + } + } + } + + dirfd, err := unix.Open(dest, unix.O_RDONLY|unix.O_PATH, 0) + if err != nil { + return output, err + } + defer unix.Close(dirfd) + + otherLayersCache := 
prepareOtherLayersCache(d.layersMetadata)
+
+	missingDirsMode := os.FileMode(0700)
+	if options.ForceMask != nil {
+		missingDirsMode = *options.ForceMask
+	}
+
+	// hardlinks can point to missing files.  So create them after all files
+	// are retrieved
+	var hardLinks []hardLinkToCreate
+
+	missingChunksSize, totalChunksSize := int64(0), int64(0)
+	for i, r := range mergedEntries {
+		if options.ForceMask != nil {
+			value := fmt.Sprintf("%d:%d:0%o", r.UID, r.GID, r.Mode&07777)
+			r.Xattrs[containersOverrideXattr] = base64.StdEncoding.EncodeToString([]byte(value))
+			r.Mode = int64(*options.ForceMask)
+		}
+
+		mode := os.FileMode(r.Mode)
+
+		r.Name = filepath.Clean(r.Name)
+		r.Linkname = filepath.Clean(r.Linkname)
+
+		t, err := typeToTarType(r.Type)
+		if err != nil {
+			return output, err
+		}
+		if whiteoutConverter != nil {
+			hdr := archivetar.Header{
+				Typeflag: t,
+				Name:     r.Name,
+				Linkname: r.Linkname,
+				Size:     r.Size,
+				Mode:     r.Mode,
+				Uid:      r.UID,
+				Gid:      r.GID,
+			}
+			handler := whiteoutHandler{
+				Dirfd: dirfd,
+				Root:  dest,
+			}
+			writeFile, err := whiteoutConverter.ConvertReadWithHandler(&hdr, r.Name, &handler)
+			if err != nil {
+				return output, err
+			}
+			if !writeFile {
+				continue
+			}
+		}
+		switch t {
+		case tar.TypeReg:
+			// Create directly empty files.
+			if r.Size == 0 {
+				// Used to have a scope for cleanup.
+				createEmptyFile := func() error {
+					file, err := openFileUnderRoot(r.Name, dest, dirfd, newFileFlags, 0)
+					if err != nil {
+						return err
+					}
+					defer file.Close()
+					if err := setFileAttrs(file, mode, &r, options); err != nil {
+						return err
+					}
+					return nil
+				}
+				if err := createEmptyFile(); err != nil {
+					return output, err
+				}
+				continue
+			}
+
+		case tar.TypeDir:
+			if err := safeMkdir(dest, dirfd, mode, &r, options); err != nil {
+				return output, err
+			}
+			continue
+
+		case tar.TypeLink:
+			dest := dest
+			dirfd := dirfd
+			mode := mode
+			r := r
+			hardLinks = append(hardLinks, hardLinkToCreate{
+				dest:     dest,
+				dirfd:    dirfd,
+				mode:     mode,
+				metadata: &r,
+			})
+			continue
+
+		case tar.TypeSymlink:
+			if err := safeSymlink(dest, dirfd, mode, &r, options); err != nil {
+				return output, err
+			}
+			continue
+
+		case tar.TypeChar:
+		case tar.TypeBlock:
+		case tar.TypeFifo:
+			/* Ignore.  */
+		default:
+			return output, fmt.Errorf("invalid type %q", t)
+		}
+
+		totalChunksSize += r.Size
+
+		dstFile, _, err := findFileInOtherLayers(r, dest, dirfd, otherLayersCache, d.layersTarget, missingDirsMode)
+		if err != nil {
+			return output, err
+		}
+		if dstFile != nil {
+			if err := setFileAttrs(dstFile, mode, &r, options); err != nil {
+				dstFile.Close()
+				return output, err
+			}
+			dstFile.Close()
+			continue
+		}
+
+		if enableHostDedup {
+			dstFile, _, err = findFileOnTheHost(r, dest, dirfd, missingDirsMode)
+			if err != nil {
+				return output, err
+			}
+			if dstFile != nil {
+				if err := setFileAttrs(dstFile, mode, &r, options); err != nil {
+					dstFile.Close()
+					return output, err
+				}
+				dstFile.Close()
+				continue
+			}
+		}
+
+		missingChunksSize += r.Size
+		if t == tar.TypeReg {
+			rawChunk := ImageSourceChunk{
+				Offset: uint64(r.Offset),
+				Length: uint64(r.EndOffset - r.Offset),
+			}
+			file := missingFile{
+				File: &mergedEntries[i],
+			}
+			missingChunks = append(missingChunks, missingChunk{
+				RawChunk: rawChunk,
+				Files: []missingFile{
+					file,
+				},
+			})
+		}
+	}
+	// There are some missing files. Prepare a multirange request for the missing chunks.
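+	// The request is first capped at maxNumberMissingChunks (1024) ranges by merging
+	// the chunks separated by the smallest gaps; retrieveMissingFiles then retries with
+	// progressively fewer ranges if the server rejects the request.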
+ if len(missingChunks) > 0 { + missingChunks = mergeMissingChunks(missingChunks, maxNumberMissingChunks) + if err := retrieveMissingFiles(d, dest, dirfd, missingChunks, missingDirsMode, options); err != nil { + return output, err + } + } + + for _, m := range hardLinks { + if err := safeLink(m.dest, m.dirfd, m.mode, m.metadata, options); err != nil { + return output, err + } + } + + if totalChunksSize > 0 { + logrus.Debugf("Missing %d bytes out of %d (%.2f %%)", missingChunksSize, totalChunksSize, float32(missingChunksSize*100.0)/float32(totalChunksSize)) + } + return output, nil +} diff --git a/pkg/chunked/zstdchunked_test.go b/pkg/chunked/zstdchunked_test.go new file mode 100644 index 0000000000..59892a3f4f --- /dev/null +++ b/pkg/chunked/zstdchunked_test.go @@ -0,0 +1,172 @@ +package chunked + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "testing" +) + +func TestIsZstdChunkedFrameMagic(t *testing.T) { + b := append(zstdChunkedFrameMagic[:], make([]byte, 200)...) + if !isZstdChunkedFrameMagic(b) { + t.Fatal("Chunked frame magic not found") + } + // change a byte + b[0] = -b[0] + if isZstdChunkedFrameMagic(b) { + t.Fatal("Invalid chunked frame magic found") + } +} + +type seekable struct { + data []byte + offset uint64 + length uint64 + t *testing.T +} + +func (s seekable) GetBlobAt(req []ImageSourceChunk) (chan io.ReadCloser, chan error, error) { + if len(req) != 1 { + s.t.Fatal("Requested more than one chunk") + } + if req[0].Offset != s.offset { + s.t.Fatal("Invalid offset requested") + } + if req[0].Length != s.length { + s.t.Fatal("Invalid length requested") + } + + m := make(chan io.ReadCloser) + e := make(chan error) + + go func() { + m <- ioutil.NopCloser(bytes.NewReader(s.data)) + close(m) + close(e) + }() + + return m, e, nil +} + +var someFiles = []zstdFileMetadata{ + { + Type: "dir", + Name: "/foo", + Mode: 0755, + Size: 0, + }, + { + Type: "reg", + Name: "/foo/bar", + Mode: 0755, + Size: 10, + Digest: "sha256:5891b5b522d5df086d0ff0b110fbd9d21bb4fc7163af34d08286a2e846f6be03", + Offset: 100, + EndOffset: 110, + ChunkSize: 10, + ChunkDigest: "sha256:5891b5b522d5df086d0ff0b110fbd9d21bb4fc7163af34d08286a2e846f6be03", + ChunkOffset: 0, + }, + { + Type: "reg", + Name: "/foo/baz", + Mode: 0755, + Size: 12, + Digest: "sha256:6f0378f21a495f5c13247317d158e9d51da45a5bf68fc2f366e450deafdc8302", + Offset: 200, + EndOffset: 212, + ChunkSize: 12, + ChunkDigest: "sha256:6f0378f21a495f5c13247317d158e9d51da45a5bf68fc2f366e450deafdc8302", + ChunkOffset: 0, + }, +} + +func TestGenerateAndParseManifest(t *testing.T) { + annotations := make(map[string]string) + offsetManifest := uint64(100000) + + var b bytes.Buffer + writer := bufio.NewWriter(&b) + if err := writeZstdChunkedManifest(writer, annotations, offsetManifest, someFiles[:], 9); err != nil { + t.Error(err) + } + if err := writer.Flush(); err != nil { + t.Error(err) + } + + offsetMetadata := annotations[manifestInfoKey] + if offsetMetadata == "" { + t.Fatal("Annotation not found") + } + + var offset, length, lengthUncompressed, manifestType uint64 + if _, err := fmt.Sscanf(offsetMetadata, "%d:%d:%d:%d", &offset, &length, &lengthUncompressed, &manifestType); err != nil { + t.Error(err) + } + + if offset != offsetManifest+8 { + t.Fatalf("Invalid offset %d", offset) + } + if manifestType != manifestTypeCRFS { + t.Fatalf("Invalid manifest type %d", manifestType) + } + if b.Len() == 0 { + t.Fatal("no manifest written") + } + + data := b.Bytes()[offset-offsetManifest:] + s := seekable{ + data: data, + offset: 
offset, + length: length, + t: t, + } + + manifest, err := readZstdChunkedManifest(s, 8192, annotations) + if err != nil { + t.Error(err) + } + + var toc zstdTOC + if err := json.Unmarshal(manifest, &toc); err != nil { + t.Error(err) + } + + if toc.Version != 1 { + t.Fatal("Invalid manifest version generated") + } + if len(toc.Entries) != len(someFiles) { + t.Fatal("Manifest mismatch") + } +} + +func TestGetTarType(t *testing.T) { + for k, v := range typesToTar { + r, err := typeToTarType(k) + if err != nil { + t.Error(err) + } + if r != v { + t.Fatal("Invalid typeToTarType conversion") + } + } + if _, err := typeToTarType("FOO"); err == nil { + t.Fatal("Invalid typeToTarType conversion") + } + for k, v := range tarTypes { + r, err := getType(k) + if err != nil { + t.Error(err) + } + if r != v { + t.Fatal("Invalid getType conversion") + } + } + if _, err := getType(byte('Z')); err == nil { + t.Fatal("Invalid getType conversion") + } +}