From 116f67036f503405b80a4d8baa429a6166490b80 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 3 May 2022 03:01:03 +0200 Subject: [PATCH 01/14] transfer: copy archives from another repo this is somehow similar to borg recreate, but with different focus and way simpler: not changing compression algo not changing chunking not excluding files inside an archive by path match only dealing with complete archives but: different src and dst repo only reading each chunk once keeping the compressed payload (no decompression/recompression effort) --dry-run can be used before and afterwards to check --- src/borg/archiver.py | 105 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index de9489ff0a..49516fff6b 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -338,6 +338,74 @@ def do_serve(self, args): ).serve() return EXIT_SUCCESS + @with_other_repository(manifest=True, key=True, compatibility=(Manifest.Operation.READ,)) + @with_repository(exclusive=True, manifest=True, cache=True, compatibility=(Manifest.Operation.WRITE,)) + def do_transfer(self, args, *, + repository, manifest, key, cache, + other_repository=None, other_manifest=None, other_key=None): + """archives transfer from other repository""" + dry_run = args.dry_run + + args.consider_checkpoints = True + archive_names = tuple(x.name for x in other_manifest.archives.list_considering(args)) + if not archive_names: + return EXIT_SUCCESS + + for name in archive_names: + transfer_size = 0 + present_size = 0 + if name in manifest.archives and not dry_run: + print(f"{name}: archive is already present in destination repo, skipping.") + else: + if not dry_run: + print(f"{name}: copying archive to destination repo...") + other_archive = Archive(other_repository, other_key, other_manifest, name) + archive = Archive(repository, key, manifest, name, cache=cache, create=True) if not dry_run else None + for item in other_archive.iter_items(): + if 'chunks' in item: + chunks = [] + for chunk_id, size, _ in item.chunks: + refcount = cache.seen_chunk(chunk_id, size) + if refcount == 0: # target repo does not yet have this chunk + if not dry_run: + cdata = other_repository.get(chunk_id) + # keep compressed payload same, avoid decompression / recompression + data = other_key.decrypt(chunk_id, cdata, decompress=False) + chunk_entry = cache.add_chunk(chunk_id, data, archive.stats, wait=False, + compress=False, size=size) + cache.repository.async_response(wait=False) + chunks.append(chunk_entry) + transfer_size += size + else: + if not dry_run: + chunk_entry = cache.chunk_incref(chunk_id, archive.stats) + chunks.append(chunk_entry) + present_size += size + if not dry_run: + item.chunks = chunks # overwrite! IDs and sizes are same, csizes are likely different + archive.stats.nfiles += 1 + # TODO: filter the item data, get rid of legacy crap + if not dry_run: + archive.add_item(item) + if not dry_run: + additional_metadata = {} + # keep all metadata except archive version and stats. also do not keep + # recreate_source_id, recreate_args, recreate_partial_chunks which were used only in 1.1.0b1 .. b2. + for attr in ('cmdline', 'hostname', 'username', 'time', 'time_end', 'comment', + 'chunker_params', 'recreate_cmdline'): + if hasattr(other_archive.metadata, attr): + additional_metadata[attr] = getattr(other_archive.metadata, attr) + archive.save(stats=archive.stats, additional_metadata=additional_metadata) + print(f"{name}: finished. 
" + f"transfer_size: {format_file_size(transfer_size)} " + f"present_size: {format_file_size(present_size)}") + else: + print(f"{name}: completed" if transfer_size == 0 else + f"{name}: incomplete, " + f"transfer_size: {format_file_size(transfer_size)} " + f"present_size: {format_file_size(present_size)}") + return EXIT_SUCCESS + @with_repository(create=True, exclusive=True, manifest=False) @with_other_repository(key=True, compatibility=(Manifest.Operation.READ, )) def do_init(self, args, repository, *, other_repository=None, other_key=None): @@ -4083,6 +4151,43 @@ def define_borg_mount(parser): help='archives to delete') define_archive_filters_group(subparser) + # borg transfer + transfer_epilog = process_epilog(""" + This command transfers archives from one repository to another repository. + + Suggested use: + + # initialize DST_REPO reusing key material from SRC_REPO, so that + # chunking and chunk id generation will work in the same way as before. + borg init --other-location=SRC_REPO --encryption=DST_ENC DST_REPO + + # transfer archives from SRC_REPO to DST_REPO + borg transfer --dry-run SRC_REPO DST_REPO # check what it would do + borg transfer SRC_REPO DST_REPO # do it! + borg transfer --dry-run SRC_REPO DST_REPO # check! anything left? + + The default is to transfer all archives, including checkpoint archives. + + You could use the misc. archive filter options to limit which archives it will + transfer, e.g. using the --prefix option. This is recommended for big + repositories with multiple data sets to keep the runtime per invocation lower. + """) + subparser = subparsers.add_parser('transfer', parents=[common_parser], add_help=False, + description=self.do_transfer.__doc__, + epilog=transfer_epilog, + formatter_class=argparse.RawDescriptionHelpFormatter, + help='transfer of archives from another repository') + subparser.set_defaults(func=self.do_transfer) + subparser.add_argument('-n', '--dry-run', dest='dry_run', action='store_true', + help='do not change repository, just check') + subparser.add_argument('other_location', metavar='SRC_REPOSITORY', + type=location_validator(archive=False, other=True), + help='source repository') + subparser.add_argument('location', metavar='DST_REPOSITORY', + type=location_validator(archive=False, other=False), + help='destination repository') + define_archive_filters_group(subparser) + # borg diff diff_epilog = process_epilog(""" This command finds differences (file contents, user/group/mode) between archives. From 98b7dc0bf5baecc8a371f78b8ea42fc5f1769c2c Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 3 May 2022 16:58:57 +0200 Subject: [PATCH 02/14] transfer: clean item of attic 0.13 'acl' bug remnants also: remove attic bug support code from borg check. borg transfer removes the acl key. we do not run borg check on old repos. --- src/borg/archive.py | 3 --- src/borg/archiver.py | 9 +++++++-- src/borg/testsuite/archiver.py | 29 ----------------------------- 3 files changed, 7 insertions(+), 34 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index a9c145d49e..ba95a08678 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1944,9 +1944,6 @@ def list_keys_safe(keys): def valid_item(obj): if not isinstance(obj, StableDict): return False, 'not a dictionary' - # A bug in Attic up to and including release 0.13 added a (meaningless) b'acl' key to every item. - # We ignore it here, should it exist. See test_attic013_acl_bug for details. 
- obj.pop(b'acl', None) keys = set(obj) if not required_item_keys.issubset(keys): return False, 'missing required keys: ' + list_keys_safe(required_item_keys - keys) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 49516fff6b..a4cddd7b00 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -344,6 +344,12 @@ def do_transfer(self, args, *, repository, manifest, key, cache, other_repository=None, other_manifest=None, other_key=None): """archives transfer from other repository""" + + def upgrade_item(item): + """upgrade item as needed, get rid of legacy crap""" + item._dict.pop('acl', None) # remove remnants of bug in attic <= 0.13 + return item + dry_run = args.dry_run args.consider_checkpoints = True @@ -384,9 +390,8 @@ def do_transfer(self, args, *, if not dry_run: item.chunks = chunks # overwrite! IDs and sizes are same, csizes are likely different archive.stats.nfiles += 1 - # TODO: filter the item data, get rid of legacy crap if not dry_run: - archive.add_item(item) + archive.add_item(upgrade_item(item)) if not dry_run: additional_metadata = {} # keep all metadata except archive version and stats. also do not keep diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index 450ba1757b..5889b12ab4 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -3907,35 +3907,6 @@ def test_empty_repository(self): repository.commit(compact=False) self.cmd('check', self.repository_location, exit_code=1) - def test_attic013_acl_bug(self): - # Attic up to release 0.13 contained a bug where every item unintentionally received - # a b'acl'=None key-value pair. - # This bug can still live on in Borg repositories (through borg upgrade). - class Attic013Item: - def as_dict(self): - return { - # These are required - b'path': '1234', - b'mtime': 0, - b'mode': 0, - b'user': b'0', - b'group': b'0', - b'uid': 0, - b'gid': 0, - # acl is the offending key. - b'acl': None, - } - - archive, repository = self.open_archive('archive1') - with repository: - manifest, key = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) - with Cache(repository, key, manifest) as cache: - archive = Archive(repository, key, manifest, '0.13', cache=cache, create=True) - archive.items_buffer.add(Attic013Item()) - archive.save() - self.cmd('check', self.repository_location, exit_code=0) - self.cmd('list', self.repository_location + '::0.13', exit_code=0) - class ManifestAuthenticationTest(ArchiverTestCaseBase): def spoof_manifest(self, repository): From ba1dbe6111d5b1f11b2a3d3c43a0fed83b2e7392 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 3 May 2022 17:13:37 +0200 Subject: [PATCH 03/14] transfer: make sure items with chunks have precomputed size --- src/borg/archiver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index a4cddd7b00..0c5dee5f1c 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -348,6 +348,7 @@ def do_transfer(self, args, *, def upgrade_item(item): """upgrade item as needed, get rid of legacy crap""" item._dict.pop('acl', None) # remove remnants of bug in attic <= 0.13 + item.get_size(memorize=True) # if not already present: compute+remember size for items with chunks return item dry_run = args.dry_run From 01f72d15b4c4ebd5dd21e6787dacf0b44d433d5c Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 3 May 2022 20:51:43 +0200 Subject: [PATCH 04/14] transfer: remove the zlib type bytes hack hack: see the docstring of ZLIB_legacy class. 
New clean ZLIB class that works as every other compressor. ZLIB ID 0x0500, ZLIB_legacy ID 0x.8.. --- src/borg/archiver.py | 8 +++++- src/borg/compress.pyx | 51 ++++++++++++++++++++++++++++++---- src/borg/testsuite/archiver.py | 4 +-- src/borg/testsuite/compress.py | 4 +-- 4 files changed, 56 insertions(+), 11 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 0c5dee5f1c..0982081670 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -44,7 +44,7 @@ from .archive import has_link from .cache import Cache, assert_secure, SecurityManager from .constants import * # NOQA - from .compress import CompressionSpec + from .compress import CompressionSpec, ZLIB, ZLIB_legacy from .crypto.key import key_creator, key_argument_names, tam_required_file, tam_required from .crypto.key import RepoKey, KeyfileKey, Blake2RepoKey, Blake2KeyfileKey, FlexiKey from .crypto.keymanager import KeyManager @@ -351,6 +351,11 @@ def upgrade_item(item): item.get_size(memorize=True) # if not already present: compute+remember size for items with chunks return item + def upgrade_compressed_chunk(chunk): + if ZLIB_legacy.detect(chunk): + chunk = ZLIB.ID + chunk # get rid of the attic legacy: prepend separate type bytes for zlib + return chunk + dry_run = args.dry_run args.consider_checkpoints = True @@ -378,6 +383,7 @@ def upgrade_item(item): cdata = other_repository.get(chunk_id) # keep compressed payload same, avoid decompression / recompression data = other_key.decrypt(chunk_id, cdata, decompress=False) + data = upgrade_compressed_chunk(data) chunk_entry = cache.add_chunk(chunk_id, data, archive.stats, wait=False, compress=False, size=size) cache.repository.async_response(wait=False) diff --git a/src/borg/compress.pyx b/src/borg/compress.pyx index 2e0eb4809b..7997456c6f 100644 --- a/src/borg/compress.pyx +++ b/src/borg/compress.pyx @@ -331,14 +331,52 @@ class ZSTD(DecidingCompressor): return dest[:osize] -class ZLIB(CompressorBase): +class ZLIB(DecidingCompressor): """ zlib compression / decompression (python stdlib) """ - ID = b'\x08\x00' # not used here, see detect() - # avoid all 0x.8.. IDs elsewhere! + ID = b'\x05\x00' name = 'zlib' + def __init__(self, level=6, **kwargs): + super().__init__(**kwargs) + self.level = level + + def _decide(self, data): + """ + Decides what to do with *data*. Returns (compressor, zlib_data). + + *zlib_data* is the ZLIB result if *compressor* is ZLIB as well, otherwise it is None. + """ + zlib_data = zlib.compress(data, self.level) + if len(zlib_data) < len(data): + return self, zlib_data + else: + return NONE_COMPRESSOR, None + + def decompress(self, data): + data = super().decompress(data) + try: + return zlib.decompress(data) + except zlib.error as e: + raise DecompressionError(str(e)) from None + + +class ZLIB_legacy(CompressorBase): + """ + zlib compression / decompression (python stdlib) + + Note: This is the legacy ZLIB support as used by borg < 1.3. + It still suffers from attic *only* supporting zlib and not having separate + ID bytes to differentiate between differently compressed chunks. + This just works because zlib compressed stuff always starts with 0x.8.. bytes. + Newer borg uses the ZLIB class that has separate ID bytes (as all the other + compressors) and does not need this hack. + """ + ID = b'\x08\x00' # not used here, see detect() + # avoid all 0x.8.. IDs elsewhere! + name = 'zlib_legacy' + @classmethod def detect(cls, data): # matches misc. patterns 0x.8.. 
used by zlib @@ -502,13 +540,14 @@ COMPRESSOR_TABLE = { CNONE.name: CNONE, LZ4.name: LZ4, ZLIB.name: ZLIB, + ZLIB_legacy.name: ZLIB_legacy, LZMA.name: LZMA, Auto.name: Auto, ZSTD.name: ZSTD, ObfuscateSize.name: ObfuscateSize, } # List of possible compression types. Does not include Auto, since it is a meta-Compressor. -COMPRESSOR_LIST = [LZ4, ZSTD, CNONE, ZLIB, LZMA, ObfuscateSize, ] # check fast stuff first +COMPRESSOR_LIST = [LZ4, ZSTD, CNONE, ZLIB, ZLIB_legacy, LZMA, ObfuscateSize, ] # check fast stuff first def get_compressor(name, **kwargs): cls = COMPRESSOR_TABLE[name] @@ -554,7 +593,7 @@ class CompressionSpec: self.name = values[0] if self.name in ('none', 'lz4', ): return - elif self.name in ('zlib', 'lzma', ): + elif self.name in ('zlib', 'lzma', 'zlib_legacy'): # zlib_legacy just for testing if count < 2: level = 6 # default compression level in py stdlib elif count == 2: @@ -597,7 +636,7 @@ class CompressionSpec: def compressor(self): if self.name in ('none', 'lz4', ): return get_compressor(self.name) - elif self.name in ('zlib', 'lzma', 'zstd', ): + elif self.name in ('zlib', 'lzma', 'zstd', 'zlib_legacy'): return get_compressor(self.name, level=self.level) elif self.name == 'auto': return get_compressor(self.name, compressor=self.inner.compressor) diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index 5889b12ab4..b69fe819f2 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -2442,7 +2442,7 @@ def test_compression_none_uncompressible(self): def test_compression_zlib_compressible(self): size, csize = self._get_sizes('zlib', compressible=True) assert csize < size * 0.1 - assert csize == 35 + assert csize == 37 def test_compression_zlib_uncompressible(self): size, csize = self._get_sizes('zlib', compressible=False) @@ -2451,7 +2451,7 @@ def test_compression_zlib_uncompressible(self): def test_compression_auto_compressible(self): size, csize = self._get_sizes('auto,zlib', compressible=True) assert csize < size * 0.1 - assert csize == 35 # same as compression 'zlib' + assert csize == 37 # same as compression 'zlib' def test_compression_auto_uncompressible(self): size, csize = self._get_sizes('auto,zlib', compressible=False) diff --git a/src/borg/testsuite/compress.py b/src/borg/testsuite/compress.py index 3942c3537f..c93dd3bb67 100644 --- a/src/borg/testsuite/compress.py +++ b/src/borg/testsuite/compress.py @@ -88,11 +88,11 @@ def test_autodetect_invalid(): Compressor(**params).decompress(b'\x08\x00notreallyzlib') -def test_zlib_compat(): +def test_zlib_legacy_compat(): # for compatibility reasons, we do not add an extra header for zlib, # nor do we expect one when decompressing / autodetecting for level in range(10): - c = get_compressor(name='zlib', level=level) + c = get_compressor(name='zlib_legacy', level=level) cdata1 = c.compress(data) cdata2 = zlib.compress(data, level) assert cdata1 == cdata2 From e4a97ea8cc475b5eb5f5f43d622a485874224727 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 4 May 2022 01:58:24 +0200 Subject: [PATCH 05/14] transfer: all hardlinks have chunks, maybe chunks_healty, hlid Item.hlid: same id, same hardlink (xxh64 digest) Item.hardlink_master: not used for new archives any more Item.source: not used for hardlink slaves any more --- src/borg/archiver.py | 15 +++++++++++++++ src/borg/constants.py | 2 +- src/borg/item.pyx | 3 ++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 0982081670..ea70f412a8 100644 --- 
a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -347,6 +347,20 @@ def do_transfer(self, args, *,

         def upgrade_item(item):
             """upgrade item as needed, get rid of legacy crap"""
+            if item.get('hardlink_master', True) and 'source' not in item and hardlinkable(item.mode):
+                item._dict['hlid'] = hlid = hashlib.sha256(item._dict['path'])
+                hardlink_masters[hlid] = (item._dict.get('chunks'), item._dict.get('chunks_healthy'))
+            elif 'source' in item and hardlinkable(item.mode):
+                item._dict['hlid'] = hlid = hashlib.sha256(item._dict['source'])
+                chunks, chunks_healthy = hardlink_masters.get(hlid, (None, None))
+                if chunks is not None:
+                    item._dict['chunks'] = chunks
+                    for chunk_id, _, _ in chunks:
+                        cache.chunk_incref(chunk_id, archive.stats)
+                if chunks_healthy is not None:
+                    item._dict['chunks_healthy'] = chunks_healthy
+                item._dict.pop('source')  # not used for hardlinks any more, replaced by hlid
+            item._dict.pop('hardlink_master', None)  # not used for hardlinks any more, replaced by hlid
             item._dict.pop('acl', None)  # remove remnants of bug in attic <= 0.13
             item.get_size(memorize=True)  # if not already present: compute+remember size for items with chunks
             return item
@@ -371,6 +385,7 @@ def upgrade_compressed_chunk(chunk):
             else:
                 if not dry_run:
                     print(f"{name}: copying archive to destination repo...")
+                hardlink_masters = {}
                 other_archive = Archive(other_repository, other_key, other_manifest, name)
                 archive = Archive(repository, key, manifest, name, cache=cache, create=True) if not dry_run else None
                 for item in other_archive.iter_items():
diff --git a/src/borg/constants.py b/src/borg/constants.py
index 0b2ef16a16..13eb8bd232 100644
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@@ -1,5 +1,5 @@
 # this set must be kept complete, otherwise the RobustUnpacker might malfunction:
-ITEM_KEYS = frozenset(['path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hardlink_master',
+ITEM_KEYS = frozenset(['path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hardlink_master', 'hlid',
                        'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime', 'birthtime', 'size',
                        'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended', 'part'])
diff --git a/src/borg/item.pyx b/src/borg/item.pyx
index 48debf1839..0b2598ffe3 100644
--- a/src/borg/item.pyx
+++ b/src/borg/item.pyx
@@ -181,7 +181,8 @@ class Item(PropDict):
     # compatibility note: this is a new feature, in old archives size will be missing.
     size = PropDict._make_property('size', int)

-    hardlink_master = PropDict._make_property('hardlink_master', bool)
+    hlid = PropDict._make_property('hlid', bytes)  # hard link id: same value means same hard link.
+    hardlink_master = PropDict._make_property('hardlink_master', bool)  # legacy

     chunks = PropDict._make_property('chunks', (list, type(None)), 'list or None')
     chunks_healthy = PropDict._make_property('chunks_healthy', (list, type(None)), 'list or None')

From 7903dad1839c0c1e3a64dbbce8115e2d8aae5804 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Wed, 4 May 2022 10:34:33 +0200
Subject: [PATCH 06/14] transfer: convert timestamps int/bigint -> msgpack.Timestamp, see #2323

Timestamp scales to 64 or 96 bit serialization formats, which should be
enough for everybody. We use this in archived items and also in the
files cache.
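
Illustration, not part of the patch: a minimal sketch of the
int-nanoseconds <-> Timestamp round-trip this commit relies on, assuming
msgpack >= 1.0 (where Timestamp serializes as the standard msgpack ext
type -1, in 64 or 96 bit form depending on the value):

    from msgpack import Timestamp, packb, unpackb

    ns = 1651622400000000001                     # int nanoseconds since epoch
    ts = Timestamp.from_unix_nano(ns)            # encode: int ns -> Timestamp
    packed = packb(ts)                           # packs as msgpack ext type -1
    assert unpackb(packed).to_unix_nano() == ns  # decode: Timestamp -> int ns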
---
 src/borg/archiver.py            |  4 ++++
 src/borg/cache.py               | 11 ++++++-----
 src/borg/helpers/msgpack.py     | 17 ++++++++++++++---
 src/borg/helpers/parseformat.py |  3 +++
 src/borg/item.pyx               | 11 +++++------
 src/borg/testsuite/item.py      |  7 ++++---
 6 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index ea70f412a8..eac059bb73 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -360,6 +360,10 @@ def upgrade_item(item):
                 if chunks_healthy is not None:
                     item._dict['chunks_healthy'] = chunks_healthy
                 item._dict.pop('source')  # not used for hardlinks any more, replaced by hlid
+            for attr in 'atime', 'ctime', 'mtime', 'birthtime':
+                if attr in item:
+                    ns = getattr(item, attr)  # decode (bigint or Timestamp) --> int ns
+                    setattr(item, attr, ns)  # encode int ns --> msgpack.Timestamp only, no bigint any more
             item._dict.pop('hardlink_master', None)  # not used for hardlinks any more, replaced by hlid
             item._dict.pop('acl', None)  # remove remnants of bug in attic <= 0.13
             item.get_size(memorize=True)  # if not already present: compute+remember size for items with chunks
diff --git a/src/borg/cache.py b/src/borg/cache.py
index 6fa74e692d..6cd6123590 100644
--- a/src/borg/cache.py
+++ b/src/borg/cache.py
@@ -19,7 +19,7 @@
 from .helpers import Error
 from .helpers import Manifest
 from .helpers import get_cache_dir, get_security_dir
-from .helpers import int_to_bigint, bigint_to_int, bin_to_hex, parse_stringified_list
+from .helpers import bin_to_hex, parse_stringified_list
 from .helpers import format_file_size
 from .helpers import safe_ns
 from .helpers import yes
@@ -28,6 +28,7 @@
 from .helpers import set_ec, EXIT_WARNING
 from .helpers import safe_unlink
 from .helpers import msgpack
+from .helpers.msgpack import int_to_timestamp, timestamp_to_int
 from .item import ArchiveItem, ChunkListEntry
 from .crypto.key import PlaintextKey
 from .crypto.file_integrity import IntegrityCheckedFile, DetachedIntegrityCheckedFile, FileIntegrityError
@@ -623,7 +624,7 @@ def commit(self):
             # this is to avoid issues with filesystem snapshots and cmtime granularity.
             # Also keep files from older backups that have not reached BORG_FILES_CACHE_TTL yet.
             entry = FileCacheEntry(*msgpack.unpackb(item))
-            if entry.age == 0 and bigint_to_int(entry.cmtime) < self._newest_cmtime or \
+            if entry.age == 0 and timestamp_to_int(entry.cmtime) < self._newest_cmtime or \
                entry.age > 0 and entry.age < ttl:
                 msgpack.pack((path_hash, entry), fd)
                 entry_count += 1
@@ -1018,10 +1019,10 @@ def file_known_and_unchanged(self, hashed_path, path_hash, st):
         if 'i' in cache_mode and entry.inode != st.st_ino:
             files_cache_logger.debug('KNOWN-CHANGED: file inode number has changed: %r', hashed_path)
             return True, None
-        if 'c' in cache_mode and bigint_to_int(entry.cmtime) != st.st_ctime_ns:
+        if 'c' in cache_mode and timestamp_to_int(entry.cmtime) != st.st_ctime_ns:
             files_cache_logger.debug('KNOWN-CHANGED: file ctime has changed: %r', hashed_path)
             return True, None
-        elif 'm' in cache_mode and bigint_to_int(entry.cmtime) != st.st_mtime_ns:
+        elif 'm' in cache_mode and timestamp_to_int(entry.cmtime) != st.st_mtime_ns:
             files_cache_logger.debug('KNOWN-CHANGED: file mtime has changed: %r', hashed_path)
             return True, None
         # we ignored the inode number in the comparison above or it is still same.
@@ -1049,7 +1050,7 @@ def memorize_file(self, hashed_path, path_hash, st, ids): elif 'm' in cache_mode: cmtime_type = 'mtime' cmtime_ns = safe_ns(st.st_mtime_ns) - entry = FileCacheEntry(age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_bigint(cmtime_ns), chunk_ids=ids) + entry = FileCacheEntry(age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_timestamp(cmtime_ns), chunk_ids=ids) self.files[path_hash] = msgpack.packb(entry) self._newest_cmtime = max(self._newest_cmtime or 0, cmtime_ns) files_cache_logger.debug('FILES-CACHE-UPDATE: put %r [has %s] <- %r', diff --git a/src/borg/helpers/msgpack.py b/src/borg/helpers/msgpack.py index 2ace88feef..411f00fec4 100644 --- a/src/borg/helpers/msgpack.py +++ b/src/borg/helpers/msgpack.py @@ -24,7 +24,7 @@ from msgpack import unpack as mp_unpack from msgpack import version as mp_version -from msgpack import ExtType +from msgpack import ExtType, Timestamp from msgpack import OutOfData @@ -164,7 +164,7 @@ def get_limited_unpacker(kind): return Unpacker(**args) -def bigint_to_int(mtime): +def bigint_to_int(mtime): # legacy """Convert bytearray to int """ if isinstance(mtime, bytes): @@ -172,7 +172,7 @@ def bigint_to_int(mtime): return mtime -def int_to_bigint(value): +def int_to_bigint(value): # legacy """Convert integers larger than 64 bits to bytearray Smaller integers are left alone @@ -180,3 +180,14 @@ def int_to_bigint(value): if value.bit_length() > 63: return value.to_bytes((value.bit_length() + 9) // 8, 'little', signed=True) return value + + +def int_to_timestamp(ns): + return Timestamp.from_unix_nano(ns) + + +def timestamp_to_int(ts): + if isinstance(ts, Timestamp): + return ts.to_unix_nano() + # legacy support note: we need to keep the bigint conversion for compatibility with borg < 1.3 archives. + return bigint_to_int(ts) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 3e145ada2d..9bcbce22c8 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -19,6 +19,7 @@ from .errors import Error from .fs import get_keys_dir +from .msgpack import Timestamp from .time import OutputTimestamp, format_time, to_localtime, safe_timestamp, safe_s from .. import __version__ as borg_version from .. import __version_tuple__ as borg_version_tuple @@ -1043,6 +1044,8 @@ def decode(d): value = decode_tuple(value) elif isinstance(value, bytes): value = decode_bytes(value) + elif isinstance(value, Timestamp): + value = value.to_unix_nano() if isinstance(key, bytes): key = key.decode() res[key] = value diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 0b2598ffe3..764279db03 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -3,9 +3,9 @@ from collections import namedtuple from .constants import ITEM_KEYS, ARCHIVE_KEYS from .helpers import safe_encode, safe_decode -from .helpers import bigint_to_int, int_to_bigint from .helpers import StableDict from .helpers import format_file_size +from .helpers.msgpack import timestamp_to_int, int_to_timestamp cdef extern from "_item.c": @@ -171,11 +171,10 @@ class Item(PropDict): rdev = PropDict._make_property('rdev', int) bsdflags = PropDict._make_property('bsdflags', int) - # note: we need to keep the bigint conversion for compatibility with borg 1.0 archives. 
-    atime = PropDict._make_property('atime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int)
-    ctime = PropDict._make_property('ctime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int)
-    mtime = PropDict._make_property('mtime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int)
-    birthtime = PropDict._make_property('birthtime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int)
+    atime = PropDict._make_property('atime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
+    ctime = PropDict._make_property('ctime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
+    mtime = PropDict._make_property('mtime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
+    birthtime = PropDict._make_property('birthtime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)

     # size is only present for items with a chunk list and then it is sum(chunk_sizes)
     # compatibility note: this is a new feature, in old archives size will be missing.
diff --git a/src/borg/testsuite/item.py b/src/borg/testsuite/item.py
index aa40cc0660..80b38edce4 100644
--- a/src/borg/testsuite/item.py
+++ b/src/borg/testsuite/item.py
@@ -3,6 +3,7 @@
 from ..cache import ChunkListEntry
 from ..item import Item
 from ..helpers import StableDict
+from ..helpers.msgpack import Timestamp


 def test_item_empty():
@@ -77,15 +78,15 @@ def test_item_int_property():
         item.mode = "invalid"


-def test_item_bigint_property():
+def test_item_mptimestamp_property():
     item = Item()
     small, big = 42, 2 ** 65
     item.atime = small
     assert item.atime == small
-    assert item.as_dict() == {'atime': small}
+    assert item.as_dict() == {'atime': Timestamp.from_unix_nano(small)}
     item.atime = big
     assert item.atime == big
-    assert item.as_dict() == {'atime': b'\0' * 8 + b'\x02'}
+    assert item.as_dict() == {'atime': Timestamp.from_unix_nano(big)}


 def test_item_user_group_none():

From 6bfdb3f63016a73b320c89fce4a80ca08e4ad49d Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Sun, 8 May 2022 14:14:47 +0200
Subject: [PATCH 07/14] refactor hardlink_master processing globally

borg now has the chunks list in every item with content.

due to the symmetric way borg now deals with hardlinks using item.hlid,
processing gets much simpler.

but some places where borg deals with other "sources" of hardlinks
still need to do some hardlink management: borg uses the HardLinkManager
there now (which is not much more than a dict, but keeps documentation
in one place and avoids some code duplication we had before).

item.hlid is computed via the hardlink_id function.

support hardlinked symlinks, fixes #2379: as we use item.hlid now to
group hardlinks together, there is no conflict with the item.source
usage for symlink targets any more.

2nd+ hardlinks now add to the files count as did the 1st one. for borg,
now all hardlinks are created equal, so any hardlink item with chunks
now adds to the "file" count.
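
Illustration only, not part of the patch: a rough sketch of the
HardLinkManager idea described above, essentially a dict keyed by a
hardlink id derived from a path. Method names follow the hunks below;
the digest choice and other details here are assumptions, the real
class may differ:

    import hashlib

    class HardLinkManager:
        """remember shared info (e.g. path or chunks) per hardlink id (hlid)"""
        def __init__(self, *, id_type, info_type):
            self.id_type, self.info_type = id_type, info_type  # documentation of the mapping
            self._map = {}

        def hardlink_id(self, path):
            # same path -> same digest, so all hardlinks to one inode share a hlid
            path = path if isinstance(path, bytes) else path.encode()
            return hashlib.sha256(path).digest()

        def remember(self, *, id, info):
            self._map[id] = info

        def retrieve(self, id, default=None):
            return self._map.get(id, default)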
ItemFormatter: support {hlid} instead of {source} for hardlinks --- docs/faq.rst | 3 - docs/internals/data-structures.rst | 2 +- docs/usage/general/file-metadata.rst.inc | 2 +- src/borg/archive.py | 286 +++++++++-------------- src/borg/archiver.py | 95 +++----- src/borg/fuse.py | 39 ++-- src/borg/helpers/fs.py | 68 +++++- src/borg/helpers/parseformat.py | 14 +- src/borg/item.pyx | 1 - src/borg/testsuite/archiver.py | 44 +++- 10 files changed, 263 insertions(+), 291 deletions(-) diff --git a/docs/faq.rst b/docs/faq.rst index 05f5176bc5..d16eff6c03 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -132,9 +132,6 @@ Which file types, attributes, etc. are *not* preserved? Archive extraction has optional support to extract all-zero chunks as holes in a sparse file. * Some filesystem specific attributes, like btrfs NOCOW, see :ref:`platforms`. - * For hardlinked symlinks, the hardlinking can not be archived (and thus, - the hardlinking will not be done at extraction time). The symlinks will - be archived and extracted as non-hardlinked symlinks, see :issue:`2379`. Are there other known limitations? ---------------------------------- diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst index 2c5b7c1948..d1a5a4cd38 100644 --- a/docs/internals/data-structures.rst +++ b/docs/internals/data-structures.rst @@ -567,7 +567,7 @@ dictionary created by the ``Item`` class that contains: * uid * gid * mode (item type + permissions) -* source (for symlinks, and for hardlinks within one archive) +* source (for symlinks) * rdev (for device files) * mtime, atime, ctime in nanoseconds * xattrs diff --git a/docs/usage/general/file-metadata.rst.inc b/docs/usage/general/file-metadata.rst.inc index 8f4c67cbfb..c2694d1888 100644 --- a/docs/usage/general/file-metadata.rst.inc +++ b/docs/usage/general/file-metadata.rst.inc @@ -10,7 +10,7 @@ Besides regular file and directory structures, Borg can preserve * FIFOs ("named pipes") * special file *contents* can be backed up in ``--read-special`` mode. By default the metadata to create them with mknod(2), mkfifo(2) etc. is stored. -* hardlinked regular files, devices, FIFOs (considering all items in the same archive) +* hardlinked regular files, devices, symlinks, FIFOs (considering all items in the same archive) * timestamps in nanosecond precision: mtime, atime, ctime * other timestamps: birthtime (on platforms supporting it) * permissions: diff --git a/src/borg/archive.py b/src/borg/archive.py index ba95a08678..da083eac6b 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -28,7 +28,7 @@ from .crypto.low_level import IntegrityError as IntegrityErrorBase from .hashindex import ChunkIndex, ChunkIndexEntry, CacheSynchronizer from .helpers import Manifest -from .helpers import hardlinkable +from .helpers import HardLinkManager from .helpers import ChunkIteratorFileWrapper, open_item from .helpers import Error, IntegrityError, set_ec from .platform import uid2user, user2uid, gid2group, group2gid @@ -280,7 +280,7 @@ def __init__(self, repository, key): self.repository = repository self.key = key - def unpack_many(self, ids, filter=None, partial_extract=False, preload=False, hardlink_masters=None): + def unpack_many(self, ids, *, filter=None, preload=False): """ Return iterator of items. 
@@ -290,10 +290,7 @@ def unpack_many(self, ids, filter=None, partial_extract=False, preload=False, ha Warning: if *preload* is True then all data chunks of every yielded item have to be retrieved, otherwise preloaded chunks will accumulate in RemoteRepository and create a memory leak. """ - def _preload(chunks): - self.repository.preload([c.id for c in chunks]) - - masters_preloaded = set() + hlids_preloaded = set() unpacker = msgpack.Unpacker(use_list=False) for data in self.fetch_many(ids): unpacker.feed(data) @@ -306,33 +303,20 @@ def _preload(chunks): items = [item for item in items if filter(item)] if preload: - if filter and partial_extract: - # if we do only a partial extraction, it gets a bit - # complicated with computing the preload items: if a hardlink master item is not - # selected (== not extracted), we will still need to preload its chunks if a - # corresponding hardlink slave is selected (== is extracted). - # due to a side effect of the filter() call, we now have hardlink_masters dict populated. - for item in items: - if hardlinkable(item.mode): - source = item.get('source') - if source is None: # maybe a hardlink master - if 'chunks' in item: - _preload(item.chunks) - # if this is a hl master, remember that we already preloaded all chunks of it (if any): - if item.get('hardlink_master', True): - masters_preloaded.add(item.path) - else: # hardlink slave - if source not in masters_preloaded: - # we only need to preload *once* (for the 1st selected slave) - chunks, _ = hardlink_masters[source] - if chunks is not None: - _preload(chunks) - masters_preloaded.add(source) - else: - # easy: we do not have a filter, thus all items are selected, thus we need to preload all chunks. - for item in items: - if 'chunks' in item: - _preload(item.chunks) + for item in items: + if 'chunks' in item: + hlid = item.get('hlid', None) + if hlid is None: + preload_chunks = True + else: + if hlid in hlids_preloaded: + preload_chunks = False + else: + # not having the hardlink's chunks already preloaded for other hardlink to same inode + preload_chunks = True + hlids_preloaded.add(hlid) + if preload_chunks: + self.repository.preload([c.id for c in item.chunks]) for item in items: yield item @@ -443,7 +427,6 @@ def __init__(self, repository, key, manifest, name, cache=None, create=False, self.repository = repository self.cache = cache self.manifest = manifest - self.hard_links = {} self.stats = Statistics(output_json=log_json, iec=iec) self.iec = iec self.show_progress = progress @@ -584,12 +567,10 @@ def item_filter(self, item, filter=None): return False return filter(item) if filter else True - def iter_items(self, filter=None, partial_extract=False, preload=False, hardlink_masters=None): + def iter_items(self, filter=None, preload=False): # note: when calling this with preload=True, later fetch_many() must be called with # is_preloaded=True or the RemoteRepository code will leak memory! 
- assert not (filter and partial_extract and preload) or hardlink_masters is not None - for item in self.pipeline.unpack_many(self.metadata.items, partial_extract=partial_extract, - preload=preload, hardlink_masters=hardlink_masters, + for item in self.pipeline.unpack_many(self.metadata.items, preload=preload, filter=lambda item: self.item_filter(item, filter)): yield item @@ -719,33 +700,30 @@ def add(id): return stats @contextmanager - def extract_helper(self, dest, item, path, stripped_components, original_path, hardlink_masters): + def extract_helper(self, item, path, hlm, *, dry_run=False): hardlink_set = False # Hard link? - if 'source' in item: - source = os.path.join(dest, *item.source.split(os.sep)[stripped_components:]) - chunks, link_target = hardlink_masters.get(item.source, (None, source)) - if link_target and has_link: - # Hard link was extracted previously, just link - with backup_io('link'): - os.link(link_target, path) - hardlink_set = True - elif chunks is not None: - # assign chunks to this item, since the item which had the chunks was not extracted - item.chunks = chunks + if 'hlid' in item: + link_target = hlm.retrieve(id=item.hlid) + if link_target is not None and has_link: + if not dry_run: + # another hardlink to same inode (same hlid) was extracted previously, just link to it + with backup_io('link'): + os.link(link_target, path, follow_symlinks=False) + hardlink_set = True yield hardlink_set - if not hardlink_set and hardlink_masters: - if has_link: - # Update master entry with extracted item path, so that following hardlinks don't extract twice. + if not hardlink_set: + if 'hlid' in item and has_link: + # Update entry with extracted item path, so that following hardlinks don't extract twice. # We have hardlinking support, so we will hardlink not extract. - hardlink_masters[item.get('source') or original_path] = (None, path) + hlm.remember(id=item.hlid, info=path) else: # Broken platform with no hardlinking support. # In this case, we *want* to extract twice, because there is no other way. pass def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sparse=False, - hardlink_masters=None, stripped_components=0, original_path=None, pi=None): + hlm=None, stripped_components=0, original_path=None, pi=None): """ Extract archive item. 
@@ -754,29 +732,33 @@ def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sp :param dry_run: do not write any data :param stdout: write extracted data to stdout :param sparse: write sparse files (chunk-granularity, independent of the original being sparse) - :param hardlink_masters: maps paths to (chunks, link_target) for extracting subtrees with hardlinks correctly + :param hlm: maps hlid to link_target for extracting subtrees with hardlinks correctly :param stripped_components: stripped leading path components to correct hard link extraction :param original_path: 'path' key as stored in archive :param pi: ProgressIndicatorPercent (or similar) for file extraction progress (in bytes) """ - hardlink_masters = hardlink_masters or {} has_damaged_chunks = 'chunks_healthy' in item if dry_run or stdout: - if 'chunks' in item: - item_chunks_size = 0 - for data in self.pipeline.fetch_many([c.id for c in item.chunks], is_preloaded=True): - if pi: - pi.show(increase=len(data), info=[remove_surrogates(item.path)]) - if stdout: - sys.stdout.buffer.write(data) - item_chunks_size += len(data) - if stdout: - sys.stdout.buffer.flush() - if 'size' in item: - item_size = item.size - if item_size != item_chunks_size: - raise BackupError('Size inconsistency detected: size {}, chunks size {}'.format( - item_size, item_chunks_size)) + with self.extract_helper(item, '', hlm, dry_run=dry_run or stdout) as hardlink_set: + if not hardlink_set: + # it does not really set hardlinks due to dry_run, but we need to behave same + # as non-dry_run concerning fetching preloaded chunks from the pipeline or + # it would get stuck. + if 'chunks' in item: + item_chunks_size = 0 + for data in self.pipeline.fetch_many([c.id for c in item.chunks], is_preloaded=True): + if pi: + pi.show(increase=len(data), info=[remove_surrogates(item.path)]) + if stdout: + sys.stdout.buffer.write(data) + item_chunks_size += len(data) + if stdout: + sys.stdout.buffer.flush() + if 'size' in item: + item_size = item.size + if item_size != item_chunks_size: + raise BackupError('Size inconsistency detected: size {}, chunks size {}'.format( + item_size, item_chunks_size)) if has_damaged_chunks: raise BackupError('File has damaged (all-zero) chunks. Try running borg check --repair.') return @@ -807,8 +789,7 @@ def make_parent(path): if stat.S_ISREG(mode): with backup_io('makedirs'): make_parent(path) - with self.extract_helper(dest, item, path, stripped_components, original_path, - hardlink_masters) as hardlink_set: + with self.extract_helper(item, path, hlm) as hardlink_set: if hardlink_set: return with backup_io('open'): @@ -847,24 +828,26 @@ def make_parent(path): self.restore_attrs(path, item) elif stat.S_ISLNK(mode): make_parent(path) - source = item.source - try: - os.symlink(source, path) - except UnicodeEncodeError: - raise self.IncompatibleFilesystemEncodingError(source, sys.getfilesystemencoding()) from None - self.restore_attrs(path, item, symlink=True) + with self.extract_helper(item, path, hlm) as hardlink_set: + if hardlink_set: + # unusual, but possible: this is a hardlinked symlink. 
+ return + source = item.source + try: + os.symlink(source, path) + except UnicodeEncodeError: + raise self.IncompatibleFilesystemEncodingError(source, sys.getfilesystemencoding()) from None + self.restore_attrs(path, item, symlink=True) elif stat.S_ISFIFO(mode): make_parent(path) - with self.extract_helper(dest, item, path, stripped_components, original_path, - hardlink_masters) as hardlink_set: + with self.extract_helper(item, path, hlm) as hardlink_set: if hardlink_set: return os.mkfifo(path) self.restore_attrs(path, item) elif stat.S_ISCHR(mode) or stat.S_ISBLK(mode): make_parent(path) - with self.extract_helper(dest, item, path, stripped_components, original_path, - hardlink_masters) as hardlink_set: + with self.extract_helper(item, path, hlm) as hardlink_set: if hardlink_set: return os.mknod(path, item.mode, item.rdev) @@ -1041,79 +1024,43 @@ def compare_archives_iter(archive1, archive2, matcher=None, can_compare_chunk_id :param can_compare_chunk_ids: Whether --chunker-params are the same for both archives. """ - def hardlink_master_seen(item): - return 'source' not in item or not hardlinkable(item.mode) or item.source in hardlink_masters - - def is_hardlink_master(item): - return item.get('hardlink_master', True) and 'source' not in item and hardlinkable(item.mode) - - def update_hardlink_masters(item1, item2): - if is_hardlink_master(item1) or is_hardlink_master(item2): - hardlink_masters[item1.path] = (item1, item2) - - def has_hardlink_master(item, hardlink_masters): - return hardlinkable(item.mode) and item.get('source') in hardlink_masters - def compare_items(item1, item2): - if has_hardlink_master(item1, hardlink_masters): - item1 = hardlink_masters[item1.source][0] - if has_hardlink_master(item2, hardlink_masters): - item2 = hardlink_masters[item2.source][1] return ItemDiff(item1, item2, archive1.pipeline.fetch_many([c.id for c in item1.get('chunks', [])]), archive2.pipeline.fetch_many([c.id for c in item2.get('chunks', [])]), can_compare_chunk_ids=can_compare_chunk_ids) - def defer_if_necessary(item1, item2): - """Adds item tuple to deferred if necessary and returns True, if items were deferred""" - update_hardlink_masters(item1, item2) - defer = not hardlink_master_seen(item1) or not hardlink_master_seen(item2) - if defer: - deferred.append((item1, item2)) - return defer - orphans_archive1 = OrderedDict() orphans_archive2 = OrderedDict() - deferred = [] - hardlink_masters = {} for item1, item2 in zip_longest( archive1.iter_items(lambda item: matcher.match(item.path)), archive2.iter_items(lambda item: matcher.match(item.path)), ): if item1 and item2 and item1.path == item2.path: - if not defer_if_necessary(item1, item2): - yield (item1.path, compare_items(item1, item2)) + yield (item1.path, compare_items(item1, item2)) continue if item1: matching_orphan = orphans_archive2.pop(item1.path, None) if matching_orphan: - if not defer_if_necessary(item1, matching_orphan): - yield (item1.path, compare_items(item1, matching_orphan)) + yield (item1.path, compare_items(item1, matching_orphan)) else: orphans_archive1[item1.path] = item1 if item2: matching_orphan = orphans_archive1.pop(item2.path, None) if matching_orphan: - if not defer_if_necessary(matching_orphan, item2): - yield (matching_orphan.path, compare_items(matching_orphan, item2)) + yield (matching_orphan.path, compare_items(matching_orphan, item2)) else: orphans_archive2[item2.path] = item2 # At this point orphans_* contain items that had no matching partner in the other archive for added in orphans_archive2.values(): path 
= added.path deleted_item = Item.create_deleted(path) - update_hardlink_masters(deleted_item, added) yield (path, compare_items(deleted_item, added)) for deleted in orphans_archive1.values(): path = deleted.path deleted_item = Item.create_deleted(path) - update_hardlink_masters(deleted, deleted_item) yield (path, compare_items(deleted, deleted_item)) - for item1, item2 in deferred: - assert hardlink_master_seen(item1) - assert hardlink_master_seen(item2) - yield (path, compare_items(item1, item2)) class MetadataCollector: @@ -1289,7 +1236,7 @@ def __init__(self, *, metadata_collector, cache, key, self.show_progress = show_progress self.print_file_status = file_status_printer or (lambda *args: None) - self.hard_links = {} + self.hlm = HardLinkManager(id_type=tuple, info_type=tuple) # (dev, ino) -> (hlid, chunks) self.stats = Statistics(output_json=log_json, iec=iec) # threading: done by cache (including progress) self.cwd = os.getcwd() self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse) @@ -1298,29 +1245,32 @@ def __init__(self, *, metadata_collector, cache, key, def create_helper(self, path, st, status=None, hardlinkable=True): safe_path = make_path_safe(path) item = Item(path=safe_path) - hardlink_master = False hardlinked = hardlinkable and st.st_nlink > 1 + update_map = False if hardlinked: - source = self.hard_links.get((st.st_ino, st.st_dev)) - if source is not None: - item.source = source - status = 'h' # hardlink (to already seen inodes) - else: - hardlink_master = True - yield item, status, hardlinked, hardlink_master - # if we get here, "with"-block worked ok without error/exception, the item was processed ok... + status = 'h' # hardlink + hlid, chunks = self.hlm.retrieve(id=(st.st_ino, st.st_dev), default=(None, None)) + if hlid is None: + update_map = True + hlid = self.hlm.hardlink_id(item._dict['path']) + item.hlid = hlid + if chunks is not None: + item.chunks = chunks + yield item, status, hardlinked self.add_item(item, stats=self.stats) - # ... and added to the archive, so we can remember it to refer to it later in the archive: - if hardlink_master: - self.hard_links[(st.st_ino, st.st_dev)] = safe_path + if update_map: + # remember the hlid of this fs object and if the item has chunks, + # also remember them, so we do not have to re-chunk a hardlink. + chunks = item.chunks if 'chunks' in item else None + self.hlm.remember(id=(st.st_ino, st.st_dev), info=(hlid, chunks)) def process_dir_with_fd(self, *, path, fd, st): - with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked, hardlink_master): + with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked): item.update(self.metadata_collector.stat_attrs(st, path, fd=fd)) return status def process_dir(self, *, path, parent_fd, name, st): - with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked, hardlink_master): + with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked): with OsOpen(path=path, parent_fd=parent_fd, name=name, flags=flags_dir, noatime=True, op='dir_open') as fd: # fd is None for directories on windows, in that case a race condition check is not possible. 
@@ -1331,7 +1281,7 @@ def process_dir(self, *, path, parent_fd, name, st): return status def process_fifo(self, *, path, parent_fd, name, st): - with self.create_helper(path, st, 'f') as (item, status, hardlinked, hardlink_master): # fifo + with self.create_helper(path, st, 'f') as (item, status, hardlinked): # fifo with OsOpen(path=path, parent_fd=parent_fd, name=name, flags=flags_normal, noatime=True) as fd: with backup_io('fstat'): st = stat_update_check(st, os.fstat(fd)) @@ -1339,7 +1289,7 @@ def process_fifo(self, *, path, parent_fd, name, st): return status def process_dev(self, *, path, parent_fd, name, st, dev_type): - with self.create_helper(path, st, dev_type) as (item, status, hardlinked, hardlink_master): # char/block device + with self.create_helper(path, st, dev_type) as (item, status, hardlinked): # char/block device # looks like we can not work fd-based here without causing issues when trying to open/close the device with backup_io('stat'): st = stat_update_check(st, os_stat(path=path, parent_fd=parent_fd, name=name, follow_symlinks=False)) @@ -1348,10 +1298,7 @@ def process_dev(self, *, path, parent_fd, name, st, dev_type): return status def process_symlink(self, *, path, parent_fd, name, st): - # note: using hardlinkable=False because we can not support hardlinked symlinks, - # due to the dual-use of item.source, see issue #2343: - # hardlinked symlinks will be archived [and extracted] as non-hardlinked symlinks. - with self.create_helper(path, st, 's', hardlinkable=False) as (item, status, hardlinked, hardlink_master): + with self.create_helper(path, st, 's', hardlinkable=True) as (item, status, hardlinked): fname = name if name is not None and parent_fd is not None else path with backup_io('readlink'): source = os.readlink(fname, dir_fd=parent_fd) @@ -1384,7 +1331,7 @@ def process_pipe(self, *, path, cache, fd, mode, user, group): return status def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal): - with self.create_helper(path, st, None) as (item, status, hardlinked, hardlink_master): # no status yet + with self.create_helper(path, st, None) as (item, status, hardlinked): # no status yet with OsOpen(path=path, parent_fd=parent_fd, name=name, flags=flags, noatime=True) as fd: with backup_io('fstat'): st = stat_update_check(st, os.fstat(fd)) @@ -1395,7 +1342,9 @@ def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal): # so it can be extracted / accessed in FUSE mount like a regular file. # this needs to be done early, so that part files also get the patched mode. 
item.mode = stat.S_IFREG | stat.S_IMODE(item.mode) - if not hardlinked or hardlink_master: + if 'chunks' in item: # create_helper might have put chunks from a previous hardlink there + [cache.chunk_incref(id_, self.stats) for id_, _, _ in item.chunks] + else: # normal case, no "2nd+" hardlink if not is_special_file: hashed_path = safe_encode(os.path.join(self.cwd, path)) path_hash = self.key.id_hash(hashed_path) @@ -1420,7 +1369,6 @@ def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal): status = 'M' if known else 'A' # regular file, modified or added self.print_file_status(status, path) status = None # we already printed the status - item.hardlink_master = hardlinked # Only chunkify the file if needed if chunks is not None: item.chunks = chunks @@ -1444,7 +1392,7 @@ def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal): # also, we must not memorize a potentially inconsistent/corrupt file that # changed while we backed it up. cache.memorize_file(hashed_path, path_hash, st, [c.id for c in item.chunks]) - self.stats.nfiles += 1 + self.stats.nfiles += 1 item.update(self.metadata_collector.stat_ext_attrs(st, path, fd=fd)) item.get_size(memorize=True) return status @@ -1464,6 +1412,7 @@ def __init__(self, *, cache, key, self.stats = Statistics(output_json=log_json, iec=iec) # threading: done by cache (including progress) self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=False) + self.hlm = HardLinkManager(id_type=str, info_type=list) # path -> chunks @contextmanager def create_helper(self, tarinfo, status=None, type=None): @@ -1504,11 +1453,21 @@ def process_dev(self, *, tarinfo, status, type): item.rdev = os.makedev(tarinfo.devmajor, tarinfo.devminor) return status - def process_link(self, *, tarinfo, status, type): + def process_symlink(self, *, tarinfo, status, type): with self.create_helper(tarinfo, status, type) as (item, status): item.source = tarinfo.linkname return status + def process_hardlink(self, *, tarinfo, status, type): + with self.create_helper(tarinfo, status, type) as (item, status): + # create a not hardlinked borg item, reusing the chunks, see HardLinkManager.__doc__ + chunks = self.hlm.retrieve(tarinfo.linkname) + if chunks is not None: + item.chunks = chunks + item.get_size(memorize=True, from_chunks=True) + self.stats.nfiles += 1 + return status + def process_file(self, *, tarinfo, status, type, tar): with self.create_helper(tarinfo, status, type) as (item, status): self.print_file_status(status, tarinfo.name) @@ -1516,8 +1475,10 @@ def process_file(self, *, tarinfo, status, type, tar): fd = tar.extractfile(tarinfo) self.process_file_chunks(item, self.cache, self.stats, self.show_progress, backup_io_iter(self.chunker.chunkify(fd))) - item.get_size(memorize=True) + item.get_size(memorize=True, from_chunks=True) self.stats.nfiles += 1 + # we need to remember ALL files, see HardLinkManager.__doc__ + self.hlm.remember(id=tarinfo.name, info=item.chunks) return status @@ -2127,34 +2088,11 @@ def recreate(self, archive_name, comment=None, target_name=None): def process_items(self, archive, target): matcher = self.matcher - target_is_subset = not matcher.empty() - hardlink_masters = {} if target_is_subset else None - - def item_is_hardlink_master(item): - return (target_is_subset and - hardlinkable(item.mode) and - item.get('hardlink_master', True) and - 'source' not in item) for item in archive.iter_items(): if not matcher.match(item.path): self.print_file_status('x', item.path) - if 
item_is_hardlink_master(item): - hardlink_masters[item.path] = (item.get('chunks'), item.get('chunks_healthy'), None) continue - if target_is_subset and hardlinkable(item.mode) and item.get('source') in hardlink_masters: - # master of this hard link is outside the target subset - chunks, chunks_healthy, new_source = hardlink_masters[item.source] - if new_source is None: - # First item to use this master, move the chunks - item.chunks = chunks - if chunks_healthy is not None: - item.chunks_healthy = chunks_healthy - hardlink_masters[item.source] = (None, None, item.path) - del item.source - else: - # Master was already moved, only update this item's source - item.source = new_source if self.dry_run: self.print_file_status('-', item.path) else: @@ -2261,7 +2199,7 @@ def exclude(dir, tag_item): tag_files = [] tagged_dirs = [] - # to support reading hard-linked CACHEDIR.TAGs (aka CACHE_TAG_NAME), similar to hardlink_masters: + # to support reading hard-linked CACHEDIR.TAGs (aka CACHE_TAG_NAME): cachedir_masters = {} if self.exclude_caches: diff --git a/src/borg/archiver.py b/src/borg/archiver.py index eac059bb73..4b6fd1782c 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -59,7 +59,7 @@ from .helpers import timestamp from .helpers import get_cache_dir, os_stat from .helpers import Manifest, AI_HUMAN_SORT_KEYS - from .helpers import hardlinkable + from .helpers import HardLinkManager from .helpers import StableDict from .helpers import check_python, check_extension_modules from .helpers import dir_is_tagged, is_slow_msgpack, is_supported_msgpack, yes, sysinfo @@ -347,12 +347,12 @@ def do_transfer(self, args, *, def upgrade_item(item): """upgrade item as needed, get rid of legacy crap""" - if item.get('hardlink_master', True) and 'source' not in item and hardlinkable(item.mode): - item._dict['hlid'] = hlid = hashlib.sha256(item._dict['path']) - hardlink_masters[hlid] = (item._dict.get('chunks'), item._dict.get('chunks_healthy')) - elif 'source' in item and hardlinkable(item.mode): - item._dict['hlid'] = hlid = hashlib.sha256(item._dict['source']) - chunks, chunks_healthy = hardlink_masters.get(hlid, (None, None)) + if hlm.borg1_hardlink_master(item): + item._dict['hlid'] = hlid = hlm.hardlink_id(item._dict['path']) + hlm.remember(id=hlid, info=(item._dict.get('chunks'), item._dict.get('chunks_healthy'))) + elif hlm.borg1_hardlink_slave(item): + item._dict['hlid'] = hlid = hlm.hardlink_id(item._dict['source']) + chunks, chunks_healthy = hlm.retrieve(id=hlid, default=(None, None)) if chunks is not None: item._dict['chunks'] = chunks for chunk_id, _, _ in chunks: @@ -389,7 +389,7 @@ def upgrade_compressed_chunk(chunk): else: if not dry_run: print(f"{name}: copying archive to destination repo...") - hardlink_masters = {} + hlm = HardLinkManager(id_type=bytes, info_type=tuple) # hlid -> (chunks, chunks_healthy) other_archive = Archive(other_repository, other_key, other_manifest, name) archive = Archive(repository, key, manifest, name, cache=cache, create=True) if not dry_run else None for item in other_archive.iter_items(): @@ -1154,16 +1154,14 @@ def _rec_walk(self, *, path, parent_fd, name, fso, cache, matcher, self.print_file_status(status, path) @staticmethod - def build_filter(matcher, peek_and_store_hardlink_masters, strip_components): + def build_filter(matcher, strip_components): if strip_components: def item_filter(item): matched = matcher.match(item.path) and os.sep.join(item.path.split(os.sep)[strip_components:]) - peek_and_store_hardlink_masters(item, matched) return 
matched else: def item_filter(item): matched = matcher.match(item.path) - peek_and_store_hardlink_masters(item, matched) return matched return item_filter @@ -1186,33 +1184,18 @@ def do_extract(self, args, repository, manifest, key, archive): sparse = args.sparse strip_components = args.strip_components dirs = [] - partial_extract = not matcher.empty() or strip_components - hardlink_masters = {} if partial_extract or not has_link else None + hlm = HardLinkManager(id_type=bytes, info_type=str) # hlid -> path - def peek_and_store_hardlink_masters(item, matched): - # not has_link: - # OS does not have hardlink capability thus we need to remember the chunks so that - # we can extract all hardlinks as separate normal (not-hardlinked) files instead. - # - # partial_extract and not matched and hardlinkable: - # we do not extract the very first hardlink, so we need to remember the chunks - # in hardlinks_master, so we can use them when we extract some 2nd+ hardlink item - # that has no chunks list. - if ((not has_link or (partial_extract and not matched and hardlinkable(item.mode))) and - (item.get('hardlink_master', True) and 'source' not in item)): - hardlink_masters[item.get('path')] = (item.get('chunks'), None) - - filter = self.build_filter(matcher, peek_and_store_hardlink_masters, strip_components) + filter = self.build_filter(matcher, strip_components) if progress: pi = ProgressIndicatorPercent(msg='%5.1f%% Extracting: %s', step=0.1, msgid='extract') pi.output('Calculating total archive size for the progress indicator (might take long for large archives)') - extracted_size = sum(item.get_size(hardlink_masters) for item in archive.iter_items(filter)) + extracted_size = sum(item.get_size() for item in archive.iter_items(filter)) pi.total = extracted_size else: pi = None - for item in archive.iter_items(filter, partial_extract=partial_extract, - preload=True, hardlink_masters=hardlink_masters): + for item in archive.iter_items(filter, preload=True): orig_path = item.path if strip_components: item.path = os.sep.join(orig_path.split(os.sep)[strip_components:]) @@ -1227,13 +1210,13 @@ def peek_and_store_hardlink_masters(item, matched): logging.getLogger('borg.output.list').info(remove_surrogates(item.path)) try: if dry_run: - archive.extract_item(item, dry_run=True, pi=pi) + archive.extract_item(item, dry_run=True, hlm=hlm, pi=pi) else: if stat.S_ISDIR(item.mode): dirs.append(item) archive.extract_item(item, stdout=stdout, restore_attrs=False) else: - archive.extract_item(item, stdout=stdout, sparse=sparse, hardlink_masters=hardlink_masters, + archive.extract_item(item, stdout=stdout, sparse=sparse, hlm=hlm, stripped_components=strip_components, original_path=orig_path, pi=pi) except (BackupOSError, BackupError) as e: self.print_warning('%s: %s', remove_surrogates(orig_path), e) @@ -1298,15 +1281,9 @@ def _export_tar(self, args, archive, tarstream): progress = args.progress output_list = args.output_list strip_components = args.strip_components - partial_extract = not matcher.empty() or strip_components - hardlink_masters = {} if partial_extract else None - - def peek_and_store_hardlink_masters(item, matched): - if ((partial_extract and not matched and hardlinkable(item.mode)) and - (item.get('hardlink_master', True) and 'source' not in item)): - hardlink_masters[item.get('path')] = (item.get('chunks'), None) + hlm = HardLinkManager(id_type=bytes, info_type=str) # hlid -> path - filter = self.build_filter(matcher, peek_and_store_hardlink_masters, strip_components) + filter = 
self.build_filter(matcher, strip_components) # The | (pipe) symbol instructs tarfile to use a streaming mode of operation # where it never seeks on the passed fileobj. @@ -1316,7 +1293,7 @@ def peek_and_store_hardlink_masters(item, matched): if progress: pi = ProgressIndicatorPercent(msg='%5.1f%% Processing: %s', step=0.1, msgid='extract') pi.output('Calculating size') - extracted_size = sum(item.get_size(hardlink_masters) for item in archive.iter_items(filter)) + extracted_size = sum(item.get_size() for item in archive.iter_items(filter)) pi.total = extracted_size else: pi = None @@ -1351,9 +1328,8 @@ def item_to_tarinfo(item, original_path): tarinfo.gid = item.gid tarinfo.uname = item.user or '' tarinfo.gname = item.group or '' - # The linkname in tar has the same dual use the 'source' attribute of Borg items, - # i.e. for symlinks it means the destination, while for hardlinks it refers to the - # file. + # The linkname in tar has 2 uses: + # for symlinks it means the destination, while for hardlinks it refers to the file. # Since hardlinks in tar have a different type code (LNKTYPE) the format might # support hardlinking arbitrary objects (including symlinks and directories), but # whether implementations actually support that is a whole different question... @@ -1362,23 +1338,16 @@ def item_to_tarinfo(item, original_path): modebits = stat.S_IFMT(item.mode) if modebits == stat.S_IFREG: tarinfo.type = tarfile.REGTYPE - if 'source' in item: - source = os.sep.join(item.source.split(os.sep)[strip_components:]) - if hardlink_masters is None: - linkname = source - else: - chunks, linkname = hardlink_masters.get(item.source, (None, source)) - if linkname: - # Master was already added to the archive, add a hardlink reference to it. + if 'hlid' in item: + linkname = hlm.retrieve(id=item.hlid) + if linkname is not None: + # the first hardlink was already added to the archive, add a tar-hardlink reference to it. tarinfo.type = tarfile.LNKTYPE tarinfo.linkname = linkname - elif chunks is not None: - # The item which has the chunks was not put into the tar, therefore - # we do that now and update hardlink_masters to reflect that. - item.chunks = chunks + else: tarinfo.size = item.get_size() stream = item_content_stream(item) - hardlink_masters[item.get('source') or original_path] = (None, item.path) + hlm.remember(id=item.hlid, info=item.path) else: tarinfo.size = item.get_size() stream = item_content_stream(item) @@ -1436,8 +1405,7 @@ def item_to_paxheaders(format, item): ph['BORG.item.meta'] = meta_text return ph - for item in archive.iter_items(filter, partial_extract=partial_extract, - preload=True, hardlink_masters=hardlink_masters): + for item in archive.iter_items(filter, preload=True): orig_path = item.path if strip_components: item.path = os.sep.join(orig_path.split(os.sep)[strip_components:]) @@ -2072,12 +2040,11 @@ def _import_tar(self, args, repository, manifest, key, cache, tarstream): elif tarinfo.isdir(): status = tfo.process_dir(tarinfo=tarinfo, status='d', type=stat.S_IFDIR) elif tarinfo.issym(): - status = tfo.process_link(tarinfo=tarinfo, status='s', type=stat.S_IFLNK) + status = tfo.process_symlink(tarinfo=tarinfo, status='s', type=stat.S_IFLNK) elif tarinfo.islnk(): - # tar uses the same hardlink model as borg (rather vice versa); the first instance of a hardlink - # is stored as a regular file, later instances are special entries referencing back to the - # first instance. 
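For readers who want to see the tar semantics this relies on, here is a minimal
standalone sketch using only the stdlib tarfile module (the archive name is
made up): the first instance of a hardlink arrives as a regular file entry
carrying the data, later instances arrive as LNKTYPE entries whose linkname
points back at the first one.

    import tarfile

    with tarfile.open('example.tar') as tar:  # hypothetical archive
        for ti in tar:
            if ti.isreg():
                print(f'regular file: {ti.name} ({ti.size} bytes)')
            elif ti.islnk():
                print(f'hardlink:     {ti.name} -> {ti.linkname}')
            elif ti.issym():
                print(f'symlink:      {ti.name} -> {ti.linkname}')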
- status = tfo.process_link(tarinfo=tarinfo, status='h', type=stat.S_IFREG) + # tar uses a hardlink model like: the first instance of a hardlink is stored as a regular file, + # later instances are special entries referencing back to the first instance. + status = tfo.process_hardlink(tarinfo=tarinfo, status='h', type=stat.S_IFREG) elif tarinfo.isblk(): status = tfo.process_dev(tarinfo=tarinfo, status='b', type=stat.S_IFBLK) elif tarinfo.ischr(): diff --git a/src/borg/fuse.py b/src/borg/fuse.py index e2ef8eaa9a..b81f37f1f9 100644 --- a/src/borg/fuse.py +++ b/src/borg/fuse.py @@ -35,7 +35,8 @@ def async_wrapper(fn): from .archiver import Archiver from .archive import Archive, get_item_uid_gid from .hashindex import FuseVersionsIndex -from .helpers import daemonize, daemonizing, hardlinkable, signal_handler, format_file_size, Error +from .helpers import daemonize, daemonizing, signal_handler, format_file_size, Error +from .helpers import HardLinkManager from .helpers import msgpack from .item import Item from .lrucache import LRUCache @@ -339,15 +340,9 @@ def _process_archive(self, archive_name, prefix=[]): consider_part_files=self._args.consider_part_files) strip_components = self._args.strip_components matcher = Archiver.build_matcher(self._args.patterns, self._args.paths) - partial_extract = not matcher.empty() or strip_components - hardlink_masters = {} if partial_extract else None + hlm = HardLinkManager(id_type=bytes, info_type=str) # hlid -> path - def peek_and_store_hardlink_masters(item, matched): - if (partial_extract and not matched and hardlinkable(item.mode) and - item.get('hardlink_master', True) and 'source' not in item): - hardlink_masters[item.get('path')] = (item.get('chunks'), None) - - filter = Archiver.build_filter(matcher, peek_and_store_hardlink_masters, strip_components) + filter = Archiver.build_filter(matcher, strip_components) for item_inode, item in self.cache.iter_archive_items(archive.metadata.items, filter=filter, consider_part_files=self._args.consider_part_files): if strip_components: @@ -369,15 +364,13 @@ def peek_and_store_hardlink_masters(item, matched): parent = 1 for segment in segments[:-1]: parent = self._process_inner(segment, parent) - self._process_leaf(segments[-1], item, parent, prefix, is_dir, item_inode, - hardlink_masters, strip_components) + self._process_leaf(segments[-1], item, parent, prefix, is_dir, item_inode, hlm) duration = time.perf_counter() - t0 logger.debug('fuse: _process_archive completed in %.1f s for archive %s', duration, archive.name) - def _process_leaf(self, name, item, parent, prefix, is_dir, item_inode, hardlink_masters, stripped_components): + def _process_leaf(self, name, item, parent, prefix, is_dir, item_inode, hlm): path = item.path del item.path # save some space - hardlink_masters = hardlink_masters or {} def file_version(item, path): if 'chunks' in item: @@ -402,10 +395,9 @@ def make_versioned_name(name, version, add_dir=False): version_enc = os.fsencode('.%05d' % version) return name + version_enc + ext - if 'source' in item and hardlinkable(item.mode): - source = os.sep.join(item.source.split(os.sep)[stripped_components:]) - chunks, link_target = hardlink_masters.get(item.source, (None, source)) - if link_target: + if 'hlid' in item: + link_target = hlm.retrieve(id=item.hlid, default=None) + if link_target is not None: # Hard link was extracted previously, just link link_target = os.fsencode(link_target) if self.versions: @@ -415,19 +407,16 @@ def make_versioned_name(name, version, add_dir=False): try: inode = 
self.find_inode(link_target, prefix) except KeyError: - logger.warning('Skipping broken hard link: %s -> %s', path, source) + logger.warning('Skipping broken hard link: %s -> %s', path, link_target) return item = self.get_item(inode) item.nlink = item.get('nlink', 1) + 1 self._items[inode] = item - elif chunks is not None: - # assign chunks to this item, since the item which had the chunks was not extracted - item.chunks = chunks + else: inode = item_inode self._items[inode] = item - if hardlink_masters: - # Update master entry with extracted item path, so that following hardlinks don't extract twice. - hardlink_masters[item.source] = (None, path) + # remember extracted item path, so that following hardlinks don't extract twice. + hlm.remember(id=item.hlid, info=path) else: inode = item_inode @@ -436,7 +425,7 @@ def make_versioned_name(name, version, add_dir=False): enc_path = os.fsencode(path) version = file_version(item, enc_path) if version is not None: - # regular file, with contents - maybe a hardlink master + # regular file, with contents name = make_versioned_name(name, version) self.file_versions[enc_path] = version diff --git a/src/borg/helpers/fs.py b/src/borg/helpers/fs.py index d1a412da08..89b54a09d3 100644 --- a/src/borg/helpers/fs.py +++ b/src/borg/helpers/fs.py @@ -1,4 +1,5 @@ import errno +import hashlib import os import os.path import re @@ -165,9 +166,70 @@ def make_path_safe(path): return _safe_re.sub('', path) or '.' -def hardlinkable(mode): - """return True if we support hardlinked items of this type""" - return stat.S_ISREG(mode) or stat.S_ISBLK(mode) or stat.S_ISCHR(mode) or stat.S_ISFIFO(mode) +class HardLinkManager: + """ + Manage hardlinks (and avoid code duplication doing so). + + A) When creating a borg2 archive from the filesystem, we have to maintain a mapping like: + (dev, ino) -> (hlid, chunks) # for fs_hl_targets + If we encounter the same (dev, ino) again later, we'll just re-use the hlid and chunks list. + + B) When extracting a borg2 archive to the filesystem, we have to maintain a mapping like: + hlid -> path + If we encounter the same hlid again later, we hardlink to the path of the already extracted content of same hlid. + + C) When transferring from a borg1 archive, we need: + path -> chunks, chunks_healthy # for borg1_hl_targets + If we encounter a regular file item with source == path later, we reuse chunks and chunks_healthy + and create the same hlid = hardlink_id(source). + + D) When importing a tar file (simplified 1-pass way for now, not creating borg hardlink items): + path -> chunks + If we encounter a LNK tar entry later with linkname==path, we re-use the chunks and create a regular file item. + For better hardlink support (including the very first hardlink item for each group of same-target hardlinks), + we would need a 2-pass processing, which is not yet implemented. 
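To make the extraction case (B) concrete, a minimal sketch of the intended call
pattern; extract_contents is a made-up placeholder for the real extraction code
and error handling is omitted:

    import os

    hlm = HardLinkManager(id_type=bytes, info_type=str)  # hlid -> path

    for item in archive_items:               # hypothetical iterable of Items
        if 'hlid' not in item:
            extract_contents(item)           # not a hardlink, extract normally
            continue
        path = hlm.retrieve(id=item.hlid)
        if path is None:
            extract_contents(item)           # first of its hardlink group
            hlm.remember(id=item.hlid, info=item.path)
        else:
            os.link(path, item.path)         # 2nd+ of the group: just hardlink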
+ """ + def __init__(self, *, id_type, info_type): + self._map = {} + self.id_type = id_type + self.info_type = info_type + + def borg1_hardlinkable(self, mode): # legacy + return stat.S_ISREG(mode) or stat.S_ISBLK(mode) or stat.S_ISCHR(mode) or stat.S_ISFIFO(mode) + + def borg1_hardlink_master(self, item): # legacy + return item.get('hardlink_master', True) and 'source' not in item and self.borg1_hardlinkable(item.mode) + + def borg1_hardlink_slave(self, item): # legacy + return 'source' in item and self.borg1_hardlinkable(item.mode) + + def hardlink_id(self, path): + """compute a hardlink id from a path""" + assert isinstance(path, bytes) + return hashlib.sha256(path).digest() + + def remember(self, *, id, info): + """ + remember stuff from a (usually contentful) item. + + :param id: some id used to reference to the contentful item, could be: + a path (tar style, old borg style) [bytes] + a hlid (new borg style) [bytes] + a (dev, inode) tuple (filesystem) + :param info: information to remember, could be: + chunks / chunks_healthy list + hlid + """ + assert isinstance(id, self.id_type) + assert isinstance(info, self.info_type) + self._map[id] = info + + def retrieve(self, id, *, default=None): + """ + retrieve stuff to use it in a (usually contentless) item. + """ + assert isinstance(id, self.id_type) + return self._map.get(id, default) def scandir_keyfunc(dirent): diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 9bcbce22c8..414402de0f 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -695,7 +695,8 @@ class ItemFormatter(BaseFormatter): KEY_DESCRIPTIONS = { 'bpath': 'verbatim POSIX path, can contain any character except NUL', 'path': 'path interpreted as text (might be missing non-text characters, see bpath)', - 'source': 'link target for links (identical to linktarget)', + 'source': 'link target for symlinks (identical to linktarget)', + 'hlid': 'hard link identity (same if hardlinking same fs object)', 'extra': 'prepends {source} with " -> " for soft links and " link to " for hard links', 'csize': 'compressed size', 'dsize': 'deduplicated size', @@ -706,7 +707,7 @@ class ItemFormatter(BaseFormatter): 'health': 'either "healthy" (file ok) or "broken" (if file has all-zero replacement chunks)', } KEY_GROUPS = ( - ('type', 'mode', 'uid', 'gid', 'user', 'group', 'path', 'bpath', 'source', 'linktarget', 'flags'), + ('type', 'mode', 'uid', 'gid', 'user', 'group', 'path', 'bpath', 'source', 'linktarget', 'hlid', 'flags'), ('size', 'csize', 'dsize', 'dcsize', 'num_chunks', 'unique_chunks'), ('mtime', 'ctime', 'atime', 'isomtime', 'isoctime', 'isoatime'), tuple(sorted(hash_algorithms)), @@ -802,11 +803,9 @@ def get_item_data(self, item): extra = '' if source: source = remove_surrogates(source) - if item_type == 'l': - extra = ' -> %s' % source - else: - mode = 'h' + mode[1:] - extra = ' link to %s' % source + extra = ' -> %s' % source + hlid = item.get('hlid') + hlid = bin_to_hex(hlid) if hlid else '' item_data['type'] = item_type item_data['mode'] = mode item_data['user'] = item.user or item.uid @@ -822,6 +821,7 @@ def get_item_data(self, item): item_data['health'] = 'broken' if 'chunks_healthy' in item else 'healthy' item_data['source'] = source item_data['linktarget'] = source + item_data['hlid'] = hlid item_data['flags'] = item.get('bsdflags') for key in self.used_call_keys: item_data[key] = self.call_keys[key](item) diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 764279db03..9ea76f2de4 100644 --- 
a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -214,7 +214,6 @@ class Item(PropDict): except AttributeError: if stat.S_ISLNK(self.mode): # get out of here quickly. symlinks have no own chunks, their fs size is the length of the target name. - # also, there is the dual-use issue of .source (#2343), so don't confuse it with a hardlink slave. return len(self.source) # no precomputed (c)size value available, compute it: try: diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index b69fe819f2..402239455e 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -321,7 +321,7 @@ def create_regular_file(self, name, size=0, contents=None): contents = b'X' * size fd.write(contents) - def create_test_files(self): + def create_test_files(self, create_hardlinks=True): """Create a minimal test case including all supported file types """ # File @@ -332,7 +332,7 @@ def create_test_files(self): # File mode os.chmod('input/file1', 0o4755) # Hard link - if are_hardlinks_supported(): + if are_hardlinks_supported() and create_hardlinks: os.link(os.path.join(self.input_path, 'file1'), os.path.join(self.input_path, 'hardlink')) # Symlink @@ -432,7 +432,7 @@ def test_basic_functionality(self): self.assert_in(name, list_output) self.assert_dirs_equal('input', 'output/input') info_output = self.cmd('info', self.repository_location + '::test') - item_count = 4 if has_lchflags else 5 # one file is UF_NODUMP + item_count = 5 if has_lchflags else 6 # one file is UF_NODUMP self.assert_in('Number of files: %d' % item_count, info_output) shutil.rmtree(self.cache_path) info_output2 = self.cmd('info', self.repository_location + '::test') @@ -506,6 +506,29 @@ def test_symlink_extract(self): self.cmd('extract', self.repository_location + '::test') assert os.readlink('input/link1') == 'somewhere' + @pytest.mark.skipif(not are_symlinks_supported() or not are_hardlinks_supported(), + reason='symlinks or hardlinks not supported') + def test_hardlinked_symlinks_extract(self): + self.create_regular_file('target', size=1024) + with changedir('input'): + os.symlink('target', 'symlink1') + os.link('symlink1', 'symlink2', follow_symlinks=False) + self.cmd('init', '--encryption=repokey', self.repository_location) + self.cmd('create', self.repository_location + '::test', 'input') + with changedir('output'): + output = self.cmd('extract', self.repository_location + '::test') + print(output) + with changedir('input'): + assert os.path.exists('target') + assert os.readlink('symlink1') == 'target' + assert os.readlink('symlink2') == 'target' + st1 = os.stat('symlink1', follow_symlinks=False) + st2 = os.stat('symlink2', follow_symlinks=False) + assert st1.st_nlink == 2 + assert st2.st_nlink == 2 + assert st1.st_ino == st2.st_ino + assert st1.st_size == st2.st_size + @pytest.mark.skipif(not is_utime_fully_supported(), reason='cannot properly setup and execute test without utime') def test_atime(self): def has_noatime(some_file): @@ -2661,7 +2684,7 @@ def test_fuse_versions_view(self): hl3 = os.path.join(mountpoint, 'input', 'hardlink3', 'hardlink3.00001') assert os.stat(hl1).st_ino == os.stat(hl2).st_ino == os.stat(hl3).st_ino assert open(hl3, 'rb').read() == b'123456' - # similar again, but exclude the hardlink master: + # similar again, but exclude the 1st hardlink: with self.fuse_mount(self.repository_location, mountpoint, '-o', 'versions', '-e', 'input/hardlink1'): if are_hardlinks_supported(): hl2 = os.path.join(mountpoint, 'input', 'hardlink2', 'hardlink2.00001') @@ -3475,7 +3498,7 @@ 
def test_extract_hardlinks_tar(self): assert os.stat('input/dir1/source2').st_nlink == 2 def test_import_tar(self, tar_format='PAX'): - self.create_test_files() + self.create_test_files(create_hardlinks=False) # hardlinks become separate files os.unlink('input/flagfile') self.cmd('init', '--encryption=none', self.repository_location) self.cmd('create', self.repository_location + '::src', 'input') @@ -3489,7 +3512,7 @@ def test_import_tar(self, tar_format='PAX'): def test_import_tar_gz(self, tar_format='GNU'): if not shutil.which('gzip'): pytest.skip('gzip is not installed') - self.create_test_files() + self.create_test_files(create_hardlinks=False) # hardlinks become separate files os.unlink('input/flagfile') self.cmd('init', '--encryption=none', self.repository_location) self.cmd('create', self.repository_location + '::src', 'input') @@ -4444,26 +4467,23 @@ def ccc(a, b): class TestBuildFilter: - @staticmethod - def peek_and_store_hardlink_masters(item, matched): - pass def test_basic(self): matcher = PatternMatcher() matcher.add([parse_pattern('included')], IECommand.Include) - filter = Archiver.build_filter(matcher, self.peek_and_store_hardlink_masters, 0) + filter = Archiver.build_filter(matcher, 0) assert filter(Item(path='included')) assert filter(Item(path='included/file')) assert not filter(Item(path='something else')) def test_empty(self): matcher = PatternMatcher(fallback=True) - filter = Archiver.build_filter(matcher, self.peek_and_store_hardlink_masters, 0) + filter = Archiver.build_filter(matcher, 0) assert filter(Item(path='anything')) def test_strip_components(self): matcher = PatternMatcher(fallback=True) - filter = Archiver.build_filter(matcher, self.peek_and_store_hardlink_masters, strip_components=1) + filter = Archiver.build_filter(matcher, strip_components=1) assert not filter(Item(path='shallow')) assert not filter(Item(path='shallow/')) # can this even happen? paths are normalized... assert filter(Item(path='deep enough/file')) From e5f1a4fb4d74cc5cdc1cd38f14200f4db782a35a Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 11 May 2022 23:52:04 +0200 Subject: [PATCH 08/14] recreate: cachedir_masters not needed any more now all hardlinked regular file items have chunks. --- src/borg/archive.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index da083eac6b..ed17465ea3 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -2199,30 +2199,13 @@ def exclude(dir, tag_item): tag_files = [] tagged_dirs = [] - # to support reading hard-linked CACHEDIR.TAGs (aka CACHE_TAG_NAME): - cachedir_masters = {} - - if self.exclude_caches: - # sadly, due to how CACHEDIR.TAG works (filename AND file [header] contents) and - # how borg deals with hardlinks (slave hardlinks referring back to master hardlinks), - # we need to pass over the archive collecting hardlink master paths. - # as seen in issue #4911, the master paths can have an arbitrary filenames, - # not just CACHEDIR.TAG. 
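Since borg2 regular-file items always carry their own chunks (hardlinked or
not), any candidate tag file can now be opened directly. A hedged sketch of
the simplified check, using open_item, CACHE_TAG_NAME and CACHE_TAG_CONTENTS
as they appear in the hunk below:

    import os
    import stat

    def is_valid_cache_tag(archive, item):
        # no hardlink-master lookup needed any more: 'chunks' is always there
        if os.path.basename(item.path) != CACHE_TAG_NAME or not stat.S_ISREG(item.mode):
            return False
        file = open_item(archive, item)  # file-like object over the item's chunks
        return file.read(len(CACHE_TAG_CONTENTS)) == CACHE_TAG_CONTENTS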
- for item in archive.iter_items(filter=lambda item: os.path.basename(item.path) == CACHE_TAG_NAME): - if stat.S_ISREG(item.mode) and 'chunks' not in item and 'source' in item: - # this is a hardlink slave, referring back to its hardlink master (via item.source) - cachedir_masters[item.source] = None # we know the key (path), but not the value (item) yet - for item in archive.iter_items( filter=lambda item: os.path.basename(item.path) == CACHE_TAG_NAME or matcher.match(item.path)): - if self.exclude_caches and item.path in cachedir_masters: - cachedir_masters[item.path] = item dir, tag_file = os.path.split(item.path) if tag_file in self.exclude_if_present: exclude(dir, item) elif self.exclude_caches and tag_file == CACHE_TAG_NAME and stat.S_ISREG(item.mode): - content_item = item if 'chunks' in item else cachedir_masters[item.source] - file = open_item(archive, content_item) + file = open_item(archive, item) if file.read(len(CACHE_TAG_CONTENTS)) == CACHE_TAG_CONTENTS: exclude(dir, item) matcher.add(tag_files, IECommand.Include) From 27e06a1676773a469c08e83a909fea768cfb4f5c Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 17 May 2022 02:00:00 +0200 Subject: [PATCH 09/14] use version 2 for new archives but still be able to read v1 archives for borg transfer. --- src/borg/archive.py | 8 ++++---- src/borg/cache.py | 2 +- src/borg/testsuite/archiver.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index ed17465ea3..ba19e72bbb 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -472,7 +472,7 @@ def __init__(self, repository, key, manifest, name, cache=None, create=False, def _load_meta(self, id): data = self.key.decrypt(id, self.repository.get(id)) metadata = ArchiveItem(internal_dict=msgpack.unpackb(data)) - if metadata.version != 1: + if metadata.version not in (1, 2): # legacy: still need to read v1 archives raise Exception('Unknown archive metadata version') return metadata @@ -601,7 +601,7 @@ def save(self, name=None, comment=None, timestamp=None, stats=None, additional_m self.start = start self.end = end metadata = { - 'version': 1, + 'version': 2, 'name': name, 'comment': comment or '', 'items': self.items_buffer.chunks, @@ -1748,7 +1748,7 @@ def valid_archive(obj): continue if not valid_msgpacked_dict(data, archive_keys_serialized): continue - if b'cmdline' not in data or b'\xa7version\x01' not in data: + if b'cmdline' not in data or b'\xa7version\x02' not in data: continue try: archive = msgpack.unpackb(data) @@ -1989,7 +1989,7 @@ def valid_item(obj): del self.manifest.archives[info.name] continue archive = ArchiveItem(internal_dict=msgpack.unpackb(data)) - if archive.version != 1: + if archive.version != 2: raise Exception('Unknown archive metadata version') archive.cmdline = [safe_decode(arg) for arg in archive.cmdline] items_buffer = ChunkBuffer(self.key) diff --git a/src/borg/cache.py b/src/borg/cache.py index 6cd6123590..58ceb541b4 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -757,7 +757,7 @@ def fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx): csize, data = decrypted_repository.get(archive_id) chunk_idx.add(archive_id, 1, len(data), csize) archive = ArchiveItem(internal_dict=msgpack.unpackb(data)) - if archive.version != 1: + if archive.version not in (1, 2): # legacy raise Exception('Unknown archive metadata version') sync = CacheSynchronizer(chunk_idx) for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)): diff --git 
a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index 402239455e..f82177fcdb 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -3873,7 +3873,7 @@ def test_manifest_rebuild_duplicate_archive(self): 'username': 'bar', 'name': 'archive1', 'time': '2016-12-15T18:49:51.849711', - 'version': 1, + 'version': 2, }) archive_id = key.id_hash(archive) repository.put(archive_id, key.encrypt(archive_id, archive)) @@ -3936,7 +3936,7 @@ def spoof_manifest(self, repository): with repository: _, key = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) repository.put(Manifest.MANIFEST_ID, key.encrypt(Manifest.MANIFEST_ID, msgpack.packb({ - 'version': 1, + 'version': 2, 'archives': {}, 'config': {}, 'timestamp': (datetime.utcnow() + timedelta(days=1)).strftime(ISO_FORMAT), From 4512707446bd92bb9b12b1e9b1bb4b365f856a4d Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 17 May 2022 17:21:19 +0200 Subject: [PATCH 10/14] use whitelist approach to make sure item._dict is clean --- src/borg/archiver.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 4b6fd1782c..23ab03d930 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -345,6 +345,11 @@ def do_transfer(self, args, *, other_repository=None, other_manifest=None, other_key=None): """archives transfer from other repository""" + ITEM_KEY_WHITELIST = {'path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hlid', + 'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime', 'birthtime', 'size', + 'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended', + 'part'} + def upgrade_item(item): """upgrade item as needed, get rid of legacy crap""" if hlm.borg1_hardlink_master(item): @@ -364,10 +369,14 @@ def upgrade_item(item): if attr in item: ns = getattr(item, attr) # decode (bigint or Timestamp) --> int ns setattr(item, attr, ns) # encode int ns --> msgpack.Timestamp only, no bigint any more - item._dict.pop('hardlink_master', None) # not used for hardlinks any more, replaced by hlid - item._dict.pop('acl', None) # remove remnants of bug in attic <= 0.13 - item.get_size(memorize=True) # if not already present: compute+remember size for items with chunks - return item + # make sure we only have desired stuff in the new item. 
specifically, make sure to get rid of: + # - 'acl' remnants of bug in attic <= 0.13 + # - 'hardlink_master' (superseded by hlid) + new_item_dict = {key: value for key, value in item.as_dict().items() if key in ITEM_KEY_WHITELIST} + new_item = Item(internal_dict=new_item_dict) + new_item.get_size(memorize=True) # if not already present: compute+remember size for items with chunks + assert all(key in new_item for key in REQUIRED_ITEM_KEYS) + return new_item def upgrade_compressed_chunk(chunk): if ZLIB_legacy.detect(chunk): From 8229ce25f9792c4e60caacc5ad4d8b3239c6f778 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 17 May 2022 19:46:52 +0200 Subject: [PATCH 11/14] compute hlid from inode / device --- src/borg/archive.py | 14 +++++++------- src/borg/archiver.py | 4 ++-- src/borg/helpers/fs.py | 14 ++++++++++---- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index ba19e72bbb..15cfc5d55e 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1236,7 +1236,7 @@ def __init__(self, *, metadata_collector, cache, key, self.show_progress = show_progress self.print_file_status = file_status_printer or (lambda *args: None) - self.hlm = HardLinkManager(id_type=tuple, info_type=tuple) # (dev, ino) -> (hlid, chunks) + self.hlm = HardLinkManager(id_type=tuple, info_type=(list, type(None))) # (dev, ino) -> chunks or None self.stats = Statistics(output_json=log_json, iec=iec) # threading: done by cache (including progress) self.cwd = os.getcwd() self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse) @@ -1249,20 +1249,20 @@ def create_helper(self, path, st, status=None, hardlinkable=True): update_map = False if hardlinked: status = 'h' # hardlink - hlid, chunks = self.hlm.retrieve(id=(st.st_ino, st.st_dev), default=(None, None)) - if hlid is None: + nothing = object() + chunks = self.hlm.retrieve(id=(st.st_ino, st.st_dev), default=nothing) + if chunks is nothing: update_map = True - hlid = self.hlm.hardlink_id(item._dict['path']) - item.hlid = hlid - if chunks is not None: + elif chunks is not None: item.chunks = chunks + item.hlid = self.hlm.hardlink_id_from_inode(ino=st.st_ino, dev=st.st_dev) yield item, status, hardlinked self.add_item(item, stats=self.stats) if update_map: # remember the hlid of this fs object and if the item has chunks, # also remember them, so we do not have to re-chunk a hardlink. 
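To see why this assigns the same hlid to every member of a hardlink group, a
small worked example (the file names are made up; the f'{ino}/{dev}' hash
input is exactly what hardlink_id_from_inode in this patch uses):

    import hashlib
    import os

    def hardlink_id_from_inode(*, ino, dev):
        return hashlib.sha256(f'{ino}/{dev}'.encode()).digest()

    st1 = os.stat('file1')  # 'file1' and 'file2': hypothetical hardlinks
    st2 = os.stat('file2')  # pointing at the same inode
    hlid1 = hardlink_id_from_inode(ino=st1.st_ino, dev=st1.st_dev)
    hlid2 = hardlink_id_from_inode(ino=st2.st_ino, dev=st2.st_dev)
    assert hlid1 == hlid2   # same (dev, ino) -> same 32-byte hlid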
chunks = item.chunks if 'chunks' in item else None - self.hlm.remember(id=(st.st_ino, st.st_dev), info=(hlid, chunks)) + self.hlm.remember(id=(st.st_ino, st.st_dev), info=chunks) def process_dir_with_fd(self, *, path, fd, st): with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked): diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 23ab03d930..7e0f67f486 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -353,10 +353,10 @@ def do_transfer(self, args, *, def upgrade_item(item): """upgrade item as needed, get rid of legacy crap""" if hlm.borg1_hardlink_master(item): - item._dict['hlid'] = hlid = hlm.hardlink_id(item._dict['path']) + item._dict['hlid'] = hlid = hlm.hardlink_id_from_path(item._dict['path']) hlm.remember(id=hlid, info=(item._dict.get('chunks'), item._dict.get('chunks_healthy'))) elif hlm.borg1_hardlink_slave(item): - item._dict['hlid'] = hlid = hlm.hardlink_id(item._dict['source']) + item._dict['hlid'] = hlid = hlm.hardlink_id_from_path(item._dict['source']) chunks, chunks_healthy = hlm.retrieve(id=hlid, default=(None, None)) if chunks is not None: item._dict['chunks'] = chunks diff --git a/src/borg/helpers/fs.py b/src/borg/helpers/fs.py index 89b54a09d3..fecda9c69e 100644 --- a/src/borg/helpers/fs.py +++ b/src/borg/helpers/fs.py @@ -181,7 +181,7 @@ class HardLinkManager: C) When transferring from a borg1 archive, we need: path -> chunks, chunks_healthy # for borg1_hl_targets If we encounter a regular file item with source == path later, we reuse chunks and chunks_healthy - and create the same hlid = hardlink_id(source). + and create the same hlid = hardlink_id_from_path(source). D) When importing a tar file (simplified 1-pass way for now, not creating borg hardlink items): path -> chunks @@ -203,11 +203,17 @@ def borg1_hardlink_master(self, item): # legacy def borg1_hardlink_slave(self, item): # legacy return 'source' in item and self.borg1_hardlinkable(item.mode) - def hardlink_id(self, path): + def hardlink_id_from_path(self, path): """compute a hardlink id from a path""" assert isinstance(path, bytes) return hashlib.sha256(path).digest() + def hardlink_id_from_inode(self, *, ino, dev): + """compute a hardlink id from an inode""" + assert isinstance(ino, int) + assert isinstance(dev, int) + return hashlib.sha256(f'{ino}/{dev}'.encode()).digest() + def remember(self, *, id, info): """ remember stuff from a (usually contentful) item. @@ -220,8 +226,8 @@ def remember(self, *, id, info): chunks / chunks_healthy list hlid """ - assert isinstance(id, self.id_type) - assert isinstance(info, self.info_type) + assert isinstance(id, self.id_type), f"key is {key!r}, not of type {self.key_type}" + assert isinstance(info, self.info_type), f"info is {info!r}, not of type {self.info_type}" self._map[id] = info def retrieve(self, id, *, default=None): From 1ed0ac22bc675c6075e71d1b3d1017c9108c3656 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 17 May 2022 22:54:12 +0200 Subject: [PATCH 12/14] compression: use the 2 bytes for type and level, fixes #6698 adapt borg transfer, transferred chunks are set to compression level "unknown". 
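A hedged sketch of the resulting chunk framing (type IDs and the 0xFF
"unknown" level byte are taken from the compress.pyx changes below; the
payload bytes are made up):

    # borg2 header: 1 type byte + 1 level byte, then the compressed payload.
    #   type:  0x00 none, 0x01 lz4, 0x02 lzma, 0x03 zstd, 0x05 zlib
    #   level: the compression level, or 0xFF meaning "unknown level"
    frame = b'\x03\x03' + b'...zstd data...'  # zstd, level 3, made-up payload
    ctype = frame[0:1]    # -> b'\x03' (zstd)
    level = frame[1]      # -> 3
    payload = frame[2:]   # what the matching decompressor actually consumes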
--- src/borg/archiver.py | 7 ++++- src/borg/compress.pyx | 56 ++++++++++++++++++++++----------------- src/borg/testsuite/key.py | 8 +++--- 3 files changed, 42 insertions(+), 29 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 7e0f67f486..7408931caa 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -379,8 +379,13 @@ def upgrade_item(item): return new_item def upgrade_compressed_chunk(chunk): + level = b'\xFF' # FF means unknown compression level if ZLIB_legacy.detect(chunk): - chunk = ZLIB.ID + chunk # get rid of the attic legacy: prepend separate type bytes for zlib + ctype = ZLIB.ID + chunk = ctype + level + chunk # get rid of the attic legacy: prepend separate type/level bytes + else: + ctype = chunk[0:1] + chunk = ctype + level + chunk[2:] # keep type same, but set level return chunk dry_run = args.dry_run diff --git a/src/borg/compress.pyx b/src/borg/compress.pyx index 7997456c6f..70c95df708 100644 --- a/src/borg/compress.pyx +++ b/src/borg/compress.pyx @@ -56,16 +56,21 @@ cdef class CompressorBase: also handles compression format auto detection and adding/stripping the ID header (which enable auto detection). """ - ID = b'\xFF\xFF' # reserved and not used - # overwrite with a unique 2-bytes bytestring in child classes + ID = b'\xFF' # reserved and not used + # overwrite with a unique 1-byte bytestring in child classes name = 'baseclass' @classmethod def detect(cls, data): return data.startswith(cls.ID) - def __init__(self, **kwargs): - pass + def __init__(self, level=255, **kwargs): + assert 0 <= level <= 255 + if self.ID is not None: + self.id_level = self.ID + bytes((level, )) # level 255 means "unknown level" + assert len(self.id_level) == 2 + else: + self.id_level = None def decide(self, data): """ @@ -85,8 +90,8 @@ cdef class CompressorBase: Compress *data* (bytes) and return bytes result. Prepend the ID bytes of this compressor, which is needed so that the correct decompressor can be used for decompression. """ - # add ID bytes - return self.ID + data + # add id_level bytes + return self.id_level + data def decompress(self, data): """ @@ -96,7 +101,7 @@ cdef class CompressorBase: Only handles input generated by _this_ Compressor - for a general purpose decompression method see *Compressor.decompress*. 
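A tiny round-trip sketch of this framing, assuming the defaults shown in this
patch (CNONE defaults to level 255, so its id_level is b'\x00\xff', and its
decompress inherits the behavior of stripping the 2 header bytes):

    c = CNONE()                        # "none" compressor, level "unknown"
    framed = c.compress(b'payload')    # -> b'\x00\xff' + b'payload'
    assert framed[:2] == b'\x00\xff'   # 1 type byte + 1 level byte
    assert c.decompress(framed) == b'payload'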
""" - # strip ID bytes + # strip id_level bytes return data[2:] cdef class DecidingCompressor(CompressorBase): @@ -106,8 +111,8 @@ cdef class DecidingCompressor(CompressorBase): """ name = 'decidebaseclass' - def __init__(self, **kwargs): - super().__init__(**kwargs) + def __init__(self, level=255, **kwargs): + super().__init__(level=level, **kwargs) def _decide(self, data): """ @@ -148,9 +153,12 @@ class CNONE(CompressorBase): """ none - no compression, just pass through data """ - ID = b'\x00\x00' + ID = b'\x00' name = 'none' + def __init__(self, level=255, **kwargs): + super().__init__(level=level, **kwargs) # no defined levels for CNONE, so just say "unknown" + def compress(self, data): return super().compress(data) @@ -170,11 +178,11 @@ class LZ4(DecidingCompressor): - wrapper releases CPython's GIL to support multithreaded code - uses safe lz4 methods that never go beyond the end of the output buffer """ - ID = b'\x01\x00' + ID = b'\x01' name = 'lz4' - def __init__(self, **kwargs): - pass + def __init__(self, level=255, **kwargs): + super().__init__(level=level, **kwargs) # no defined levels for LZ4, so just say "unknown" def _decide(self, idata): """ @@ -235,11 +243,11 @@ class LZMA(DecidingCompressor): """ lzma compression / decompression """ - ID = b'\x02\x00' + ID = b'\x02' name = 'lzma' def __init__(self, level=6, **kwargs): - super().__init__(**kwargs) + super().__init__(level=level, **kwargs) self.level = level if lzma is None: raise ValueError('No lzma support found.') @@ -270,11 +278,11 @@ class ZSTD(DecidingCompressor): # This is a NOT THREAD SAFE implementation. # Only ONE python context must be created at a time. # It should work flawlessly as long as borg will call ONLY ONE compression job at time. - ID = b'\x03\x00' + ID = b'\x03' name = 'zstd' def __init__(self, level=3, **kwargs): - super().__init__(**kwargs) + super().__init__(level=level, **kwargs) self.level = level def _decide(self, idata): @@ -335,11 +343,11 @@ class ZLIB(DecidingCompressor): """ zlib compression / decompression (python stdlib) """ - ID = b'\x05\x00' + ID = b'\x05' name = 'zlib' def __init__(self, level=6, **kwargs): - super().__init__(**kwargs) + super().__init__(level=level, **kwargs) self.level = level def _decide(self, data): @@ -373,8 +381,8 @@ class ZLIB_legacy(CompressorBase): Newer borg uses the ZLIB class that has separate ID bytes (as all the other compressors) and does not need this hack. """ - ID = b'\x08\x00' # not used here, see detect() - # avoid all 0x.8.. IDs elsewhere! + ID = b'\x08' # not used here, see detect() + # avoid all 0x.8 IDs elsewhere! name = 'zlib_legacy' @classmethod @@ -386,7 +394,7 @@ class ZLIB_legacy(CompressorBase): return check_ok and is_deflate def __init__(self, level=6, **kwargs): - super().__init__(**kwargs) + super().__init__(level=level, **kwargs) self.level = level def compress(self, data): @@ -478,14 +486,14 @@ class ObfuscateSize(CompressorBase): """ Meta-Compressor that obfuscates the compressed data size. 
""" - ID = b'\x04\x00' + ID = b'\x04' name = 'obfuscate' header_fmt = Struct('>I') header_len = len(header_fmt.pack(0)) def __init__(self, level=None, compressor=None): - super().__init__() + super().__init__(level=level) # data will be encrypted, so we can tell the level self.compressor = compressor if level is None: pass # decompression diff --git a/src/borg/testsuite/key.py b/src/borg/testsuite/key.py index e0be752fae..5073c5b23b 100644 --- a/src/borg/testsuite/key.py +++ b/src/borg/testsuite/key.py @@ -256,8 +256,8 @@ def test_authenticated_encrypt(self, monkeypatch): plaintext = b'123456789' id = key.id_hash(plaintext) authenticated = key.encrypt(id, plaintext) - # 0x07 is the key TYPE, \x0000 identifies no compression. - assert authenticated == b'\x07\x00\x00' + plaintext + # 0x07 is the key TYPE, \x00ff identifies no compression / unknown level. + assert authenticated == b'\x07\x00\xff' + plaintext def test_blake2_authenticated_encrypt(self, monkeypatch): monkeypatch.setenv('BORG_PASSPHRASE', 'test') @@ -267,8 +267,8 @@ def test_blake2_authenticated_encrypt(self, monkeypatch): plaintext = b'123456789' id = key.id_hash(plaintext) authenticated = key.encrypt(id, plaintext) - # 0x06 is the key TYPE, 0x0000 identifies no compression. - assert authenticated == b'\x06\x00\x00' + plaintext + # 0x06 is the key TYPE, 0x00ff identifies no compression / unknown level. + assert authenticated == b'\x06\x00\xff' + plaintext class TestTAM: From 154aab58811d07821757eb3e57e3e26659d08bb0 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 17 May 2022 23:29:58 +0200 Subject: [PATCH 13/14] obfuscation: fix byte order for size, fixes #6701 --- src/borg/archiver.py | 12 +++++++++++- src/borg/compress.pyx | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 7408931caa..faf6aa949e 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -29,6 +29,7 @@ from contextlib import contextmanager from datetime import datetime, timedelta from io import TextIOWrapper + from struct import Struct from .logger import create_logger, setup_logging @@ -44,7 +45,7 @@ from .archive import has_link from .cache import Cache, assert_secure, SecurityManager from .constants import * # NOQA - from .compress import CompressionSpec, ZLIB, ZLIB_legacy + from .compress import CompressionSpec, ZLIB, ZLIB_legacy, ObfuscateSize from .crypto.key import key_creator, key_argument_names, tam_required_file, tam_required from .crypto.key import RepoKey, KeyfileKey, Blake2RepoKey, Blake2KeyfileKey, FlexiKey from .crypto.keymanager import KeyManager @@ -386,6 +387,15 @@ def upgrade_compressed_chunk(chunk): else: ctype = chunk[0:1] chunk = ctype + level + chunk[2:] # keep type same, but set level + if ctype == ObfuscateSize.ID: + # in older borg, we used unusual byte order + old_header_fmt = Struct('>I') + new_header_fmt = ObfuscateSize.header_fmt + length = ObfuscateSize.header_len + size_bytes = chunk[2:2+length] + size = old_header_fmt.unpack(size_bytes) + size_bytes = new_header_fmt.pack(size) + chunk = chunk[0:2] + size_bytes + chunk[2+length:] return chunk dry_run = args.dry_run diff --git a/src/borg/compress.pyx b/src/borg/compress.pyx index 70c95df708..c63a04df3f 100644 --- a/src/borg/compress.pyx +++ b/src/borg/compress.pyx @@ -489,7 +489,7 @@ class ObfuscateSize(CompressorBase): ID = b'\x04' name = 'obfuscate' - header_fmt = Struct('>I') + header_fmt = Struct(' Date: Wed, 18 May 2022 14:47:47 +0200 Subject: [PATCH 14/14] upgrade compressed chunk: fix 
 treatment of ObfuscateSize chunks

the inner payload of an ObfuscateSize chunk is itself a compressed chunk and
needs the same zlib fix and level patching as a non-obfuscated compressed
chunk.
---
 src/borg/archiver.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index faf6aa949e..dc0db82f23 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -380,13 +380,18 @@ def upgrade_item(item):
             return new_item

         def upgrade_compressed_chunk(chunk):
+            def upgrade_zlib_and_level(chunk):
+                if ZLIB_legacy.detect(chunk):
+                    ctype = ZLIB.ID
+                    chunk = ctype + level + chunk  # get rid of the attic legacy: prepend separate type/level bytes
+                else:
+                    ctype = chunk[0:1]
+                    chunk = ctype + level + chunk[2:]  # keep type same, but set level
+                return chunk
+
+            ctype = chunk[0:1]
             level = b'\xFF'  # FF means unknown compression level
-            if ZLIB_legacy.detect(chunk):
-                ctype = ZLIB.ID
-                chunk = ctype + level + chunk  # get rid of the attic legacy: prepend separate type/level bytes
-            else:
-                ctype = chunk[0:1]
-                chunk = ctype + level + chunk[2:]  # keep type same, but set level
+
             if ctype == ObfuscateSize.ID:
                 # in older borg, we used unusual byte order
                 old_header_fmt = Struct('>I')
@@ -395,7 +400,11 @@ def upgrade_compressed_chunk(chunk):
                 size_bytes = chunk[2:2+length]
                 size = old_header_fmt.unpack(size_bytes)
                 size_bytes = new_header_fmt.pack(size)
-                chunk = chunk[0:2] + size_bytes + chunk[2+length:]
+                compressed = chunk[2+length:]
+                compressed = upgrade_zlib_and_level(compressed)
+                chunk = ctype + level + size_bytes + compressed
+            else:
+                chunk = upgrade_zlib_and_level(chunk)
             return chunk

         dry_run = args.dry_run
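To summarize what upgrade_compressed_chunk produces now, a worked byte-level
example (0x78 0x9c is a common zlib stream header and passes the checksum test
that ZLIB_legacy.detect() applies; all payload bytes are made up):

    # case 1: attic/borg1 zlib-legacy chunk = bare zlib stream, no type bytes
    old = b'\x78\x9c' + b'...deflate data...'
    new = b'\x05\xff' + old             # ZLIB.ID + "unknown level" + old stream

    # case 2: borg1 chunk with a 2-byte type ID, e.g. zstd b'\x03\x00'
    old = b'\x03\x00' + b'...zstd data...'
    new = old[0:1] + b'\xff' + old[2:]  # keep the type byte, set level to 0xFF

    # case 3: obfuscated chunks (type 0x04) additionally get their size field
    # repacked in the new byte order, and the inner compressed chunk is run
    # through the same two rules as above (upgrade_zlib_and_level).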