diff --git a/docs/faq.rst b/docs/faq.rst index 05f5176bc5..d16eff6c03 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -132,9 +132,6 @@ Which file types, attributes, etc. are *not* preserved? Archive extraction has optional support to extract all-zero chunks as holes in a sparse file. * Some filesystem specific attributes, like btrfs NOCOW, see :ref:`platforms`. - * For hardlinked symlinks, the hardlinking can not be archived (and thus, - the hardlinking will not be done at extraction time). The symlinks will - be archived and extracted as non-hardlinked symlinks, see :issue:`2379`. Are there other known limitations? ---------------------------------- diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst index 2c5b7c1948..d1a5a4cd38 100644 --- a/docs/internals/data-structures.rst +++ b/docs/internals/data-structures.rst @@ -567,7 +567,7 @@ dictionary created by the ``Item`` class that contains: * uid * gid * mode (item type + permissions) -* source (for symlinks, and for hardlinks within one archive) +* source (for symlinks) * rdev (for device files) * mtime, atime, ctime in nanoseconds * xattrs diff --git a/docs/usage/general/file-metadata.rst.inc b/docs/usage/general/file-metadata.rst.inc index 8f4c67cbfb..c2694d1888 100644 --- a/docs/usage/general/file-metadata.rst.inc +++ b/docs/usage/general/file-metadata.rst.inc @@ -10,7 +10,7 @@ Besides regular file and directory structures, Borg can preserve * FIFOs ("named pipes") * special file *contents* can be backed up in ``--read-special`` mode. By default the metadata to create them with mknod(2), mkfifo(2) etc. is stored. -* hardlinked regular files, devices, FIFOs (considering all items in the same archive) +* hardlinked regular files, devices, symlinks, FIFOs (considering all items in the same archive) * timestamps in nanosecond precision: mtime, atime, ctime * other timestamps: birthtime (on platforms supporting it) * permissions: diff --git a/src/borg/archive.py b/src/borg/archive.py index a9c145d49e..15cfc5d55e 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -28,7 +28,7 @@ from .crypto.low_level import IntegrityError as IntegrityErrorBase from .hashindex import ChunkIndex, ChunkIndexEntry, CacheSynchronizer from .helpers import Manifest -from .helpers import hardlinkable +from .helpers import HardLinkManager from .helpers import ChunkIteratorFileWrapper, open_item from .helpers import Error, IntegrityError, set_ec from .platform import uid2user, user2uid, gid2group, group2gid @@ -280,7 +280,7 @@ def __init__(self, repository, key): self.repository = repository self.key = key - def unpack_many(self, ids, filter=None, partial_extract=False, preload=False, hardlink_masters=None): + def unpack_many(self, ids, *, filter=None, preload=False): """ Return iterator of items. @@ -290,10 +290,7 @@ def unpack_many(self, ids, filter=None, partial_extract=False, preload=False, ha Warning: if *preload* is True then all data chunks of every yielded item have to be retrieved, otherwise preloaded chunks will accumulate in RemoteRepository and create a memory leak. 
""" - def _preload(chunks): - self.repository.preload([c.id for c in chunks]) - - masters_preloaded = set() + hlids_preloaded = set() unpacker = msgpack.Unpacker(use_list=False) for data in self.fetch_many(ids): unpacker.feed(data) @@ -306,33 +303,20 @@ def _preload(chunks): items = [item for item in items if filter(item)] if preload: - if filter and partial_extract: - # if we do only a partial extraction, it gets a bit - # complicated with computing the preload items: if a hardlink master item is not - # selected (== not extracted), we will still need to preload its chunks if a - # corresponding hardlink slave is selected (== is extracted). - # due to a side effect of the filter() call, we now have hardlink_masters dict populated. - for item in items: - if hardlinkable(item.mode): - source = item.get('source') - if source is None: # maybe a hardlink master - if 'chunks' in item: - _preload(item.chunks) - # if this is a hl master, remember that we already preloaded all chunks of it (if any): - if item.get('hardlink_master', True): - masters_preloaded.add(item.path) - else: # hardlink slave - if source not in masters_preloaded: - # we only need to preload *once* (for the 1st selected slave) - chunks, _ = hardlink_masters[source] - if chunks is not None: - _preload(chunks) - masters_preloaded.add(source) - else: - # easy: we do not have a filter, thus all items are selected, thus we need to preload all chunks. - for item in items: - if 'chunks' in item: - _preload(item.chunks) + for item in items: + if 'chunks' in item: + hlid = item.get('hlid', None) + if hlid is None: + preload_chunks = True + else: + if hlid in hlids_preloaded: + preload_chunks = False + else: + # not having the hardlink's chunks already preloaded for other hardlink to same inode + preload_chunks = True + hlids_preloaded.add(hlid) + if preload_chunks: + self.repository.preload([c.id for c in item.chunks]) for item in items: yield item @@ -443,7 +427,6 @@ def __init__(self, repository, key, manifest, name, cache=None, create=False, self.repository = repository self.cache = cache self.manifest = manifest - self.hard_links = {} self.stats = Statistics(output_json=log_json, iec=iec) self.iec = iec self.show_progress = progress @@ -489,7 +472,7 @@ def __init__(self, repository, key, manifest, name, cache=None, create=False, def _load_meta(self, id): data = self.key.decrypt(id, self.repository.get(id)) metadata = ArchiveItem(internal_dict=msgpack.unpackb(data)) - if metadata.version != 1: + if metadata.version not in (1, 2): # legacy: still need to read v1 archives raise Exception('Unknown archive metadata version') return metadata @@ -584,12 +567,10 @@ def item_filter(self, item, filter=None): return False return filter(item) if filter else True - def iter_items(self, filter=None, partial_extract=False, preload=False, hardlink_masters=None): + def iter_items(self, filter=None, preload=False): # note: when calling this with preload=True, later fetch_many() must be called with # is_preloaded=True or the RemoteRepository code will leak memory! 
- assert not (filter and partial_extract and preload) or hardlink_masters is not None - for item in self.pipeline.unpack_many(self.metadata.items, partial_extract=partial_extract, - preload=preload, hardlink_masters=hardlink_masters, + for item in self.pipeline.unpack_many(self.metadata.items, preload=preload, filter=lambda item: self.item_filter(item, filter)): yield item @@ -620,7 +601,7 @@ def save(self, name=None, comment=None, timestamp=None, stats=None, additional_m self.start = start self.end = end metadata = { - 'version': 1, + 'version': 2, 'name': name, 'comment': comment or '', 'items': self.items_buffer.chunks, @@ -719,33 +700,30 @@ def add(id): return stats @contextmanager - def extract_helper(self, dest, item, path, stripped_components, original_path, hardlink_masters): + def extract_helper(self, item, path, hlm, *, dry_run=False): hardlink_set = False # Hard link? - if 'source' in item: - source = os.path.join(dest, *item.source.split(os.sep)[stripped_components:]) - chunks, link_target = hardlink_masters.get(item.source, (None, source)) - if link_target and has_link: - # Hard link was extracted previously, just link - with backup_io('link'): - os.link(link_target, path) - hardlink_set = True - elif chunks is not None: - # assign chunks to this item, since the item which had the chunks was not extracted - item.chunks = chunks + if 'hlid' in item: + link_target = hlm.retrieve(id=item.hlid) + if link_target is not None and has_link: + if not dry_run: + # another hardlink to same inode (same hlid) was extracted previously, just link to it + with backup_io('link'): + os.link(link_target, path, follow_symlinks=False) + hardlink_set = True yield hardlink_set - if not hardlink_set and hardlink_masters: - if has_link: - # Update master entry with extracted item path, so that following hardlinks don't extract twice. + if not hardlink_set: + if 'hlid' in item and has_link: + # Update entry with extracted item path, so that following hardlinks don't extract twice. # We have hardlinking support, so we will hardlink not extract. - hardlink_masters[item.get('source') or original_path] = (None, path) + hlm.remember(id=item.hlid, info=path) else: # Broken platform with no hardlinking support. # In this case, we *want* to extract twice, because there is no other way. pass def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sparse=False, - hardlink_masters=None, stripped_components=0, original_path=None, pi=None): + hlm=None, stripped_components=0, original_path=None, pi=None): """ Extract archive item. 
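# Aside - a minimal, runnable sketch (not part of the patch) of the extract-side
# logic in extract_helper above: the first item of a hardlink group is extracted
# normally and its path remembered under its hlid; later items of the same group
# carry no chunks and are simply hardlinked to that path. Paths and hlids below
# are made up; os.link() is replaced by print() for the demo.
hlid_to_path = {}  # what HardLinkManager(id_type=bytes, info_type=str) stores

def extract(path, hlid, has_chunks):
    link_target = hlid_to_path.get(hlid)
    if link_target is not None:
        print(f'os.link({link_target!r}, {path!r})  # no chunk data fetched again')
        return
    print(f'extracting {path!r}' + (' from its chunks' if has_chunks else ''))
    hlid_to_path[hlid] = path  # remember for the next hardlink of this group

extract('dir/a', b'\x01' * 32, True)   # 1st hardlink: has chunks, extracted
extract('dir/b', b'\x01' * 32, False)  # 2nd hardlink: no chunks, just linked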
@@ -754,29 +732,33 @@ def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sp :param dry_run: do not write any data :param stdout: write extracted data to stdout :param sparse: write sparse files (chunk-granularity, independent of the original being sparse) - :param hardlink_masters: maps paths to (chunks, link_target) for extracting subtrees with hardlinks correctly + :param hlm: maps hlid to link_target for extracting subtrees with hardlinks correctly :param stripped_components: stripped leading path components to correct hard link extraction :param original_path: 'path' key as stored in archive :param pi: ProgressIndicatorPercent (or similar) for file extraction progress (in bytes) """ - hardlink_masters = hardlink_masters or {} has_damaged_chunks = 'chunks_healthy' in item if dry_run or stdout: - if 'chunks' in item: - item_chunks_size = 0 - for data in self.pipeline.fetch_many([c.id for c in item.chunks], is_preloaded=True): - if pi: - pi.show(increase=len(data), info=[remove_surrogates(item.path)]) - if stdout: - sys.stdout.buffer.write(data) - item_chunks_size += len(data) - if stdout: - sys.stdout.buffer.flush() - if 'size' in item: - item_size = item.size - if item_size != item_chunks_size: - raise BackupError('Size inconsistency detected: size {}, chunks size {}'.format( - item_size, item_chunks_size)) + with self.extract_helper(item, '', hlm, dry_run=dry_run or stdout) as hardlink_set: + if not hardlink_set: + # it does not really set hardlinks due to dry_run, but we need to behave same + # as non-dry_run concerning fetching preloaded chunks from the pipeline or + # it would get stuck. + if 'chunks' in item: + item_chunks_size = 0 + for data in self.pipeline.fetch_many([c.id for c in item.chunks], is_preloaded=True): + if pi: + pi.show(increase=len(data), info=[remove_surrogates(item.path)]) + if stdout: + sys.stdout.buffer.write(data) + item_chunks_size += len(data) + if stdout: + sys.stdout.buffer.flush() + if 'size' in item: + item_size = item.size + if item_size != item_chunks_size: + raise BackupError('Size inconsistency detected: size {}, chunks size {}'.format( + item_size, item_chunks_size)) if has_damaged_chunks: raise BackupError('File has damaged (all-zero) chunks. Try running borg check --repair.') return @@ -807,8 +789,7 @@ def make_parent(path): if stat.S_ISREG(mode): with backup_io('makedirs'): make_parent(path) - with self.extract_helper(dest, item, path, stripped_components, original_path, - hardlink_masters) as hardlink_set: + with self.extract_helper(item, path, hlm) as hardlink_set: if hardlink_set: return with backup_io('open'): @@ -847,24 +828,26 @@ def make_parent(path): self.restore_attrs(path, item) elif stat.S_ISLNK(mode): make_parent(path) - source = item.source - try: - os.symlink(source, path) - except UnicodeEncodeError: - raise self.IncompatibleFilesystemEncodingError(source, sys.getfilesystemencoding()) from None - self.restore_attrs(path, item, symlink=True) + with self.extract_helper(item, path, hlm) as hardlink_set: + if hardlink_set: + # unusual, but possible: this is a hardlinked symlink. 
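+                        # (borg 1.x archived hardlinked symlinks as independent symlinks,
+                        # because item.source was dual-use (symlink target / hardlink source),
+                        # see :issue:`2379` - the separate hlid attribute fixes this.)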
+ return + source = item.source + try: + os.symlink(source, path) + except UnicodeEncodeError: + raise self.IncompatibleFilesystemEncodingError(source, sys.getfilesystemencoding()) from None + self.restore_attrs(path, item, symlink=True) elif stat.S_ISFIFO(mode): make_parent(path) - with self.extract_helper(dest, item, path, stripped_components, original_path, - hardlink_masters) as hardlink_set: + with self.extract_helper(item, path, hlm) as hardlink_set: if hardlink_set: return os.mkfifo(path) self.restore_attrs(path, item) elif stat.S_ISCHR(mode) or stat.S_ISBLK(mode): make_parent(path) - with self.extract_helper(dest, item, path, stripped_components, original_path, - hardlink_masters) as hardlink_set: + with self.extract_helper(item, path, hlm) as hardlink_set: if hardlink_set: return os.mknod(path, item.mode, item.rdev) @@ -1041,79 +1024,43 @@ def compare_archives_iter(archive1, archive2, matcher=None, can_compare_chunk_id :param can_compare_chunk_ids: Whether --chunker-params are the same for both archives. """ - def hardlink_master_seen(item): - return 'source' not in item or not hardlinkable(item.mode) or item.source in hardlink_masters - - def is_hardlink_master(item): - return item.get('hardlink_master', True) and 'source' not in item and hardlinkable(item.mode) - - def update_hardlink_masters(item1, item2): - if is_hardlink_master(item1) or is_hardlink_master(item2): - hardlink_masters[item1.path] = (item1, item2) - - def has_hardlink_master(item, hardlink_masters): - return hardlinkable(item.mode) and item.get('source') in hardlink_masters - def compare_items(item1, item2): - if has_hardlink_master(item1, hardlink_masters): - item1 = hardlink_masters[item1.source][0] - if has_hardlink_master(item2, hardlink_masters): - item2 = hardlink_masters[item2.source][1] return ItemDiff(item1, item2, archive1.pipeline.fetch_many([c.id for c in item1.get('chunks', [])]), archive2.pipeline.fetch_many([c.id for c in item2.get('chunks', [])]), can_compare_chunk_ids=can_compare_chunk_ids) - def defer_if_necessary(item1, item2): - """Adds item tuple to deferred if necessary and returns True, if items were deferred""" - update_hardlink_masters(item1, item2) - defer = not hardlink_master_seen(item1) or not hardlink_master_seen(item2) - if defer: - deferred.append((item1, item2)) - return defer - orphans_archive1 = OrderedDict() orphans_archive2 = OrderedDict() - deferred = [] - hardlink_masters = {} for item1, item2 in zip_longest( archive1.iter_items(lambda item: matcher.match(item.path)), archive2.iter_items(lambda item: matcher.match(item.path)), ): if item1 and item2 and item1.path == item2.path: - if not defer_if_necessary(item1, item2): - yield (item1.path, compare_items(item1, item2)) + yield (item1.path, compare_items(item1, item2)) continue if item1: matching_orphan = orphans_archive2.pop(item1.path, None) if matching_orphan: - if not defer_if_necessary(item1, matching_orphan): - yield (item1.path, compare_items(item1, matching_orphan)) + yield (item1.path, compare_items(item1, matching_orphan)) else: orphans_archive1[item1.path] = item1 if item2: matching_orphan = orphans_archive1.pop(item2.path, None) if matching_orphan: - if not defer_if_necessary(matching_orphan, item2): - yield (matching_orphan.path, compare_items(matching_orphan, item2)) + yield (matching_orphan.path, compare_items(matching_orphan, item2)) else: orphans_archive2[item2.path] = item2 # At this point orphans_* contain items that had no matching partner in the other archive for added in orphans_archive2.values(): path 
= added.path deleted_item = Item.create_deleted(path) - update_hardlink_masters(deleted_item, added) yield (path, compare_items(deleted_item, added)) for deleted in orphans_archive1.values(): path = deleted.path deleted_item = Item.create_deleted(path) - update_hardlink_masters(deleted, deleted_item) yield (path, compare_items(deleted, deleted_item)) - for item1, item2 in deferred: - assert hardlink_master_seen(item1) - assert hardlink_master_seen(item2) - yield (path, compare_items(item1, item2)) class MetadataCollector: @@ -1289,7 +1236,7 @@ def __init__(self, *, metadata_collector, cache, key, self.show_progress = show_progress self.print_file_status = file_status_printer or (lambda *args: None) - self.hard_links = {} + self.hlm = HardLinkManager(id_type=tuple, info_type=(list, type(None))) # (dev, ino) -> chunks or None self.stats = Statistics(output_json=log_json, iec=iec) # threading: done by cache (including progress) self.cwd = os.getcwd() self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse) @@ -1298,29 +1245,32 @@ def __init__(self, *, metadata_collector, cache, key, def create_helper(self, path, st, status=None, hardlinkable=True): safe_path = make_path_safe(path) item = Item(path=safe_path) - hardlink_master = False hardlinked = hardlinkable and st.st_nlink > 1 + update_map = False if hardlinked: - source = self.hard_links.get((st.st_ino, st.st_dev)) - if source is not None: - item.source = source - status = 'h' # hardlink (to already seen inodes) - else: - hardlink_master = True - yield item, status, hardlinked, hardlink_master - # if we get here, "with"-block worked ok without error/exception, the item was processed ok... + status = 'h' # hardlink + nothing = object() + chunks = self.hlm.retrieve(id=(st.st_ino, st.st_dev), default=nothing) + if chunks is nothing: + update_map = True + elif chunks is not None: + item.chunks = chunks + item.hlid = self.hlm.hardlink_id_from_inode(ino=st.st_ino, dev=st.st_dev) + yield item, status, hardlinked self.add_item(item, stats=self.stats) - # ... and added to the archive, so we can remember it to refer to it later in the archive: - if hardlink_master: - self.hard_links[(st.st_ino, st.st_dev)] = safe_path + if update_map: + # remember the hlid of this fs object and if the item has chunks, + # also remember them, so we do not have to re-chunk a hardlink. + chunks = item.chunks if 'chunks' in item else None + self.hlm.remember(id=(st.st_ino, st.st_dev), info=chunks) def process_dir_with_fd(self, *, path, fd, st): - with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked, hardlink_master): + with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked): item.update(self.metadata_collector.stat_attrs(st, path, fd=fd)) return status def process_dir(self, *, path, parent_fd, name, st): - with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked, hardlink_master): + with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked): with OsOpen(path=path, parent_fd=parent_fd, name=name, flags=flags_dir, noatime=True, op='dir_open') as fd: # fd is None for directories on windows, in that case a race condition check is not possible. 
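# Aside - a minimal, runnable sketch (not part of the patch) of the create-side
# hardlink handling in create_helper above: the first link of an inode group is
# chunked normally, any later link reuses the remembered chunks, and all links
# share the same hlid. The paths and inode/device numbers are made up.
import hashlib

def hardlink_id_from_inode(*, ino, dev):
    # same derivation as HardLinkManager.hardlink_id_from_inode in this patch
    return hashlib.sha256(f'{ino}/{dev}'.encode()).digest()

seen = {}  # (ino, dev) -> chunks (None until the first link has been chunked)
for path, ino, dev in [('dir/a', 7, 1), ('dir/b', 7, 1)]:
    hlid = hardlink_id_from_inode(ino=ino, dev=dev)
    if (ino, dev) in seen:
        print(f'{path}: 2nd+ hardlink, hlid {hlid.hex()[:8]}, reusing chunks')
    else:
        seen[(ino, dev)] = None  # would be replaced by the chunks list later
        print(f'{path}: 1st hardlink, hlid {hlid.hex()[:8]}, chunking the file')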
@@ -1331,7 +1281,7 @@ def process_dir(self, *, path, parent_fd, name, st): return status def process_fifo(self, *, path, parent_fd, name, st): - with self.create_helper(path, st, 'f') as (item, status, hardlinked, hardlink_master): # fifo + with self.create_helper(path, st, 'f') as (item, status, hardlinked): # fifo with OsOpen(path=path, parent_fd=parent_fd, name=name, flags=flags_normal, noatime=True) as fd: with backup_io('fstat'): st = stat_update_check(st, os.fstat(fd)) @@ -1339,7 +1289,7 @@ def process_fifo(self, *, path, parent_fd, name, st): return status def process_dev(self, *, path, parent_fd, name, st, dev_type): - with self.create_helper(path, st, dev_type) as (item, status, hardlinked, hardlink_master): # char/block device + with self.create_helper(path, st, dev_type) as (item, status, hardlinked): # char/block device # looks like we can not work fd-based here without causing issues when trying to open/close the device with backup_io('stat'): st = stat_update_check(st, os_stat(path=path, parent_fd=parent_fd, name=name, follow_symlinks=False)) @@ -1348,10 +1298,7 @@ def process_dev(self, *, path, parent_fd, name, st, dev_type): return status def process_symlink(self, *, path, parent_fd, name, st): - # note: using hardlinkable=False because we can not support hardlinked symlinks, - # due to the dual-use of item.source, see issue #2343: - # hardlinked symlinks will be archived [and extracted] as non-hardlinked symlinks. - with self.create_helper(path, st, 's', hardlinkable=False) as (item, status, hardlinked, hardlink_master): + with self.create_helper(path, st, 's', hardlinkable=True) as (item, status, hardlinked): fname = name if name is not None and parent_fd is not None else path with backup_io('readlink'): source = os.readlink(fname, dir_fd=parent_fd) @@ -1384,7 +1331,7 @@ def process_pipe(self, *, path, cache, fd, mode, user, group): return status def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal): - with self.create_helper(path, st, None) as (item, status, hardlinked, hardlink_master): # no status yet + with self.create_helper(path, st, None) as (item, status, hardlinked): # no status yet with OsOpen(path=path, parent_fd=parent_fd, name=name, flags=flags, noatime=True) as fd: with backup_io('fstat'): st = stat_update_check(st, os.fstat(fd)) @@ -1395,7 +1342,9 @@ def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal): # so it can be extracted / accessed in FUSE mount like a regular file. # this needs to be done early, so that part files also get the patched mode. 
item.mode = stat.S_IFREG | stat.S_IMODE(item.mode) - if not hardlinked or hardlink_master: + if 'chunks' in item: # create_helper might have put chunks from a previous hardlink there + [cache.chunk_incref(id_, self.stats) for id_, _, _ in item.chunks] + else: # normal case, no "2nd+" hardlink if not is_special_file: hashed_path = safe_encode(os.path.join(self.cwd, path)) path_hash = self.key.id_hash(hashed_path) @@ -1420,7 +1369,6 @@ def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal): status = 'M' if known else 'A' # regular file, modified or added self.print_file_status(status, path) status = None # we already printed the status - item.hardlink_master = hardlinked # Only chunkify the file if needed if chunks is not None: item.chunks = chunks @@ -1444,7 +1392,7 @@ def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal): # also, we must not memorize a potentially inconsistent/corrupt file that # changed while we backed it up. cache.memorize_file(hashed_path, path_hash, st, [c.id for c in item.chunks]) - self.stats.nfiles += 1 + self.stats.nfiles += 1 item.update(self.metadata_collector.stat_ext_attrs(st, path, fd=fd)) item.get_size(memorize=True) return status @@ -1464,6 +1412,7 @@ def __init__(self, *, cache, key, self.stats = Statistics(output_json=log_json, iec=iec) # threading: done by cache (including progress) self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=False) + self.hlm = HardLinkManager(id_type=str, info_type=list) # path -> chunks @contextmanager def create_helper(self, tarinfo, status=None, type=None): @@ -1504,11 +1453,21 @@ def process_dev(self, *, tarinfo, status, type): item.rdev = os.makedev(tarinfo.devmajor, tarinfo.devminor) return status - def process_link(self, *, tarinfo, status, type): + def process_symlink(self, *, tarinfo, status, type): with self.create_helper(tarinfo, status, type) as (item, status): item.source = tarinfo.linkname return status + def process_hardlink(self, *, tarinfo, status, type): + with self.create_helper(tarinfo, status, type) as (item, status): + # create a not hardlinked borg item, reusing the chunks, see HardLinkManager.__doc__ + chunks = self.hlm.retrieve(tarinfo.linkname) + if chunks is not None: + item.chunks = chunks + item.get_size(memorize=True, from_chunks=True) + self.stats.nfiles += 1 + return status + def process_file(self, *, tarinfo, status, type, tar): with self.create_helper(tarinfo, status, type) as (item, status): self.print_file_status(status, tarinfo.name) @@ -1516,8 +1475,10 @@ def process_file(self, *, tarinfo, status, type, tar): fd = tar.extractfile(tarinfo) self.process_file_chunks(item, self.cache, self.stats, self.show_progress, backup_io_iter(self.chunker.chunkify(fd))) - item.get_size(memorize=True) + item.get_size(memorize=True, from_chunks=True) self.stats.nfiles += 1 + # we need to remember ALL files, see HardLinkManager.__doc__ + self.hlm.remember(id=tarinfo.name, info=item.chunks) return status @@ -1787,7 +1748,7 @@ def valid_archive(obj): continue if not valid_msgpacked_dict(data, archive_keys_serialized): continue - if b'cmdline' not in data or b'\xa7version\x01' not in data: + if b'cmdline' not in data or b'\xa7version\x02' not in data: continue try: archive = msgpack.unpackb(data) @@ -1944,9 +1905,6 @@ def list_keys_safe(keys): def valid_item(obj): if not isinstance(obj, StableDict): return False, 'not a dictionary' - # A bug in Attic up to and including release 0.13 added a (meaningless) b'acl' key to every item. 
- # We ignore it here, should it exist. See test_attic013_acl_bug for details. - obj.pop(b'acl', None) keys = set(obj) if not required_item_keys.issubset(keys): return False, 'missing required keys: ' + list_keys_safe(required_item_keys - keys) @@ -2031,7 +1989,7 @@ def valid_item(obj): del self.manifest.archives[info.name] continue archive = ArchiveItem(internal_dict=msgpack.unpackb(data)) - if archive.version != 1: + if archive.version != 2: raise Exception('Unknown archive metadata version') archive.cmdline = [safe_decode(arg) for arg in archive.cmdline] items_buffer = ChunkBuffer(self.key) @@ -2130,34 +2088,11 @@ def recreate(self, archive_name, comment=None, target_name=None): def process_items(self, archive, target): matcher = self.matcher - target_is_subset = not matcher.empty() - hardlink_masters = {} if target_is_subset else None - - def item_is_hardlink_master(item): - return (target_is_subset and - hardlinkable(item.mode) and - item.get('hardlink_master', True) and - 'source' not in item) for item in archive.iter_items(): if not matcher.match(item.path): self.print_file_status('x', item.path) - if item_is_hardlink_master(item): - hardlink_masters[item.path] = (item.get('chunks'), item.get('chunks_healthy'), None) continue - if target_is_subset and hardlinkable(item.mode) and item.get('source') in hardlink_masters: - # master of this hard link is outside the target subset - chunks, chunks_healthy, new_source = hardlink_masters[item.source] - if new_source is None: - # First item to use this master, move the chunks - item.chunks = chunks - if chunks_healthy is not None: - item.chunks_healthy = chunks_healthy - hardlink_masters[item.source] = (None, None, item.path) - del item.source - else: - # Master was already moved, only update this item's source - item.source = new_source if self.dry_run: self.print_file_status('-', item.path) else: @@ -2264,30 +2199,13 @@ def exclude(dir, tag_item): tag_files = [] tagged_dirs = [] - # to support reading hard-linked CACHEDIR.TAGs (aka CACHE_TAG_NAME), similar to hardlink_masters: - cachedir_masters = {} - - if self.exclude_caches: - # sadly, due to how CACHEDIR.TAG works (filename AND file [header] contents) and - # how borg deals with hardlinks (slave hardlinks referring back to master hardlinks), - # we need to pass over the archive collecting hardlink master paths. - # as seen in issue #4911, the master paths can have an arbitrary filenames, - # not just CACHEDIR.TAG. 
- for item in archive.iter_items(filter=lambda item: os.path.basename(item.path) == CACHE_TAG_NAME): - if stat.S_ISREG(item.mode) and 'chunks' not in item and 'source' in item: - # this is a hardlink slave, referring back to its hardlink master (via item.source) - cachedir_masters[item.source] = None # we know the key (path), but not the value (item) yet - for item in archive.iter_items( filter=lambda item: os.path.basename(item.path) == CACHE_TAG_NAME or matcher.match(item.path)): - if self.exclude_caches and item.path in cachedir_masters: - cachedir_masters[item.path] = item dir, tag_file = os.path.split(item.path) if tag_file in self.exclude_if_present: exclude(dir, item) elif self.exclude_caches and tag_file == CACHE_TAG_NAME and stat.S_ISREG(item.mode): - content_item = item if 'chunks' in item else cachedir_masters[item.source] - file = open_item(archive, content_item) + file = open_item(archive, item) if file.read(len(CACHE_TAG_CONTENTS)) == CACHE_TAG_CONTENTS: exclude(dir, item) matcher.add(tag_files, IECommand.Include) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index de9489ff0a..dc0db82f23 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -29,6 +29,7 @@ from contextlib import contextmanager from datetime import datetime, timedelta from io import TextIOWrapper + from struct import Struct from .logger import create_logger, setup_logging @@ -44,7 +45,7 @@ from .archive import has_link from .cache import Cache, assert_secure, SecurityManager from .constants import * # NOQA - from .compress import CompressionSpec + from .compress import CompressionSpec, ZLIB, ZLIB_legacy, ObfuscateSize from .crypto.key import key_creator, key_argument_names, tam_required_file, tam_required from .crypto.key import RepoKey, KeyfileKey, Blake2RepoKey, Blake2KeyfileKey, FlexiKey from .crypto.keymanager import KeyManager @@ -59,7 +60,7 @@ from .helpers import timestamp from .helpers import get_cache_dir, os_stat from .helpers import Manifest, AI_HUMAN_SORT_KEYS - from .helpers import hardlinkable + from .helpers import HardLinkManager from .helpers import StableDict from .helpers import check_python, check_extension_modules from .helpers import dir_is_tagged, is_slow_msgpack, is_supported_msgpack, yes, sysinfo @@ -338,6 +339,137 @@ def do_serve(self, args): ).serve() return EXIT_SUCCESS + @with_other_repository(manifest=True, key=True, compatibility=(Manifest.Operation.READ,)) + @with_repository(exclusive=True, manifest=True, cache=True, compatibility=(Manifest.Operation.WRITE,)) + def do_transfer(self, args, *, + repository, manifest, key, cache, + other_repository=None, other_manifest=None, other_key=None): + """archives transfer from other repository""" + + ITEM_KEY_WHITELIST = {'path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hlid', + 'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime', 'birthtime', 'size', + 'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended', + 'part'} + + def upgrade_item(item): + """upgrade item as needed, get rid of legacy crap""" + if hlm.borg1_hardlink_master(item): + item._dict['hlid'] = hlid = hlm.hardlink_id_from_path(item._dict['path']) + hlm.remember(id=hlid, info=(item._dict.get('chunks'), item._dict.get('chunks_healthy'))) + elif hlm.borg1_hardlink_slave(item): + item._dict['hlid'] = hlid = hlm.hardlink_id_from_path(item._dict['source']) + chunks, chunks_healthy = hlm.retrieve(id=hlid, default=(None, None)) + if chunks is not None: + item._dict['chunks'] = chunks + for chunk_id, _, _ in chunks: + 
cache.chunk_incref(chunk_id, archive.stats)
+                if chunks_healthy is not None:
+                    item._dict['chunks_healthy'] = chunks_healthy
+                item._dict.pop('source')  # not used for hardlinks any more, replaced by hlid
+            for attr in 'atime', 'ctime', 'mtime', 'birthtime':
+                if attr in item:
+                    ns = getattr(item, attr)  # decode (bigint or Timestamp) --> int ns
+                    setattr(item, attr, ns)  # encode int ns --> msgpack.Timestamp only, no bigint any more
+            # make sure we only have desired stuff in the new item. specifically, make sure to get rid of:
+            # - 'acl' remnants of bug in attic <= 0.13
+            # - 'hardlink_master' (superseded by hlid)
+            new_item_dict = {key: value for key, value in item.as_dict().items() if key in ITEM_KEY_WHITELIST}
+            new_item = Item(internal_dict=new_item_dict)
+            new_item.get_size(memorize=True)  # if not already present: compute+remember size for items with chunks
+            assert all(key in new_item for key in REQUIRED_ITEM_KEYS)
+            return new_item
+
+        def upgrade_compressed_chunk(chunk):
+            def upgrade_zlib_and_level(chunk):
+                if ZLIB_legacy.detect(chunk):
+                    ctype = ZLIB.ID
+                    chunk = ctype + level + chunk  # get rid of the attic legacy: prepend separate type/level bytes
+                else:
+                    ctype = chunk[0:1]
+                    chunk = ctype + level + chunk[2:]  # keep type same, but set level
+                return chunk
+
+            ctype = chunk[0:1]
+            level = b'\xFF'  # FF means unknown compression level
+
+            if ctype == ObfuscateSize.ID:
+                # in older borg, we used unusual byte order
+                old_header_fmt = Struct('>I')
+                new_header_fmt = ObfuscateSize.header_fmt
+                length = ObfuscateSize.header_len
+                size_bytes = chunk[2:2+length]
+                size = old_header_fmt.unpack(size_bytes)[0]  # Struct.unpack returns a tuple, even for one field
+                size_bytes = new_header_fmt.pack(size)
+                compressed = chunk[2+length:]
+                compressed = upgrade_zlib_and_level(compressed)
+                chunk = ctype + level + size_bytes + compressed
+            else:
+                chunk = upgrade_zlib_and_level(chunk)
+            return chunk
+
+        dry_run = args.dry_run
+
+        args.consider_checkpoints = True
+        archive_names = tuple(x.name for x in other_manifest.archives.list_considering(args))
+        if not archive_names:
+            return EXIT_SUCCESS
+
+        for name in archive_names:
+            transfer_size = 0
+            present_size = 0
+            if name in manifest.archives and not dry_run:
+                print(f"{name}: archive is already present in destination repo, skipping.")
+            else:
+                if not dry_run:
+                    print(f"{name}: copying archive to destination repo...")
+                hlm = HardLinkManager(id_type=bytes, info_type=tuple)  # hlid -> (chunks, chunks_healthy)
+                other_archive = Archive(other_repository, other_key, other_manifest, name)
+                archive = Archive(repository, key, manifest, name, cache=cache, create=True) if not dry_run else None
+                for item in other_archive.iter_items():
+                    if 'chunks' in item:
+                        chunks = []
+                        for chunk_id, size, _ in item.chunks:
+                            refcount = cache.seen_chunk(chunk_id, size)
+                            if refcount == 0:  # target repo does not yet have this chunk
+                                if not dry_run:
+                                    cdata = other_repository.get(chunk_id)
+                                    # keep compressed payload same, avoid decompression / recompression
+                                    data = other_key.decrypt(chunk_id, cdata, decompress=False)
+                                    data = upgrade_compressed_chunk(data)
+                                    chunk_entry = cache.add_chunk(chunk_id, data, archive.stats, wait=False,
+                                                                  compress=False, size=size)
+                                    cache.repository.async_response(wait=False)
+                                    chunks.append(chunk_entry)
+                                transfer_size += size
+                            else:
+                                if not dry_run:
+                                    chunk_entry = cache.chunk_incref(chunk_id, archive.stats)
+                                    chunks.append(chunk_entry)
+                                present_size += size
+                        if not dry_run:
+                            item.chunks = chunks  # overwrite!
IDs and sizes are same, csizes are likely different + archive.stats.nfiles += 1 + if not dry_run: + archive.add_item(upgrade_item(item)) + if not dry_run: + additional_metadata = {} + # keep all metadata except archive version and stats. also do not keep + # recreate_source_id, recreate_args, recreate_partial_chunks which were used only in 1.1.0b1 .. b2. + for attr in ('cmdline', 'hostname', 'username', 'time', 'time_end', 'comment', + 'chunker_params', 'recreate_cmdline'): + if hasattr(other_archive.metadata, attr): + additional_metadata[attr] = getattr(other_archive.metadata, attr) + archive.save(stats=archive.stats, additional_metadata=additional_metadata) + print(f"{name}: finished. " + f"transfer_size: {format_file_size(transfer_size)} " + f"present_size: {format_file_size(present_size)}") + else: + print(f"{name}: completed" if transfer_size == 0 else + f"{name}: incomplete, " + f"transfer_size: {format_file_size(transfer_size)} " + f"present_size: {format_file_size(present_size)}") + return EXIT_SUCCESS + @with_repository(create=True, exclusive=True, manifest=False) @with_other_repository(key=True, compatibility=(Manifest.Operation.READ, )) def do_init(self, args, repository, *, other_repository=None, other_key=None): @@ -1055,16 +1187,14 @@ def _rec_walk(self, *, path, parent_fd, name, fso, cache, matcher, self.print_file_status(status, path) @staticmethod - def build_filter(matcher, peek_and_store_hardlink_masters, strip_components): + def build_filter(matcher, strip_components): if strip_components: def item_filter(item): matched = matcher.match(item.path) and os.sep.join(item.path.split(os.sep)[strip_components:]) - peek_and_store_hardlink_masters(item, matched) return matched else: def item_filter(item): matched = matcher.match(item.path) - peek_and_store_hardlink_masters(item, matched) return matched return item_filter @@ -1087,33 +1217,18 @@ def do_extract(self, args, repository, manifest, key, archive): sparse = args.sparse strip_components = args.strip_components dirs = [] - partial_extract = not matcher.empty() or strip_components - hardlink_masters = {} if partial_extract or not has_link else None + hlm = HardLinkManager(id_type=bytes, info_type=str) # hlid -> path - def peek_and_store_hardlink_masters(item, matched): - # not has_link: - # OS does not have hardlink capability thus we need to remember the chunks so that - # we can extract all hardlinks as separate normal (not-hardlinked) files instead. - # - # partial_extract and not matched and hardlinkable: - # we do not extract the very first hardlink, so we need to remember the chunks - # in hardlinks_master, so we can use them when we extract some 2nd+ hardlink item - # that has no chunks list. 
- if ((not has_link or (partial_extract and not matched and hardlinkable(item.mode))) and - (item.get('hardlink_master', True) and 'source' not in item)): - hardlink_masters[item.get('path')] = (item.get('chunks'), None) - - filter = self.build_filter(matcher, peek_and_store_hardlink_masters, strip_components) + filter = self.build_filter(matcher, strip_components) if progress: pi = ProgressIndicatorPercent(msg='%5.1f%% Extracting: %s', step=0.1, msgid='extract') pi.output('Calculating total archive size for the progress indicator (might take long for large archives)') - extracted_size = sum(item.get_size(hardlink_masters) for item in archive.iter_items(filter)) + extracted_size = sum(item.get_size() for item in archive.iter_items(filter)) pi.total = extracted_size else: pi = None - for item in archive.iter_items(filter, partial_extract=partial_extract, - preload=True, hardlink_masters=hardlink_masters): + for item in archive.iter_items(filter, preload=True): orig_path = item.path if strip_components: item.path = os.sep.join(orig_path.split(os.sep)[strip_components:]) @@ -1128,13 +1243,13 @@ def peek_and_store_hardlink_masters(item, matched): logging.getLogger('borg.output.list').info(remove_surrogates(item.path)) try: if dry_run: - archive.extract_item(item, dry_run=True, pi=pi) + archive.extract_item(item, dry_run=True, hlm=hlm, pi=pi) else: if stat.S_ISDIR(item.mode): dirs.append(item) archive.extract_item(item, stdout=stdout, restore_attrs=False) else: - archive.extract_item(item, stdout=stdout, sparse=sparse, hardlink_masters=hardlink_masters, + archive.extract_item(item, stdout=stdout, sparse=sparse, hlm=hlm, stripped_components=strip_components, original_path=orig_path, pi=pi) except (BackupOSError, BackupError) as e: self.print_warning('%s: %s', remove_surrogates(orig_path), e) @@ -1199,15 +1314,9 @@ def _export_tar(self, args, archive, tarstream): progress = args.progress output_list = args.output_list strip_components = args.strip_components - partial_extract = not matcher.empty() or strip_components - hardlink_masters = {} if partial_extract else None - - def peek_and_store_hardlink_masters(item, matched): - if ((partial_extract and not matched and hardlinkable(item.mode)) and - (item.get('hardlink_master', True) and 'source' not in item)): - hardlink_masters[item.get('path')] = (item.get('chunks'), None) + hlm = HardLinkManager(id_type=bytes, info_type=str) # hlid -> path - filter = self.build_filter(matcher, peek_and_store_hardlink_masters, strip_components) + filter = self.build_filter(matcher, strip_components) # The | (pipe) symbol instructs tarfile to use a streaming mode of operation # where it never seeks on the passed fileobj. @@ -1217,7 +1326,7 @@ def peek_and_store_hardlink_masters(item, matched): if progress: pi = ProgressIndicatorPercent(msg='%5.1f%% Processing: %s', step=0.1, msgid='extract') pi.output('Calculating size') - extracted_size = sum(item.get_size(hardlink_masters) for item in archive.iter_items(filter)) + extracted_size = sum(item.get_size() for item in archive.iter_items(filter)) pi.total = extracted_size else: pi = None @@ -1252,9 +1361,8 @@ def item_to_tarinfo(item, original_path): tarinfo.gid = item.gid tarinfo.uname = item.user or '' tarinfo.gname = item.group or '' - # The linkname in tar has the same dual use the 'source' attribute of Borg items, - # i.e. for symlinks it means the destination, while for hardlinks it refers to the - # file. 
+ # The linkname in tar has 2 uses: + # for symlinks it means the destination, while for hardlinks it refers to the file. # Since hardlinks in tar have a different type code (LNKTYPE) the format might # support hardlinking arbitrary objects (including symlinks and directories), but # whether implementations actually support that is a whole different question... @@ -1263,23 +1371,16 @@ def item_to_tarinfo(item, original_path): modebits = stat.S_IFMT(item.mode) if modebits == stat.S_IFREG: tarinfo.type = tarfile.REGTYPE - if 'source' in item: - source = os.sep.join(item.source.split(os.sep)[strip_components:]) - if hardlink_masters is None: - linkname = source - else: - chunks, linkname = hardlink_masters.get(item.source, (None, source)) - if linkname: - # Master was already added to the archive, add a hardlink reference to it. + if 'hlid' in item: + linkname = hlm.retrieve(id=item.hlid) + if linkname is not None: + # the first hardlink was already added to the archive, add a tar-hardlink reference to it. tarinfo.type = tarfile.LNKTYPE tarinfo.linkname = linkname - elif chunks is not None: - # The item which has the chunks was not put into the tar, therefore - # we do that now and update hardlink_masters to reflect that. - item.chunks = chunks + else: tarinfo.size = item.get_size() stream = item_content_stream(item) - hardlink_masters[item.get('source') or original_path] = (None, item.path) + hlm.remember(id=item.hlid, info=item.path) else: tarinfo.size = item.get_size() stream = item_content_stream(item) @@ -1337,8 +1438,7 @@ def item_to_paxheaders(format, item): ph['BORG.item.meta'] = meta_text return ph - for item in archive.iter_items(filter, partial_extract=partial_extract, - preload=True, hardlink_masters=hardlink_masters): + for item in archive.iter_items(filter, preload=True): orig_path = item.path if strip_components: item.path = os.sep.join(orig_path.split(os.sep)[strip_components:]) @@ -1973,12 +2073,11 @@ def _import_tar(self, args, repository, manifest, key, cache, tarstream): elif tarinfo.isdir(): status = tfo.process_dir(tarinfo=tarinfo, status='d', type=stat.S_IFDIR) elif tarinfo.issym(): - status = tfo.process_link(tarinfo=tarinfo, status='s', type=stat.S_IFLNK) + status = tfo.process_symlink(tarinfo=tarinfo, status='s', type=stat.S_IFLNK) elif tarinfo.islnk(): - # tar uses the same hardlink model as borg (rather vice versa); the first instance of a hardlink - # is stored as a regular file, later instances are special entries referencing back to the - # first instance. - status = tfo.process_link(tarinfo=tarinfo, status='h', type=stat.S_IFREG) + # tar uses a hardlink model like: the first instance of a hardlink is stored as a regular file, + # later instances are special entries referencing back to the first instance. + status = tfo.process_hardlink(tarinfo=tarinfo, status='h', type=stat.S_IFREG) elif tarinfo.isblk(): status = tfo.process_dev(tarinfo=tarinfo, status='b', type=stat.S_IFBLK) elif tarinfo.ischr(): @@ -4083,6 +4182,43 @@ def define_borg_mount(parser): help='archives to delete') define_archive_filters_group(subparser) + # borg transfer + transfer_epilog = process_epilog(""" + This command transfers archives from one repository to another repository. + + Suggested use: + + # initialize DST_REPO reusing key material from SRC_REPO, so that + # chunking and chunk id generation will work in the same way as before. 
+ borg init --other-location=SRC_REPO --encryption=DST_ENC DST_REPO + + # transfer archives from SRC_REPO to DST_REPO + borg transfer --dry-run SRC_REPO DST_REPO # check what it would do + borg transfer SRC_REPO DST_REPO # do it! + borg transfer --dry-run SRC_REPO DST_REPO # check! anything left? + + The default is to transfer all archives, including checkpoint archives. + + You could use the misc. archive filter options to limit which archives it will + transfer, e.g. using the --prefix option. This is recommended for big + repositories with multiple data sets to keep the runtime per invocation lower. + """) + subparser = subparsers.add_parser('transfer', parents=[common_parser], add_help=False, + description=self.do_transfer.__doc__, + epilog=transfer_epilog, + formatter_class=argparse.RawDescriptionHelpFormatter, + help='transfer of archives from another repository') + subparser.set_defaults(func=self.do_transfer) + subparser.add_argument('-n', '--dry-run', dest='dry_run', action='store_true', + help='do not change repository, just check') + subparser.add_argument('other_location', metavar='SRC_REPOSITORY', + type=location_validator(archive=False, other=True), + help='source repository') + subparser.add_argument('location', metavar='DST_REPOSITORY', + type=location_validator(archive=False, other=False), + help='destination repository') + define_archive_filters_group(subparser) + # borg diff diff_epilog = process_epilog(""" This command finds differences (file contents, user/group/mode) between archives. diff --git a/src/borg/cache.py b/src/borg/cache.py index 6fa74e692d..58ceb541b4 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -19,7 +19,7 @@ from .helpers import Error from .helpers import Manifest from .helpers import get_cache_dir, get_security_dir -from .helpers import int_to_bigint, bigint_to_int, bin_to_hex, parse_stringified_list +from .helpers import bin_to_hex, parse_stringified_list from .helpers import format_file_size from .helpers import safe_ns from .helpers import yes @@ -28,6 +28,7 @@ from .helpers import set_ec, EXIT_WARNING from .helpers import safe_unlink from .helpers import msgpack +from .helpers.msgpack import int_to_timestamp, timestamp_to_int from .item import ArchiveItem, ChunkListEntry from .crypto.key import PlaintextKey from .crypto.file_integrity import IntegrityCheckedFile, DetachedIntegrityCheckedFile, FileIntegrityError @@ -623,7 +624,7 @@ def commit(self): # this is to avoid issues with filesystem snapshots and cmtime granularity. # Also keep files from older backups that have not reached BORG_FILES_CACHE_TTL yet. 
entry = FileCacheEntry(*msgpack.unpackb(item)) - if entry.age == 0 and bigint_to_int(entry.cmtime) < self._newest_cmtime or \ + if entry.age == 0 and timestamp_to_int(entry.cmtime) < self._newest_cmtime or \ entry.age > 0 and entry.age < ttl: msgpack.pack((path_hash, entry), fd) entry_count += 1 @@ -756,7 +757,7 @@ def fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx): csize, data = decrypted_repository.get(archive_id) chunk_idx.add(archive_id, 1, len(data), csize) archive = ArchiveItem(internal_dict=msgpack.unpackb(data)) - if archive.version != 1: + if archive.version not in (1, 2): # legacy raise Exception('Unknown archive metadata version') sync = CacheSynchronizer(chunk_idx) for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)): @@ -1018,10 +1019,10 @@ def file_known_and_unchanged(self, hashed_path, path_hash, st): if 'i' in cache_mode and entry.inode != st.st_ino: files_cache_logger.debug('KNOWN-CHANGED: file inode number has changed: %r', hashed_path) return True, None - if 'c' in cache_mode and bigint_to_int(entry.cmtime) != st.st_ctime_ns: + if 'c' in cache_mode and timestamp_to_int(entry.cmtime) != st.st_ctime_ns: files_cache_logger.debug('KNOWN-CHANGED: file ctime has changed: %r', hashed_path) return True, None - elif 'm' in cache_mode and bigint_to_int(entry.cmtime) != st.st_mtime_ns: + elif 'm' in cache_mode and timestamp_to_int(entry.cmtime) != st.st_mtime_ns: files_cache_logger.debug('KNOWN-CHANGED: file mtime has changed: %r', hashed_path) return True, None # we ignored the inode number in the comparison above or it is still same. @@ -1049,7 +1050,7 @@ def memorize_file(self, hashed_path, path_hash, st, ids): elif 'm' in cache_mode: cmtime_type = 'mtime' cmtime_ns = safe_ns(st.st_mtime_ns) - entry = FileCacheEntry(age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_bigint(cmtime_ns), chunk_ids=ids) + entry = FileCacheEntry(age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_timestamp(cmtime_ns), chunk_ids=ids) self.files[path_hash] = msgpack.packb(entry) self._newest_cmtime = max(self._newest_cmtime or 0, cmtime_ns) files_cache_logger.debug('FILES-CACHE-UPDATE: put %r [has %s] <- %r', diff --git a/src/borg/compress.pyx b/src/borg/compress.pyx index 2e0eb4809b..c63a04df3f 100644 --- a/src/borg/compress.pyx +++ b/src/borg/compress.pyx @@ -56,16 +56,21 @@ cdef class CompressorBase: also handles compression format auto detection and adding/stripping the ID header (which enable auto detection). """ - ID = b'\xFF\xFF' # reserved and not used - # overwrite with a unique 2-bytes bytestring in child classes + ID = b'\xFF' # reserved and not used + # overwrite with a unique 1-byte bytestring in child classes name = 'baseclass' @classmethod def detect(cls, data): return data.startswith(cls.ID) - def __init__(self, **kwargs): - pass + def __init__(self, level=255, **kwargs): + assert 0 <= level <= 255 + if self.ID is not None: + self.id_level = self.ID + bytes((level, )) # level 255 means "unknown level" + assert len(self.id_level) == 2 + else: + self.id_level = None def decide(self, data): """ @@ -85,8 +90,8 @@ cdef class CompressorBase: Compress *data* (bytes) and return bytes result. Prepend the ID bytes of this compressor, which is needed so that the correct decompressor can be used for decompression. 
""" - # add ID bytes - return self.ID + data + # add id_level bytes + return self.id_level + data def decompress(self, data): """ @@ -96,7 +101,7 @@ cdef class CompressorBase: Only handles input generated by _this_ Compressor - for a general purpose decompression method see *Compressor.decompress*. """ - # strip ID bytes + # strip id_level bytes return data[2:] cdef class DecidingCompressor(CompressorBase): @@ -106,8 +111,8 @@ cdef class DecidingCompressor(CompressorBase): """ name = 'decidebaseclass' - def __init__(self, **kwargs): - super().__init__(**kwargs) + def __init__(self, level=255, **kwargs): + super().__init__(level=level, **kwargs) def _decide(self, data): """ @@ -148,9 +153,12 @@ class CNONE(CompressorBase): """ none - no compression, just pass through data """ - ID = b'\x00\x00' + ID = b'\x00' name = 'none' + def __init__(self, level=255, **kwargs): + super().__init__(level=level, **kwargs) # no defined levels for CNONE, so just say "unknown" + def compress(self, data): return super().compress(data) @@ -170,11 +178,11 @@ class LZ4(DecidingCompressor): - wrapper releases CPython's GIL to support multithreaded code - uses safe lz4 methods that never go beyond the end of the output buffer """ - ID = b'\x01\x00' + ID = b'\x01' name = 'lz4' - def __init__(self, **kwargs): - pass + def __init__(self, level=255, **kwargs): + super().__init__(level=level, **kwargs) # no defined levels for LZ4, so just say "unknown" def _decide(self, idata): """ @@ -235,11 +243,11 @@ class LZMA(DecidingCompressor): """ lzma compression / decompression """ - ID = b'\x02\x00' + ID = b'\x02' name = 'lzma' def __init__(self, level=6, **kwargs): - super().__init__(**kwargs) + super().__init__(level=level, **kwargs) self.level = level if lzma is None: raise ValueError('No lzma support found.') @@ -270,11 +278,11 @@ class ZSTD(DecidingCompressor): # This is a NOT THREAD SAFE implementation. # Only ONE python context must be created at a time. # It should work flawlessly as long as borg will call ONLY ONE compression job at time. - ID = b'\x03\x00' + ID = b'\x03' name = 'zstd' def __init__(self, level=3, **kwargs): - super().__init__(**kwargs) + super().__init__(level=level, **kwargs) self.level = level def _decide(self, idata): @@ -331,14 +339,52 @@ class ZSTD(DecidingCompressor): return dest[:osize] -class ZLIB(CompressorBase): +class ZLIB(DecidingCompressor): """ zlib compression / decompression (python stdlib) """ - ID = b'\x08\x00' # not used here, see detect() - # avoid all 0x.8.. IDs elsewhere! + ID = b'\x05' name = 'zlib' + def __init__(self, level=6, **kwargs): + super().__init__(level=level, **kwargs) + self.level = level + + def _decide(self, data): + """ + Decides what to do with *data*. Returns (compressor, zlib_data). + + *zlib_data* is the ZLIB result if *compressor* is ZLIB as well, otherwise it is None. + """ + zlib_data = zlib.compress(data, self.level) + if len(zlib_data) < len(data): + return self, zlib_data + else: + return NONE_COMPRESSOR, None + + def decompress(self, data): + data = super().decompress(data) + try: + return zlib.decompress(data) + except zlib.error as e: + raise DecompressionError(str(e)) from None + + +class ZLIB_legacy(CompressorBase): + """ + zlib compression / decompression (python stdlib) + + Note: This is the legacy ZLIB support as used by borg < 1.3. + It still suffers from attic *only* supporting zlib and not having separate + ID bytes to differentiate between differently compressed chunks. 
+    This just works because zlib compressed stuff always starts with 0x.8.. bytes.
+    Newer borg uses the ZLIB class that has separate ID bytes (as all the other
+    compressors) and does not need this hack.
+    """
+    ID = b'\x08'  # not used here, see detect()
+    # avoid all 0x.8 IDs elsewhere!
+    name = 'zlib_legacy'
+
     @classmethod
     def detect(cls, data):
         # matches misc. patterns 0x.8.. used by zlib
@@ -348,7 +394,7 @@ class ZLIB(CompressorBase):
         return check_ok and is_deflate
 
     def __init__(self, level=6, **kwargs):
-        super().__init__(**kwargs)
+        super().__init__(level=level, **kwargs)
         self.level = level
 
     def compress(self, data):
@@ -440,14 +486,14 @@ class ObfuscateSize(CompressorBase):
     """
     Meta-Compressor that obfuscates the compressed data size.
    """
-    ID = b'\x04\x00'
+    ID = b'\x04'
     name = 'obfuscate'
-    header_fmt = Struct('>I')
+    header_fmt = Struct('<I')
diff --git a/src/borg/fuse.py b/src/borg/fuse.py
--- a/src/borg/fuse.py
+++ b/src/borg/fuse.py
@@ ... @@
+        hlm = HardLinkManager(id_type=bytes, info_type=str)  # hlid -> path
 
-        def peek_and_store_hardlink_masters(item, matched):
-            if (partial_extract and not matched and hardlinkable(item.mode) and
-                    item.get('hardlink_master', True) and 'source' not in item):
-                hardlink_masters[item.get('path')] = (item.get('chunks'), None)
-
-        filter = Archiver.build_filter(matcher, peek_and_store_hardlink_masters, strip_components)
+        filter = Archiver.build_filter(matcher, strip_components)
         for item_inode, item in self.cache.iter_archive_items(archive.metadata.items, filter=filter,
                                                               consider_part_files=self._args.consider_part_files):
             if strip_components:
@@ -369,15 +364,13 @@ def peek_and_store_hardlink_masters(item, matched):
             parent = 1
             for segment in segments[:-1]:
                 parent = self._process_inner(segment, parent)
-            self._process_leaf(segments[-1], item, parent, prefix, is_dir, item_inode,
-                               hardlink_masters, strip_components)
+            self._process_leaf(segments[-1], item, parent, prefix, is_dir, item_inode, hlm)
         duration = time.perf_counter() - t0
         logger.debug('fuse: _process_archive completed in %.1f s for archive %s', duration, archive.name)
 
-    def _process_leaf(self, name, item, parent, prefix, is_dir, item_inode, hardlink_masters, stripped_components):
+    def _process_leaf(self, name, item, parent, prefix, is_dir, item_inode, hlm):
         path = item.path
         del item.path  # save some space
-        hardlink_masters = hardlink_masters or {}
 
         def file_version(item, path):
             if 'chunks' in item:
@@ -402,10 +395,9 @@ def make_versioned_name(name, version, add_dir=False):
             version_enc = os.fsencode('.%05d' % version)
             return name + version_enc + ext
 
-        if 'source' in item and hardlinkable(item.mode):
-            source = os.sep.join(item.source.split(os.sep)[stripped_components:])
-            chunks, link_target = hardlink_masters.get(item.source, (None, source))
-            if link_target:
+        if 'hlid' in item:
+            link_target = hlm.retrieve(id=item.hlid, default=None)
+            if link_target is not None:
                 # Hard link was extracted previously, just link
                 link_target = os.fsencode(link_target)
                 if self.versions:
@@ -415,19 +407,16 @@ def make_versioned_name(name, version, add_dir=False):
                 try:
                     inode = self.find_inode(link_target, prefix)
                 except KeyError:
-                    logger.warning('Skipping broken hard link: %s -> %s', path, source)
+                    logger.warning('Skipping broken hard link: %s -> %s', path, link_target)
                     return
                 item = self.get_item(inode)
                 item.nlink = item.get('nlink', 1) + 1
                 self._items[inode] = item
-            elif chunks is not None:
-                # assign chunks to this item, since the item which had the chunks was not extracted
-                item.chunks = chunks
+            else:
                 inode = item_inode
                 self._items[inode] = item
-                if hardlink_masters:
-                    # Update master entry with extracted item path, so that following hardlinks don't extract twice.
- hardlink_masters[item.source] = (None, path) + # remember extracted item path, so that following hardlinks don't extract twice. + hlm.remember(id=item.hlid, info=path) else: inode = item_inode @@ -436,7 +425,7 @@ def make_versioned_name(name, version, add_dir=False): enc_path = os.fsencode(path) version = file_version(item, enc_path) if version is not None: - # regular file, with contents - maybe a hardlink master + # regular file, with contents name = make_versioned_name(name, version) self.file_versions[enc_path] = version diff --git a/src/borg/helpers/fs.py b/src/borg/helpers/fs.py index d1a412da08..fecda9c69e 100644 --- a/src/borg/helpers/fs.py +++ b/src/borg/helpers/fs.py @@ -1,4 +1,5 @@ import errno +import hashlib import os import os.path import re @@ -165,9 +166,76 @@ def make_path_safe(path): return _safe_re.sub('', path) or '.' -def hardlinkable(mode): - """return True if we support hardlinked items of this type""" - return stat.S_ISREG(mode) or stat.S_ISBLK(mode) or stat.S_ISCHR(mode) or stat.S_ISFIFO(mode) +class HardLinkManager: + """ + Manage hardlinks (and avoid code duplication doing so). + + A) When creating a borg2 archive from the filesystem, we have to maintain a mapping like: + (dev, ino) -> (hlid, chunks) # for fs_hl_targets + If we encounter the same (dev, ino) again later, we'll just re-use the hlid and chunks list. + + B) When extracting a borg2 archive to the filesystem, we have to maintain a mapping like: + hlid -> path + If we encounter the same hlid again later, we hardlink to the path of the already extracted content of same hlid. + + C) When transferring from a borg1 archive, we need: + path -> chunks, chunks_healthy # for borg1_hl_targets + If we encounter a regular file item with source == path later, we reuse chunks and chunks_healthy + and create the same hlid = hardlink_id_from_path(source). + + D) When importing a tar file (simplified 1-pass way for now, not creating borg hardlink items): + path -> chunks + If we encounter a LNK tar entry later with linkname==path, we re-use the chunks and create a regular file item. + For better hardlink support (including the very first hardlink item for each group of same-target hardlinks), + we would need a 2-pass processing, which is not yet implemented. + """ + def __init__(self, *, id_type, info_type): + self._map = {} + self.id_type = id_type + self.info_type = info_type + + def borg1_hardlinkable(self, mode): # legacy + return stat.S_ISREG(mode) or stat.S_ISBLK(mode) or stat.S_ISCHR(mode) or stat.S_ISFIFO(mode) + + def borg1_hardlink_master(self, item): # legacy + return item.get('hardlink_master', True) and 'source' not in item and self.borg1_hardlinkable(item.mode) + + def borg1_hardlink_slave(self, item): # legacy + return 'source' in item and self.borg1_hardlinkable(item.mode) + + def hardlink_id_from_path(self, path): + """compute a hardlink id from a path""" + assert isinstance(path, bytes) + return hashlib.sha256(path).digest() + + def hardlink_id_from_inode(self, *, ino, dev): + """compute a hardlink id from an inode""" + assert isinstance(ino, int) + assert isinstance(dev, int) + return hashlib.sha256(f'{ino}/{dev}'.encode()).digest() + + def remember(self, *, id, info): + """ + remember stuff from a (usually contentful) item. 
+
+        :param id: some id used to refer to the contentful item, could be:
+                   a path (tar style, old borg style) [bytes]
+                   a hlid (new borg style) [bytes]
+                   a (dev, inode) tuple (filesystem)
+        :param info: information to remember, could be:
+                     chunks / chunks_healthy list
+                     hlid
+        """
+        assert isinstance(id, self.id_type), f"id is {id!r}, not of type {self.id_type}"
+        assert isinstance(info, self.info_type), f"info is {info!r}, not of type {self.info_type}"
+        self._map[id] = info
+
+    def retrieve(self, id, *, default=None):
+        """
+        retrieve stuff to use it in a (usually contentless) item.
+        """
+        assert isinstance(id, self.id_type)
+        return self._map.get(id, default)


 def scandir_keyfunc(dirent):
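For illustration, here is how the extract-style mapping (case B in the docstring
above) behaves; a minimal sketch run against the class as defined in this patch,
with made-up inode numbers and paths:

    import hashlib
    from borg.helpers import HardLinkManager

    hlm = HardLinkManager(id_type=bytes, info_type=str)  # hlid -> path

    # hlids are stable sha256 digests, derived from (ino, dev) or from a path:
    hlid = hlm.hardlink_id_from_inode(ino=1234, dev=56)
    assert hlid == hashlib.sha256(b'1234/56').digest()

    assert hlm.retrieve(hlid) is None            # first sighting: nothing remembered yet
    hlm.remember(id=hlid, info='input/file1')    # extracted the contentful item
    assert hlm.retrieve(hlid) == 'input/file1'   # later hardlinks find the link target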
diff --git a/src/borg/helpers/msgpack.py b/src/borg/helpers/msgpack.py
index 2ace88feef..411f00fec4 100644
--- a/src/borg/helpers/msgpack.py
+++ b/src/borg/helpers/msgpack.py
@@ -24,7 +24,7 @@ from msgpack import unpack as mp_unpack
 from msgpack import version as mp_version

-from msgpack import ExtType
+from msgpack import ExtType, Timestamp
 from msgpack import OutOfData

@@ -164,7 +164,7 @@ def get_limited_unpacker(kind):
     return Unpacker(**args)


-def bigint_to_int(mtime):
+def bigint_to_int(mtime):  # legacy
     """Convert bytearray to int
     """
     if isinstance(mtime, bytes):
@@ -172,7 +172,7 @@ def bigint_to_int(mtime):
     return mtime


-def int_to_bigint(value):
+def int_to_bigint(value):  # legacy
     """Convert integers larger than 64 bits to bytearray

     Smaller integers are left alone
@@ -180,3 +180,14 @@ def int_to_bigint(value):
     if value.bit_length() > 63:
         return value.to_bytes((value.bit_length() + 9) // 8, 'little', signed=True)
     return value
+
+
+def int_to_timestamp(ns):
+    return Timestamp.from_unix_nano(ns)
+
+
+def timestamp_to_int(ts):
+    if isinstance(ts, Timestamp):
+        return ts.to_unix_nano()
+    # legacy support note: we need to keep the bigint conversion for compatibility with borg < 1.3 archives.
+    return bigint_to_int(ts)
diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py
index 3e145ada2d..414402de0f 100644
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@@ -19,6 +19,7 @@
 from .errors import Error
 from .fs import get_keys_dir
+from .msgpack import Timestamp
 from .time import OutputTimestamp, format_time, to_localtime, safe_timestamp, safe_s
 from .. import __version__ as borg_version
 from .. import __version_tuple__ as borg_version_tuple
@@ -694,7 +695,8 @@ class ItemFormatter(BaseFormatter):
     KEY_DESCRIPTIONS = {
         'bpath': 'verbatim POSIX path, can contain any character except NUL',
         'path': 'path interpreted as text (might be missing non-text characters, see bpath)',
-        'source': 'link target for links (identical to linktarget)',
+        'source': 'link target for symlinks (identical to linktarget)',
+        'hlid': 'hard link identity (same if hardlinking same fs object)',
         'extra': 'prepends {source} with " -> " for soft links and " link to " for hard links',
         'csize': 'compressed size',
         'dsize': 'deduplicated size',
@@ -705,7 +707,7 @@
         'health': 'either "healthy" (file ok) or "broken" (if file has all-zero replacement chunks)',
     }
     KEY_GROUPS = (
-        ('type', 'mode', 'uid', 'gid', 'user', 'group', 'path', 'bpath', 'source', 'linktarget', 'flags'),
+        ('type', 'mode', 'uid', 'gid', 'user', 'group', 'path', 'bpath', 'source', 'linktarget', 'hlid', 'flags'),
         ('size', 'csize', 'dsize', 'dcsize', 'num_chunks', 'unique_chunks'),
         ('mtime', 'ctime', 'atime', 'isomtime', 'isoctime', 'isoatime'),
         tuple(sorted(hash_algorithms)),
@@ -801,11 +803,9 @@ def get_item_data(self, item):
         extra = ''
         if source:
             source = remove_surrogates(source)
-            if item_type == 'l':
-                extra = ' -> %s' % source
-            else:
-                mode = 'h' + mode[1:]
-                extra = ' link to %s' % source
+            extra = ' -> %s' % source
+        hlid = item.get('hlid')
+        hlid = bin_to_hex(hlid) if hlid else ''
         item_data['type'] = item_type
         item_data['mode'] = mode
         item_data['user'] = item.user or item.uid
@@ -821,6 +821,7 @@ def get_item_data(self, item):
         item_data['health'] = 'broken' if 'chunks_healthy' in item else 'healthy'
         item_data['source'] = source
         item_data['linktarget'] = source
+        item_data['hlid'] = hlid
         item_data['flags'] = item.get('bsdflags')
         for key in self.used_call_keys:
             item_data[key] = self.call_keys[key](item)
@@ -1043,6 +1044,8 @@ def decode(d):
             value = decode_tuple(value)
         elif isinstance(value, bytes):
             value = decode_bytes(value)
+        elif isinstance(value, Timestamp):
+            value = value.to_unix_nano()
         if isinstance(key, bytes):
             key = key.decode()
         res[key] = value
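The switch from the bigint encoding to msgpack's native Timestamp type keeps
full nanosecond precision. A quick round-trip check (illustration only,
assuming msgpack-python >= 1.0, as the import above requires):

    from msgpack import Timestamp

    ns = 1_456_789_012_345_678_901          # some mtime, in ns since the epoch
    ts = Timestamp.from_unix_nano(ns)       # what int_to_timestamp() stores
    assert ts.to_unix_nano() == ns          # what timestamp_to_int() reads back
    assert (ts.seconds, ts.nanoseconds) == (ns // 10**9, ns % 10**9)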
diff --git a/src/borg/item.pyx b/src/borg/item.pyx
index 48debf1839..9ea76f2de4 100644
--- a/src/borg/item.pyx
+++ b/src/borg/item.pyx
@@ -3,9 +3,9 @@ from collections import namedtuple

 from .constants import ITEM_KEYS, ARCHIVE_KEYS
 from .helpers import safe_encode, safe_decode
-from .helpers import bigint_to_int, int_to_bigint
 from .helpers import StableDict
 from .helpers import format_file_size
+from .helpers.msgpack import timestamp_to_int, int_to_timestamp


 cdef extern from "_item.c":
@@ -171,17 +171,17 @@
     rdev = PropDict._make_property('rdev', int)
     bsdflags = PropDict._make_property('bsdflags', int)

-    # note: we need to keep the bigint conversion for compatibility with borg 1.0 archives.
-    atime = PropDict._make_property('atime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int)
-    ctime = PropDict._make_property('ctime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int)
-    mtime = PropDict._make_property('mtime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int)
-    birthtime = PropDict._make_property('birthtime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int)
+    atime = PropDict._make_property('atime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
+    ctime = PropDict._make_property('ctime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
+    mtime = PropDict._make_property('mtime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
+    birthtime = PropDict._make_property('birthtime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)

     # size is only present for items with a chunk list and then it is sum(chunk_sizes)
     # compatibility note: this is a new feature, in old archives size will be missing.
     size = PropDict._make_property('size', int)

-    hardlink_master = PropDict._make_property('hardlink_master', bool)
+    hlid = PropDict._make_property('hlid', bytes)  # hard link id: same value means same hard link.
+    hardlink_master = PropDict._make_property('hardlink_master', bool)  # legacy

     chunks = PropDict._make_property('chunks', (list, type(None)), 'list or None')
     chunks_healthy = PropDict._make_property('chunks_healthy', (list, type(None)), 'list or None')

@@ -214,7 +214,6 @@
         except AttributeError:
             if stat.S_ISLNK(self.mode):
                 # get out of here quickly. symlinks have no own chunks, their fs size is the length of the target name.
-                # also, there is the dual-use issue of .source (#2343), so don't confuse it with a hardlink slave.
return len(self.source) # no precomputed (c)size value available, compute it: try: diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index 450ba1757b..f82177fcdb 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -321,7 +321,7 @@ def create_regular_file(self, name, size=0, contents=None): contents = b'X' * size fd.write(contents) - def create_test_files(self): + def create_test_files(self, create_hardlinks=True): """Create a minimal test case including all supported file types """ # File @@ -332,7 +332,7 @@ def create_test_files(self): # File mode os.chmod('input/file1', 0o4755) # Hard link - if are_hardlinks_supported(): + if are_hardlinks_supported() and create_hardlinks: os.link(os.path.join(self.input_path, 'file1'), os.path.join(self.input_path, 'hardlink')) # Symlink @@ -432,7 +432,7 @@ def test_basic_functionality(self): self.assert_in(name, list_output) self.assert_dirs_equal('input', 'output/input') info_output = self.cmd('info', self.repository_location + '::test') - item_count = 4 if has_lchflags else 5 # one file is UF_NODUMP + item_count = 5 if has_lchflags else 6 # one file is UF_NODUMP self.assert_in('Number of files: %d' % item_count, info_output) shutil.rmtree(self.cache_path) info_output2 = self.cmd('info', self.repository_location + '::test') @@ -506,6 +506,29 @@ def test_symlink_extract(self): self.cmd('extract', self.repository_location + '::test') assert os.readlink('input/link1') == 'somewhere' + @pytest.mark.skipif(not are_symlinks_supported() or not are_hardlinks_supported(), + reason='symlinks or hardlinks not supported') + def test_hardlinked_symlinks_extract(self): + self.create_regular_file('target', size=1024) + with changedir('input'): + os.symlink('target', 'symlink1') + os.link('symlink1', 'symlink2', follow_symlinks=False) + self.cmd('init', '--encryption=repokey', self.repository_location) + self.cmd('create', self.repository_location + '::test', 'input') + with changedir('output'): + output = self.cmd('extract', self.repository_location + '::test') + print(output) + with changedir('input'): + assert os.path.exists('target') + assert os.readlink('symlink1') == 'target' + assert os.readlink('symlink2') == 'target' + st1 = os.stat('symlink1', follow_symlinks=False) + st2 = os.stat('symlink2', follow_symlinks=False) + assert st1.st_nlink == 2 + assert st2.st_nlink == 2 + assert st1.st_ino == st2.st_ino + assert st1.st_size == st2.st_size + @pytest.mark.skipif(not is_utime_fully_supported(), reason='cannot properly setup and execute test without utime') def test_atime(self): def has_noatime(some_file): @@ -2442,7 +2465,7 @@ def test_compression_none_uncompressible(self): def test_compression_zlib_compressible(self): size, csize = self._get_sizes('zlib', compressible=True) assert csize < size * 0.1 - assert csize == 35 + assert csize == 37 def test_compression_zlib_uncompressible(self): size, csize = self._get_sizes('zlib', compressible=False) @@ -2451,7 +2474,7 @@ def test_compression_zlib_uncompressible(self): def test_compression_auto_compressible(self): size, csize = self._get_sizes('auto,zlib', compressible=True) assert csize < size * 0.1 - assert csize == 35 # same as compression 'zlib' + assert csize == 37 # same as compression 'zlib' def test_compression_auto_uncompressible(self): size, csize = self._get_sizes('auto,zlib', compressible=False) @@ -2661,7 +2684,7 @@ def test_fuse_versions_view(self): hl3 = os.path.join(mountpoint, 'input', 'hardlink3', 'hardlink3.00001') assert 
os.stat(hl1).st_ino == os.stat(hl2).st_ino == os.stat(hl3).st_ino assert open(hl3, 'rb').read() == b'123456' - # similar again, but exclude the hardlink master: + # similar again, but exclude the 1st hardlink: with self.fuse_mount(self.repository_location, mountpoint, '-o', 'versions', '-e', 'input/hardlink1'): if are_hardlinks_supported(): hl2 = os.path.join(mountpoint, 'input', 'hardlink2', 'hardlink2.00001') @@ -3475,7 +3498,7 @@ def test_extract_hardlinks_tar(self): assert os.stat('input/dir1/source2').st_nlink == 2 def test_import_tar(self, tar_format='PAX'): - self.create_test_files() + self.create_test_files(create_hardlinks=False) # hardlinks become separate files os.unlink('input/flagfile') self.cmd('init', '--encryption=none', self.repository_location) self.cmd('create', self.repository_location + '::src', 'input') @@ -3489,7 +3512,7 @@ def test_import_tar(self, tar_format='PAX'): def test_import_tar_gz(self, tar_format='GNU'): if not shutil.which('gzip'): pytest.skip('gzip is not installed') - self.create_test_files() + self.create_test_files(create_hardlinks=False) # hardlinks become separate files os.unlink('input/flagfile') self.cmd('init', '--encryption=none', self.repository_location) self.cmd('create', self.repository_location + '::src', 'input') @@ -3850,7 +3873,7 @@ def test_manifest_rebuild_duplicate_archive(self): 'username': 'bar', 'name': 'archive1', 'time': '2016-12-15T18:49:51.849711', - 'version': 1, + 'version': 2, }) archive_id = key.id_hash(archive) repository.put(archive_id, key.encrypt(archive_id, archive)) @@ -3907,42 +3930,13 @@ def test_empty_repository(self): repository.commit(compact=False) self.cmd('check', self.repository_location, exit_code=1) - def test_attic013_acl_bug(self): - # Attic up to release 0.13 contained a bug where every item unintentionally received - # a b'acl'=None key-value pair. - # This bug can still live on in Borg repositories (through borg upgrade). - class Attic013Item: - def as_dict(self): - return { - # These are required - b'path': '1234', - b'mtime': 0, - b'mode': 0, - b'user': b'0', - b'group': b'0', - b'uid': 0, - b'gid': 0, - # acl is the offending key. 
- b'acl': None, - } - - archive, repository = self.open_archive('archive1') - with repository: - manifest, key = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) - with Cache(repository, key, manifest) as cache: - archive = Archive(repository, key, manifest, '0.13', cache=cache, create=True) - archive.items_buffer.add(Attic013Item()) - archive.save() - self.cmd('check', self.repository_location, exit_code=0) - self.cmd('list', self.repository_location + '::0.13', exit_code=0) - class ManifestAuthenticationTest(ArchiverTestCaseBase): def spoof_manifest(self, repository): with repository: _, key = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) repository.put(Manifest.MANIFEST_ID, key.encrypt(Manifest.MANIFEST_ID, msgpack.packb({ - 'version': 1, + 'version': 2, 'archives': {}, 'config': {}, 'timestamp': (datetime.utcnow() + timedelta(days=1)).strftime(ISO_FORMAT), @@ -4473,26 +4467,23 @@ def ccc(a, b): class TestBuildFilter: - @staticmethod - def peek_and_store_hardlink_masters(item, matched): - pass def test_basic(self): matcher = PatternMatcher() matcher.add([parse_pattern('included')], IECommand.Include) - filter = Archiver.build_filter(matcher, self.peek_and_store_hardlink_masters, 0) + filter = Archiver.build_filter(matcher, 0) assert filter(Item(path='included')) assert filter(Item(path='included/file')) assert not filter(Item(path='something else')) def test_empty(self): matcher = PatternMatcher(fallback=True) - filter = Archiver.build_filter(matcher, self.peek_and_store_hardlink_masters, 0) + filter = Archiver.build_filter(matcher, 0) assert filter(Item(path='anything')) def test_strip_components(self): matcher = PatternMatcher(fallback=True) - filter = Archiver.build_filter(matcher, self.peek_and_store_hardlink_masters, strip_components=1) + filter = Archiver.build_filter(matcher, strip_components=1) assert not filter(Item(path='shallow')) assert not filter(Item(path='shallow/')) # can this even happen? paths are normalized... 
assert filter(Item(path='deep enough/file')) diff --git a/src/borg/testsuite/compress.py b/src/borg/testsuite/compress.py index 3942c3537f..c93dd3bb67 100644 --- a/src/borg/testsuite/compress.py +++ b/src/borg/testsuite/compress.py @@ -88,11 +88,11 @@ def test_autodetect_invalid(): Compressor(**params).decompress(b'\x08\x00notreallyzlib') -def test_zlib_compat(): +def test_zlib_legacy_compat(): # for compatibility reasons, we do not add an extra header for zlib, # nor do we expect one when decompressing / autodetecting for level in range(10): - c = get_compressor(name='zlib', level=level) + c = get_compressor(name='zlib_legacy', level=level) cdata1 = c.compress(data) cdata2 = zlib.compress(data, level) assert cdata1 == cdata2 diff --git a/src/borg/testsuite/item.py b/src/borg/testsuite/item.py index aa40cc0660..80b38edce4 100644 --- a/src/borg/testsuite/item.py +++ b/src/borg/testsuite/item.py @@ -3,6 +3,7 @@ from ..cache import ChunkListEntry from ..item import Item from ..helpers import StableDict +from ..helpers.msgpack import Timestamp def test_item_empty(): @@ -77,15 +78,15 @@ def test_item_int_property(): item.mode = "invalid" -def test_item_bigint_property(): +def test_item_mptimestamp_property(): item = Item() small, big = 42, 2 ** 65 item.atime = small assert item.atime == small - assert item.as_dict() == {'atime': small} + assert item.as_dict() == {'atime': Timestamp.from_unix_nano(small)} item.atime = big assert item.atime == big - assert item.as_dict() == {'atime': b'\0' * 8 + b'\x02'} + assert item.as_dict() == {'atime': Timestamp.from_unix_nano(big)} def test_item_user_group_none(): diff --git a/src/borg/testsuite/key.py b/src/borg/testsuite/key.py index e0be752fae..5073c5b23b 100644 --- a/src/borg/testsuite/key.py +++ b/src/borg/testsuite/key.py @@ -256,8 +256,8 @@ def test_authenticated_encrypt(self, monkeypatch): plaintext = b'123456789' id = key.id_hash(plaintext) authenticated = key.encrypt(id, plaintext) - # 0x07 is the key TYPE, \x0000 identifies no compression. - assert authenticated == b'\x07\x00\x00' + plaintext + # 0x07 is the key TYPE, \x00ff identifies no compression / unknown level. + assert authenticated == b'\x07\x00\xff' + plaintext def test_blake2_authenticated_encrypt(self, monkeypatch): monkeypatch.setenv('BORG_PASSPHRASE', 'test') @@ -267,8 +267,8 @@ def test_blake2_authenticated_encrypt(self, monkeypatch): plaintext = b'123456789' id = key.id_hash(plaintext) authenticated = key.encrypt(id, plaintext) - # 0x06 is the key TYPE, 0x0000 identifies no compression. - assert authenticated == b'\x06\x00\x00' + plaintext + # 0x06 is the key TYPE, 0x00ff identifies no compression / unknown level. + assert authenticated == b'\x06\x00\xff' + plaintext class TestTAM:
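To round this off, here is a hypothetical create-time sketch (not code from this
patch) of how fs objects could be grouped by (st_dev, st_ino), so that every
hardlink to the same inode, including hardlinked symlinks as exercised by
test_hardlinked_symlinks_extract above, carries the same hlid. It is a reduced
version of case A in the HardLinkManager docstring, tracking only hlids, not
chunks, and assumes HardLinkManager is importable from borg.helpers:

    import os
    from borg.helpers import HardLinkManager

    hlm = HardLinkManager(id_type=tuple, info_type=bytes)  # (dev, ino) -> hlid

    def hlid_for(path):
        """return the same hlid for all hardlinks to one inode, else None"""
        st = os.stat(path, follow_symlinks=False)
        if st.st_nlink < 2:
            return None  # not hardlinked, the item needs no hlid
        key = (st.st_dev, st.st_ino)
        hlid = hlm.retrieve(key)
        if hlid is None:
            hlid = hlm.hardlink_id_from_inode(ino=st.st_ino, dev=st.st_dev)
            hlm.remember(id=key, info=hlid)
        return hlid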