borgbackup · ThomasWaldmann · May 3, 2022 · May 3, 2022 · May 3, 2022 · May 3, 2022
diff --git a/docs/faq.rst b/docs/faq.rst
@@ -132,9 +132,6 @@ Which file types, attributes, etc. are *not* preserved?
       Archive extraction has optional support to extract all-zero chunks as
       holes in a sparse file.
     * Some filesystem specific attributes, like btrfs NOCOW, see :ref:`platforms`.
-    * For hardlinked symlinks, the hardlinking can not be archived (and thus,
-      the hardlinking will not be done at extraction time). The symlinks will
-      be archived and extracted as non-hardlinked symlinks, see :issue:`2379`.
 
 Are there other known limitations?
 ----------------------------------

diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst
@@ -567,7 +567,7 @@ dictionary created by the ``Item`` class that contains:
 * uid
 * gid
 * mode (item type + permissions)
-* source (for symlinks, and for hardlinks within one archive)
+* source (for symlinks)
 * rdev (for device files)
 * mtime, atime, ctime in nanoseconds
 * xattrs

diff --git a/docs/usage/general/file-metadata.rst.inc b/docs/usage/general/file-metadata.rst.inc
@@ -10,7 +10,7 @@ Besides regular file and directory structures, Borg can preserve
   * FIFOs ("named pipes")
   * special file *contents* can be backed up in ``--read-special`` mode.
     By default the metadata to create them with mknod(2), mkfifo(2) etc. is stored.
-* hardlinked regular files, devices, FIFOs (considering all items in the same archive)
+* hardlinked regular files, devices, symlinks, FIFOs (considering all items in the same archive)
 * timestamps in nanosecond precision: mtime, atime, ctime
 * other timestamps: birthtime (on platforms supporting it)
 * permissions:

diff --git a/src/borg/archive.py b/src/borg/archive.py
diff --git a/src/borg/archiver.py b/src/borg/archiver.py
diff --git a/src/borg/cache.py b/src/borg/cache.py
@@ -19,7 +19,7 @@
 from .helpers import Error
 from .helpers import Manifest
 from .helpers import get_cache_dir, get_security_dir
-from .helpers import int_to_bigint, bigint_to_int, bin_to_hex, parse_stringified_list
+from .helpers import bin_to_hex, parse_stringified_list
 from .helpers import format_file_size
 from .helpers import safe_ns
 from .helpers import yes
@@ -28,6 +28,7 @@
 from .helpers import set_ec, EXIT_WARNING
 from .helpers import safe_unlink
 from .helpers import msgpack
+from .helpers.msgpack import int_to_timestamp, timestamp_to_int
 from .item import ArchiveItem, ChunkListEntry
 from .crypto.key import PlaintextKey
 from .crypto.file_integrity import IntegrityCheckedFile, DetachedIntegrityCheckedFile, FileIntegrityError
@@ -623,7 +624,7 @@ def commit(self):
                     # this is to avoid issues with filesystem snapshots and cmtime granularity.
                     # Also keep files from older backups that have not reached BORG_FILES_CACHE_TTL yet.
                     entry = FileCacheEntry(*msgpack.unpackb(item))
-                    if entry.age == 0 and bigint_to_int(entry.cmtime) < self._newest_cmtime or \
+                    if entry.age == 0 and timestamp_to_int(entry.cmtime) < self._newest_cmtime or \
                        entry.age > 0 and entry.age < ttl:
                         msgpack.pack((path_hash, entry), fd)
                         entry_count += 1
@@ -756,7 +757,7 @@ def fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx):
             csize, data = decrypted_repository.get(archive_id)
             chunk_idx.add(archive_id, 1, len(data), csize)
             archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
-            if archive.version != 1:
+            if archive.version not in (1, 2):  # legacy
                 raise Exception('Unknown archive metadata version')
             sync = CacheSynchronizer(chunk_idx)
             for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)):
@@ -1018,10 +1019,10 @@ def file_known_and_unchanged(self, hashed_path, path_hash, st):
         if 'i' in cache_mode and entry.inode != st.st_ino:
             files_cache_logger.debug('KNOWN-CHANGED: file inode number has changed: %r', hashed_path)
             return True, None
-        if 'c' in cache_mode and bigint_to_int(entry.cmtime) != st.st_ctime_ns:
+        if 'c' in cache_mode and timestamp_to_int(entry.cmtime) != st.st_ctime_ns:
             files_cache_logger.debug('KNOWN-CHANGED: file ctime has changed: %r', hashed_path)
             return True, None
-        elif 'm' in cache_mode and bigint_to_int(entry.cmtime) != st.st_mtime_ns:
+        elif 'm' in cache_mode and timestamp_to_int(entry.cmtime) != st.st_mtime_ns:
             files_cache_logger.debug('KNOWN-CHANGED: file mtime has changed: %r', hashed_path)
             return True, None
         # we ignored the inode number in the comparison above or it is still same.
@@ -1049,7 +1050,7 @@ def memorize_file(self, hashed_path, path_hash, st, ids):
         elif 'm' in cache_mode:
             cmtime_type = 'mtime'
             cmtime_ns = safe_ns(st.st_mtime_ns)
-        entry = FileCacheEntry(age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_bigint(cmtime_ns), chunk_ids=ids)
+        entry = FileCacheEntry(age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_timestamp(cmtime_ns), chunk_ids=ids)
         self.files[path_hash] = msgpack.packb(entry)
         self._newest_cmtime = max(self._newest_cmtime or 0, cmtime_ns)
         files_cache_logger.debug('FILES-CACHE-UPDATE: put %r [has %s] <- %r',

diff --git a/src/borg/compress.pyx b/src/borg/compress.pyx
@@ -56,16 +56,21 @@ cdef class CompressorBase:
     also handles compression format auto detection and
     adding/stripping the ID header (which enable auto detection).
     """
-    ID = b'\xFF\xFF'  # reserved and not used
-                      # overwrite with a unique 2-bytes bytestring in child classes
+    ID = b'\xFF'  # reserved and not used
+                  # overwrite with a unique 1-byte bytestring in child classes
     name = 'baseclass'
 
     @classmethod
     def detect(cls, data):
         return data.startswith(cls.ID)
 
-    def __init__(self, **kwargs):
-        pass
+    def __init__(self, level=255, **kwargs):
+        assert 0 <= level <= 255
+        if self.ID is not None:
+            self.id_level = self.ID + bytes((level, ))  # level 255 means "unknown level"
+            assert len(self.id_level) == 2
+        else:
+            self.id_level = None
 
     def decide(self, data):
         """
@@ -85,8 +90,8 @@ cdef class CompressorBase:
         Compress *data* (bytes) and return bytes result. Prepend the ID bytes of this compressor,
         which is needed so that the correct decompressor can be used for decompression.
         """
-        # add ID bytes
-        return self.ID + data
+        # add id_level bytes
+        return self.id_level + data
 
     def decompress(self, data):
         """
@@ -96,7 +101,7 @@ cdef class CompressorBase:
         Only handles input generated by _this_ Compressor - for a general purpose
         decompression method see *Compressor.decompress*.
         """
-        # strip ID bytes
+        # strip id_level bytes
         return data[2:]
 
 cdef class DecidingCompressor(CompressorBase):
@@ -106,8 +111,8 @@ cdef class DecidingCompressor(CompressorBase):
     """
     name = 'decidebaseclass'
 
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
+    def __init__(self, level=255, **kwargs):
+        super().__init__(level=level, **kwargs)
 
     def _decide(self, data):
         """
@@ -148,9 +153,12 @@ class CNONE(CompressorBase):
     """
     none - no compression, just pass through data
     """
-    ID = b'\x00\x00'
+    ID = b'\x00'
     name = 'none'
 
+    def __init__(self, level=255, **kwargs):
+        super().__init__(level=level, **kwargs)  # no defined levels for CNONE, so just say "unknown"
+
     def compress(self, data):
         return super().compress(data)
 
@@ -170,11 +178,11 @@ class LZ4(DecidingCompressor):
         - wrapper releases CPython's GIL to support multithreaded code
         - uses safe lz4 methods that never go beyond the end of the output buffer
     """
-    ID = b'\x01\x00'
+    ID = b'\x01'
     name = 'lz4'
 
-    def __init__(self, **kwargs):
-        pass
+    def __init__(self, level=255, **kwargs):
+        super().__init__(level=level, **kwargs)  # no defined levels for LZ4, so just say "unknown"
 
     def _decide(self, idata):
         """
@@ -235,11 +243,11 @@ class LZMA(DecidingCompressor):
     """
     lzma compression / decompression
     """
-    ID = b'\x02\x00'
+    ID = b'\x02'
     name = 'lzma'
 
     def __init__(self, level=6, **kwargs):
-        super().__init__(**kwargs)
+        super().__init__(level=level, **kwargs)
         self.level = level
         if lzma is None:
             raise ValueError('No lzma support found.')
@@ -270,11 +278,11 @@ class ZSTD(DecidingCompressor):
     # This is a NOT THREAD SAFE implementation.
     # Only ONE python context must be created at a time.
     # It should work flawlessly as long as borg will call ONLY ONE compression job at time.
-    ID = b'\x03\x00'
+    ID = b'\x03'
     name = 'zstd'
 
     def __init__(self, level=3, **kwargs):
-        super().__init__(**kwargs)
+        super().__init__(level=level, **kwargs)
         self.level = level
 
     def _decide(self, idata):
@@ -331,14 +339,52 @@ class ZSTD(DecidingCompressor):
         return dest[:osize]
 
 
-class ZLIB(CompressorBase):
+class ZLIB(DecidingCompressor):
     """
     zlib compression / decompression (python stdlib)
     """
-    ID = b'\x08\x00'  # not used here, see detect()
-                      # avoid all 0x.8.. IDs elsewhere!
+    ID = b'\x05'
     name = 'zlib'
 
+    def __init__(self, level=6, **kwargs):
+        super().__init__(level=level, **kwargs)
+        self.level = level
+
+    def _decide(self, data):
+        """
+        Decides what to do with *data*. Returns (compressor, zlib_data).
+
+        *zlib_data* is the ZLIB result if *compressor* is ZLIB as well, otherwise it is None.
+        """
+        zlib_data = zlib.compress(data, self.level)
+        if len(zlib_data) < len(data):
+            return self, zlib_data
+        else:
+            return NONE_COMPRESSOR, None
+
+    def decompress(self, data):
+        data = super().decompress(data)
+        try:
+            return zlib.decompress(data)
+        except zlib.error as e:
+            raise DecompressionError(str(e)) from None
+
+
+class ZLIB_legacy(CompressorBase):
+    """
+    zlib compression / decompression (python stdlib)
+
+    Note: This is the legacy ZLIB support as used by borg < 1.3.
+          It still suffers from attic *only* supporting zlib and not having separate
+          ID bytes to differentiate between differently compressed chunks.
+          This just works because zlib compressed stuff always starts with 0x.8.. bytes.
+          Newer borg uses the ZLIB class that has separate ID bytes (as all the other
+          compressors) and does not need this hack.
+    """
+    ID = b'\x08'  # not used here, see detect()
+    # avoid all 0x.8 IDs elsewhere!
+    name = 'zlib_legacy'
+
     @classmethod
     def detect(cls, data):
         # matches misc. patterns 0x.8.. used by zlib
@@ -348,7 +394,7 @@ class ZLIB(CompressorBase):
         return check_ok and is_deflate
 
     def __init__(self, level=6, **kwargs):
-        super().__init__(**kwargs)
+        super().__init__(level=level, **kwargs)
         self.level = level
 
     def compress(self, data):
@@ -440,14 +486,14 @@ class ObfuscateSize(CompressorBase):
     """
     Meta-Compressor that obfuscates the compressed data size.
     """
-    ID = b'\x04\x00'
+    ID = b'\x04'
     name = 'obfuscate'
 
-    header_fmt = Struct('>I')
+    header_fmt = Struct('<I')
     header_len = len(header_fmt.pack(0))
 
     def __init__(self, level=None, compressor=None):
-        super().__init__()
+        super().__init__(level=level)  # data will be encrypted, so we can tell the level
         self.compressor = compressor
         if level is None:
             pass  # decompression
@@ -502,13 +548,14 @@ COMPRESSOR_TABLE = {
     CNONE.name: CNONE,
     LZ4.name: LZ4,
     ZLIB.name: ZLIB,
+    ZLIB_legacy.name: ZLIB_legacy,
     LZMA.name: LZMA,
     Auto.name: Auto,
     ZSTD.name: ZSTD,
     ObfuscateSize.name: ObfuscateSize,
 }
 # List of possible compression types. Does not include Auto, since it is a meta-Compressor.
-COMPRESSOR_LIST = [LZ4, ZSTD, CNONE, ZLIB, LZMA, ObfuscateSize, ]  # check fast stuff first
+COMPRESSOR_LIST = [LZ4, ZSTD, CNONE, ZLIB, ZLIB_legacy, LZMA, ObfuscateSize, ]  # check fast stuff first
 
 def get_compressor(name, **kwargs):
     cls = COMPRESSOR_TABLE[name]
@@ -554,7 +601,7 @@ class CompressionSpec:
         self.name = values[0]
         if self.name in ('none', 'lz4', ):
             return
-        elif self.name in ('zlib', 'lzma', ):
+        elif self.name in ('zlib', 'lzma', 'zlib_legacy'):  # zlib_legacy just for testing
             if count < 2:
                 level = 6  # default compression level in py stdlib
             elif count == 2:
@@ -597,7 +644,7 @@ class CompressionSpec:
     def compressor(self):
         if self.name in ('none', 'lz4', ):
             return get_compressor(self.name)
-        elif self.name in ('zlib', 'lzma', 'zstd', ):
+        elif self.name in ('zlib', 'lzma', 'zstd', 'zlib_legacy'):
             return get_compressor(self.name, level=self.level)
         elif self.name == 'auto':
             return get_compressor(self.name, compressor=self.inner.compressor)

diff --git a/src/borg/constants.py b/src/borg/constants.py
@@ -1,5 +1,5 @@
 # this set must be kept complete, otherwise the RobustUnpacker might malfunction:
-ITEM_KEYS = frozenset(['path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hardlink_master',
+ITEM_KEYS = frozenset(['path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hardlink_master', 'hlid',
                        'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime', 'birthtime', 'size',
                        'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended',
                        'part'])