From f8dbe5b54293c18d3e6668a563c1cc861c0982bd Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 5 May 2022 19:36:02 +0200 Subject: [PATCH 01/14] cleanup msgpack related str/bytes mess, see #968 see ticket and borg.helpers.msgpack docstring. --- src/borg/archive.py | 16 ++- src/borg/archiver.py | 12 +-- src/borg/cache_sync/unpack.h | 36 ++++--- src/borg/helpers/manifest.py | 46 ++++----- src/borg/helpers/msgpack.py | 101 +++++++++++++------ src/borg/item.pyx | 172 +++++++++++++++++++++++++++------ src/borg/remote.py | 49 +++++----- src/borg/testsuite/archive.py | 14 +-- src/borg/testsuite/archiver.py | 14 +-- src/borg/testsuite/item.py | 6 +- 10 files changed, 311 insertions(+), 155 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 15cfc5d55e..dc244a9f15 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -34,7 +34,7 @@ from .platform import uid2user, user2uid, gid2group, group2gid from .helpers import parse_timestamp, to_localtime from .helpers import OutputTimestamp, format_timedelta, format_file_size, file_status, FileSize -from .helpers import safe_encode, safe_decode, make_path_safe, remove_surrogates +from .helpers import safe_encode, make_path_safe, remove_surrogates from .helpers import StableDict from .helpers import bin_to_hex from .helpers import safe_ns @@ -479,7 +479,6 @@ def _load_meta(self, id): def load(self, id): self.id = id self.metadata = self._load_meta(self.id) - self.metadata.cmdline = [safe_decode(arg) for arg in self.metadata.cmdline] self.name = self.metadata.name self.comment = self.metadata.get('comment', '') @@ -1515,7 +1514,7 @@ class RobustUnpacker: """ def __init__(self, validator, item_keys): super().__init__() - self.item_keys = [msgpack.packb(name.encode()) for name in item_keys] + self.item_keys = [msgpack.packb(name) for name in item_keys] self.validator = validator self._buffered_data = [] self._resync = False @@ -1734,7 +1733,7 @@ def valid_archive(obj): # lost manifest on a older borg version than the most recent one that was ever used # within this repository (assuming that newer borg versions support more item keys). manifest = Manifest(self.key, self.repository) - archive_keys_serialized = [msgpack.packb(name.encode()) for name in ARCHIVE_KEYS] + archive_keys_serialized = [msgpack.packb(name) for name in ARCHIVE_KEYS] pi = ProgressIndicatorPercent(total=len(self.chunks), msg="Rebuilding manifest %6.2f%%", step=0.01, msgid='check.rebuild_manifest') for chunk_id, _ in self.chunks.iteritems(): @@ -1881,9 +1880,9 @@ def robust_iterator(archive): Missing item chunks will be skipped and the msgpack stream will be restarted """ - item_keys = frozenset(key.encode() for key in self.manifest.item_keys) - required_item_keys = frozenset(key.encode() for key in REQUIRED_ITEM_KEYS) - unpacker = RobustUnpacker(lambda item: isinstance(item, StableDict) and b'path' in item, + item_keys = self.manifest.item_keys + required_item_keys = REQUIRED_ITEM_KEYS + unpacker = RobustUnpacker(lambda item: isinstance(item, StableDict) and 'path' in item, self.manifest.item_keys) _state = 0 @@ -1905,7 +1904,7 @@ def list_keys_safe(keys): def valid_item(obj): if not isinstance(obj, StableDict): return False, 'not a dictionary' - keys = set(obj) + keys = set(k.decode('utf-8', errors='replace') for k in obj) if not required_item_keys.issubset(keys): return False, 'missing required keys: ' + list_keys_safe(required_item_keys - keys) if not keys.issubset(item_keys): @@ -1991,7 +1990,6 @@ def valid_item(obj): archive = ArchiveItem(internal_dict=msgpack.unpackb(data)) if archive.version != 2: raise Exception('Unknown archive metadata version') - archive.cmdline = [safe_decode(arg) for arg in archive.cmdline] items_buffer = ChunkBuffer(self.key) items_buffer.write_chunk = add_callback for item in robust_iterator(archive): diff --git a/src/borg/archiver.py b/src/borg/archiver.py index dc0db82f23..acef0db5e0 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -55,7 +55,7 @@ from .helpers import PrefixSpec, GlobSpec, CommentSpec, SortBySpec, FilesCacheMode from .helpers import BaseFormatter, ItemFormatter, ArchiveFormatter from .helpers import format_timedelta, format_file_size, parse_file_size, format_archive - from .helpers import safe_encode, remove_surrogates, bin_to_hex, prepare_dump_dict, eval_escapes + from .helpers import remove_surrogates, bin_to_hex, prepare_dump_dict, eval_escapes from .helpers import interval, prune_within, prune_split, PRUNING_PATTERNS from .helpers import timestamp from .helpers import get_cache_dir, os_stat @@ -1944,12 +1944,12 @@ def do_upgrade(self, args, repository, manifest=None, key=None): print('This repository is not encrypted, cannot enable TAM.') return EXIT_ERROR - if not manifest.tam_verified or not manifest.config.get(b'tam_required', False): + if not manifest.tam_verified or not manifest.config.get('tam_required', False): # The standard archive listing doesn't include the archive ID like in borg 1.1.x print('Manifest contents:') for archive_info in manifest.archives.list(sort_by=['ts']): print(format_archive(archive_info), '[%s]' % bin_to_hex(archive_info.id)) - manifest.config[b'tam_required'] = True + manifest.config['tam_required'] = True manifest.write() repository.commit(compact=False) if not key.tam_required: @@ -1972,7 +1972,7 @@ def do_upgrade(self, args, repository, manifest=None, key=None): print('Key updated') if hasattr(key, 'find_key'): print('Key location:', key.find_key()) - manifest.config[b'tam_required'] = False + manifest.config['tam_required'] = False manifest.write() repository.commit(compact=False) else: @@ -2304,7 +2304,7 @@ def do_debug_dump_archive(self, args, repository, manifest, key): """dump decoded archive metadata (not: data)""" try: - archive_meta_orig = manifest.archives.get_raw_dict()[safe_encode(args.location.archive)] + archive_meta_orig = manifest.archives.get_raw_dict()[args.location.archive] except KeyError: raise Archive.DoesNotExist(args.location.archive) @@ -2321,7 +2321,7 @@ def output(fd): fd.write(do_indent(prepare_dump_dict(archive_meta_orig))) fd.write(',\n') - data = key.decrypt(archive_meta_orig[b'id'], repository.get(archive_meta_orig[b'id'])) + data = key.decrypt(archive_meta_orig['id'], repository.get(archive_meta_orig['id'])) archive_org_dict = msgpack.unpackb(data, object_hook=StableDict) fd.write(' "_meta":\n') diff --git a/src/borg/cache_sync/unpack.h b/src/borg/cache_sync/unpack.h index f6dd9ca8e8..bba566419f 100644 --- a/src/borg/cache_sync/unpack.h +++ b/src/borg/cache_sync/unpack.h @@ -384,19 +384,11 @@ static inline int unpack_callback_map_end(unpack_user* u) static inline int unpack_callback_raw(unpack_user* u, const char* b, const char* p, unsigned int length) { - /* raw = what Borg uses for binary stuff and strings as well */ + /* raw = what Borg uses for text stuff */ /* Note: p points to an internal buffer which contains l bytes. */ (void)b; switch(u->expect) { - case expect_key: - if(length != 32) { - SET_LAST_ERROR("Incorrect key length"); - return -1; - } - memcpy(u->current.key, p, 32); - u->expect = expect_size; - break; case expect_map_key: if(length == 6 && !memcmp("chunks", p, 6)) { u->expect = expect_chunks_begin; @@ -409,19 +401,31 @@ static inline int unpack_callback_raw(unpack_user* u, const char* b, const char* u->expect = expect_map_item_end; } break; - default: - if(u->inside_chunks) { - SET_LAST_ERROR("Unexpected bytes in chunks structure"); - return -1; - } } return 0; } static inline int unpack_callback_bin(unpack_user* u, const char* b, const char* p, unsigned int length) { - (void)u; (void)b; (void)p; (void)length; - UNEXPECTED("bin"); + /* bin = what Borg uses for binary stuff */ + /* Note: p points to an internal buffer which contains l bytes. */ + (void)b; + + switch(u->expect) { + case expect_key: + if(length != 32) { + SET_LAST_ERROR("Incorrect key length"); + return -1; + } + memcpy(u->current.key, p, 32); + u->expect = expect_size; + break; + default: + if(u->inside_chunks) { + SET_LAST_ERROR("Unexpected bytes in chunks structure"); + return -1; + } + } return 0; } diff --git a/src/borg/helpers/manifest.py b/src/borg/helpers/manifest.py index 425feb4e68..1b9d91fb49 100644 --- a/src/borg/helpers/manifest.py +++ b/src/borg/helpers/manifest.py @@ -12,7 +12,7 @@ logger = create_logger() from .datastruct import StableDict -from .parseformat import bin_to_hex, safe_encode, safe_decode +from .parseformat import bin_to_hex from .time import parse_timestamp from .. import shellpattern from ..constants import * # NOQA @@ -39,39 +39,35 @@ class Archives(abc.MutableMapping): str timestamps or datetime timestamps. """ def __init__(self): - # key: encoded archive name, value: dict(b'id': bytes_id, b'time': bytes_iso_ts) + # key: str archive name, value: dict('id': bytes_id, 'time': str_iso_ts) self._archives = {} def __len__(self): return len(self._archives) def __iter__(self): - return iter(safe_decode(name) for name in self._archives) + return iter(self._archives) def __getitem__(self, name): assert isinstance(name, str) - _name = safe_encode(name) - values = self._archives.get(_name) + values = self._archives.get(name) if values is None: raise KeyError - ts = parse_timestamp(values[b'time'].decode()) - return ArchiveInfo(name=name, id=values[b'id'], ts=ts) + ts = parse_timestamp(values['time']) + return ArchiveInfo(name=name, id=values['id'], ts=ts) def __setitem__(self, name, info): assert isinstance(name, str) - name = safe_encode(name) assert isinstance(info, tuple) id, ts = info assert isinstance(id, bytes) if isinstance(ts, datetime): ts = ts.replace(tzinfo=None).strftime(ISO_FORMAT) assert isinstance(ts, str) - ts = ts.encode() - self._archives[name] = {b'id': id, b'time': ts} + self._archives[name] = {'id': id, 'time': ts} def __delitem__(self, name): assert isinstance(name, str) - name = safe_encode(name) del self._archives[name] def list(self, *, glob=None, match_end=r'\Z', sort_by=(), consider_checkpoints=True, first=None, last=None, reverse=False): @@ -116,8 +112,8 @@ def list_considering(self, args): def set_raw_dict(self, d): """set the dict we get from the msgpack unpacker""" for k, v in d.items(): - assert isinstance(k, bytes) - assert isinstance(v, dict) and b'id' in v and b'time' in v + assert isinstance(k, str) + assert isinstance(v, dict) and 'id' in v and 'time' in v self._archives[k] = v def get_raw_dict(self): @@ -196,10 +192,10 @@ def load(cls, repository, operations, key=None, force_tam_not_required=False): manifest.timestamp = m.get('timestamp') manifest.config = m.config # valid item keys are whatever is known in the repo or every key we know - manifest.item_keys = ITEM_KEYS | frozenset(key.decode() for key in m.get('item_keys', [])) + manifest.item_keys = ITEM_KEYS | frozenset(m.get('item_keys', [])) if manifest.tam_verified: - manifest_required = manifest.config.get(b'tam_required', False) + manifest_required = manifest.config.get('tam_required', False) security_required = tam_required(repository) if manifest_required and not security_required: logger.debug('Manifest is TAM verified and says TAM is required, updating security database...') @@ -214,32 +210,32 @@ def load(cls, repository, operations, key=None, force_tam_not_required=False): def check_repository_compatibility(self, operations): for operation in operations: assert isinstance(operation, self.Operation) - feature_flags = self.config.get(b'feature_flags', None) + feature_flags = self.config.get('feature_flags', None) if feature_flags is None: return - if operation.value.encode() not in feature_flags: + if operation.value not in feature_flags: continue - requirements = feature_flags[operation.value.encode()] - if b'mandatory' in requirements: - unsupported = set(requirements[b'mandatory']) - self.SUPPORTED_REPO_FEATURES + requirements = feature_flags[operation.value] + if 'mandatory' in requirements: + unsupported = set(requirements['mandatory']) - self.SUPPORTED_REPO_FEATURES if unsupported: - raise MandatoryFeatureUnsupported([f.decode() for f in unsupported]) + raise MandatoryFeatureUnsupported(list(unsupported)) def get_all_mandatory_features(self): result = {} - feature_flags = self.config.get(b'feature_flags', None) + feature_flags = self.config.get('feature_flags', None) if feature_flags is None: return result for operation, requirements in feature_flags.items(): - if b'mandatory' in requirements: - result[operation.decode()] = {feature.decode() for feature in requirements[b'mandatory']} + if 'mandatory' in requirements: + result[operation] = set(requirements['mandatory']) return result def write(self): from ..item import ManifestItem if self.key.tam_required: - self.config[b'tam_required'] = True + self.config['tam_required'] = True # self.timestamp needs to be strictly monotonically increasing. Clocks often are not set correctly if self.timestamp is None: self.timestamp = datetime.utcnow().strftime(ISO_FORMAT) diff --git a/src/borg/helpers/msgpack.py b/src/borg/helpers/msgpack.py index 411f00fec4..5a2edecd67 100644 --- a/src/borg/helpers/msgpack.py +++ b/src/borg/helpers/msgpack.py @@ -1,21 +1,56 @@ +""" +wrapping msgpack +================ + +Due to the planned breaking api changes in upstream msgpack, we wrap it the way we need it - +to avoid having lots of clutter in the calling code. see tickets #968 and #3632. + +Packing +------- +- use_bin_type = True (used by borg since borg 1.3) + This is used to generate output according to new msgpack 2.0 spec. + This cleanly keeps bytes and str types apart. + +- use_bin_type = False (used by borg < 1.3) + This creates output according to the older msgpack spec. + BAD: str and bytes were packed into same "raw" representation. + +- unicode_errors = 'surrogateescape' + Guess backup applications are one of the rare cases when this needs to be used. + It is needed because borg also needs to deal with data that does not cleanly encode/decode using utf-8. + There's a lot of crap out there, e.g. in filenames and as a backup tool, we must keep them as good as possible. + +Unpacking +--------- +- raw = True (the old way, used by borg <= 1.3) + This is currently still needed to not try to decode "raw" msgpack objects. + These could come either from str (new or old msgpack) or bytes (old msgpack). + Thus, we basically must know what we want and either keep the bytes we get + or decode them to str, if we want str. + +- raw = False (the new way) + This can be used in future, when we do not have to deal with data any more that was packed the old way. + It will then unpack according to the msgpack 2.0 spec format and directly output bytes or str. + +- unicode_errors = 'surrogateescape' -> see description above (will be used when raw is False). + +As of borg 1.3, we have the first part on the way to fix the msgpack str/bytes mess, #968. +borg now still needs to **read** old repos, archives, keys, ... so we can not yet fix it completely. +But from now on, borg only **writes** new data according to the new msgpack spec, +thus we can complete the fix for #968 in a later borg release. + +current way in msgpack terms +---------------------------- + +- pack with use_bin_type=True (according to msgpack 2.0 spec) +- packs str -> raw and bytes -> bin +- unpack with raw=True (aka "the old way") +- unpacks raw to bytes (thus we always need to decode manually if we want str) +""" + from .datastruct import StableDict from ..constants import * # NOQA -# wrapping msgpack --------------------------------------------------------------------------------------------------- -# -# due to the planned breaking api changes in upstream msgpack, we wrap it the way we need it - -# to avoid having lots of clutter in the calling code. see tickets #968 and #3632. -# -# Packing -# ------- -# use_bin_type = False is needed to generate the old msgpack format (not msgpack 2.0 spec) as borg always did. -# unicode_errors = None is needed because usage of it is deprecated -# -# Unpacking -# --------- -# raw = True is needed to unpack the old msgpack format to bytes (not str, about the decoding see item.pyx). -# unicode_errors = None is needed because usage of it is deprecated - from msgpack import Packer as mp_Packer from msgpack import packb as mp_packb from msgpack import pack as mp_pack @@ -30,6 +65,10 @@ version = mp_version +USE_BIN_TYPE = True +RAW = True # should become False later when we do not need to read old stuff any more +UNICODE_ERRORS = 'surrogateescape' # previously done by safe_encode, safe_decode + class PackException(Exception): """Exception while msgpack packing""" @@ -40,10 +79,10 @@ class UnpackException(Exception): class Packer(mp_Packer): - def __init__(self, *, default=None, unicode_errors=None, - use_single_float=False, autoreset=True, use_bin_type=False, + def __init__(self, *, default=None, unicode_errors=UNICODE_ERRORS, + use_single_float=False, autoreset=True, use_bin_type=USE_BIN_TYPE, strict_types=False): - assert unicode_errors is None + assert unicode_errors == UNICODE_ERRORS super().__init__(default=default, unicode_errors=unicode_errors, use_single_float=use_single_float, autoreset=autoreset, use_bin_type=use_bin_type, strict_types=strict_types) @@ -55,16 +94,16 @@ def pack(self, obj): raise PackException(e) -def packb(o, *, use_bin_type=False, unicode_errors=None, **kwargs): - assert unicode_errors is None +def packb(o, *, use_bin_type=USE_BIN_TYPE, unicode_errors=UNICODE_ERRORS, **kwargs): + assert unicode_errors == UNICODE_ERRORS try: return mp_packb(o, use_bin_type=use_bin_type, unicode_errors=unicode_errors, **kwargs) except Exception as e: raise PackException(e) -def pack(o, stream, *, use_bin_type=False, unicode_errors=None, **kwargs): - assert unicode_errors is None +def pack(o, stream, *, use_bin_type=USE_BIN_TYPE, unicode_errors=UNICODE_ERRORS, **kwargs): + assert unicode_errors == UNICODE_ERRORS try: return mp_pack(o, stream, use_bin_type=use_bin_type, unicode_errors=unicode_errors, **kwargs) except Exception as e: @@ -72,13 +111,13 @@ def pack(o, stream, *, use_bin_type=False, unicode_errors=None, **kwargs): class Unpacker(mp_Unpacker): - def __init__(self, file_like=None, *, read_size=0, use_list=True, raw=True, + def __init__(self, file_like=None, *, read_size=0, use_list=True, raw=RAW, object_hook=None, object_pairs_hook=None, list_hook=None, - unicode_errors=None, max_buffer_size=0, + unicode_errors=UNICODE_ERRORS, max_buffer_size=0, ext_hook=ExtType, strict_map_key=False): - assert raw is True - assert unicode_errors is None + assert raw == RAW + assert unicode_errors == UNICODE_ERRORS kw = dict(file_like=file_like, read_size=read_size, use_list=use_list, raw=raw, object_hook=object_hook, object_pairs_hook=object_pairs_hook, list_hook=list_hook, unicode_errors=unicode_errors, max_buffer_size=max_buffer_size, @@ -105,10 +144,11 @@ def __next__(self): next = __next__ -def unpackb(packed, *, raw=True, unicode_errors=None, +def unpackb(packed, *, raw=RAW, unicode_errors=UNICODE_ERRORS, strict_map_key=False, **kwargs): - assert unicode_errors is None + assert raw == RAW + assert unicode_errors == UNICODE_ERRORS try: kw = dict(raw=raw, unicode_errors=unicode_errors, strict_map_key=strict_map_key) @@ -118,10 +158,11 @@ def unpackb(packed, *, raw=True, unicode_errors=None, raise UnpackException(e) -def unpack(stream, *, raw=True, unicode_errors=None, +def unpack(stream, *, raw=RAW, unicode_errors=UNICODE_ERRORS, strict_map_key=False, **kwargs): - assert unicode_errors is None + # assert raw == RAW + assert unicode_errors == UNICODE_ERRORS try: kw = dict(raw=raw, unicode_errors=unicode_errors, strict_map_key=strict_map_key) diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 9ea76f2de4..89f476c1a4 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -2,7 +2,6 @@ import stat from collections import namedtuple from .constants import ITEM_KEYS, ARCHIVE_KEYS -from .helpers import safe_encode, safe_decode from .helpers import StableDict from .helpers import format_file_size from .helpers.msgpack import timestamp_to_int, int_to_timestamp @@ -16,6 +15,51 @@ cdef extern from "_item.c": API_VERSION = '1.2_01' +def fix_key(data, key): + """if k is a bytes-typed key, migrate key/value to a str-typed key in dict data""" + if isinstance(key, bytes): + value = data.pop(key) + key = key.decode() + data[key] = value + assert isinstance(key, str) + return key + + +def fix_str_value(data, key, errors='surrogateescape'): + """makes sure that data[key] is a str (decode if it is bytes)""" + assert isinstance(key, str) # fix_key must be called first + value = data[key] + if isinstance(value, bytes): + value = value.decode('utf-8', errors=errors) + data[key] = value + assert isinstance(value, str) + return value + + +def fix_list_of_str(t): + """make sure we have a list of str""" + assert isinstance(t, (tuple, list)) + l = [e.decode() if isinstance(e, bytes) else e for e in t] + assert all(isinstance(e, str) for e in l), repr(l) + return l + + +def fix_tuple_of_str(t): + """make sure we have a tuple of str""" + assert isinstance(t, (tuple, list)) + t = tuple(e.decode() if isinstance(e, bytes) else e for e in t) + assert all(isinstance(e, str) for e in t), repr(t) + return t + + +def fix_tuple_of_str_and_int(t): + """make sure we have a tuple of str""" + assert isinstance(t, (tuple, list)) + t = tuple(e.decode() if isinstance(e, bytes) else e for e in t) + assert all(isinstance(e, (str, int)) for e in t), repr(t) + return t + + class PropDict: """ Manage a dictionary via properties. @@ -155,10 +199,10 @@ class Item(PropDict): # properties statically defined, so that IDEs can know their names: - path = PropDict._make_property('path', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) - source = PropDict._make_property('source', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) - user = PropDict._make_property('user', (str, type(None)), 'surrogate-escaped str or None', encode=safe_encode, decode=safe_decode) - group = PropDict._make_property('group', (str, type(None)), 'surrogate-escaped str or None', encode=safe_encode, decode=safe_decode) + path = PropDict._make_property('path', str, 'surrogate-escaped str') + source = PropDict._make_property('source', str, 'surrogate-escaped str') + user = PropDict._make_property('user', (str, type(None)), 'surrogate-escaped str or None') + group = PropDict._make_property('group', (str, type(None)), 'surrogate-escaped str or None') acl_access = PropDict._make_property('acl_access', bytes) acl_default = PropDict._make_property('acl_default', bytes) @@ -290,6 +334,14 @@ class Item(PropDict): except AttributeError: return False + def update_internal(self, d): + # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str) + for k, v in list(d.items()): + k = fix_key(d, k) + if k in ('path', 'source', 'user', 'group'): + v = fix_str_value(d, k) + self._dict[k] = v + class EncryptedKey(PropDict): """ @@ -309,7 +361,7 @@ class EncryptedKey(PropDict): __slots__ = ("_dict", ) # avoid setting attributes not supported by properties version = PropDict._make_property('version', int) - algorithm = PropDict._make_property('algorithm', str, encode=str.encode, decode=bytes.decode) + algorithm = PropDict._make_property('algorithm', str) iterations = PropDict._make_property('iterations', int) salt = PropDict._make_property('salt', bytes) hash = PropDict._make_property('hash', bytes) @@ -317,7 +369,17 @@ class EncryptedKey(PropDict): argon2_time_cost = PropDict._make_property('argon2_time_cost', int) argon2_memory_cost = PropDict._make_property('argon2_memory_cost', int) argon2_parallelism = PropDict._make_property('argon2_parallelism', int) - argon2_type = PropDict._make_property('argon2_type', str, encode=str.encode, decode=bytes.decode) + argon2_type = PropDict._make_property('argon2_type', str) + + def update_internal(self, d): + # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str) + for k, v in list(d.items()): + k = fix_key(d, k) + if k == 'version': + assert isinstance(v, int) + if k in ('algorithm', 'argon2_type'): + v = fix_str_value(d, k) + self._dict[k] = v class Key(PropDict): @@ -344,17 +406,13 @@ class Key(PropDict): chunk_seed = PropDict._make_property('chunk_seed', int) tam_required = PropDict._make_property('tam_required', bool) - -def tuple_encode(t): - """encode a tuple that might contain str items""" - # we have str, but want to give bytes to msgpack.pack - return tuple(safe_encode(e) if isinstance(e, str) else e for e in t) - - -def tuple_decode(t): - """decode a tuple that might contain bytes items""" - # we get bytes objects from msgpack.unpack, but want str - return tuple(safe_decode(e) if isinstance(e, bytes) else e for e in t) + def update_internal(self, d): + # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str) + for k, v in list(d.items()): + k = fix_key(d, k) + if k == 'version': + assert isinstance(v, int) + self._dict[k] = v class ArchiveItem(PropDict): @@ -374,15 +432,15 @@ class ArchiveItem(PropDict): __slots__ = ("_dict", ) # avoid setting attributes not supported by properties version = PropDict._make_property('version', int) - name = PropDict._make_property('name', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) + name = PropDict._make_property('name', str, 'surrogate-escaped str') items = PropDict._make_property('items', list) cmdline = PropDict._make_property('cmdline', list) # list of s-e-str - hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) - username = PropDict._make_property('username', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) - time = PropDict._make_property('time', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) - time_end = PropDict._make_property('time_end', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) - comment = PropDict._make_property('comment', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) - chunker_params = PropDict._make_property('chunker_params', tuple, 'chunker-params tuple', encode=tuple_encode, decode=tuple_decode) + hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str') + username = PropDict._make_property('username', str, 'surrogate-escaped str') + time = PropDict._make_property('time', str) + time_end = PropDict._make_property('time_end', str) + comment = PropDict._make_property('comment', str, 'surrogate-escaped str') + chunker_params = PropDict._make_property('chunker_params', tuple) recreate_cmdline = PropDict._make_property('recreate_cmdline', list) # list of s-e-str # recreate_source_id, recreate_args, recreate_partial_chunks were used in 1.1.0b1 .. b2 recreate_source_id = PropDict._make_property('recreate_source_id', bytes) @@ -395,6 +453,22 @@ class ArchiveItem(PropDict): csize_parts = PropDict._make_property('csize_parts', int) nfiles_parts = PropDict._make_property('nfiles_parts', int) + def update_internal(self, d): + # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str) + for k, v in list(d.items()): + k = fix_key(d, k) + if k == 'version': + assert isinstance(v, int) + if k in ('name', 'hostname', 'username', 'comment'): + v = fix_str_value(d, k) + if k in ('time', 'time_end'): + v = fix_str_value(d, k, 'replace') + if k == 'chunker_params': + v = fix_tuple_of_str_and_int(v) + if k in ('cmdline', 'recreate_cmdline'): + v = fix_list_of_str(v) + self._dict[k] = v + class ManifestItem(PropDict): """ @@ -413,10 +487,52 @@ class ManifestItem(PropDict): __slots__ = ("_dict", ) # avoid setting attributes not supported by properties version = PropDict._make_property('version', int) - archives = PropDict._make_property('archives', dict) # name -> dict - timestamp = PropDict._make_property('timestamp', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) + archives = PropDict._make_property('archives', dict, 'dict of str -> dict') # name -> dict + timestamp = PropDict._make_property('timestamp', str) config = PropDict._make_property('config', dict) - item_keys = PropDict._make_property('item_keys', tuple) + item_keys = PropDict._make_property('item_keys', tuple, 'tuple of str') + + def update_internal(self, d): + # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str) + for k, v in list(d.items()): + k = fix_key(d, k) + if k == 'version': + assert isinstance(v, int) + if k == 'archives': + ad = v + assert isinstance(ad, dict) + for ak, av in list(ad.items()): + ak = fix_key(ad, ak) + assert isinstance(av, dict) + for ik, iv in list(av.items()): + ik = fix_key(av, ik) + assert set(av) == {'id', 'time'} + assert isinstance(av['id'], bytes) + fix_str_value(av, 'time') + if k == 'timestamp': + v = fix_str_value(d, k, 'replace') + if k == 'config': + cd = v + assert isinstance(cd, dict) + for ck, cv in list(cd.items()): + ck = fix_key(cd, ck) + if ck == 'tam_required': + assert isinstance(cv, bool) + if ck == 'feature_flags': + assert isinstance(cv, dict) + ops = {'read', 'check', 'write', 'delete'} + for op, specs in list(cv.items()): + op = fix_key(cv, op) + assert op in ops + for speck, specv in list(specs.items()): + speck = fix_key(specs, speck) + if speck == 'mandatory': + specs[speck] = fix_tuple_of_str(specv) + assert set(cv).issubset(ops) + if k == 'item_keys': + v = fix_tuple_of_str(v) + self._dict[k] = v + class ItemDiff: """ diff --git a/src/borg/remote.py b/src/borg/remote.py index 2870d71d98..8de302871f 100644 --- a/src/borg/remote.py +++ b/src/borg/remote.py @@ -38,7 +38,8 @@ RPC_PROTOCOL_VERSION = 2 BORG_VERSION = parse_version(__version__) -MSGID, MSG, ARGS, RESULT = b'i', b'm', b'a', b'r' +MSGID, MSG, ARGS, RESULT = 'i', 'm', 'a', 'r' # pack +MSGIDB, MSGB, ARGSB, RESULTB = b'i', b'm', b'a', b'r' # unpack MAX_INFLIGHT = 100 @@ -216,9 +217,9 @@ def serve(self): for unpacked in unpacker: if isinstance(unpacked, dict): dictFormat = True - msgid = unpacked[MSGID] - method = unpacked[MSG].decode() - args = decode_keys(unpacked[ARGS]) + msgid = unpacked[MSGIDB] + method = unpacked[MSGB].decode() + args = decode_keys(unpacked[ARGSB]) elif isinstance(unpacked, tuple) and len(unpacked) == 4: dictFormat = False # The first field 'type' was always 1 and has always been ignored @@ -256,21 +257,21 @@ def serve(self): try: msg = msgpack.packb({MSGID: msgid, - b'exception_class': e.__class__.__name__, - b'exception_args': e.args, - b'exception_full': ex_full, - b'exception_short': ex_short, - b'exception_trace': ex_trace, - b'sysinfo': sysinfo()}) + 'exception_class': e.__class__.__name__, + 'exception_args': e.args, + 'exception_full': ex_full, + 'exception_short': ex_short, + 'exception_trace': ex_trace, + 'sysinfo': sysinfo()}) except TypeError: msg = msgpack.packb({MSGID: msgid, - b'exception_class': e.__class__.__name__, - b'exception_args': [x if isinstance(x, (str, bytes, int)) else None - for x in e.args], - b'exception_full': ex_full, - b'exception_short': ex_short, - b'exception_trace': ex_trace, - b'sysinfo': sysinfo()}) + 'exception_class': e.__class__.__name__, + 'exception_args': [x if isinstance(x, (str, bytes, int)) else None + for x in e.args], + 'exception_full': ex_full, + 'exception_short': ex_short, + 'exception_trace': ex_trace, + 'sysinfo': sysinfo()}) os_write(stdout_fd, msg) else: @@ -570,7 +571,7 @@ def __init__(self, location, create=False, exclusive=False, lock_wait=None, lock try: try: version = self.call('negotiate', {'client_data': { - b'client_version': BORG_VERSION, + 'client_version': BORG_VERSION, }}) except ConnectionClosed: raise ConnectionClosedWithHint('Is borg working on the server?') from None @@ -791,7 +792,7 @@ def handle_error(unpacked): if b'exception_class' in unpacked: handle_error(unpacked) else: - yield unpacked[RESULT] + yield unpacked[RESULTB] if not waiting_for and not calls: return except KeyError: @@ -811,7 +812,7 @@ def handle_error(unpacked): if b'exception_class' in unpacked: handle_error(unpacked) else: - yield unpacked[RESULT] + yield unpacked[RESULTB] if self.to_send or ((calls or self.preload_ids) and len(waiting_for) < MAX_INFLIGHT): w_fds = [self.stdin_fd] else: @@ -828,15 +829,15 @@ def handle_error(unpacked): self.unpacker.feed(data) for unpacked in self.unpacker: if isinstance(unpacked, dict): - msgid = unpacked[MSGID] + msgid = unpacked[MSGIDB] elif isinstance(unpacked, tuple) and len(unpacked) == 4: # The first field 'type' was always 1 and has always been ignored _, msgid, error, res = unpacked if error: # ignore res, because it is only a fixed string anyway. - unpacked = {MSGID: msgid, b'exception_class': error} + unpacked = {MSGIDB: msgid, b'exception_class': error} else: - unpacked = {MSGID: msgid, RESULT: res} + unpacked = {MSGIDB: msgid, RESULTB: res} else: raise UnexpectedRPCDataFormatFromServer(data) if msgid in self.ignore_responses: @@ -847,7 +848,7 @@ def handle_error(unpacked): else: # we currently do not have async result values except "None", # so we do not add them into async_responses. - if unpacked[RESULT] is not None: + if unpacked[RESULTB] is not None: self.async_responses[msgid] = unpacked else: self.responses[msgid] = unpacked diff --git a/src/borg/testsuite/archive.py b/src/borg/testsuite/archive.py index 3f8535ff5a..0eed9f7e8e 100644 --- a/src/borg/testsuite/archive.py +++ b/src/borg/testsuite/archive.py @@ -186,8 +186,8 @@ def process(self, input): return result def test_extra_garbage_no_sync(self): - chunks = [(False, [self.make_chunks([b'foo', b'bar'])]), - (False, [b'garbage'] + [self.make_chunks([b'boo', b'baz'])])] + chunks = [(False, [self.make_chunks(['foo', 'bar'])]), + (False, [b'garbage'] + [self.make_chunks(['boo', 'baz'])])] result = self.process(chunks) self.assert_equal(result, [ {b'path': b'foo'}, {b'path': b'bar'}, @@ -203,19 +203,19 @@ def split(self, left, length): return parts def test_correct_stream(self): - chunks = self.split(self.make_chunks([b'foo', b'bar', b'boo', b'baz']), 2) + chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 2) input = [(False, chunks)] result = self.process(input) self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'bar'}, {b'path': b'boo'}, {b'path': b'baz'}]) def test_missing_chunk(self): - chunks = self.split(self.make_chunks([b'foo', b'bar', b'boo', b'baz']), 4) + chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 4) input = [(False, chunks[:3]), (True, chunks[4:])] result = self.process(input) self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}]) def test_corrupt_chunk(self): - chunks = self.split(self.make_chunks([b'foo', b'bar', b'boo', b'baz']), 4) + chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 4) input = [(False, chunks[:3]), (True, [b'gar', b'bage'] + chunks[3:])] result = self.process(input) self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}]) @@ -242,7 +242,7 @@ def test_invalid_msgpacked_item(packed, item_keys_serialized): @pytest.mark.parametrize('packed', [msgpack.packb(o) for o in [ - {b'path': b'/a/b/c'}, # small (different msgpack mapping type!) + {'path': b'/a/b/c'}, # small (different msgpack mapping type!) OrderedDict((k, b'') for k in IK), # as big (key count) as it gets OrderedDict((k, b'x' * 1000) for k in IK), # as big (key count and volume) as it gets ]]) @@ -251,7 +251,7 @@ def test_valid_msgpacked_items(packed, item_keys_serialized): def test_key_length_msgpacked_items(): - key = b'x' * 32 # 31 bytes is the limit for fixstr msgpack type + key = 'x' * 32 # 31 bytes is the limit for fixstr msgpack type data = {key: b''} item_keys_serialized = [msgpack.packb(key), ] assert valid_msgpacked_dict(msgpack.packb(data), item_keys_serialized) diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index 32feeb44ee..f3315b676b 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -1810,7 +1810,7 @@ def test_create_dry_run(self): def add_unknown_feature(self, operation): with Repository(self.repository_path, exclusive=True) as repository: manifest, key = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) - manifest.config[b'feature_flags'] = {operation.value.encode(): {b'mandatory': [b'unknown-feature']}} + manifest.config['feature_flags'] = {operation.value: {'mandatory': ['unknown-feature']}} manifest.write() repository.commit(compact=False) @@ -3640,13 +3640,13 @@ def verify_change_passphrase_does_not_change_algorithm(self, given_algorithm, ex with Repository(self.repository_path) as repository: key = msgpack.unpackb(a2b_base64(repository.load_key())) - assert key[b'algorithm'] == expected_algorithm + assert key[b'algorithm'] == expected_algorithm.encode() def test_change_passphrase_does_not_change_algorithm_argon2(self): - self.verify_change_passphrase_does_not_change_algorithm('argon2', b'argon2 chacha20-poly1305') + self.verify_change_passphrase_does_not_change_algorithm('argon2', 'argon2 chacha20-poly1305') def test_change_passphrase_does_not_change_algorithm_pbkdf2(self): - self.verify_change_passphrase_does_not_change_algorithm('pbkdf2', b'sha256') + self.verify_change_passphrase_does_not_change_algorithm('pbkdf2', 'sha256') def verify_change_location_does_not_change_algorithm(self, given_algorithm, expected_algorithm): self.cmd('init', '--encryption=keyfile', '--key-algorithm', given_algorithm, self.repository_location) @@ -3655,13 +3655,13 @@ def verify_change_location_does_not_change_algorithm(self, given_algorithm, expe with Repository(self.repository_path) as repository: key = msgpack.unpackb(a2b_base64(repository.load_key())) - assert key[b'algorithm'] == expected_algorithm + assert key[b'algorithm'] == expected_algorithm.encode() def test_change_location_does_not_change_algorithm_argon2(self): - self.verify_change_location_does_not_change_algorithm('argon2', b'argon2 chacha20-poly1305') + self.verify_change_location_does_not_change_algorithm('argon2', 'argon2 chacha20-poly1305') def test_change_location_does_not_change_algorithm_pbkdf2(self): - self.verify_change_location_does_not_change_algorithm('pbkdf2', b'sha256') + self.verify_change_location_does_not_change_algorithm('pbkdf2', 'sha256') def test_key_change_algorithm(self): self.cmd('init', '--encryption=repokey', '--key-algorithm=pbkdf2', self.repository_location) diff --git a/src/borg/testsuite/item.py b/src/borg/testsuite/item.py index 80b38edce4..94167e7ea9 100644 --- a/src/borg/testsuite/item.py +++ b/src/borg/testsuite/item.py @@ -102,7 +102,7 @@ def test_item_se_str_property(): item = Item() item.path = '/a/b/c' assert item.path == '/a/b/c' - assert item.as_dict() == {'path': b'/a/b/c'} + assert item.as_dict() == {'path': '/a/b/c'} del item.path assert item.as_dict() == {} with pytest.raises(TypeError): @@ -111,11 +111,11 @@ def test_item_se_str_property(): # non-utf-8 path, needing surrogate-escaping for latin-1 u-umlaut item = Item(internal_dict={'path': b'/a/\xfc/c'}) assert item.path == '/a/\udcfc/c' # getting a surrogate-escaped representation - assert item.as_dict() == {'path': b'/a/\xfc/c'} + assert item.as_dict() == {'path': '/a/\udcfc/c'} del item.path assert 'path' not in item item.path = '/a/\udcfc/c' # setting using a surrogate-escaped representation - assert item.as_dict() == {'path': b'/a/\xfc/c'} + assert item.as_dict() == {'path': '/a/\udcfc/c'} def test_item_list_property(): From 8e87f1111b02f35c2ac14c0605b60ccd8db486b8 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 6 May 2022 03:59:10 +0200 Subject: [PATCH 02/14] cleanup msgpack related str/bytes mess, fixes #968 see ticket and borg.helpers.msgpack docstring. this changeset implements the full migration to msgpack 2.0 spec (use_bin_type=True, raw=False). still needed compat to the past is done via want_bytes decoder in borg.item. --- src/borg/archive.py | 7 +-- src/borg/archiver.py | 2 +- src/borg/crypto/key.py | 10 ++--- src/borg/helpers/msgpack.py | 34 +++++++------- src/borg/item.pyx | 35 +++++++++------ src/borg/remote.py | 77 +++++++++++++++----------------- src/borg/repository.py | 44 +++++++++--------- src/borg/testsuite/archive.py | 14 +++--- src/borg/testsuite/archiver.py | 10 ++--- src/borg/testsuite/key.py | 16 +++---- src/borg/testsuite/repository.py | 4 +- 11 files changed, 124 insertions(+), 129 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index dc244a9f15..ff6ca7ce68 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1718,13 +1718,10 @@ def rebuild_manifest(self): Iterates through all objects in the repository looking for archive metadata blocks. """ - required_archive_keys = frozenset(key.encode() for key in REQUIRED_ARCHIVE_KEYS) - def valid_archive(obj): if not isinstance(obj, dict): return False - keys = set(obj) - return required_archive_keys.issubset(keys) + return REQUIRED_ARCHIVE_KEYS.issubset(obj) logger.info('Rebuilding missing manifest, this might take some time...') # as we have lost the manifest, we do not know any more what valid item keys we had. @@ -1904,7 +1901,7 @@ def list_keys_safe(keys): def valid_item(obj): if not isinstance(obj, StableDict): return False, 'not a dictionary' - keys = set(k.decode('utf-8', errors='replace') for k in obj) + keys = set(obj) if not required_item_keys.issubset(keys): return False, 'missing required keys: ' + list_keys_safe(required_item_keys - keys) if not keys.issubset(item_keys): diff --git a/src/borg/archiver.py b/src/borg/archiver.py index acef0db5e0..a91e43c548 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -2331,7 +2331,7 @@ def output(fd): unpacker = msgpack.Unpacker(use_list=False, object_hook=StableDict) first = True - for item_id in archive_org_dict[b'items']: + for item_id in archive_org_dict['items']: data = key.decrypt(item_id, repository.get(item_id)) unpacker.feed(data) for item in unpacker: diff --git a/src/borg/crypto/key.py b/src/borg/crypto/key.py index 6ca6fbac02..15df53d001 100644 --- a/src/borg/crypto/key.py +++ b/src/borg/crypto/key.py @@ -232,24 +232,24 @@ def unpack_and_verify_manifest(self, data, force_tam_not_required=False): unpacker = get_limited_unpacker('manifest') unpacker.feed(data) unpacked = unpacker.unpack() - if b'tam' not in unpacked: + if 'tam' not in unpacked: if tam_required: raise TAMRequiredError(self.repository._location.canonical_path()) else: logger.debug('TAM not found and not required') return unpacked, False - tam = unpacked.pop(b'tam', None) + tam = unpacked.pop('tam', None) if not isinstance(tam, dict): raise TAMInvalid() - tam_type = tam.get(b'type', b'').decode('ascii', 'replace') + tam_type = tam.get('type', '') if tam_type != 'HKDF_HMAC_SHA512': if tam_required: raise TAMUnsupportedSuiteError(repr(tam_type)) else: logger.debug('Ignoring TAM made with unsupported suite, since TAM is not required: %r', tam_type) return unpacked, False - tam_hmac = tam.get(b'hmac') - tam_salt = tam.get(b'salt') + tam_hmac = tam.get('hmac') + tam_salt = tam.get('salt') if not isinstance(tam_salt, bytes) or not isinstance(tam_hmac, bytes): raise TAMInvalid() offset = data.index(tam_hmac) diff --git a/src/borg/helpers/msgpack.py b/src/borg/helpers/msgpack.py index 5a2edecd67..268ee30e75 100644 --- a/src/borg/helpers/msgpack.py +++ b/src/borg/helpers/msgpack.py @@ -2,8 +2,7 @@ wrapping msgpack ================ -Due to the planned breaking api changes in upstream msgpack, we wrap it the way we need it - -to avoid having lots of clutter in the calling code. see tickets #968 and #3632. +We wrap msgpack here the way we need it - to avoid having lots of clutter in the calling code. Packing ------- @@ -22,30 +21,27 @@ Unpacking --------- -- raw = True (the old way, used by borg <= 1.3) - This is currently still needed to not try to decode "raw" msgpack objects. - These could come either from str (new or old msgpack) or bytes (old msgpack). - Thus, we basically must know what we want and either keep the bytes we get - or decode them to str, if we want str. - -- raw = False (the new way) - This can be used in future, when we do not have to deal with data any more that was packed the old way. +- raw = False (used by borg since borg 1.3) + We already can use this with borg 1.3 due to the want_bytes decoder. + This decoder can be removed in future, when we do not have to deal with data any more that was packed the old way. It will then unpack according to the msgpack 2.0 spec format and directly output bytes or str. +- raw = True (the old way, used by borg < 1.3) + - unicode_errors = 'surrogateescape' -> see description above (will be used when raw is False). -As of borg 1.3, we have the first part on the way to fix the msgpack str/bytes mess, #968. -borg now still needs to **read** old repos, archives, keys, ... so we can not yet fix it completely. -But from now on, borg only **writes** new data according to the new msgpack spec, -thus we can complete the fix for #968 in a later borg release. +As of borg 1.3, we have fixed most of the msgpack str/bytes mess, #968. +Borg now still needs to **read** old repos, archives, keys, ... so we can not yet fix it completely. +But from now on, borg only **writes** new data according to the new msgpack 2.0 spec, +thus we can remove some legacy support in a later borg release (some places are marked with "legacy"). current way in msgpack terms ---------------------------- - pack with use_bin_type=True (according to msgpack 2.0 spec) - packs str -> raw and bytes -> bin -- unpack with raw=True (aka "the old way") -- unpacks raw to bytes (thus we always need to decode manually if we want str) +- unpack with raw=False (according to msgpack 2.0 spec, using unicode_errors='surrogateescape') +- unpacks bin to bytes and raw to str (thus we need to re-encode manually if we want bytes from "raw") """ from .datastruct import StableDict @@ -66,8 +62,8 @@ version = mp_version USE_BIN_TYPE = True -RAW = True # should become False later when we do not need to read old stuff any more -UNICODE_ERRORS = 'surrogateescape' # previously done by safe_encode, safe_decode +RAW = False +UNICODE_ERRORS = 'surrogateescape' class PackException(Exception): @@ -161,7 +157,7 @@ def unpackb(packed, *, raw=RAW, unicode_errors=UNICODE_ERRORS, def unpack(stream, *, raw=RAW, unicode_errors=UNICODE_ERRORS, strict_map_key=False, **kwargs): - # assert raw == RAW + assert raw == RAW assert unicode_errors == UNICODE_ERRORS try: kw = dict(raw=raw, unicode_errors=unicode_errors, diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 89f476c1a4..4a6c811630 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -60,6 +60,15 @@ def fix_tuple_of_str_and_int(t): return t +def want_bytes(v): + """we know that we want bytes and the value should be bytes""" + # legacy support: it being str can be caused by msgpack unpack decoding old data that was packed with use_bin_type=False + if isinstance(v, str): + v = v.encode('utf-8', errors='surrogateescape') + assert isinstance(v, bytes) + return v + + class PropDict: """ Manage a dictionary via properties. @@ -204,10 +213,10 @@ class Item(PropDict): user = PropDict._make_property('user', (str, type(None)), 'surrogate-escaped str or None') group = PropDict._make_property('group', (str, type(None)), 'surrogate-escaped str or None') - acl_access = PropDict._make_property('acl_access', bytes) - acl_default = PropDict._make_property('acl_default', bytes) - acl_extended = PropDict._make_property('acl_extended', bytes) - acl_nfs4 = PropDict._make_property('acl_nfs4', bytes) + acl_access = PropDict._make_property('acl_access', bytes, decode=want_bytes) + acl_default = PropDict._make_property('acl_default', bytes, decode=want_bytes) + acl_extended = PropDict._make_property('acl_extended', bytes, decode=want_bytes) + acl_nfs4 = PropDict._make_property('acl_nfs4', bytes, decode=want_bytes) mode = PropDict._make_property('mode', int) uid = PropDict._make_property('uid', int) @@ -224,7 +233,7 @@ class Item(PropDict): # compatibility note: this is a new feature, in old archives size will be missing. size = PropDict._make_property('size', int) - hlid = PropDict._make_property('hlid', bytes) # hard link id: same value means same hard link. + hlid = PropDict._make_property('hlid', bytes, decode=want_bytes) # hard link id: same value means same hard link. hardlink_master = PropDict._make_property('hardlink_master', bool) # legacy chunks = PropDict._make_property('chunks', (list, type(None)), 'list or None') @@ -363,9 +372,9 @@ class EncryptedKey(PropDict): version = PropDict._make_property('version', int) algorithm = PropDict._make_property('algorithm', str) iterations = PropDict._make_property('iterations', int) - salt = PropDict._make_property('salt', bytes) - hash = PropDict._make_property('hash', bytes) - data = PropDict._make_property('data', bytes) + salt = PropDict._make_property('salt', bytes, decode=want_bytes) + hash = PropDict._make_property('hash', bytes, decode=want_bytes) + data = PropDict._make_property('data', bytes, decode=want_bytes) argon2_time_cost = PropDict._make_property('argon2_time_cost', int) argon2_memory_cost = PropDict._make_property('argon2_memory_cost', int) argon2_parallelism = PropDict._make_property('argon2_parallelism', int) @@ -399,10 +408,10 @@ class Key(PropDict): __slots__ = ("_dict", ) # avoid setting attributes not supported by properties version = PropDict._make_property('version', int) - repository_id = PropDict._make_property('repository_id', bytes) - enc_key = PropDict._make_property('enc_key', bytes) - enc_hmac_key = PropDict._make_property('enc_hmac_key', bytes) - id_key = PropDict._make_property('id_key', bytes) + repository_id = PropDict._make_property('repository_id', bytes, decode=want_bytes) + enc_key = PropDict._make_property('enc_key', bytes, decode=want_bytes) + enc_hmac_key = PropDict._make_property('enc_hmac_key', bytes, decode=want_bytes) + id_key = PropDict._make_property('id_key', bytes, decode=want_bytes) chunk_seed = PropDict._make_property('chunk_seed', int) tam_required = PropDict._make_property('tam_required', bool) @@ -443,7 +452,7 @@ class ArchiveItem(PropDict): chunker_params = PropDict._make_property('chunker_params', tuple) recreate_cmdline = PropDict._make_property('recreate_cmdline', list) # list of s-e-str # recreate_source_id, recreate_args, recreate_partial_chunks were used in 1.1.0b1 .. b2 - recreate_source_id = PropDict._make_property('recreate_source_id', bytes) + recreate_source_id = PropDict._make_property('recreate_source_id', bytes, decode=want_bytes) recreate_args = PropDict._make_property('recreate_args', list) # list of s-e-str recreate_partial_chunks = PropDict._make_property('recreate_partial_chunks', list) # list of tuples size = PropDict._make_property('size', int) diff --git a/src/borg/remote.py b/src/borg/remote.py index 8de302871f..6ea51d3c3d 100644 --- a/src/borg/remote.py +++ b/src/borg/remote.py @@ -38,8 +38,7 @@ RPC_PROTOCOL_VERSION = 2 BORG_VERSION = parse_version(__version__) -MSGID, MSG, ARGS, RESULT = 'i', 'm', 'a', 'r' # pack -MSGIDB, MSGB, ARGSB, RESULTB = b'i', b'm', b'a', b'r' # unpack +MSGID, MSG, ARGS, RESULT = 'i', 'm', 'a', 'r' MAX_INFLIGHT = 100 @@ -139,10 +138,6 @@ def __init__(self, data): } -def decode_keys(d): - return {k.decode(): d[k] for k in d} - - class RepositoryServer: # pragma: no cover rpc_methods = ( '__len__', @@ -217,14 +212,13 @@ def serve(self): for unpacked in unpacker: if isinstance(unpacked, dict): dictFormat = True - msgid = unpacked[MSGIDB] - method = unpacked[MSGB].decode() - args = decode_keys(unpacked[ARGSB]) + msgid = unpacked[MSGID] + method = unpacked[MSG] + args = unpacked[ARGS] elif isinstance(unpacked, tuple) and len(unpacked) == 4: dictFormat = False # The first field 'type' was always 1 and has always been ignored _, msgid, method, args = unpacked - method = method.decode() args = self.positional_to_named(method, args) else: if self.repository is not None: @@ -308,7 +302,7 @@ def negotiate(self, client_data): # clients since 1.1.0b3 use a dict as client_data # clients since 1.1.0b6 support json log format from server if isinstance(client_data, dict): - self.client_version = client_data[b'client_version'] + self.client_version = client_data['client_version'] level = logging.getLevelName(logging.getLogger('').level) setup_logging(is_serve=True, json=True, level=level) logger.debug('Initialized logging system for JSON-based protocol') @@ -370,7 +364,6 @@ def open(self, path, create=False, lock_wait=None, lock=True, exclusive=None, ap return self.repository.id def inject_exception(self, kind): - kind = kind.decode() s1 = 'test string' s2 = 'test string2' if kind == 'DoesNotExist': @@ -484,35 +477,35 @@ class RemoteRepository: class RPCError(Exception): def __init__(self, unpacked): - # for borg < 1.1: unpacked only has b'exception_class' as key - # for borg 1.1+: unpacked has keys: b'exception_args', b'exception_full', b'exception_short', b'sysinfo' + # for borg < 1.1: unpacked only has 'exception_class' as key + # for borg 1.1+: unpacked has keys: 'exception_args', 'exception_full', 'exception_short', 'sysinfo' self.unpacked = unpacked def get_message(self): - if b'exception_short' in self.unpacked: - return b'\n'.join(self.unpacked[b'exception_short']).decode() + if 'exception_short' in self.unpacked: + return '\n'.join(self.unpacked['exception_short']) else: return self.exception_class @property def traceback(self): - return self.unpacked.get(b'exception_trace', True) + return self.unpacked.get('exception_trace', True) @property def exception_class(self): - return self.unpacked[b'exception_class'].decode() + return self.unpacked['exception_class'] @property def exception_full(self): - if b'exception_full' in self.unpacked: - return b'\n'.join(self.unpacked[b'exception_full']).decode() + if 'exception_full' in self.unpacked: + return '\n'.join(self.unpacked['exception_full']) else: return self.get_message() + '\nRemote Exception (see remote log for the traceback)' @property def sysinfo(self): - if b'sysinfo' in self.unpacked: - return self.unpacked[b'sysinfo'].decode() + if 'sysinfo' in self.unpacked: + return self.unpacked['sysinfo'] else: return '' @@ -577,9 +570,9 @@ def __init__(self, location, create=False, exclusive=False, lock_wait=None, lock raise ConnectionClosedWithHint('Is borg working on the server?') from None if version == RPC_PROTOCOL_VERSION: self.dictFormat = False - elif isinstance(version, dict) and b'server_version' in version: + elif isinstance(version, dict) and 'server_version' in version: self.dictFormat = True - self.server_version = version[b'server_version'] + self.server_version = version['server_version'] else: raise Exception('Server insisted on using unsupported protocol version %s' % version) @@ -734,9 +727,9 @@ def pop_preload_msgid(chunkid): return msgid def handle_error(unpacked): - error = unpacked[b'exception_class'].decode() - old_server = b'exception_args' not in unpacked - args = unpacked.get(b'exception_args') + error = unpacked['exception_class'] + old_server = 'exception_args' not in unpacked + args = unpacked.get('exception_args') if error == 'DoesNotExist': raise Repository.DoesNotExist(self.location.processed) @@ -748,29 +741,29 @@ def handle_error(unpacked): if old_server: raise IntegrityError('(not available)') else: - raise IntegrityError(args[0].decode()) + raise IntegrityError(args[0]) elif error == 'AtticRepository': if old_server: raise Repository.AtticRepository('(not available)') else: - raise Repository.AtticRepository(args[0].decode()) + raise Repository.AtticRepository(args[0]) elif error == 'PathNotAllowed': if old_server: raise PathNotAllowed('(unknown)') else: - raise PathNotAllowed(args[0].decode()) + raise PathNotAllowed(args[0]) elif error == 'ParentPathDoesNotExist': - raise Repository.ParentPathDoesNotExist(args[0].decode()) + raise Repository.ParentPathDoesNotExist(args[0]) elif error == 'ObjectNotFound': if old_server: raise Repository.ObjectNotFound('(not available)', self.location.processed) else: - raise Repository.ObjectNotFound(args[0].decode(), self.location.processed) + raise Repository.ObjectNotFound(args[0], self.location.processed) elif error == 'InvalidRPCMethod': if old_server: raise InvalidRPCMethod('(not available)') else: - raise InvalidRPCMethod(args[0].decode()) + raise InvalidRPCMethod(args[0]) else: raise self.RPCError(unpacked) @@ -789,10 +782,10 @@ def handle_error(unpacked): try: unpacked = self.responses.pop(waiting_for[0]) waiting_for.pop(0) - if b'exception_class' in unpacked: + if 'exception_class' in unpacked: handle_error(unpacked) else: - yield unpacked[RESULTB] + yield unpacked[RESULT] if not waiting_for and not calls: return except KeyError: @@ -809,10 +802,10 @@ def handle_error(unpacked): else: return else: - if b'exception_class' in unpacked: + if 'exception_class' in unpacked: handle_error(unpacked) else: - yield unpacked[RESULTB] + yield unpacked[RESULT] if self.to_send or ((calls or self.preload_ids) and len(waiting_for) < MAX_INFLIGHT): w_fds = [self.stdin_fd] else: @@ -829,26 +822,26 @@ def handle_error(unpacked): self.unpacker.feed(data) for unpacked in self.unpacker: if isinstance(unpacked, dict): - msgid = unpacked[MSGIDB] + msgid = unpacked[MSGID] elif isinstance(unpacked, tuple) and len(unpacked) == 4: # The first field 'type' was always 1 and has always been ignored _, msgid, error, res = unpacked if error: # ignore res, because it is only a fixed string anyway. - unpacked = {MSGIDB: msgid, b'exception_class': error} + unpacked = {MSGID: msgid, 'exception_class': error} else: - unpacked = {MSGIDB: msgid, RESULTB: res} + unpacked = {MSGID: msgid, RESULT: res} else: raise UnexpectedRPCDataFormatFromServer(data) if msgid in self.ignore_responses: self.ignore_responses.remove(msgid) # async methods never return values, but may raise exceptions. - if b'exception_class' in unpacked: + if 'exception_class' in unpacked: self.async_responses[msgid] = unpacked else: # we currently do not have async result values except "None", # so we do not add them into async_responses. - if unpacked[RESULTB] is not None: + if unpacked[RESULT] is not None: self.async_responses[msgid] = unpacked else: self.responses[msgid] = unpacked diff --git a/src/borg/repository.py b/src/borg/repository.py index 9267fe0e6c..3fcc72aadd 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -516,16 +516,16 @@ def _read_integrity(self, transaction_id, key): integrity = msgpack.unpack(fd) except FileNotFoundError: return - if integrity.get(b'version') != 2: - logger.warning('Unknown integrity data version %r in %s', integrity.get(b'version'), integrity_file) + if integrity.get('version') != 2: + logger.warning('Unknown integrity data version %r in %s', integrity.get('version'), integrity_file) return - return integrity[key].decode() + return integrity[key] def open_index(self, transaction_id, auto_recover=True): if transaction_id is None: return NSIndex() index_path = os.path.join(self.path, 'index.%d' % transaction_id) - integrity_data = self._read_integrity(transaction_id, b'index') + integrity_data = self._read_integrity(transaction_id, 'index') try: with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd: return NSIndex.read(fd) @@ -575,7 +575,7 @@ def prepare_txn(self, transaction_id, do_cleanup=True): self.io.cleanup(transaction_id) hints_path = os.path.join(self.path, 'hints.%d' % transaction_id) index_path = os.path.join(self.path, 'index.%d' % transaction_id) - integrity_data = self._read_integrity(transaction_id, b'hints') + integrity_data = self._read_integrity(transaction_id, 'hints') try: with IntegrityCheckedFile(hints_path, write=False, integrity_data=integrity_data) as fd: hints = msgpack.unpack(fd) @@ -588,23 +588,23 @@ def prepare_txn(self, transaction_id, do_cleanup=True): self.check_transaction() self.prepare_txn(transaction_id) return - if hints[b'version'] == 1: + if hints['version'] == 1: logger.debug('Upgrading from v1 hints.%d', transaction_id) - self.segments = hints[b'segments'] + self.segments = hints['segments'] self.compact = FreeSpace() self.storage_quota_use = 0 self.shadow_index = {} - for segment in sorted(hints[b'compact']): + for segment in sorted(hints['compact']): logger.debug('Rebuilding sparse info for segment %d', segment) self._rebuild_sparse(segment) logger.debug('Upgrade to v2 hints complete') - elif hints[b'version'] != 2: - raise ValueError('Unknown hints file version: %d' % hints[b'version']) + elif hints['version'] != 2: + raise ValueError('Unknown hints file version: %d' % hints['version']) else: - self.segments = hints[b'segments'] - self.compact = FreeSpace(hints[b'compact']) - self.storage_quota_use = hints.get(b'storage_quota_use', 0) - self.shadow_index = hints.get(b'shadow_index', {}) + self.segments = hints['segments'] + self.compact = FreeSpace(hints['compact']) + self.storage_quota_use = hints.get('storage_quota_use', 0) + self.shadow_index = hints.get('shadow_index', {}) self.log_storage_quota() # Drop uncommitted segments in the shadow index for key, shadowed_segments in self.shadow_index.items(): @@ -621,16 +621,16 @@ def rename_tmp(file): os.rename(file + '.tmp', file) hints = { - b'version': 2, - b'segments': self.segments, - b'compact': self.compact, - b'storage_quota_use': self.storage_quota_use, - b'shadow_index': self.shadow_index, + 'version': 2, + 'segments': self.segments, + 'compact': self.compact, + 'storage_quota_use': self.storage_quota_use, + 'shadow_index': self.shadow_index, } integrity = { # Integrity version started at 2, the current hints version. # Thus, integrity version == hints version, for now. - b'version': 2, + 'version': 2, } transaction_id = self.io.get_segments_transaction_id() assert transaction_id is not None @@ -647,7 +647,7 @@ def rename_tmp(file): with IntegrityCheckedFile(hints_file + '.tmp', filename=hints_name, write=True) as fd: msgpack.pack(hints, fd) flush_and_sync(fd) - integrity[b'hints'] = fd.integrity_data + integrity['hints'] = fd.integrity_data # Write repository index index_name = 'index.%d' % transaction_id @@ -656,7 +656,7 @@ def rename_tmp(file): # XXX: Consider using SyncFile for index write-outs. self.index.write(fd) flush_and_sync(fd) - integrity[b'index'] = fd.integrity_data + integrity['index'] = fd.integrity_data # Write integrity file, containing checksums of the hints and index files integrity_name = 'integrity.%d' % transaction_id diff --git a/src/borg/testsuite/archive.py b/src/borg/testsuite/archive.py index 0eed9f7e8e..9cdcf50466 100644 --- a/src/borg/testsuite/archive.py +++ b/src/borg/testsuite/archive.py @@ -171,7 +171,7 @@ def make_chunks(self, items): return b''.join(msgpack.packb({'path': item}) for item in items) def _validator(self, value): - return isinstance(value, dict) and value.get(b'path') in (b'foo', b'bar', b'boo', b'baz') + return isinstance(value, dict) and value.get('path') in ('foo', 'bar', 'boo', 'baz') def process(self, input): unpacker = RobustUnpacker(validator=self._validator, item_keys=ITEM_KEYS) @@ -190,10 +190,10 @@ def test_extra_garbage_no_sync(self): (False, [b'garbage'] + [self.make_chunks(['boo', 'baz'])])] result = self.process(chunks) self.assert_equal(result, [ - {b'path': b'foo'}, {b'path': b'bar'}, + {'path': 'foo'}, {'path': 'bar'}, 103, 97, 114, 98, 97, 103, 101, - {b'path': b'boo'}, - {b'path': b'baz'}]) + {'path': 'boo'}, + {'path': 'baz'}]) def split(self, left, length): parts = [] @@ -206,19 +206,19 @@ def test_correct_stream(self): chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 2) input = [(False, chunks)] result = self.process(input) - self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'bar'}, {b'path': b'boo'}, {b'path': b'baz'}]) + self.assert_equal(result, [{'path': 'foo'}, {'path': 'bar'}, {'path': 'boo'}, {'path': 'baz'}]) def test_missing_chunk(self): chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 4) input = [(False, chunks[:3]), (True, chunks[4:])] result = self.process(input) - self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}]) + self.assert_equal(result, [{'path': 'foo'}, {'path': 'boo'}, {'path': 'baz'}]) def test_corrupt_chunk(self): chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 4) input = [(False, chunks[:3]), (True, [b'gar', b'bage'] + chunks[3:])] result = self.process(input) - self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}]) + self.assert_equal(result, [{'path': 'foo'}, {'path': 'boo'}, {'path': 'baz'}]) @pytest.fixture diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index f3315b676b..a4205ea760 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -3623,14 +3623,14 @@ def test_init_defaults_to_argon2(self): self.cmd('init', '--encryption=repokey', self.repository_location) with Repository(self.repository_path) as repository: key = msgpack.unpackb(a2b_base64(repository.load_key())) - assert key[b'algorithm'] == b'argon2 chacha20-poly1305' + assert key['algorithm'] == 'argon2 chacha20-poly1305' def test_init_with_explicit_key_algorithm(self): """https://github.com/borgbackup/borg/issues/747#issuecomment-1076160401""" self.cmd('init', '--encryption=repokey', '--key-algorithm=pbkdf2', self.repository_location) with Repository(self.repository_path) as repository: key = msgpack.unpackb(a2b_base64(repository.load_key())) - assert key[b'algorithm'] == b'sha256' + assert key['algorithm'] == 'sha256' def verify_change_passphrase_does_not_change_algorithm(self, given_algorithm, expected_algorithm): self.cmd('init', '--encryption=repokey', '--key-algorithm', given_algorithm, self.repository_location) @@ -3640,7 +3640,7 @@ def verify_change_passphrase_does_not_change_algorithm(self, given_algorithm, ex with Repository(self.repository_path) as repository: key = msgpack.unpackb(a2b_base64(repository.load_key())) - assert key[b'algorithm'] == expected_algorithm.encode() + assert key['algorithm'] == expected_algorithm def test_change_passphrase_does_not_change_algorithm_argon2(self): self.verify_change_passphrase_does_not_change_algorithm('argon2', 'argon2 chacha20-poly1305') @@ -3655,7 +3655,7 @@ def verify_change_location_does_not_change_algorithm(self, given_algorithm, expe with Repository(self.repository_path) as repository: key = msgpack.unpackb(a2b_base64(repository.load_key())) - assert key[b'algorithm'] == expected_algorithm.encode() + assert key['algorithm'] == expected_algorithm def test_change_location_does_not_change_algorithm_argon2(self): self.verify_change_location_does_not_change_algorithm('argon2', 'argon2 chacha20-poly1305') @@ -3969,7 +3969,7 @@ def test_not_required(self): key.change_passphrase(key._passphrase) manifest = msgpack.unpackb(key.decrypt(Manifest.MANIFEST_ID, repository.get(Manifest.MANIFEST_ID))) - del manifest[b'tam'] + del manifest['tam'] repository.put(Manifest.MANIFEST_ID, key.encrypt(Manifest.MANIFEST_ID, msgpack.packb(manifest))) repository.commit(compact=False) output = self.cmd('list', '--debug', self.repository_location) diff --git a/src/borg/testsuite/key.py b/src/borg/testsuite/key.py index 5073c5b23b..02eaa86e5e 100644 --- a/src/borg/testsuite/key.py +++ b/src/borg/testsuite/key.py @@ -360,23 +360,23 @@ def test_round_trip(self, key): assert blob.startswith(b'\x82') unpacked = msgpack.unpackb(blob) - assert unpacked[b'tam'][b'type'] == b'HKDF_HMAC_SHA512' + assert unpacked['tam']['type'] == 'HKDF_HMAC_SHA512' unpacked, verified = key.unpack_and_verify_manifest(blob) assert verified - assert unpacked[b'foo'] == b'bar' - assert b'tam' not in unpacked + assert unpacked['foo'] == 'bar' + assert 'tam' not in unpacked - @pytest.mark.parametrize('which', (b'hmac', b'salt')) + @pytest.mark.parametrize('which', ('hmac', 'salt')) def test_tampered(self, key, which): data = {'foo': 'bar'} blob = key.pack_and_authenticate_metadata(data) assert blob.startswith(b'\x82') unpacked = msgpack.unpackb(blob, object_hook=StableDict) - assert len(unpacked[b'tam'][which]) == 64 - unpacked[b'tam'][which] = unpacked[b'tam'][which][0:32] + bytes(32) - assert len(unpacked[b'tam'][which]) == 64 + assert len(unpacked['tam'][which]) == 64 + unpacked['tam'][which] = unpacked['tam'][which][0:32] + bytes(32) + assert len(unpacked['tam'][which]) == 64 blob = msgpack.packb(unpacked) with pytest.raises(TAMInvalid): @@ -421,4 +421,4 @@ def to_dict(key): load_me = RepoKey.detect(repository, manifest_data=None) assert to_dict(load_me) == to_dict(save_me) - assert msgpack.unpackb(a2b_base64(saved))[b'algorithm'] == expected_algorithm.encode() + assert msgpack.unpackb(a2b_base64(saved))['algorithm'] == expected_algorithm diff --git a/src/borg/testsuite/repository.py b/src/borg/testsuite/repository.py index b4944e58a1..52f03e668d 100644 --- a/src/borg/testsuite/repository.py +++ b/src/borg/testsuite/repository.py @@ -655,8 +655,8 @@ def _subtly_corrupted_hints_setup(self): hints = msgpack.unpack(fd) fd.seek(0) # Corrupt segment refcount - assert hints[b'segments'][2] == 1 - hints[b'segments'][2] = 0 + assert hints['segments'][2] == 1 + hints['segments'][2] = 0 msgpack.pack(hints, fd) fd.truncate() From 33444be9268d70f163b16f1bf8c7a5bf0f1fb353 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 19 May 2022 23:12:21 +0200 Subject: [PATCH 03/14] more str vs bytes fixing --- src/borg/crypto/key.py | 6 ++- src/borg/helpers/fs.py | 4 +- src/borg/item.pyx | 88 ++++++++++++++++++++++++++++++------------ 3 files changed, 70 insertions(+), 28 deletions(-) diff --git a/src/borg/crypto/key.py b/src/borg/crypto/key.py index 15df53d001..2b7b50da86 100644 --- a/src/borg/crypto/key.py +++ b/src/borg/crypto/key.py @@ -22,7 +22,7 @@ from ..helpers.passphrase import Passphrase, PasswordRetriesExceeded, PassphraseWrong from ..helpers import msgpack from ..helpers.manifest import Manifest -from ..item import Key, EncryptedKey +from ..item import Key, EncryptedKey, want_bytes from ..platform import SaveFile from .nonces import NonceManager @@ -250,8 +250,10 @@ def unpack_and_verify_manifest(self, data, force_tam_not_required=False): return unpacked, False tam_hmac = tam.get('hmac') tam_salt = tam.get('salt') - if not isinstance(tam_salt, bytes) or not isinstance(tam_hmac, bytes): + if not isinstance(tam_salt, (bytes, str)) or not isinstance(tam_hmac, (bytes, str)): raise TAMInvalid() + tam_hmac = want_bytes(tam_hmac) # legacy + tam_salt = want_bytes(tam_salt) # legacy offset = data.index(tam_hmac) data[offset:offset + 64] = bytes(64) tam_key = self._tam_key(tam_salt, context=b'manifest') diff --git a/src/borg/helpers/fs.py b/src/borg/helpers/fs.py index fecda9c69e..5509e0d0f0 100644 --- a/src/borg/helpers/fs.py +++ b/src/borg/helpers/fs.py @@ -205,8 +205,8 @@ def borg1_hardlink_slave(self, item): # legacy def hardlink_id_from_path(self, path): """compute a hardlink id from a path""" - assert isinstance(path, bytes) - return hashlib.sha256(path).digest() + assert isinstance(path, str) + return hashlib.sha256(path.encode('utf-8', errors='surrogateescape')).digest() def hardlink_id_from_inode(self, *, ino, dev): """compute a hardlink id from an inode""" diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 4a6c811630..9fddfa457b 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -15,11 +15,11 @@ cdef extern from "_item.c": API_VERSION = '1.2_01' -def fix_key(data, key): +def fix_key(data, key, *, errors='strict'): """if k is a bytes-typed key, migrate key/value to a str-typed key in dict data""" if isinstance(key, bytes): value = data.pop(key) - key = key.decode() + key = key.decode('utf-8', errors=errors) data[key] = value assert isinstance(key, str) return key @@ -29,46 +29,77 @@ def fix_str_value(data, key, errors='surrogateescape'): """makes sure that data[key] is a str (decode if it is bytes)""" assert isinstance(key, str) # fix_key must be called first value = data[key] - if isinstance(value, bytes): - value = value.decode('utf-8', errors=errors) - data[key] = value - assert isinstance(value, str) + value = want_str(value, errors=errors) + data[key] = value return value -def fix_list_of_str(t): +def fix_bytes_value(data, key): + """makes sure that data[key] is bytes (encode if it is str)""" + assert isinstance(key, str) # fix_key must be called first + value = data[key] + value = want_bytes(value) + data[key] = value + return value + + +def fix_list_of_str(v): """make sure we have a list of str""" - assert isinstance(t, (tuple, list)) - l = [e.decode() if isinstance(e, bytes) else e for e in t] - assert all(isinstance(e, str) for e in l), repr(l) - return l + assert isinstance(v, (tuple, list)) + return [want_str(e) for e in v] + +def fix_list_of_bytes(v): + """make sure we have a list of bytes""" + assert isinstance(v, (tuple, list)) + return [want_bytes(e) for e in v] -def fix_tuple_of_str(t): + +def fix_list_of_chunkentries(v): + """make sure we have a list of correct chunkentries""" + assert isinstance(v, (tuple, list)) + chunks = [] + for ce in v: + assert isinstance(ce, (tuple, list)) + assert len(ce) == 3 # id, size, csize + assert isinstance(ce[1], int) + assert isinstance(ce[2], int) + ce_fixed = [want_bytes(ce[0]), ce[1], ce[2]] # list! + chunks.append(ce_fixed) # create a list of lists + return chunks + + +def fix_tuple_of_str(v): """make sure we have a tuple of str""" - assert isinstance(t, (tuple, list)) - t = tuple(e.decode() if isinstance(e, bytes) else e for e in t) - assert all(isinstance(e, str) for e in t), repr(t) - return t + assert isinstance(v, (tuple, list)) + return tuple(want_str(e) for e in v) -def fix_tuple_of_str_and_int(t): +def fix_tuple_of_str_and_int(v): """make sure we have a tuple of str""" - assert isinstance(t, (tuple, list)) - t = tuple(e.decode() if isinstance(e, bytes) else e for e in t) + assert isinstance(v, (tuple, list)) + t = tuple(e.decode() if isinstance(e, bytes) else e for e in v) assert all(isinstance(e, (str, int)) for e in t), repr(t) return t -def want_bytes(v): +def want_bytes(v, *, errors='surrogateescape'): """we know that we want bytes and the value should be bytes""" # legacy support: it being str can be caused by msgpack unpack decoding old data that was packed with use_bin_type=False if isinstance(v, str): - v = v.encode('utf-8', errors='surrogateescape') + v = v.encode('utf-8', errors=errors) assert isinstance(v, bytes) return v +def want_str(v, *, errors='surrogateescape'): + """we know that we want str and the value should be str""" + if isinstance(v, bytes): + v = v.decode('utf-8', errors=errors) + assert isinstance(v, str) + return v + + class PropDict: """ Manage a dictionary via properties. @@ -349,6 +380,11 @@ class Item(PropDict): k = fix_key(d, k) if k in ('path', 'source', 'user', 'group'): v = fix_str_value(d, k) + if k in ('chunks', 'chunks_healthy'): + v = fix_list_of_chunkentries(v) + if k in ('acl_access', 'acl_default', 'acl_extended', 'acl_nfs4'): + v = fix_bytes_value(d, k) + # TODO: xattrs self._dict[k] = v @@ -476,6 +512,8 @@ class ArchiveItem(PropDict): v = fix_tuple_of_str_and_int(v) if k in ('cmdline', 'recreate_cmdline'): v = fix_list_of_str(v) + if k == 'items': + v = fix_list_of_bytes(v) self._dict[k] = v @@ -511,13 +549,15 @@ class ManifestItem(PropDict): ad = v assert isinstance(ad, dict) for ak, av in list(ad.items()): - ak = fix_key(ad, ak) + ak = fix_key(ad, ak, errors='surrogateescape') assert isinstance(av, dict) for ik, iv in list(av.items()): ik = fix_key(av, ik) + if ik == 'id': + fix_bytes_value(av, 'id') + if ik == 'time': + fix_str_value(av, 'time') assert set(av) == {'id', 'time'} - assert isinstance(av['id'], bytes) - fix_str_value(av, 'time') if k == 'timestamp': v = fix_str_value(d, k, 'replace') if k == 'config': From 655c1b9cc2ed9927d52bc6338d2debeb24c69786 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 28 May 2022 21:57:22 +0200 Subject: [PATCH 04/14] update docstrings / comments --- src/borg/item.pyx | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 9fddfa457b..c44ce54288 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -223,14 +223,9 @@ class Item(PropDict): Items are created either from msgpack unpacker output, from another dict, from kwargs or built step-by-step by setting attributes. - msgpack gives us a dict with bytes-typed keys, just give it to Item(internal_dict=d) and use item.key_name later. - msgpack gives us byte-typed values for stuff that should be str, we automatically decode when getting - such a property and encode when setting it. + msgpack unpacker gives us a dict, just give it to Item(internal_dict=d) and use item.key_name later. If an Item shall be serialized, give as_dict() method output to msgpack packer. - - A bug in Attic up to and including release 0.13 added a (meaningless) 'acl' key to every item. - We must never re-use this key. See test_attic013_acl_bug for details. """ VALID_KEYS = ITEM_KEYS | {'deleted', 'nlink', } # str-typed keys @@ -261,7 +256,6 @@ class Item(PropDict): birthtime = PropDict._make_property('birthtime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int) # size is only present for items with a chunk list and then it is sum(chunk_sizes) - # compatibility note: this is a new feature, in old archives size will be missing. size = PropDict._make_property('size', int) hlid = PropDict._make_property('hlid', bytes, decode=want_bytes) # hard link id: same value means same hard link. @@ -395,13 +389,13 @@ class EncryptedKey(PropDict): A EncryptedKey is created either from msgpack unpacker output, from another dict, from kwargs or built step-by-step by setting attributes. - msgpack gives us a dict with bytes-typed keys, just give it to EncryptedKey(d) and use enc_key.xxx later. + msgpack unpacker gives us a dict, just give it to EncryptedKey(d) and use enc_key.xxx later. If a EncryptedKey shall be serialized, give as_dict() method output to msgpack packer. """ - VALID_KEYS = { 'version', 'algorithm', 'iterations', 'salt', 'hash', 'data', - 'argon2_time_cost', 'argon2_memory_cost', 'argon2_parallelism', 'argon2_type' } + VALID_KEYS = {'version', 'algorithm', 'iterations', 'salt', 'hash', 'data', + 'argon2_time_cost', 'argon2_memory_cost', 'argon2_parallelism', 'argon2_type'} __slots__ = ("_dict", ) # avoid setting attributes not supported by properties @@ -434,7 +428,7 @@ class Key(PropDict): A Key is created either from msgpack unpacker output, from another dict, from kwargs or built step-by-step by setting attributes. - msgpack gives us a dict with bytes-typed keys, just give it to Key(d) and use key.xxx later. + msgpack unpacker gives us a dict, just give it to Key(d) and use key.xxx later. If a Key shall be serialized, give as_dict() method output to msgpack packer. """ @@ -467,7 +461,7 @@ class ArchiveItem(PropDict): An ArchiveItem is created either from msgpack unpacker output, from another dict, from kwargs or built step-by-step by setting attributes. - msgpack gives us a dict with bytes-typed keys, just give it to ArchiveItem(d) and use arch.xxx later. + msgpack unpacker gives us a dict, just give it to ArchiveItem(d) and use arch.xxx later. If a ArchiveItem shall be serialized, give as_dict() method output to msgpack packer. """ @@ -524,7 +518,7 @@ class ManifestItem(PropDict): A ManifestItem is created either from msgpack unpacker output, from another dict, from kwargs or built step-by-step by setting attributes. - msgpack gives us a dict with bytes-typed keys, just give it to ManifestItem(d) and use manifest.xxx later. + msgpack unpacker gives us a dict, just give it to ManifestItem(d) and use manifest.xxx later. If a ManifestItem shall be serialized, give as_dict() method output to msgpack packer. """ From 8e58525fc6d3eceae4d846a7be9e2eed20084580 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 29 May 2022 15:38:43 +0200 Subject: [PATCH 05/14] Item: remove some decode= params update_internal() makes sure they have the desired type already. --- src/borg/item.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/borg/item.pyx b/src/borg/item.pyx index c44ce54288..0fabc0241f 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -239,10 +239,10 @@ class Item(PropDict): user = PropDict._make_property('user', (str, type(None)), 'surrogate-escaped str or None') group = PropDict._make_property('group', (str, type(None)), 'surrogate-escaped str or None') - acl_access = PropDict._make_property('acl_access', bytes, decode=want_bytes) - acl_default = PropDict._make_property('acl_default', bytes, decode=want_bytes) - acl_extended = PropDict._make_property('acl_extended', bytes, decode=want_bytes) - acl_nfs4 = PropDict._make_property('acl_nfs4', bytes, decode=want_bytes) + acl_access = PropDict._make_property('acl_access', bytes) + acl_default = PropDict._make_property('acl_default', bytes) + acl_extended = PropDict._make_property('acl_extended', bytes) + acl_nfs4 = PropDict._make_property('acl_nfs4', bytes) mode = PropDict._make_property('mode', int) uid = PropDict._make_property('uid', int) @@ -258,7 +258,7 @@ class Item(PropDict): # size is only present for items with a chunk list and then it is sum(chunk_sizes) size = PropDict._make_property('size', int) - hlid = PropDict._make_property('hlid', bytes, decode=want_bytes) # hard link id: same value means same hard link. + hlid = PropDict._make_property('hlid', bytes) # hard link id: same value means same hard link. hardlink_master = PropDict._make_property('hardlink_master', bool) # legacy chunks = PropDict._make_property('chunks', (list, type(None)), 'list or None') @@ -482,7 +482,7 @@ class ArchiveItem(PropDict): chunker_params = PropDict._make_property('chunker_params', tuple) recreate_cmdline = PropDict._make_property('recreate_cmdline', list) # list of s-e-str # recreate_source_id, recreate_args, recreate_partial_chunks were used in 1.1.0b1 .. b2 - recreate_source_id = PropDict._make_property('recreate_source_id', bytes, decode=want_bytes) + recreate_source_id = PropDict._make_property('recreate_source_id', bytes) recreate_args = PropDict._make_property('recreate_args', list) # list of s-e-str recreate_partial_chunks = PropDict._make_property('recreate_partial_chunks', list) # list of tuples size = PropDict._make_property('size', int) From 7b138cc7107ebc00f5b9ed422e2794da8eedba61 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 29 May 2022 16:43:51 +0200 Subject: [PATCH 06/14] Item: convert timestamps once, get rid of bigint code --- src/borg/archiver.py | 4 ---- src/borg/helpers/__init__.py | 2 +- src/borg/helpers/msgpack.py | 25 +++---------------------- src/borg/item.pyx | 18 ++++++++++++++++-- src/borg/testsuite/helpers.py | 14 +------------- 5 files changed, 21 insertions(+), 42 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index a91e43c548..7ac3c7e6d6 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -366,10 +366,6 @@ def upgrade_item(item): if chunks_healthy is not None: item._dict['chunks_healthy'] = chunks item._dict.pop('source') # not used for hardlinks any more, replaced by hlid - for attr in 'atime', 'ctime', 'mtime', 'birthtime': - if attr in item: - ns = getattr(item, attr) # decode (bigint or Timestamp) --> int ns - setattr(item, attr, ns) # encode int ns --> msgpack.Timestamp only, no bigint any more # make sure we only have desired stuff in the new item. specifically, make sure to get rid of: # - 'acl' remnants of bug in attic <= 0.13 # - 'hardlink_master' (superseded by hlid) diff --git a/src/borg/helpers/__init__.py b/src/borg/helpers/__init__.py index 34a59b903d..d8a4cda620 100644 --- a/src/borg/helpers/__init__.py +++ b/src/borg/helpers/__init__.py @@ -18,7 +18,7 @@ from .time import * # NOQA from .yes import * # NOQA -from .msgpack import is_slow_msgpack, is_supported_msgpack, int_to_bigint, bigint_to_int, get_limited_unpacker +from .msgpack import is_slow_msgpack, is_supported_msgpack, get_limited_unpacker from . import msgpack # generic mechanism to enable users to invoke workarounds by setting the diff --git a/src/borg/helpers/msgpack.py b/src/borg/helpers/msgpack.py index 268ee30e75..4e79c02744 100644 --- a/src/borg/helpers/msgpack.py +++ b/src/borg/helpers/msgpack.py @@ -201,30 +201,11 @@ def get_limited_unpacker(kind): return Unpacker(**args) -def bigint_to_int(mtime): # legacy - """Convert bytearray to int - """ - if isinstance(mtime, bytes): - return int.from_bytes(mtime, 'little', signed=True) - return mtime - - -def int_to_bigint(value): # legacy - """Convert integers larger than 64 bits to bytearray - - Smaller integers are left alone - """ - if value.bit_length() > 63: - return value.to_bytes((value.bit_length() + 9) // 8, 'little', signed=True) - return value - - def int_to_timestamp(ns): + assert isinstance(ns, int) return Timestamp.from_unix_nano(ns) def timestamp_to_int(ts): - if isinstance(ts, Timestamp): - return ts.to_unix_nano() - # legacy support note: we need to keep the bigint conversion for compatibility with borg < 1.3 archives. - return bigint_to_int(ts) + assert isinstance(ts, Timestamp) + return ts.to_unix_nano() diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 0fabc0241f..1520527a3a 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -4,7 +4,7 @@ from collections import namedtuple from .constants import ITEM_KEYS, ARCHIVE_KEYS from .helpers import StableDict from .helpers import format_file_size -from .helpers.msgpack import timestamp_to_int, int_to_timestamp +from .helpers.msgpack import timestamp_to_int, int_to_timestamp, Timestamp cdef extern from "_item.c": @@ -83,6 +83,17 @@ def fix_tuple_of_str_and_int(v): return t +def fix_timestamp(v): + """make sure v is a Timestamp""" + if isinstance(v, Timestamp): + return v + # legacy support + if isinstance(v, bytes): # was: bigint_to_int() + v = int.from_bytes(v, 'little', signed=True) + assert isinstance(v, int) + return int_to_timestamp(v) + + def want_bytes(v, *, errors='surrogateescape'): """we know that we want bytes and the value should be bytes""" # legacy support: it being str can be caused by msgpack unpack decoding old data that was packed with use_bin_type=False @@ -369,13 +380,16 @@ class Item(PropDict): return False def update_internal(self, d): - # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str) + # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str), + # also need to fix old timestamp data types. for k, v in list(d.items()): k = fix_key(d, k) if k in ('path', 'source', 'user', 'group'): v = fix_str_value(d, k) if k in ('chunks', 'chunks_healthy'): v = fix_list_of_chunkentries(v) + if k in ('atime', 'ctime', 'mtime', 'birthtime'): + v = fix_timestamp(v) if k in ('acl_access', 'acl_default', 'acl_extended', 'acl_nfs4'): v = fix_bytes_value(d, k) # TODO: xattrs diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py index b181da564e..fd04141167 100644 --- a/src/borg/testsuite/helpers.py +++ b/src/borg/testsuite/helpers.py @@ -22,7 +22,7 @@ from ..helpers import is_slow_msgpack from ..helpers import msgpack from ..helpers import yes, TRUISH, FALSISH, DEFAULTISH -from ..helpers import StableDict, int_to_bigint, bigint_to_int, bin_to_hex +from ..helpers import StableDict, bin_to_hex from ..helpers import parse_timestamp, ChunkIteratorFileWrapper, ChunkerParams from ..helpers import ProgressIndicatorPercent, ProgressIndicatorEndless from ..helpers import swidth_slice @@ -38,18 +38,6 @@ from . import BaseTestCase, FakeInputs -class BigIntTestCase(BaseTestCase): - - def test_bigint(self): - self.assert_equal(int_to_bigint(0), 0) - self.assert_equal(int_to_bigint(2**63-1), 2**63-1) - self.assert_equal(int_to_bigint(-2**63+1), -2**63+1) - self.assert_equal(int_to_bigint(2**63), b'\x00\x00\x00\x00\x00\x00\x00\x80\x00') - self.assert_equal(int_to_bigint(-2**63), b'\x00\x00\x00\x00\x00\x00\x00\x80\xff') - self.assert_equal(bigint_to_int(int_to_bigint(-2**70)), -2**70) - self.assert_equal(bigint_to_int(int_to_bigint(2**70)), 2**70) - - def test_bin_to_hex(): assert bin_to_hex(b'') == '' assert bin_to_hex(b'\x00\x01\xff') == '0001ff' From 9d684120a2ae91fcd75003cda1ce36e7c1f22056 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 29 May 2022 17:32:42 +0200 Subject: [PATCH 07/14] Item: assert type also in property getter also: fixed Item.xattrs to be StableDict (not just a dict, as the msgpack unpacker gives us) --- src/borg/item.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 1520527a3a..0e8ff179eb 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -207,6 +207,8 @@ class PropDict: raise AttributeError(attr_error_msg) from None if decode is not None: value = decode(value) + if not isinstance(value, value_type): + raise TypeError(type_error_msg) return value def _set(self, value): @@ -392,7 +394,10 @@ class Item(PropDict): v = fix_timestamp(v) if k in ('acl_access', 'acl_default', 'acl_extended', 'acl_nfs4'): v = fix_bytes_value(d, k) - # TODO: xattrs + if k == 'xattrs': + if not isinstance(v, StableDict): + v = StableDict(v) + # TODO: xattrs key/value types self._dict[k] = v From 64cc16a9f470925dc0c9da152ec3ff2864b3ae95 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 29 May 2022 21:22:50 +0200 Subject: [PATCH 08/14] Item: fix xattr processing Item.xattrs is now always a StableDict mapping bytes keys -> bytes values. The special casing of empty values (b'') getting replaced by None was removed. --- src/borg/item.pyx | 12 +++++++++--- src/borg/xattr.py | 8 ++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 0e8ff179eb..c3f59ed271 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -99,7 +99,7 @@ def want_bytes(v, *, errors='surrogateescape'): # legacy support: it being str can be caused by msgpack unpack decoding old data that was packed with use_bin_type=False if isinstance(v, str): v = v.encode('utf-8', errors=errors) - assert isinstance(v, bytes) + assert isinstance(v, bytes), f'not a bytes object, but {v!r}' return v @@ -107,7 +107,7 @@ def want_str(v, *, errors='surrogateescape'): """we know that we want str and the value should be str""" if isinstance(v, bytes): v = v.decode('utf-8', errors=errors) - assert isinstance(v, str) + assert isinstance(v, str), f'not a str object, but {v!r}' return v @@ -397,7 +397,13 @@ class Item(PropDict): if k == 'xattrs': if not isinstance(v, StableDict): v = StableDict(v) - # TODO: xattrs key/value types + v_new = StableDict() + for xk, xv in list(v.items()): + xk = want_bytes(xk) + # old borg used to store None instead of a b'' value + xv = b'' if xv is None else want_bytes(xv) + v_new[xk] = xv + v = v_new # xattrs is a StableDict(bytes keys -> bytes values) self._dict[k] = v diff --git a/src/borg/xattr.py b/src/borg/xattr.py index 8a731340c8..175f6372cb 100644 --- a/src/borg/xattr.py +++ b/src/borg/xattr.py @@ -76,9 +76,7 @@ def get_all(path, follow_symlinks=False): for name in names: try: # xattr name is a bytes object, we directly use it. - # if we get an empty xattr value (b''), we store None into the result dict - - # borg always did it like that... - result[name] = getxattr(path, name, follow_symlinks=follow_symlinks) or None + result[name] = getxattr(path, name, follow_symlinks=follow_symlinks) except OSError as e: name_str = name.decode() if isinstance(path, int): @@ -122,9 +120,7 @@ def set_all(path, xattrs, follow_symlinks=False): warning = False for k, v in xattrs.items(): try: - # the key k is a bytes object due to msgpack unpacking it as such. - # if we have a None value, it means "empty", so give b'' to setxattr in that case: - setxattr(path, k, v or b'', follow_symlinks=follow_symlinks) + setxattr(path, k, v, follow_symlinks=follow_symlinks) except OSError as e: warning = True k_str = k.decode() From f2b085787b0fb13634073b1df3979a30ac66fa5a Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 30 May 2022 00:05:07 +0200 Subject: [PATCH 09/14] Item: disallow None value for .user/group/chunks/chunks_healthy If we do not know the value, just do not have that key/value pair in the item. --- src/borg/archive.py | 23 ++++++++++++++--------- src/borg/archiver.py | 4 ++-- src/borg/helpers/parseformat.py | 4 ++-- src/borg/item.pyx | 8 ++++---- src/borg/testsuite/item.py | 8 -------- 5 files changed, 22 insertions(+), 25 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index ff6ca7ce68..e068413f57 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -392,14 +392,14 @@ def get_item_uid_gid(item, *, numeric, uid_forced=None, gid_forced=None, uid_def if uid_forced is not None: uid = uid_forced else: - uid = None if numeric else user2uid(item.user) + uid = None if numeric else user2uid(item.get('user')) uid = item.uid if uid is None else uid if uid < 0: uid = uid_default if gid_forced is not None: gid = gid_forced else: - gid = None if numeric else group2gid(item.group) + gid = None if numeric else group2gid(item.get('group')) gid = item.gid if gid is None else gid if gid < 0: gid = gid_default @@ -1089,11 +1089,13 @@ def stat_simple_attrs(self, st): if not self.nobirthtime and hasattr(st, 'st_birthtime'): # sadly, there's no stat_result.st_birthtime_ns attrs['birthtime'] = safe_ns(int(st.st_birthtime * 10**9)) - if self.numeric_ids: - attrs['user'] = attrs['group'] = None - else: - attrs['user'] = uid2user(st.st_uid) - attrs['group'] = gid2group(st.st_gid) + if not self.numeric_ids: + user = uid2user(st.st_uid) + if user is not None: + attrs['user'] = user + group = gid2group(st.st_gid) + if group is not None: + attrs['group'] = group return attrs def stat_ext_attrs(self, st, path, fd=None): @@ -1426,8 +1428,11 @@ def s_to_ns(s): return safe_ns(int(float(s) * 1e9)) item = Item(path=make_path_safe(tarinfo.name), mode=tarinfo.mode | type, - uid=tarinfo.uid, gid=tarinfo.gid, user=tarinfo.uname or None, group=tarinfo.gname or None, - mtime=s_to_ns(tarinfo.mtime)) + uid=tarinfo.uid, gid=tarinfo.gid, mtime=s_to_ns(tarinfo.mtime)) + if tarinfo.uname: + item.user = tarinfo.uname + if tarinfo.gname: + item.group = tarinfo.gname if ph: # note: for mtime this is a bit redundant as it is already done by tarfile module, # but we just do it in our way to be consistent for sure. diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 7ac3c7e6d6..8d5513973c 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -1355,8 +1355,8 @@ def item_to_tarinfo(item, original_path): tarinfo.mode = stat.S_IMODE(item.mode) tarinfo.uid = item.uid tarinfo.gid = item.gid - tarinfo.uname = item.user or '' - tarinfo.gname = item.group or '' + tarinfo.uname = item.get('user', '') + tarinfo.gname = item.get('group', '') # The linkname in tar has 2 uses: # for symlinks it means the destination, while for hardlinks it refers to the file. # Since hardlinks in tar have a different type code (LNKTYPE) the format might diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 414402de0f..e772f38c47 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -808,8 +808,8 @@ def get_item_data(self, item): hlid = bin_to_hex(hlid) if hlid else '' item_data['type'] = item_type item_data['mode'] = mode - item_data['user'] = item.user or item.uid - item_data['group'] = item.group or item.gid + item_data['user'] = item.get('user', str(item.uid)) + item_data['group'] = item.get('group', str(item.gid)) item_data['uid'] = item.uid item_data['gid'] = item.gid item_data['path'] = remove_surrogates(item.path) diff --git a/src/borg/item.pyx b/src/borg/item.pyx index c3f59ed271..1822cf9de5 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -249,8 +249,8 @@ class Item(PropDict): path = PropDict._make_property('path', str, 'surrogate-escaped str') source = PropDict._make_property('source', str, 'surrogate-escaped str') - user = PropDict._make_property('user', (str, type(None)), 'surrogate-escaped str or None') - group = PropDict._make_property('group', (str, type(None)), 'surrogate-escaped str or None') + user = PropDict._make_property('user', str, 'surrogate-escaped str') + group = PropDict._make_property('group', str, 'surrogate-escaped str') acl_access = PropDict._make_property('acl_access', bytes) acl_default = PropDict._make_property('acl_default', bytes) @@ -274,8 +274,8 @@ class Item(PropDict): hlid = PropDict._make_property('hlid', bytes) # hard link id: same value means same hard link. hardlink_master = PropDict._make_property('hardlink_master', bool) # legacy - chunks = PropDict._make_property('chunks', (list, type(None)), 'list or None') - chunks_healthy = PropDict._make_property('chunks_healthy', (list, type(None)), 'list or None') + chunks = PropDict._make_property('chunks', list, 'list') + chunks_healthy = PropDict._make_property('chunks_healthy', list, 'list') xattrs = PropDict._make_property('xattrs', StableDict) diff --git a/src/borg/testsuite/item.py b/src/borg/testsuite/item.py index 94167e7ea9..a0c1d9d134 100644 --- a/src/borg/testsuite/item.py +++ b/src/borg/testsuite/item.py @@ -89,14 +89,6 @@ def test_item_mptimestamp_property(): assert item.as_dict() == {'atime': Timestamp.from_unix_nano(big)} -def test_item_user_group_none(): - item = Item() - item.user = None - assert item.user is None - item.group = None - assert item.group is None - - def test_item_se_str_property(): # start simple item = Item() From ed22f721f3e43e7800173d87b7a1d4cefedbff8e Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 30 May 2022 13:32:11 +0200 Subject: [PATCH 10/14] EncryptedKey: fix once, remove decode=... --- src/borg/item.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 1822cf9de5..5e6e401f56 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -427,9 +427,9 @@ class EncryptedKey(PropDict): version = PropDict._make_property('version', int) algorithm = PropDict._make_property('algorithm', str) iterations = PropDict._make_property('iterations', int) - salt = PropDict._make_property('salt', bytes, decode=want_bytes) - hash = PropDict._make_property('hash', bytes, decode=want_bytes) - data = PropDict._make_property('data', bytes, decode=want_bytes) + salt = PropDict._make_property('salt', bytes) + hash = PropDict._make_property('hash', bytes) + data = PropDict._make_property('data', bytes) argon2_time_cost = PropDict._make_property('argon2_time_cost', int) argon2_memory_cost = PropDict._make_property('argon2_memory_cost', int) argon2_parallelism = PropDict._make_property('argon2_parallelism', int) @@ -443,6 +443,8 @@ class EncryptedKey(PropDict): assert isinstance(v, int) if k in ('algorithm', 'argon2_type'): v = fix_str_value(d, k) + if k in ('salt', 'hash', 'data'): + v = fix_bytes_value(d, k) self._dict[k] = v From 58009f6773c5658a3b0dc7b8fd8c956f7ed158c8 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 30 May 2022 13:38:05 +0200 Subject: [PATCH 11/14] Key: fix once, remove decode=... --- src/borg/item.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 5e6e401f56..169aec0484 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -465,10 +465,10 @@ class Key(PropDict): __slots__ = ("_dict", ) # avoid setting attributes not supported by properties version = PropDict._make_property('version', int) - repository_id = PropDict._make_property('repository_id', bytes, decode=want_bytes) - enc_key = PropDict._make_property('enc_key', bytes, decode=want_bytes) - enc_hmac_key = PropDict._make_property('enc_hmac_key', bytes, decode=want_bytes) - id_key = PropDict._make_property('id_key', bytes, decode=want_bytes) + repository_id = PropDict._make_property('repository_id', bytes) + enc_key = PropDict._make_property('enc_key', bytes) + enc_hmac_key = PropDict._make_property('enc_hmac_key', bytes) + id_key = PropDict._make_property('id_key', bytes) chunk_seed = PropDict._make_property('chunk_seed', int) tam_required = PropDict._make_property('tam_required', bool) @@ -478,6 +478,8 @@ class Key(PropDict): k = fix_key(d, k) if k == 'version': assert isinstance(v, int) + if k in ('repository_id', 'enc_key', 'enc_hmac_key', 'id_key'): + v = fix_bytes_value(d, k) self._dict[k] = v From 421d4bdfb05f0917f1337ad0e00f8cfaa6952d61 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 30 May 2022 23:22:48 +0200 Subject: [PATCH 12/14] docs: fix bytes -> str in data-structures docs --- docs/internals/data-structures.rst | 38 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst index d1a5a4cd38..8de20761b7 100644 --- a/docs/internals/data-structures.rst +++ b/docs/internals/data-structures.rst @@ -329,17 +329,17 @@ or modified. It looks like this: .. code-block:: python { - b'version': 1, - b'timestamp': b'2017-05-05T12:42:23.042864', - b'item_keys': [b'acl_access', b'acl_default', ...], - b'config': {}, - b'archives': { - b'2017-05-05-system-backup': { - b'id': b'<32 byte binary object ID>', - b'time': b'2017-05-05T12:42:22.942864', + 'version': 1, + 'timestamp': '2017-05-05T12:42:23.042864', + 'item_keys': ['acl_access', 'acl_default', ...], + 'config': {}, + 'archives': { + '2017-05-05-system-backup': { + 'id': b'<32 byte binary object ID>', + 'time': '2017-05-05T12:42:22.942864', }, }, - b'tam': ..., + 'tam': ..., } The *version* field can be either 1 or 2. The versions differ in the @@ -393,15 +393,15 @@ The *config* key stores the feature flags enabled on a repository: .. code-block:: python config = { - b'feature_flags': { - b'read': { - b'mandatory': [b'some_feature'], + 'feature_flags': { + 'read': { + 'mandatory': ['some_feature'], }, - b'check': { - b'mandatory': [b'other_feature'], + 'check': { + 'mandatory': ['other_feature'], } - b'write': ..., - b'delete': ... + 'write': ..., + 'delete': ... }, } @@ -1220,9 +1220,9 @@ transaction ID in the file names. Integrity data is stored in a third file .. code-block:: python { - b'version': 2, - b'hints': b'{"algorithm": "XXH64", "digests": {"final": "411208db2aa13f1a"}}', - b'index': b'{"algorithm": "XXH64", "digests": {"HashHeader": "846b7315f91b8e48", "final": "cb3e26cadc173e40"}}' + 'version': 2, + 'hints': '{"algorithm": "XXH64", "digests": {"final": "411208db2aa13f1a"}}', + 'index': '{"algorithm": "XXH64", "digests": {"HashHeader": "846b7315f91b8e48", "final": "cb3e26cadc173e40"}}' } The *version* key started at 2, the same version used for the hints. Since Borg has From 08228fbd32cc29822085be9fd43926f0d4fb79fa Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 31 May 2022 00:07:01 +0200 Subject: [PATCH 13/14] Item: remove unused hardlink_masters param --- src/borg/item.pyx | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 169aec0484..cd1a3544c3 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -284,12 +284,10 @@ class Item(PropDict): part = PropDict._make_property('part', int) - def get_size(self, hardlink_masters=None, memorize=False, compressed=False, from_chunks=False, consider_ids=None): + def get_size(self, memorize=False, compressed=False, from_chunks=False, consider_ids=None): """ Determine the (uncompressed or compressed) size of this item. - :param hardlink_masters: If given, the size of hardlink slaves is computed via the hardlink master's chunk list, - otherwise size will be returned as 0. :param memorize: Whether the computed size value will be stored into the item. :param compressed: Whether the compressed or uncompressed size will be returned. :param from_chunks: If true, size is computed from chunks even if a precomputed value is available. @@ -309,31 +307,14 @@ class Item(PropDict): # no precomputed (c)size value available, compute it: try: chunks = getattr(self, 'chunks') - having_chunks = True except AttributeError: - having_chunks = False - # this item has no (own) chunks list, but if this is a hardlink slave - # and we know the master, we can still compute the size. - if hardlink_masters is None: - chunks = None - else: - try: - master = getattr(self, 'source') - except AttributeError: - # not a hardlink slave, likely a directory or special file w/o chunks - chunks = None - else: - # hardlink slave, try to fetch hardlink master's chunks list - # todo: put precomputed size into hardlink_masters' values and use it, if present - chunks, _ = hardlink_masters.get(master, (None, None)) - if chunks is None: - return 0 + return 0 if consider_ids is not None: size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks if chunk.id in consider_ids) else: size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks) # if requested, memorize the precomputed (c)size for items that have an own chunks list: - if memorize and having_chunks: + if memorize: setattr(self, attr, size) return size From d4ee968b07b8cf4971049d13af8ed56d6ef7b5d1 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 9 Jun 2022 18:13:40 +0200 Subject: [PATCH 14/14] use borg 2.0 to refer to this, not 1.3 also, some type conversions are now done in update_internal once, not in the decode methods of the classes in item.pyx. --- src/borg/constants.py | 2 +- src/borg/helpers/msgpack.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/borg/constants.py b/src/borg/constants.py index 13eb8bd232..d2fa119196 100644 --- a/src/borg/constants.py +++ b/src/borg/constants.py @@ -128,7 +128,7 @@ class KeyType: # upper 4 bits are ciphersuite, 0 == legacy AES-CTR KEYFILE = 0x00 # repos with PASSPHRASE mode could not be created any more since borg 1.0, see #97. - # in borg 1.3 all of its code and also the "borg key migrate-to-repokey" command was removed. + # in borg 2. all of its code and also the "borg key migrate-to-repokey" command was removed. # if you still need to, you can use "borg key migrate-to-repokey" with borg 1.0, 1.1 and 1.2. # Nowadays, we just dispatch this to RepoKey and assume the passphrase was migrated to a repokey. PASSPHRASE = 0x01 # legacy, attic and borg < 1.0 diff --git a/src/borg/helpers/msgpack.py b/src/borg/helpers/msgpack.py index 4e79c02744..ffb87be032 100644 --- a/src/borg/helpers/msgpack.py +++ b/src/borg/helpers/msgpack.py @@ -6,7 +6,7 @@ Packing ------- -- use_bin_type = True (used by borg since borg 1.3) +- use_bin_type = True (used by borg since borg 2.0) This is used to generate output according to new msgpack 2.0 spec. This cleanly keeps bytes and str types apart. @@ -21,16 +21,17 @@ Unpacking --------- -- raw = False (used by borg since borg 1.3) - We already can use this with borg 1.3 due to the want_bytes decoder. - This decoder can be removed in future, when we do not have to deal with data any more that was packed the old way. +- raw = False (used by borg since borg 2.0) + We already can use this with borg 2.0 due to the type conversion to the desired type in item.py update_internal + methods. This type conversion code can be removed in future, when we do not have to deal with data any more + that was packed the old way. It will then unpack according to the msgpack 2.0 spec format and directly output bytes or str. - raw = True (the old way, used by borg < 1.3) - unicode_errors = 'surrogateescape' -> see description above (will be used when raw is False). -As of borg 1.3, we have fixed most of the msgpack str/bytes mess, #968. +As of borg 2.0, we have fixed most of the msgpack str/bytes mess, #968. Borg now still needs to **read** old repos, archives, keys, ... so we can not yet fix it completely. But from now on, borg only **writes** new data according to the new msgpack 2.0 spec, thus we can remove some legacy support in a later borg release (some places are marked with "legacy"). @@ -41,7 +42,7 @@ - pack with use_bin_type=True (according to msgpack 2.0 spec) - packs str -> raw and bytes -> bin - unpack with raw=False (according to msgpack 2.0 spec, using unicode_errors='surrogateescape') -- unpacks bin to bytes and raw to str (thus we need to re-encode manually if we want bytes from "raw") +- unpacks bin to bytes and raw to str (thus we need to convert to desired type if we want bytes from "raw") """ from .datastruct import StableDict