diff --git a/src/borg/archive.py b/src/borg/archive.py index 720596ee33..4dedc0f3f8 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -36,7 +36,7 @@ from .helpers import bin_to_hex from .helpers import safe_ns from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi -from .helpers import PathPrefixPattern, FnmatchPattern +from .helpers import PathPrefixPattern, FnmatchPattern, IECommand from .item import Item, ArchiveItem from .key import key_factory from .platform import acl_get, acl_set, set_flags, get_flags, swidth @@ -1721,10 +1721,10 @@ def matcher_add_tagged_dirs(self, archive): """Add excludes to the matcher created by exclude_cache and exclude_if_present.""" def exclude(dir, tag_item): if self.keep_exclude_tags: - tag_files.append(PathPrefixPattern(tag_item.path)) - tagged_dirs.append(FnmatchPattern(dir + '/')) + tag_files.append(PathPrefixPattern(tag_item.path, recurse_dir=False)) + tagged_dirs.append(FnmatchPattern(dir + '/', recurse_dir=False)) else: - tagged_dirs.append(PathPrefixPattern(dir)) + tagged_dirs.append(PathPrefixPattern(dir, recurse_dir=False)) matcher = self.matcher tag_files = [] @@ -1747,8 +1747,8 @@ def exclude(dir, tag_item): file = open_item(archive, cachedir_masters[item.source]) if file.read(len(CACHE_TAG_CONTENTS)).startswith(CACHE_TAG_CONTENTS): exclude(dir, item) - matcher.add(tag_files, True) - matcher.add(tagged_dirs, False) + matcher.add(tag_files, IECommand.Include) + matcher.add(tagged_dirs, IECommand.ExcludeNoRecurse) def create_target(self, archive, target_name=None): """Create target archive.""" diff --git a/src/borg/archiver.py b/src/borg/archiver.py index f999cb9760..f2c26cf93f 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -54,7 +54,7 @@ from .helpers import ArgparsePatternAction, ArgparseExcludeFileAction, ArgparsePatternFileAction, parse_exclude_pattern from .helpers import dir_is_tagged, is_slow_msgpack, yes, sysinfo from .helpers import log_multi -from .helpers import parse_pattern, PatternMatcher, PathPrefixPattern +from .helpers import PatternMatcher from .helpers import signal_handler, raising_signal_handler, SigHup, SigTerm from .helpers import ErrorIgnoringTextIOWrapper from .helpers import ProgressIndicatorPercent @@ -190,16 +190,11 @@ def compare_chunk_contents(chunks1, chunks2): bi += slicelen @staticmethod - def build_matcher(inclexcl_patterns, paths): + def build_matcher(inclexcl_patterns, include_paths): matcher = PatternMatcher() - if inclexcl_patterns: - matcher.add_inclexcl(inclexcl_patterns) - include_patterns = [] - if paths: - include_patterns.extend(parse_pattern(i, PathPrefixPattern) for i in paths) - matcher.add(include_patterns, True) - matcher.fallback = not include_patterns - return matcher, include_patterns + matcher.add_inclexcl(inclexcl_patterns) + matcher.add_includepaths(include_paths) + return matcher def do_serve(self, args): """Start in server mode. This command is usually not used manually.""" @@ -493,13 +488,20 @@ def _process(self, archive, cache, matcher, exclude_caches, exclude_if_present, This should only raise on critical errors. Per-item errors must be handled within this method. """ + if st is None: + with backup_io('stat'): + st = os.lstat(path) + + recurse_excluded_dir = False if not matcher.match(path): self.print_file_status('x', path) - return + + if stat.S_ISDIR(st.st_mode) and matcher.recurse_dir: + recurse_excluded_dir = True + else: + return + try: - if st is None: - with backup_io('stat'): - st = os.lstat(path) if (st.st_ino, st.st_dev) in skip_inodes: return # if restrict_dev is given, we do not want to recurse into a new filesystem, @@ -527,7 +529,8 @@ def _process(self, archive, cache, matcher, exclude_caches, exclude_if_present, read_special=read_special, dry_run=dry_run) return if not dry_run: - status = archive.process_dir(path, st) + if not recurse_excluded_dir: + status = archive.process_dir(path, st) if recurse: with backup_io('scandir'): entries = helpers.scandir_inorder(path) @@ -590,7 +593,9 @@ def _process(self, archive, cache, matcher, exclude_caches, exclude_if_present, status = '?' # need to add a status code somewhere else: status = '-' # dry run, item was not backed up - self.print_file_status(status, path) + + if not recurse_excluded_dir: + self.print_file_status(status, path) @staticmethod def build_filter(matcher, peek_and_store_hardlink_masters, strip_components): @@ -616,7 +621,7 @@ def do_extract(self, args, repository, manifest, key, archive): if sys.platform.startswith(('linux', 'freebsd', 'netbsd', 'openbsd', 'darwin', )): logger.warning('Hint: You likely need to fix your locale setup. E.g. install locales and use: LANG=en_US.UTF-8') - matcher, include_patterns = self.build_matcher(args.patterns, args.paths) + matcher = self.build_matcher(args.patterns, args.paths) progress = args.progress output_list = args.output_list @@ -681,9 +686,8 @@ def peek_and_store_hardlink_masters(item, matched): archive.extract_item(dir_item) except BackupOSError as e: self.print_warning('%s: %s', remove_surrogates(dir_item.path), e) - for pattern in include_patterns: - if pattern.match_count == 0: - self.print_warning("Include pattern '%s' never matched.", pattern) + for pattern in matcher.get_unmatched_include_patterns(): + self.print_warning("Include pattern '%s' never matched.", pattern) if pi: # clear progress output pi.finish() @@ -893,13 +897,13 @@ def compare_or_defer(item1, item2): 'If you know for certain that they are the same, pass --same-chunker-params ' 'to override this check.') - matcher, include_patterns = self.build_matcher(args.patterns, args.paths) + matcher = self.build_matcher(args.patterns, args.paths) compare_archives(archive1, archive2, matcher) - for pattern in include_patterns: - if pattern.match_count == 0: - self.print_warning("Include pattern '%s' never matched.", pattern) + for pattern in matcher.get_unmatched_include_patterns(): + self.print_warning("Include pattern '%s' never matched.", pattern) + return self.exit_code @with_repository(exclusive=True, cache=True) @@ -1048,7 +1052,7 @@ def write(bytestring): return self._list_repository(args, manifest, write) def _list_archive(self, args, repository, manifest, key, write): - matcher, _ = self.build_matcher(args.patterns, args.paths) + matcher = self.build_matcher(args.patterns, args.paths) if args.format is not None: format = args.format elif args.short: @@ -1330,7 +1334,7 @@ def do_recreate(self, args, repository, manifest, key, cache): env_var_override='BORG_RECREATE_I_KNOW_WHAT_I_AM_DOING'): return EXIT_ERROR - matcher, include_patterns = self.build_matcher(args.patterns, args.paths) + matcher = self.build_matcher(args.patterns, args.paths) self.output_list = args.output_list self.output_filter = args.output_filter recompress = args.recompress != 'never' diff --git a/src/borg/helpers.py b/src/borg/helpers.py index 2766c04133..a52ff26bf1 100644 --- a/src/borg/helpers.py +++ b/src/borg/helpers.py @@ -23,6 +23,7 @@ from binascii import hexlify from collections import namedtuple, deque, abc, Counter from datetime import datetime, timezone, timedelta +from enum import Enum from fnmatch import translate from functools import wraps, partial, lru_cache from itertools import islice @@ -388,23 +389,24 @@ def parse_timestamp(timestamp): return datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S').replace(tzinfo=timezone.utc) -def parse_add_pattern(patternstr, roots, patterns, fallback): - """Parse a pattern string and add it to roots or patterns depending on the pattern type.""" - pattern = parse_inclexcl_pattern(patternstr, fallback=fallback) - if pattern.ptype is RootPath: - roots.append(pattern.pattern) - elif pattern.ptype is PatternStyle: - fallback = pattern.pattern +def parse_patternfile_line(line, roots, ie_commands, fallback): + """Parse a pattern-file line and act depending on which command it represents.""" + ie_command = parse_inclexcl_command(line, fallback=fallback) + if ie_command.cmd is IECommand.RootPath: + roots.append(ie_command.val) + elif ie_command.cmd is IECommand.PatternStyle: + fallback = ie_command.val else: - patterns.append(pattern) + # it is some kind of include/exclude command + ie_commands.append(ie_command) return fallback -def load_pattern_file(fileobj, roots, patterns, fallback=None): +def load_pattern_file(fileobj, roots, ie_commands, fallback=None): if fallback is None: fallback = ShellPattern # ShellPattern is defined later in this module - for patternstr in clean_lines(fileobj): - fallback = parse_add_pattern(patternstr, roots, patterns, fallback) + for line in clean_lines(fileobj): + fallback = parse_patternfile_line(line, roots, ie_commands, fallback) def load_exclude_file(fileobj, patterns): @@ -417,7 +419,7 @@ def __init__(self, nargs=1, **kw): super().__init__(nargs=nargs, **kw) def __call__(self, parser, args, values, option_string=None): - parse_add_pattern(values[0], args.paths, args.patterns, ShellPattern) + parse_patternfile_line(values[0], args.paths, args.patterns, ShellPattern) class ArgparsePatternFileAction(argparse.Action): @@ -442,6 +444,11 @@ def parse(self, fobj, args): class PatternMatcher: + """Represents a collection of pattern objects to match paths against. + + *fallback* is a boolean value that *match()* returns if no matching patterns are found. + + """ def __init__(self, fallback=None): self._items = [] @@ -451,42 +458,88 @@ def __init__(self, fallback=None): # optimizations self._path_full_patterns = {} # full path -> return value + # indicates whether the last match() call ended on a pattern for which + # we should recurse into any matching folder. Will be set to True or + # False when calling match(). + self.recurse_dir = None + + # whether to recurse into directories when no match is found + # TODO: allow modification as a config option? + self.recurse_dir_default = True + + self.include_patterns = [] + + # TODO: move this info to parse_inclexcl_command and store in PatternBase subclass? + self.is_include_cmd = { + IECommand.Exclude: False, + IECommand.ExcludeNoRecurse: False, + IECommand.Include: True + } + def empty(self): return not len(self._items) and not len(self._path_full_patterns) - def _add(self, pattern, value): + def _add(self, pattern, cmd): + """*cmd* is an IECommand value. + """ if isinstance(pattern, PathFullPattern): key = pattern.pattern # full, normalized path - self._path_full_patterns[key] = value + self._path_full_patterns[key] = cmd else: - self._items.append((pattern, value)) + self._items.append((pattern, cmd)) - def add(self, patterns, value): - """Add list of patterns to internal list. The given value is returned from the match function when one of the - given patterns matches. + def add(self, patterns, cmd): + """Add list of patterns to internal list. *cmd* indicates whether the + pattern is an include/exclude pattern, and whether recursion should be + done on excluded folders. """ for pattern in patterns: - self._add(pattern, value) + self._add(pattern, cmd) + + def add_includepaths(self, include_paths): + """Used to add inclusion-paths from args.paths (from commandline). + """ + include_patterns = [parse_pattern(p, PathPrefixPattern) for p in include_paths] + self.add(include_patterns, IECommand.Include) + self.fallback = not include_patterns + self.include_patterns = include_patterns + + def get_unmatched_include_patterns(self): + "Note that this only returns patterns added via *add_includepaths*." + return [p for p in self.include_patterns if p.match_count == 0] def add_inclexcl(self, patterns): - """Add list of patterns (of type InclExclPattern) to internal list. The patterns ptype member is returned from - the match function when one of the given patterns matches. + """Add list of patterns (of type CmdTuple) to internal list. """ - for pattern, pattern_type in patterns: - self._add(pattern, pattern_type) + for pattern, cmd in patterns: + self._add(pattern, cmd) def match(self, path): + """Return True or False depending on whether *path* is matched. + + If no match is found among the patterns in this matcher, then the value + in self.fallback is returned (defaults to None). + + """ path = normalize_path(path) # do a fast lookup for full path matches (note: we do not count such matches): non_existent = object() value = self._path_full_patterns.get(path, non_existent) + if value is not non_existent: # we have a full path match! + # TODO: get from pattern; don't hard-code + self.recurse_dir = True return value + # this is the slow way, if we have many patterns in self._items: - for (pattern, value) in self._items: + for (pattern, cmd) in self._items: if pattern.match(path, normalize=False): - return value + self.recurse_dir = pattern.recurse_dir + return self.is_include_cmd[cmd] + + # by default we will recurse if there is no match + self.recurse_dir = self.recurse_dir_default return self.fallback @@ -502,14 +555,15 @@ class PatternBase: """ PREFIX = NotImplemented - def __init__(self, pattern): + def __init__(self, pattern, recurse_dir=False): self.pattern_orig = pattern self.match_count = 0 pattern = normalize_path(pattern) self._prepare(pattern) + self.recurse_dir = recurse_dir def match(self, path, normalize=True): - """match the given path against this pattern. + """Return a boolean indicating whether *path* is matched by this pattern. If normalize is True (default), the path will get normalized using normalize_path(), otherwise it is assumed that it already is normalized using that function. @@ -528,6 +582,7 @@ def __str__(self): return self.pattern_orig def _prepare(self, pattern): + "Should set the value of self.pattern" raise NotImplementedError def _match(self, path): @@ -625,7 +680,7 @@ def _match(self, path): return (self.regex.search(path) is not None) -_PATTERN_STYLES = set([ +_PATTERN_CLASSES = set([ FnmatchPattern, PathFullPattern, PathPrefixPattern, @@ -633,65 +688,86 @@ def _match(self, path): ShellPattern, ]) -_PATTERN_STYLE_BY_PREFIX = dict((i.PREFIX, i) for i in _PATTERN_STYLES) +_PATTERN_CLASS_BY_PREFIX = dict((i.PREFIX, i) for i in _PATTERN_CLASSES) -InclExclPattern = namedtuple('InclExclPattern', 'pattern ptype') -RootPath = object() -PatternStyle = object() +CmdTuple = namedtuple('CmdTuple', 'val cmd') -def get_pattern_style(prefix): +class IECommand(Enum): + """A command that an InclExcl file line can represent. + """ + RootPath = 1 + PatternStyle = 2 + Include = 3 + Exclude = 4 + ExcludeNoRecurse = 5 + + +def get_pattern_class(prefix): try: - return _PATTERN_STYLE_BY_PREFIX[prefix] + return _PATTERN_CLASS_BY_PREFIX[prefix] except KeyError: raise ValueError("Unknown pattern style: {}".format(prefix)) from None -def parse_pattern(pattern, fallback=FnmatchPattern): +def parse_pattern(pattern, fallback=FnmatchPattern, recurse_dir=True): """Read pattern from string and return an instance of the appropriate implementation class. + """ if len(pattern) > 2 and pattern[2] == ":" and pattern[:2].isalnum(): (style, pattern) = (pattern[:2], pattern[3:]) - cls = get_pattern_style(style) + cls = get_pattern_class(style) else: cls = fallback - return cls(pattern) + return cls(pattern, recurse_dir) -def parse_exclude_pattern(pattern, fallback=FnmatchPattern): +def parse_exclude_pattern(pattern_str, fallback=FnmatchPattern): """Read pattern from string and return an instance of the appropriate implementation class. """ - epattern = parse_pattern(pattern, fallback) - return InclExclPattern(epattern, False) - - -def parse_inclexcl_pattern(pattern, fallback=ShellPattern): - """Read pattern from string and return a InclExclPattern object.""" - type_prefix_map = { - '-': False, - '+': True, - 'R': RootPath, - 'r': RootPath, - 'P': PatternStyle, - 'p': PatternStyle, + epattern_obj = parse_pattern(pattern_str, fallback) + return CmdTuple(epattern_obj, IECommand.Exclude) + + +def parse_inclexcl_command(cmd_line_str, fallback=ShellPattern): + """Read a --patterns-from command from string and return a CmdTuple object.""" + + cmd_prefix_map = { + '-': IECommand.Exclude, + '!': IECommand.ExcludeNoRecurse, + '+': IECommand.Include, + 'R': IECommand.RootPath, + 'r': IECommand.RootPath, + 'P': IECommand.PatternStyle, + 'p': IECommand.PatternStyle, } + try: - ptype = type_prefix_map[pattern[0]] - pattern = pattern[1:].lstrip() - if not pattern: - raise ValueError("Missing pattern!") + cmd = cmd_prefix_map[cmd_line_str[0]] + + # remaining text on command-line following the command character + remainder_str = cmd_line_str[1:].lstrip() + + if not remainder_str: + raise ValueError("Missing pattern/information!") except (IndexError, KeyError, ValueError): - raise argparse.ArgumentTypeError("Unable to parse pattern: {}".format(pattern)) - if ptype is RootPath: - pobj = pattern - elif ptype is PatternStyle: + raise argparse.ArgumentTypeError("Unable to parse pattern/command: {}".format(cmd_line_str)) + + if cmd is IECommand.RootPath: + # TODO: validate string? + val = remainder_str + elif cmd is IECommand.PatternStyle: + # then remainder_str is something like 're' or 'sh' try: - pobj = get_pattern_style(pattern) + val = get_pattern_class(remainder_str) except ValueError: - raise argparse.ArgumentTypeError("Unable to parse pattern: {}".format(pattern)) + raise argparse.ArgumentTypeError("Invalid pattern style: {}".format(remainder_str)) else: - pobj = parse_pattern(pattern, fallback) - return InclExclPattern(pobj, ptype) + # determine recurse_dir based on command type + recurse_dir = cmd not in [IECommand.ExcludeNoRecurse] + val = parse_pattern(remainder_str, fallback, recurse_dir) + + return CmdTuple(val, cmd) def timestamp(s): diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index dc29194050..366900d3e2 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -37,6 +37,7 @@ from ..helpers import Manifest from ..helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR from ..helpers import bin_to_hex +from ..helpers import IECommand from ..item import Item from ..key import KeyfileKeyBase, RepoKey, KeyfileKey, Passphrase, TAMRequiredError from ..keymanager import RepoIdMismatch, NotABorgKeyFile @@ -929,6 +930,40 @@ def test_create_pattern_file(self): self.assert_in('x input/file2', output) self.assert_in('x input/otherfile', output) + def test_create_pattern_exclude_folder_but_recurse(self): + """test when patterns exclude a parent folder, but include a child""" + self.patterns_file_path2 = os.path.join(self.tmpdir, 'patterns2') + with open(self.patterns_file_path2, 'wb') as fd: + fd.write(b'+ input/x/b\n- input/x*\n') + + self.cmd('init', '--encryption=repokey', self.repository_location) + self.create_regular_file('x/a/foo_a', size=1024 * 80) + self.create_regular_file('x/b/foo_b', size=1024 * 80) + self.create_regular_file('y/foo_y', size=1024 * 80) + output = self.cmd('create', '-v', '--list', + '--patterns-from=' + self.patterns_file_path2, + self.repository_location + '::test', 'input') + self.assert_in('x input/x/a/foo_a', output) + self.assert_in("A input/x/b/foo_b", output) + self.assert_in('A input/y/foo_y', output) + + def test_create_pattern_exclude_folder_no_recurse(self): + """test when patterns exclude a parent folder and, but include a child""" + self.patterns_file_path2 = os.path.join(self.tmpdir, 'patterns2') + with open(self.patterns_file_path2, 'wb') as fd: + fd.write(b'+ input/x/b\n! input/x*\n') + + self.cmd('init', '--encryption=repokey', self.repository_location) + self.create_regular_file('x/a/foo_a', size=1024 * 80) + self.create_regular_file('x/b/foo_b', size=1024 * 80) + self.create_regular_file('y/foo_y', size=1024 * 80) + output = self.cmd('create', '-v', '--list', + '--patterns-from=' + self.patterns_file_path2, + self.repository_location + '::test', 'input') + self.assert_not_in('input/x/a/foo_a', output) + self.assert_not_in('input/x/a', output) + self.assert_in('A input/y/foo_y', output) + def test_extract_pattern_opt(self): self.cmd('init', '--encryption=repokey', self.repository_location) self.create_regular_file('file1', size=1024 * 80) @@ -2889,7 +2924,7 @@ def peek_and_store_hardlink_masters(item, matched): def test_basic(self): matcher = PatternMatcher() - matcher.add([parse_pattern('included')], True) + matcher.add([parse_pattern('included')], IECommand.Include) filter = Archiver.build_filter(matcher, self.peek_and_store_hardlink_masters, 0) assert filter(Item(path='included')) assert filter(Item(path='included/file')) diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py index 3938b722b1..f5db0992d4 100644 --- a/src/borg/testsuite/helpers.py +++ b/src/borg/testsuite/helpers.py @@ -557,12 +557,12 @@ def test_switch_patterns_style(): roots, patterns = [], [] load_pattern_file(pattern_file, roots, patterns) assert len(patterns) == 6 - assert isinstance(patterns[0].pattern, ShellPattern) - assert isinstance(patterns[1].pattern, FnmatchPattern) - assert isinstance(patterns[2].pattern, RegexPattern) - assert isinstance(patterns[3].pattern, RegexPattern) - assert isinstance(patterns[4].pattern, PathPrefixPattern) - assert isinstance(patterns[5].pattern, ShellPattern) + assert isinstance(patterns[0].val, ShellPattern) + assert isinstance(patterns[1].val, FnmatchPattern) + assert isinstance(patterns[2].val, RegexPattern) + assert isinstance(patterns[3].val, RegexPattern) + assert isinstance(patterns[4].val, PathPrefixPattern) + assert isinstance(patterns[5].val, ShellPattern) @pytest.mark.parametrize("lines", [ @@ -682,6 +682,10 @@ def test_pattern_matcher(): for i in ["", "foo", "bar"]: assert pm.match(i) is None + # add extra entries to aid in testing + for target in ["A", "B", "Empty", "FileNotFound"]: + pm.is_include_cmd[target] = target + pm.add([RegexPattern("^a")], "A") pm.add([RegexPattern("^b"), RegexPattern("^z")], "B") pm.add([RegexPattern("^$")], "Empty")