From bc01116f87fd54a70f1eb89740cdc13b38c32890 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Mon, 3 Aug 2020 18:49:34 +0545 Subject: [PATCH 1/2] Allow ignoring subrepos in dvcignore I implemented this inside dvcignore, and it should not require "internal" APIs hack as we discussed before. Right now, `ignore_subrepos` will only work for the paths of subrepo, not anything inside it. And the API user is more or less only RepoTree, so should be safe (we switch DvcTree and repo.tree based on path prefixes, so this is only required during `walk`, so as it shows the repo that were previously dvcignored and so that we could switch the trees later during walk. So, if dir is `dir -> repo` So, `tree.walk("dir")` will return `[]`, whereas `tree.walk("dir", ignore_subrepos=False)` will return `["repo"]` But, successive walk will just return `("repo", [], [])`, and could just be ignored as a side-effect. --- dvc/ignore.py | 29 ++++++++++++++++++++--------- dvc/tree/git.py | 20 ++++++++++++++++---- dvc/tree/local.py | 21 ++++++++++++++++----- dvc/utils/collections.py | 11 +++++++++++ tests/dir_helpers.py | 18 ++++++++++++------ tests/func/test_ignore.py | 22 ++++++++++++++++++++++ tests/func/test_tree.py | 27 +++++++++++++++++++++++++++ 7 files changed, 124 insertions(+), 24 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 0e80a02103..71569b38b3 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -6,12 +6,12 @@ from pathspec.patterns import GitWildMatchPattern from pathspec.util import normalize_file -from pygtrie import StringTrie from dvc.path_info import PathInfo from dvc.pathspec_math import PatternInfo, merge_patterns from dvc.system import System from dvc.utils import relpath +from dvc.utils.collections import PathStringTrie logger = logging.getLogger(__name__) @@ -156,7 +156,7 @@ class DvcIgnoreFilterNoop: def __init__(self, tree, root_dir): pass - def __call__(self, root, dirs, files): + def __call__(self, root, dirs, files, **kwargs): return dirs, files def is_ignored_dir(self, _): @@ -183,10 +183,11 @@ def __init__(self, tree, root_dir): self.tree = tree self.root_dir = root_dir - self.ignores_trie_tree = StringTrie(separator=os.sep) + self.ignores_trie_tree = PathStringTrie() self.ignores_trie_tree[root_dir] = DvcIgnorePatterns( default_ignore_patterns, root_dir ) + self._ignored_subrepos = PathStringTrie() self._update(self.root_dir) def _update(self, dirname): @@ -222,7 +223,10 @@ def _update(self, dirname): def _update_sub_repo(self, root, dirs): for d in dirs: if self._is_dvc_repo(root, d): - new_pattern = DvcIgnorePatterns(["/{}/".format(d)], root) + self._ignored_subrepos[root] = self._ignored_subrepos.get( + root, set() + ) | {d} + new_pattern = DvcIgnorePatterns([f"/{d}/"], root) old_pattern = self.ignores_trie_tree.longest_prefix(root).value if old_pattern: self.ignores_trie_tree[root] = DvcIgnorePatterns( @@ -236,12 +240,13 @@ def _update_sub_repo(self, root, dirs): else: self.ignores_trie_tree[root] = new_pattern - def __call__(self, root, dirs, files): + def __call__(self, root, dirs, files, ignore_subrepos=True): ignore_pattern = self._get_trie_pattern(root) if ignore_pattern: - return ignore_pattern(root, dirs, files) - else: - return dirs, files + dirs, files = ignore_pattern(root, dirs, files) + if not ignore_subrepos: + dirs.extend(self._ignored_subrepos.get(root, [])) + return dirs, files def _get_trie_pattern(self, dirname): ignore_pattern = self.ignores_trie_tree.get(dirname) @@ -277,8 +282,14 @@ def _is_ignored(self, path, is_dir=False): else: return False - def is_ignored_dir(self, path): + def _is_subrepo(self, path): + dirname, basename = os.path.split(os.path.normpath(path)) + return basename in self._ignored_subrepos.get(dirname, set()) + + def is_ignored_dir(self, path, ignore_subrepos=True): path = os.path.abspath(path) + if not ignore_subrepos: + return not self._is_subrepo(path) if path == self.root_dir: return False diff --git a/dvc/tree/git.py b/dvc/tree/git.py index edf49964a7..e4f2548982 100644 --- a/dvc/tree/git.py +++ b/dvc/tree/git.py @@ -84,13 +84,15 @@ def exists( path ) and not self.dvcignore.is_ignored_dir(path) - def isdir(self, path): # pylint: disable=arguments-differ + def isdir( + self, path, use_dvcignore=True + ): # pylint: disable=arguments-differ obj = self._git_object_by_path(path) if obj is None: return False if obj.mode != GIT_MODE_DIR: return False - return not self.dvcignore.is_ignored_dir(path) + return not (use_dvcignore and self.dvcignore.is_ignored_dir(path)) def isfile(self, path): # pylint: disable=arguments-differ obj = self._git_object_by_path(path) @@ -156,7 +158,14 @@ def _walk(self, tree, topdown=True): if not topdown: yield os.path.normpath(tree.abspath), dirs, nondirs - def walk(self, top, topdown=True, onerror=None, use_dvcignore=True): + def walk( + self, + top, + topdown=True, + onerror=None, + use_dvcignore=True, + ignore_subrepos=True, + ): """Directory tree generator. See `os.walk` for the docs. Differences: @@ -176,7 +185,10 @@ def walk(self, top, topdown=True, onerror=None, use_dvcignore=True): for root, dirs, files in self._walk(tree, topdown=topdown): if use_dvcignore: dirs[:], files[:] = self.dvcignore( - os.path.abspath(root), dirs, files + os.path.abspath(root), + dirs, + files, + ignore_subrepos=ignore_subrepos, ) yield root, dirs, files diff --git a/dvc/tree/local.py b/dvc/tree/local.py index bfce3345f8..ca3b8cfbfe 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -86,18 +86,26 @@ def isfile(self, path_info): return not self.dvcignore.is_ignored_file(path_info) - def isdir(self, path_info): + def isdir( + self, path_info, use_dvcignore=True + ): # pylint: disable=arguments-differ if not os.path.isdir(path_info): return False - - return not self.dvcignore.is_ignored_dir(path_info) + return not (use_dvcignore and self.dvcignore.is_ignored_dir(path_info)) def iscopy(self, path_info): return not ( System.is_symlink(path_info) or System.is_hardlink(path_info) ) - def walk(self, top, topdown=True, onerror=None, use_dvcignore=True): + def walk( + self, + top, + topdown=True, + onerror=None, + use_dvcignore=True, + ignore_subrepos=True, + ): """Directory tree generator. See `os.walk` for the docs. Differences: @@ -108,7 +116,10 @@ def walk(self, top, topdown=True, onerror=None, use_dvcignore=True): ): if use_dvcignore: dirs[:], files[:] = self.dvcignore( - os.path.abspath(root), dirs, files + os.path.abspath(root), + dirs, + files, + ignore_subrepos=ignore_subrepos, ) yield os.path.normpath(root), dirs, files diff --git a/dvc/utils/collections.py b/dvc/utils/collections.py index 1ff72a3472..d59869b616 100644 --- a/dvc/utils/collections.py +++ b/dvc/utils/collections.py @@ -1,5 +1,16 @@ +import os from collections.abc import Mapping +from pygtrie import StringTrie as _StringTrie + + +class PathStringTrie(_StringTrie): + """Trie based on platform-dependent separator for pathname components.""" + + def __init__(self, *args, **kwargs): + kwargs["separator"] = os.sep + super().__init__(*args, **kwargs) + def apply_diff(src, dest): """Recursively apply changes from src to dest. diff --git a/tests/dir_helpers.py b/tests/dir_helpers.py index 02ae154fd6..6d1abf8137 100644 --- a/tests/dir_helpers.py +++ b/tests/dir_helpers.py @@ -74,6 +74,8 @@ class TmpDir(pathlib.Path): + scheme = "local" + def __new__(cls, *args, **kwargs): if cls is TmpDir: cls = ( # pylint: disable=self-cls-assignment @@ -87,7 +89,7 @@ def __new__(cls, *args, **kwargs): self._init() return self - def init(self, *, scm=False, dvc=False): + def init(self, *, scm=False, dvc=False, subdir=False): from dvc.repo import Repo from dvc.scm.git import Git @@ -100,7 +102,9 @@ def init(self, *, scm=False, dvc=False): git_init(str_path) if dvc: self.dvc = Repo.init( - str_path, no_scm=not scm and not hasattr(self, "scm") + str_path, + no_scm=not scm and not hasattr(self, "scm"), + subdir=subdir, ) if scm: self.scm = self.dvc.scm if hasattr(self, "dvc") else Git(str_path) @@ -123,10 +127,10 @@ def gen(self, struct, text=""): if isinstance(struct, (str, bytes, pathlib.PurePath)): struct = {struct: text} - self._gen(struct) - return struct.keys() + return self._gen(struct) def _gen(self, struct, prefix=None): + paths = [] for name, contents in struct.items(): path = (prefix or self) / name @@ -141,6 +145,8 @@ def _gen(self, struct, prefix=None): path.write_bytes(contents) else: path.write_text(contents, encoding="utf-8") + paths.append(path) + return paths def dvc_gen(self, struct, text="", commit=None): paths = self.gen(struct, text) @@ -249,10 +255,10 @@ class PosixTmpDir(TmpDir, pathlib.PurePosixPath): @pytest.fixture(scope="session") def make_tmp_dir(tmp_path_factory, request): - def make(name, *, scm=False, dvc=False): + def make(name, *, scm=False, dvc=False, **kwargs): path = tmp_path_factory.mktemp(name) if isinstance(name, str) else name new_dir = TmpDir(path) - new_dir.init(scm=scm, dvc=dvc) + new_dir.init(scm=scm, dvc=dvc, **kwargs) request.addfinalizer(new_dir.close) return new_dir diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 67e8f71a5a..042329f79c 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -182,6 +182,28 @@ def test_ignore_subrepo(tmp_dir, scm, dvc): assert subrepo.tree.exists(PathInfo(subrepo_dir / "foo")) +def test_ignore_resurface_subrepo(tmp_dir, scm, dvc): + tmp_dir.dvc_gen({"foo": "foo"}, commit="add foo") + subrepo_dir = tmp_dir / "subdir" + subrepo_dir.mkdir() + with subrepo_dir.chdir(): + Repo.init(subdir=True) + + dvc.tree.__dict__.pop("dvcignore", None) + + dirs = ["subdir"] + files = ["foo"] + assert dvc.tree.dvcignore(os.fspath(tmp_dir), dirs, files) == ([], files) + assert dvc.tree.dvcignore( + os.fspath(tmp_dir), dirs, files, ignore_subrepos=False + ) == (dirs, files) + + assert dvc.tree.dvcignore.is_ignored_dir(os.fspath(subrepo_dir)) + assert not dvc.tree.dvcignore.is_ignored_dir( + os.fspath(subrepo_dir), ignore_subrepos=False + ) + + def test_ignore_blank_line(tmp_dir, dvc): tmp_dir.gen({"dir": {"ignored": "text", "other": "text2"}}) tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "foo\n\ndir/ignored") diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index 3ab93efc9b..e5c8fb6d30 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -245,3 +245,30 @@ def test_cleantree_subrepo(tmp_dir, dvc, scm, monkeypatch): assert subrepo.tree.isfile(path / "foo") assert subrepo.tree.exists(path / "dir") assert subrepo.tree.isdir(path / "dir") + + +def test_walk_dont_ignore_subrepos(tmp_dir, scm, dvc): + tmp_dir.dvc_gen({"foo": "foo"}, commit="add foo") + subrepo_dir = tmp_dir / "subdir" + subrepo_dir.mkdir() + with subrepo_dir.chdir(): + Repo.init(subdir=True) + scm.add(["subdir"]) + scm.commit("Add subrepo") + + assert list(dvc.tree.walk(os.fspath(tmp_dir))) == [ + (os.fspath(tmp_dir), [], ["foo.dvc", "foo", ".gitignore"]) + ] + assert list(dvc.tree.walk(os.fspath(tmp_dir), ignore_subrepos=False)) == [ + (os.fspath(tmp_dir), ["subdir"], ["foo.dvc", "foo", ".gitignore"]), + (os.fspath(subrepo_dir), [], []), + ] + + scm_tree = scm.get_tree("HEAD", use_dvcignore=True) + assert list(scm_tree.walk(os.fspath(tmp_dir))) == [ + (os.fspath(tmp_dir), [], [".gitignore", "foo.dvc"]) + ] + assert list(scm_tree.walk(os.fspath(tmp_dir), ignore_subrepos=False)) == [ + (os.fspath(tmp_dir), ["subdir"], [".gitignore", "foo.dvc"]), + (os.fspath(subrepo_dir), [], []), + ] From 294a76aec7cf0dd0c4977731cba81a2f2e6efd1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Thu, 6 Aug 2020 22:37:40 +0545 Subject: [PATCH 2/2] Fix tests --- tests/func/test_tree.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index e5c8fb6d30..ad09e5312f 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -1,4 +1,5 @@ import os +from operator import itemgetter from os.path import join from dvc.path_info import PathInfo @@ -256,19 +257,14 @@ def test_walk_dont_ignore_subrepos(tmp_dir, scm, dvc): scm.add(["subdir"]) scm.commit("Add subrepo") - assert list(dvc.tree.walk(os.fspath(tmp_dir))) == [ - (os.fspath(tmp_dir), [], ["foo.dvc", "foo", ".gitignore"]) - ] - assert list(dvc.tree.walk(os.fspath(tmp_dir), ignore_subrepos=False)) == [ - (os.fspath(tmp_dir), ["subdir"], ["foo.dvc", "foo", ".gitignore"]), - (os.fspath(subrepo_dir), [], []), - ] - + dvc_tree = dvc.tree scm_tree = scm.get_tree("HEAD", use_dvcignore=True) - assert list(scm_tree.walk(os.fspath(tmp_dir))) == [ - (os.fspath(tmp_dir), [], [".gitignore", "foo.dvc"]) - ] - assert list(scm_tree.walk(os.fspath(tmp_dir), ignore_subrepos=False)) == [ - (os.fspath(tmp_dir), ["subdir"], [".gitignore", "foo.dvc"]), - (os.fspath(subrepo_dir), [], []), - ] + path = os.fspath(tmp_dir) + get_dirs = itemgetter(1) + + assert get_dirs(next(dvc_tree.walk(path))) == [] + assert get_dirs(next(scm_tree.walk(path))) == [] + + kw = dict(ignore_subrepos=False) + assert get_dirs(next(dvc_tree.walk(path, **kw))) == ["subdir"] + assert get_dirs(next(scm_tree.walk(path, **kw))) == ["subdir"]