From 7c722b230c963ff0726f75f13f67f39362f846e9 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Tue, 26 May 2020 15:35:23 +0900 Subject: [PATCH 1/5] tests: test that CleanTree works outside of walk() --- tests/func/test_tree.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index 64bb733aa8..1df0000d5e 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -3,6 +3,7 @@ from dvc.ignore import CleanTree from dvc.path_info import PathInfo +from dvc.repo import Repo from dvc.repo.tree import RepoTree from dvc.scm import SCM from dvc.scm.git import GitTree @@ -220,3 +221,23 @@ def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, setup_remote): cache.save(PathInfo(erepo_dir / "dir"), None, tree=tree) for checksum in expected: assert os.path.exists(cache.checksum_to_path_info(checksum)) + + +def test_cleantree_subrepo(tmp_dir, dvc, scm, monkeypatch): + tmp_dir.gen({"subdir": {}}) + subrepo_dir = tmp_dir / "subdir" + with subrepo_dir.chdir(): + subrepo = Repo.init(subdir=True) + subrepo_dir.gen({"foo": "foo", "dir": {"bar": "bar"}}) + + assert isinstance(dvc.tree, CleanTree) + assert not dvc.tree.exists(subrepo_dir / "foo") + assert not dvc.tree.isfile(subrepo_dir / "foo") + assert not dvc.tree.exists(subrepo_dir / "dir") + assert not dvc.tree.isdir(subrepo_dir / "dir") + + assert isinstance(subrepo.tree, CleanTree) + assert subrepo.tree.exists(subrepo_dir / "foo") + assert subrepo.tree.isfile(subrepo_dir / "foo") + assert subrepo.tree.exists(subrepo_dir / "dir") + assert subrepo.tree.isdir(subrepo_dir / "dir") From a0f946353a2e01ee48327a33d9897ee7769232ed Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Tue, 26 May 2020 15:37:08 +0900 Subject: [PATCH 2/5] ignore: CleanTree should always check ignores, not just for walk() --- dvc/ignore.py | 60 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index eefa3bfef8..585049c4e0 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -5,7 +5,9 @@ from pathspec import PathSpec from pathspec.patterns import GitWildMatchPattern +from dvc.path_info import PathInfo from dvc.scm.tree import BaseTree +from dvc.utils import relpath logger = logging.getLogger(__name__) @@ -125,19 +127,63 @@ def tree_root(self): return self.tree.tree_root def open(self, path, mode="r", encoding="utf-8"): - return self.tree.open(path, mode, encoding) + if self.isfile(path): + return self.tree.open(path, mode, encoding) + raise FileNotFoundError def exists(self, path): - return self.tree.exists(path) + return self._parents_exist(path) and ( + self._isdir(path) or self._isfile(path) + ) def isdir(self, path): - return self.tree.isdir(path) + return self._parents_exist(path) and self._isdir(path) + + def _isdir(self, path: PathInfo): + if self.tree.isdir(path): + dirname, basename = os.path.split(os.path.normpath(path)) + dirs, _ = self.dvcignore(os.path.abspath(dirname), [basename], []) + if dirs: + return True + return False def isfile(self, path): - return self.tree.isfile(path) + return self._parents_exist(path) and self._isfile(path) + + def _isfile(self, path: PathInfo): + if self.tree.isfile(path): + dirname, basename = os.path.split(os.path.normpath(path)) + _, files = self.dvcignore(os.path.abspath(dirname), [], [basename]) + if files: + return True + return False def isexec(self, path): - return self.tree.isexec(path) + return self.exists(path) and self.tree.isexec(path) + + def _parents_exist(self, path): + from dvc.repo import Repo + + path = PathInfo(path) + + # if parent is tree_root or in tree_root/.dvc we can skip this check + if path.parent == self.tree_root or path.parent.overlaps( + os.path.join(self.tree_root, Repo.DVC_DIR) + ): + return True + + # check if parent directories are in our ignores, starting from + # tree_root + path = relpath(path, self.tree_root) + for parent_dir in reversed(PathInfo(path).parents): + dirname, basename = os.path.split(parent_dir) + if basename == ".": + # parent_dir == tree_root + continue + dirs, _ = self.dvcignore(os.path.abspath(dirname), [basename], []) + if not dirs: + return False + return True def walk(self, top, topdown=True): for root, dirs, files in self.tree.walk(top, topdown): @@ -148,4 +194,6 @@ def walk(self, top, topdown=True): yield root, dirs, files def stat(self, path): - return self.tree.stat(path) + if self.exists(path): + return self.tree.stat(path) + raise FileNotFoundError From a277dd218397c4ece2e41f8d5ab66830c4fc1273 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Tue, 26 May 2020 16:36:09 +0900 Subject: [PATCH 3/5] CleanTree: handle broken symlinks --- dvc/ignore.py | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 585049c4e0..fc44522700 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -132,30 +132,38 @@ def open(self, path, mode="r", encoding="utf-8"): raise FileNotFoundError def exists(self, path): - return self._parents_exist(path) and ( - self._isdir(path) or self._isfile(path) - ) + if self.tree.exists(path) and self._parents_exist(path): + if self.tree.isdir(path): + return self._valid_dirname(path) + return self._valid_filename(path) + return False def isdir(self, path): - return self._parents_exist(path) and self._isdir(path) + return ( + self.tree.isdir(path) + and self._parents_exist(path) + and self._valid_dirname(path) + ) - def _isdir(self, path: PathInfo): - if self.tree.isdir(path): - dirname, basename = os.path.split(os.path.normpath(path)) - dirs, _ = self.dvcignore(os.path.abspath(dirname), [basename], []) - if dirs: - return True + def _valid_dirname(self, path): + dirname, basename = os.path.split(os.path.normpath(path)) + dirs, _ = self.dvcignore(os.path.abspath(dirname), [basename], []) + if dirs: + return True return False def isfile(self, path): - return self._parents_exist(path) and self._isfile(path) - - def _isfile(self, path: PathInfo): - if self.tree.isfile(path): - dirname, basename = os.path.split(os.path.normpath(path)) - _, files = self.dvcignore(os.path.abspath(dirname), [], [basename]) - if files: - return True + return ( + self.tree.isfile(path) + and self._parents_exist(path) + and self._valid_filename(path) + ) + + def _valid_filename(self, path): + dirname, basename = os.path.split(os.path.normpath(path)) + _, files = self.dvcignore(os.path.abspath(dirname), [], [basename]) + if files: + return True return False def isexec(self, path): From e6c0920df72faf520d660483ee12f1417d18738e Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Tue, 26 May 2020 17:22:59 +0900 Subject: [PATCH 4/5] handle case for linking from shared local cache dirs --- dvc/ignore.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index fc44522700..0fccf0b6a7 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -174,10 +174,8 @@ def _parents_exist(self, path): path = PathInfo(path) - # if parent is tree_root or in tree_root/.dvc we can skip this check - if path.parent == self.tree_root or path.parent.overlaps( - os.path.join(self.tree_root, Repo.DVC_DIR) - ): + # if parent is tree_root or inside a .dvc dir we can skip this check + if path.parent == self.tree_root or Repo.DVC_DIR in path.parts: return True # check if parent directories are in our ignores, starting from From 0ff1ab723d85017e17dc02ab84c219b9039a7e71 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Wed, 27 May 2020 14:55:38 +0900 Subject: [PATCH 5/5] Fix windows relpath/different drive issue --- dvc/ignore.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 0fccf0b6a7..384ba38bd0 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -178,9 +178,19 @@ def _parents_exist(self, path): if path.parent == self.tree_root or Repo.DVC_DIR in path.parts: return True + # if path is outside of tree, assume this is a local remote/local cache + # link/move operation where we do not need to filter ignores + path = relpath(path, self.tree_root) + if path.startswith("..") or ( + os.name == "nt" + and not os.path.commonprefix( + [os.path.abspath(path), self.tree_root] + ) + ): + return True + # check if parent directories are in our ignores, starting from # tree_root - path = relpath(path, self.tree_root) for parent_dir in reversed(PathInfo(path).parents): dirname, basename = os.path.split(parent_dir) if basename == ".":