diff --git a/dvc/repo/ls.py b/dvc/repo/ls.py index e02d52116a..22e5e450b1 100644 --- a/dvc/repo/ls.py +++ b/dvc/repo/ls.py @@ -76,9 +76,10 @@ def onerror(exc): if not recursive: for dname in dirs: info = PathInfo(root) / dname - if not dvc_only or ( - tree.dvctree and tree.dvctree.exists(info) - ): + t = tree._get_tree( # pylint: disable=protected-access + info + ) + if not dvc_only or (t.dvctree and t.dvctree.exists(info)): dvc = tree.isdvc(info) path = str(info.relative_to(path_info)) ret[path] = { diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 8011d0318c..f50c7eae9b 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -1,5 +1,10 @@ import logging import os +import threading +from itertools import takewhile + +from funcy import wrap_with +from pygtrie import StringTrie from dvc.dvcfile import is_valid_filename from dvc.exceptions import OutputNotFoundError @@ -236,7 +241,7 @@ def get_file_hash(self, path_info): return out.checksum -class RepoTree(BaseTree): # pylint:disable=abstract-method +class CombinedTree(BaseTree): # pylint:disable=abstract-method """DVC + git-tracked files tree. Args: @@ -355,7 +360,13 @@ def _walk(self, dvc_walk, repo_walk, dvcfiles=False): yield from self._walk_one(repo_walk) def walk( - self, top, topdown=True, onerror=None, dvcfiles=False, **kwargs + self, + top, + topdown=True, + onerror=None, + dvcfiles=False, + ignore_subrepos=True, + **kwargs ): # pylint: disable=arguments-differ """Walk and merge both DVC and repo trees. @@ -391,19 +402,19 @@ def walk( return if repo_exists and not dvc_exists: yield from self.repo.tree.walk( - top, topdown=topdown, onerror=onerror + top, + topdown=topdown, + onerror=onerror, + ignore_subrepos=ignore_subrepos, ) return dvc_walk = self.dvctree.walk(top, topdown=topdown, **kwargs) - repo_walk = self.repo.tree.walk(top, topdown=topdown) + repo_walk = self.repo.tree.walk( + top, topdown=topdown, ignore_subrepos=ignore_subrepos + ) yield from self._walk(dvc_walk, repo_walk, dvcfiles=dvcfiles) - def walk_files(self, top, **kwargs): # pylint: disable=arguments-differ - for root, _, files in self.walk(top, **kwargs): - for fname in files: - yield PathInfo(root) / fname - def get_file_hash(self, path_info): """Return file checksum for specified path. @@ -420,6 +431,118 @@ def get_file_hash(self, path_info): pass return file_md5(path_info, self)[0] + @property + def hash_jobs(self): # pylint: disable=invalid-overridden-method + return self.repo.tree.hash_jobs + + +class RepoTree(BaseTree): + scheme = "local" + PARAM_CHECKSUM = "md5" + + def __init__(self, repo, subrepos=False, repo_factory=None, **kwargs): + super().__init__(repo, {"url": repo.root_dir}) + + if not repo_factory: + from dvc.repo import Repo + + self.repo_factory = Repo + else: + self.repo_factory = repo_factory + + self._main_repo = repo + self.root_dir = repo.root_dir + self._traverse_subrepos = subrepos + + self.subtrees = StringTrie(separator=os.sep) + self.subtrees[self.root_dir] = CombinedTree(repo, **kwargs) + self._tree_configs = kwargs + + def _get_tree(self, path) -> CombinedTree: + path = os.path.abspath(path) + tree = self.subtrees.get(path) + if tree: + return tree + + prefix, tree = self.subtrees.longest_prefix(path) + if not prefix: + return self.subtrees.get(self.root_dir) + + parents = (parent.fspath for parent in PathInfo(path).parents) + dirs = [path] + list(takewhile(lambda p: p != prefix, parents)) + dirs.reverse() + self._update(dirs, start=tree) + return self.subtrees.get(path) + + @wrap_with(threading.Lock()) + def _update(self, dirs, start): + tree = start + for d in dirs: + if self._is_dvc_repo(d): + repo = self.repo_factory(d) + tree = CombinedTree(repo, **self._tree_configs) + self.subtrees[d] = tree + + def _is_dvc_repo(self, dir_path): + if not self._traverse_subrepos: + return False + + from dvc.repo import Repo + + repo_path = os.path.join(dir_path, Repo.DVC_DIR) + return self._main_repo.tree.isdir(repo_path, use_dvcignore=False) + + @property + def fetch(self): + return "fetch" in self._tree_configs + + @property + def stream(self): + return "stream" in self._tree_configs + + def open( + self, path_info, *args, **kwargs + ): # pylint: disable=signature-differs + return self._get_tree(path_info).open(path_info, *args, **kwargs) + + def exists(self, path_info, **kwargs): # pylint: disable=arguments-differ + return self._get_tree(path_info).exists(path_info, **kwargs) + + def isdir(self, path_info): + return self._get_tree(path_info).isdir(path_info) + + def isfile(self, path_info): + return self._get_tree(path_info).isfile(path_info) + + def isdvc(self, path, **kwargs): + return self._get_tree(path).isdvc(path, **kwargs) + + def isexec(self, path): + return self._get_tree(path).isexec(path) + + def stat(self, path): + return self._get_tree(path).stat(path) + + def walk(self, top, *args, **kwargs): + tree = self._get_tree(top) + for root, dirs, files in tree.walk( + top, *args, ignore_subrepos=not self._traverse_subrepos, **kwargs + ): + yield root, dirs, files + + for dirname in dirs: + dir_path = os.path.join(root, dirname) + if self._is_dvc_repo(dir_path): + yield from self.walk(dir_path, *args, **kwargs) + + def walk_files(self, top, **kwargs): # pylint: disable=arguments-differ + for root, _, files in self.walk(top, **kwargs): + for fname in files: + yield PathInfo(root) / fname + + def get_file_hash(self, path_info): + return self._get_tree(path_info).get_file_hash(path_info) + def copytree(self, top, dest): top = PathInfo(top) dest = PathInfo(dest) @@ -444,4 +567,4 @@ def copytree(self, top, dest): @property def hash_jobs(self): # pylint: disable=invalid-overridden-method - return self.repo.tree.hash_jobs + return self._get_tree(self.root_dir).hash_jobs diff --git a/dvc/tree/git.py b/dvc/tree/git.py index e4f2548982..066f77be6d 100644 --- a/dvc/tree/git.py +++ b/dvc/tree/git.py @@ -240,3 +240,6 @@ def walk_files(self, top): # pylint: disable=arguments-differ for file in files: # NOTE: os.path.join is ~5.5 times slower yield f"{root}{os.sep}{file}" + + def _reset(self): + return self.__dict__.pop("dvcignore", None) diff --git a/dvc/tree/local.py b/dvc/tree/local.py index ca3b8cfbfe..994277aea3 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -357,3 +357,6 @@ def _remove_unpacked_dir(self, hash_): info = self.hash_to_path_info(hash_) path_info = info.with_name(info.name + self.UNPACKED_DIR_SUFFIX) self.remove(path_info) + + def _reset(self): + return self.__dict__.pop("dvcignore", None) diff --git a/tests/unit/repo/test_repo_tree.py b/tests/unit/repo/test_repo_tree.py index 417b2c0b92..73520bd374 100644 --- a/tests/unit/repo/test_repo_tree.py +++ b/tests/unit/repo/test_repo_tree.py @@ -1,5 +1,6 @@ import os import shutil +from unittest import mock import pytest @@ -177,3 +178,184 @@ def test_isdvc(tmp_dir, dvc): assert tree.isdvc("dir") assert not tree.isdvc("dir/baz") assert tree.isdvc("dir/baz", recursive=True, strict=False) + + +def make_subrepo(dir_, scm, config=None): + dir_.mkdir(parents=True) + with dir_.chdir(): + dir_.scm = scm + dir_.init(dvc=True, subdir=True) + if config: + dir_.add_remote(config=config) + + +def test_subrepos(tmp_dir, scm, dvc): + tmp_dir.scm_gen( + {"dir": {"repo.txt": "file to confuse RepoTree"}}, + commit="dir/repo.txt", + ) + + subrepo1 = tmp_dir / "dir" / "repo" + subrepo2 = tmp_dir / "dir" / "repo2" + + for repo in [subrepo1, subrepo2]: + make_subrepo(repo, scm) + + subrepo1.dvc_gen({"foo": "foo", "dir1": {"bar": "bar"}}, commit="FOO") + subrepo2.dvc_gen( + {"lorem": "lorem", "dir2": {"ipsum": "ipsum"}}, commit="BAR" + ) + + dvc.tree._reset() + tree = RepoTree(dvc, subrepos=True, fetch=True) + + def assert_tree_belongs_to_repo(ret_val): + method = tree._get_tree + + def f(*args, **kwargs): + r = method(*args, **kwargs) + assert r.repo.root_dir == ret_val.root_dir + return r + + return f + + with mock.patch.object( + tree, + "_get_tree", + side_effect=assert_tree_belongs_to_repo(subrepo1.dvc), + ): + assert tree.exists(subrepo1 / "foo") is True + assert tree.exists(subrepo1 / "bar") is False + + assert tree.isfile(subrepo1 / "foo") is True + assert tree.isfile(subrepo1 / "foo") is True + assert tree.isfile(subrepo1 / "dir1" / "bar") is True + assert tree.isfile(subrepo1 / "dir1") is False + + assert tree.isdir(subrepo1 / "dir1") is True + assert tree.isdir(subrepo1 / "dir1" / "bar") is False + assert tree.isdvc(subrepo1 / "foo") is True + + with mock.patch.object( + tree, + "_get_tree", + side_effect=assert_tree_belongs_to_repo(subrepo2.dvc), + ): + assert tree.exists(subrepo2 / "lorem") is True + assert tree.exists(subrepo2 / "ipsum") is False + + assert tree.isfile(subrepo2 / "lorem") is True + assert tree.isfile(subrepo2 / "lorem") is True + assert tree.isfile(subrepo2 / "dir2" / "ipsum") is True + assert tree.isfile(subrepo2 / "dir2") is False + + assert tree.isdir(subrepo2 / "dir2") is True + assert tree.isdir(subrepo2 / "dir2" / "ipsum") is False + assert tree.isdvc(subrepo2 / "lorem") is True + + +@pytest.mark.parametrize( + "dvcfiles,extra_expected", + [ + (False, []), + ( + True, + [ + PathInfo("dir") / "repo" / "foo.dvc", + PathInfo("dir") / "repo" / "dir1.dvc", + PathInfo("dir") / "repo2" / "lorem.dvc", + PathInfo("dir") / "repo2" / "dir2.dvc", + ], + ), + ], +) +def test_subrepo_walk(tmp_dir, scm, dvc, dvcfiles, extra_expected): + tmp_dir.scm_gen( + {"dir": {"repo.txt": "file to confuse RepoTree"}}, + commit="dir/repo.txt", + ) + + subrepo1 = tmp_dir / "dir" / "repo" + subrepo2 = tmp_dir / "dir" / "repo2" + + subdirs = [subrepo1, subrepo2] + for dir_ in subdirs: + make_subrepo(dir_, scm) + + subrepo1.dvc_gen({"foo": "foo", "dir1": {"bar": "bar"}}, commit="FOO") + subrepo2.dvc_gen( + {"lorem": "lorem", "dir2": {"ipsum": "ipsum"}}, commit="BAR" + ) + + # using tree that does not have dvcignore + dvc.tree._reset() + tree = RepoTree(dvc, subrepos=True, fetch=True) + expected = [ + PathInfo("dir") / "repo", + PathInfo("dir") / "repo.txt", + PathInfo("dir") / "repo2", + PathInfo("dir") / "repo" / ".dvcignore", + PathInfo("dir") / "repo" / ".gitignore", + PathInfo("dir") / "repo" / "foo", + PathInfo("dir") / "repo" / "dir1", + PathInfo("dir") / "repo" / "dir1" / "bar", + PathInfo("dir") / "repo2" / ".dvcignore", + PathInfo("dir") / "repo2" / ".gitignore", + PathInfo("dir") / "repo2" / "lorem", + PathInfo("dir") / "repo2" / "dir2", + PathInfo("dir") / "repo2" / "dir2" / "ipsum", + ] + + actual = [] + for root, dirs, files in tree.walk("dir", dvcfiles=dvcfiles): + for entry in dirs + files: + actual.append(os.path.join(root, entry)) + + expected = [str(path) for path in expected + extra_expected] + assert set(actual) == set(expected) + assert len(actual) == len(expected) + + +def test_repo_tree_no_subrepos(tmp_dir, dvc, scm): + tmp_dir.scm_gen( + {"dir": {"repo.txt": "file to confuse RepoTree"}}, + commit="dir/repo.txt", + ) + tmp_dir.dvc_gen({"lorem": "lorem"}, commit="add foo") + + subrepo = tmp_dir / "dir" / "repo" + make_subrepo(subrepo, scm) + subrepo.dvc_gen({"foo": "foo", "dir1": {"bar": "bar"}}, commit="FOO") + subrepo.scm_gen({"ipsum": "ipsum"}, commit="BAR") + + # using tree that does not have dvcignore + dvc.tree._reset() + tree = RepoTree(dvc, subrepos=False, fetch=True) + expected = [ + tmp_dir / ".dvcignore", + tmp_dir / ".gitignore", + tmp_dir / "lorem", + tmp_dir / "lorem.dvc", + tmp_dir / "dir", + tmp_dir / "dir" / "repo.txt", + ] + + actual = [] + for root, dirs, files in tree.walk(tmp_dir, dvcfiles=True): + for entry in dirs + files: + actual.append(os.path.normpath(os.path.join(root, entry))) + + expected = [str(path) for path in expected] + assert set(actual) == set(expected) + assert len(actual) == len(expected) + + assert tree.isfile(tmp_dir / "lorem") is True + assert tree.isfile(tmp_dir / "dir" / "repo" / "foo") is False + assert tree.isdir(tmp_dir / "dir" / "repo") is False + assert tree.isdir(tmp_dir / "dir") is True + + assert tree.isdvc(tmp_dir / "lorem") is True + assert tree.isdvc(tmp_dir / "dir" / "repo" / "dir1") is False + + assert tree.exists(tmp_dir / "dir" / "repo.txt") is True + assert tree.exists(tmp_dir / "repo" / "ipsum") is False