From 3a20c65a709b6d27e8cc12455c767f645351631f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 14 Jul 2020 17:34:52 +0545 Subject: [PATCH 01/11] get/list/import/api: subrepo support --- dvc/api.py | 36 ++-- dvc/dependency/repo.py | 27 +-- dvc/external_repo.py | 287 +++++++++++++++--------------- dvc/ignore.py | 9 +- dvc/repo/__init__.py | 46 ++--- dvc/repo/fetch.py | 7 +- dvc/repo/get.py | 26 ++- dvc/repo/ls.py | 14 +- dvc/repo/metrics/show.py | 2 +- dvc/repo/plots/__init__.py | 2 +- dvc/repo/tree.py | 182 ++++++++++++------- dvc/subrepos.py | 11 ++ dvc/tree/git.py | 12 +- dvc/tree/local.py | 12 +- tests/dir_helpers.py | 16 +- tests/func/test_external_repo.py | 23 ++- tests/func/test_get.py | 47 +++++ tests/func/test_import.py | 47 ++++- tests/func/test_tree.py | 4 +- tests/unit/repo/test_repo_tree.py | 20 +-- 20 files changed, 493 insertions(+), 337 deletions(-) create mode 100644 dvc/subrepos.py diff --git a/dvc/api.py b/dvc/api.py index a2d648aa17..84f857b68b 100644 --- a/dvc/api.py +++ b/dvc/api.py @@ -2,7 +2,12 @@ from contextlib import _GeneratorContextManager as GCM from contextlib import contextmanager -from dvc.exceptions import DvcException, NotDvcRepoError +from dvc.exceptions import ( + DvcException, + FileMissingError, + NotDvcRepoError, + PathMissingError, +) from dvc.external_repo import external_repo from dvc.repo import Repo @@ -26,10 +31,14 @@ def get_url(path, repo=None, rev=None, remote=None): directory in the remote storage. """ with _make_repo(repo, rev=rev) as _repo: - if not isinstance(_repo, Repo): - raise UrlNotDvcRepoError(_repo.url) # pylint: disable=no-member - out = _repo.find_out_by_relpath(path) - remote_obj = _repo.cloud.get_remote(remote) + # pylint: disable=no-member + path = os.path.join(_repo.root_dir, path) + is_erepo = not isinstance(_repo, Repo) + r = _repo.in_repo(path) if is_erepo else _repo + if is_erepo and not r: + raise UrlNotDvcRepoError(_repo.url) + out = r.find_out_by_relpath(path) + remote_obj = r.cloud.get_remote(remote) return str(remote_obj.tree.hash_to_path_info(out.checksum)) @@ -74,10 +83,17 @@ def __getattr__(self, name): def _open(path, repo=None, rev=None, remote=None, mode="r", encoding=None): with _make_repo(repo, rev=rev) as _repo: - with _repo.open_by_relpath( - path, remote=remote, mode=mode, encoding=encoding - ) as fd: - yield fd + is_erepo = not isinstance(_repo, Repo) + try: + with _repo.repo_tree.open_by_relpath( + path, remote=remote, mode=mode, encoding=encoding + ) as fd: + yield fd + except FileNotFoundError as exc: + if is_erepo: + # pylint: disable=no-member + raise PathMissingError(path, _repo.url) from exc + raise FileMissingError(path) from exc def read(path, repo=None, rev=None, remote=None, mode="r", encoding=None): @@ -101,5 +117,5 @@ def _make_repo(repo_url=None, rev=None): return except NotDvcRepoError: pass # fallthrough to external_repo - with external_repo(url=repo_url, rev=rev) as repo: + with external_repo(url=repo_url, rev=rev, stream=True) as repo: yield repo diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index eb3c58b5ce..fcc205b822 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -2,7 +2,6 @@ from voluptuous import Required -from dvc.exceptions import OutputNotFoundError from dvc.path_info import PathInfo from .local import LocalDependency @@ -42,30 +41,17 @@ def repo_pair(self): def __str__(self): return "{} ({})".format(self.def_path, self.def_repo[self.PARAM_URL]) - def _make_repo(self, *, locked=True): + def _make_repo(self, *, locked=True, **kwargs): from dvc.external_repo import external_repo d = self.def_repo rev = (d.get("rev_lock") if locked else None) or d.get("rev") - return external_repo(d["url"], rev=rev) + return external_repo(d["url"], rev=rev, **kwargs) def _get_checksum(self, locked=True): - from dvc.repo.tree import RepoTree - - with self._make_repo(locked=locked) as repo: - try: - return repo.find_out_by_relpath(self.def_path).info["md5"] - except OutputNotFoundError: - path = PathInfo(os.path.join(repo.root_dir, self.def_path)) - - # we want stream but not fetch, so DVC out directories are - # walked, but dir contents is not fetched - tree = RepoTree(repo, stream=True) - - # We are polluting our repo cache with some dir listing here - if tree.isdir(path): - return self.repo.cache.local.tree.get_hash(path, tree=tree) - return tree.get_file_hash(path) + with self._make_repo(locked=locked, stream=True) as repo: + path = PathInfo(os.path.join(repo.root_dir, self.def_path)) + return repo.get_checksum(path, self.repo.cache.local) def status(self): current_checksum = self._get_checksum(locked=True) @@ -88,8 +74,7 @@ def download(self, to): self.def_repo[self.PARAM_REV_LOCK] = repo.get_rev() cache = self.repo.cache.local - with repo.use_cache(cache): - _, _, cache_infos = repo.fetch_external([self.def_path]) + _, _, cache_infos = repo.fetch_external([self.def_path], cache) cache.checkout(to.path_info, cache_infos[0]) def update(self, rev=None): diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 42df57c691..8fc2ca91d2 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -8,6 +8,7 @@ from funcy import cached_property, retry, wrap_with +from dvc import subrepos from dvc.config import NoRemoteError, NotDvcRepoError from dvc.exceptions import ( FileMissingError, @@ -20,23 +21,22 @@ from dvc.repo import Repo from dvc.repo.tree import RepoTree from dvc.scm.git import Git -from dvc.tree.local import LocalTree +from dvc.tree import LocalTree from dvc.utils.fs import remove logger = logging.getLogger(__name__) @contextmanager -def external_repo(url, rev=None, for_write=False): +def external_repo(url, rev=None, for_write=False, **kwargs): logger.debug("Creating external repo %s@%s", url, rev) path = _cached_clone(url, rev, for_write=for_write) if not rev: rev = "HEAD" - try: - repo = ExternalRepo(path, url, rev, for_write=for_write) - except NotDvcRepoError: - repo = ExternalGitRepo(path, url, rev) - + # TODO: What to do with for the `dvcx`? + # something like following, perhaps? + # repo = ExternalDVCRepo if _is_dvc_main_repo(path, rev) else ExternalRepo + repo = ExternalRepo(path, url, rev, for_write=for_write, **kwargs) try: yield repo except NoRemoteError: @@ -67,53 +67,124 @@ def clean_repos(): _remove(path) -class BaseExternalRepo: - # pylint: disable=no-member +def _fix_local_remote(orig_repo, src_repo, remote_name): + # If a remote URL is relative to the source repo, + # it will have changed upon config load and made + # relative to this new repo. Restore the old one here. + new_remote = orig_repo.config["remote"][remote_name] + old_remote = src_repo.config["remote"][remote_name] + if new_remote["url"] != old_remote["url"]: + new_remote["url"] = old_remote["url"] - _local_cache = None - def __str__(self): - return self.url +def _add_upstream(orig_repo, src_repo): + # Fill the empty upstream entry with a new remote pointing to the + # original repo's cache location. + orig_repo.config["remote"]["auto-generated-upstream"] = { + "url": src_repo.cache.local.cache_dir + } + orig_repo.config["core"]["remote"] = "auto-generated-upstream" - @property - def local_cache(self): - if hasattr(self, "cache"): - return self.cache.local - return self._local_cache - @contextmanager - def use_cache(self, cache): - """Use the specified cache in place of default tmpdir cache for - download operations. - """ - if hasattr(self, "cache"): - save_cache = self.cache.local - self.cache.local = cache - self._local_cache = cache +class ExternalRepo: + def __init__( + self, root_dir, url, rev, for_write=False, fetch=True, **kwargs + ): + self.root_dir = os.path.abspath(root_dir) + self.scm = Git(root_dir) + self.url = url + self.config = {"fetch": fetch, **kwargs} + self.for_write = for_write - yield + repo_kw = {} + if for_write: + tree = LocalTree(None, {"url": self.root_dir}) + else: + # .dvc folders are ignored by dvcignore which is required for + # `subrepos.find()`, hence using a separate tree + tree = self.scm.get_tree(rev) + repo_kw = dict(scm=self.scm, rev=rev) + + paths = subrepos.find(tree) + self.repos = [Repo(path, **repo_kw) for path in paths] + self._setup_cache_dir() + self.rev = rev + + for repo in self.repos: + repo.cache.local.cache_dir = self.cache_dir + if os.path.isdir(self.url): + self._fix_upstream(repo) - if hasattr(self, "cache"): - self.cache.local = save_cache - self._local_cache = None + @wrap_with(threading.Lock()) + def _setup_cache_dir(self): + # share same cache_dir among all subrepos + try: + self.cache_dir = CACHE_DIRS[self.url] + except KeyError: + self.cache_dir = CACHE_DIRS[self.url] = tempfile.mkdtemp( + "dvc-cache" + ) @cached_property - def repo_tree(self): - return RepoTree(self, fetch=True) + def tree(self): + kwargs = dict( + use_dvcignore=True, + dvcignore_root=self.root_dir, + ignore_subrepo=False, + ) + if self.for_write: + return LocalTree(None, {"url": self.root_dir}, **kwargs) + return self.scm.get_tree(rev=self.rev, **kwargs) - def get_rev(self): - if isinstance(self.tree, LocalTree): - return self.scm.get_rev() - return self.tree.rev + @cached_property + def repo_tree(self) -> "RepoTree": + return RepoTree(self.tree, subrepos=self.repos, **self.config) + + @wrap_with(threading.Lock()) + @contextmanager + def with_cache(self, cache_dir, link_types=None): + for repo in self.repos: + repo.cache.local.cache_dir = cache_dir + if link_types: + # FIXME: not rolling back, should be fine for now + repo.cache.local.cache_types = link_types + yield + for repo in self.repos: + repo.cache.local.cache_dir = self.cache_dir + + def _fix_upstream(self, orig_repo): + try: + rel_path = os.path.relpath(orig_repo.root_dir, self.root_dir) + src_repo = Repo(PathInfo(self.url) / rel_path) + except NotDvcRepoError: + # If ExternalRepo does not throw NotDvcRepoError and Repo does, + # the self.url might be a bare git repo. + # NOTE: This will fail to resolve remote with relative path, + # same as if it was a remote DVC repo. + return + + try: + remote_name = orig_repo.config["core"].get("remote") + if remote_name: + _fix_local_remote(orig_repo, src_repo, remote_name) + else: + _add_upstream(orig_repo, src_repo) + finally: + src_repo.close() + + def close(self): + for repo in self.repos: + repo.close() + self.scm.close() - def fetch_external(self, paths: Iterable, **kwargs): + def fetch_external(self, paths: Iterable, cache, **kwargs): """Fetch specified external repo paths into cache. - Returns 3-tuple in the form - (downloaded, failed, list(cache_infos)) - where cache_infos can be used as checkout targets for the - fetched paths. - """ + Returns 3-tuple in the form + (downloaded, failed, list(cache_infos)) + where cache_infos can be used as checkout targets for the + fetched paths. + """ download_results = [] failed = 0 @@ -126,120 +197,48 @@ def download_update(result): for path in paths: if not self.repo_tree.exists(path): raise PathMissingError(path, self.url) - save_info = self.local_cache.save( - path, - self.repo_tree, - None, - save_link=False, - download_callback=download_update, - ) + with self.with_cache(cache.cache_dir): + save_info = cache.save( + path, + self.repo_tree, + None, + save_link=False, + download_callback=download_update, + ) save_infos.append(save_info) return sum(download_results), failed, save_infos - def get_external(self, path, dest): + def get_external(self, src, dest): """Convenience wrapper for fetch_external and checkout.""" - if self.local_cache: - # fetch DVC and git files to tmpdir cache, then checkout - _, _, save_infos = self.fetch_external([path]) - self.local_cache.checkout(PathInfo(dest), save_infos[0]) + repo = self.in_repo(src) + if repo: + cache = repo.cache.local + _, _, save_infos = self.fetch_external([src], cache) + cache.checkout(PathInfo(dest), save_infos[0]) else: - # git-only erepo with no cache, just copy files directly - # to dest - path = PathInfo(self.root_dir) / path + path = PathInfo(self.root_dir) / src if not self.repo_tree.exists(path): - raise PathMissingError(path, self.url) + raise PathMissingError(src, self.url) self.repo_tree.copytree(path, dest) + def get_rev(self): + if isinstance(self.tree, LocalTree): + return self.scm.get_rev() + return self.tree.rev -class ExternalRepo(Repo, BaseExternalRepo): - def __init__(self, root_dir, url, rev, for_write=False): - if for_write: - super().__init__(root_dir) - else: - root_dir = os.path.realpath(root_dir) - super().__init__(root_dir, scm=Git(root_dir), rev=rev) - self.url = url - self._set_cache_dir() - self._fix_upstream() - - @wrap_with(threading.Lock()) - def _set_cache_dir(self): - try: - cache_dir = CACHE_DIRS[self.url] - except KeyError: - cache_dir = CACHE_DIRS[self.url] = tempfile.mkdtemp("dvc-cache") - - self.cache.local.cache_dir = cache_dir - self._local_cache = self.cache.local - - def _fix_upstream(self): - if not os.path.isdir(self.url): - return - - try: - src_repo = Repo(self.url) - except NotDvcRepoError: - # If ExternalRepo does not throw NotDvcRepoError and Repo does, - # the self.url might be a bare git repo. - # NOTE: This will fail to resolve remote with relative path, - # same as if it was a remote DVC repo. - return - - try: - remote_name = self.config["core"].get("remote") - if remote_name: - self._fix_local_remote(src_repo, remote_name) - else: - self._add_upstream(src_repo) - finally: - src_repo.close() - - def _fix_local_remote(self, src_repo, remote_name): - # If a remote URL is relative to the source repo, - # it will have changed upon config load and made - # relative to this new repo. Restore the old one here. - new_remote = self.config["remote"][remote_name] - old_remote = src_repo.config["remote"][remote_name] - if new_remote["url"] != old_remote["url"]: - new_remote["url"] = old_remote["url"] - - def _add_upstream(self, src_repo): - # Fill the empty upstream entry with a new remote pointing to the - # original repo's cache location. - cache_dir = src_repo.cache.local.cache_dir - self.config["remote"]["auto-generated-upstream"] = {"url": cache_dir} - self.config["core"]["remote"] = "auto-generated-upstream" - - -class ExternalGitRepo(BaseExternalRepo): - def __init__(self, root_dir, url, rev): - self.root_dir = os.path.realpath(root_dir) - self.url = url - self.tree = self.scm.get_tree(rev) - - @cached_property - def scm(self): - return Git(self.root_dir) - - def close(self): - if "scm" in self.__dict__: - self.scm.close() + def get_checksum(self, path_info, cache): + if self.repo_tree.isdir(path_info): + return cache.tree.get_hash(path_info, tree=self.repo_tree) + return self.repo_tree.get_file_hash(path_info) - def find_out_by_relpath(self, path): - raise OutputNotFoundError(path, self) + def in_repo(self, path): + tree = self.repo_tree.in_subtree(path) + return tree.repo if tree else None - @contextmanager - def open_by_relpath(self, path, mode="r", encoding=None, **kwargs): - """Opens a specified resource as a file object.""" - tree = RepoTree(self) - try: - with tree.open( - path, mode=mode, encoding=encoding, **kwargs - ) as fobj: - yield fobj - except FileNotFoundError: - raise PathMissingError(path, self.url) + @property + def main_repo(self): + return self.in_repo(os.curdir) def _cached_clone(url, rev, for_write=False): diff --git a/dvc/ignore.py b/dvc/ignore.py index f72623295e..90cf50ff5b 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -175,13 +175,12 @@ def is_ignored_file(self, _): class DvcIgnoreFilter: - def __init__(self, tree, root_dir): + def __init__(self, tree, root_dir, ignore_subrepo=True): self.tree = tree self.root_dir = root_dir - self.ignores = { - DvcIgnoreDirs([".git", ".hg", ".dvc"]), - DvcIgnoreRepo(), - } + self.ignores = {DvcIgnoreDirs([".git", ".hg", ".dvc"])} + if ignore_subrepo: + self.ignores.add(DvcIgnoreRepo()) ignore_pattern_trie = DvcIgnorePatternsTrie() for root, dirs, _ in self.tree.walk(self.root_dir): ignore_pattern = self._get_ignore_pattern(root) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 6d98682727..f6aeb5f5b9 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -1,14 +1,11 @@ import logging import os -from contextlib import contextmanager from functools import wraps from funcy import cached_property, cat, first from dvc.config import Config from dvc.dvcfile import PIPELINE_FILE, Dvcfile, is_valid_filename -from dvc.exceptions import FileMissingError -from dvc.exceptions import IsADirectoryError as DvcIsADirectoryError from dvc.exceptions import ( NoOutputOrStageError, NotDvcRepoError, @@ -153,22 +150,29 @@ def tree(self, tree): def __repr__(self): return f"{self.__class__.__name__}: '{self.root_dir}'" + @cached_property + def repo_tree(self): + return RepoTree(self.tree, [self], stream=True) + @classmethod def find_root(cls, root=None, tree=None): + # TODO (@skshetry): Verify and refactor this root_dir = os.path.realpath(root or os.curdir) - if tree: - if tree.isdir(os.path.join(root_dir, cls.DVC_DIR)): - return root_dir - raise NotDvcRepoError(f"'{root}' does not contain DVC directory") - - if not os.path.isdir(root_dir): - raise NotDvcRepoError(f"directory '{root}' does not exist") + is_dir = tree.isdir if tree else os.path.isdir while True: dvc_dir = os.path.join(root_dir, cls.DVC_DIR) - if os.path.isdir(dvc_dir): + if is_dir(dvc_dir): return root_dir + if ( + tree + and os.path.dirname(os.path.abspath(tree.tree_root)) + == root_dir + ): + raise NotDvcRepoError( + f"'{root}' does not contain DVC directory" + ) if os.path.ismount(root_dir): break root_dir = os.path.dirname(root_dir) @@ -580,26 +584,6 @@ def is_dvc_internal(self, path): path_parts = os.path.normpath(path).split(os.path.sep) return self.DVC_DIR in path_parts - @contextmanager - def open_by_relpath(self, path, remote=None, mode="r", encoding=None): - """Opens a specified resource as a file descriptor""" - - tree = RepoTree(self, stream=True) - path = os.path.join(self.root_dir, path) - try: - with self.state: - with tree.open( - os.path.join(self.root_dir, path), - mode=mode, - encoding=encoding, - remote=remote, - ) as fobj: - yield fobj - except FileNotFoundError as exc: - raise FileMissingError(path) from exc - except IsADirectoryError as exc: - raise DvcIsADirectoryError(f"'{path}' is a directory") from exc - def close(self): self.scm.close() diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index fe3cd9c6f6..3814589445 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -81,10 +81,9 @@ def _fetch_external(self, repo_url, repo_rev, files, jobs): failed, downloaded = 0, 0 try: with external_repo(repo_url, repo_rev) as repo: - with repo.use_cache(self.cache.local): - d, f, _ = repo.fetch_external(files, jobs=jobs) - downloaded += d - failed += f + d, f, _ = repo.fetch_external(files, self.cache.local, jobs=jobs) + downloaded += d + failed += f except CloneError: failed += 1 logger.exception( diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 1141b00772..4ae5c23f6c 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -37,19 +37,17 @@ def get(url, path, out=None, rev=None): tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: with external_repo(url=url, rev=rev) as repo: - if hasattr(repo, "cache"): - repo.cache.local.cache_dir = tmp_dir - - # Try any links possible to avoid data duplication. - # - # Not using symlink, because we need to remove cache after we - # are done, and to make that work we would have to copy data - # over anyway before removing the cache, so we might just copy - # it right away. - # - # Also, we can't use theoretical "move" link type here, because - # the same cache file might be used a few times in a directory. - repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] - repo.get_external(path, out) + # Try any links possible to avoid data duplication. + # + # Not using symlink, because we need to remove cache after we + # are done, and to make that work we would have to copy data + # over anyway before removing the cache, so we might just copy + # it right away. + # + # Also, we can't use theoretical "move" link type here, because + # the same cache file might be used a few times in a directory. + link_types = ["reflink", "hardlink", "copy"] + with repo.with_cache(tmp_dir, link_types=link_types): + repo.get_external(path, out) finally: remove(tmp_dir) diff --git a/dvc/repo/ls.py b/dvc/repo/ls.py index e02d52116a..1f3ae17d60 100644 --- a/dvc/repo/ls.py +++ b/dvc/repo/ls.py @@ -29,7 +29,7 @@ def ls( """ from dvc.external_repo import external_repo - with external_repo(url, rev) as repo: + with external_repo(url, rev, stream=True) as repo: path_info = PathInfo(repo.root_dir) if path: path_info /= path @@ -48,15 +48,10 @@ def ls( def _ls(repo, path_info, recursive=None, dvc_only=False): - from dvc.repo.tree import RepoTree - def onerror(exc): raise exc - # use our own RepoTree instance instead of repo.repo_tree since we want to - # fetch directory listings, but don't want to fetch file contents. - tree = RepoTree(repo, stream=True) - + tree = repo.repo_tree ret = {} try: for root, dirs, files in tree.walk( @@ -76,9 +71,8 @@ def onerror(exc): if not recursive: for dname in dirs: info = PathInfo(root) / dname - if not dvc_only or ( - tree.dvctree and tree.dvctree.exists(info) - ): + subtree = tree.in_subtree(info) + if not dvc_only or (subtree and subtree.exists(info)): dvc = tree.isdvc(info) path = str(info.relative_to(path_info)) ret[path] = { diff --git a/dvc/repo/metrics/show.py b/dvc/repo/metrics/show.py index cdd787f005..e57c71c56d 100644 --- a/dvc/repo/metrics/show.py +++ b/dvc/repo/metrics/show.py @@ -53,7 +53,7 @@ def _extract_metrics(metrics): def _read_metrics(repo, metrics, rev): - tree = RepoTree(repo) + tree = RepoTree(repo.tree, [repo]) res = {} for metric in metrics: diff --git a/dvc/repo/plots/__init__.py b/dvc/repo/plots/__init__.py index 4386ca5613..8ce3585493 100644 --- a/dvc/repo/plots/__init__.py +++ b/dvc/repo/plots/__init__.py @@ -44,7 +44,7 @@ def collect(self, targets=None, revs=None): continue rev = rev or "workspace" - tree = RepoTree(self.repo) + tree = RepoTree(self.repo.tree, [self.repo]) plots = _collect_plots(self.repo, targets, rev) for path_info, props in plots.items(): datafile = relpath(path_info, self.repo.root_dir) diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 51ceacfbb9..df4a7c2b41 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -1,5 +1,8 @@ import logging import os +from typing import Optional + +from funcy import cached_property, post_processing from dvc.dvcfile import is_valid_filename from dvc.exceptions import OutputNotFoundError @@ -245,25 +248,36 @@ class RepoTree(BaseTree): # pylint:disable=abstract-method Any kwargs will be passed to `DvcTree()`. """ - def __init__(self, repo, **kwargs): - super().__init__(repo, {"url": repo.root_dir}) - if hasattr(repo, "dvc_dir"): - self.dvctree = DvcTree(repo, **kwargs) - else: - # git-only erepo's do not need dvctree - self.dvctree = None + def __init__( + self, tree, subrepos=None, **kwargs + ): # pylint: disable=super-init-not-called + subrepos = subrepos or [] + subrepos.sort(key=lambda r: len(r.root_dir), reverse=True) + dvctrees = [DvcTree(repo, **kwargs) for repo in subrepos] + self._kwargs = kwargs + self._dvctrees = { + os.path.abspath(tree.repo.root_dir): tree for tree in dvctrees + } + self.tree = tree @property def fetch(self): - if self.dvctree: - return self.dvctree.fetch - return False + return self._kwargs.get("fetch", False) @property def stream(self): - if self.dvctree: - return self.dvctree.stream - return False + return self._kwargs.get("stream", False) + + def _find_subtree(self, path_prefix) -> Optional[DvcTree]: + return self._find_subtree_with_prefix(path_prefix)[1] + + def _find_subtree_with_prefix(self, path): + # dvctrees is already ordered from low to high + path_prefix = os.path.abspath(path) + for pref, tree in self._dvctrees.items(): + if os.path.abspath(path_prefix).startswith(pref): + return pref, tree + return "", None def open( self, path, mode="r", encoding="utf-8", **kwargs @@ -271,50 +285,71 @@ def open( if "b" in mode: encoding = None - if self.dvctree and self.dvctree.exists(path): - return self.dvctree.open( - path, mode=mode, encoding=encoding, **kwargs - ) - return self.repo.tree.open(path, mode=mode, encoding=encoding) + subtree = self._find_subtree(path) + if subtree and subtree.exists(path): + return subtree.open(path, mode=mode, encoding=encoding, **kwargs) + return self.tree.open(path, mode=mode, encoding=encoding) + + def open_by_relpath(self, path, *args, **kwargs): + return self.open(PathInfo(self.tree.tree_root, path), *args, **kwargs) def exists(self, path): # pylint: disable=arguments-differ - return self.repo.tree.exists(path) or ( - self.dvctree and self.dvctree.exists(path) - ) + subtree = self._find_subtree(path) + return self.tree.exists(path) or (subtree and subtree.exists(path)) def isdir(self, path): # pylint: disable=arguments-differ - return self.repo.tree.isdir(path) or ( - self.dvctree and self.dvctree.isdir(path) - ) + subtree = self._find_subtree(path) + return self.tree.isdir(path) or (subtree and subtree.isdir(path)) def isdvc(self, path, **kwargs): - return self.dvctree is not None and self.dvctree.isdvc(path, **kwargs) + subtree = self._find_subtree(path) + return subtree is not None and subtree.isdvc(path, **kwargs) def isfile(self, path): # pylint: disable=arguments-differ - return self.repo.tree.isfile(path) or ( - self.dvctree and self.dvctree.isfile(path) - ) + subtree = self._find_subtree(path) + return self.tree.isfile(path) or (subtree and subtree.isfile(path)) def isexec(self, path): - if self.dvctree and self.dvctree.exists(path): - return self.dvctree.isexec(path) - return self.repo.tree.isexec(path) + subtree = self._find_subtree(path) + if subtree and subtree.exists(path): + return subtree.isexec(path) + return self.tree.isexec(path) def stat(self, path): - return self.repo.tree.stat(path) + return self.tree.stat(path) - def _walk_one(self, walk): + def _dvc_walk(self, walk): try: root, dirs, files = next(walk) except StopIteration: return yield root, dirs, files for _ in dirs: - yield from self._walk_one(walk) - - def _walk(self, dvc_walk, repo_walk, dvcfiles=False): + yield from self._dvc_walk(walk) + + def _repo_walk(self, dir_path, walk, **kwargs): + assert os.path.isabs(dir_path) + subtree = self._dvctrees.get(dir_path) + if subtree: + dvc_walk = subtree.walk(dir_path, topdown=True) + yield from self._walk(walk, dvc_walk, **kwargs) + else: + try: + root, dirs, files = next(walk) + except StopIteration: + return + yield root, dirs, files + for dirname in dirs: + yield from self._repo_walk( + os.path.join(root, dirname), walk, **kwargs + ) + + def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False): + assert repo_walk try: - _, dvc_dirs, dvc_fnames = next(dvc_walk) + _, dvc_dirs, dvc_fnames = ( + next(dvc_walk) if dvc_walk else (None, [], []) + ) repo_root, repo_dirs, repo_fnames = next(repo_walk) except StopIteration: return @@ -343,11 +378,15 @@ def _walk(self, dvc_walk, repo_walk, dvcfiles=False): for dirname in dirs: if dirname in shared: - yield from self._walk(dvc_walk, repo_walk, dvcfiles=dvcfiles) + yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles) elif dirname in dvc_set: - yield from self._walk_one(dvc_walk) + yield from self._dvc_walk(dvc_walk) elif dirname in repo_set: - yield from self._walk_one(repo_walk) + yield from self._repo_walk( + os.path.join(repo_root, dirname), + repo_walk, + dvcfiles=dvcfiles, + ) def walk( self, top, topdown=True, onerror=None, dvcfiles=False, **kwargs @@ -377,22 +416,25 @@ def walk( onerror(NotADirectoryError(top)) return - dvc_exists = self.dvctree and self.dvctree.exists(top) - repo_exists = self.repo.tree.exists(top) + subtree = self._find_subtree(top) + dvc_exists = subtree and subtree.exists(top) + repo_exists = self.tree.exists(top) if dvc_exists and not repo_exists: - yield from self.dvctree.walk( + yield from subtree.walk( top, topdown=topdown, onerror=onerror, **kwargs ) - return - if repo_exists and not dvc_exists: - yield from self.repo.tree.walk( - top, topdown=topdown, onerror=onerror + if not dvc_exists and not repo_exists: + repo_walk = self.tree.walk(top, topdown=topdown, onerror=onerror) + yield from self._repo_walk( + os.path.abspath(top), repo_walk, dvcfiles=dvcfiles ) - return - - dvc_walk = self.dvctree.walk(top, topdown=topdown, **kwargs) - repo_walk = self.repo.tree.walk(top, topdown=topdown) - yield from self._walk(dvc_walk, repo_walk, dvcfiles=dvcfiles) + dvc_walk = ( + subtree.walk(top, topdown=topdown, **kwargs) + if dvc_exists + else None + ) + repo_walk = self.tree.walk(top, topdown=topdown) + yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles) def walk_files(self, top, **kwargs): # pylint: disable=arguments-differ for root, _, files in self.walk(top, **kwargs): @@ -408,11 +450,10 @@ def get_file_hash(self, path_info): """ if not self.exists(path_info): raise FileNotFoundError - if self.dvctree and self.dvctree.exists(path_info): - try: - return self.dvctree.get_file_hash(path_info) - except OutputNotFoundError: - pass + subtree = self._find_subtree(path_info) + subtree_exists = subtree is not None and subtree.exists(path_info) + if subtree_exists: + return subtree.get_file_hash(path_info) return file_md5(path_info, self)[0] def copytree(self, top, dest): @@ -437,6 +478,27 @@ def copytree(self, top, dest): with self.open(src, mode="rb") as fobj: copy_fobj_to_file(fobj, dest_dir / fname) - @property - def hash_jobs(self): # pylint: disable=invalid-overridden-method - return self.repo.tree.hash_jobs + @cached_property + def hash_jobs(self): + return self.tree.hash_jobs + + def in_subtree(self, path): + return self._find_subtree(path) + + @post_processing("\r\n".join) + def _visualize(self, top, **kwargs): + """`tree`-like output, useful for debugging/visualizing""" + indent = 4 + spacing = " " * indent + tee = "├── " + last = "└── " + for root, _, files in self.walk(top, **kwargs): + level = root.replace(top, "").count(os.sep) + indent = spacing * level + yield "{}{}/".format(indent, os.path.basename(root)) + sub_indent = spacing * (level + 1) + length = len(files) + for i, f in enumerate(files): + yield "{}{}{}".format( + sub_indent, tee if i + 1 != length else last, f + ) diff --git a/dvc/subrepos.py b/dvc/subrepos.py new file mode 100644 index 0000000000..0460a054d7 --- /dev/null +++ b/dvc/subrepos.py @@ -0,0 +1,11 @@ +import os + +from funcy import collecting + + +@collecting +def find(tree, top=None): + top = top or tree.tree_root + for root, _, _ in tree.walk(top): + if tree.isdir(os.path.join(root, ".dvc")): + yield root diff --git a/dvc/tree/git.py b/dvc/tree/git.py index a500fa2fb4..bf4b14e404 100644 --- a/dvc/tree/git.py +++ b/dvc/tree/git.py @@ -24,7 +24,14 @@ def _item_basename(item): class GitTree(BaseTree): # pylint:disable=abstract-method """Proxies the repo file access methods to Git objects""" - def __init__(self, git, rev, use_dvcignore=False, dvcignore_root=None): + def __init__( + self, + git, + rev, + use_dvcignore=False, + dvcignore_root=None, + ignore_subrepo=True, + ): """Create GitTree instance Args: @@ -36,6 +43,7 @@ def __init__(self, git, rev, use_dvcignore=False, dvcignore_root=None): self.rev = rev self.use_dvcignore = use_dvcignore self.dvcignore_root = dvcignore_root + self.ignore_subrepo = ignore_subrepo @property def tree_root(self): @@ -49,7 +57,7 @@ def dvcignore(self): if not self.use_dvcignore: return DvcIgnoreFilterNoop(self, root) self.use_dvcignore = False - ret = DvcIgnoreFilter(self, root) + ret = DvcIgnoreFilter(self, root, ignore_subrepo=self.ignore_subrepo) self.use_dvcignore = True return ret diff --git a/dvc/tree/local.py b/dvc/tree/local.py index 61af21e01e..e67371397f 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -36,12 +36,20 @@ class LocalTree(BaseTree): CACHE_MODE = 0o444 SHARED_MODE_MAP = {None: (0o644, 0o755), "group": (0o664, 0o775)} - def __init__(self, repo, config, use_dvcignore=False, dvcignore_root=None): + def __init__( + self, + repo, + config, + use_dvcignore=False, + dvcignore_root=None, + ignore_subrepo=False, + ): super().__init__(repo, config) url = config.get("url") self.path_info = self.PATH_CLS(url) if url else None self.use_dvcignore = use_dvcignore self.dvcignore_root = dvcignore_root + self.ignore_subrepo = ignore_subrepo @property def tree_root(self): @@ -61,7 +69,7 @@ def dvcignore(self): if not self.use_dvcignore: return DvcIgnoreFilterNoop(self, root) self.use_dvcignore = False - ret = DvcIgnoreFilter(self, root) + ret = DvcIgnoreFilter(self, root, ignore_subrepo=self.ignore_subrepo) self.use_dvcignore = True return ret diff --git a/tests/dir_helpers.py b/tests/dir_helpers.py index 02ae154fd6..ec192a77ba 100644 --- a/tests/dir_helpers.py +++ b/tests/dir_helpers.py @@ -87,7 +87,7 @@ def __new__(cls, *args, **kwargs): self._init() return self - def init(self, *, scm=False, dvc=False): + def init(self, *, scm=False, dvc=False, subdir=False): from dvc.repo import Repo from dvc.scm.git import Git @@ -100,7 +100,9 @@ def init(self, *, scm=False, dvc=False): git_init(str_path) if dvc: self.dvc = Repo.init( - str_path, no_scm=not scm and not hasattr(self, "scm") + str_path, + no_scm=not scm and not hasattr(self, "scm"), + subdir=subdir, ) if scm: self.scm = self.dvc.scm if hasattr(self, "dvc") else Git(str_path) @@ -123,10 +125,10 @@ def gen(self, struct, text=""): if isinstance(struct, (str, bytes, pathlib.PurePath)): struct = {struct: text} - self._gen(struct) - return struct.keys() + return self._gen(struct) def _gen(self, struct, prefix=None): + paths = [] for name, contents in struct.items(): path = (prefix or self) / name @@ -141,6 +143,8 @@ def _gen(self, struct, prefix=None): path.write_bytes(contents) else: path.write_text(contents, encoding="utf-8") + paths.append(path) + return paths def dvc_gen(self, struct, text="", commit=None): paths = self.gen(struct, text) @@ -249,10 +253,10 @@ class PosixTmpDir(TmpDir, pathlib.PurePosixPath): @pytest.fixture(scope="session") def make_tmp_dir(tmp_path_factory, request): - def make(name, *, scm=False, dvc=False): + def make(name, *, scm=False, dvc=False, **kwargs): path = tmp_path_factory.mktemp(name) if isinstance(name, str) else name new_dir = TmpDir(path) - new_dir.init(scm=scm, dvc=dvc) + new_dir.init(scm=scm, dvc=dvc, **kwargs) request.addfinalizer(new_dir.close) return new_dir diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index d8c96fbcd8..ed0bb58b73 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -3,7 +3,6 @@ from mock import patch from dvc.external_repo import external_repo -from dvc.path_info import PathInfo from dvc.scm.git import Git from dvc.tree.local import LocalTree from dvc.utils import relpath @@ -20,11 +19,11 @@ def test_external_repo(erepo_dir): with patch.object(Git, "clone", wraps=Git.clone) as mock: with external_repo(url) as repo: - with repo.open_by_relpath("file") as fd: + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "master" with external_repo(url, rev="branch") as repo: - with repo.open_by_relpath("file") as fd: + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "branch" assert mock.call_count == 1 @@ -43,7 +42,7 @@ def test_source_change(erepo_dir): assert old_rev != new_rev -def test_cache_reused(erepo_dir, mocker, local_cloud): +def test_cache_reused(tmp_dir, erepo_dir, mocker, local_cloud): erepo_dir.add_remote(config=local_cloud.config) with erepo_dir.chdir(): erepo_dir.dvc_gen("file", "text", commit="add file") @@ -54,13 +53,13 @@ def test_cache_reused(erepo_dir, mocker, local_cloud): # Use URL to prevent any fishy optimizations url = f"file://{erepo_dir}" with external_repo(url) as repo: - repo.fetch() + repo.get_external("file", tmp_dir / "file1") assert download_spy.mock.call_count == 1 # Should not download second time erepo_dir.scm.branch("branch") with external_repo(url, "branch") as repo: - repo.fetch() + repo.get_external("file", tmp_dir / "file2") assert download_spy.mock.call_count == 1 @@ -90,10 +89,7 @@ def test_pull_subdir_file(tmp_dir, erepo_dir): dest = tmp_dir / "file" with external_repo(os.fspath(erepo_dir)) as repo: - _, _, save_infos = repo.fetch_external( - [os.path.join("subdir", "file")] - ) - repo.cache.local.checkout(PathInfo(dest), save_infos[0]) + repo.get_external(os.path.join("subdir", "file"), dest) assert dest.is_file() assert dest.read_text() == "contents" @@ -117,7 +113,8 @@ def test_relative_remote(erepo_dir, tmp_dir): url = os.fspath(erepo_dir) with external_repo(url) as repo: - assert os.path.isabs(repo.config["remote"]["upstream"]["url"]) - assert os.path.isdir(repo.config["remote"]["upstream"]["url"]) - with repo.open_by_relpath("file") as fd: + for subrepo in repo.repos: + assert os.path.isabs(subrepo.config["remote"]["upstream"]["url"]) + assert os.path.isdir(subrepo.config["remote"]["upstream"]["url"]) + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "contents" diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 72736f8077..60b4270b46 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -252,3 +252,50 @@ def test_get_pipeline_tracked_outs( with git_dir.chdir(): Repo.get("file:///{}".format(os.fspath(tmp_dir)), "bar", out="baz") assert (git_dir / "baz").read_text() == "foo" + + +def make_subrepo(dir_, scm, config): + dir_.mkdir(parents=True) + with dir_.chdir(): + dir_.scm = scm + dir_.init(dvc=True, subdir=True) + dir_.add_remote(config=config) + + +@pytest.mark.parametrize( + "output", + [ + "foo", + {"foo": "foo", "bar": "bar"}, + {"subdir": {"foo": "foo", "bar": "bar"}}, + ], + ids=["file", "dir", "nested_dir"], +) +@pytest.mark.parametrize( + "erepo", [pytest.lazy_fixture("erepo_dir"), pytest.lazy_fixture("git_dir")] +) +@pytest.mark.parametrize( + "subrepo_paths", + [ + (os.path.join("sub", "subdir1"), os.path.join("sub", "subdir2")), + (os.path.join("sub"), os.path.join("sub", "subdir1")), + ], + ids=["isolated", "nested"], +) +def test_subrepo_multiple( + tmp_dir, scm, output, subrepo_paths, erepo, local_cloud +): + sub_repos = [erepo / path for path in subrepo_paths] + filename = "output" + for repo in sub_repos: + make_subrepo(repo, erepo.scm, local_cloud.config) + repo.dvc_gen({filename: output}, commit="add subrepo") + repo.dvc.push() + + for i, repo in enumerate(sub_repos): + Repo.get( + f"file:///{erepo}", + str((repo / filename).relative_to(erepo)), + out=f"{filename}-{i}", + ) + assert (tmp_dir / f"{filename}-{i}").read_text() == output diff --git a/tests/func/test_import.py b/tests/func/test_import.py index 4e38f70d89..d55dc0585c 100644 --- a/tests/func/test_import.py +++ b/tests/func/test_import.py @@ -9,8 +9,10 @@ from dvc.config import NoRemoteError from dvc.dvcfile import Dvcfile from dvc.exceptions import DownloadError, PathMissingError +from dvc.scm.base import CloneError from dvc.system import System from dvc.utils.fs import makedirs, remove +from tests.func.test_get import make_subrepo def test_import(tmp_dir, scm, dvc, erepo_dir): @@ -234,7 +236,8 @@ def test_download_error_pulling_imported_stage(tmp_dir, dvc, erepo_dir): remove(dst_cache) with patch( - "dvc.tree.local.LocalTree._download", side_effect=Exception + "dvc.external_repo.external_repo", + side_effect=CloneError(os.fspath(erepo_dir), "somewhere"), ), pytest.raises(DownloadError): dvc.pull(["foo_imported.dvc"]) @@ -345,3 +348,45 @@ def test_local_import(tmp_dir, dvc, scm): tmp_dir.dvc_gen("foo", "foo", commit="init") (tmp_dir / "outdir").mkdir() dvc.imp(".", "foo", out="outdir") + + +@pytest.mark.parametrize( + "output", + [ + "foo", + {"foo": "foo", "bar": "bar"}, + {"subdir": {"foo": "foo", "bar": "bar"}}, + ], + ids=["file", "dir", "nested_dir"], +) +@pytest.mark.parametrize( + "erepo", [pytest.lazy_fixture("erepo_dir"), pytest.lazy_fixture("git_dir")] +) +@pytest.mark.parametrize( + "subrepo_paths", + [ + (os.path.join("sub", "subdir1"), os.path.join("sub", "subdir2")), + (os.path.join("sub"), os.path.join("sub", "subdir1")), + ], + ids=["isolated", "nested"], +) +def test_subrepo_import( + tmp_dir, scm, dvc, output, subrepo_paths, erepo, local_cloud +): + sub_repos = [erepo / path for path in subrepo_paths] + filename = "output" + for repo in sub_repos: + make_subrepo(repo, erepo.scm, local_cloud.config) + repo.dvc_gen({filename: output}, commit="add subrepo") + repo.dvc.push() + + rev = erepo.scm.get_rev() + for i, repo in enumerate(sub_repos): + url = f"file:///{erepo}" + file = str((repo / filename).relative_to(erepo)) + out = f"{filename}-{i}" + + stage = dvc.imp(url, file, out=out) + + assert stage.deps[0].def_repo == {"url": url, "rev_lock": rev} + assert (tmp_dir / out).read_text() == output diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index 93b671bd75..74585c0e1d 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -184,7 +184,7 @@ def test_repotree_walk_fetch(tmp_dir, dvc, scm, local_remote): dvc.push() remove(dvc.cache.local.cache_dir) - tree = RepoTree(dvc, fetch=True) + tree = RepoTree(dvc.tree, [dvc], fetch=True) with dvc.state: for _, _, _ in tree.walk("dir"): pass @@ -208,7 +208,7 @@ def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, local_cloud): # # for this test, all file objects are being opened() and copied from tree # into dvc.cache, not fetched or streamed from a remote - tree = RepoTree(erepo_dir.dvc, stream=True) + tree = RepoTree(erepo_dir.dvc.tree, [erepo_dir.dvc], stream=True) expected = [ tree.get_file_hash(PathInfo(erepo_dir / path)) for path in ("dir/bar", "dir/subdir/foo") diff --git a/tests/unit/repo/test_repo_tree.py b/tests/unit/repo/test_repo_tree.py index 417b2c0b92..e1da5f4880 100644 --- a/tests/unit/repo/test_repo_tree.py +++ b/tests/unit/repo/test_repo_tree.py @@ -12,7 +12,7 @@ def test_exists(tmp_dir, dvc): dvc.add("foo") (tmp_dir / "foo").unlink() - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) assert tree.exists("foo") @@ -21,7 +21,7 @@ def test_open(tmp_dir, dvc): dvc.add("foo") (tmp_dir / "foo").unlink() - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) with dvc.state: with tree.open("foo", "r") as fobj: assert fobj.read() == "foo" @@ -31,7 +31,7 @@ def test_open_dirty_hash(tmp_dir, dvc): tmp_dir.dvc_gen("file", "file") (tmp_dir / "file").write_text("something") - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) with tree.open("file", "r") as fobj: assert fobj.read() == "something" @@ -40,7 +40,7 @@ def test_open_dirty_no_hash(tmp_dir, dvc): tmp_dir.gen("file", "file") (tmp_dir / "file.dvc").write_text("outs:\n- path: file\n") - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) with tree.open("file", "r") as fobj: assert fobj.read() == "file" @@ -60,7 +60,7 @@ def test_open_in_history(tmp_dir, scm, dvc): if rev == "workspace": continue - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) with tree.open("foo", "r") as fobj: assert fobj.read() == "foo" @@ -68,7 +68,7 @@ def test_open_in_history(tmp_dir, scm, dvc): def test_isdir_isfile(tmp_dir, dvc): tmp_dir.gen({"datafile": "data", "datadir": {"foo": "foo", "bar": "bar"}}) - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) assert tree.isdir("datadir") assert not tree.isfile("datadir") assert not tree.isdvc("datadir") @@ -93,7 +93,7 @@ def test_isdir_mixed(tmp_dir, dvc): dvc.add(str(tmp_dir / "dir" / "foo")) - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) assert tree.isdir("dir") assert not tree.isfile("dir") @@ -123,7 +123,7 @@ def test_walk(tmp_dir, dvc, dvcfiles, extra_expected): ) dvc.add(str(tmp_dir / "dir"), recursive=True) tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}}) - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) expected = [ PathInfo("dir") / "subdir1", @@ -150,7 +150,7 @@ def onerror(exc): raise exc tmp_dir.dvc_gen("foo", "foo") - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) # path does not exist for _ in tree.walk("dir"): @@ -171,7 +171,7 @@ def test_isdvc(tmp_dir, dvc): tmp_dir.gen({"foo": "foo", "bar": "bar", "dir": {"baz": "baz"}}) dvc.add("foo") dvc.add("dir") - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) assert tree.isdvc("foo") assert not tree.isdvc("bar") assert tree.isdvc("dir") From 22b293cf794d816a6712ff3c4e2619ac8f6bcc3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 21 Jul 2020 11:46:16 +0545 Subject: [PATCH 02/11] off-by-one --- dvc/tree/local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/tree/local.py b/dvc/tree/local.py index e67371397f..058f94554e 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -42,7 +42,7 @@ def __init__( config, use_dvcignore=False, dvcignore_root=None, - ignore_subrepo=False, + ignore_subrepo=True, ): super().__init__(repo, config) url = config.get("url") From 2bd64ccf152aea871bcfd361cea8c4ddde091adf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 21 Jul 2020 11:46:45 +0545 Subject: [PATCH 03/11] remove test: get/import will not support absolute paths from now on --- tests/func/test_get.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 60b4270b46..d2e009aa27 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -98,22 +98,6 @@ def test_get_a_dvc_file(tmp_dir, erepo_dir): Repo.get(os.fspath(erepo_dir), "some_file.dvc") -# https://github.com/iterative/dvc/pull/2837#discussion_r352123053 -def test_get_full_dvc_path(tmp_dir, erepo_dir, tmp_path_factory): - path = tmp_path_factory.mktemp("ext") - external_data = path / "ext_data" - external_data.write_text("ext_data") - - with erepo_dir.chdir(): - erepo_dir.dvc.add(os.fspath(external_data), external=True) - erepo_dir.scm_add("ext_data.dvc", commit="add external data") - - Repo.get( - os.fspath(erepo_dir), os.fspath(external_data), "ext_data_imported" - ) - assert (tmp_dir / "ext_data_imported").read_text() == "ext_data" - - def test_non_cached_output(tmp_dir, erepo_dir): src = "non_cached_file" dst = src + "_imported" From c7d1966fe382eeef23545a76948f998d99c9f803 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 21 Jul 2020 16:01:35 +0545 Subject: [PATCH 04/11] fixup --- dvc/external_repo.py | 25 +++++++++++++++---------- dvc/repo/tree.py | 15 +++++++++------ 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 8fc2ca91d2..234ebed263 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -178,6 +178,12 @@ def close(self): self.scm.close() def fetch_external(self, paths: Iterable, cache, **kwargs): + # `RepoTree` will try downloading it to the repo's cache + # instead of `cache_dir`, need to change on all instances + with self.with_cache(cache.cache_dir): + return self._fetch_to_cache(paths, cache, **kwargs) + + def _fetch_to_cache(self, paths: Iterable, cache, **kwargs): """Fetch specified external repo paths into cache. Returns 3-tuple in the form @@ -197,14 +203,13 @@ def download_update(result): for path in paths: if not self.repo_tree.exists(path): raise PathMissingError(path, self.url) - with self.with_cache(cache.cache_dir): - save_info = cache.save( - path, - self.repo_tree, - None, - save_link=False, - download_callback=download_update, - ) + save_info = cache.save( + path, + self.repo_tree, + None, + save_link=False, + download_callback=download_update, + ) save_infos.append(save_info) return sum(download_results), failed, save_infos @@ -214,7 +219,7 @@ def get_external(self, src, dest): repo = self.in_repo(src) if repo: cache = repo.cache.local - _, _, save_infos = self.fetch_external([src], cache) + _, _, save_infos = self._fetch_to_cache([src], cache) cache.checkout(PathInfo(dest), save_infos[0]) else: path = PathInfo(self.root_dir) / src @@ -233,7 +238,7 @@ def get_checksum(self, path_info, cache): return self.repo_tree.get_file_hash(path_info) def in_repo(self, path): - tree = self.repo_tree.in_subtree(path) + tree = self.repo_tree.in_subtree(PathInfo(self.root_dir) / path) return tree.repo if tree else None @property diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index df4a7c2b41..e377bae229 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -253,12 +253,15 @@ def __init__( ): # pylint: disable=super-init-not-called subrepos = subrepos or [] subrepos.sort(key=lambda r: len(r.root_dir), reverse=True) - dvctrees = [DvcTree(repo, **kwargs) for repo in subrepos] - self._kwargs = kwargs - self._dvctrees = { - os.path.abspath(tree.repo.root_dir): tree for tree in dvctrees - } + dvctrees = [ + (os.path.abspath(repo.root_dir), DvcTree(repo, **kwargs)) + for repo in subrepos + ] + self._dvctrees = dict( + sorted(dvctrees, key=lambda v: len(v[0]), reverse=True) + ) self.tree = tree + self._kwargs = kwargs @property def fetch(self): @@ -275,7 +278,7 @@ def _find_subtree_with_prefix(self, path): # dvctrees is already ordered from low to high path_prefix = os.path.abspath(path) for pref, tree in self._dvctrees.items(): - if os.path.abspath(path_prefix).startswith(pref): + if path_prefix.startswith(pref): return pref, tree return "", None From 7c795f58f0e50d937511d8cbd7496fe274ae1e24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 21 Jul 2020 16:44:13 +0545 Subject: [PATCH 05/11] tests on mac --- .travis.yml | 125 ------------------------------------------- dvc/repo/tree.py | 5 +- scripts/ci/script.sh | 2 +- 3 files changed, 5 insertions(+), 127 deletions(-) diff --git a/.travis.yml b/.travis.yml index 1dd401c1bc..873db39675 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,9 +10,7 @@ env: - PATH="$PYENV_ROOT/bin:$PATH" - PIP_CACHE_DIR="$HOME/.cache/pip" # unify pip cache location for all platforms stages: - - check - test - - build # Travis doesn't support testing python on Mac yet, so we need # to workaround it by installing it directly with brew. # @@ -24,93 +22,10 @@ stages: jobs: include: # patch formatting checks - - name: "Check patch formatting" - stage: check - os: linux - language: python - python: 3.7 - before_install: bash ./scripts/ci/install.sh - install: - script: ./scripts/ci/check_patch.sh # test jobs - - name: "3.7 on Windows" - stage: test - os: windows # Windows 10.0.17134 N/A Build 17134 - language: shell # 'language: python' is an error on Travis CI Windows - env: - - PYTHON_VERSION=3.7.5 - - PATH=/c/Python37:/c/Python37/Scripts:$PATH - - name: "3.8 on Windows" - os: windows # Windows 10.0.17134 N/A Build 17134 - language: shell # 'language: python' is an error on Travis CI Windows - env: - - PYTHON_VERSION=3.8.0 - - PATH=/c/Python38:/c/Python38/Scripts:$PATH - - name: "3.6 on Linux" - language: python - python: 3.6 - - name: "3.7 on Linux" - language: python - python: 3.7 - - name: "3.8 on Linux" - language: python - python: 3.8 - name: "3.7 on Mac" os: osx osx_image: xcode11.3 - # build jobs - - name: "Mac pkg" - stage: build - os: osx - osx_image: xcode9.4 - language: ruby - rvm: 2.4.3 - install: - script: ./scripts/build_posix.sh - - name: "Windows pkg" - os: windows - language: shell - install: - script: ./scripts/build_windows.cmd - env: - - PYTHON_VERSION=3.7.5 - - PATH=/c/Python37:/c/Python37/Scripts:$PATH - - name: "Linux pkgs" - language: python - python: 3.7 - before_install: - install: - script: ./scripts/build_posix.sh - - name: "PyPI pkgs" - language: python - python: 3.8 - script: ./scripts/build_package.sh - - name: Snapcraft snap - language: python - python: 3.8 - addons: - snaps: - - name: snapcraft - channel: stable - confinement: classic - - name: review-tools - channel: stable - - name: lxd - channel: stable - env: - - SNAPCRAFT_IMAGE_INFO: | - '{"build_url": "$TRAVIS_JOB_WEB_URL"}' - - SNAPCRAFT_BUILD_ENVIRONMENT: lxd - - SNAPCRAFT_BUILD_INFO: 1 # https://snapcraft.io/blog/introducing-developer-notifications-for-snap-security-updates - install: - - sudo usermod --append --groups lxd $USER - - sudo /snap/bin/lxd.migrate -yes - - sudo /snap/bin/lxd waitready - - sudo /snap/bin/lxd init --auto - script: - - ./scripts/build_snap.sh - after_failure: - - sudo journalctl -u snapd before_install: - bash ./scripts/ci/before_install.sh @@ -120,43 +35,3 @@ install: - bash ./scripts/ci/install.sh script: - bash ./scripts/ci/script.sh -deploy: - - provider: releases - api_key: - secure: nyo17gF4Xl6DFL/KVkMewC805Cv+eK99L7SiYCU+WUAYDIzzFL3YJm7fm8bznPeor7DmpO7uSLSveMnJETBtdurW6JWhCTUss4yD66yFzpk6Z5cvHf/1UA5pdMHkkMM54fRaTRt6L2Yal7RPwGEQTBECLXW1Xmf78Y6+0l8lTvXQ9jXOQSRO/63VDKI5Zww2+XytVicUFlr0hLMGKUuaCjjXcQlAPzBgQz3/5Iz6EtzeTf2o+gTPjocbRbSFPikfiBXUn8XlJ8jBj7i114XJRJffw59KxYGDzxp+enfLBULGNoW0Np2hZqkKkPUIdB08C8XVbRXlFYX8cEYBczjqhv8A05p5F5CnltnqQ0KazcoUP2awPVqn68dWNRF+kqrnMBus8euYKIZ4dy/6bVEKJRa84ESJXZVDcwXTAXRqyovSxOlXqg9ZgAImMKBfKn/fpDX2xIvQPaXcnJxKbmcePfdbC1d0HYJ5jkbNaxKE/T15VvhzIho7mJ2WAv1r8QouQUuzqtpzIGbRn/KXHRUzCtyP0k4+eMKMLFOPzg848WsDarP3v8VYiSy4nFY7ZP7DRd3KttvW5bgVbSeO4jPmTgS6TnnpRwual9BJXSG7bjbfpIMii3bNZ7jWL4zpKT9Jfm8w9RGKLcX9HqkZ3+y60+LtYdhEfBvqsIlWQoIJBXg= - file_glob: true - file: - - dvc*.rpm - - dvc*.deb - - dvc*.pkg - - dvc*.exe - skip_cleanup: true - on: - condition: "$(./scripts/ci/deploy_condition.sh dvc*.rpm dvc*.deb dvc*.pkg dvc*.exe)" - tags: true - repo: iterative/dvc - stage: build - - provider: pypi - user: - secure: "IOzNfgPoG3ICg3+s4AuZCCnawgd/KqMzeiAB9R0B9Xb6ProPQJp9O3GBZrtg5OJMHdWQQtwULeOh/LWk4sFdXMcX1iNEgm5Tb7DwO7TDtltW14jUgN9AqfY2twj4UIsW/ON8w0P8v36ZoQq1kO6Pfgjijdm3w+9h6rpps8rJ6jXYpVZVO+SSzzijWddanMkEz4GgctZ2r7U5MWlmm1UogF60kEZIZCQykz0yJnhTpP9nMFDKw6Maki/mcfVwTQ/v3RqjzttHei/aaEMGQXYr3TOX8NPYJXerBpjShDZqcgcCfqPzhi7uCwVYaPvRsMMYH5jgtMz/SO3J6ToeGKT0f1F1ihRyCdusg325TMK7Vv8yHJzL4ngeUbb/oqxWtxhLY7f77Lkx6hBwe6ROpTqiQHIfZad5qyIHihKcDcIHo4FwLHVzxR4y+Ds2js5dXwBkUBaEAx0Dmc6ZdgXqxd815GDwv+VVpzXJhtO6qXmKE36F/Oe6eCNG3dk5MdOK4C2DrM9M8MXllqeQtX6JrM2stWlBalAc9Jn4b0xxAoghX3x3G3hNsLc9Uist1VinY2+IcPozusqj66c+Vbi5YngWSbICOjlIcq2GaOlBi3xioGts396442ztf1eLjMVuvykCI3b4Yz/vJYZOZjUvG/6nLfaFQTJLSXkTvI4do3foeY4=" - password: - secure: "ts/3FSPL+w4euXryip9wCJWUqUwuJ2CqHi0nEpYVM10KL0cp9aMkQXN/7C8ih+j/JZ25kaO+tis1nx0JERVnKrg/1F/fJhxHTpakCP4OSIlhKRXDt7yUUHcKKt5QdM2XCKhApptP6RTpWL1vlc36CMrhdvTcMYp7hlDOiWjdhhim24FGQzabL42Ko59ttyyCROqgcnjHqj+WlhJl7iYO+lv2PLOtslpkIYY5EMjgBLuGyOvxTqZsAXtworsASdB2CrsTE6iMRcrM1afgkyBlUqDezoiv9TwfZrZ85oRoAxPB7su4QWK7lEP0wGQV/SDz/RL7c+cNfRaauxUKVcC8I2LfaEpudv8vi205+v/08wzPbS4xSOnf89irUJnyGxUy+bYLoXPPVPMyzcQRAUnrQl+ENCmN62HU+/2pkIyiUJvt0wYYP0vp+3pIbXbYQGmAIMq7nZJd7K3rikfnQIGKI03xZsFe3PEdjzhDI9Ej/tRUbJx4pgg084QuYYca6MSYn/zJ5pqIG4A7ngCSxoxoWZT0duo8PrUWQeT17L73Q86HeEv3OzZhWfEF2fNDN3Qbsu4CllSZnbE7DXgAuysTwTYHS/na6lp7wejOqdRUXrHbidVla9Mwxyc9JdWNWzUDxALvNxAdNbV6l85mO8izy8wjQF6M1O7idqTfjQeR4pc=" - skip_existing: true - file_glob: true - file: - - dist/dvc*.whl - - dist/dvc*.tar.gz - skip_cleanup: true - on: - condition: "$(./scripts/ci/deploy_condition.sh dist/dvc*.whl dist/dvc*.tar.gz)" - tags: true - repo: iterative/dvc - stage: build - - provider: script - skip_cleanup: true - script: "(echo $SNAP_TOKEN | snapcraft login --with -) && timeout 600 snapcraft push dvc_*.snap --release $SNAP_CHANNEL || echo timed out" - on: - all_branches: true - condition: "$(./scripts/ci/deploy_condition.sh dvc_*.snap) && ($TRAVIS_BRANCH = master || -n $TRAVIS_TAG) && -n $SNAP_CHANNEL" - repo: iterative/dvc - stage: build diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index e377bae229..17514a6780 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -2,7 +2,7 @@ import os from typing import Optional -from funcy import cached_property, post_processing +from funcy import cached_property, log_calls, post_processing from dvc.dvcfile import is_valid_filename from dvc.exceptions import OutputNotFoundError @@ -260,6 +260,7 @@ def __init__( self._dvctrees = dict( sorted(dvctrees, key=lambda v: len(v[0]), reverse=True) ) + print(self._dvctrees) self.tree = tree self._kwargs = kwargs @@ -274,9 +275,11 @@ def stream(self): def _find_subtree(self, path_prefix) -> Optional[DvcTree]: return self._find_subtree_with_prefix(path_prefix)[1] + @log_calls(logger.info) def _find_subtree_with_prefix(self, path): # dvctrees is already ordered from low to high path_prefix = os.path.abspath(path) + print(path_prefix) for pref, tree in self._dvctrees.items(): if path_prefix.startswith(pref): return pref, tree diff --git a/scripts/ci/script.sh b/scripts/ci/script.sh index 37bfa37f02..6e9c29368a 100644 --- a/scripts/ci/script.sh +++ b/scripts/ci/script.sh @@ -3,7 +3,7 @@ set -x set -e -python -mtests +python -m tests tests/func/test_{get,import,ls,update,api}.py if [[ "$TRAVIS_PULL_REQUEST" == "false" && "$TRAVIS_SECURE_ENV_VARS" == "true" ]]; then pip install codecov From a61d70250c10fdc74baa8be4dc95f6e078fbe9a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 21 Jul 2020 18:26:41 +0545 Subject: [PATCH 06/11] fixup --- dvc/repo/tree.py | 9 ++++----- dvc/tree/http.py | 2 +- tests/__main__.py | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 17514a6780..f0a400190e 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -2,7 +2,7 @@ import os from typing import Optional -from funcy import cached_property, log_calls, post_processing +from funcy import cached_property, post_processing from dvc.dvcfile import is_valid_filename from dvc.exceptions import OutputNotFoundError @@ -260,7 +260,7 @@ def __init__( self._dvctrees = dict( sorted(dvctrees, key=lambda v: len(v[0]), reverse=True) ) - print(self._dvctrees) + logger.info("dvctrees: %r", self._dvctrees) self.tree = tree self._kwargs = kwargs @@ -275,11 +275,10 @@ def stream(self): def _find_subtree(self, path_prefix) -> Optional[DvcTree]: return self._find_subtree_with_prefix(path_prefix)[1] - @log_calls(logger.info) def _find_subtree_with_prefix(self, path): # dvctrees is already ordered from low to high path_prefix = os.path.abspath(path) - print(path_prefix) + logger.info("Path prefix: %s, original: %s", path_prefix, path) for pref, tree in self._dvctrees.items(): if path_prefix.startswith(pref): return pref, tree @@ -297,7 +296,7 @@ def open( return self.tree.open(path, mode=mode, encoding=encoding) def open_by_relpath(self, path, *args, **kwargs): - return self.open(PathInfo(self.tree.tree_root, path), *args, **kwargs) + return self.open(PathInfo(self.tree.tree_root) / path, *args, **kwargs) def exists(self, path): # pylint: disable=arguments-differ subtree = self._find_subtree(path) diff --git a/dvc/tree/http.py b/dvc/tree/http.py index a683ec5ea5..6ec5f28e7e 100644 --- a/dvc/tree/http.py +++ b/dvc/tree/http.py @@ -2,7 +2,7 @@ import os.path import threading -from funcy import cached_property, memoize, wrap_prop, wrap_with +from funcy import cached_property, memoize, wrap_prop, wrap_with, log_exits import dvc.prompt as prompt from dvc.exceptions import DvcException, HTTPError diff --git a/tests/__main__.py b/tests/__main__.py index 05dfddd831..c645ddf132 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -17,7 +17,6 @@ params = " ".join(sys.argv[1:]) cmd = ( - "py.test -v -n=4 --timeout=600 --timeout_method=thread --log-level=debug" - " --cov=dvc {params} --durations=0".format(params=params) + "pytest -svv --timeout=600 {params} --durations=0".format(params=params) ) check_call(cmd, shell=True) From 0a63465e296dee08e79334dbda64980624a883fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 21 Jul 2020 18:39:29 +0545 Subject: [PATCH 07/11] debug --- dvc/external_repo.py | 1 + dvc/repo/tree.py | 3 +++ dvc/subrepos.py | 3 +++ 3 files changed, 7 insertions(+) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 234ebed263..ac64143369 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -106,6 +106,7 @@ def __init__( repo_kw = dict(scm=self.scm, rev=rev) paths = subrepos.find(tree) + print("paths: ", paths) self.repos = [Repo(path, **repo_kw) for path in paths] self._setup_cache_dir() self.rev = rev diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index f0a400190e..601b06dcb1 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -251,12 +251,15 @@ class RepoTree(BaseTree): # pylint:disable=abstract-method def __init__( self, tree, subrepos=None, **kwargs ): # pylint: disable=super-init-not-called + logger.info("original subrepos: %s", subrepos) subrepos = subrepos or [] subrepos.sort(key=lambda r: len(r.root_dir), reverse=True) + logger.info("sorted subrepos: %r", subrepos) dvctrees = [ (os.path.abspath(repo.root_dir), DvcTree(repo, **kwargs)) for repo in subrepos ] + logger.info("dvctrees tuples: %r", dvctrees) self._dvctrees = dict( sorted(dvctrees, key=lambda v: len(v[0]), reverse=True) ) diff --git a/dvc/subrepos.py b/dvc/subrepos.py index 0460a054d7..29c0fe86e9 100644 --- a/dvc/subrepos.py +++ b/dvc/subrepos.py @@ -5,7 +5,10 @@ @collecting def find(tree, top=None): + print("tree, top, tree_root: ", tree, top, tree.tree_root) top = top or tree.tree_root for root, _, _ in tree.walk(top): + print("root: ", root) + print("checking if it's a DVC repo", os.path.join(root, ".dvc"), tree.isdir(os.path.join(root, ".dvc"))) if tree.isdir(os.path.join(root, ".dvc")): yield root From e825a88b583338202bb2ee3e1f70154688788ee5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 21 Jul 2020 18:59:23 +0545 Subject: [PATCH 08/11] debug --- dvc/subrepos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/subrepos.py b/dvc/subrepos.py index 29c0fe86e9..08560fd665 100644 --- a/dvc/subrepos.py +++ b/dvc/subrepos.py @@ -7,7 +7,7 @@ def find(tree, top=None): print("tree, top, tree_root: ", tree, top, tree.tree_root) top = top or tree.tree_root - for root, _, _ in tree.walk(top): + for root, _, _ in tree.walk(os.path.abspath(top)): print("root: ", root) print("checking if it's a DVC repo", os.path.join(root, ".dvc"), tree.isdir(os.path.join(root, ".dvc"))) if tree.isdir(os.path.join(root, ".dvc")): From 2c7ef5bb833bee84b1240667a11ee713b9d42d0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 21 Jul 2020 19:05:41 +0545 Subject: [PATCH 09/11] debug --- dvc/subrepos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/subrepos.py b/dvc/subrepos.py index 08560fd665..17bb925977 100644 --- a/dvc/subrepos.py +++ b/dvc/subrepos.py @@ -7,7 +7,7 @@ def find(tree, top=None): print("tree, top, tree_root: ", tree, top, tree.tree_root) top = top or tree.tree_root - for root, _, _ in tree.walk(os.path.abspath(top)): + for root, _, _ in tree.walk(os.path.realpath(top)): print("root: ", root) print("checking if it's a DVC repo", os.path.join(root, ".dvc"), tree.isdir(os.path.join(root, ".dvc"))) if tree.isdir(os.path.join(root, ".dvc")): From a8a9a50c624a2e4169bc9aca3f78a4b372642d10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 21 Jul 2020 19:48:38 +0545 Subject: [PATCH 10/11] debug --- dvc/external_repo.py | 2 +- dvc/subrepos.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index ac64143369..a62f8247c2 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -90,7 +90,7 @@ class ExternalRepo: def __init__( self, root_dir, url, rev, for_write=False, fetch=True, **kwargs ): - self.root_dir = os.path.abspath(root_dir) + self.root_dir = os.path.realpath(root_dir) self.scm = Git(root_dir) self.url = url self.config = {"fetch": fetch, **kwargs} diff --git a/dvc/subrepos.py b/dvc/subrepos.py index 17bb925977..29c0fe86e9 100644 --- a/dvc/subrepos.py +++ b/dvc/subrepos.py @@ -7,7 +7,7 @@ def find(tree, top=None): print("tree, top, tree_root: ", tree, top, tree.tree_root) top = top or tree.tree_root - for root, _, _ in tree.walk(os.path.realpath(top)): + for root, _, _ in tree.walk(top): print("root: ", root) print("checking if it's a DVC repo", os.path.join(root, ".dvc"), tree.isdir(os.path.join(root, ".dvc"))) if tree.isdir(os.path.join(root, ".dvc")): From df7a2ccfdf3c4af2f5928466a9eff24598fb5330 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 21 Jul 2020 20:03:13 +0545 Subject: [PATCH 11/11] debug --- dvc/external_repo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index a62f8247c2..6094d76307 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -91,14 +91,14 @@ def __init__( self, root_dir, url, rev, for_write=False, fetch=True, **kwargs ): self.root_dir = os.path.realpath(root_dir) - self.scm = Git(root_dir) + self.scm = Git(self.root_dir) self.url = url self.config = {"fetch": fetch, **kwargs} self.for_write = for_write repo_kw = {} if for_write: - tree = LocalTree(None, {"url": self.root_dir}) + tree = LocalTree(None, {"url": root_dir}) else: # .dvc folders are ignored by dvcignore which is required for # `subrepos.find()`, hence using a separate tree