From e20110817085421e3e7f382d008b864481aa1be9 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 14:33:02 +0900 Subject: [PATCH 01/22] erepo: load external repos directly from git tree --- dvc/external_repo.py | 36 ++++++++++++++++++++---------------- dvc/repo/__init__.py | 29 ++++++++++++++++++++--------- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 8f21fa0f0d..472569de59 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -29,10 +29,12 @@ def external_repo(url, rev=None, for_write=False): logger.debug("Creating external repo %s@%s", url, rev) path = _cached_clone(url, rev, for_write=for_write) + if not rev: + rev = "HEAD" try: - repo = ExternalRepo(path, url) + repo = ExternalRepo(path, url, rev, for_write=for_write) except NotDvcRepoError: - repo = ExternalGitRepo(path, url) + repo = ExternalGitRepo(path, url, rev) try: yield repo @@ -65,8 +67,12 @@ def clean_repos(): class ExternalRepo(Repo): - def __init__(self, root_dir, url): - super().__init__(root_dir) + def __init__(self, root_dir, url, rev, for_write=False): + if for_write: + super().__init__(root_dir) + else: + root_dir = os.path.realpath(root_dir) + super().__init__(root_dir, scm=Git(root_dir), rev=rev) self.url = url self._set_cache_dir() self._fix_upstream() @@ -166,9 +172,10 @@ def _add_upstream(self, src_repo): class ExternalGitRepo: - def __init__(self, root_dir, url): - self.root_dir = root_dir + def __init__(self, root_dir, url, rev): + self.root_dir = os.path.realpath(root_dir) self.url = url + self.tree = self.scm.get_tree(rev) @cached_property def scm(self): @@ -209,14 +216,12 @@ def _cached_clone(url, rev, for_write=False): revision checked out. If for_write is set prevents reusing this dir via cache. """ - if not for_write and Git.is_sha(rev) and (url, rev) in CLONES: - return CLONES[url, rev] - + # even if we have already cloned this repo, we may need to + # fetch/fast-forward to get specified rev clone_path = _clone_default_branch(url, rev) - rev_sha = Git(clone_path).resolve_rev(rev or "HEAD") - if not for_write and (url, rev_sha) in CLONES: - return CLONES[url, rev_sha] + if not for_write and (url) in CLONES: + return CLONES[url] # Copy to a new dir to keep the clone clean repo_path = tempfile.mkdtemp("dvc-erepo") @@ -224,11 +229,10 @@ def _cached_clone(url, rev, for_write=False): copy_tree(clone_path, repo_path) # Check out the specified revision - if rev is not None: + if for_write: _git_checkout(repo_path, rev) - - if not for_write: - CLONES[url, rev_sha] = repo_path + else: + CLONES[url] = repo_path return repo_path diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 8c863942f7..11ee74ee59 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -57,7 +57,7 @@ class Repo: from dvc.repo.get_url import get_url from dvc.repo.update import update - def __init__(self, root_dir=None): + def __init__(self, root_dir=None, scm=None, rev=None): from dvc.state import State from dvc.lock import make_lock from dvc.scm import SCM @@ -70,17 +70,23 @@ def __init__(self, root_dir=None): from dvc.utils.fs import makedirs from dvc.stage.cache import StageCache - root_dir = self.find_root(root_dir) + if scm: + # use GitTree instead of WorkingTree as default repo tree instance + tree = scm.get_tree(rev) + self.root_dir = self.find_root(root_dir, tree) + self.scm = scm + self.tree = tree + else: + root_dir = self.find_root(root_dir) + self.root_dir = os.path.abspath(os.path.realpath(root_dir)) - self.root_dir = os.path.abspath(os.path.realpath(root_dir)) self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR) - self.config = Config(self.dvc_dir) - no_scm = self.config["core"].get("no_scm", False) - self.scm = SCM(self.root_dir, no_scm=no_scm) - - self.tree = WorkingTree(self.root_dir) + if not scm: + no_scm = self.config["core"].get("no_scm", False) + self.scm = SCM(self.root_dir, no_scm=no_scm) + self.tree = WorkingTree(self.root_dir) self.tmp_dir = os.path.join(self.dvc_dir, "tmp") self.index_dir = os.path.join(self.tmp_dir, "index") @@ -124,9 +130,14 @@ def __repr__(self): return f"{self.__class__.__name__}: '{self.root_dir}'" @classmethod - def find_root(cls, root=None): + def find_root(cls, root=None, tree=None): root_dir = os.path.realpath(root or os.curdir) + if tree: + if tree.isdir(os.path.join(root_dir, cls.DVC_DIR)): + return root_dir + raise NotDvcRepoError(f"'{root}' does not contain DVC directory") + if not os.path.isdir(root_dir): raise NotDvcRepoError(f"directory '{root}' does not exist") From 4e9cd92f5ecea53b00507636bb84bff0a5002e40 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 15:55:54 +0900 Subject: [PATCH 02/22] erepo: add fetch_external --- dvc/external_repo.py | 68 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 5 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 472569de59..84794df762 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -18,6 +18,7 @@ ) from dvc.path_info import PathInfo from dvc.repo import Repo +from dvc.repo.tree import RepoTree from dvc.scm.git import Git from dvc.utils import tmp_fname from dvc.utils.fs import fs_copy, move, remove @@ -66,7 +67,61 @@ def clean_repos(): _remove(path) -class ExternalRepo(Repo): +class BaseExternalRepo: + + _local_cache = None + + @property + def local_cache(self): + if hasattr(self, "cache"): + return self.cache.local + return self._local_cache + + @contextmanager + def use_cache(self, cache): + """Use the specified cache in place of default tmpdir cache for + download operations. + """ + if hasattr(self, "cache"): + save_cache = self.cache.local + self.cache.local = cache + self._local_cache = cache + + yield + + if hasattr(self, "cache"): + self.cache.local = save_cache + self._local_cache = None + + @cached_property + def repo_tree(self): + return RepoTree(self, fetch=True) + + def fetch_external(self, paths, **kwargs): + download_results = [] + failed = 0 + + paths = [PathInfo(self.root_dir) / path for path in paths] + + def download_update(result): + download_results.append(result) + + for path in paths: + if not self.repo_tree.exists(path): + logger.exception(f"'{path}' does not exist in '{self.url}'") + failed += 1 + continue + self.local_cache.save( + path, + None, + tree=self.repo_tree, + download_callback=download_results, + ) + + return sum(download_results), failed + + +class ExternalRepo(Repo, BaseExternalRepo): def __init__(self, root_dir, url, rev, for_write=False): if for_write: super().__init__(root_dir) @@ -131,6 +186,7 @@ def _set_cache_dir(self): cache_dir = CACHE_DIRS[self.url] = tempfile.mkdtemp("dvc-cache") self.cache.local.cache_dir = cache_dir + self._local_cache = self.cache.local def _fix_upstream(self): if not os.path.isdir(self.url): @@ -171,7 +227,7 @@ def _add_upstream(self, src_repo): self.config["core"]["remote"] = "auto-generated-upstream" -class ExternalGitRepo: +class ExternalGitRepo(BaseExternalRepo): def __init__(self, root_dir, url, rev): self.root_dir = os.path.realpath(root_dir) self.url = url @@ -201,10 +257,12 @@ def pull_to(self, path, to_info): @contextmanager def open_by_relpath(self, path, mode="r", encoding=None, **kwargs): """Opens a specified resource as a file object.""" + tree = RepoTree(self) try: - abs_path = os.path.join(self.root_dir, path) - with open(abs_path, mode, encoding=encoding) as fd: - yield fd + with tree.open( + path, mode=mode, encoding=encoding, **kwargs + ) as fobj: + yield fobj except FileNotFoundError: raise PathMissingError(path, self.url) From dde6c61c185020fa17ce1e034f57df30981fcd99 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 15:56:34 +0900 Subject: [PATCH 03/22] remote/RepoTree: add callback for keeping track of download counts when saving tree to cache --- dvc/remote/base.py | 38 +++++++++++++++++++++++++------------- dvc/repo/tree.py | 6 ++++-- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 843563b3c0..0ea32d0c6a 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -459,7 +459,9 @@ def _do_link(self, from_info, to_info, link_method): "Created '%s': %s -> %s", self.cache_types[0], from_info, to_info, ) - def _save_file(self, path_info, checksum, save_link=True, tree=None): + def _save_file( + self, path_info, checksum, save_link=True, tree=None, **kwargs + ): assert checksum cache_info = self.checksum_to_path_info(checksum) @@ -467,6 +469,9 @@ def _save_file(self, path_info, checksum, save_link=True, tree=None): if self.changed_cache(checksum): with tree.open(path_info, mode="rb") as fobj: self.copy_fobj(fobj, cache_info) + callback = kwargs.get("download_callback") + if callback: + callback(1) else: if self.changed_cache(checksum): self.move(path_info, cache_info, mode=self.CACHE_MODE) @@ -510,9 +515,11 @@ def _cache_is_copy(self, path_info): self.cache_type_confirmed = True return self.cache_types[0] == "copy" - def _save_dir(self, path_info, checksum, save_link=True, tree=None): + def _save_dir( + self, path_info, checksum, save_link=True, tree=None, **kwargs + ): if tree: - checksum = self._save_tree(path_info, tree) + checksum = self._save_tree(path_info, tree, **kwargs) else: dir_info = self.get_dir_cache(checksum) @@ -531,17 +538,17 @@ def _save_dir(self, path_info, checksum, save_link=True, tree=None): if not tree or is_working_tree(tree): self.state.save(path_info, checksum) - def _save_tree(self, path_info, tree): + def _save_tree(self, path_info, tree, **kwargs): # save tree directory to cache, collect dir cache during walk and # return the resulting dir checksum dir_info = [] - for fname in tree.walk_files(path_info): + for fname in tree.walk_files(path_info, **kwargs): checksum = tree.get_file_checksum(fname) file_info = { self.PARAM_CHECKSUM: checksum, self.PARAM_RELPATH: fname.relative_to(path_info).as_posix(), } - self._save_file(fname, checksum, tree=tree) + self._save_file(fname, checksum, tree=tree, **kwargs) dir_info.append(file_info) return self._save_dir_info( @@ -575,20 +582,25 @@ def walk_files(self, path_info): def protect(path_info): pass - def save(self, path_info, checksum_info, save_link=True, tree=None): + def save( + self, path_info, checksum_info, save_link=True, tree=None, **kwargs + ): if path_info.scheme != self.scheme: raise RemoteActionNotImplemented( f"save {path_info.scheme} -> {self.scheme}", self.scheme, ) if tree: - # save checksum will be computed during tree walk - checksum = None + if tree.isdir(path_info): + # save checksum will be computed during tree walk + checksum = None + else: + checksum = tree.get_file_checksum(path_info) else: checksum = checksum_info[self.PARAM_CHECKSUM] - self._save(path_info, checksum, save_link, tree) + self._save(path_info, checksum, save_link, tree, **kwargs) - def _save(self, path_info, checksum, save_link=True, tree=None): + def _save(self, path_info, checksum, save_link=True, tree=None, **kwargs): if tree: logger.debug("Saving tree path '%s' to cache.", path_info) else: @@ -602,9 +614,9 @@ def _save(self, path_info, checksum, save_link=True, tree=None): isdir = self.isdir if isdir(path_info): - self._save_dir(path_info, checksum, save_link, tree) + self._save_dir(path_info, checksum, save_link, tree, **kwargs) return - self._save_file(path_info, checksum, save_link, tree) + self._save_file(path_info, checksum, save_link, tree, **kwargs) def _handle_transfer_exception( self, from_info, to_info, exception, operation diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 4e62adf198..4563573139 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -141,7 +141,7 @@ def _walk(self, root, trie, topdown=True): else: assert False - def walk(self, top, topdown=True, **kwargs): + def walk(self, top, topdown=True, download_callback=None, **kwargs): from pygtrie import Trie assert topdown @@ -168,7 +168,9 @@ def walk(self, top, topdown=True, **kwargs): if self.fetch: if out.changed_cache(filter_info=top): used_cache = out.get_used_cache(filter_info=top) - self.repo.cloud.pull(used_cache, **kwargs) + downloaded = self.repo.cloud.pull(used_cache, **kwargs) + if download_callback: + download_callback(downloaded) for entry in dir_cache: entry_relpath = entry[out.remote.PARAM_RELPATH] From fad9794b4ffdd0a1409fdbafb1d4a3c440ececab Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 15:57:43 +0900 Subject: [PATCH 04/22] repo: use erepo.fetch_external() in repo.fetch() --- dvc/repo/fetch.py | 58 +++++------------------------------------------ 1 file changed, 6 insertions(+), 52 deletions(-) diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index a9534d2a42..50ecee337a 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -1,9 +1,7 @@ import logging -from dvc.cache import NamedCache from dvc.config import NoRemoteError -from dvc.exceptions import DownloadError, OutputNotFoundError -from dvc.path_info import PathInfo +from dvc.exceptions import DownloadError from dvc.scm.base import CloneError logger = logging.getLogger(__name__) @@ -78,35 +76,15 @@ def _fetch( def _fetch_external(self, repo_url, repo_rev, files, jobs): - from dvc.external_repo import external_repo, ExternalRepo + from dvc.external_repo import external_repo failed, downloaded = 0, 0 try: with external_repo(repo_url, repo_rev) as repo: - is_dvc_repo = isinstance(repo, ExternalRepo) - # gather git-only tracked files if dvc repo - git_files = [] if is_dvc_repo else files - if is_dvc_repo: - repo.cache.local.cache_dir = self.cache.local.cache_dir - with repo.state: - cache = NamedCache() - for name in files: - try: - out = repo.find_out_by_relpath(name) - except OutputNotFoundError: - # try to add to cache if they are git-tracked files - git_files.append(name) - else: - cache.update(out.get_used_cache()) - - try: - downloaded += repo.cloud.pull(cache, jobs=jobs) - except DownloadError as exc: - failed += exc.amount - - d, f = _git_to_cache(self.cache.local, repo.root_dir, git_files) - downloaded += d - failed += f + with repo.use_cache(self.cache.local): + d, f = repo.fetch_external(files, jobs=jobs) + downloaded += d + failed += f except CloneError: failed += 1 logger.exception( @@ -114,27 +92,3 @@ def _fetch_external(self, repo_url, repo_rev, files, jobs): ) return downloaded, failed - - -def _git_to_cache(cache, repo_root, files): - """Save files from a git repo directly to the cache.""" - failed = set() - num_downloads = 0 - repo_root = PathInfo(repo_root) - for file in files: - info = cache.save_info(repo_root / file) - if info.get(cache.PARAM_CHECKSUM) is None: - failed.add(file) - continue - - if cache.changed_cache(info[cache.PARAM_CHECKSUM]): - logger.debug("fetched '%s' from '%s' repo", file, repo_root) - num_downloads += 1 - cache.save(repo_root / file, info, save_link=False) - - if failed: - logger.exception( - "failed to fetch data for {}".format(", ".join(failed)) - ) - - return num_downloads, len(failed) From d55c06f03dc0c925c7b72c0d48760032777f6d67 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 17:04:25 +0900 Subject: [PATCH 05/22] erepo: return cache save_infos in fetch_external --- dvc/external_repo.py | 34 ++++++++++++++++++++++------------ dvc/remote/base.py | 11 +++++++---- dvc/repo/fetch.py | 2 +- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 84794df762..f17096af61 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -4,6 +4,7 @@ import threading from contextlib import contextmanager from distutils.dir_util import copy_tree +from typing import Iterable from funcy import cached_property, retry, suppress, wrap_with @@ -97,7 +98,14 @@ def use_cache(self, cache): def repo_tree(self): return RepoTree(self, fetch=True) - def fetch_external(self, paths, **kwargs): + def fetch_external(self, paths: Iterable, **kwargs): + """Fetch specified external repo paths into cache. + + Returns 3-tuple in the form + (downloaded, failed, list(cache_infos)) + where cache_infos can be used as checkout targets for the + fetched paths. + """ download_results = [] failed = 0 @@ -106,19 +114,21 @@ def fetch_external(self, paths, **kwargs): def download_update(result): download_results.append(result) + save_infos = [] for path in paths: - if not self.repo_tree.exists(path): - logger.exception(f"'{path}' does not exist in '{self.url}'") + if self.repo_tree.exists(path): + save_info = self.local_cache.save( + path, + None, + tree=self.repo_tree, + download_callback=download_update, + ) + else: failed += 1 - continue - self.local_cache.save( - path, - None, - tree=self.repo_tree, - download_callback=download_results, - ) - - return sum(download_results), failed + save_info = {} + save_infos.append(save_info) + + return sum(download_results), failed, save_infos class ExternalRepo(Repo, BaseExternalRepo): diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 0ea32d0c6a..3707a37a72 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -492,6 +492,7 @@ def _save_file( if not tree or is_working_tree(tree): self.state.save(path_info, checksum) self.state.save(cache_info, checksum) + return {self.PARAM_CHECKSUM: checksum} def _cache_is_copy(self, path_info): """Checks whether cache uses copies.""" @@ -537,6 +538,7 @@ def _save_dir( self.state.save(cache_info, checksum) if not tree or is_working_tree(tree): self.state.save(path_info, checksum) + return {self.PARAM_CHECKSUM: checksum} def _save_tree(self, path_info, tree, **kwargs): # save tree directory to cache, collect dir cache during walk and @@ -598,7 +600,7 @@ def save( checksum = tree.get_file_checksum(path_info) else: checksum = checksum_info[self.PARAM_CHECKSUM] - self._save(path_info, checksum, save_link, tree, **kwargs) + return self._save(path_info, checksum, save_link, tree, **kwargs) def _save(self, path_info, checksum, save_link=True, tree=None, **kwargs): if tree: @@ -614,9 +616,10 @@ def _save(self, path_info, checksum, save_link=True, tree=None, **kwargs): isdir = self.isdir if isdir(path_info): - self._save_dir(path_info, checksum, save_link, tree, **kwargs) - return - self._save_file(path_info, checksum, save_link, tree, **kwargs) + return self._save_dir( + path_info, checksum, save_link, tree, **kwargs + ) + return self._save_file(path_info, checksum, save_link, tree, **kwargs) def _handle_transfer_exception( self, from_info, to_info, exception, operation diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index 50ecee337a..dcab08ec47 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -82,7 +82,7 @@ def _fetch_external(self, repo_url, repo_rev, files, jobs): try: with external_repo(repo_url, repo_rev) as repo: with repo.use_cache(self.cache.local): - d, f = repo.fetch_external(files, jobs=jobs) + d, f, _ = repo.fetch_external(files, jobs=jobs) downloaded += d failed += f except CloneError: From d26564042b49bcb964d1354b2e743bd999e05ad1 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 17:05:57 +0900 Subject: [PATCH 06/22] DependencyRepo: use erepo.fetch_external and cache.checkout instead of pull_to --- dvc/dependency/repo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index c53d0d55ba..1ce8a78c41 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -78,10 +78,10 @@ def download(self, to): if self.def_repo.get(self.PARAM_REV_LOCK) is None: self.def_repo[self.PARAM_REV_LOCK] = repo.scm.get_rev() - if hasattr(repo, "cache"): - repo.cache.local.cache_dir = self.repo.cache.local.cache_dir - - repo.pull_to(self.def_path, to.path_info) + cache = self.repo.cache.local + with repo.use_cache(cache): + _, _, cache_infos = repo.fetch_external([self.def_path]) + cache.checkout(to.path_info, cache_infos[0]) def update(self, rev=None): if rev: From a12cbc7fd2d753ceebca3dfda085c8d2bacb2dec Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 17:08:27 +0900 Subject: [PATCH 07/22] erepo: remove pull_to --- dvc/external_repo.py | 62 ++------------------------------ tests/func/test_external_repo.py | 5 ++- 2 files changed, 6 insertions(+), 61 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index f17096af61..eefbb0d4a7 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -6,11 +6,10 @@ from distutils.dir_util import copy_tree from typing import Iterable -from funcy import cached_property, retry, suppress, wrap_with +from funcy import cached_property, retry, wrap_with from dvc.config import NoRemoteError, NotDvcRepoError from dvc.exceptions import ( - CheckoutError, FileMissingError, NoOutputInExternalRepoError, NoRemoteInExternalRepoError, @@ -21,8 +20,7 @@ from dvc.repo import Repo from dvc.repo.tree import RepoTree from dvc.scm.git import Git -from dvc.utils import tmp_fname -from dvc.utils.fs import fs_copy, move, remove +from dvc.utils.fs import remove logger = logging.getLogger(__name__) @@ -142,52 +140,6 @@ def __init__(self, root_dir, url, rev, for_write=False): self._set_cache_dir() self._fix_upstream() - def pull_to(self, path, to_info): - """ - Pull the corresponding file or directory specified by `path` and - checkout it into `to_info`. - - It works with files tracked by Git and DVC, and also local files - outside the repository. - """ - out = None - path_info = PathInfo(self.root_dir) / path - - with suppress(OutputNotFoundError): - (out,) = self.find_outs_by_path(path_info, strict=False) - - try: - if out and out.use_cache: - self._pull_cached(out, path_info, to_info) - return - - # Check if it is handled by Git (it can't have an absolute path) - if os.path.isabs(path): - raise FileNotFoundError - - fs_copy(path_info, to_info) - except FileNotFoundError: - raise PathMissingError(path, self.url) - - def _pull_cached(self, out, path_info, dest): - with self.state: - tmp = PathInfo(tmp_fname(dest)) - src = tmp / path_info.relative_to(out.path_info) - - out.path_info = tmp - - # Only pull unless all needed cache is present - if out.changed_cache(filter_info=src): - self.cloud.pull(out.get_used_cache(filter_info=src)) - - try: - out.checkout(filter_info=src) - except CheckoutError: - raise FileNotFoundError - - move(src, dest) - remove(tmp) - @wrap_with(threading.Lock()) def _set_cache_dir(self): try: @@ -254,16 +206,6 @@ def close(self): def find_out_by_relpath(self, path): raise OutputNotFoundError(path, self) - def pull_to(self, path, to_info): - try: - # Git handled files can't have absolute path - if os.path.isabs(path): - raise FileNotFoundError - - fs_copy(os.path.join(self.root_dir, path), to_info) - except FileNotFoundError: - raise PathMissingError(path, self.url) - @contextmanager def open_by_relpath(self, path, mode="r", encoding=None, **kwargs): """Opens a specified resource as a file object.""" diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index 3adefbb308..13523d672f 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -89,7 +89,10 @@ def test_pull_subdir_file(tmp_dir, erepo_dir): dest = tmp_dir / "file" with external_repo(os.fspath(erepo_dir)) as repo: - repo.pull_to(os.path.join("subdir", "file"), dest) + _, _, save_infos = repo.fetch_external( + [os.path.join("subdir", "file")] + ) + repo.cache.local.checkout(dest, save_infos[0]) assert dest.is_file() assert dest.read_text() == "contents" From ac7a5fee707739d082ae1c656a61668c15e7797a Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 18:01:19 +0900 Subject: [PATCH 08/22] DvcTree: support granularity for isdir() --- dvc/repo/tree.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 4563573139..80822c170d 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -105,10 +105,25 @@ def isdir(self, path): path_info = PathInfo(os.path.abspath(path)) outs = self._find_outs(path, strict=False, recursive=True) - if len(outs) != 1 or outs[0].path_info != path_info: + if len(outs) != 1: return True - return outs[0].is_dir_checksum + out = outs[0] + if not out.is_dir_checksum: + if out.path_info != path_info: + return True + return False + + if out.path_info == path_info: + return True + + # for dir checksum, we need to check if this is a file inside the + # directory + try: + self._get_granular_checksum(path, out) + return False + except FileNotFoundError: + return True def isfile(self, path): if not self.exists(path): From 9d5d7544129c30af3da6e200302414790854b128 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 18:02:20 +0900 Subject: [PATCH 09/22] DvcTree: fix indentation bug --- dvc/repo/tree.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 80822c170d..7243effcaf 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -80,10 +80,8 @@ def open(self, path, mode="r", encoding="utf-8", remote=None): ) except RemoteActionNotImplemented: pass - cache_info = out.get_used_cache( - filter_info=path, remote=remote - ) - self.repo.cloud.pull(cache_info, remote=remote) + cache_info = out.get_used_cache(filter_info=path, remote=remote) + self.repo.cloud.pull(cache_info, remote=remote) if out.is_dir_checksum: checksum = self._get_granular_checksum(path, out) From 419f286085f78186d83490e01d8dc1995895e2aa Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 18:02:41 +0900 Subject: [PATCH 10/22] RepoTree: add copytree() method --- dvc/repo/tree.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 7243effcaf..5b1ff3620f 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -7,6 +7,7 @@ from dvc.remote.base import RemoteActionNotImplemented from dvc.scm.tree import BaseTree from dvc.utils import file_md5 +from dvc.utils.fs import copy_fobj_to_file, makedirs logger = logging.getLogger(__name__) @@ -370,3 +371,26 @@ def get_file_checksum(self, path_info): except OutputNotFoundError: pass return file_md5(path_info, self)[0] + + def copytree(self, top, dest): + top = PathInfo(top) + dest = PathInfo(dest) + + if not self.exists(top): + raise FileNotFoundError + + if self.isfile(top): + makedirs(dest.parent, exist_ok=True) + with self.open(top, mode="rb") as fobj: + copy_fobj_to_file(fobj, dest) + return + + for root, _, files in self.walk(top): + root = PathInfo(root) + dest_dir = root.relative_to(top) + makedirs(dest_dir, exist_ok=True) + for fname in files: + src = root / fname + dest = dest_dir / fname + with self.open(src, mode="rb") as fobj: + copy_fobj_to_file(fobj, dest) From 10aa9788d505c38d3c84d51544b59fcba98f42ca Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 18:03:14 +0900 Subject: [PATCH 11/22] LocalRemote: use working tree for local cache walk_files --- dvc/remote/local.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dvc/remote/local.py b/dvc/remote/local.py index 5ae66158fd..43c8df47c9 100644 --- a/dvc/remote/local.py +++ b/dvc/remote/local.py @@ -175,9 +175,11 @@ def getsize(path_info): return os.path.getsize(path_info) def walk_files(self, path_info): - assert is_working_tree(self.repo.tree) - - for fname in self.repo.tree.walk_files(path_info): + if self.work_tree: + tree = self.work_tree + else: + tree = self.repo.tree + for fname in tree.walk_files(path_info): yield PathInfo(fname) def get_file_checksum(self, path_info): From c5f8938e4b25c78c52073fc5331fccb26f35a742 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 18:04:07 +0900 Subject: [PATCH 12/22] repo: state should be noop for GitTree based erepos --- dvc/repo/__init__.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 11ee74ee59..145144189c 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -58,7 +58,7 @@ class Repo: from dvc.repo.update import update def __init__(self, root_dir=None, scm=None, rev=None): - from dvc.state import State + from dvc.state import State, StateNoop from dvc.lock import make_lock from dvc.scm import SCM from dvc.cache import Cache @@ -76,6 +76,7 @@ def __init__(self, root_dir=None, scm=None, rev=None): self.root_dir = self.find_root(root_dir, tree) self.scm = scm self.tree = tree + self.state = StateNoop() else: root_dir = self.find_root(root_dir) self.root_dir = os.path.abspath(os.path.realpath(root_dir)) @@ -100,9 +101,11 @@ def __init__(self, root_dir=None, scm=None, rev=None): friendly=True, ) - # NOTE: storing state and link_state in the repository itself to avoid - # any possible state corruption in 'shared cache dir' scenario. - self.state = State(self) + if not scm: + # NOTE: storing state and link_state in the repository itself to + # avoid any possible state corruption in 'shared cache dir' + # scenario. + self.state = State(self) self.cache = Cache(self) self.cloud = DataCloud(self) From 415aef85ad8649bdcc9f38c2a4661e1068a2fc5f Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 18:11:28 +0900 Subject: [PATCH 13/22] erepo: add get_external() --- dvc/external_repo.py | 32 ++++++++++++++++++++++---------- dvc/repo/get.py | 4 +--- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index eefbb0d4a7..997cc543b4 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -114,20 +114,32 @@ def download_update(result): save_infos = [] for path in paths: - if self.repo_tree.exists(path): - save_info = self.local_cache.save( - path, - None, - tree=self.repo_tree, - download_callback=download_update, - ) - else: - failed += 1 - save_info = {} + if not self.repo_tree.exists(path): + raise PathMissingError(path, self.url) + save_info = self.local_cache.save( + path, + None, + tree=self.repo_tree, + download_callback=download_update, + ) save_infos.append(save_info) return sum(download_results), failed, save_infos + def get_external(self, path, dest): + """Convenience wrapper for fetch_external and checkout.""" + if self.local_cache: + # fetch DVC and git files to tmpdir cache, then checkout + _, _, save_infos = self.fetch_external([path]) + self.local_cache.checkout(PathInfo(dest), save_infos[0]) + else: + # git-only erepo with no cache, just copy files directly + # to dest + path = PathInfo(self.root_dir) / path + if not self.repo_tree.exists(path): + raise PathMissingError(path, self.url) + self.repo_tree.copytree(path, dest) + class ExternalRepo(Repo, BaseExternalRepo): def __init__(self, root_dir, url, rev, for_write=False): diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 973c5c3371..1141b00772 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -4,7 +4,6 @@ import shortuuid from dvc.exceptions import DvcException -from dvc.path_info import PathInfo from dvc.utils import resolve_output from dvc.utils.fs import remove @@ -51,7 +50,6 @@ def get(url, path, out=None, rev=None): # Also, we can't use theoretical "move" link type here, because # the same cache file might be used a few times in a directory. repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] - - repo.pull_to(path, PathInfo(out)) + repo.get_external(path, out) finally: remove(tmp_dir) From 56d5c11794248a640c9fab19c551ad0ddcbef7d8 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 18:15:13 +0900 Subject: [PATCH 14/22] test_get: remove unneeded caplog check --- tests/func/test_get.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 0e1ae42269..515dba1366 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -157,17 +157,13 @@ def test_get_to_dir(tmp_dir, erepo_dir, dname): assert (tmp_dir / dname / "file").read_text() == "contents" -def test_get_from_non_dvc_master(tmp_dir, git_dir, caplog): +def test_get_from_non_dvc_master(tmp_dir, git_dir): with git_dir.chdir(), git_dir.branch("branch", new=True): git_dir.init(dvc=True) git_dir.dvc_gen("some_file", "some text", commit="create some file") - caplog.clear() - - with caplog.at_level(logging.INFO, logger="dvc"): - Repo.get(os.fspath(git_dir), "some_file", out="some_dst", rev="branch") + Repo.get(os.fspath(git_dir), "some_file", out="some_dst", rev="branch") - assert caplog.text == "" assert (tmp_dir / "some_dst").read_text() == "some text" From 0359d02c801c098dacd40ad54b3c67f65cf29c2a Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 18:43:22 +0900 Subject: [PATCH 15/22] erepo: add get_rev() --- dvc/dependency/repo.py | 4 ++-- dvc/external_repo.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index 1ce8a78c41..4e1598d204 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -76,7 +76,7 @@ def dumpd(self): def download(self, to): with self._make_repo() as repo: if self.def_repo.get(self.PARAM_REV_LOCK) is None: - self.def_repo[self.PARAM_REV_LOCK] = repo.scm.get_rev() + self.def_repo[self.PARAM_REV_LOCK] = repo.get_rev() cache = self.repo.cache.local with repo.use_cache(cache): @@ -88,4 +88,4 @@ def update(self, rev=None): self.def_repo[self.PARAM_REV] = rev with self._make_repo(locked=False) as repo: - self.def_repo[self.PARAM_REV_LOCK] = repo.scm.get_rev() + self.def_repo[self.PARAM_REV_LOCK] = repo.get_rev() diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 997cc543b4..2c66be806c 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -20,6 +20,7 @@ from dvc.repo import Repo from dvc.repo.tree import RepoTree from dvc.scm.git import Git +from dvc.scm.tree import is_working_tree from dvc.utils.fs import remove logger = logging.getLogger(__name__) @@ -96,6 +97,13 @@ def use_cache(self, cache): def repo_tree(self): return RepoTree(self, fetch=True) + def get_rev(self): + if is_working_tree(self.tree): + return self.scm.get_rev() + if hasattr(self.tree, "tree"): + return self.tree.tree.rev + return self.tree.rev + def fetch_external(self, paths: Iterable, **kwargs): """Fetch specified external repo paths into cache. From 092e423d7ed91f3f2fb53da96a90fdf629f97b35 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 21 May 2020 18:43:58 +0900 Subject: [PATCH 16/22] tests: fix erepo test bug --- tests/func/test_external_repo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index 13523d672f..f42a191c08 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -3,6 +3,7 @@ from mock import patch from dvc.external_repo import external_repo +from dvc.path_info import PathInfo from dvc.remote import LocalRemote from dvc.scm.git import Git from dvc.utils import relpath @@ -92,7 +93,7 @@ def test_pull_subdir_file(tmp_dir, erepo_dir): _, _, save_infos = repo.fetch_external( [os.path.join("subdir", "file")] ) - repo.cache.local.checkout(dest, save_infos[0]) + repo.cache.local.checkout(PathInfo(dest), save_infos[0]) assert dest.is_file() assert dest.read_text() == "contents" From 6a141e23603f3a92193f085a544b5c073280ecc1 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 22 May 2020 12:53:39 +0900 Subject: [PATCH 17/22] ls: use RepoTree --- dvc/repo/ls.py | 71 ++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/dvc/repo/ls.py b/dvc/repo/ls.py index 3fc0abce3b..4564858f7b 100644 --- a/dvc/repo/ls.py +++ b/dvc/repo/ls.py @@ -28,22 +28,13 @@ def ls( } """ from dvc.external_repo import external_repo - from dvc.repo import Repo with external_repo(url, rev) as repo: path_info = PathInfo(repo.root_dir) if path: path_info /= path - ret = {} - if isinstance(repo, Repo): - ret = _ls(repo, path_info, recursive, True) - - nondvc = {} - if not dvc_only: - nondvc = _ls(repo, path_info, recursive, False) - - ret.update(nondvc) + ret = _ls(repo, path_info, recursive, dvc_only) if path and not ret: raise PathMissingError(path, repo, dvc_only=dvc_only) @@ -56,46 +47,52 @@ def ls( return ret_list -def _ls(repo, path_info, recursive=None, dvc=False): - from dvc.ignore import CleanTree - from dvc.repo.tree import DvcTree - from dvc.scm.tree import WorkingTree +def _ls(repo, path_info, recursive=None, dvc_only=False): + from dvc.repo.tree import RepoTree - if dvc: - tree = DvcTree(repo) - else: - tree = CleanTree(WorkingTree(repo.root_dir)) + # use our own RepoTree instance instead of repo.repo_tree since we do not + # want fetch/stream enabled for ls + tree = RepoTree(repo) ret = {} try: - for root, dirs, files in tree.walk(path_info.fspath): + for root, dirs, files in tree.walk(path_info.fspath, dvcfiles=True): for fname in files: info = PathInfo(root) / fname - path = str(info.relative_to(path_info)) - ret[path] = { - "isout": dvc, - "isdir": False, - "isexec": False if dvc else tree.isexec(info.fspath), - } + dvc = tree.isdvc(info) + if dvc or not dvc_only: + path = str(info.relative_to(path_info)) + ret[path] = { + "isout": dvc, + "isdir": False, + "isexec": False if dvc else tree.isexec(info), + } if not recursive: for dname in dirs: info = PathInfo(root) / dname - path = str(info.relative_to(path_info)) - ret[path] = { - "isout": tree.isdvc(info.fspath) if dvc else False, - "isdir": True, - "isexec": False if dvc else tree.isexec(info.fspath), - } + if not dvc_only or ( + tree.dvctree and tree.dvctree.exists(info) + ): + dvc = tree.isdvc(info) + path = str(info.relative_to(path_info)) + ret[path] = { + "isout": dvc, + "isdir": True, + "isexec": False if dvc else tree.isexec(info), + } break except NotADirectoryError: - return { - path_info.name: { - "isout": dvc, - "isdir": False, - "isexec": False if dvc else tree.isexec(path_info.fspath), + dvc = tree.isdvc(path_info) + if dvc or not dvc_only: + return { + path_info.name: { + "isout": dvc, + "isdir": False, + "isexec": False if dvc else tree.isexec(path_info), + } } - } + return {} except FileNotFoundError: return {} From 0c951a8a3e5c68ffdd9c95197188c4f6e9f6dc7a Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 22 May 2020 13:08:42 +0900 Subject: [PATCH 18/22] remote: support trees in _collect_dir --- dvc/remote/base.py | 50 +++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 3707a37a72..ab36047861 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -212,23 +212,33 @@ def _calculate_checksums(self, file_infos): checksums = dict(zip(file_infos, tasks)) return checksums - def _collect_dir(self, path_info): + def _collect_dir(self, path_info, tree=None, save_tree=False, **kwargs): file_infos = set() - for fname in self.walk_files(path_info): + if tree: + walk_files = tree.walk_files + else: + walk_files = self.walk_files + + for fname in walk_files(path_info, **kwargs): if DvcIgnore.DVCIGNORE_FILE == fname.name: raise DvcIgnoreInCollectedDirError(fname.parent) file_infos.add(fname) - checksums = {fi: self.state.get(fi) for fi in file_infos} - not_in_state = { - fi for fi, checksum in checksums.items() if checksum is None - } - - new_checksums = self._calculate_checksums(not_in_state) + if tree: + checksums = {fi: tree.get_checksum(fi) for fi in file_infos} + if save_tree: + for fi, checksum in checksums.items(): + self._save_file(fi, checksum, tree=tree, **kwargs) + else: + checksums = {fi: self.state.get(fi) for fi in file_infos} + not_in_state = { + fi for fi, checksum in checksums.items() if checksum is None + } - checksums.update(new_checksums) + new_checksums = self._calculate_checksums(not_in_state) + checksums.update(new_checksums) result = [ { @@ -520,7 +530,10 @@ def _save_dir( self, path_info, checksum, save_link=True, tree=None, **kwargs ): if tree: - checksum = self._save_tree(path_info, tree, **kwargs) + dir_info = self._collect_dir( + path_info, tree=tree, save_tree=True, **kwargs + ) + checksum = self._save_dir_info(dir_info) else: dir_info = self.get_dir_cache(checksum) @@ -540,23 +553,6 @@ def _save_dir( self.state.save(path_info, checksum) return {self.PARAM_CHECKSUM: checksum} - def _save_tree(self, path_info, tree, **kwargs): - # save tree directory to cache, collect dir cache during walk and - # return the resulting dir checksum - dir_info = [] - for fname in tree.walk_files(path_info, **kwargs): - checksum = tree.get_file_checksum(fname) - file_info = { - self.PARAM_CHECKSUM: checksum, - self.PARAM_RELPATH: fname.relative_to(path_info).as_posix(), - } - self._save_file(fname, checksum, tree=tree, **kwargs) - dir_info.append(file_info) - - return self._save_dir_info( - sorted(dir_info, key=itemgetter(self.PARAM_RELPATH)) - ) - def is_empty(self, path_info): return False From 95ea7e265895686513de72646b6e2500117c4edf Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 22 May 2020 13:12:59 +0900 Subject: [PATCH 19/22] remote: support trees in get_dir_checksum --- dvc/dependency/repo.py | 13 ++++++++++++- dvc/remote/base.py | 9 ++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index 4e1598d204..f76344f119 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -50,13 +50,24 @@ def _make_repo(self, *, locked=True): return external_repo(d["url"], rev=rev) def _get_checksum(self, locked=True): + from dvc.repo.tree import RepoTree + with self._make_repo(locked=locked) as repo: try: return repo.find_out_by_relpath(self.def_path).info["md5"] except OutputNotFoundError: path = PathInfo(os.path.join(repo.root_dir, self.def_path)) + + # we want stream but not fetch, so DVC out directories are + # walked, but dir contents is not fetched + tree = RepoTree(repo, stream=True) + # We are polluting our repo cache with some dir listing here - return self.repo.cache.local.get_checksum(path) + if tree.isdir(path): + return self.repo.cache.local.get_dir_checksum( + path, tree=tree + ) + return tree.get_file_checksum(path) def status(self): current_checksum = self._get_checksum(locked=True) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index ab36047861..d66dfd5510 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -227,7 +227,7 @@ def _collect_dir(self, path_info, tree=None, save_tree=False, **kwargs): file_infos.add(fname) if tree: - checksums = {fi: tree.get_checksum(fi) for fi in file_infos} + checksums = {fi: tree.get_file_checksum(fi) for fi in file_infos} if save_tree: for fi, checksum in checksums.items(): self._save_file(fi, checksum, tree=tree, **kwargs) @@ -259,11 +259,14 @@ def _collect_dir(self, path_info, tree=None, save_tree=False, **kwargs): # Sorting the list by path to ensure reproducibility return sorted(result, key=itemgetter(self.PARAM_RELPATH)) - def get_dir_checksum(self, path_info): + def get_dir_checksum(self, path_info, tree=None): if not self.cache: raise RemoteCacheRequiredError(path_info) - dir_info = self._collect_dir(path_info) + dir_info = self._collect_dir(path_info, tree=None) + if tree: + # don't save state entry for path_info if it is a tree path + path_info = None return self._save_dir_info(dir_info, path_info) def _save_dir_info(self, dir_info, path_info=None): From 3ed4250ef4fc054fb38ed63c06b560d32090875a Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 22 May 2020 14:27:02 +0900 Subject: [PATCH 20/22] RepoTree: fix dir_cache relpaths in windows --- dvc/repo/tree.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 5b1ff3620f..f422f9e99e 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -48,7 +48,10 @@ def _get_granular_checksum(self, path, out, remote=None): raise FileNotFoundError dir_cache = out.get_dir_cache(remote=remote) for entry in dir_cache: - if path == out.path_info / entry[out.remote.PARAM_RELPATH]: + entry_relpath = entry[out.remote.PARAM_RELPATH] + if os.name == "nt": + entry_relpath = entry_relpath.replace("/", os.sep) + if path == out.path_info / entry_relpath: return entry[out.remote.PARAM_CHECKSUM] raise FileNotFoundError @@ -188,6 +191,8 @@ def walk(self, top, topdown=True, download_callback=None, **kwargs): for entry in dir_cache: entry_relpath = entry[out.remote.PARAM_RELPATH] + if os.name == "nt": + entry_relpath = entry_relpath.replace("/", os.sep) path_info = out.path_info / entry_relpath trie[path_info.parts] = None From d6d031cbf83591cff0d6072a33a14bad9d5fd323 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 22 May 2020 15:10:31 +0900 Subject: [PATCH 21/22] remote: fix duplicated fetch bug --- dvc/remote/base.py | 7 ++++++- dvc/repo/tree.py | 12 ++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index d66dfd5510..8b338fd07b 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -481,7 +481,12 @@ def _save_file( if tree: if self.changed_cache(checksum): with tree.open(path_info, mode="rb") as fobj: - self.copy_fobj(fobj, cache_info) + # if tree has fetch enabled, DVC out will be fetched on + # open and we do not need to read/copy any data + if not ( + tree.isdvc(path_info, strict=False) and tree.fetch + ): + self.copy_fobj(fobj, cache_info) callback = kwargs.get("download_callback") if callback: callback(1) diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index f422f9e99e..361e702ac4 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -235,6 +235,18 @@ def __init__(self, repo, **kwargs): # git-only erepo's do not need dvctree self.dvctree = None + @property + def fetch(self): + if self.dvctree: + return self.dvctree.fetch + return False + + @property + def stream(self): + if self.dvctree: + return self.dvctree.stream + return False + def open(self, path, mode="r", encoding="utf-8", **kwargs): if "b" in mode: encoding = None From f6bf5bf74993cbd988e1f753241a80c8901974a0 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 22 May 2020 16:07:51 +0900 Subject: [PATCH 22/22] utils: don't skip md5 dos2unix check for git blobs * otherwise md5 checksums will not match when comparing git version to cached filesystem version on windows --- dvc/istextfile.py | 8 ++++++-- dvc/utils/__init__.py | 6 +----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dvc/istextfile.py b/dvc/istextfile.py index 2e10592d9c..e4c29fdcf1 100644 --- a/dvc/istextfile.py +++ b/dvc/istextfile.py @@ -7,13 +7,17 @@ TEXT_CHARS = bytes(range(32, 127)) + b"\n\r\t\f\b" -def istextfile(fname, blocksize=512): +def istextfile(fname, blocksize=512, tree=None): """ Uses heuristics to guess whether the given file is text or binary, by reading a single block of bytes from the file. If more than 30% of the chars in the block are non-text, or there are NUL ('\x00') bytes in the block, assume this is a binary file. """ - with open(fname, "rb") as fobj: + if tree: + open_func = tree.open + else: + open_func = open + with open_func(fname, "rb") as fobj: block = fobj.read(blocksize) if not block: diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index 3055371682..b614d6cbe6 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -52,18 +52,14 @@ def file_md5(fname, tree=None): exists_func = tree.exists stat_func = tree.stat open_func = tree.open - # assume we don't need to run dos2unix when comparing git blobs - binary = True else: exists_func = os.path.exists stat_func = os.stat open_func = open - binary = False if exists_func(fname): hash_md5 = hashlib.md5() - if not binary: - binary = not istextfile(fname) + binary = not istextfile(fname, tree=tree) size = stat_func(fname).st_size no_progress_bar = True if size >= LARGE_FILE_SIZE: