From 016d99d968ab5a7385f14a44cbb5870c61328202 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Tue, 9 Jun 2020 17:59:32 +0900 Subject: [PATCH 1/5] dvcignore: ignore paths outside of CleanTree root --- dvc/ignore.py | 5 ++--- tests/func/test_ignore.py | 7 ++----- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 7ef67763bd..b7a59e8f0f 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -186,8 +186,7 @@ def _parents_exist(self, path): if path.parent == self.tree_root or Repo.DVC_DIR in path.parts: return True - # if path is outside of tree, assume this is a local remote/local cache - # link/move operation where we do not need to filter ignores + # paths outside of the CleanTree root should be ignored path = relpath(path, self.tree_root) if path.startswith("..") or ( os.name == "nt" @@ -195,7 +194,7 @@ def _parents_exist(self, path): [os.path.abspath(path), self.tree_root] ) ): - return True + return False # check if parent directories are in our ignores, starting from # tree_root diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index a0c648932b..d3a64435f9 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -10,7 +10,6 @@ DvcIgnorePatterns, DvcIgnoreRepo, ) -from dvc.remote import LocalRemote from dvc.repo import Repo from dvc.scm.tree import WorkingTree from dvc.utils import relpath @@ -139,8 +138,7 @@ def test_match_nested(tmp_dir, dvc): } ) - remote = LocalRemote(dvc, {}) - result = {os.fspath(f) for f in remote.tree.walk_files(".")} + result = {os.fspath(os.path.normpath(f)) for f in dvc.tree.walk_files(".")} assert result == {".dvcignore", "foo"} @@ -149,8 +147,7 @@ def test_ignore_external(tmp_dir, scm, dvc, tmp_path_factory): ext_dir = TmpDir(os.fspath(tmp_path_factory.mktemp("external_dir"))) ext_dir.gen({"y.backup": "y", "tmp": "ext tmp"}) - remote = LocalRemote(dvc, {}) - result = {relpath(f, ext_dir) for f in remote.tree.walk_files(ext_dir)} + result = {relpath(f, ext_dir) for f in dvc.tree.walk_files(ext_dir)} assert result == {"y.backup", "tmp"} From 2bc305d90bcbc6da0103e6775e310f6b15cd98f9 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Tue, 9 Jun 2020 18:00:38 +0900 Subject: [PATCH 2/5] remote: local remote/cache should always use its own working tree * repo tree (CleanTree) instance should not be relevant to local cache operations. When the dvcignore CleanTree is relevant, repo.tree should be used. For local remote/cache operations, the local remote work_tree should be used. --- dvc/remote/local.py | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/dvc/remote/local.py b/dvc/remote/local.py index d3a49a8b50..1b52f1f233 100644 --- a/dvc/remote/local.py +++ b/dvc/remote/local.py @@ -50,18 +50,12 @@ def repo(self): return self.remote.repo @cached_property - def _work_tree(self): - if self.repo: - return WorkingTree(self.repo.root_dir) - return None - - @property def work_tree(self): # When using repo.brancher, repo.tree may change to/from WorkingTree to # GitTree arbitarily. When repo.tree is GitTree, local cache needs to # use its own WorkingTree instance. - if self.repo and not is_working_tree(self.repo.tree): - return self._work_tree + if self.repo: + return WorkingTree(self.repo.root_dir) return None @staticmethod @@ -72,23 +66,17 @@ def exists(self, path_info): assert isinstance(path_info, str) or path_info.scheme == "local" if not self.repo: return os.path.exists(path_info) - if self.work_tree and self.work_tree.exists(path_info): - return True - return self.repo.tree.exists(path_info) + return self.work_tree.exists(path_info) def isfile(self, path_info): if not self.repo: return os.path.isfile(path_info) - if self.work_tree and self.work_tree.isfile(path_info): - return True - return self.repo.tree.isfile(path_info) + return self.work_tree.isfile(path_info) def isdir(self, path_info): if not self.repo: return os.path.isdir(path_info) - if self.work_tree and self.work_tree.isdir(path_info): - return True - return self.repo.tree.isdir(path_info) + return self.work_tree.isdir(path_info) def iscopy(self, path_info): return not ( @@ -96,11 +84,7 @@ def iscopy(self, path_info): ) def walk_files(self, path_info, **kwargs): - if self.work_tree: - tree = self.work_tree - else: - tree = self.repo.tree - for fname in tree.walk_files(path_info): + for fname in self.work_tree.walk_files(path_info): yield PathInfo(fname) def is_empty(self, path_info): From 743854629e7fc663848e8f66e68b8ee6dbaa43b2 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Tue, 9 Jun 2020 18:57:35 +0900 Subject: [PATCH 3/5] state: tie state to local cache working tree --- dvc/repo/__init__.py | 8 ++++---- dvc/state.py | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index dce0187784..dd0b1528ac 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -111,14 +111,14 @@ def __init__(self, root_dir=None, scm=None, rev=None): friendly=True, ) + self.cache = Cache(self) + self.cloud = DataCloud(self) + if not scm: # NOTE: storing state and link_state in the repository itself to # avoid any possible state corruption in 'shared cache dir' # scenario. - self.state = State(self) - - self.cache = Cache(self) - self.cloud = DataCloud(self) + self.state = State(self.cache.local) self.stage_cache = StageCache(self) diff --git a/dvc/state.py b/dvc/state.py index ba22d614e1..10e3ee8049 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -7,7 +7,6 @@ from urllib.parse import urlencode, urlunparse from dvc.exceptions import DvcException -from dvc.scm.tree import WorkingTree from dvc.utils import current_timestamp, relpath, to_chunks from dvc.utils.fs import get_inode, get_mtime_and_size, remove @@ -89,10 +88,11 @@ class State: # pylint: disable=too-many-instance-attributes MAX_INT = 2 ** 63 - 1 MAX_UINT = 2 ** 64 - 2 - def __init__(self, repo): + def __init__(self, local_cache): + repo = local_cache.repo self.repo = repo self.root_dir = repo.root_dir - self.tree = WorkingTree(self.root_dir) + self.tree = local_cache.tree.work_tree state_config = repo.config.get("state", {}) self.row_limit = state_config.get("row_limit", self.STATE_ROW_LIMIT) @@ -364,7 +364,7 @@ def save(self, path_info, checksum): """ assert isinstance(path_info, str) or path_info.scheme == "local" assert checksum is not None - assert os.path.exists(path_info) + assert self.tree.exists(path_info) actual_mtime, actual_size = get_mtime_and_size(path_info, self.tree) actual_inode = get_inode(path_info) @@ -394,7 +394,7 @@ def get(self, path_info): assert isinstance(path_info, str) or path_info.scheme == "local" path = os.fspath(path_info) - if not os.path.exists(path): + if not self.tree.exists(path): return None actual_mtime, actual_size = get_mtime_and_size(path, self.tree) @@ -420,7 +420,7 @@ def save_link(self, path_info): """ assert isinstance(path_info, str) or path_info.scheme == "local" - if not os.path.exists(path_info): + if not self.tree.exists(path_info): return mtime, _ = get_mtime_and_size(path_info, self.tree) @@ -446,7 +446,7 @@ def get_unused_links(self, used): inode = self._from_sqlite(inode) path = os.path.join(self.root_dir, relpath) - if path in used or not os.path.exists(path): + if path in used or not self.tree.exists(path): continue actual_inode = get_inode(path) From 9c59e625680297b48e358d14da255e611432bccc Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 12 Jun 2020 16:42:01 +0900 Subject: [PATCH 4/5] tests: update tests --- tests/func/test_state.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/func/test_state.py b/tests/func/test_state.py index a36fbf6f84..616faf672f 100644 --- a/tests/func/test_state.py +++ b/tests/func/test_state.py @@ -13,7 +13,7 @@ def test_state(tmp_dir, dvc): path_info = PathInfo(path) md5 = file_md5(path)[0] - state = State(dvc) + state = State(dvc.cache.local) with state: state.save(path_info, md5) @@ -57,7 +57,7 @@ def get_inode_mocked(path): def test_get_state_record_for_inode(get_inode_mock, tmp_dir, dvc): tmp_dir.gen("foo", "foo content") - state = State(dvc) + state = State(dvc.cache.local) inode = state.MAX_INT + 2 assert inode != state._to_sqlite(inode) From 9da6b2a36293a23cb25aaa00fe81c2410b808d66 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 12 Jun 2020 17:09:24 +0900 Subject: [PATCH 5/5] state: use os.path.exists instead of WorkingTree.exists in some cases --- dvc/state.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dvc/state.py b/dvc/state.py index 10e3ee8049..fd3924f932 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -364,7 +364,7 @@ def save(self, path_info, checksum): """ assert isinstance(path_info, str) or path_info.scheme == "local" assert checksum is not None - assert self.tree.exists(path_info) + assert os.path.exists(path_info) actual_mtime, actual_size = get_mtime_and_size(path_info, self.tree) actual_inode = get_inode(path_info) @@ -394,7 +394,10 @@ def get(self, path_info): assert isinstance(path_info, str) or path_info.scheme == "local" path = os.fspath(path_info) - if not self.tree.exists(path): + # NOTE: use os.path.exists instead of WorkingTree.exists + # WorkingTree.exists uses lexists() and will return True for broken + # symlinks that we cannot stat() in get_mtime_and_size + if not os.path.exists(path): return None actual_mtime, actual_size = get_mtime_and_size(path, self.tree)