From 86a1dae94e5d1a7de6584ba1b30995eda0715fbb Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Thu, 6 Feb 2020 13:21:45 +0545 Subject: [PATCH 1/5] checkout: fix output message --- dvc/remote/base.py | 4 ++-- dvc/repo/checkout.py | 7 ++++--- dvc/state.py | 7 ++++++- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index be5929fd21..f7586b8893 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -878,8 +878,8 @@ def _checkout_file( """The file is changed we need to checkout a new copy""" cache_info = self.checksum_to_path_info(checksum) if self.exists(path_info): - logger.warning( - "data '{}' exists. Removing before checkout.", path_info + logger.info( + "The file '{}' already exists. It will be replaced.", path_info ) self.safe_remove(path_info, force=force) diff --git a/dvc/repo/checkout.py b/dvc/repo/checkout.py index 26f9e18e81..39a2e35c74 100644 --- a/dvc/repo/checkout.py +++ b/dvc/repo/checkout.py @@ -15,7 +15,7 @@ def _cleanup_unused_links(repo): for out in stage.outs if out.scheme == "local" ] - repo.state.remove_unused_links(used) + return repo.state.remove_unused_links(used) def get_all_files_numbers(pairs): @@ -34,9 +34,10 @@ def _checkout( ): from dvc.stage import StageFileDoesNotExistError, StageFileBadNameError + cleaned = False if not targets: targets = [None] - _cleanup_unused_links(self) + cleaned = _cleanup_unused_links(self) pairs = set() for target in targets: @@ -52,7 +53,7 @@ def _checkout( raise CheckoutErrorSuggestGit(target) from exc total = get_all_files_numbers(pairs) - if total == 0: + if total == 0 and not cleaned: logger.info("Nothing to do") failed = [] with Tqdm( diff --git a/dvc/state.py b/dvc/state.py index 6c9106d1e0..4ecdf3ffee 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -467,7 +467,10 @@ def remove_unused_links(self, used): actual_mtime, _ = get_mtime_and_size(path, self.repo.tree) if inode == actual_inode and mtime == actual_mtime: - logger.debug("Removing '{}' as unused link.", path) + logger.info( + "Removing '{}' as it is unused in the current worktree.", + relpath, + ) remove(path) unused.append(relpath) @@ -479,6 +482,8 @@ def remove_unused_links(self, used): ) self._execute(cmd, tuple(chunk_unused)) + return bool(unused) + def _connect_sqlite(filename, options): # Connect by URI was added in Python 3.4 and sqlite 3.7.7, From 3f2069c446292c83866efbcbe5af893ed8401e68 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Mon, 10 Feb 2020 21:20:47 +0545 Subject: [PATCH 2/5] state: refactor to get list of unused links --- dvc/repo/checkout.py | 9 ++++++++- dvc/state.py | 21 ++++++++++----------- tests/func/test_state.py | 20 +++++++++++++++++++- 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/dvc/repo/checkout.py b/dvc/repo/checkout.py index 39a2e35c74..714a5adf8a 100644 --- a/dvc/repo/checkout.py +++ b/dvc/repo/checkout.py @@ -15,7 +15,14 @@ def _cleanup_unused_links(repo): for out in stage.outs if out.scheme == "local" ] - return repo.state.remove_unused_links(used) + + unused = repo.state.get_unused_links(used) + for link in unused: + logger.info( + "Removing '{}' as it already exists in the current worktree.", link + ) + repo.state.remove_unused_links(unused) + return bool(unused) def get_all_files_numbers(pairs): diff --git a/dvc/state.py b/dvc/state.py index 4ecdf3ffee..29c0414e9b 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -443,7 +443,7 @@ def save_link(self, path_info): ) self._execute(cmd, (relative_path, self._to_sqlite(inode), mtime)) - def remove_unused_links(self, used): + def get_unused_links(self, used): """Removes all saved links except the ones that are used. Args: @@ -453,9 +453,9 @@ def remove_unused_links(self, used): self._execute("SELECT * FROM {}".format(self.LINK_STATE_TABLE)) for row in self.cursor: - relpath, inode, mtime = row + relative_path, inode, mtime = row inode = self._from_sqlite(inode) - path = os.path.join(self.root_dir, relpath) + path = os.path.join(self.root_dir, relative_path) if path in used: continue @@ -467,12 +467,13 @@ def remove_unused_links(self, used): actual_mtime, _ = get_mtime_and_size(path, self.repo.tree) if inode == actual_inode and mtime == actual_mtime: - logger.info( - "Removing '{}' as it is unused in the current worktree.", - relpath, - ) - remove(path) - unused.append(relpath) + unused.append(relative_path) + + return unused + + def remove_unused_links(self, unused): + for relative_path in unused: + remove(os.path.join(self.root_dir, relative_path)) for chunk_unused in to_chunks( unused, chunk_size=SQLITE_MAX_VARIABLES_NUMBER @@ -482,8 +483,6 @@ def remove_unused_links(self, used): ) self._execute(cmd, tuple(chunk_unused)) - return bool(unused) - def _connect_sqlite(filename, options): # Connect by URI was added in Python 3.4 and sqlite 3.7.7, diff --git a/tests/func/test_state.py b/tests/func/test_state.py index 810338506c..abe6808aac 100644 --- a/tests/func/test_state.py +++ b/tests/func/test_state.py @@ -1,4 +1,5 @@ import mock +import os from dvc.path_info import PathInfo from dvc.state import State @@ -78,7 +79,24 @@ def test_remove_unused_links(tmp_dir, dvc): result = dvc.state._execute(cmd_count_links).fetchone()[0] assert result == 2 - dvc.state.remove_unused_links([]) + dvc.state.remove_unused_links(["foo", "bar"]) result = dvc.state._execute(cmd_count_links).fetchone()[0] assert result == 0 + + +def test_get_unused_links(tmp_dir, dvc): + assert len(tmp_dir.dvc_gen("foo", "foo_content")) == 1 + assert len(tmp_dir.dvc_gen("bar", "bar_content")) == 1 + + links = [os.path.join(dvc.root_dir, link) for link in ["foo", "bar"]] + with dvc.state: + assert set(dvc.state.get_unused_links([])) == {"foo", "bar"} + assert set(dvc.state.get_unused_links(links[:1])) == {"bar"} + assert set(dvc.state.get_unused_links(links)) == set() + assert set( + dvc.state.get_unused_links( + used=links[:1] + + [os.path.join(dvc.root_dir, "not-existing-file")] + ) + ) == {"bar"} From 9c7642e29ffefcd39b76a5b41059520038fa3c9d Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Mon, 10 Feb 2020 21:33:17 +0545 Subject: [PATCH 3/5] state: update docstrings --- dvc/state.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dvc/state.py b/dvc/state.py index 29c0414e9b..d214682044 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -444,10 +444,10 @@ def save_link(self, path_info): self._execute(cmd, (relative_path, self._to_sqlite(inode), mtime)) def get_unused_links(self, used): - """Removes all saved links except the ones that are used. + """Returns all saved links except the ones that are used. Args: - used (list): list of used links that should not be removed. + used (list): list of used links """ unused = [] @@ -472,6 +472,11 @@ def get_unused_links(self, used): return unused def remove_unused_links(self, unused): + """Removes all unused links. + + Args: + unused (list): list of unused links that should be removed. + """ for relative_path in unused: remove(os.path.join(self.root_dir, relative_path)) From 2c52d00ca52c7c845c18a09fdca0334b65b9dc3d Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Mon, 10 Feb 2020 22:52:28 +0545 Subject: [PATCH 4/5] state: rename to remove_links --- dvc/repo/checkout.py | 2 +- dvc/state.py | 13 ++++--------- tests/func/test_state.py | 16 ++++++++-------- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/dvc/repo/checkout.py b/dvc/repo/checkout.py index 714a5adf8a..2d9c5de4e6 100644 --- a/dvc/repo/checkout.py +++ b/dvc/repo/checkout.py @@ -21,7 +21,7 @@ def _cleanup_unused_links(repo): logger.info( "Removing '{}' as it already exists in the current worktree.", link ) - repo.state.remove_unused_links(unused) + repo.state.remove_links(unused) return bool(unused) diff --git a/dvc/state.py b/dvc/state.py index d214682044..2f7739267f 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -471,17 +471,12 @@ def get_unused_links(self, used): return unused - def remove_unused_links(self, unused): - """Removes all unused links. - - Args: - unused (list): list of unused links that should be removed. - """ - for relative_path in unused: - remove(os.path.join(self.root_dir, relative_path)) + def remove_links(self, links): + for link in links: + remove(os.path.join(self.root_dir, link)) for chunk_unused in to_chunks( - unused, chunk_size=SQLITE_MAX_VARIABLES_NUMBER + links, chunk_size=SQLITE_MAX_VARIABLES_NUMBER ): cmd = "DELETE FROM {} WHERE path IN ({})".format( self.LINK_STATE_TABLE, ",".join(["?"] * len(chunk_unused)) diff --git a/tests/func/test_state.py b/tests/func/test_state.py index abe6808aac..8e841d8c55 100644 --- a/tests/func/test_state.py +++ b/tests/func/test_state.py @@ -70,27 +70,27 @@ def test_get_state_record_for_inode(get_inode_mock, tmp_dir, dvc): assert ret is not None -def test_remove_unused_links(tmp_dir, dvc): - assert len(tmp_dir.dvc_gen("foo", "foo_content")) == 1 - assert len(tmp_dir.dvc_gen("bar", "bar_content")) == 1 +def test_remove_links(tmp_dir, dvc): + tmp_dir.dvc_gen({"foo": "foo_content", "bar": "bar_content"}) - cmd_count_links = "SELECT count(*) FROM {}".format(State.LINK_STATE_TABLE) with dvc.state: + cmd_count_links = "SELECT count(*) FROM {}".format( + State.LINK_STATE_TABLE + ) result = dvc.state._execute(cmd_count_links).fetchone()[0] assert result == 2 - dvc.state.remove_unused_links(["foo", "bar"]) + dvc.state.remove_links(["foo", "bar"]) result = dvc.state._execute(cmd_count_links).fetchone()[0] assert result == 0 def test_get_unused_links(tmp_dir, dvc): - assert len(tmp_dir.dvc_gen("foo", "foo_content")) == 1 - assert len(tmp_dir.dvc_gen("bar", "bar_content")) == 1 + tmp_dir.dvc_gen({"foo": "foo_content", "bar": "bar_content"}) - links = [os.path.join(dvc.root_dir, link) for link in ["foo", "bar"]] with dvc.state: + links = [os.path.join(dvc.root_dir, link) for link in ["foo", "bar"]] assert set(dvc.state.get_unused_links([])) == {"foo", "bar"} assert set(dvc.state.get_unused_links(links[:1])) == {"bar"} assert set(dvc.state.get_unused_links(links)) == set() From f8094cd7cb937cb56169dc9080e62b1aa807eaa9 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Tue, 11 Feb 2020 16:49:36 +0545 Subject: [PATCH 5/5] state: simplify get_unused_links --- dvc/state.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/dvc/state.py b/dvc/state.py index 2f7739267f..d54393aede 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -457,16 +457,13 @@ def get_unused_links(self, used): inode = self._from_sqlite(inode) path = os.path.join(self.root_dir, relative_path) - if path in used: - continue - - if not os.path.exists(path): + if path in used or not os.path.exists(path): continue actual_inode = get_inode(path) actual_mtime, _ = get_mtime_and_size(path, self.repo.tree) - if inode == actual_inode and mtime == actual_mtime: + if (inode, mtime) == (actual_inode, actual_mtime): unused.append(relative_path) return unused