Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions dvc/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -878,8 +878,8 @@ def _checkout_file(
"""The file is changed we need to checkout a new copy"""
cache_info = self.checksum_to_path_info(checksum)
if self.exists(path_info):
logger.warning(
"data '{}' exists. Removing before checkout.", path_info
logger.info(
"The file '{}' already exists. It will be replaced.", path_info
)
Comment on lines +881 to 883
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure about converting it to info. Maybe let's remove it for good? Or make it debug?

Copy link
Copy Markdown
Collaborator Author

@skshetry skshetry Feb 11, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was the result of the discussion from #2979, no?
Also: #2979 (comment)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@skshetry Good point, that needs to be clarified. Left a comment there.

self.safe_remove(path_info, force=force)

Expand Down
14 changes: 11 additions & 3 deletions dvc/repo/checkout.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,14 @@ def _cleanup_unused_links(repo):
for out in stage.outs
if out.scheme == "local"
]
repo.state.remove_unused_links(used)

unused = repo.state.get_unused_links(used)
for link in unused:
logger.info(
"Removing '{}' as it already exists in the current worktree.", link
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @jorgeorpinel - please review the messages in this PR

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I couldn't think of succinct messages. 😒

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The message is not very useful for info, IMHO. How about we move it to debug?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, it is pretty weird to split old remove_unused_links just to print a message here. Or am I missing something?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, it is pretty weird to split old remove_unused_links just to print a message here. Or am I missing something?

yes: #3297 (comment)

Copy link
Copy Markdown
Contributor

@efiop efiop Feb 11, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

First of all, I tried to maintain consistency: state does not have logger.info anywhere. Therefore, I did not try to introduce one, because I wanted to separate concerns (ui vs state/model) wherever possible, and it seemed like a good idea here for doing just that. slightly_smiling_face

So you moved it just to not use logger.info in state? That is a pretty weak argument for doing that, tbh πŸ™‚ There is nothing inherently wrong with using logging.info in state by itself. I think you just made half a step at separating the context and you really felt that "storing" and "removing" should be handled separately. Please see #3297 (comment) , I think that it should be like that or we should keep remove_unused_links in one piece. Let me know what you think.

Copy link
Copy Markdown
Collaborator Author

@skshetry skshetry Feb 11, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't consider separation of UI and business logic to be a weak argument. But probably as it's a cli app and already done this way on most of the places here, maybe it is.

I think that it should be like that or we should keep remove_unused_links in one piece. Let me know what you think.

I am fine, either way. Previously, I did not want to separate those too far off like you suggested below and tried to just get the stuff done. πŸ™ˆ
It does not really need to be separated if logger.info call is kept on dvc.state as it's only used in a single place.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @jorgeorpinel - please review the messages in this PR

I see #3326 replaced this and only has a debug string change

)
repo.state.remove_links(unused)
return bool(unused)


def get_all_files_numbers(pairs):
Expand All @@ -34,9 +41,10 @@ def _checkout(
):
from dvc.stage import StageFileDoesNotExistError, StageFileBadNameError

cleaned = False
if not targets:
targets = [None]
_cleanup_unused_links(self)
cleaned = _cleanup_unused_links(self)

pairs = set()
for target in targets:
Expand All @@ -52,7 +60,7 @@ def _checkout(
raise CheckoutErrorSuggestGit(target) from exc

total = get_all_files_numbers(pairs)
if total == 0:
if total == 0 and not cleaned:
logger.info("Nothing to do")
failed = []
with Tqdm(
Expand Down
29 changes: 15 additions & 14 deletions dvc/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,36 +443,37 @@ def save_link(self, path_info):
)
self._execute(cmd, (relative_path, self._to_sqlite(inode), mtime))

def remove_unused_links(self, used):
"""Removes all saved links except the ones that are used.
def get_unused_links(self, used):
"""Returns all saved links except the ones that are used.

Args:
used (list): list of used links that should not be removed.
used (list): list of used links
"""
unused = []

self._execute("SELECT * FROM {}".format(self.LINK_STATE_TABLE))
for row in self.cursor:
relpath, inode, mtime = row
relative_path, inode, mtime = row
inode = self._from_sqlite(inode)
path = os.path.join(self.root_dir, relpath)
path = os.path.join(self.root_dir, relative_path)

if path in used:
continue

if not os.path.exists(path):
if path in used or not os.path.exists(path):
continue

actual_inode = get_inode(path)
actual_mtime, _ = get_mtime_and_size(path, self.repo.tree)

if inode == actual_inode and mtime == actual_mtime:
logger.debug("Removing '{}' as unused link.", path)
remove(path)
unused.append(relpath)
if (inode, mtime) == (actual_inode, actual_mtime):
unused.append(relative_path)

return unused

def remove_links(self, links):
for link in links:
remove(os.path.join(self.root_dir, link))
Comment on lines +472 to +473
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A more natural refactor would split old remove_unused_links into get_unused_links and remove_link(s)(as a reverse save_link). So we would get_unused_links, remove them in checkout and then state.remove_links() what we've deleted. That way the context would be properly spread across these parts of code.


for chunk_unused in to_chunks(
unused, chunk_size=SQLITE_MAX_VARIABLES_NUMBER
links, chunk_size=SQLITE_MAX_VARIABLES_NUMBER
):
cmd = "DELETE FROM {} WHERE path IN ({})".format(
self.LINK_STATE_TABLE, ",".join(["?"] * len(chunk_unused))
Expand Down
28 changes: 23 additions & 5 deletions tests/func/test_state.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import mock
import os

from dvc.path_info import PathInfo
from dvc.state import State
Expand Down Expand Up @@ -69,16 +70,33 @@ def test_get_state_record_for_inode(get_inode_mock, tmp_dir, dvc):
assert ret is not None


def test_remove_unused_links(tmp_dir, dvc):
assert len(tmp_dir.dvc_gen("foo", "foo_content")) == 1
assert len(tmp_dir.dvc_gen("bar", "bar_content")) == 1
def test_remove_links(tmp_dir, dvc):
tmp_dir.dvc_gen({"foo": "foo_content", "bar": "bar_content"})

cmd_count_links = "SELECT count(*) FROM {}".format(State.LINK_STATE_TABLE)
with dvc.state:
cmd_count_links = "SELECT count(*) FROM {}".format(
State.LINK_STATE_TABLE
)
result = dvc.state._execute(cmd_count_links).fetchone()[0]
assert result == 2

dvc.state.remove_unused_links([])
dvc.state.remove_links(["foo", "bar"])

result = dvc.state._execute(cmd_count_links).fetchone()[0]
assert result == 0


def test_get_unused_links(tmp_dir, dvc):
tmp_dir.dvc_gen({"foo": "foo_content", "bar": "bar_content"})

with dvc.state:
links = [os.path.join(dvc.root_dir, link) for link in ["foo", "bar"]]
assert set(dvc.state.get_unused_links([])) == {"foo", "bar"}
assert set(dvc.state.get_unused_links(links[:1])) == {"bar"}
assert set(dvc.state.get_unused_links(links)) == set()
assert set(
dvc.state.get_unused_links(
used=links[:1]
+ [os.path.join(dvc.root_dir, "not-existing-file")]
)
) == {"bar"}