diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 3ddd3dd400..4842018a4a 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -17,12 +17,15 @@ def _digest(checksum): return "{}..{}".format(checksum["old"][0:8], checksum["new"][0:8]) -def _show_md(diff, show_hash=False): +def _show_md(diff, show_hash=False, hide_missing=False): from dvc.utils.diff import table header = ["Status", "Hash", "Path"] if show_hash else ["Status", "Path"] rows = [] - for status in ["added", "deleted", "modified"]: + statuses = ["added", "deleted", "modified"] + if not hide_missing: + statuses.append("not in cache") + for status in statuses: entries = diff.get(status, []) if not entries: continue @@ -39,7 +42,7 @@ def _show_md(diff, show_hash=False): class CmdDiff(CmdBase): @staticmethod - def _format(diff): + def _format(diff, hide_missing=False): """ Given a diff structure, generate a string of paths separated by new lines and grouped together by their state. @@ -69,12 +72,16 @@ def _format(diff): "added": colorama.Fore.GREEN, "modified": colorama.Fore.YELLOW, "deleted": colorama.Fore.RED, + "not in cache": colorama.Fore.YELLOW, } summary = {} groups = [] - for state in ["added", "deleted", "modified"]: + states = ["added", "deleted", "modified"] + if not hide_missing: + states.append("not in cache") + for state in states: summary[state] = 0 entries = diff[state] @@ -105,10 +112,16 @@ def _format(diff): ) ) - groups.append( + if not sum(summary.values()): + return None + + fmt = ( "files summary: {added} added, {deleted} deleted," - " {modified} modified".format_map(summary) + " {modified} modified" ) + if not hide_missing: + fmt += ", {not in cache} not in cache" + groups.append(fmt.format_map(summary)) return "\n\n".join(groups) @@ -116,6 +129,9 @@ def run(self): try: diff = self.repo.diff(self.args.a_rev, self.args.b_rev) show_hash = self.args.show_hash + hide_missing = self.args.b_rev or self.args.hide_missing + if hide_missing: + del diff["not in cache"] for key, entries in diff.items(): entries = sorted(entries, key=lambda entry: entry["path"]) @@ -127,9 +143,11 @@ def run(self): if self.args.show_json: logger.info(json.dumps(diff)) elif self.args.show_md: - logger.info(_show_md(diff, show_hash)) + logger.info(_show_md(diff, show_hash, hide_missing)) elif diff: - logger.info(self._format(diff)) + output = self._format(diff, hide_missing) + if output: + logger.info(output) except DvcException: logger.exception("failed to get diff") @@ -178,4 +196,9 @@ def add_parser(subparsers, parent_parser): action="store_true", default=False, ) + diff_parser.add_argument( + "--hide-missing", + help="Hide missing cache file status.", + action="store_true", + ) diff_parser.set_defaults(func=CmdDiff) diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 2fa261683d..17ef236c82 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -36,7 +36,14 @@ def diff(self, a_rev="HEAD", b_rev=None): # Compare paths between the old and new tree. # set() efficiently converts dict keys to a set added = sorted(set(new) - set(old)) - deleted = sorted(set(old) - set(new)) + deleted_or_missing = set(old) - set(new) + if b_rev == "workspace": + # missing status is only applicable when diffing local workspace + # against a commit + missing = sorted(_filter_missing(self, deleted_or_missing)) + else: + missing = [] + deleted = sorted(deleted_or_missing - set(missing)) modified = sorted(set(old) & set(new)) ret = { @@ -47,6 +54,9 @@ def diff(self, a_rev="HEAD", b_rev=None): for path in modified if old[path] != new[path] ], + "not in cache": [ + {"path": path, "hash": old[path]} for path in missing + ], } return ret if any(ret.values()) else {} @@ -104,3 +114,13 @@ def _dir_output_paths(repo_tree, output): yield str(fname), repo_tree.get_hash(fname).value except NoRemoteError: logger.warning("dir cache entry for '%s' is missing", output) + + +def _filter_missing(repo, paths): + repo_tree = RepoTree(repo, stream=True) + for path in paths: + metadata = repo_tree.metadata(path) + if metadata.is_dvc: + out = metadata.outs[0] + if out.status()[str(out)] == "not in cache": + yield path diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 1eda16f7a4..6f296aa35e 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -28,6 +28,7 @@ def test_added(tmp_dir, scm, dvc): "added": [{"path": "file", "hash": digest("text")}], "deleted": [], "modified": [], + "not in cache": [], } @@ -55,6 +56,7 @@ def test_no_cache_entry(tmp_dir, scm, dvc): "hash": {"old": digest("first"), "new": digest("second")}, } ], + "not in cache": [], } @@ -66,6 +68,7 @@ def test_deleted(tmp_dir, scm, dvc): "added": [], "deleted": [{"path": "file", "hash": digest("text")}], "modified": [], + "not in cache": [], } @@ -82,6 +85,7 @@ def test_modified(tmp_dir, scm, dvc): "hash": {"old": digest("first"), "new": digest("second")}, } ], + "not in cache": [], } @@ -98,12 +102,14 @@ def test_refs(tmp_dir, scm, dvc): "added": [], "deleted": [], "modified": [{"path": "file", "hash": {"old": HEAD_1, "new": HEAD}}], + "not in cache": [], } assert dvc.diff("HEAD~2", "HEAD~1") == { "added": [], "deleted": [], "modified": [{"path": "file", "hash": {"old": HEAD_2, "new": HEAD_1}}], + "not in cache": [], } with pytest.raises(DvcException, match=r"unknown Git revision 'missing'"): @@ -134,6 +140,7 @@ def test_directories(tmp_dir, scm, dvc): ], "deleted": [], "modified": [], + "not in cache": [], } assert dvc.diff(":/directory", ":/modify") == { @@ -152,6 +159,7 @@ def test_directories(tmp_dir, scm, dvc): "hash": {"old": digest("2"), "new": digest("two")}, }, ], + "not in cache": [], } assert dvc.diff(":/modify", ":/delete") == { @@ -168,6 +176,7 @@ def test_directories(tmp_dir, scm, dvc): }, } ], + "not in cache": [], } @@ -189,6 +198,20 @@ def test_diff_no_cache(tmp_dir, scm, dvc): assert diff["added"] == [] assert diff["deleted"] == [] assert first(diff["modified"])["path"] == os.path.join("dir", "") + assert diff["not in cache"] == [] + + (tmp_dir / "dir" / "file").unlink() + remove(str(tmp_dir / "dir")) + diff = dvc.diff() + assert diff["added"] == [] + assert diff["deleted"] == [] + assert diff["modified"] == [] + assert diff["not in cache"] == [ + { + "path": os.path.join("dir", ""), + "hash": "f0f7a307d223921557c929f944bf5303.dir", + }, + ] def test_diff_dirty(tmp_dir, scm, dvc): @@ -223,6 +246,7 @@ def test_diff_dirty(tmp_dir, scm, dvc): "path": os.path.join("dir", ""), } ], + "not in cache": [], } diff --git a/tests/unit/command/test_diff.py b/tests/unit/command/test_diff.py index 4e2b49ddb1..1d6986e07b 100644 --- a/tests/unit/command/test_diff.py +++ b/tests/unit/command/test_diff.py @@ -28,6 +28,7 @@ def test_default(mocker, caplog): "added": [{"path": "file", "hash": "00000000"}], "deleted": [], "modified": [], + "not in cache": [], } mocker.patch("dvc.repo.Repo.diff", return_value=diff) @@ -36,7 +37,7 @@ def test_default(mocker, caplog): "Added:\n" " file\n" "\n" - "files summary: 1 added, 0 deleted, 0 modified" + "files summary: 1 added, 0 deleted, 0 modified, 0 not in cache" ) in caplog.text @@ -54,6 +55,7 @@ def test_show_hash(mocker, caplog): {"path": "file2", "hash": {"old": "AAAAAAAA", "new": "BBBBBBBB"}}, {"path": "file1", "hash": {"old": "CCCCCCCC", "new": "DDDDDDDD"}}, ], + "not in cache": [], } mocker.patch("dvc.repo.Repo.diff", return_value=diff) assert 0 == cmd.run() @@ -67,7 +69,7 @@ def test_show_hash(mocker, caplog): " CCCCCCCC..DDDDDDDD file1\n" " AAAAAAAA..BBBBBBBB file2\n" "\n" - "files summary: 0 added, 2 deleted, 2 modified" + "files summary: 0 added, 2 deleted, 2 modified, 0 not in cache" ) in caplog.text @@ -81,6 +83,7 @@ def test_show_json(mocker, caplog): ], "deleted": [], "modified": [], + "not in cache": [], } mocker.patch("dvc.repo.Repo.diff", return_value=diff) @@ -88,6 +91,7 @@ def test_show_json(mocker, caplog): assert '"added": [{"path": "file1"}, {"path": "file2"}]' in caplog.text assert '"deleted": []' in caplog.text assert '"modified": []' in caplog.text + assert '"not in cache": []' in caplog.text def test_show_json_and_hash(mocker, caplog): @@ -102,6 +106,7 @@ def test_show_json_and_hash(mocker, caplog): ], "deleted": [], "modified": [], + "not in cache": [], } mocker.patch("dvc.repo.Repo.diff", return_value=diff) @@ -112,6 +117,28 @@ def test_show_json_and_hash(mocker, caplog): ) assert '"deleted": []' in caplog.text assert '"modified": []' in caplog.text + assert '"not in cache": []' in caplog.text + + +def test_show_json_hide_missing(mocker, caplog): + args = parse_args(["diff", "--show-json", "--hide-missing"]) + cmd = args.func(args) + diff = { + "added": [ + {"path": "file2", "hash": "22222222"}, + {"path": "file1", "hash": "11111111"}, + ], + "deleted": [], + "modified": [], + "not in cache": [], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) + + assert 0 == cmd.run() + assert '"added": [{"path": "file1"}, {"path": "file2"}]' in caplog.text + assert '"deleted": []' in caplog.text + assert '"modified": []' in caplog.text + assert '"not in cache": []' not in caplog.text @pytest.mark.parametrize("show_hash", [None, True, False]) @@ -126,7 +153,7 @@ def test_diff_show_md_and_hash(mock_show_md, mocker, caplog, show_hash): mocker.patch("dvc.repo.Repo.diff", return_value=diff.copy()) assert 0 == cmd.run() - mock_show_md.assert_called_once_with(diff, show_hash) + mock_show_md.assert_called_once_with(diff, show_hash, False) def test_no_changes(mocker, caplog): @@ -166,16 +193,18 @@ def test_show_md(): ], "modified": [{"path": "file"}], "added": [{"path": "file"}], + "not in cache": [{"path": "file2"}], } assert _show_md(diff) == ( - "| Status | Path |\n" - "|----------|----------|\n" - "| added | file |\n" - "| deleted | zoo |\n" - "| deleted | data{sep} |\n" - "| deleted | data{sep}foo |\n" - "| deleted | data{sep}bar |\n" - "| modified | file |\n" + "| Status | Path |\n" + "|--------------|----------|\n" + "| added | file |\n" + "| deleted | zoo |\n" + "| deleted | data{sep} |\n" + "| deleted | data{sep}foo |\n" + "| deleted | data{sep}bar |\n" + "| modified | file |\n" + "| not in cache | file2 |\n" ).format(sep=os.path.sep) @@ -191,14 +220,61 @@ def test_show_md_with_hash(): {"path": "file", "hash": {"old": "AAAAAAAA", "new": "BBBBBBBB"}} ], "added": [{"path": "file", "hash": "00000000"}], + "not in cache": [{"path": "file2", "hash": "12345678"}], } assert _show_md(diff, show_hash=True) == ( - "| Status | Hash | Path |\n" - "|----------|--------------------|----------|\n" - "| added | 00000000 | file |\n" - "| deleted | 22222 | zoo |\n" - "| deleted | XXXXXXXX | data{sep} |\n" - "| deleted | 11111111 | data{sep}foo |\n" - "| deleted | 00000000 | data{sep}bar |\n" - "| modified | AAAAAAAA..BBBBBBBB | file |\n" + "| Status | Hash | Path |\n" + "|--------------|--------------------|----------|\n" + "| added | 00000000 | file |\n" + "| deleted | 22222 | zoo |\n" + "| deleted | XXXXXXXX | data{sep} |\n" + "| deleted | 11111111 | data{sep}foo |\n" + "| deleted | 00000000 | data{sep}bar |\n" + "| modified | AAAAAAAA..BBBBBBBB | file |\n" + "| not in cache | 12345678 | file2 |\n" + ).format(sep=os.path.sep) + + +def test_show_md_hide_missing(): + diff = { + "deleted": [ + {"path": "zoo"}, + {"path": os.path.join("data", "")}, + {"path": os.path.join("data", "foo")}, + {"path": os.path.join("data", "bar")}, + ], + "modified": [{"path": "file"}], + "added": [{"path": "file"}], + "not in cache": [{"path": "file2"}], + } + assert _show_md(diff, hide_missing=True) == ( + "| Status | Path |\n" + "|----------|----------|\n" + "| added | file |\n" + "| deleted | zoo |\n" + "| deleted | data{sep} |\n" + "| deleted | data{sep}foo |\n" + "| deleted | data{sep}bar |\n" + "| modified | file |\n" ).format(sep=os.path.sep) + + +def test_hide_missing(mocker, caplog): + args = parse_args(["diff", "--hide-missing"]) + cmd = args.func(args) + diff = { + "added": [{"path": "file", "hash": "00000000"}], + "deleted": [], + "modified": [], + "not in cache": [], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) + + assert 0 == cmd.run() + assert ( + "Added:\n" + " file\n" + "\n" + "files summary: 1 added, 0 deleted, 0 modified" + ) in caplog.text + assert "not in cache" not in caplog.text