From c166e846bc204ce9c2d306f0eb30203f380df9d5 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 4 Sep 2020 15:45:31 +0900 Subject: [PATCH 1/8] diff: support missing cache status for workspace diff --- dvc/repo/diff.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 2fa261683d..1481048622 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -36,7 +36,14 @@ def diff(self, a_rev="HEAD", b_rev=None): # Compare paths between the old and new tree. # set() efficiently converts dict keys to a set added = sorted(set(new) - set(old)) - deleted = sorted(set(old) - set(new)) + deleted_or_missing = set(old) - set(new) + if b_rev == "workspace": + # missing status is only applicable when diffing local workspace + # against a commit + missing = sorted(_filter_missing(self, deleted_or_missing)) + else: + missing = [] + deleted = sorted(deleted_or_missing - set(missing)) modified = sorted(set(old) & set(new)) ret = { @@ -47,6 +54,9 @@ def diff(self, a_rev="HEAD", b_rev=None): for path in modified if old[path] != new[path] ], + "not in cache": [ + {"path": path, "hash": old[path]} for path in missing + ], } return ret if any(ret.values()) else {} @@ -104,3 +114,13 @@ def _dir_output_paths(repo_tree, output): yield str(fname), repo_tree.get_hash(fname).value except NoRemoteError: logger.warning("dir cache entry for '%s' is missing", output) + + +def _filter_missing(repo, paths): + repo_tree = RepoTree(repo, stream=True) + for path in paths: + metadata = repo_tree.metadata(path) + if metadata.is_output or metadata.part_of_output: + out = metadata.outs[0] + if out.status()[str(out)] == "not in cache": + yield path From c1dd0771211cd2ed6912a592787b953626b51f6f Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 4 Sep 2020 15:48:08 +0900 Subject: [PATCH 2/8] diff: add --hide-missing option to hide cache missing status --- dvc/command/diff.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 3ddd3dd400..a0a499c890 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -17,12 +17,15 @@ def _digest(checksum): return "{}..{}".format(checksum["old"][0:8], checksum["new"][0:8]) -def _show_md(diff, show_hash=False): +def _show_md(diff, show_hash=False, show_missing=True): from dvc.utils.diff import table header = ["Status", "Hash", "Path"] if show_hash else ["Status", "Path"] rows = [] - for status in ["added", "deleted", "modified"]: + statuses = ["added", "deleted", "modified"] + if show_missing: + statuses.append("not in cache") + for status in statuses: entries = diff.get(status, []) if not entries: continue @@ -39,7 +42,7 @@ def _show_md(diff, show_hash=False): class CmdDiff(CmdBase): @staticmethod - def _format(diff): + def _format(diff, show_missing=True): """ Given a diff structure, generate a string of paths separated by new lines and grouped together by their state. @@ -69,12 +72,16 @@ def _format(diff): "added": colorama.Fore.GREEN, "modified": colorama.Fore.YELLOW, "deleted": colorama.Fore.RED, + "not in cache": colorama.Fore.YELLOW, } summary = {} groups = [] - for state in ["added", "deleted", "modified"]: + states = ["added", "deleted", "modified"] + if show_missing: + states.append("not in cache") + for state in states: summary[state] = 0 entries = diff[state] @@ -105,10 +112,13 @@ def _format(diff): ) ) - groups.append( + fmt = ( "files summary: {added} added, {deleted} deleted," - " {modified} modified".format_map(summary) + " {modified} modified" ) + if show_missing: + fmt += ", {not in cache} not in cache" + groups.append(fmt.format_map(summary)) return "\n\n".join(groups) @@ -116,6 +126,9 @@ def run(self): try: diff = self.repo.diff(self.args.a_rev, self.args.b_rev) show_hash = self.args.show_hash + show_missing = self.args.show_missing + if not show_missing: + del diff["not in cache"] for key, entries in diff.items(): entries = sorted(entries, key=lambda entry: entry["path"]) @@ -127,9 +140,9 @@ def run(self): if self.args.show_json: logger.info(json.dumps(diff)) elif self.args.show_md: - logger.info(_show_md(diff, show_hash)) + logger.info(_show_md(diff, show_hash, show_missing)) elif diff: - logger.info(self._format(diff)) + logger.info(self._format(diff, show_missing)) except DvcException: logger.exception("failed to get diff") @@ -178,4 +191,10 @@ def add_parser(subparsers, parent_parser): action="store_true", default=False, ) + diff_parser.add_argument( + "--hide-missing", + help="Hide missing cache file status.", + action="store_false", + dest="show_missing", + ) diff_parser.set_defaults(func=CmdDiff) From feb9972e2031af4c522bffcca1c72147a6037f5f Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 4 Sep 2020 16:11:43 +0900 Subject: [PATCH 3/8] update command tests --- tests/unit/command/test_diff.py | 114 ++++++++++++++++++++++++++------ 1 file changed, 95 insertions(+), 19 deletions(-) diff --git a/tests/unit/command/test_diff.py b/tests/unit/command/test_diff.py index 4e2b49ddb1..2de933c15b 100644 --- a/tests/unit/command/test_diff.py +++ b/tests/unit/command/test_diff.py @@ -28,6 +28,7 @@ def test_default(mocker, caplog): "added": [{"path": "file", "hash": "00000000"}], "deleted": [], "modified": [], + "not in cache": [], } mocker.patch("dvc.repo.Repo.diff", return_value=diff) @@ -36,7 +37,7 @@ def test_default(mocker, caplog): "Added:\n" " file\n" "\n" - "files summary: 1 added, 0 deleted, 0 modified" + "files summary: 1 added, 0 deleted, 0 modified, 0 not in cache" ) in caplog.text @@ -54,6 +55,7 @@ def test_show_hash(mocker, caplog): {"path": "file2", "hash": {"old": "AAAAAAAA", "new": "BBBBBBBB"}}, {"path": "file1", "hash": {"old": "CCCCCCCC", "new": "DDDDDDDD"}}, ], + "not in cache": [], } mocker.patch("dvc.repo.Repo.diff", return_value=diff) assert 0 == cmd.run() @@ -67,7 +69,7 @@ def test_show_hash(mocker, caplog): " CCCCCCCC..DDDDDDDD file1\n" " AAAAAAAA..BBBBBBBB file2\n" "\n" - "files summary: 0 added, 2 deleted, 2 modified" + "files summary: 0 added, 2 deleted, 2 modified, 0 not in cache" ) in caplog.text @@ -81,6 +83,7 @@ def test_show_json(mocker, caplog): ], "deleted": [], "modified": [], + "not in cache": [], } mocker.patch("dvc.repo.Repo.diff", return_value=diff) @@ -88,6 +91,7 @@ def test_show_json(mocker, caplog): assert '"added": [{"path": "file1"}, {"path": "file2"}]' in caplog.text assert '"deleted": []' in caplog.text assert '"modified": []' in caplog.text + assert '"not in cache": []' in caplog.text def test_show_json_and_hash(mocker, caplog): @@ -102,6 +106,7 @@ def test_show_json_and_hash(mocker, caplog): ], "deleted": [], "modified": [], + "not in cache": [], } mocker.patch("dvc.repo.Repo.diff", return_value=diff) @@ -112,6 +117,28 @@ def test_show_json_and_hash(mocker, caplog): ) assert '"deleted": []' in caplog.text assert '"modified": []' in caplog.text + assert '"not in cache": []' in caplog.text + + +def test_show_json_hide_missing(mocker, caplog): + args = parse_args(["diff", "--show-json", "--hide-missing"]) + cmd = args.func(args) + diff = { + "added": [ + {"path": "file2", "hash": "22222222"}, + {"path": "file1", "hash": "11111111"}, + ], + "deleted": [], + "modified": [], + "not in cache": [], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) + + assert 0 == cmd.run() + assert '"added": [{"path": "file1"}, {"path": "file2"}]' in caplog.text + assert '"deleted": []' in caplog.text + assert '"modified": []' in caplog.text + assert '"not in cache": []' not in caplog.text @pytest.mark.parametrize("show_hash", [None, True, False]) @@ -126,7 +153,7 @@ def test_diff_show_md_and_hash(mock_show_md, mocker, caplog, show_hash): mocker.patch("dvc.repo.Repo.diff", return_value=diff.copy()) assert 0 == cmd.run() - mock_show_md.assert_called_once_with(diff, show_hash) + mock_show_md.assert_called_once_with(diff, show_hash, True) def test_no_changes(mocker, caplog): @@ -166,16 +193,18 @@ def test_show_md(): ], "modified": [{"path": "file"}], "added": [{"path": "file"}], + "not in cache": [{"path": "file2"}], } assert _show_md(diff) == ( - "| Status | Path |\n" - "|----------|----------|\n" - "| added | file |\n" - "| deleted | zoo |\n" - "| deleted | data{sep} |\n" - "| deleted | data{sep}foo |\n" - "| deleted | data{sep}bar |\n" - "| modified | file |\n" + "| Status | Path |\n" + "|--------------|----------|\n" + "| added | file |\n" + "| deleted | zoo |\n" + "| deleted | data{sep} |\n" + "| deleted | data{sep}foo |\n" + "| deleted | data{sep}bar |\n" + "| modified | file |\n" + "| not in cache | file2 |\n" ).format(sep=os.path.sep) @@ -191,14 +220,61 @@ def test_show_md_with_hash(): {"path": "file", "hash": {"old": "AAAAAAAA", "new": "BBBBBBBB"}} ], "added": [{"path": "file", "hash": "00000000"}], + "not in cache": [{"path": "file2", "hash": "12345678"}], } assert _show_md(diff, show_hash=True) == ( - "| Status | Hash | Path |\n" - "|----------|--------------------|----------|\n" - "| added | 00000000 | file |\n" - "| deleted | 22222 | zoo |\n" - "| deleted | XXXXXXXX | data{sep} |\n" - "| deleted | 11111111 | data{sep}foo |\n" - "| deleted | 00000000 | data{sep}bar |\n" - "| modified | AAAAAAAA..BBBBBBBB | file |\n" + "| Status | Hash | Path |\n" + "|--------------|--------------------|----------|\n" + "| added | 00000000 | file |\n" + "| deleted | 22222 | zoo |\n" + "| deleted | XXXXXXXX | data{sep} |\n" + "| deleted | 11111111 | data{sep}foo |\n" + "| deleted | 00000000 | data{sep}bar |\n" + "| modified | AAAAAAAA..BBBBBBBB | file |\n" + "| not in cache | 12345678 | file2 |\n" + ).format(sep=os.path.sep) + + +def test_show_md_hide_missing(): + diff = { + "deleted": [ + {"path": "zoo"}, + {"path": os.path.join("data", "")}, + {"path": os.path.join("data", "foo")}, + {"path": os.path.join("data", "bar")}, + ], + "modified": [{"path": "file"}], + "added": [{"path": "file"}], + "not in cache": [{"path": "file2"}], + } + assert _show_md(diff, show_missing=False) == ( + "| Status | Path |\n" + "|----------|----------|\n" + "| added | file |\n" + "| deleted | zoo |\n" + "| deleted | data{sep} |\n" + "| deleted | data{sep}foo |\n" + "| deleted | data{sep}bar |\n" + "| modified | file |\n" ).format(sep=os.path.sep) + + +def test_hide_missing(mocker, caplog): + args = parse_args(["diff", "--hide-missing"]) + cmd = args.func(args) + diff = { + "added": [{"path": "file", "hash": "00000000"}], + "deleted": [], + "modified": [], + "not in cache": [], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) + + assert 0 == cmd.run() + assert ( + "Added:\n" + " file\n" + "\n" + "files summary: 1 added, 0 deleted, 0 modified" + ) in caplog.text + assert "not in cache" not in caplog.text From 3d2de159949c1438f00e725e0519b825db10d5f8 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 4 Sep 2020 16:36:54 +0900 Subject: [PATCH 4/8] update diff func tests --- tests/func/test_diff.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 1eda16f7a4..6f296aa35e 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -28,6 +28,7 @@ def test_added(tmp_dir, scm, dvc): "added": [{"path": "file", "hash": digest("text")}], "deleted": [], "modified": [], + "not in cache": [], } @@ -55,6 +56,7 @@ def test_no_cache_entry(tmp_dir, scm, dvc): "hash": {"old": digest("first"), "new": digest("second")}, } ], + "not in cache": [], } @@ -66,6 +68,7 @@ def test_deleted(tmp_dir, scm, dvc): "added": [], "deleted": [{"path": "file", "hash": digest("text")}], "modified": [], + "not in cache": [], } @@ -82,6 +85,7 @@ def test_modified(tmp_dir, scm, dvc): "hash": {"old": digest("first"), "new": digest("second")}, } ], + "not in cache": [], } @@ -98,12 +102,14 @@ def test_refs(tmp_dir, scm, dvc): "added": [], "deleted": [], "modified": [{"path": "file", "hash": {"old": HEAD_1, "new": HEAD}}], + "not in cache": [], } assert dvc.diff("HEAD~2", "HEAD~1") == { "added": [], "deleted": [], "modified": [{"path": "file", "hash": {"old": HEAD_2, "new": HEAD_1}}], + "not in cache": [], } with pytest.raises(DvcException, match=r"unknown Git revision 'missing'"): @@ -134,6 +140,7 @@ def test_directories(tmp_dir, scm, dvc): ], "deleted": [], "modified": [], + "not in cache": [], } assert dvc.diff(":/directory", ":/modify") == { @@ -152,6 +159,7 @@ def test_directories(tmp_dir, scm, dvc): "hash": {"old": digest("2"), "new": digest("two")}, }, ], + "not in cache": [], } assert dvc.diff(":/modify", ":/delete") == { @@ -168,6 +176,7 @@ def test_directories(tmp_dir, scm, dvc): }, } ], + "not in cache": [], } @@ -189,6 +198,20 @@ def test_diff_no_cache(tmp_dir, scm, dvc): assert diff["added"] == [] assert diff["deleted"] == [] assert first(diff["modified"])["path"] == os.path.join("dir", "") + assert diff["not in cache"] == [] + + (tmp_dir / "dir" / "file").unlink() + remove(str(tmp_dir / "dir")) + diff = dvc.diff() + assert diff["added"] == [] + assert diff["deleted"] == [] + assert diff["modified"] == [] + assert diff["not in cache"] == [ + { + "path": os.path.join("dir", ""), + "hash": "f0f7a307d223921557c929f944bf5303.dir", + }, + ] def test_diff_dirty(tmp_dir, scm, dvc): @@ -223,6 +246,7 @@ def test_diff_dirty(tmp_dir, scm, dvc): "path": os.path.join("dir", ""), } ], + "not in cache": [], } From 8bb038be7d7a8b85164e9ca4e39838ef6fe4f6d8 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 4 Sep 2020 17:00:38 +0900 Subject: [PATCH 5/8] only show missing cache status when diffing local workspace --- dvc/command/diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index a0a499c890..dad6db3fa6 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -126,7 +126,7 @@ def run(self): try: diff = self.repo.diff(self.args.a_rev, self.args.b_rev) show_hash = self.args.show_hash - show_missing = self.args.show_missing + show_missing = False if self.args.b_rev else self.args.show_missing if not show_missing: del diff["not in cache"] From 54ab89aa414f58592340bcad83299567826df761 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 4 Sep 2020 17:33:24 +0900 Subject: [PATCH 6/8] suppress all output if no changes and --hide-missing --- dvc/command/diff.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index dad6db3fa6..ae4bdae408 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -112,6 +112,9 @@ def _format(diff, show_missing=True): ) ) + if not sum(summary.values()): + return None + fmt = ( "files summary: {added} added, {deleted} deleted," " {modified} modified" @@ -142,7 +145,9 @@ def run(self): elif self.args.show_md: logger.info(_show_md(diff, show_hash, show_missing)) elif diff: - logger.info(self._format(diff, show_missing)) + output = self._format(diff, show_missing) + if output: + logger.info(output) except DvcException: logger.exception("failed to get diff") From aee0cdc889f4b17bc10a1ea0aff3e9d6151d1274 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 4 Sep 2020 17:50:39 +0900 Subject: [PATCH 7/8] use hide_missing instead of show_missing --- dvc/command/diff.py | 21 ++++++++++----------- tests/unit/command/test_diff.py | 4 ++-- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index ae4bdae408..4842018a4a 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -17,13 +17,13 @@ def _digest(checksum): return "{}..{}".format(checksum["old"][0:8], checksum["new"][0:8]) -def _show_md(diff, show_hash=False, show_missing=True): +def _show_md(diff, show_hash=False, hide_missing=False): from dvc.utils.diff import table header = ["Status", "Hash", "Path"] if show_hash else ["Status", "Path"] rows = [] statuses = ["added", "deleted", "modified"] - if show_missing: + if not hide_missing: statuses.append("not in cache") for status in statuses: entries = diff.get(status, []) @@ -42,7 +42,7 @@ def _show_md(diff, show_hash=False, show_missing=True): class CmdDiff(CmdBase): @staticmethod - def _format(diff, show_missing=True): + def _format(diff, hide_missing=False): """ Given a diff structure, generate a string of paths separated by new lines and grouped together by their state. @@ -79,7 +79,7 @@ def _format(diff, show_missing=True): groups = [] states = ["added", "deleted", "modified"] - if show_missing: + if not hide_missing: states.append("not in cache") for state in states: summary[state] = 0 @@ -119,7 +119,7 @@ def _format(diff, show_missing=True): "files summary: {added} added, {deleted} deleted," " {modified} modified" ) - if show_missing: + if not hide_missing: fmt += ", {not in cache} not in cache" groups.append(fmt.format_map(summary)) @@ -129,8 +129,8 @@ def run(self): try: diff = self.repo.diff(self.args.a_rev, self.args.b_rev) show_hash = self.args.show_hash - show_missing = False if self.args.b_rev else self.args.show_missing - if not show_missing: + hide_missing = self.args.b_rev or self.args.hide_missing + if hide_missing: del diff["not in cache"] for key, entries in diff.items(): @@ -143,9 +143,9 @@ def run(self): if self.args.show_json: logger.info(json.dumps(diff)) elif self.args.show_md: - logger.info(_show_md(diff, show_hash, show_missing)) + logger.info(_show_md(diff, show_hash, hide_missing)) elif diff: - output = self._format(diff, show_missing) + output = self._format(diff, hide_missing) if output: logger.info(output) @@ -199,7 +199,6 @@ def add_parser(subparsers, parent_parser): diff_parser.add_argument( "--hide-missing", help="Hide missing cache file status.", - action="store_false", - dest="show_missing", + action="store_true", ) diff_parser.set_defaults(func=CmdDiff) diff --git a/tests/unit/command/test_diff.py b/tests/unit/command/test_diff.py index 2de933c15b..1d6986e07b 100644 --- a/tests/unit/command/test_diff.py +++ b/tests/unit/command/test_diff.py @@ -153,7 +153,7 @@ def test_diff_show_md_and_hash(mock_show_md, mocker, caplog, show_hash): mocker.patch("dvc.repo.Repo.diff", return_value=diff.copy()) assert 0 == cmd.run() - mock_show_md.assert_called_once_with(diff, show_hash, True) + mock_show_md.assert_called_once_with(diff, show_hash, False) def test_no_changes(mocker, caplog): @@ -247,7 +247,7 @@ def test_show_md_hide_missing(): "added": [{"path": "file"}], "not in cache": [{"path": "file2"}], } - assert _show_md(diff, show_missing=False) == ( + assert _show_md(diff, hide_missing=True) == ( "| Status | Path |\n" "|----------|----------|\n" "| added | file |\n" From 60e076539e4e335eb99dfc094ae19d0042e31748 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 4 Sep 2020 17:50:59 +0900 Subject: [PATCH 8/8] use is_dvc --- dvc/repo/diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 1481048622..17ef236c82 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -120,7 +120,7 @@ def _filter_missing(repo, paths): repo_tree = RepoTree(repo, stream=True) for path in paths: metadata = repo_tree.metadata(path) - if metadata.is_output or metadata.part_of_output: + if metadata.is_dvc: out = metadata.outs[0] if out.status()[str(out)] == "not in cache": yield path