From 259454024969e428164e208349b6bc880e9bee9e Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 7 Aug 2020 14:55:02 +0900 Subject: [PATCH 1/3] tests: add test to ensure dir hashes are not pushed for incomplete directories --- tests/func/test_remote.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/func/test_remote.py b/tests/func/test_remote.py index 38196ffbdd..c557c9d6f1 100644 --- a/tests/func/test_remote.py +++ b/tests/func/test_remote.py @@ -4,6 +4,7 @@ import configobj import pytest +from funcy import first from mock import patch from dvc.config import Config @@ -412,3 +413,19 @@ def test_protect_local_remote(tmp_dir, dvc, local_remote): assert os.path.exists(remote_cache_file) assert stat.S_IMODE(os.stat(remote_cache_file).st_mode) == 0o444 + + +def test_push_incomplete_dir(tmp_dir, dvc, mocker, local_remote): + (stage,) = tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}}) + remote = dvc.cloud.get_remote("upstream") + + cache = dvc.cache.local + dir_hash = stage.outs[0].checksum + used = stage.get_used_cache(remote=remote) + + # remove one of the cache files for directory + file_hash = first(used.child_keys(cache.tree.scheme, dir_hash)) + remove(cache.tree.hash_to_path_info(file_hash)) + + dvc.push() + assert not remote.tree.exists(remote.tree.hash_to_path_info(dir_hash)) From 10e49fb32d9adcc61c81ec997f4c71ffeeee9445 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 7 Aug 2020 15:15:07 +0900 Subject: [PATCH 2/3] push: do not push .dir file for directory with missing files --- dvc/cache/local.py | 50 +++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/dvc/cache/local.py b/dvc/cache/local.py index 1db76fb239..ec4d1d2673 100644 --- a/dvc/cache/local.py +++ b/dvc/cache/local.py @@ -255,6 +255,7 @@ def _get_plans(self, download, remote, status_info, status): path_infos = [] names = [] hashes = [] + missing = [] for md5, info in Tqdm( status_info.items(), desc="Analysing status", unit="file" ): @@ -263,6 +264,8 @@ def _get_plans(self, download, remote, status_info, status): path_infos.append(remote.tree.hash_to_path_info(md5)) names.append(info["name"]) hashes.append(md5) + elif info["status"] == STATUS_MISSING: + missing.append(md5) if download: to_infos = cache @@ -271,7 +274,7 @@ def _get_plans(self, download, remote, status_info, status): to_infos = path_infos from_infos = cache - return from_infos, to_infos, names, hashes + return (from_infos, to_infos, names, hashes), missing def _process( self, @@ -312,8 +315,10 @@ def _process( download=download, ) - dir_plans = self._get_plans(download, remote, dir_status, status) - file_plans = self._get_plans(download, remote, file_status, status) + dir_plans, _ = self._get_plans(download, remote, dir_status, status) + file_plans, missing_files = self._get_plans( + download, remote, file_status, status + ) total = len(dir_plans[0]) + len(file_plans[0]) if total == 0: @@ -339,19 +344,32 @@ def _process( ) dir_futures = {} for from_info, to_info, name, dir_hash in zip(*dir_plans): - wait_futures = { - future - for file_hash, future in file_futures.items() - if file_hash in dir_contents[dir_hash] - } - dir_futures[dir_hash] = executor.submit( - self._dir_upload, - func, - wait_futures, - from_info, - to_info, - name, - ) + # if for some reason a file contained in this dir is + # missing both locally and in the remote, we want to + # push whatever file content we have, but should not + # push .dir file + for file_hash in missing_files: + if file_hash in dir_contents[dir_hash]: + logger.debug( + "directory '%s' contains missing files," + "skipping .dir file upload", + name, + ) + break + else: + wait_futures = { + future + for file_hash, future in file_futures.items() + if file_hash in dir_contents[dir_hash] + } + dir_futures[dir_hash] = executor.submit( + self._dir_upload, + func, + wait_futures, + from_info, + to_info, + name, + ) fails = sum( future.result() for future in concat( From ee6a9eb06eb08a904a540ae409f78fab93452fe9 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 7 Aug 2020 15:26:41 +0900 Subject: [PATCH 3/3] update test --- tests/func/test_remote.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/func/test_remote.py b/tests/func/test_remote.py index c557c9d6f1..008b804218 100644 --- a/tests/func/test_remote.py +++ b/tests/func/test_remote.py @@ -4,7 +4,6 @@ import configobj import pytest -from funcy import first from mock import patch from dvc.config import Config @@ -424,8 +423,12 @@ def test_push_incomplete_dir(tmp_dir, dvc, mocker, local_remote): used = stage.get_used_cache(remote=remote) # remove one of the cache files for directory - file_hash = first(used.child_keys(cache.tree.scheme, dir_hash)) - remove(cache.tree.hash_to_path_info(file_hash)) + file_hashes = list(used.child_keys(cache.tree.scheme, dir_hash)) + remove(cache.tree.hash_to_path_info(file_hashes[0])) dvc.push() assert not remote.tree.exists(remote.tree.hash_to_path_info(dir_hash)) + assert not remote.tree.exists( + remote.tree.hash_to_path_info(file_hashes[0]) + ) + assert remote.tree.exists(remote.tree.hash_to_path_info(file_hashes[1]))