From 55349af48264c2582682fcd1988228c06b9520a3 Mon Sep 17 00:00:00 2001 From: vyloy Date: Tue, 23 Apr 2019 15:52:31 +0800 Subject: [PATCH 01/20] config: Alterative hashings for data (#1676) --- dvc/config.py | 17 ++++++++++ dvc/output/__init__.py | 5 +-- dvc/output/base.py | 12 +++++-- dvc/remote/base.py | 17 ++++++++++ dvc/remote/local/__init__.py | 18 ++++++++++- dvc/stage.py | 40 ++++++++++++++++++++--- dvc/utils/__init__.py | 20 ++++++------ tests/unit/remote/hash/__init__.py | 0 tests/unit/remote/hash/test_hash.py | 49 +++++++++++++++++++++++++++++ 9 files changed, 159 insertions(+), 19 deletions(-) create mode 100644 tests/unit/remote/hash/__init__.py create mode 100644 tests/unit/remote/hash/test_hash.py diff --git a/dvc/config.py b/dvc/config.py index a2956d3d0b..8c16218af9 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -52,6 +52,20 @@ def supported_loglevel(level): return level in ["info", "debug", "warning", "error"] +def supported_core_hash(hash_names): + """Checks if hash config option has a valid value. + + Args: + hash_names (list/string): hash name(s). + """ + if isinstance(hash_names, str): + hash_names = [h.strip() for h in hash_names.split(",")] + for h in hash_names: + if h not in ["sha256", "md5"]: + return False + return True + + def supported_cloud(cloud): """Checks if obsoleted cloud option has a valid value. @@ -144,6 +158,8 @@ class Config(object): # pylint: disable=too-many-instance-attributes SECTION_CORE_INTERACTIVE = "interactive" SECTION_CORE_ANALYTICS = "analytics" SECTION_CORE_ANALYTICS_SCHEMA = BOOL_SCHEMA + SECTION_CORE_HASH = "hash" + SECTION_CORE_HASH_SCHEMA = supported_core_hash SECTION_CACHE = "cache" SECTION_CACHE_DIR = "dir" @@ -190,6 +206,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes # backward compatibility Optional(SECTION_CORE_CLOUD, default=""): SECTION_CORE_CLOUD_SCHEMA, Optional(SECTION_CORE_STORAGEPATH, default=""): str, + Optional(SECTION_CORE_HASH, default=None): SECTION_CORE_HASH_SCHEMA, } # backward compatibility diff --git a/dvc/output/__init__.py b/dvc/output/__init__.py index 60db6a50c4..a7d45ef933 100644 --- a/dvc/output/__init__.py +++ b/dvc/output/__init__.py @@ -3,6 +3,7 @@ import schema from dvc.scheme import Schemes +from dvc import utils from dvc.utils.compat import urlparse, str from dvc.output.base import OutputBase @@ -15,7 +16,6 @@ from dvc.remote import Remote from dvc.remote.s3 import RemoteS3 from dvc.remote.hdfs import RemoteHDFS -from dvc.remote.local import RemoteLOCAL OUTS = [ OutputHDFS, @@ -42,7 +42,8 @@ # so when a few types of outputs share the same name, we only need # specify it once. CHECKSUM_SCHEMA = { - schema.Optional(RemoteLOCAL.PARAM_CHECKSUM): schema.Or(str, None), + schema.Optional(utils.CHECKSUM_MD5): schema.Or(str, None), + schema.Optional(utils.CHECKSUM_SHA256): schema.Or(str, None), schema.Optional(RemoteS3.PARAM_CHECKSUM): schema.Or(str, None), schema.Optional(RemoteHDFS.PARAM_CHECKSUM): schema.Or(str, None), } diff --git a/dvc/output/base.py b/dvc/output/base.py index 784c12e995..231d1fdb35 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -143,7 +143,11 @@ def sep(self): @property def checksum(self): - return self.info.get(self.remote.PARAM_CHECKSUM) + for h in self.remote.get_hash_list(): + info = self.info.get(h) + if info: + return info + return None @property def is_dir_checksum(self): @@ -154,8 +158,12 @@ def exists(self): return self.remote.exists(self.path_info) def changed_checksum(self): + cs = self.checksum + if cs is None: + self.remote.update_state_checksum(self.path_info) + return True return ( - self.checksum + cs != self.remote.save_info(self.path_info)[ self.remote.PARAM_CHECKSUM ] diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 30f13a8f47..16059e5efc 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -156,6 +156,9 @@ def group(self, name): def cache(self): return getattr(self.repo.cache, self.scheme) + def get_hash_list(self): + raise [self.PARAM_CHECKSUM] + def get_file_checksum(self, path_info): raise NotImplementedError @@ -279,6 +282,20 @@ def get_checksum(self, path_info): return checksum + def update_state_checksum(self, path_info): + if not self.exists(path_info): + return None + + if self.isdir(path_info): + checksum = self.get_dir_checksum(path_info) + else: + checksum = self.get_file_checksum(path_info) + + if checksum: + self.state.save(path_info, checksum) + + return checksum + def save_info(self, path_info): assert path_info.scheme == self.scheme return {self.PARAM_CHECKSUM: self.get_checksum(path_info)} diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index 117d50edf9..f765ea8d7a 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -77,6 +77,19 @@ def __init__(self, repo, config): else: self.cache_types = copy(self.DEFAULT_CACHE_TYPES) + conf = repo.config.config[Config.SECTION_CORE] + if conf: + core_hash = conf.get(Config.SECTION_CORE_HASH, None) + if core_hash: + if isinstance(core_hash, str): + core_hash = [h.strip() for h in core_hash.split(",")] + self.hash = core_hash + else: + self.hash = ["md5"] + else: + self.hash = ["md5"] + RemoteLOCAL.PARAM_CHECKSUM = self.hash[0] + if self.cache_dir is not None and not os.path.exists(self.cache_dir): os.mkdir(self.cache_dir) @@ -206,8 +219,11 @@ def isdir(self, path_info): def walk(self, path_info): return os.walk(path_info.path) + def get_hash_list(self): + return self.hash + def get_file_checksum(self, path_info): - return file_md5(path_info.path)[0] + return file_md5(path_info.path, self.hash[0])[0] def remove(self, path_info): if path_info.scheme != "local": diff --git a/dvc/stage.py b/dvc/stage.py index a5b018230d..291c5d61f8 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals +from dvc.config import Config from dvc.utils.compat import str import copy @@ -120,6 +121,7 @@ class Stage(object): STAGE_FILE_SUFFIX = ".dvc" PARAM_MD5 = "md5" + PARAM_SHA256 = "sha256" PARAM_CMD = "cmd" PARAM_WDIR = "wdir" PARAM_DEPS = "deps" @@ -129,6 +131,7 @@ class Stage(object): SCHEMA = { Optional(PARAM_MD5): Or(str, None), + Optional(PARAM_SHA256): Or(str, None), Optional(PARAM_CMD): Or(str, None), Optional(PARAM_WDIR): Or(str, None), Optional(PARAM_DEPS): Or(And(list, Schema([dependency.SCHEMA])), None), @@ -148,6 +151,7 @@ def __init__( deps=None, outs=None, md5=None, + sha256=None, locked=False, tag=None, state=None, @@ -164,10 +168,23 @@ def __init__( self.outs = outs self.deps = deps self.md5 = md5 + self.sha256 = sha256 self.locked = locked self.tag = tag self._state = state or {} + conf = repo.config.config[Config.SECTION_CORE] + if conf: + core_hash = conf.get(Config.SECTION_CORE_HASH, None) + if core_hash: + if isinstance(core_hash, str): + core_hash = [h.strip() for h in core_hash.split(",")] + self.hash = core_hash + else: + self.hash = ["md5"] + else: + self.hash = ["md5"] + def __repr__(self): return "Stage: '{path}'".format( path=self.relpath if self.path else "No path" @@ -197,7 +214,12 @@ def is_stage_file(path): return os.path.isfile(path) and Stage.is_valid_filename(path) def changed_md5(self): - return self.md5 != self._compute_md5() + for h in self.hash: + checksum = getattr(self, h) + if checksum: + return checksum != self._compute_md5(h) + + return self._compute_md5(self.hash[0]) is not None @property def is_callback(self): @@ -608,6 +630,7 @@ def load(repo, fname): ), cmd=d.get(Stage.PARAM_CMD), md5=d.get(Stage.PARAM_MD5), + sha256=d.get(Stage.PARAM_SHA256), locked=d.get(Stage.PARAM_LOCKED, False), tag=tag, state=state, @@ -625,6 +648,7 @@ def dumpd(self): key: value for key, value in { Stage.PARAM_MD5: self.md5, + Stage.PARAM_SHA256: self.sha256, Stage.PARAM_CMD: self.cmd, Stage.PARAM_WDIR: RemoteBASE.to_posixpath( os.path.relpath(self.wdir, os.path.dirname(self.path)) @@ -653,7 +677,7 @@ def dump(self): self.repo.scm.track_file(os.path.relpath(fname)) - def _compute_md5(self): + def _compute_md5(self, checksum_type=PARAM_MD5): from dvc.output.base import OutputBase d = self.dumpd() @@ -661,6 +685,8 @@ def _compute_md5(self): # NOTE: removing md5 manually in order to not affect md5s in deps/outs if self.PARAM_MD5 in d.keys(): del d[self.PARAM_MD5] + if self.PARAM_SHA256 in d.keys(): + del d[self.PARAM_SHA256] # Ignore the wdir default value. In this case stage file w/o # wdir has the same md5 as a file with the default value specified. @@ -680,8 +706,13 @@ def _compute_md5(self): OutputBase.PARAM_TAGS, OutputBase.PARAM_PERSIST, ], + checksum_type=checksum_type, + ) + logger.debug( + "Computed stage '{}' {}: '{}'".format( + self.relpath, checksum_type, m + ) ) - logger.debug("Computed stage '{}' md5: '{}'".format(self.relpath, m)) return m def save(self): @@ -691,7 +722,8 @@ def save(self): for out in self.outs: out.save() - self.md5 = self._compute_md5() + hash_type = self.hash[0] + setattr(self, hash_type, self._compute_md5(hash_type)) @staticmethod def _changed_entries(entries): diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index 784ba1ab4d..49a0631217 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -27,18 +27,22 @@ LARGE_FILE_SIZE = 1024 * 1024 * 1024 LARGE_DIR_SIZE = 100 +CHECKSUM_MD5 = "md5" +CHECKSUM_SHA256 = "sha256" +CHECKSUM_MAP = {CHECKSUM_MD5: hashlib.md5, CHECKSUM_SHA256: hashlib.sha256} + def dos2unix(data): return data.replace(b"\r\n", b"\n") -def file_md5(fname): +def file_md5(fname, checksum_type=CHECKSUM_MD5): """ get the (md5 hexdigest, md5 digest) of a file """ from dvc.progress import progress from dvc.istextfile import istextfile if os.path.exists(fname): - hash_md5 = hashlib.md5() + hash_md5 = CHECKSUM_MAP[checksum_type]() binary = not istextfile(fname) size = os.path.getsize(fname) bar = False @@ -74,12 +78,6 @@ def file_md5(fname): return (None, None) -def bytes_md5(byts): - hasher = hashlib.md5() - hasher.update(byts) - return hasher.hexdigest() - - def dict_filter(d, exclude=[]): """ Exclude specified keys from a nested dict @@ -105,10 +103,12 @@ def dict_filter(d, exclude=[]): return d -def dict_md5(d, exclude=[]): +def dict_md5(d, exclude=[], checksum_type=CHECKSUM_MD5): filtered = dict_filter(d, exclude) byts = json.dumps(filtered, sort_keys=True).encode("utf-8") - return bytes_md5(byts) + hasher = CHECKSUM_MAP[checksum_type]() + hasher.update(byts) + return hasher.hexdigest() def copyfile(src, dest, no_progress_bar=False, name=None): diff --git a/tests/unit/remote/hash/__init__.py b/tests/unit/remote/hash/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/remote/hash/test_hash.py b/tests/unit/remote/hash/test_hash.py new file mode 100644 index 0000000000..0cb45fdfcc --- /dev/null +++ b/tests/unit/remote/hash/test_hash.py @@ -0,0 +1,49 @@ +from dvc.main import main +from dvc.utils.stage import load_stage_file +from tests.basic_env import TestDvc + + +class TestHash(TestDvc): + def test_compatibility(self): + ret = main(["add", self.FOO]) + self.assertEqual(0, ret) + + ret = main(["config", "core.hash", "sha256,md5"]) + self.assertEqual(0, ret) + + ret = main(["status"]) + self.assertEqual(0, ret) + + ret = main(["status", "-q"]) + self.assertEqual(0, ret) + + ret = main(["add", self.FOO]) + self.assertEqual(0, ret) + + d = load_stage_file("foo.dvc") + self.assertEqual( + d["sha256"], + "6bc1683385ab4218fed1f53195c41d95d186c085a9e929cd77f012d4f541dd34", + ) + + def test_incompatibility(self): + ret = main(["add", self.FOO]) + self.assertEqual(0, ret) + + ret = main(["config", "core.hash", "sha256"]) + self.assertEqual(0, ret) + + ret = main(["status"]) + self.assertEqual(0, ret) + + ret = main(["status", "-q"]) + self.assertEqual(1, ret) + + ret = main(["add", self.FOO]) + self.assertEqual(0, ret) + + d = load_stage_file("foo.dvc") + self.assertEqual( + d["sha256"], + "6bc1683385ab4218fed1f53195c41d95d186c085a9e929cd77f012d4f541dd34", + ) From 6453c16495e534a2e34752dcc63bbf7e6d3fb025 Mon Sep 17 00:00:00 2001 From: vyloy Date: Thu, 25 Apr 2019 15:14:39 +0800 Subject: [PATCH 02/20] config: use hash section for hashing configuration --- dvc/config.py | 13 +++++++++---- dvc/remote/local/__init__.py | 11 ++++++----- dvc/stage.py | 10 +++++----- tests/unit/remote/hash/test_hash.py | 4 ++-- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index 8c16218af9..3a67644168 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -52,7 +52,7 @@ def supported_loglevel(level): return level in ["info", "debug", "warning", "error"] -def supported_core_hash(hash_names): +def supported_hash_local(hash_names): """Checks if hash config option has a valid value. Args: @@ -158,8 +158,13 @@ class Config(object): # pylint: disable=too-many-instance-attributes SECTION_CORE_INTERACTIVE = "interactive" SECTION_CORE_ANALYTICS = "analytics" SECTION_CORE_ANALYTICS_SCHEMA = BOOL_SCHEMA - SECTION_CORE_HASH = "hash" - SECTION_CORE_HASH_SCHEMA = supported_core_hash + + SECTION_HASH = "hash" + SECTION_HASH_LOCAL = "local" + SECTION_HASH_LOCAL_SCHEMA = supported_hash_local + SECTION_HASH_SCHEMA = { + Optional(SECTION_HASH_LOCAL, default=None): SECTION_HASH_LOCAL_SCHEMA + } SECTION_CACHE = "cache" SECTION_CACHE_DIR = "dir" @@ -206,7 +211,6 @@ class Config(object): # pylint: disable=too-many-instance-attributes # backward compatibility Optional(SECTION_CORE_CLOUD, default=""): SECTION_CORE_CLOUD_SCHEMA, Optional(SECTION_CORE_STORAGEPATH, default=""): str, - Optional(SECTION_CORE_HASH, default=None): SECTION_CORE_HASH_SCHEMA, } # backward compatibility @@ -295,6 +299,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes Optional(Regex(SECTION_REMOTE_REGEX)): SECTION_REMOTE_SCHEMA, Optional(SECTION_CACHE, default={}): SECTION_CACHE_SCHEMA, Optional(SECTION_STATE, default={}): SECTION_STATE_SCHEMA, + Optional(SECTION_HASH, default={}): SECTION_HASH_SCHEMA, # backward compatibility Optional(SECTION_AWS, default={}): SECTION_AWS_SCHEMA, Optional(SECTION_GCP, default={}): SECTION_GCP_SCHEMA, diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index f765ea8d7a..8202677ab2 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -6,6 +6,7 @@ from dvc.path.base import PathBASE from dvc.path.local import PathLOCAL from dvc.remote.local.slow_link_detection import slow_link_guard +from dvc import utils from dvc.utils.compat import str, makedirs import os @@ -77,17 +78,17 @@ def __init__(self, repo, config): else: self.cache_types = copy(self.DEFAULT_CACHE_TYPES) - conf = repo.config.config[Config.SECTION_CORE] - if conf: - core_hash = conf.get(Config.SECTION_CORE_HASH, None) + if repo and repo.config and repo.config.config: + conf = repo.config.config[Config.SECTION_HASH] + core_hash = conf.get(Config.SECTION_HASH_LOCAL, None) if core_hash: if isinstance(core_hash, str): core_hash = [h.strip() for h in core_hash.split(",")] self.hash = core_hash else: - self.hash = ["md5"] + self.hash = [utils.CHECKSUM_MD5] else: - self.hash = ["md5"] + self.hash = [utils.CHECKSUM_MD5] RemoteLOCAL.PARAM_CHECKSUM = self.hash[0] if self.cache_dir is not None and not os.path.exists(self.cache_dir): diff --git a/dvc/stage.py b/dvc/stage.py index 291c5d61f8..634c350e49 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -173,17 +173,17 @@ def __init__( self.tag = tag self._state = state or {} - conf = repo.config.config[Config.SECTION_CORE] - if conf: - core_hash = conf.get(Config.SECTION_CORE_HASH, None) + if repo and repo.config and repo.config.config: + conf = repo.config.config[Config.SECTION_HASH] + core_hash = conf.get(Config.SECTION_HASH_LOCAL, None) if core_hash: if isinstance(core_hash, str): core_hash = [h.strip() for h in core_hash.split(",")] self.hash = core_hash else: - self.hash = ["md5"] + self.hash = [self.PARAM_MD5] else: - self.hash = ["md5"] + self.hash = [self.PARAM_MD5] def __repr__(self): return "Stage: '{path}'".format( diff --git a/tests/unit/remote/hash/test_hash.py b/tests/unit/remote/hash/test_hash.py index 0cb45fdfcc..dc916756c1 100644 --- a/tests/unit/remote/hash/test_hash.py +++ b/tests/unit/remote/hash/test_hash.py @@ -8,7 +8,7 @@ def test_compatibility(self): ret = main(["add", self.FOO]) self.assertEqual(0, ret) - ret = main(["config", "core.hash", "sha256,md5"]) + ret = main(["config", "hash.local", "sha256,md5"]) self.assertEqual(0, ret) ret = main(["status"]) @@ -30,7 +30,7 @@ def test_incompatibility(self): ret = main(["add", self.FOO]) self.assertEqual(0, ret) - ret = main(["config", "core.hash", "sha256"]) + ret = main(["config", "hash.local", "sha256"]) self.assertEqual(0, ret) ret = main(["status"]) From 7ebedc4cd72c1c421140458d3db598b0c67e91c1 Mon Sep 17 00:00:00 2001 From: vyloy Date: Wed, 24 Apr 2019 09:07:43 +0800 Subject: [PATCH 03/20] utils: rename file_md5 to file_checksum, dict_md5 to dict_checksum --- dvc/remote/local/__init__.py | 2 +- dvc/stage.py | 4 ++-- dvc/utils/__init__.py | 4 ++-- tests/func/test_add.py | 12 ++++++------ tests/func/test_data_cloud.py | 10 +++++----- tests/func/test_repro.py | 6 ++++-- tests/func/test_run.py | 4 ++-- tests/func/test_state.py | 8 ++++---- tests/func/test_utils.py | 8 ++++++-- 9 files changed, 32 insertions(+), 26 deletions(-) diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index 8202677ab2..b9f61481f9 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -31,7 +31,7 @@ copyfile, to_chunks, tmp_fname, - file_md5, + file_checksum, walk_files, ) from dvc.config import Config diff --git a/dvc/stage.py b/dvc/stage.py index 634c350e49..7d4a3e61e9 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -16,7 +16,7 @@ import dvc.dependency as dependency import dvc.output as output from dvc.exceptions import DvcException -from dvc.utils import dict_md5, fix_env +from dvc.utils import dict_checksum, fix_env from dvc.utils.collections import apply_diff from dvc.utils.stage import load_stage_fd, dump_stage_file @@ -698,7 +698,7 @@ def _compute_md5(self, checksum_type=PARAM_MD5): # NOTE: excluding parameters that don't affect the state of the # pipeline. Not excluding `OutputLOCAL.PARAM_CACHE`, because if # it has changed, we might not have that output in our cache. - m = dict_md5( + m = dict_checksum( d, exclude=[ self.PARAM_LOCKED, diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index 49a0631217..ef969e3501 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -36,7 +36,7 @@ def dos2unix(data): return data.replace(b"\r\n", b"\n") -def file_md5(fname, checksum_type=CHECKSUM_MD5): +def file_checksum(fname, checksum_type=CHECKSUM_MD5): """ get the (md5 hexdigest, md5 digest) of a file """ from dvc.progress import progress from dvc.istextfile import istextfile @@ -103,7 +103,7 @@ def dict_filter(d, exclude=[]): return d -def dict_md5(d, exclude=[], checksum_type=CHECKSUM_MD5): +def dict_checksum(d, exclude=[], checksum_type=CHECKSUM_MD5): filtered = dict_filter(d, exclude) byts = json.dumps(filtered, sort_keys=True).encode("utf-8") hasher = CHECKSUM_MAP[checksum_type]() diff --git a/tests/func/test_add.py b/tests/func/test_add.py index 87a16ecf5f..e4a97f8622 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -16,7 +16,7 @@ from mock import patch from dvc.main import main -from dvc.utils import file_md5, LARGE_DIR_SIZE +from dvc.utils import file_checksum, LARGE_DIR_SIZE from dvc.utils.stage import load_stage_file from dvc.utils.compat import range from dvc.stage import Stage @@ -30,7 +30,7 @@ class TestAdd(TestDvc): def test(self): - md5 = file_md5(self.FOO)[0] + md5 = file_checksum(self.FOO)[0] stages = self.dvc.add(self.FOO) self.assertEqual(len(stages), 1) @@ -253,8 +253,8 @@ def test_dir(self): class TestShouldUpdateStateEntryForFileAfterAdd(TestDvc): def test(self): - file_md5_counter = spy(dvc.remote.local.file_md5) - with patch.object(dvc.remote.local, "file_md5", file_md5_counter): + file_md5_counter = spy(dvc.remote.local.file_checksum) + with patch.object(dvc.remote.local, "file_checksum", file_md5_counter): ret = main(["config", "cache.type", "copy"]) self.assertEqual(ret, 0) @@ -282,8 +282,8 @@ def test(self): class TestShouldUpdateStateEntryForDirectoryAfterAdd(TestDvc): def test(self): - file_md5_counter = spy(dvc.remote.local.file_md5) - with patch.object(dvc.remote.local, "file_md5", file_md5_counter): + file_md5_counter = spy(dvc.remote.local.file_checksum) + with patch.object(dvc.remote.local, "file_checksum", file_md5_counter): ret = main(["config", "cache.type", "copy"]) self.assertEqual(ret, 0) diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index eecd6dbdb6..1bd57d777f 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -27,7 +27,7 @@ RemoteHTTP, ) from dvc.remote.base import STATUS_OK, STATUS_NEW, STATUS_DELETED -from dvc.utils import file_md5 +from dvc.utils import file_checksum from dvc.utils.stage import load_stage_file, dump_stage_file from tests.basic_env import TestDvc @@ -816,8 +816,8 @@ def test(self): self._setup_cloud() self._prepare_repo() - data_md5 = file_md5(self.DATA)[0] - data_sub_md5 = file_md5(self.DATA_SUB)[0] + data_md5 = file_checksum(self.DATA)[0] + data_sub_md5 = file_checksum(self.DATA_SUB)[0] self._test_recursive_push(data_md5, data_sub_md5) @@ -863,8 +863,8 @@ def setUp(self): ret = main(["remote", "add", "remote_name", "-d", cache_dir]) self.assertEqual(0, ret) - checksum_foo = file_md5(self.FOO)[0] - checksum_bar = file_md5(self.BAR)[0] + checksum_foo = file_checksum(self.FOO)[0] + checksum_bar = file_checksum(self.BAR)[0] self.message_header = ( "Some of the cache files do not exist neither locally " "nor on remote. Missing cache files: \n" diff --git a/tests/func/test_repro.py b/tests/func/test_repro.py index a58fcbd6f9..2debc5d1a3 100644 --- a/tests/func/test_repro.py +++ b/tests/func/test_repro.py @@ -18,7 +18,7 @@ from dvc.main import main from dvc.repo import Repo as DvcRepo -from dvc.utils import file_md5 +from dvc.utils import file_checksum from dvc.utils.stage import load_stage_file, dump_stage_file from dvc.remote.local import RemoteLOCAL from dvc.stage import Stage, StageFileDoesNotExistError @@ -623,7 +623,9 @@ def test(self): stages = self.dvc.reproduce(self.foo_stage.path) self.assertTrue(filecmp.cmp(self.FOO, self.BAR, shallow=False)) - self.assertEqual(stages[0].outs[0].checksum, file_md5(self.BAR)[0]) + self.assertEqual( + stages[0].outs[0].checksum, file_checksum(self.BAR)[0] + ) class TestReproChangedDir(TestDvc): diff --git a/tests/func/test_run.py b/tests/func/test_run.py index 047aa30b2e..c5af653952 100644 --- a/tests/func/test_run.py +++ b/tests/func/test_run.py @@ -10,7 +10,7 @@ from dvc.main import main from dvc.output import OutputBase from dvc.repo import Repo as DvcRepo -from dvc.utils import file_md5 +from dvc.utils import file_checksum from dvc.utils.stage import load_stage_file from dvc.system import System from dvc.stage import Stage, StagePathNotFoundError, StagePathNotDirectoryError @@ -53,7 +53,7 @@ def test(self): self.assertEqual(len(stage.deps), len(deps)) self.assertEqual(len(stage.outs), len(outs + outs_no_cache)) self.assertEqual(stage.outs[0].path, outs[0]) - self.assertEqual(stage.outs[0].checksum, file_md5(self.FOO)[0]) + self.assertEqual(stage.outs[0].checksum, file_checksum(self.FOO)[0]) self.assertTrue(stage.path, fname) with self.assertRaises(OutputDuplicationError): diff --git a/tests/func/test_state.py b/tests/func/test_state.py index 70fb0db0bb..1c0cf71aff 100644 --- a/tests/func/test_state.py +++ b/tests/func/test_state.py @@ -5,7 +5,7 @@ from dvc.utils.compat import str from dvc.state import State -from dvc.utils import file_md5 +from dvc.utils import file_checksum from dvc.main import main from dvc.utils.fs import get_inode @@ -16,7 +16,7 @@ class TestState(TestDvc): def test_update(self): path = os.path.join(self.dvc.root_dir, self.FOO) path_info = PathLOCAL(path=path) - md5 = file_md5(path)[0] + md5 = file_checksum(path)[0] state = State(self.dvc, self.dvc.config.config) @@ -32,7 +32,7 @@ def test_update(self): entry_md5 = state.get(path_info) self.assertTrue(entry_md5 is None) - md5 = file_md5(path)[0] + md5 = file_checksum(path)[0] state.save(path_info, md5) entry_md5 = state.get(path_info) @@ -74,7 +74,7 @@ def test_transforms_inode(self, get_inode_mock): self.assertNotEqual(inode, state._to_sqlite(inode)) path = os.path.join(self.dvc.root_dir, self.FOO) - md5 = file_md5(path)[0] + md5 = file_checksum(path)[0] get_inode_mock.side_effect = self.mock_get_inode(path, inode) with state: diff --git a/tests/func/test_utils.py b/tests/func/test_utils.py index 1baf49c225..eb990a842a 100644 --- a/tests/func/test_utils.py +++ b/tests/func/test_utils.py @@ -35,7 +35,9 @@ def test_file_md5_crlf(self): with open("crlf", "wb+") as fd: fd.write(b"a\r\nb\r\nc") - self.assertEqual(utils.file_md5("cr")[0], utils.file_md5("crlf")[0]) + self.assertEqual( + utils.file_checksum("cr")[0], utils.file_checksum("crlf")[0] + ) def test_dict_md5(self): d = { @@ -57,7 +59,9 @@ def test_dict_md5(self): md5 = "8b263fa05ede6c3145c164829be694b4" - self.assertEqual(md5, utils.dict_md5(d, exclude=["metric", "locked"])) + self.assertEqual( + md5, utils.dict_checksum(d, exclude=["metric", "locked"]) + ) def test_boxify(self): expected = ( From a367dc3a1baa21bf135c4ca5327638ee2b4ee29a Mon Sep 17 00:00:00 2001 From: vyloy Date: Wed, 24 Apr 2019 09:10:48 +0800 Subject: [PATCH 04/20] stage: rename _compute_md5 to _compute_checksum --- dvc/stage.py | 8 ++++---- tests/unit/test_stage.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dvc/stage.py b/dvc/stage.py index 7d4a3e61e9..81c89daec7 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -217,9 +217,9 @@ def changed_md5(self): for h in self.hash: checksum = getattr(self, h) if checksum: - return checksum != self._compute_md5(h) + return checksum != self._compute_checksum(h) - return self._compute_md5(self.hash[0]) is not None + return self._compute_checksum(self.hash[0]) is not None @property def is_callback(self): @@ -677,7 +677,7 @@ def dump(self): self.repo.scm.track_file(os.path.relpath(fname)) - def _compute_md5(self, checksum_type=PARAM_MD5): + def _compute_checksum(self, checksum_type=PARAM_MD5): from dvc.output.base import OutputBase d = self.dumpd() @@ -723,7 +723,7 @@ def save(self): out.save() hash_type = self.hash[0] - setattr(self, hash_type, self._compute_md5(hash_type)) + setattr(self, hash_type, self._compute_checksum(hash_type)) @staticmethod def _changed_entries(entries): diff --git a/tests/unit/test_stage.py b/tests/unit/test_stage.py index 013950b046..ae8bf565c3 100644 --- a/tests/unit/test_stage.py +++ b/tests/unit/test_stage.py @@ -16,7 +16,7 @@ def test(self): with mock.patch.object(stage, "dumpd", return_value=d): self.assertEqual( - stage._compute_md5(), "e9521a22111493406ea64a88cda63e0b" + stage._compute_checksum(), "e9521a22111493406ea64a88cda63e0b" ) def test_wdir_default_ignored(self): @@ -33,7 +33,7 @@ def test_wdir_default_ignored(self): with mock.patch.object(stage, "dumpd", return_value=d): self.assertEqual( - stage._compute_md5(), "e9521a22111493406ea64a88cda63e0b" + stage._compute_checksum(), "e9521a22111493406ea64a88cda63e0b" ) def test_wdir_non_default_is_not_ignored(self): @@ -50,7 +50,7 @@ def test_wdir_non_default_is_not_ignored(self): with mock.patch.object(stage, "dumpd", return_value=d): self.assertEqual( - stage._compute_md5(), "2ceba15e87f6848aa756502c1e6d24e9" + stage._compute_checksum(), "2ceba15e87f6848aa756502c1e6d24e9" ) From e8d790ca5851cbedbab0778067510144db71c73d Mon Sep 17 00:00:00 2001 From: vyloy Date: Fri, 26 Apr 2019 08:56:25 +0800 Subject: [PATCH 05/20] remote: fix error --- dvc/remote/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 16059e5efc..d2c64451eb 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -157,7 +157,7 @@ def cache(self): return getattr(self.repo.cache, self.scheme) def get_hash_list(self): - raise [self.PARAM_CHECKSUM] + return [self.PARAM_CHECKSUM] def get_file_checksum(self, path_info): raise NotImplementedError From 52f67de5b03af3ee27622327043bcbf09b33418b Mon Sep 17 00:00:00 2001 From: vyloy Date: Fri, 26 Apr 2019 09:58:58 +0800 Subject: [PATCH 06/20] remote: improve some details about PARAM_CHECKSUM --- dvc/output/base.py | 2 +- dvc/remote/base.py | 34 +++++++++++++++++++--------------- dvc/remote/local/__init__.py | 21 ++++++++++----------- dvc/repo/__init__.py | 2 +- dvc/stage.py | 18 ++++-------------- dvc/utils/__init__.py | 10 +++++++--- 6 files changed, 42 insertions(+), 45 deletions(-) diff --git a/dvc/output/base.py b/dvc/output/base.py index 231d1fdb35..1ba883b0b0 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -165,7 +165,7 @@ def changed_checksum(self): return ( cs != self.remote.save_info(self.path_info)[ - self.remote.PARAM_CHECKSUM + self.remote.get_prefer_hash_type() ] ) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index d2c64451eb..17458d52d3 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -159,6 +159,9 @@ def cache(self): def get_hash_list(self): return [self.PARAM_CHECKSUM] + def get_prefer_hash_type(self): + return self.PARAM_CHECKSUM + def get_file_checksum(self, path_info): raise NotImplementedError @@ -186,7 +189,7 @@ def _collect_dir(self, path_info): dir_info.append( { self.PARAM_RELPATH: relpath, - self.PARAM_CHECKSUM: checksum, + self.get_prefer_hash_type(): checksum, } ) @@ -298,7 +301,7 @@ def update_state_checksum(self, path_info): def save_info(self, path_info): assert path_info.scheme == self.scheme - return {self.PARAM_CHECKSUM: self.get_checksum(path_info)} + return {self.get_prefer_hash_type(): self.get_checksum(path_info)} def changed(self, path_info, checksum_info): """Checks if data has changed. @@ -327,7 +330,7 @@ def changed(self, path_info, checksum_info): logger.debug("'{}' doesn't exist.".format(path_info)) return True - checksum = checksum_info.get(self.PARAM_CHECKSUM) + checksum = checksum_info.get(self.get_prefer_hash_type()) if checksum is None: logger.debug("checksum for '{}' is missing.".format(path_info)) return True @@ -338,7 +341,7 @@ def changed(self, path_info, checksum_info): ) return True - actual = self.save_info(path_info)[self.PARAM_CHECKSUM] + actual = self.save_info(path_info)[self.get_prefer_hash_type()] if checksum != actual: logger.debug( "checksum '{}'(actual '{}') for '{}' has changed.".format( @@ -379,11 +382,10 @@ def _save_dir(self, path_info, checksum): entry_info = copy(path_info) for entry in dir_info: - entry_checksum = entry[self.PARAM_CHECKSUM] + entry_checksum = entry[self.get_prefer_hash_type()] entry_info.path = self.ospath.join( path_info.path, entry[self.PARAM_RELPATH] ) - self._save_file(entry_info, entry_checksum, save_link=False) self.state.save_link(path_info) @@ -412,7 +414,7 @@ def save(self, path_info, checksum_info): self.scheme, ) - checksum = checksum_info[self.PARAM_CHECKSUM] + checksum = checksum_info[self.get_prefer_hash_type()] if not self.changed_cache(checksum): self._checkout(path_info, checksum) return @@ -500,13 +502,15 @@ def all(self): ) def gc(self, cinfos): - from dvc.remote.local import RemoteLOCAL - used = {info[RemoteLOCAL.PARAM_CHECKSUM] for info in cinfos["local"]} + used = { + info[self.repo.cache.local.get_prefer_hash_type()] + for info in cinfos["local"] + } - if self.scheme != "": + if self.scheme != "local": used |= { - info[self.PARAM_CHECKSUM] + info[self.get_prefer_hash_type()] for info in cinfos.get(self.scheme, []) } @@ -557,7 +561,7 @@ def _changed_dir_cache(self, checksum): return True for entry in self.get_dir_cache(checksum): - checksum = entry[self.PARAM_CHECKSUM] + checksum = entry[self.get_prefer_hash_type()] if self.changed_cache_file(checksum): return True @@ -635,12 +639,12 @@ def _checkout_dir( entry_info = copy(path_info) for entry in dir_info: relpath = entry[self.PARAM_RELPATH] - entry_checksum = entry[self.PARAM_CHECKSUM] + entry_checksum = entry[self.get_prefer_hash_type()] entry_cache_info = self.checksum_to_path_info(entry_checksum) entry_info.url = self.ospath.join(path_info.url, relpath) entry_info.path = self.ospath.join(path_info.path, relpath) - entry_checksum_info = {self.PARAM_CHECKSUM: entry_checksum} + entry_checksum_info = {self.get_prefer_hash_type(): entry_checksum} if self.changed(entry_info, entry_checksum_info): if self.exists(entry_info): self.safe_remove(entry_info, force=force) @@ -682,7 +686,7 @@ def checkout( ): raise NotImplementedError - checksum = checksum_info.get(self.PARAM_CHECKSUM) + checksum = checksum_info.get(self.get_prefer_hash_type()) if not checksum: msg = "No checksum info for '{}'." logger.debug(msg.format(str(path_info))) diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index b9f61481f9..68d6316802 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -78,18 +78,14 @@ def __init__(self, repo, config): else: self.cache_types = copy(self.DEFAULT_CACHE_TYPES) + self.hash = [utils.CHECKSUM_MD5] if repo and repo.config and repo.config.config: conf = repo.config.config[Config.SECTION_HASH] - core_hash = conf.get(Config.SECTION_HASH_LOCAL, None) - if core_hash: - if isinstance(core_hash, str): - core_hash = [h.strip() for h in core_hash.split(",")] - self.hash = core_hash - else: - self.hash = [utils.CHECKSUM_MD5] - else: - self.hash = [utils.CHECKSUM_MD5] - RemoteLOCAL.PARAM_CHECKSUM = self.hash[0] + hash_local = conf.get(Config.SECTION_HASH_LOCAL, None) + if hash_local: + if isinstance(hash_local, str): + hash_local = [h.strip() for h in hash_local.split(",")] + self.hash = hash_local if self.cache_dir is not None and not os.path.exists(self.cache_dir): os.mkdir(self.cache_dir) @@ -223,6 +219,9 @@ def walk(self, path_info): def get_hash_list(self): return self.hash + def get_prefer_hash_type(self): + return self.hash[0] + def get_file_checksum(self, path_info): return file_md5(path_info.path, self.hash[0])[0] @@ -333,7 +332,7 @@ def _group(self, checksum_infos, show_checksums=False): by_md5 = {} for info in checksum_infos: - md5 = info[self.PARAM_CHECKSUM] + md5 = info[self.get_prefer_hash_type()] if show_checksums: by_md5[md5] = {"name": md5} diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index eb6ec0396c..83a5dd2409 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -200,7 +200,7 @@ def _collect_dir_cache( info = out.dumpd() ret = [info] r = out.remote - md5 = info[r.PARAM_CHECKSUM] + md5 = info[r.get_prefer_hash_type()] if self.cache.local.changed_cache_file(md5): try: diff --git a/dvc/stage.py b/dvc/stage.py index 81c89daec7..0e1c1c0352 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -from dvc.config import Config from dvc.utils.compat import str import copy @@ -173,17 +172,9 @@ def __init__( self.tag = tag self._state = state or {} - if repo and repo.config and repo.config.config: - conf = repo.config.config[Config.SECTION_HASH] - core_hash = conf.get(Config.SECTION_HASH_LOCAL, None) - if core_hash: - if isinstance(core_hash, str): - core_hash = [h.strip() for h in core_hash.split(",")] - self.hash = core_hash - else: - self.hash = [self.PARAM_MD5] - else: - self.hash = [self.PARAM_MD5] + self.hash = [self.PARAM_MD5] + if repo and repo.cache and repo.cache.local and repo.cache.local.hash: + self.hash = repo.cache.local.hash def __repr__(self): return "Stage: '{path}'".format( @@ -392,7 +383,6 @@ def is_cached(self): """ Checks if this stage has been already ran and stored """ - from dvc.remote.local import RemoteLOCAL from dvc.remote.s3 import RemoteS3 old = Stage.load(self.repo, self.path) @@ -413,7 +403,7 @@ def is_cached(self): new_d.pop(self.PARAM_MD5, None) outs = old_d.get(self.PARAM_OUTS, []) for out in outs: - out.pop(RemoteLOCAL.PARAM_CHECKSUM, None) + out.pop(self.hash[0], None) out.pop(RemoteS3.PARAM_CHECKSUM, None) if old_d != new_d: diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index ef969e3501..e342a67159 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -78,6 +78,12 @@ def file_checksum(fname, checksum_type=CHECKSUM_MD5): return (None, None) +def bytes_checksum(byts, checksum_type=CHECKSUM_MD5): + hasher = CHECKSUM_MAP[checksum_type]() + hasher.update(byts) + return hasher.hexdigest() + + def dict_filter(d, exclude=[]): """ Exclude specified keys from a nested dict @@ -106,9 +112,7 @@ def dict_filter(d, exclude=[]): def dict_checksum(d, exclude=[], checksum_type=CHECKSUM_MD5): filtered = dict_filter(d, exclude) byts = json.dumps(filtered, sort_keys=True).encode("utf-8") - hasher = CHECKSUM_MAP[checksum_type]() - hasher.update(byts) - return hasher.hexdigest() + return bytes_checksum(byts, checksum_type) def copyfile(src, dest, no_progress_bar=False, name=None): From 31616f3f66c349ede9c9b32316db31d2f5137c7a Mon Sep 17 00:00:00 2001 From: vyloy Date: Mon, 29 Apr 2019 15:47:46 +0800 Subject: [PATCH 07/20] config: separate hash library and improve some details --- dvc/config.py | 10 ++- dvc/output/__init__.py | 6 +- dvc/remote/local/__init__.py | 21 +++---- dvc/stage.py | 80 ++++++++++++------------ dvc/utils/__init__.py | 90 --------------------------- dvc/utils/hash/__init__.py | 114 ++++++++++++++++++++++++++++++++++ tests/func/test_add.py | 7 ++- tests/func/test_data_cloud.py | 2 +- tests/func/test_repro.py | 2 +- tests/func/test_run.py | 2 +- tests/func/test_stage.py | 6 +- tests/func/test_state.py | 2 +- tests/func/test_utils.py | 5 +- 13 files changed, 185 insertions(+), 162 deletions(-) create mode 100644 dvc/utils/hash/__init__.py diff --git a/dvc/config.py b/dvc/config.py index 3a67644168..324e913ddd 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from dvc.utils.compat import str, open, urlparse +from dvc.utils import hash import os import errno @@ -58,12 +59,9 @@ def supported_hash_local(hash_names): Args: hash_names (list/string): hash name(s). """ - if isinstance(hash_names, str): - hash_names = [h.strip() for h in hash_names.split(",")] - for h in hash_names: - if h not in ["sha256", "md5"]: - return False - return True + if hash.checksum_types_from_str(hash_names): + return True + return False def supported_cloud(cloud): diff --git a/dvc/output/__init__.py b/dvc/output/__init__.py index a7d45ef933..4d851345b1 100644 --- a/dvc/output/__init__.py +++ b/dvc/output/__init__.py @@ -3,7 +3,7 @@ import schema from dvc.scheme import Schemes -from dvc import utils +from dvc.utils import hash from dvc.utils.compat import urlparse, str from dvc.output.base import OutputBase @@ -42,8 +42,8 @@ # so when a few types of outputs share the same name, we only need # specify it once. CHECKSUM_SCHEMA = { - schema.Optional(utils.CHECKSUM_MD5): schema.Or(str, None), - schema.Optional(utils.CHECKSUM_SHA256): schema.Or(str, None), + schema.Optional(hash.CHECKSUM_MD5): schema.Or(str, None), + schema.Optional(hash.CHECKSUM_SHA256): schema.Or(str, None), schema.Optional(RemoteS3.PARAM_CHECKSUM): schema.Or(str, None), schema.Optional(RemoteHDFS.PARAM_CHECKSUM): schema.Or(str, None), } diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index 68d6316802..17da58120c 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -6,7 +6,7 @@ from dvc.path.base import PathBASE from dvc.path.local import PathLOCAL from dvc.remote.local.slow_link_detection import slow_link_guard -from dvc import utils +from dvc.utils import hash from dvc.utils.compat import str, makedirs import os @@ -25,15 +25,8 @@ STATUS_DELETED, STATUS_MISSING, ) -from dvc.utils import ( - remove, - move, - copyfile, - to_chunks, - tmp_fname, - file_checksum, - walk_files, -) +from dvc.utils import remove, move, copyfile, to_chunks, tmp_fname, walk_files +from dvc.utils.hash import file_checksum from dvc.config import Config from dvc.exceptions import DvcException from dvc.progress import progress @@ -78,14 +71,14 @@ def __init__(self, repo, config): else: self.cache_types = copy(self.DEFAULT_CACHE_TYPES) - self.hash = [utils.CHECKSUM_MD5] + self.hash = [hash.CHECKSUM_MD5] if repo and repo.config and repo.config.config: conf = repo.config.config[Config.SECTION_HASH] hash_local = conf.get(Config.SECTION_HASH_LOCAL, None) if hash_local: - if isinstance(hash_local, str): - hash_local = [h.strip() for h in hash_local.split(",")] - self.hash = hash_local + self.hash = ( + hash.checksum_types_from_str(hash_local) or self.hash + ) if self.cache_dir is not None and not os.path.exists(self.cache_dir): os.mkdir(self.cache_dir) diff --git a/dvc/stage.py b/dvc/stage.py index 0e1c1c0352..d88bd77e66 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -15,7 +15,9 @@ import dvc.dependency as dependency import dvc.output as output from dvc.exceptions import DvcException -from dvc.utils import dict_checksum, fix_env +from dvc.utils import fix_env +from dvc.utils import hash +from dvc.utils.hash import dict_checksum from dvc.utils.collections import apply_diff from dvc.utils.stage import load_stage_fd, dump_stage_file @@ -119,8 +121,6 @@ class Stage(object): STAGE_FILE = "Dvcfile" STAGE_FILE_SUFFIX = ".dvc" - PARAM_MD5 = "md5" - PARAM_SHA256 = "sha256" PARAM_CMD = "cmd" PARAM_WDIR = "wdir" PARAM_DEPS = "deps" @@ -129,8 +129,8 @@ class Stage(object): PARAM_META = "meta" SCHEMA = { - Optional(PARAM_MD5): Or(str, None), - Optional(PARAM_SHA256): Or(str, None), + Optional(hash.CHECKSUM_MD5): Or(str, None), + Optional(hash.CHECKSUM_SHA256): Or(str, None), Optional(PARAM_CMD): Or(str, None), Optional(PARAM_WDIR): Or(str, None), Optional(PARAM_DEPS): Or(And(list, Schema([dependency.SCHEMA])), None), @@ -149,8 +149,7 @@ def __init__( wdir=os.curdir, deps=None, outs=None, - md5=None, - sha256=None, + checksum={}, locked=False, tag=None, state=None, @@ -166,13 +165,12 @@ def __init__( self.wdir = wdir self.outs = outs self.deps = deps - self.md5 = md5 - self.sha256 = sha256 + self.checksum = checksum self.locked = locked self.tag = tag self._state = state or {} - self.hash = [self.PARAM_MD5] + self.hash = [hash.CHECKSUM_MD5] if repo and repo.cache and repo.cache.local and repo.cache.local.hash: self.hash = repo.cache.local.hash @@ -206,9 +204,8 @@ def is_stage_file(path): def changed_md5(self): for h in self.hash: - checksum = getattr(self, h) - if checksum: - return checksum != self._compute_checksum(h) + if h in self.checksum: + return self.checksum[h] != self._compute_checksum(h) return self._compute_checksum(self.hash[0]) is not None @@ -399,8 +396,9 @@ def is_cached(self): # NOTE: need to remove checksums from old dict in order to compare # it to the new one, since the new one doesn't have checksums yet. - old_d.pop(self.PARAM_MD5, None) - new_d.pop(self.PARAM_MD5, None) + for k in hash.CHECKSUM_MAP.keys(): + old_d.pop(k, None) + new_d.pop(k, None) outs = old_d.get(self.PARAM_OUTS, []) for out in outs: out.pop(self.hash[0], None) @@ -610,6 +608,12 @@ def load(repo, fname): Stage.validate(d, fname=os.path.relpath(fname)) path = os.path.abspath(fname) + checksum = {} + for k in hash.CHECKSUM_MAP.keys(): + v = d.get(k) + if v: + checksum[k] = v + stage = Stage( repo=repo, path=path, @@ -619,8 +623,7 @@ def load(repo, fname): ) ), cmd=d.get(Stage.PARAM_CMD), - md5=d.get(Stage.PARAM_MD5), - sha256=d.get(Stage.PARAM_SHA256), + checksum=checksum, locked=d.get(Stage.PARAM_LOCKED, False), tag=tag, state=state, @@ -634,22 +637,22 @@ def load(repo, fname): def dumpd(self): from dvc.remote.base import RemoteBASE - return { - key: value - for key, value in { - Stage.PARAM_MD5: self.md5, - Stage.PARAM_SHA256: self.sha256, - Stage.PARAM_CMD: self.cmd, - Stage.PARAM_WDIR: RemoteBASE.to_posixpath( - os.path.relpath(self.wdir, os.path.dirname(self.path)) - ), - Stage.PARAM_LOCKED: self.locked, - Stage.PARAM_DEPS: [d.dumpd() for d in self.deps], - Stage.PARAM_OUTS: [o.dumpd() for o in self.outs], - Stage.PARAM_META: self._state.get("meta"), - }.items() - if value - } + r = {} + for key, value in self.checksum.items(): + if value: + r[key] = value + for key, value in { + Stage.PARAM_CMD: self.cmd, + Stage.PARAM_WDIR: RemoteBASE.to_posixpath( + os.path.relpath(self.wdir, os.path.dirname(self.path)) + ), + Stage.PARAM_LOCKED: self.locked, + Stage.PARAM_DEPS: [d.dumpd() for d in self.deps], + Stage.PARAM_OUTS: [o.dumpd() for o in self.outs], + Stage.PARAM_META: self._state.get("meta"), + }.items(): + if value: + r[key] = value def dump(self): fname = self.path @@ -667,16 +670,15 @@ def dump(self): self.repo.scm.track_file(os.path.relpath(fname)) - def _compute_checksum(self, checksum_type=PARAM_MD5): + def _compute_checksum(self, checksum_type=hash.CHECKSUM_MD5): from dvc.output.base import OutputBase d = self.dumpd() # NOTE: removing md5 manually in order to not affect md5s in deps/outs - if self.PARAM_MD5 in d.keys(): - del d[self.PARAM_MD5] - if self.PARAM_SHA256 in d.keys(): - del d[self.PARAM_SHA256] + for k in hash.CHECKSUM_MAP.keys(): + if k in d.keys(): + del d[k] # Ignore the wdir default value. In this case stage file w/o # wdir has the same md5 as a file with the default value specified. @@ -713,7 +715,7 @@ def save(self): out.save() hash_type = self.hash[0] - setattr(self, hash_type, self._compute_checksum(hash_type)) + self.checksum = {hash_type: self._compute_checksum(hash_type)} @staticmethod def _changed_entries(entries): diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index e342a67159..44ad983b27 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -8,10 +8,8 @@ import sys import stat import math -import json import errno import shutil -import hashlib import nanotime import time import colorama @@ -24,96 +22,8 @@ logger = logging.getLogger(__name__) LOCAL_CHUNK_SIZE = 1024 * 1024 -LARGE_FILE_SIZE = 1024 * 1024 * 1024 LARGE_DIR_SIZE = 100 -CHECKSUM_MD5 = "md5" -CHECKSUM_SHA256 = "sha256" -CHECKSUM_MAP = {CHECKSUM_MD5: hashlib.md5, CHECKSUM_SHA256: hashlib.sha256} - - -def dos2unix(data): - return data.replace(b"\r\n", b"\n") - - -def file_checksum(fname, checksum_type=CHECKSUM_MD5): - """ get the (md5 hexdigest, md5 digest) of a file """ - from dvc.progress import progress - from dvc.istextfile import istextfile - - if os.path.exists(fname): - hash_md5 = CHECKSUM_MAP[checksum_type]() - binary = not istextfile(fname) - size = os.path.getsize(fname) - bar = False - if size >= LARGE_FILE_SIZE: - bar = True - msg = "Computing md5 for a large file {}. This is only done once." - logger.info(msg.format(os.path.relpath(fname))) - name = os.path.relpath(fname) - total = 0 - - with open(fname, "rb") as fobj: - while True: - data = fobj.read(LOCAL_CHUNK_SIZE) - if not data: - break - - if bar: - total += len(data) - progress.update_target(name, total, size) - - if binary: - chunk = data - else: - chunk = dos2unix(data) - - hash_md5.update(chunk) - - if bar: - progress.finish_target(name) - - return (hash_md5.hexdigest(), hash_md5.digest()) - else: - return (None, None) - - -def bytes_checksum(byts, checksum_type=CHECKSUM_MD5): - hasher = CHECKSUM_MAP[checksum_type]() - hasher.update(byts) - return hasher.hexdigest() - - -def dict_filter(d, exclude=[]): - """ - Exclude specified keys from a nested dict - """ - - if isinstance(d, list): - ret = [] - for e in d: - ret.append(dict_filter(e, exclude)) - return ret - elif isinstance(d, dict): - ret = {} - for k, v in d.items(): - if isinstance(k, builtin_str): - k = str(k) - - assert isinstance(k, str) - if k in exclude: - continue - ret[k] = dict_filter(v, exclude) - return ret - - return d - - -def dict_checksum(d, exclude=[], checksum_type=CHECKSUM_MD5): - filtered = dict_filter(d, exclude) - byts = json.dumps(filtered, sort_keys=True).encode("utf-8") - return bytes_checksum(byts, checksum_type) - def copyfile(src, dest, no_progress_bar=False, name=None): """Copy file with progress bar""" diff --git a/dvc/utils/hash/__init__.py b/dvc/utils/hash/__init__.py new file mode 100644 index 0000000000..97af657640 --- /dev/null +++ b/dvc/utils/hash/__init__.py @@ -0,0 +1,114 @@ +"""Helpers for other modules.""" + +from __future__ import unicode_literals + +from dvc.utils.compat import str, builtin_str, open + +import os +import json +import hashlib +import logging + + +logger = logging.getLogger(__name__) + +LOCAL_CHUNK_SIZE = 1024 * 1024 +LARGE_FILE_SIZE = 1024 * 1024 * 1024 + +CHECKSUM_MD5 = "md5" +CHECKSUM_SHA256 = "sha256" +CHECKSUM_MAP = {CHECKSUM_MD5: hashlib.md5, CHECKSUM_SHA256: hashlib.sha256} + + +def checksum_types_from_str(hash_names): + if isinstance(hash_names, str): + hash_names = [h.strip().lower() for h in hash_names.split(",")] + if not isinstance(hash_names, list) or len(hash_names) < 1: + return None + for h in hash_names: + if h not in CHECKSUM_MAP.keys(): + return None + return hash_names + + +def dos2unix(data): + return data.replace(b"\r\n", b"\n") + + +def file_checksum(fname, checksum_type=CHECKSUM_MD5): + """ get the (md5 hexdigest, md5 digest) of a file """ + from dvc.progress import progress + from dvc.istextfile import istextfile + + if os.path.exists(fname): + hasher = CHECKSUM_MAP[checksum_type]() + binary = not istextfile(fname) + size = os.path.getsize(fname) + bar = False + if size >= LARGE_FILE_SIZE: + bar = True + msg = "Computing md5 for a large file {}. This is only done once." + logger.info(msg.format(os.path.relpath(fname))) + name = os.path.relpath(fname) + total = 0 + + with open(fname, "rb") as fobj: + while True: + data = fobj.read(LOCAL_CHUNK_SIZE) + if not data: + break + + if bar: + total += len(data) + progress.update_target(name, total, size) + + if binary: + chunk = data + else: + chunk = dos2unix(data) + + hasher.update(chunk) + + if bar: + progress.finish_target(name) + + return (hasher.hexdigest(), hasher.digest()) + else: + return (None, None) + + +def bytes_checksum(byts, checksum_type=CHECKSUM_MD5): + hasher = CHECKSUM_MAP[checksum_type]() + hasher.update(byts) + return hasher.hexdigest() + + +def dict_filter(d, exclude=[]): + """ + Exclude specified keys from a nested dict + """ + + if isinstance(d, list): + ret = [] + for e in d: + ret.append(dict_filter(e, exclude)) + return ret + elif isinstance(d, dict): + ret = {} + for k, v in d.items(): + if isinstance(k, builtin_str): + k = str(k) + + assert isinstance(k, str) + if k in exclude: + continue + ret[k] = dict_filter(v, exclude) + return ret + + return d + + +def dict_checksum(d, exclude=[], checksum_type=CHECKSUM_MD5): + filtered = dict_filter(d, exclude) + byts = json.dumps(filtered, sort_keys=True).encode("utf-8") + return bytes_checksum(byts, checksum_type) diff --git a/tests/func/test_add.py b/tests/func/test_add.py index e4a97f8622..d7d802f4c1 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -16,7 +16,8 @@ from mock import patch from dvc.main import main -from dvc.utils import file_checksum, LARGE_DIR_SIZE +from dvc.utils import LARGE_DIR_SIZE +from dvc.utils.hash import file_checksum from dvc.utils.stage import load_stage_file from dvc.utils.compat import range from dvc.stage import Stage @@ -43,7 +44,9 @@ def test(self): self.assertEqual(len(stage.deps), 0) self.assertEqual(stage.cmd, None) self.assertEqual(stage.outs[0].info["md5"], md5) - self.assertEqual(stage.md5, "ee343f2482f53efffc109be83cc976ac") + self.assertEqual( + stage.checksum["md5"], "ee343f2482f53efffc109be83cc976ac" + ) def test_unicode(self): fname = "\xe1" diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 1bd57d777f..6dfff69e43 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -27,7 +27,7 @@ RemoteHTTP, ) from dvc.remote.base import STATUS_OK, STATUS_NEW, STATUS_DELETED -from dvc.utils import file_checksum +from dvc.utils.hash import file_checksum from dvc.utils.stage import load_stage_file, dump_stage_file from tests.basic_env import TestDvc diff --git a/tests/func/test_repro.py b/tests/func/test_repro.py index 2debc5d1a3..03bf8997f7 100644 --- a/tests/func/test_repro.py +++ b/tests/func/test_repro.py @@ -18,7 +18,7 @@ from dvc.main import main from dvc.repo import Repo as DvcRepo -from dvc.utils import file_checksum +from dvc.utils.hash import file_checksum from dvc.utils.stage import load_stage_file, dump_stage_file from dvc.remote.local import RemoteLOCAL from dvc.stage import Stage, StageFileDoesNotExistError diff --git a/tests/func/test_run.py b/tests/func/test_run.py index c5af653952..6233596c34 100644 --- a/tests/func/test_run.py +++ b/tests/func/test_run.py @@ -10,7 +10,7 @@ from dvc.main import main from dvc.output import OutputBase from dvc.repo import Repo as DvcRepo -from dvc.utils import file_checksum +from dvc.utils.hash import file_checksum from dvc.utils.stage import load_stage_file from dvc.system import System from dvc.stage import Stage, StagePathNotFoundError, StagePathNotDirectoryError diff --git a/tests/func/test_stage.py b/tests/func/test_stage.py index 8598411aeb..9f4b749680 100644 --- a/tests/func/test_stage.py +++ b/tests/func/test_stage.py @@ -80,9 +80,11 @@ def test(self): d = load_stage_file(stage.relpath) + from dvc.utils import hash + # NOTE: checking that reloaded stage didn't change its checksum md5 = "11111111111111111111111111111111" - d[stage.PARAM_MD5] = md5 + d[hash.CHECKSUM_MD5] = md5 dump_stage_file(stage.relpath, d) stage = Stage.load(self.dvc, stage.relpath) @@ -90,7 +92,7 @@ def test(self): stage.dump() d = load_stage_file(stage.relpath) - self.assertEqual(d[stage.PARAM_MD5], md5) + self.assertEqual(d[hash.CHECKSUM_MD5], md5) class TestDefaultWorkingDirectory(TestDvc): diff --git a/tests/func/test_state.py b/tests/func/test_state.py index 1c0cf71aff..323e0dfb5b 100644 --- a/tests/func/test_state.py +++ b/tests/func/test_state.py @@ -5,7 +5,7 @@ from dvc.utils.compat import str from dvc.state import State -from dvc.utils import file_checksum +from dvc.utils.hash import file_checksum from dvc.main import main from dvc.utils.fs import get_inode diff --git a/tests/func/test_utils.py b/tests/func/test_utils.py index eb990a842a..de69dc6092 100644 --- a/tests/func/test_utils.py +++ b/tests/func/test_utils.py @@ -3,6 +3,7 @@ import filecmp from dvc import utils +from dvc.utils import hash from tests.basic_env import TestDvc @@ -36,7 +37,7 @@ def test_file_md5_crlf(self): fd.write(b"a\r\nb\r\nc") self.assertEqual( - utils.file_checksum("cr")[0], utils.file_checksum("crlf")[0] + hash.file_checksum("cr")[0], hash.file_checksum("crlf")[0] ) def test_dict_md5(self): @@ -60,7 +61,7 @@ def test_dict_md5(self): md5 = "8b263fa05ede6c3145c164829be694b4" self.assertEqual( - md5, utils.dict_checksum(d, exclude=["metric", "locked"]) + md5, hash.dict_checksum(d, exclude=["metric", "locked"]) ) def test_boxify(self): From e941d7cb1ff837006d5f8fb3c27f0de90787c1bc Mon Sep 17 00:00:00 2001 From: vyloy Date: Tue, 30 Apr 2019 16:07:11 +0800 Subject: [PATCH 08/20] hash: add a test about changed file --- tests/unit/remote/hash/test_hash.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/tests/unit/remote/hash/test_hash.py b/tests/unit/remote/hash/test_hash.py index dc916756c1..51ce7da5c2 100644 --- a/tests/unit/remote/hash/test_hash.py +++ b/tests/unit/remote/hash/test_hash.py @@ -11,19 +11,40 @@ def test_compatibility(self): ret = main(["config", "hash.local", "sha256,md5"]) self.assertEqual(0, ret) - ret = main(["status"]) + ret = main(["status", "-q"]) + self.assertEqual(0, ret) + + ret = main(["add", self.FOO]) + self.assertEqual(0, ret) + + d = load_stage_file("foo.dvc") + self.assertEqual( + d["sha256"], + "6bc1683385ab4218fed1f53195c41d95d186c085a9e929cd77f012d4f541dd34", + ) + + def test_changedCompatibility(self): + ret = main(["add", self.FOO]) + self.assertEqual(0, ret) + + ret = main(["config", "hash.local", "sha256,md5"]) self.assertEqual(0, ret) ret = main(["status", "-q"]) self.assertEqual(0, ret) + self.create(self.FOO, "changed") + + ret = main(["status", "-q"]) + self.assertEqual(1, ret) + ret = main(["add", self.FOO]) self.assertEqual(0, ret) d = load_stage_file("foo.dvc") self.assertEqual( d["sha256"], - "6bc1683385ab4218fed1f53195c41d95d186c085a9e929cd77f012d4f541dd34", + "c1bf464aa6158d7a150c0178a693b5df4244c32a5c91b4eb64dc460dc46a10fc", ) def test_incompatibility(self): @@ -33,9 +54,6 @@ def test_incompatibility(self): ret = main(["config", "hash.local", "sha256"]) self.assertEqual(0, ret) - ret = main(["status"]) - self.assertEqual(0, ret) - ret = main(["status", "-q"]) self.assertEqual(1, ret) From b2ecd9123362a3c4b2ef28285cae00d470323140 Mon Sep 17 00:00:00 2001 From: vyloy Date: Tue, 30 Apr 2019 16:33:36 +0800 Subject: [PATCH 09/20] remote: improvements after more tests --- dvc/remote/local/__init__.py | 19 ++++++++----------- dvc/stage.py | 17 +++++++++-------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index 17da58120c..a5e517624b 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -71,14 +71,11 @@ def __init__(self, repo, config): else: self.cache_types = copy(self.DEFAULT_CACHE_TYPES) - self.hash = [hash.CHECKSUM_MD5] - if repo and repo.config and repo.config.config: - conf = repo.config.config[Config.SECTION_HASH] - hash_local = conf.get(Config.SECTION_HASH_LOCAL, None) - if hash_local: - self.hash = ( - hash.checksum_types_from_str(hash_local) or self.hash - ) + self._hash = [hash.CHECKSUM_MD5] + conf = repo.config.config[Config.SECTION_HASH] + hash_local = conf.get(Config.SECTION_HASH_LOCAL, None) + if hash_local: + self._hash = hash.checksum_types_from_str(hash_local) or self._hash if self.cache_dir is not None and not os.path.exists(self.cache_dir): os.mkdir(self.cache_dir) @@ -210,13 +207,13 @@ def walk(self, path_info): return os.walk(path_info.path) def get_hash_list(self): - return self.hash + return self._hash def get_prefer_hash_type(self): - return self.hash[0] + return self._hash[0] def get_file_checksum(self, path_info): - return file_md5(path_info.path, self.hash[0])[0] + return file_checksum(path_info.path, self.hash[0])[0] def remove(self, path_info): if path_info.scheme != "local": diff --git a/dvc/stage.py b/dvc/stage.py index d88bd77e66..8bcdd7b21d 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -170,10 +170,6 @@ def __init__( self.tag = tag self._state = state or {} - self.hash = [hash.CHECKSUM_MD5] - if repo and repo.cache and repo.cache.local and repo.cache.local.hash: - self.hash = repo.cache.local.hash - def __repr__(self): return "Stage: '{path}'".format( path=self.relpath if self.path else "No path" @@ -203,11 +199,16 @@ def is_stage_file(path): return os.path.isfile(path) and Stage.is_valid_filename(path) def changed_md5(self): - for h in self.hash: + for h in self.repo.cache.local.get_hash_list(): if h in self.checksum: return self.checksum[h] != self._compute_checksum(h) - return self._compute_checksum(self.hash[0]) is not None + return ( + self._compute_checksum( + self.repo.cache.local.get_prefer_hash_type() + ) + is not None + ) @property def is_callback(self): @@ -401,7 +402,7 @@ def is_cached(self): new_d.pop(k, None) outs = old_d.get(self.PARAM_OUTS, []) for out in outs: - out.pop(self.hash[0], None) + out.pop(self.repo.cache.local.get_prefer_hash_type(), None) out.pop(RemoteS3.PARAM_CHECKSUM, None) if old_d != new_d: @@ -714,7 +715,7 @@ def save(self): for out in self.outs: out.save() - hash_type = self.hash[0] + hash_type = self.repo.cache.local.get_prefer_hash_type() self.checksum = {hash_type: self._compute_checksum(hash_type)} @staticmethod From 3f0b14b0b24f7906ffff20d6f836623d26317e3c Mon Sep 17 00:00:00 2001 From: vyloy Date: Tue, 30 Apr 2019 17:57:22 +0800 Subject: [PATCH 10/20] rename variable to fix python2.7 object does not support item assignment error --- dvc/stage.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dvc/stage.py b/dvc/stage.py index 8bcdd7b21d..265d8aae3f 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -655,6 +655,8 @@ def dumpd(self): if value: r[key] = value + return r + def dump(self): fname = self.path From b2a25a0d2a6db1f7b70ebb212c6f79e6e6c3df1d Mon Sep 17 00:00:00 2001 From: vyloy Date: Thu, 2 May 2019 16:08:17 +0800 Subject: [PATCH 11/20] rename mod hash to checksum --- dvc/config.py | 4 ++-- dvc/output/__init__.py | 6 +++--- dvc/remote/local/__init__.py | 10 ++++++---- dvc/stage.py | 16 ++++++++-------- dvc/utils/{hash/__init__.py => checksum.py} | 0 tests/func/test_add.py | 2 +- tests/func/test_data_cloud.py | 2 +- tests/func/test_repro.py | 2 +- tests/func/test_run.py | 2 +- tests/func/test_stage.py | 6 +++--- tests/func/test_state.py | 2 +- tests/func/test_utils.py | 7 ++++--- 12 files changed, 31 insertions(+), 28 deletions(-) rename dvc/utils/{hash/__init__.py => checksum.py} (100%) diff --git a/dvc/config.py b/dvc/config.py index 324e913ddd..f60cc7fb9d 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals from dvc.utils.compat import str, open, urlparse -from dvc.utils import hash +from dvc.utils import checksum as modchecksum import os import errno @@ -59,7 +59,7 @@ def supported_hash_local(hash_names): Args: hash_names (list/string): hash name(s). """ - if hash.checksum_types_from_str(hash_names): + if modchecksum.checksum_types_from_str(hash_names): return True return False diff --git a/dvc/output/__init__.py b/dvc/output/__init__.py index 4d851345b1..35be175f37 100644 --- a/dvc/output/__init__.py +++ b/dvc/output/__init__.py @@ -3,7 +3,7 @@ import schema from dvc.scheme import Schemes -from dvc.utils import hash +from dvc.utils import checksum as modchecksum from dvc.utils.compat import urlparse, str from dvc.output.base import OutputBase @@ -42,8 +42,8 @@ # so when a few types of outputs share the same name, we only need # specify it once. CHECKSUM_SCHEMA = { - schema.Optional(hash.CHECKSUM_MD5): schema.Or(str, None), - schema.Optional(hash.CHECKSUM_SHA256): schema.Or(str, None), + schema.Optional(modchecksum.CHECKSUM_MD5): schema.Or(str, None), + schema.Optional(modchecksum.CHECKSUM_SHA256): schema.Or(str, None), schema.Optional(RemoteS3.PARAM_CHECKSUM): schema.Or(str, None), schema.Optional(RemoteHDFS.PARAM_CHECKSUM): schema.Or(str, None), } diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index a5e517624b..fd38e997e4 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -6,7 +6,7 @@ from dvc.path.base import PathBASE from dvc.path.local import PathLOCAL from dvc.remote.local.slow_link_detection import slow_link_guard -from dvc.utils import hash +from dvc.utils import checksum as modchecksum from dvc.utils.compat import str, makedirs import os @@ -26,7 +26,7 @@ STATUS_MISSING, ) from dvc.utils import remove, move, copyfile, to_chunks, tmp_fname, walk_files -from dvc.utils.hash import file_checksum +from dvc.utils.checksum import file_checksum from dvc.config import Config from dvc.exceptions import DvcException from dvc.progress import progress @@ -71,11 +71,13 @@ def __init__(self, repo, config): else: self.cache_types = copy(self.DEFAULT_CACHE_TYPES) - self._hash = [hash.CHECKSUM_MD5] + self._hash = [modchecksum.CHECKSUM_MD5] conf = repo.config.config[Config.SECTION_HASH] hash_local = conf.get(Config.SECTION_HASH_LOCAL, None) if hash_local: - self._hash = hash.checksum_types_from_str(hash_local) or self._hash + self._hash = ( + modchecksum.checksum_types_from_str(hash_local) or self._hash + ) if self.cache_dir is not None and not os.path.exists(self.cache_dir): os.mkdir(self.cache_dir) diff --git a/dvc/stage.py b/dvc/stage.py index 265d8aae3f..8785ecf0c6 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -16,8 +16,8 @@ import dvc.output as output from dvc.exceptions import DvcException from dvc.utils import fix_env -from dvc.utils import hash -from dvc.utils.hash import dict_checksum +from dvc.utils import checksum as modchecksum +from dvc.utils.checksum import dict_checksum from dvc.utils.collections import apply_diff from dvc.utils.stage import load_stage_fd, dump_stage_file @@ -129,8 +129,8 @@ class Stage(object): PARAM_META = "meta" SCHEMA = { - Optional(hash.CHECKSUM_MD5): Or(str, None), - Optional(hash.CHECKSUM_SHA256): Or(str, None), + Optional(modchecksum.CHECKSUM_MD5): Or(str, None), + Optional(modchecksum.CHECKSUM_SHA256): Or(str, None), Optional(PARAM_CMD): Or(str, None), Optional(PARAM_WDIR): Or(str, None), Optional(PARAM_DEPS): Or(And(list, Schema([dependency.SCHEMA])), None), @@ -397,7 +397,7 @@ def is_cached(self): # NOTE: need to remove checksums from old dict in order to compare # it to the new one, since the new one doesn't have checksums yet. - for k in hash.CHECKSUM_MAP.keys(): + for k in modchecksum.CHECKSUM_MAP.keys(): old_d.pop(k, None) new_d.pop(k, None) outs = old_d.get(self.PARAM_OUTS, []) @@ -610,7 +610,7 @@ def load(repo, fname): path = os.path.abspath(fname) checksum = {} - for k in hash.CHECKSUM_MAP.keys(): + for k in modchecksum.CHECKSUM_MAP.keys(): v = d.get(k) if v: checksum[k] = v @@ -673,13 +673,13 @@ def dump(self): self.repo.scm.track_file(os.path.relpath(fname)) - def _compute_checksum(self, checksum_type=hash.CHECKSUM_MD5): + def _compute_checksum(self, checksum_type=modchecksum.CHECKSUM_MD5): from dvc.output.base import OutputBase d = self.dumpd() # NOTE: removing md5 manually in order to not affect md5s in deps/outs - for k in hash.CHECKSUM_MAP.keys(): + for k in modchecksum.CHECKSUM_MAP.keys(): if k in d.keys(): del d[k] diff --git a/dvc/utils/hash/__init__.py b/dvc/utils/checksum.py similarity index 100% rename from dvc/utils/hash/__init__.py rename to dvc/utils/checksum.py diff --git a/tests/func/test_add.py b/tests/func/test_add.py index d7d802f4c1..55abeb477e 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -17,7 +17,7 @@ from dvc.main import main from dvc.utils import LARGE_DIR_SIZE -from dvc.utils.hash import file_checksum +from dvc.utils.checksum import file_checksum from dvc.utils.stage import load_stage_file from dvc.utils.compat import range from dvc.stage import Stage diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 6dfff69e43..e1353a94ad 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -27,7 +27,7 @@ RemoteHTTP, ) from dvc.remote.base import STATUS_OK, STATUS_NEW, STATUS_DELETED -from dvc.utils.hash import file_checksum +from dvc.utils.checksum import file_checksum from dvc.utils.stage import load_stage_file, dump_stage_file from tests.basic_env import TestDvc diff --git a/tests/func/test_repro.py b/tests/func/test_repro.py index 03bf8997f7..dc1111cb98 100644 --- a/tests/func/test_repro.py +++ b/tests/func/test_repro.py @@ -18,7 +18,7 @@ from dvc.main import main from dvc.repo import Repo as DvcRepo -from dvc.utils.hash import file_checksum +from dvc.utils.checksum import file_checksum from dvc.utils.stage import load_stage_file, dump_stage_file from dvc.remote.local import RemoteLOCAL from dvc.stage import Stage, StageFileDoesNotExistError diff --git a/tests/func/test_run.py b/tests/func/test_run.py index 6233596c34..e595333f3a 100644 --- a/tests/func/test_run.py +++ b/tests/func/test_run.py @@ -10,7 +10,7 @@ from dvc.main import main from dvc.output import OutputBase from dvc.repo import Repo as DvcRepo -from dvc.utils.hash import file_checksum +from dvc.utils.checksum import file_checksum from dvc.utils.stage import load_stage_file from dvc.system import System from dvc.stage import Stage, StagePathNotFoundError, StagePathNotDirectoryError diff --git a/tests/func/test_stage.py b/tests/func/test_stage.py index 9f4b749680..328f88e631 100644 --- a/tests/func/test_stage.py +++ b/tests/func/test_stage.py @@ -80,11 +80,11 @@ def test(self): d = load_stage_file(stage.relpath) - from dvc.utils import hash + from dvc.utils import checksum as modchecksum # NOTE: checking that reloaded stage didn't change its checksum md5 = "11111111111111111111111111111111" - d[hash.CHECKSUM_MD5] = md5 + d[modchecksum.CHECKSUM_MD5] = md5 dump_stage_file(stage.relpath, d) stage = Stage.load(self.dvc, stage.relpath) @@ -92,7 +92,7 @@ def test(self): stage.dump() d = load_stage_file(stage.relpath) - self.assertEqual(d[hash.CHECKSUM_MD5], md5) + self.assertEqual(d[modchecksum.CHECKSUM_MD5], md5) class TestDefaultWorkingDirectory(TestDvc): diff --git a/tests/func/test_state.py b/tests/func/test_state.py index 323e0dfb5b..779a902a84 100644 --- a/tests/func/test_state.py +++ b/tests/func/test_state.py @@ -5,7 +5,7 @@ from dvc.utils.compat import str from dvc.state import State -from dvc.utils.hash import file_checksum +from dvc.utils.checksum import file_checksum from dvc.main import main from dvc.utils.fs import get_inode diff --git a/tests/func/test_utils.py b/tests/func/test_utils.py index de69dc6092..a80d8da485 100644 --- a/tests/func/test_utils.py +++ b/tests/func/test_utils.py @@ -3,7 +3,7 @@ import filecmp from dvc import utils -from dvc.utils import hash +from dvc.utils import checksum as modchecksum from tests.basic_env import TestDvc @@ -37,7 +37,8 @@ def test_file_md5_crlf(self): fd.write(b"a\r\nb\r\nc") self.assertEqual( - hash.file_checksum("cr")[0], hash.file_checksum("crlf")[0] + modchecksum.file_checksum("cr")[0], + modchecksum.file_checksum("crlf")[0], ) def test_dict_md5(self): @@ -61,7 +62,7 @@ def test_dict_md5(self): md5 = "8b263fa05ede6c3145c164829be694b4" self.assertEqual( - md5, hash.dict_checksum(d, exclude=["metric", "locked"]) + md5, modchecksum.dict_checksum(d, exclude=["metric", "locked"]) ) def test_boxify(self): From de52cf07f14608602012a44779105422806cdac6 Mon Sep 17 00:00:00 2001 From: vyloy Date: Sun, 5 May 2019 10:06:54 +0800 Subject: [PATCH 12/20] refactor the code to deal with checksum type --- dvc/output/base.py | 27 +++++++++------- dvc/remote/base.py | 76 ++++++++++++++++++++++++---------------------- dvc/state.py | 52 ++++++++++++++++++++++--------- 3 files changed, 94 insertions(+), 61 deletions(-) diff --git a/dvc/output/base.py b/dvc/output/base.py index 1ba883b0b0..a358eb6c80 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -141,10 +141,17 @@ def cache_path(self): def sep(self): return "/" + @property + def checksum_type(self): + for t in self.remote.get_hash_list(): + if t in self.info: + return t + return None + @property def checksum(self): - for h in self.remote.get_hash_list(): - info = self.info.get(h) + for t in self.remote.get_hash_list(): + info = self.info.get(t) if info: return info return None @@ -158,22 +165,20 @@ def exists(self): return self.remote.exists(self.path_info) def changed_checksum(self): - cs = self.checksum - if cs is None: - self.remote.update_state_checksum(self.path_info) - return True return ( - cs - != self.remote.save_info(self.path_info)[ - self.remote.get_prefer_hash_type() - ] + self.checksum + != tuple( + self.remote.save_info( + self.path_info, self.checksum_type + ).items() + )[0][1] ) def changed_cache(self): if not self.use_cache or not self.checksum: return True - return self.cache.changed_cache(self.checksum) + return self.cache.changed_cache(self.checksum, self.checksum_type) def status(self): if self.checksum and self.use_cache and self.changed_cache(): diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 17458d52d3..e6c2976572 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -204,8 +204,8 @@ def get_dir_checksum(self, path_info): if self.cache.changed_cache_file(checksum): self.cache.move(from_info, to_info) - self.state.save(path_info, checksum) - self.state.save(to_info, checksum) + self.state.save(path_info, checksum, self.get_prefer_hash_type()) + self.state.save(to_info, checksum, self.get_prefer_hash_type()) return checksum @@ -267,11 +267,13 @@ def load_dir_cache(self, checksum): def is_dir_checksum(cls, checksum): return checksum.endswith(cls.CHECKSUM_DIR_SUFFIX) - def get_checksum(self, path_info): + def get_checksum(self, path_info, checksum_type=None): if not self.exists(path_info): return None - checksum = self.state.get(path_info) + if checksum_type is None: + checksum_type = self.get_prefer_hash_type() + checksum = self.state.get(path_info, checksum_type) if checksum: return checksum @@ -281,27 +283,13 @@ def get_checksum(self, path_info): checksum = self.get_file_checksum(path_info) if checksum: - self.state.save(path_info, checksum) - - return checksum - - def update_state_checksum(self, path_info): - if not self.exists(path_info): - return None - - if self.isdir(path_info): - checksum = self.get_dir_checksum(path_info) - else: - checksum = self.get_file_checksum(path_info) - - if checksum: - self.state.save(path_info, checksum) + self.state.save(path_info, checksum, checksum_type) return checksum def save_info(self, path_info): assert path_info.scheme == self.scheme - return {self.get_prefer_hash_type(): self.get_checksum(path_info)} + return {self.checksum_type(): self.get_checksum(path_info)} def changed(self, path_info, checksum_info): """Checks if data has changed. @@ -330,18 +318,23 @@ def changed(self, path_info, checksum_info): logger.debug("'{}' doesn't exist.".format(path_info)) return True - checksum = checksum_info.get(self.get_prefer_hash_type()) + checksum = None + for t in self.get_hash_list(): + checksum = checksum_info.get(t) + if checksum: + break + if checksum is None: logger.debug("checksum for '{}' is missing.".format(path_info)) return True - if self.changed_cache(checksum): + if self.changed_cache(checksum, t): logger.debug( "cache for '{}'('{}') has changed.".format(path_info, checksum) ) return True - actual = self.save_info(path_info)[self.get_prefer_hash_type()] + actual = self.save_info(path_info, t)[t] if checksum != actual: logger.debug( "checksum '{}'(actual '{}') for '{}' has changed.".format( @@ -373,8 +366,8 @@ def _save_file(self, path_info, checksum, save_link=True): # we need to update path and cache, since in case of reflink, # or copy cache type moving original file results in updates on # next executed command, which causes md5 recalculation - self.state.save(path_info, checksum) - self.state.save(cache_info, checksum) + self.state.save(path_info, checksum, self.get_prefer_hash_type()) + self.state.save(cache_info, checksum, self.get_prefer_hash_type()) def _save_dir(self, path_info, checksum): cache_info = self.checksum_to_path_info(checksum) @@ -389,8 +382,8 @@ def _save_dir(self, path_info, checksum): self._save_file(entry_info, entry_checksum, save_link=False) self.state.save_link(path_info) - self.state.save(cache_info, checksum) - self.state.save(path_info, checksum) + self.state.save(cache_info, checksum, self.get_prefer_hash_type()) + self.state.save(path_info, checksum, self.get_prefer_hash_type()) def is_empty(self, path_info): return False @@ -415,7 +408,7 @@ def save(self, path_info, checksum_info): ) checksum = checksum_info[self.get_prefer_hash_type()] - if not self.changed_cache(checksum): + if not self.changed_cache(checksum, self.get_prefer_hash_type()): self._checkout(path_info, checksum) return @@ -523,7 +516,7 @@ def gc(self, cinfos): removed = True return removed - def changed_cache_file(self, checksum): + def changed_cache_file(self, checksum, checksum_type=None): """Compare the given checksum with the (corresponding) actual one. - Use `State` as a cache for computed checksums @@ -535,8 +528,10 @@ def changed_cache_file(self, checksum): - Remove the file from cache if it doesn't match the actual checksum """ + if checksum_type is None: + checksum_type = self.get_prefer_hash_type() cache_info = self.checksum_to_path_info(checksum) - actual = self.get_checksum(cache_info) + actual = self.get_checksum(cache_info, checksum_type) logger.debug( "cache '{}' expected '{}' actual '{}'".format( @@ -556,21 +551,30 @@ def changed_cache_file(self, checksum): return True - def _changed_dir_cache(self, checksum): + def _changed_cache_dir(self): + # NOTE: only implemented for RemoteLOCAL + return True + + def _changed_dir_cache(self, checksum, checksum_type): + if not self._changed_cache_dir(): + return False + if self.changed_cache_file(checksum): return True for entry in self.get_dir_cache(checksum): - checksum = entry[self.get_prefer_hash_type()] - if self.changed_cache_file(checksum): + checksum = entry[checksum_type] + if self.changed_cache_file(checksum, checksum_type): return True return False - def changed_cache(self, checksum): + def changed_cache(self, checksum, checksum_type=None): + if checksum_type is None: + checksum_type = self.get_prefer_hash_type() if self.is_dir_checksum(checksum): - return self._changed_dir_cache(checksum) - return self.changed_cache_file(checksum) + return self._changed_dir_cache(checksum, checksum_type) + return self.changed_cache_file(checksum, checksum_type) def cache_exists(self, checksums): # NOTE: The reason for such an odd logic is that most of the remotes diff --git a/dvc/state.py b/dvc/state.py index 8e46b31cd0..a070b48af5 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -5,9 +5,11 @@ import os import sqlite3 import logging +import json from dvc.config import Config from dvc.utils import remove, current_timestamp +from dvc.utils import checksum as modchecksum from dvc.exceptions import DvcException from dvc.utils.fs import get_mtime_and_size, get_inode @@ -30,10 +32,10 @@ def __init__(self, dvc_version, expected, actual): class StateBase(object): - def save(self, path_info, checksum): + def save(self, path_info, checksum, checksum_type): pass - def get(self, path_info): + def get(self, path_info, checksum_type): return None def save_link(self, path_info): @@ -59,7 +61,7 @@ class State(StateBase): # pylint: disable=too-many-instance-attributes "inode INTEGER PRIMARY KEY, " "mtime TEXT NOT NULL, " "size TEXT NOT NULL, " - "md5 TEXT NOT NULL, " + "checksums TEXT NOT NULL, " "timestamp TEXT NOT NULL" ) @@ -298,7 +300,7 @@ def _update_state_for_path_changed( cmd = ( "UPDATE {} SET " 'mtime = "{}", size = "{}", ' - 'md5 = "{}", timestamp = "{}" ' + "checksums = '{}', timestamp = \"{}\" " "WHERE inode = {}" ) self._execute( @@ -318,8 +320,8 @@ def _insert_new_state_record( assert checksum is not None cmd = ( - "INSERT INTO {}(inode, mtime, size, md5, timestamp) " - 'VALUES ({}, "{}", "{}", "{}", "{}")' + "INSERT INTO {}(inode, mtime, size, checksums, timestamp) " + 'VALUES ({}, "{}", "{}", \'{}\', "{}")' ) self._execute( cmd.format( @@ -334,7 +336,10 @@ def _insert_new_state_record( self.inserts += 1 def get_state_record_for_inode(self, inode): - cmd = "SELECT mtime, size, md5, timestamp from {} " "WHERE inode={}" + cmd = ( + "SELECT mtime, size, checksums, timestamp from {} " + "WHERE inode={}" + ) cmd = cmd.format(self.STATE_TABLE, self._to_sqlite(inode)) self._execute(cmd) results = self._fetchall() @@ -344,12 +349,15 @@ def get_state_record_for_inode(self, inode): return results[0] return None - def save(self, path_info, checksum): + def save( + self, path_info, checksum, checksum_type=modchecksum.CHECKSUM_MD5 + ): """Save checksum for the specified path info. Args: path_info (dict): path_info to save checksum for. checksum (str): checksum to save. + checksum_type (str): checksum type to save. """ assert path_info.scheme == "local" assert checksum is not None @@ -360,23 +368,33 @@ def save(self, path_info, checksum): actual_mtime, actual_size = get_mtime_and_size(path) actual_inode = get_inode(path) + d = {checksum_type: checksum} + checksum_json = json.dumps(d) + existing_record = self.get_state_record_for_inode(actual_inode) if not existing_record: self._insert_new_state_record( - actual_inode, actual_mtime, actual_size, checksum + actual_inode, actual_mtime, actual_size, checksum_json ) return + existing_json = existing_record[2] + if existing_json: + checksums = json.loads(existing_json) + checksums[checksum_type] = checksum + checksum_json = json.dumps(checksums) + self._update_state_for_path_changed( - actual_inode, actual_mtime, actual_size, checksum + actual_inode, actual_mtime, actual_size, checksum_json ) - def get(self, path_info): + def get(self, path_info, checksum_type=modchecksum.CHECKSUM_MD5): """Gets the checksum for the specified path info. Checksum will be retrieved from the state database if available. Args: path_info (dict): path info to get the checksum for. + checksum_type (str): type to get the checksum for. Returns: str or None: checksum for the specified path info or None if it @@ -395,12 +413,17 @@ def get(self, path_info): if not existing_record: return None - mtime, size, checksum, _ = existing_record + mtime, size, checksum_json, _ = existing_record if self._file_metadata_changed(actual_mtime, mtime, actual_size, size): return None + if checksum_json: + checksums = json.loads(checksum_json) + if checksum_type not in checksums: + return None + self._update_state_record_timestamp_for_inode(actual_inode) - return checksum + return checksums[checksum_type] def save_link(self, path_info): """Adds the specified path to the list of links created by dvc. This @@ -465,7 +488,8 @@ def _update_cache_directory_state(self): inode = get_inode(cache_path) cmd = ( - "INSERT OR REPLACE INTO {}(inode, size, mtime, timestamp, md5) " + "INSERT OR REPLACE INTO {}" + "(inode, size, mtime, timestamp, checksums) " 'VALUES ({}, "{}", "{}", "{}", "")'.format( self.STATE_TABLE, self._to_sqlite(inode), From 90d65a2556dfe0a18eb670a381707865f26abfc6 Mon Sep 17 00:00:00 2001 From: vyloy Date: Sun, 5 May 2019 10:11:52 +0800 Subject: [PATCH 13/20] rename legacy function about md5 to checksum --- dvc/config.py | 16 ++++++------ dvc/output/base.py | 4 +-- dvc/remote/base.py | 38 +++++++++++++++-------------- dvc/remote/local/__init__.py | 16 ++++++------ dvc/repo/__init__.py | 2 +- dvc/stage.py | 26 +++++++++++--------- tests/func/test_stage.py | 2 +- tests/unit/remote/hash/test_hash.py | 6 ++--- 8 files changed, 59 insertions(+), 51 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index f60cc7fb9d..f89cb2386e 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -53,7 +53,7 @@ def supported_loglevel(level): return level in ["info", "debug", "warning", "error"] -def supported_hash_local(hash_names): +def supported_checksum_local(hash_names): """Checks if hash config option has a valid value. Args: @@ -157,11 +157,13 @@ class Config(object): # pylint: disable=too-many-instance-attributes SECTION_CORE_ANALYTICS = "analytics" SECTION_CORE_ANALYTICS_SCHEMA = BOOL_SCHEMA - SECTION_HASH = "hash" - SECTION_HASH_LOCAL = "local" - SECTION_HASH_LOCAL_SCHEMA = supported_hash_local - SECTION_HASH_SCHEMA = { - Optional(SECTION_HASH_LOCAL, default=None): SECTION_HASH_LOCAL_SCHEMA + SECTION_CHECKSUM = "checksum" + SECTION_CHECKSUM_LOCAL = "local" + SECTION_CHECKSUM_LOCAL_SCHEMA = supported_checksum_local + SECTION_CHECKSUM_SCHEMA = { + Optional( + SECTION_CHECKSUM_LOCAL, default=None + ): SECTION_CHECKSUM_LOCAL_SCHEMA } SECTION_CACHE = "cache" @@ -297,7 +299,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes Optional(Regex(SECTION_REMOTE_REGEX)): SECTION_REMOTE_SCHEMA, Optional(SECTION_CACHE, default={}): SECTION_CACHE_SCHEMA, Optional(SECTION_STATE, default={}): SECTION_STATE_SCHEMA, - Optional(SECTION_HASH, default={}): SECTION_HASH_SCHEMA, + Optional(SECTION_CHECKSUM, default={}): SECTION_CHECKSUM_SCHEMA, # backward compatibility Optional(SECTION_AWS, default={}): SECTION_AWS_SCHEMA, Optional(SECTION_GCP, default={}): SECTION_GCP_SCHEMA, diff --git a/dvc/output/base.py b/dvc/output/base.py index a358eb6c80..03b6b99103 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -143,14 +143,14 @@ def sep(self): @property def checksum_type(self): - for t in self.remote.get_hash_list(): + for t in self.remote.get_checksum_type_list(): if t in self.info: return t return None @property def checksum(self): - for t in self.remote.get_hash_list(): + for t in self.remote.get_checksum_type_list(): info = self.info.get(t) if info: return info diff --git a/dvc/remote/base.py b/dvc/remote/base.py index e6c2976572..0e2d1d2efc 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -156,10 +156,10 @@ def group(self, name): def cache(self): return getattr(self.repo.cache, self.scheme) - def get_hash_list(self): + def get_checksum_type_list(self): return [self.PARAM_CHECKSUM] - def get_prefer_hash_type(self): + def get_prefer_checksum_type(self): return self.PARAM_CHECKSUM def get_file_checksum(self, path_info): @@ -189,7 +189,7 @@ def _collect_dir(self, path_info): dir_info.append( { self.PARAM_RELPATH: relpath, - self.get_prefer_hash_type(): checksum, + self.get_prefer_checksum_type(): checksum, } ) @@ -272,7 +272,7 @@ def get_checksum(self, path_info, checksum_type=None): return None if checksum_type is None: - checksum_type = self.get_prefer_hash_type() + checksum_type = self.get_prefer_checksum_type() checksum = self.state.get(path_info, checksum_type) if checksum: return checksum @@ -319,7 +319,7 @@ def changed(self, path_info, checksum_info): return True checksum = None - for t in self.get_hash_list(): + for t in self.get_checksum_type_list(): checksum = checksum_info.get(t) if checksum: break @@ -366,8 +366,8 @@ def _save_file(self, path_info, checksum, save_link=True): # we need to update path and cache, since in case of reflink, # or copy cache type moving original file results in updates on # next executed command, which causes md5 recalculation - self.state.save(path_info, checksum, self.get_prefer_hash_type()) - self.state.save(cache_info, checksum, self.get_prefer_hash_type()) + self.state.save(path_info, checksum, self.get_prefer_checksum_type()) + self.state.save(cache_info, checksum, self.get_prefer_checksum_type()) def _save_dir(self, path_info, checksum): cache_info = self.checksum_to_path_info(checksum) @@ -375,15 +375,15 @@ def _save_dir(self, path_info, checksum): entry_info = copy(path_info) for entry in dir_info: - entry_checksum = entry[self.get_prefer_hash_type()] + entry_checksum = entry[self.get_prefer_checksum_type()] entry_info.path = self.ospath.join( path_info.path, entry[self.PARAM_RELPATH] ) self._save_file(entry_info, entry_checksum, save_link=False) self.state.save_link(path_info) - self.state.save(cache_info, checksum, self.get_prefer_hash_type()) - self.state.save(path_info, checksum, self.get_prefer_hash_type()) + self.state.save(cache_info, checksum, self.get_prefer_checksum_type()) + self.state.save(path_info, checksum, self.get_prefer_checksum_type()) def is_empty(self, path_info): return False @@ -407,8 +407,8 @@ def save(self, path_info, checksum_info): self.scheme, ) - checksum = checksum_info[self.get_prefer_hash_type()] - if not self.changed_cache(checksum, self.get_prefer_hash_type()): + checksum = checksum_info[self.get_prefer_checksum_type()] + if not self.changed_cache(checksum, self.get_prefer_checksum_type()): self._checkout(path_info, checksum) return @@ -497,7 +497,7 @@ def all(self): def gc(self, cinfos): used = { - info[self.repo.cache.local.get_prefer_hash_type()] + info[self.repo.cache.local.get_prefer_checksum_type()] for info in cinfos["local"] } @@ -529,7 +529,7 @@ def changed_cache_file(self, checksum, checksum_type=None): - Remove the file from cache if it doesn't match the actual checksum """ if checksum_type is None: - checksum_type = self.get_prefer_hash_type() + checksum_type = self.get_prefer_checksum_type() cache_info = self.checksum_to_path_info(checksum) actual = self.get_checksum(cache_info, checksum_type) @@ -571,7 +571,7 @@ def _changed_dir_cache(self, checksum, checksum_type): def changed_cache(self, checksum, checksum_type=None): if checksum_type is None: - checksum_type = self.get_prefer_hash_type() + checksum_type = self.get_prefer_checksum_type() if self.is_dir_checksum(checksum): return self._changed_dir_cache(checksum, checksum_type) return self.changed_cache_file(checksum, checksum_type) @@ -643,12 +643,14 @@ def _checkout_dir( entry_info = copy(path_info) for entry in dir_info: relpath = entry[self.PARAM_RELPATH] - entry_checksum = entry[self.get_prefer_hash_type()] + entry_checksum = entry[self.get_prefer_checksum_type()] entry_cache_info = self.checksum_to_path_info(entry_checksum) entry_info.url = self.ospath.join(path_info.url, relpath) entry_info.path = self.ospath.join(path_info.path, relpath) - entry_checksum_info = {self.get_prefer_hash_type(): entry_checksum} + entry_checksum_info = { + self.get_prefer_checksum_type(): entry_checksum + } if self.changed(entry_info, entry_checksum_info): if self.exists(entry_info): self.safe_remove(entry_info, force=force) @@ -690,7 +692,7 @@ def checkout( ): raise NotImplementedError - checksum = checksum_info.get(self.get_prefer_hash_type()) + checksum = checksum_info.get(self.get_prefer_checksum_type()) if not checksum: msg = "No checksum info for '{}'." logger.debug(msg.format(str(path_info))) diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index fd38e997e4..edb977ca62 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -72,8 +72,8 @@ def __init__(self, repo, config): self.cache_types = copy(self.DEFAULT_CACHE_TYPES) self._hash = [modchecksum.CHECKSUM_MD5] - conf = repo.config.config[Config.SECTION_HASH] - hash_local = conf.get(Config.SECTION_HASH_LOCAL, None) + conf = repo.config.config[Config.SECTION_CHECKSUM] + hash_local = conf.get(Config.SECTION_CHECKSUM_LOCAL, None) if hash_local: self._hash = ( modchecksum.checksum_types_from_str(hash_local) or self._hash @@ -181,12 +181,12 @@ def ospath(self): def already_cached(self, path_info): assert path_info.scheme in ["", "local"] - current_md5 = self.get_checksum(path_info) + cs = self.get_checksum(path_info) - if not current_md5: + if not cs: return False - return not self.changed_cache(current_md5) + return not self.changed_cache(cs) def is_empty(self, path_info): path = path_info.path @@ -208,10 +208,10 @@ def isdir(self, path_info): def walk(self, path_info): return os.walk(path_info.path) - def get_hash_list(self): + def get_checksum_type_list(self): return self._hash - def get_prefer_hash_type(self): + def get_prefer_checksum_type(self): return self._hash[0] def get_file_checksum(self, path_info): @@ -324,7 +324,7 @@ def _group(self, checksum_infos, show_checksums=False): by_md5 = {} for info in checksum_infos: - md5 = info[self.get_prefer_hash_type()] + md5 = info[self.get_prefer_checksum_type()] if show_checksums: by_md5[md5] = {"name": md5} diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 83a5dd2409..bab8960d62 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -200,7 +200,7 @@ def _collect_dir_cache( info = out.dumpd() ret = [info] r = out.remote - md5 = info[r.get_prefer_hash_type()] + md5 = info[r.get_prefer_checksum_type()] if self.cache.local.changed_cache_file(md5): try: diff --git a/dvc/stage.py b/dvc/stage.py index 8785ecf0c6..57a1650f7f 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -198,14 +198,14 @@ def is_valid_filename(path): def is_stage_file(path): return os.path.isfile(path) and Stage.is_valid_filename(path) - def changed_md5(self): - for h in self.repo.cache.local.get_hash_list(): + def changed_checksum(self): + for h in self.repo.cache.local.get_checksum_type_list(): if h in self.checksum: return self.checksum[h] != self._compute_checksum(h) return ( self._compute_checksum( - self.repo.cache.local.get_prefer_hash_type() + self.repo.cache.local.get_prefer_checksum_type() ) is not None ) @@ -262,15 +262,19 @@ def _changed_outs(self): return False - def _changed_md5(self): - if self.changed_md5(): + def _changed_checksum(self): + if self.changed_checksum(): logger.warning("Dvc file '{}' changed.".format(self.relpath)) return True return False def changed(self): ret = any( - [self._changed_deps(), self._changed_outs(), self._changed_md5()] + [ + self._changed_deps(), + self._changed_outs(), + self._changed_checksum(), + ] ) if ret: @@ -402,7 +406,7 @@ def is_cached(self): new_d.pop(k, None) outs = old_d.get(self.PARAM_OUTS, []) for out in outs: - out.pop(self.repo.cache.local.get_prefer_hash_type(), None) + out.pop(self.repo.cache.local.get_prefer_checksum_type(), None) out.pop(RemoteS3.PARAM_CHECKSUM, None) if old_d != new_d: @@ -717,7 +721,7 @@ def save(self): for out in self.outs: out.save() - hash_type = self.repo.cache.local.get_prefer_hash_type() + hash_type = self.repo.cache.local.get_prefer_checksum_type() self.checksum = {hash_type: self._compute_checksum(hash_type)} @staticmethod @@ -732,7 +736,7 @@ def check_can_commit(self, force): changed_deps = self._changed_entries(self.deps) changed_outs = self._changed_entries(self.outs) - if changed_deps or changed_outs or self.changed_md5(): + if changed_deps or changed_outs or self.changed_checksum(): msg = ( "dependencies {}".format(changed_deps) if changed_deps else "" ) @@ -899,7 +903,7 @@ def status(self): if outs_status: ret.append({"changed outs": outs_status}) - if self.changed_md5(): + if self.changed_checksum(): ret.append("changed checksum") if self.is_callback: @@ -912,7 +916,7 @@ def status(self): def _already_cached(self): return ( - not self.changed_md5() + not self.changed_checksum() and all(not dep.changed() for dep in self.deps) and all( not out.changed_cache() if out.use_cache else not out.changed() diff --git a/tests/func/test_stage.py b/tests/func/test_stage.py index 328f88e631..8c36957f84 100644 --- a/tests/func/test_stage.py +++ b/tests/func/test_stage.py @@ -168,7 +168,7 @@ def test_md5_ignores_comments(repo_dir, dvc_repo): f.write("# End comment\n") new_stage = Stage.load(dvc_repo, stage.path) - assert not new_stage.changed_md5() + assert not new_stage.changed_checksum() def test_meta_is_preserved(dvc_repo): diff --git a/tests/unit/remote/hash/test_hash.py b/tests/unit/remote/hash/test_hash.py index 51ce7da5c2..a0c39fde69 100644 --- a/tests/unit/remote/hash/test_hash.py +++ b/tests/unit/remote/hash/test_hash.py @@ -8,7 +8,7 @@ def test_compatibility(self): ret = main(["add", self.FOO]) self.assertEqual(0, ret) - ret = main(["config", "hash.local", "sha256,md5"]) + ret = main(["config", "checksum.local", "sha256,md5"]) self.assertEqual(0, ret) ret = main(["status", "-q"]) @@ -27,7 +27,7 @@ def test_changedCompatibility(self): ret = main(["add", self.FOO]) self.assertEqual(0, ret) - ret = main(["config", "hash.local", "sha256,md5"]) + ret = main(["config", "checksum.local", "sha256,md5"]) self.assertEqual(0, ret) ret = main(["status", "-q"]) @@ -51,7 +51,7 @@ def test_incompatibility(self): ret = main(["add", self.FOO]) self.assertEqual(0, ret) - ret = main(["config", "hash.local", "sha256"]) + ret = main(["config", "checksum.local", "sha256"]) self.assertEqual(0, ret) ret = main(["status", "-q"]) From f62d053a7f9ac2e344d769cc44a7430bc2dea507 Mon Sep 17 00:00:00 2001 From: vyloy Date: Sun, 5 May 2019 10:43:13 +0800 Subject: [PATCH 14/20] collect schemas --- dvc/output/__init__.py | 21 ++------------------- dvc/stage.py | 8 ++++---- dvc/utils/checksum.py | 23 +++++++++++++++++++++++ 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/dvc/output/__init__.py b/dvc/output/__init__.py index 35be175f37..b3ae9f7b29 100644 --- a/dvc/output/__init__.py +++ b/dvc/output/__init__.py @@ -14,8 +14,6 @@ from dvc.output.ssh import OutputSSH from dvc.remote import Remote -from dvc.remote.s3 import RemoteS3 -from dvc.remote.hdfs import RemoteHDFS OUTS = [ OutputHDFS, @@ -33,24 +31,9 @@ Schemes.LOCAL: OutputLOCAL, } -# NOTE: currently there are only 3 possible checksum names: -# -# 1) md5 (LOCAL, SSH, GS); -# 2) etag (S3); -# 3) checksum (HDFS); -# -# so when a few types of outputs share the same name, we only need -# specify it once. -CHECKSUM_SCHEMA = { - schema.Optional(modchecksum.CHECKSUM_MD5): schema.Or(str, None), - schema.Optional(modchecksum.CHECKSUM_SHA256): schema.Or(str, None), - schema.Optional(RemoteS3.PARAM_CHECKSUM): schema.Or(str, None), - schema.Optional(RemoteHDFS.PARAM_CHECKSUM): schema.Or(str, None), -} - -TAGS_SCHEMA = {schema.Optional(str): CHECKSUM_SCHEMA} +TAGS_SCHEMA = {schema.Optional(str): modchecksum.CHECKSUM_SCHEMA} -SCHEMA = CHECKSUM_SCHEMA.copy() +SCHEMA = modchecksum.CHECKSUM_SCHEMA.copy() SCHEMA[OutputBase.PARAM_PATH] = str SCHEMA[schema.Optional(OutputBase.PARAM_CACHE)] = bool SCHEMA[schema.Optional(OutputBase.PARAM_METRIC)] = OutputBase.METRIC_SCHEMA diff --git a/dvc/stage.py b/dvc/stage.py index 57a1650f7f..c510164335 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -128,16 +128,16 @@ class Stage(object): PARAM_LOCKED = "locked" PARAM_META = "meta" - SCHEMA = { - Optional(modchecksum.CHECKSUM_MD5): Or(str, None), - Optional(modchecksum.CHECKSUM_SHA256): Or(str, None), + SCHEMA = modchecksum.CHECKSUM_LOCAL_SCHEMA.copy() + for k, v in { Optional(PARAM_CMD): Or(str, None), Optional(PARAM_WDIR): Or(str, None), Optional(PARAM_DEPS): Or(And(list, Schema([dependency.SCHEMA])), None), Optional(PARAM_OUTS): Or(And(list, Schema([output.SCHEMA])), None), Optional(PARAM_LOCKED): bool, Optional(PARAM_META): object, - } + }.items(): + SCHEMA[k] = v TAG_REGEX = r"^(?P.*)@(?P[^\\/@:]*)$" diff --git a/dvc/utils/checksum.py b/dvc/utils/checksum.py index 97af657640..cc2b8b1995 100644 --- a/dvc/utils/checksum.py +++ b/dvc/utils/checksum.py @@ -8,6 +8,7 @@ import json import hashlib import logging +import schema logger = logging.getLogger(__name__) @@ -20,6 +21,28 @@ CHECKSUM_MAP = {CHECKSUM_MD5: hashlib.md5, CHECKSUM_SHA256: hashlib.sha256} +CHECKSUM_LOCAL_SCHEMA = { + schema.Optional(CHECKSUM_MD5): schema.Or(str, None), + schema.Optional(CHECKSUM_SHA256): schema.Or(str, None), +} + +# NOTE: currently there are only 4 possible checksum names: +# +# 1) md5 (LOCAL, SSH, GS); +# 2) etag (S3); +# 3) checksum (HDFS); +# 4) sha256 (LOCAL); +# +# so when a few types of outputs share the same name, we only need +# specify it once. +REMOTE_S3_CHECKSUM = "etag" +REMOTE_HDFS_CHECKSUM = "checksum" + +CHECKSUM_SCHEMA = CHECKSUM_LOCAL_SCHEMA.copy() +CHECKSUM_SCHEMA[schema.Optional(REMOTE_S3_CHECKSUM)] = schema.Or(str, None) +CHECKSUM_SCHEMA[schema.Optional(REMOTE_HDFS_CHECKSUM)] = schema.Or(str, None) + + def checksum_types_from_str(hash_names): if isinstance(hash_names, str): hash_names = [h.strip().lower() for h in hash_names.split(",")] From 060f6321e304f59e4c0d30104b6812700990ab6d Mon Sep 17 00:00:00 2001 From: vyloy Date: Sun, 5 May 2019 10:49:41 +0800 Subject: [PATCH 15/20] increase state version --- dvc/state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/state.py b/dvc/state.py index a070b48af5..56dc8a5025 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -54,7 +54,7 @@ class State(StateBase): # pylint: disable=too-many-instance-attributes state database version. """ - VERSION = 3 + VERSION = 4 STATE_FILE = "state" STATE_TABLE = "state" STATE_TABLE_LAYOUT = ( From 62c6339ee82bafdb010961b677abf8e36c481c10 Mon Sep 17 00:00:00 2001 From: vyloy Date: Sun, 5 May 2019 11:11:39 +0800 Subject: [PATCH 16/20] fix error after rebase --- dvc/remote/base.py | 4 ++-- dvc/remote/local/__init__.py | 14 ++++++++------ tests/func/test_install.py | 4 ++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 0e2d1d2efc..222b9a9d71 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -204,8 +204,8 @@ def get_dir_checksum(self, path_info): if self.cache.changed_cache_file(checksum): self.cache.move(from_info, to_info) - self.state.save(path_info, checksum, self.get_prefer_hash_type()) - self.state.save(to_info, checksum, self.get_prefer_hash_type()) + self.state.save(path_info, checksum, self.get_prefer_checksum_type()) + self.state.save(to_info, checksum, self.get_prefer_checksum_type()) return checksum diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index edb977ca62..2d4878daec 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -72,12 +72,14 @@ def __init__(self, repo, config): self.cache_types = copy(self.DEFAULT_CACHE_TYPES) self._hash = [modchecksum.CHECKSUM_MD5] - conf = repo.config.config[Config.SECTION_CHECKSUM] - hash_local = conf.get(Config.SECTION_CHECKSUM_LOCAL, None) - if hash_local: - self._hash = ( - modchecksum.checksum_types_from_str(hash_local) or self._hash - ) + if repo is not None: + conf = repo.config.config[Config.SECTION_CHECKSUM] + hash_local = conf.get(Config.SECTION_CHECKSUM_LOCAL, None) + if hash_local: + self._hash = ( + modchecksum.checksum_types_from_str(hash_local) + or self._hash + ) if self.cache_dir is not None and not os.path.exists(self.cache_dir): os.mkdir(self.cache_dir) diff --git a/tests/func/test_install.py b/tests/func/test_install.py index f845ba38ea..3cc11b5ec9 100644 --- a/tests/func/test_install.py +++ b/tests/func/test_install.py @@ -2,7 +2,7 @@ import sys import pytest -from dvc.utils import file_md5 +from dvc.utils.checksum import file_checksum from dvc.main import main from dvc.stage import Stage @@ -71,7 +71,7 @@ def test_should_pre_push_hook_push(self, repo_dir, dvc_repo): git_remote = os.path.join(temp, "project.git") storage_path = os.path.join(temp, "dvc_storage") - foo_checksum = file_md5(repo_dir.FOO)[0] + foo_checksum = file_checksum(repo_dir.FOO)[0] expected_cache_path = dvc_repo.cache.local.get(foo_checksum) ret = main(["remote", "add", "-d", "store", storage_path]) From 44059a587b5405caa23f5493ad22aa7dadd1308f Mon Sep 17 00:00:00 2001 From: vyloy Date: Tue, 7 May 2019 11:34:40 +0800 Subject: [PATCH 17/20] refactor remote.changed --- dvc/output/base.py | 9 +------ dvc/remote/base.py | 66 ++++++++++++++++++++++++++++++---------------- 2 files changed, 44 insertions(+), 31 deletions(-) diff --git a/dvc/output/base.py b/dvc/output/base.py index 03b6b99103..7955ed69b6 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -165,14 +165,7 @@ def exists(self): return self.remote.exists(self.path_info) def changed_checksum(self): - return ( - self.checksum - != tuple( - self.remote.save_info( - self.path_info, self.checksum_type - ).items() - )[0][1] - ) + return self.cache.changed_checksum(self.path_info, self.info) def changed_cache(self): if not self.use_cache or not self.checksum: diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 222b9a9d71..f16b26fbeb 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -291,6 +291,48 @@ def save_info(self, path_info): assert path_info.scheme == self.scheme return {self.checksum_type(): self.get_checksum(path_info)} + def changed_checksum(self, path_info, checksum_info, addition_check=None): + if not checksum_info: + logger.debug( + "checksum for '{}' is missing({}).".format( + path_info, checksum_info + ) + ) + return True + + types = set(checksum_info.keys()) + supported_types = set(self.get_checksum_type_list()) + intersection = supported_types.intersection(types) + if not supported_types.intersection(types): + logger.debug( + "checksum for '{}' is not supported({}).".format( + path_info, checksum_info + ) + ) + return True + + for typ in intersection: + expected = checksum_info.get(typ) + if addition_check: + if addition_check(expected, typ): + logger.debug( + "{} for '{}'('{}') has changed.".format( + addition_check, path_info, checksum_info + ) + ) + return True + + actual = self.get_checksum(path_info, typ) + if actual != expected: + logger.debug( + "checksum '{}'(actual '{}') for '{}' has changed.".format( + expected, actual, path_info + ) + ) + return True + + return False + def changed(self, path_info, checksum_info): """Checks if data has changed. @@ -318,29 +360,7 @@ def changed(self, path_info, checksum_info): logger.debug("'{}' doesn't exist.".format(path_info)) return True - checksum = None - for t in self.get_checksum_type_list(): - checksum = checksum_info.get(t) - if checksum: - break - - if checksum is None: - logger.debug("checksum for '{}' is missing.".format(path_info)) - return True - - if self.changed_cache(checksum, t): - logger.debug( - "cache for '{}'('{}') has changed.".format(path_info, checksum) - ) - return True - - actual = self.save_info(path_info, t)[t] - if checksum != actual: - logger.debug( - "checksum '{}'(actual '{}') for '{}' has changed.".format( - checksum, actual, path_info - ) - ) + if self.changed_checksum(path_info, checksum_info, self.changed_cache): return True logger.debug("'{}' hasn't changed.".format(path_info)) From f0dbadf5ebf65ac70374e8e6b3208c73ee514fc2 Mon Sep 17 00:00:00 2001 From: vyloy Date: Tue, 7 May 2019 11:43:44 +0800 Subject: [PATCH 18/20] rename get_prefer_checksum_type to checksum_type rename get_checksum_type_list to checksum_types --- dvc/output/base.py | 4 ++-- dvc/remote/base.py | 45 +++++++++++++++++------------------- dvc/remote/local/__init__.py | 18 +++++++-------- dvc/repo/__init__.py | 2 +- dvc/stage.py | 10 ++++---- 5 files changed, 37 insertions(+), 42 deletions(-) diff --git a/dvc/output/base.py b/dvc/output/base.py index 7955ed69b6..86381188c9 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -143,14 +143,14 @@ def sep(self): @property def checksum_type(self): - for t in self.remote.get_checksum_type_list(): + for t in self.remote.checksum_types(): if t in self.info: return t return None @property def checksum(self): - for t in self.remote.get_checksum_type_list(): + for t in self.remote.checksum_types(): info = self.info.get(t) if info: return info diff --git a/dvc/remote/base.py b/dvc/remote/base.py index f16b26fbeb..66c837d2d7 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -156,10 +156,10 @@ def group(self, name): def cache(self): return getattr(self.repo.cache, self.scheme) - def get_checksum_type_list(self): + def checksum_types(self): return [self.PARAM_CHECKSUM] - def get_prefer_checksum_type(self): + def checksum_type(self): return self.PARAM_CHECKSUM def get_file_checksum(self, path_info): @@ -189,7 +189,7 @@ def _collect_dir(self, path_info): dir_info.append( { self.PARAM_RELPATH: relpath, - self.get_prefer_checksum_type(): checksum, + self.checksum_type(): checksum, } ) @@ -204,8 +204,8 @@ def get_dir_checksum(self, path_info): if self.cache.changed_cache_file(checksum): self.cache.move(from_info, to_info) - self.state.save(path_info, checksum, self.get_prefer_checksum_type()) - self.state.save(to_info, checksum, self.get_prefer_checksum_type()) + self.state.save(path_info, checksum, self.checksum_type()) + self.state.save(to_info, checksum, self.checksum_type()) return checksum @@ -272,7 +272,7 @@ def get_checksum(self, path_info, checksum_type=None): return None if checksum_type is None: - checksum_type = self.get_prefer_checksum_type() + checksum_type = self.checksum_type() checksum = self.state.get(path_info, checksum_type) if checksum: return checksum @@ -301,7 +301,7 @@ def changed_checksum(self, path_info, checksum_info, addition_check=None): return True types = set(checksum_info.keys()) - supported_types = set(self.get_checksum_type_list()) + supported_types = set(self.checksum_types()) intersection = supported_types.intersection(types) if not supported_types.intersection(types): logger.debug( @@ -386,8 +386,8 @@ def _save_file(self, path_info, checksum, save_link=True): # we need to update path and cache, since in case of reflink, # or copy cache type moving original file results in updates on # next executed command, which causes md5 recalculation - self.state.save(path_info, checksum, self.get_prefer_checksum_type()) - self.state.save(cache_info, checksum, self.get_prefer_checksum_type()) + self.state.save(path_info, checksum, self.checksum_type()) + self.state.save(cache_info, checksum, self.checksum_type()) def _save_dir(self, path_info, checksum): cache_info = self.checksum_to_path_info(checksum) @@ -395,15 +395,15 @@ def _save_dir(self, path_info, checksum): entry_info = copy(path_info) for entry in dir_info: - entry_checksum = entry[self.get_prefer_checksum_type()] + entry_checksum = entry[self.checksum_type()] entry_info.path = self.ospath.join( path_info.path, entry[self.PARAM_RELPATH] ) self._save_file(entry_info, entry_checksum, save_link=False) self.state.save_link(path_info) - self.state.save(cache_info, checksum, self.get_prefer_checksum_type()) - self.state.save(path_info, checksum, self.get_prefer_checksum_type()) + self.state.save(cache_info, checksum, self.checksum_type()) + self.state.save(path_info, checksum, self.checksum_type()) def is_empty(self, path_info): return False @@ -427,8 +427,8 @@ def save(self, path_info, checksum_info): self.scheme, ) - checksum = checksum_info[self.get_prefer_checksum_type()] - if not self.changed_cache(checksum, self.get_prefer_checksum_type()): + checksum = checksum_info[self.checksum_type()] + if not self.changed_cache(checksum, self.checksum_type()): self._checkout(path_info, checksum) return @@ -517,14 +517,13 @@ def all(self): def gc(self, cinfos): used = { - info[self.repo.cache.local.get_prefer_checksum_type()] + info[self.repo.cache.local.checksum_type()] for info in cinfos["local"] } if self.scheme != "local": used |= { - info[self.get_prefer_hash_type()] - for info in cinfos.get(self.scheme, []) + info[self.checksum_type()] for info in cinfos[self.scheme] } removed = False @@ -549,7 +548,7 @@ def changed_cache_file(self, checksum, checksum_type=None): - Remove the file from cache if it doesn't match the actual checksum """ if checksum_type is None: - checksum_type = self.get_prefer_checksum_type() + checksum_type = self.checksum_type() cache_info = self.checksum_to_path_info(checksum) actual = self.get_checksum(cache_info, checksum_type) @@ -591,7 +590,7 @@ def _changed_dir_cache(self, checksum, checksum_type): def changed_cache(self, checksum, checksum_type=None): if checksum_type is None: - checksum_type = self.get_prefer_checksum_type() + checksum_type = self.checksum_type() if self.is_dir_checksum(checksum): return self._changed_dir_cache(checksum, checksum_type) return self.changed_cache_file(checksum, checksum_type) @@ -663,14 +662,12 @@ def _checkout_dir( entry_info = copy(path_info) for entry in dir_info: relpath = entry[self.PARAM_RELPATH] - entry_checksum = entry[self.get_prefer_checksum_type()] + entry_checksum = entry[self.checksum_type()] entry_cache_info = self.checksum_to_path_info(entry_checksum) entry_info.url = self.ospath.join(path_info.url, relpath) entry_info.path = self.ospath.join(path_info.path, relpath) - entry_checksum_info = { - self.get_prefer_checksum_type(): entry_checksum - } + entry_checksum_info = {self.checksum_type(): entry_checksum} if self.changed(entry_info, entry_checksum_info): if self.exists(entry_info): self.safe_remove(entry_info, force=force) @@ -712,7 +709,7 @@ def checkout( ): raise NotImplementedError - checksum = checksum_info.get(self.get_prefer_checksum_type()) + checksum = checksum_info.get(self.checksum_type()) if not checksum: msg = "No checksum info for '{}'." logger.debug(msg.format(str(path_info))) diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index 2d4878daec..cf956619dc 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -71,14 +71,14 @@ def __init__(self, repo, config): else: self.cache_types = copy(self.DEFAULT_CACHE_TYPES) - self._hash = [modchecksum.CHECKSUM_MD5] + self._checksum_types = [modchecksum.CHECKSUM_MD5] if repo is not None: conf = repo.config.config[Config.SECTION_CHECKSUM] hash_local = conf.get(Config.SECTION_CHECKSUM_LOCAL, None) if hash_local: - self._hash = ( + self._checksum_types = ( modchecksum.checksum_types_from_str(hash_local) - or self._hash + or self._checksum_types ) if self.cache_dir is not None and not os.path.exists(self.cache_dir): @@ -210,14 +210,14 @@ def isdir(self, path_info): def walk(self, path_info): return os.walk(path_info.path) - def get_checksum_type_list(self): - return self._hash + def checksum_types(self): + return self._checksum_types - def get_prefer_checksum_type(self): - return self._hash[0] + def checksum_type(self): + return self._checksum_types[0] def get_file_checksum(self, path_info): - return file_checksum(path_info.path, self.hash[0])[0] + return file_checksum(path_info.path, self._checksum_types[0])[0] def remove(self, path_info): if path_info.scheme != "local": @@ -326,7 +326,7 @@ def _group(self, checksum_infos, show_checksums=False): by_md5 = {} for info in checksum_infos: - md5 = info[self.get_prefer_checksum_type()] + md5 = info[self.checksum_type()] if show_checksums: by_md5[md5] = {"name": md5} diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index bab8960d62..03d7430014 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -200,7 +200,7 @@ def _collect_dir_cache( info = out.dumpd() ret = [info] r = out.remote - md5 = info[r.get_prefer_checksum_type()] + md5 = info[r.checksum_type()] if self.cache.local.changed_cache_file(md5): try: diff --git a/dvc/stage.py b/dvc/stage.py index c510164335..22bdbf006d 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -199,14 +199,12 @@ def is_stage_file(path): return os.path.isfile(path) and Stage.is_valid_filename(path) def changed_checksum(self): - for h in self.repo.cache.local.get_checksum_type_list(): + for h in self.repo.cache.local.checksum_types(): if h in self.checksum: return self.checksum[h] != self._compute_checksum(h) return ( - self._compute_checksum( - self.repo.cache.local.get_prefer_checksum_type() - ) + self._compute_checksum(self.repo.cache.local.checksum_type()) is not None ) @@ -406,7 +404,7 @@ def is_cached(self): new_d.pop(k, None) outs = old_d.get(self.PARAM_OUTS, []) for out in outs: - out.pop(self.repo.cache.local.get_prefer_checksum_type(), None) + out.pop(self.repo.cache.local.checksum_type(), None) out.pop(RemoteS3.PARAM_CHECKSUM, None) if old_d != new_d: @@ -721,7 +719,7 @@ def save(self): for out in self.outs: out.save() - hash_type = self.repo.cache.local.get_prefer_checksum_type() + hash_type = self.repo.cache.local.checksum_type() self.checksum = {hash_type: self._compute_checksum(hash_type)} @staticmethod From 6728cea49e4a6b78234bc5abe5cf18381ac590f4 Mon Sep 17 00:00:00 2001 From: vyloy Date: Thu, 9 May 2019 16:27:45 +0800 Subject: [PATCH 19/20] remote: initially add checksum types config ability for other remotes --- dvc/config.py | 8 +++++--- dvc/remote/azure.py | 3 +++ dvc/remote/base.py | 19 +++++++++++++++++-- dvc/remote/gs.py | 3 +++ dvc/remote/hdfs.py | 3 +++ dvc/remote/http.py | 3 +++ dvc/remote/local/__init__.py | 18 ++---------------- dvc/remote/s3.py | 3 +++ dvc/remote/ssh/__init__.py | 3 +++ dvc/utils/checksum.py | 32 +++++++++++++++++++++----------- tests/unit/remote/test_base.py | 8 ++++++-- 11 files changed, 69 insertions(+), 34 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index f89cb2386e..1f48344db0 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -53,13 +53,15 @@ def supported_loglevel(level): return level in ["info", "debug", "warning", "error"] -def supported_checksum_local(hash_names): +def supported_checksum_local(checksum_types): """Checks if hash config option has a valid value. Args: - hash_names (list/string): hash name(s). + checksum_types (list/string): hash name(s). """ - if modchecksum.checksum_types_from_str(hash_names): + if modchecksum.checksum_types_from_str( + checksum_types, modchecksum.LOCAL_SUPPORTED_CHECKSUM_TYPES + ): return True return False diff --git a/dvc/remote/azure.py b/dvc/remote/azure.py index df86d75408..d3c8da2e9c 100644 --- a/dvc/remote/azure.py +++ b/dvc/remote/azure.py @@ -15,6 +15,7 @@ BlockBlobService = None from dvc.utils import tmp_fname, move +from dvc.utils import checksum as modchecksum from dvc.utils.compat import urlparse, makedirs from dvc.progress import progress from dvc.config import Config @@ -44,6 +45,8 @@ class RemoteAZURE(RemoteBASE): PARAM_CHECKSUM = "etag" COPY_POLL_SECONDS = 5 + SUPPORTED_CHECKSUM_TYPES = modchecksum.AZURE_SUPPORTED_CHECKSUM_TYPES + def __init__(self, repo, config): super(RemoteAZURE, self).__init__(repo, config) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 66c837d2d7..5b86a4b835 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -20,6 +20,7 @@ from dvc.exceptions import DvcException, ConfirmRemoveError from dvc.progress import progress from dvc.utils import LARGE_DIR_SIZE, tmp_fname +from dvc.utils import checksum as modchecksum from dvc.state import StateBase @@ -75,6 +76,8 @@ class RemoteBASE(object): PARAM_RELPATH = "relpath" CHECKSUM_DIR_SUFFIX = ".dir" + SUPPORTED_CHECKSUM_TYPES = None + def __init__(self, repo, config): self.repo = repo deps_ok = all(self.REQUIRES.values()) @@ -108,6 +111,18 @@ def __init__(self, repo, config): self._dir_info = {} + self._checksum_types = self.SUPPORTED_CHECKSUM_TYPES + if repo is not None: + conf = repo.config.config[Config.SECTION_CHECKSUM] + types = conf.get(self.scheme, None) + if types: + self._checksum_types = ( + modchecksum.checksum_types_from_str( + types, self.SUPPORTED_CHECKSUM_TYPES + ) + or self._checksum_types + ) + def __repr__(self): return "{class_name}: '{url}'".format( class_name=type(self).__name__, url=(self.url or "No url") @@ -157,10 +172,10 @@ def cache(self): return getattr(self.repo.cache, self.scheme) def checksum_types(self): - return [self.PARAM_CHECKSUM] + return self._checksum_types def checksum_type(self): - return self.PARAM_CHECKSUM + return self._checksum_types[0] def get_file_checksum(self, path_info): raise NotImplementedError diff --git a/dvc/remote/gs.py b/dvc/remote/gs.py index 2e747f1ddd..ef94fe0cab 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -12,6 +12,7 @@ storage = None from dvc.utils import tmp_fname, move +from dvc.utils import checksum as modchecksum from dvc.utils.compat import urlparse, makedirs from dvc.remote.base import RemoteBASE from dvc.config import Config @@ -28,6 +29,8 @@ class RemoteGS(RemoteBASE): REQUIRES = {"google.cloud.storage": storage} PARAM_CHECKSUM = "md5" + SUPPORTED_CHECKSUM_TYPES = modchecksum.GS_SUPPORTED_CHECKSUM_TYPES + def __init__(self, repo, config): super(RemoteGS, self).__init__(repo, config) storagepath = "gs://" diff --git a/dvc/remote/hdfs.py b/dvc/remote/hdfs.py index b8a3b22cc8..43bb4a5107 100644 --- a/dvc/remote/hdfs.py +++ b/dvc/remote/hdfs.py @@ -12,6 +12,7 @@ from dvc.path.hdfs import PathHDFS from dvc.remote.base import RemoteBASE, RemoteCmdError from dvc.utils import fix_env, tmp_fname +from dvc.utils import checksum as modchecksum logger = logging.getLogger(__name__) @@ -22,6 +23,8 @@ class RemoteHDFS(RemoteBASE): REGEX = r"^hdfs://((?P.*)@)?.*$" PARAM_CHECKSUM = "checksum" + SUPPORTED_CHECKSUM_TYPES = modchecksum.HDFS_SUPPORTED_CHECKSUM_TYPES + def __init__(self, repo, config): super(RemoteHDFS, self).__init__(repo, config) self.url = config.get(Config.SECTION_REMOTE_URL, "/") diff --git a/dvc/remote/http.py b/dvc/remote/http.py index 6af541104e..b774e49f2d 100644 --- a/dvc/remote/http.py +++ b/dvc/remote/http.py @@ -3,6 +3,7 @@ from dvc.scheme import Schemes from dvc.path import Path from dvc.utils.compat import open, makedirs +from dvc.utils import checksum as modchecksum import os import threading @@ -39,6 +40,8 @@ class RemoteHTTP(RemoteBASE): CHUNK_SIZE = 2 ** 16 PARAM_CHECKSUM = "etag" + SUPPORTED_CHECKSUM_TYPES = modchecksum.HTTP_SUPPORTED_CHECKSUM_TYPES + def __init__(self, repo, config): super(RemoteHTTP, self).__init__(repo, config) self.cache_dir = config.get(Config.SECTION_REMOTE_URL) diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index cf956619dc..1574a37491 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -52,6 +52,8 @@ class RemoteLOCAL(RemoteBASE): "reflink": System.reflink, } + SUPPORTED_CHECKSUM_TYPES = modchecksum.LOCAL_SUPPORTED_CHECKSUM_TYPES + def __init__(self, repo, config): super(RemoteLOCAL, self).__init__(repo, config) self.state = self.repo.state if self.repo else None @@ -71,16 +73,6 @@ def __init__(self, repo, config): else: self.cache_types = copy(self.DEFAULT_CACHE_TYPES) - self._checksum_types = [modchecksum.CHECKSUM_MD5] - if repo is not None: - conf = repo.config.config[Config.SECTION_CHECKSUM] - hash_local = conf.get(Config.SECTION_CHECKSUM_LOCAL, None) - if hash_local: - self._checksum_types = ( - modchecksum.checksum_types_from_str(hash_local) - or self._checksum_types - ) - if self.cache_dir is not None and not os.path.exists(self.cache_dir): os.mkdir(self.cache_dir) @@ -210,12 +202,6 @@ def isdir(self, path_info): def walk(self, path_info): return os.walk(path_info.path) - def checksum_types(self): - return self._checksum_types - - def checksum_type(self): - return self._checksum_types[0] - def get_file_checksum(self, path_info): return file_checksum(path_info.path, self._checksum_types[0])[0] diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index 99ff523d2f..85b66f7135 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -13,6 +13,7 @@ boto3 = None from dvc.utils import tmp_fname, move +from dvc.utils import checksum as modchecksum from dvc.utils.compat import urlparse, makedirs from dvc.progress import progress from dvc.config import Config @@ -41,6 +42,8 @@ class RemoteS3(RemoteBASE): REQUIRES = {"boto3": boto3} PARAM_CHECKSUM = "etag" + SUPPORTED_CHECKSUM_TYPES = modchecksum.S3_SUPPORTED_CHECKSUM_TYPES + def __init__(self, repo, config): super(RemoteS3, self).__init__(repo, config) diff --git a/dvc/remote/ssh/__init__.py b/dvc/remote/ssh/__init__.py index a8cf29c384..6f2aeccb20 100644 --- a/dvc/remote/ssh/__init__.py +++ b/dvc/remote/ssh/__init__.py @@ -17,6 +17,7 @@ from dvc.config import Config from dvc.utils.compat import urlparse from dvc.remote.base import RemoteBASE +from dvc.utils import checksum as modchecksum logger = logging.getLogger(__name__) @@ -36,6 +37,8 @@ class RemoteSSH(RemoteBASE): DEFAULT_PORT = 22 TIMEOUT = 1800 + SUPPORTED_CHECKSUM_TYPES = modchecksum.SSH_SUPPORTED_CHECKSUM_TYPES + def __init__(self, repo, config): super(RemoteSSH, self).__init__(repo, config) self.url = config.get(Config.SECTION_REMOTE_URL, "ssh://") diff --git a/dvc/utils/checksum.py b/dvc/utils/checksum.py index cc2b8b1995..202e6b7247 100644 --- a/dvc/utils/checksum.py +++ b/dvc/utils/checksum.py @@ -35,23 +35,33 @@ # # so when a few types of outputs share the same name, we only need # specify it once. -REMOTE_S3_CHECKSUM = "etag" -REMOTE_HDFS_CHECKSUM = "checksum" +CHECKSUM_ETAG = "etag" +CHECKSUM_CHECKSUM = "checksum" CHECKSUM_SCHEMA = CHECKSUM_LOCAL_SCHEMA.copy() -CHECKSUM_SCHEMA[schema.Optional(REMOTE_S3_CHECKSUM)] = schema.Or(str, None) -CHECKSUM_SCHEMA[schema.Optional(REMOTE_HDFS_CHECKSUM)] = schema.Or(str, None) +CHECKSUM_SCHEMA[schema.Optional(CHECKSUM_ETAG)] = schema.Or(str, None) +CHECKSUM_SCHEMA[schema.Optional(CHECKSUM_CHECKSUM)] = schema.Or(str, None) +LOCAL_SUPPORTED_CHECKSUM_TYPES = [CHECKSUM_MD5, CHECKSUM_SHA256] +HTTP_SUPPORTED_CHECKSUM_TYPES = [CHECKSUM_ETAG, CHECKSUM_MD5] +SSH_SUPPORTED_CHECKSUM_TYPES = [CHECKSUM_MD5] +AZURE_SUPPORTED_CHECKSUM_TYPES = [CHECKSUM_ETAG] +GS_SUPPORTED_CHECKSUM_TYPES = [CHECKSUM_MD5] +HDFS_SUPPORTED_CHECKSUM_TYPES = [CHECKSUM_CHECKSUM] +S3_SUPPORTED_CHECKSUM_TYPES = [CHECKSUM_ETAG] -def checksum_types_from_str(hash_names): - if isinstance(hash_names, str): - hash_names = [h.strip().lower() for h in hash_names.split(",")] - if not isinstance(hash_names, list) or len(hash_names) < 1: + +def checksum_types_from_str(checksum_types, supported_types): + if not supported_types: + return None + if isinstance(checksum_types, str): + checksum_types = [h.strip().lower() for h in checksum_types.split(",")] + if not isinstance(checksum_types, list) or len(checksum_types) < 1: return None - for h in hash_names: - if h not in CHECKSUM_MAP.keys(): + for h in checksum_types: + if h not in supported_types: return None - return hash_names + return checksum_types def dos2unix(data): diff --git a/tests/unit/remote/test_base.py b/tests/unit/remote/test_base.py index 39ed3e5ee4..cbb2404b33 100644 --- a/tests/unit/remote/test_base.py +++ b/tests/unit/remote/test_base.py @@ -41,14 +41,18 @@ def test(self): "url": "base://example/prefix", "connection_string": "1234567", } - remote = RemoteBASE(None, config) + + class RemoteTest(RemoteBASE): + SUPPORTED_CHECKSUM_TYPES = ["checksum"] + + remote = RemoteTest(None, config) remote.PARAM_CHECKSUM = "checksum" remote.path_info = PathBASE(None, None) remote.url = "" remote.prefix = "" path_info = PathBASE("example", None) - checksum_info = {remote.PARAM_CHECKSUM: "1234567890"} + checksum_info = {remote.checksum_type(): "1234567890"} with mock.patch.object(remote, "_checkout") as mock_checkout: with mock.patch.object(remote, "_save") as mock_save: From befb5ee3c8e03b1c8d157768e84ebd69eb4b0352 Mon Sep 17 00:00:00 2001 From: Ivan Shcheklein Date: Mon, 27 May 2019 16:40:42 -0700 Subject: [PATCH 20/20] move dict filter to utils collections --- dvc/config.py | 2 +- dvc/utils/checksum.py | 30 +++--------------------------- dvc/utils/collections.py | 31 ++++++++++++++++++++++++++++++- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index 1f48344db0..36c2e4e15c 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -120,7 +120,7 @@ def is_percent(val): Returns: bool: True if 0<=value<=100, False otherwise. """ - return int(val) >= 0 and int(val) <= 100 + return 0 <= int(val) <= 100 class Config(object): # pylint: disable=too-many-instance-attributes diff --git a/dvc/utils/checksum.py b/dvc/utils/checksum.py index 202e6b7247..21c0d85e86 100644 --- a/dvc/utils/checksum.py +++ b/dvc/utils/checksum.py @@ -2,7 +2,8 @@ from __future__ import unicode_literals -from dvc.utils.compat import str, builtin_str, open +from dvc.utils.collections import dict_filter +from dvc.utils.compat import str, open import os import json @@ -116,32 +117,7 @@ def bytes_checksum(byts, checksum_type=CHECKSUM_MD5): return hasher.hexdigest() -def dict_filter(d, exclude=[]): - """ - Exclude specified keys from a nested dict - """ - - if isinstance(d, list): - ret = [] - for e in d: - ret.append(dict_filter(e, exclude)) - return ret - elif isinstance(d, dict): - ret = {} - for k, v in d.items(): - if isinstance(k, builtin_str): - k = str(k) - - assert isinstance(k, str) - if k in exclude: - continue - ret[k] = dict_filter(v, exclude) - return ret - - return d - - -def dict_checksum(d, exclude=[], checksum_type=CHECKSUM_MD5): +def dict_checksum(d, exclude=None, checksum_type=CHECKSUM_MD5): filtered = dict_filter(d, exclude) byts = json.dumps(filtered, sort_keys=True).encode("utf-8") return bytes_checksum(byts, checksum_type) diff --git a/dvc/utils/collections.py b/dvc/utils/collections.py index e1367d10a8..087a16683d 100644 --- a/dvc/utils/collections.py +++ b/dvc/utils/collections.py @@ -1,5 +1,6 @@ from __future__ import absolute_import, unicode_literals -from dvc.utils.compat import Mapping + +from dvc.utils.compat import Mapping, builtin_str # just simple check for Nones and emtpy strings @@ -52,3 +53,31 @@ def is_same_type(a, b): src.__class__.__name__, dest.__class__.__name__ ) ) + + +def dict_filter(d, exclude=None): + """ + Exclude specified keys from a nested dict + """ + + if not exclude: + return d + + if isinstance(d, list): + ret = [] + for e in d: + ret.append(dict_filter(e, exclude)) + return ret + elif isinstance(d, dict): + ret = {} + for k, v in d.items(): + if isinstance(k, builtin_str): + k = str(k) + + assert isinstance(k, str) + if k in exclude: + continue + ret[k] = dict_filter(v, exclude) + return ret + + return d