From 5aae88b316edc88c1bae193061af5bef120e1902 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Wed, 17 Jul 2019 14:45:34 +0300 Subject: [PATCH 1/8] local: don't try to protect file if it is already protected Signed-off-by: Ruslan Kuprieiev --- dvc/remote/local/__init__.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index 5241031d97..0c8bc2d68f 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -8,6 +8,7 @@ import os import stat +import errno from shortuuid import uuid import shutil import logging @@ -487,9 +488,18 @@ def unprotect(self, path_info): @staticmethod def protect(path_info): - os.chmod( - fspath_py35(path_info), stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH - ) + path = fspath_py35(path_info) + mode = stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH + + try: + os.chmod(path, mode) + except OSError as exc: + if exc.errno not in [errno.EPERM, errno.EACCES]: + raise + + actual = os.stat(path).st_mode + if actual & mode != mode: + raise def _get_unpacked_dir_path_info(self, checksum): info = self.checksum_to_path_info(checksum) From 9c5fd0c76cf16d2697921a8c86abc0fad4458f68 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Wed, 17 Jul 2019 15:14:07 +0300 Subject: [PATCH 2/8] dvc: support `group` cache mode In this mode, dvc will set appropriate file/dir permissions to support sharing cache between different users within the same group. Fixes #2287 Signed-off-by: Ruslan Kuprieiev --- dvc/cache.py | 2 ++ dvc/config.py | 3 +++ dvc/remote/base.py | 29 ++++++++++++++++----- dvc/remote/local/__init__.py | 50 ++++++++++++++++++++++++------------ dvc/utils/__init__.py | 46 +++++++++++++++++++++++++-------- tests/func/test_cache.py | 26 +++++++++++++++++++ 6 files changed, 123 insertions(+), 33 deletions(-) diff --git a/dvc/cache.py b/dvc/cache.py index c834ed0ae0..24b9022a96 100644 --- a/dvc/cache.py +++ b/dvc/cache.py @@ -46,6 +46,7 @@ def __init__(self, repo): cache_dir = config.get(Config.SECTION_CACHE_DIR, default_cache_dir) cache_type = config.get(Config.SECTION_CACHE_TYPE) protected = config.get(Config.SECTION_CACHE_PROTECTED) + shared = config.get(Config.SECTION_CACHE_SHARED) settings = { Config.PRIVATE_CWD: config.get( @@ -54,6 +55,7 @@ def __init__(self, repo): Config.SECTION_REMOTE_URL: cache_dir, Config.SECTION_CACHE_TYPE: cache_type, Config.SECTION_CACHE_PROTECTED: protected, + Config.SECTION_CACHE_SHARED: shared, } self.local = Remote(repo, **settings) diff --git a/dvc/config.py b/dvc/config.py index 10b228b566..d5a3a4ee69 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -160,6 +160,8 @@ class Config(object): # pylint: disable=too-many-instance-attributes SECTION_CACHE_TYPE = "type" SECTION_CACHE_TYPE_SCHEMA = supported_cache_type SECTION_CACHE_PROTECTED = "protected" + SECTION_CACHE_SHARED = "shared" + SECTION_CACHE_SHARED_SCHEMA = And(Use(str.lower), Choices("group")) SECTION_CACHE_LOCAL = "local" SECTION_CACHE_S3 = "s3" SECTION_CACHE_GS = "gs" @@ -177,6 +179,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes Optional(SECTION_CACHE_DIR): str, Optional(SECTION_CACHE_TYPE, default=None): SECTION_CACHE_TYPE_SCHEMA, Optional(SECTION_CACHE_PROTECTED, default=False): BOOL_SCHEMA, + Optional(SECTION_CACHE_SHARED): SECTION_CACHE_SHARED_SCHEMA, Optional(PRIVATE_CWD): str, Optional(SECTION_CACHE_SLOW_LINK_WARNING, default=True): BOOL_SCHEMA, } diff --git a/dvc/remote/base.py b/dvc/remote/base.py index edacef7249..e596336bf6 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from dvc.ignore import DvcIgnore -from dvc.utils.compat import str, basestring, urlparse, fspath_py35, makedirs +from dvc.utils.compat import str, basestring, urlparse import os import json @@ -20,7 +20,14 @@ DvcIgnoreInCollectedDirError, ) from dvc.progress import progress, ProgressCallback -from dvc.utils import LARGE_DIR_SIZE, tmp_fname, to_chunks, move, relpath +from dvc.utils import ( + LARGE_DIR_SIZE, + tmp_fname, + to_chunks, + move, + relpath, + makedirs, +) from dvc.state import StateNoop from dvc.path_info import PathInfo, URLInfo @@ -202,6 +209,7 @@ def get_dir_checksum(self, path_info): checksum, tmp_info = self._get_dir_info_checksum(dir_info) new_info = self.cache.checksum_to_path_info(checksum) if self.cache.changed_cache_file(checksum): + self.cache.makedirs(new_info.parent) self.cache.move(tmp_info, new_info) self.state.save(path_info, checksum) @@ -348,7 +356,7 @@ def changed(self, path_info, checksum_info): logger.debug("'{}' hasn't changed.".format(path_info)) return False - def link(self, from_info, to_info, link_type=None): + def link(self, from_info, to_info, link_types=None): self.copy(from_info, to_info) def _save_file(self, path_info, checksum, save_link=True): @@ -356,6 +364,7 @@ def _save_file(self, path_info, checksum, save_link=True): cache_info = self.checksum_to_path_info(checksum) if self.changed_cache(checksum): + self.makedirs(cache_info.parent) self.move(path_info, cache_info) else: self.remove(path_info) @@ -456,7 +465,15 @@ def upload(self, from_info, to_info, name=None, no_progress_bar=False): return 0 - def download(self, from_info, to_info, name=None, no_progress_bar=False): + def download( + self, + from_info, + to_info, + name=None, + no_progress_bar=False, + file_mode=None, + dir_mode=None, + ): if not hasattr(self, "_download"): raise RemoteActionNotImplemented("download", self.scheme) @@ -479,7 +496,7 @@ def download(self, from_info, to_info, name=None, no_progress_bar=False): # lets at least show start and finish progress.update_target(name, 0, None) - makedirs(fspath_py35(to_info.parent), exist_ok=True) + makedirs(to_info.parent, exist_ok=True, mode=dir_mode) tmp_file = tmp_fname(to_info) try: @@ -491,7 +508,7 @@ def download(self, from_info, to_info, name=None, no_progress_bar=False): logger.exception(msg.format(from_info, to_info)) return 1 # 1 fail - move(tmp_file, fspath_py35(to_info)) + move(tmp_file, to_info, mode=file_mode) if not no_progress_bar: progress.finish_target(name) diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index 0c8bc2d68f..fafe8e7521 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -4,7 +4,7 @@ from dvc.scheme import Schemes from dvc.remote.local.slow_link_detection import slow_link_guard -from dvc.utils.compat import str, makedirs, fspath_py35 +from dvc.utils.compat import str, fspath_py35 import os import stat @@ -12,6 +12,7 @@ from shortuuid import uuid import shutil import logging +from functools import partial from dvc.system import System from dvc.remote.base import ( @@ -30,6 +31,7 @@ walk_files, relpath, dvc_walk, + makedirs, ) from dvc.config import Config from dvc.exceptions import DvcException @@ -57,10 +59,19 @@ class RemoteLOCAL(RemoteBASE): "reflink": System.reflink, } + SHARED_MODE_MAP = {None: (0o644, 0o755), "group": (0o664, 0o775)} + def __init__(self, repo, config): super(RemoteLOCAL, self).__init__(repo, config) self.protected = config.get(Config.SECTION_CACHE_PROTECTED, False) + shared = config.get(Config.SECTION_CACHE_SHARED) + self._file_mode, self._dir_mode = self.SHARED_MODE_MAP[shared] + + if self.protected: + # cache files are set to be read-only for everyone + self._file_mode = stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH + types = config.get(Config.SECTION_CACHE_TYPE, None) if types: if isinstance(types, str): @@ -120,10 +131,9 @@ def exists(self, path_info): return os.path.lexists(fspath_py35(path_info)) def makedirs(self, path_info): - if not self.exists(path_info): - os.makedirs(fspath_py35(path_info)) + makedirs(path_info, exist_ok=True, mode=self._dir_mode) - def link(self, from_info, to_info, link_type=None): + def link(self, from_info, to_info, link_types=None): from_path = from_info.fspath to_path = to_info.fspath @@ -141,10 +151,8 @@ def link(self, from_info, to_info, link_type=None): logger.debug(msg) return - if not link_type: + if not link_types: link_types = self.cache_types - else: - link_types = [link_type] self._try_links(from_info, to_info, link_types) @@ -235,15 +243,12 @@ def move(self, from_info, to_info): if from_info.scheme != "local" or to_info.scheme != "local": raise NotImplementedError - inp = from_info.fspath - outp = to_info.fspath + if self.isfile(from_info): + mode = self._file_mode + else: + mode = self._dir_mode - # moving in two stages to make the whole operation atomic in - # case inp and outp are in different filesystems and actual - # physical copying of data is happening - tmp = "{}.{}".format(outp, str(uuid())) - move(inp, tmp) - move(tmp, outp) + move(from_info, to_info, mode=mode) def cache_exists(self, md5s, jobs=None): return [ @@ -374,7 +379,11 @@ def _process( ) if download: - func = remote.download + func = partial( + remote.download, + dir_mode=self._dir_mode, + file_mode=self._file_mode, + ) status = STATUS_DELETED else: func = remote.upload @@ -494,6 +503,8 @@ def protect(path_info): try: os.chmod(path, mode) except OSError as exc: + # In share cache scenario, we might not own the cache file, so we + # need to check if cache file is already protected. if exc.errno not in [errno.EPERM, errno.EACCES]: raise @@ -534,8 +545,13 @@ def _create_unpacked_dir(self, checksum, dir_info, unpacked_dir_info): entry[self.PARAM_CHECKSUM] ) relative_path = entry[self.PARAM_RELPATH] + # In shared cache mode some cache files might not be owned by the + # user, so we need to use symlinks because, unless + # /proc/sys/fs/protected_hardlinks is disabled, the user is not + # allowed to create hardlinks to files that he doesn't own. + link_types = ["hardlink", "symlink"] self.link( - entry_cache_info, unpacked_dir_info / relative_path, "hardlink" + entry_cache_info, unpacked_dir_info / relative_path, link_types ) self.state.save(unpacked_dir_info, checksum) diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index b7c98211f3..c8aeae46ba 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -9,8 +9,9 @@ cast_bytes_py2, StringIO, fspath_py35, + fspath, + makedirs as _makedirs, ) -from dvc.utils.compat import fspath import os import sys @@ -26,6 +27,7 @@ import re import logging +from shortuuid import uuid from ruamel.yaml import YAML @@ -145,18 +147,44 @@ def copyfile(src, dest, no_progress_bar=False, name=None): progress.finish_target(name) -def move(src, dst): +def makedirs(path, exist_ok=False, mode=None): + path = fspath_py35(path) + + if mode is None: + _makedirs(path, exist_ok=exist_ok) + return + + umask = os.umask(0) + try: + _makedirs(path, exist_ok=exist_ok, mode=mode) + finally: + os.umask(umask) + + +def move(src, dst, mode=None): + """Atomically move src to dst and chmod it with mode. + + Moving is performed in two stages to make the whole operation atomic in + case src and dst are on different filesystems and actual physical copying + of data is happening. + """ + + src = fspath_py35(src) + dst = fspath_py35(dst) + dst = os.path.abspath(dst) - dname = os.path.dirname(dst) - if not os.path.exists(dname): - os.makedirs(dname) + tmp = "{}.{}".format(dst, str(uuid())) if os.path.islink(src): - shutil.copy(os.readlink(src), dst) + shutil.copy(os.readlink(src), tmp) os.unlink(src) - return + else: + shutil.move(src, tmp) + + if mode is not None: + os.chmod(tmp, mode) - shutil.move(src, dst) + shutil.move(tmp, dst) def _chmod(func, p, excinfo): @@ -254,8 +282,6 @@ def convert_to_unicode(data): def tmp_fname(fname): """ Temporary name for a partial download """ - from shortuuid import uuid - return fspath(fname) + "." + str(uuid()) + ".tmp" diff --git a/tests/func/test_cache.py b/tests/func/test_cache.py index beec489914..4f8ffa950d 100644 --- a/tests/func/test_cache.py +++ b/tests/func/test_cache.py @@ -1,4 +1,6 @@ import os +import stat +import pytest import shutil import configobj @@ -175,3 +177,27 @@ def test_relative_path(self): class TestShouldCacheBeReflinkOrCopyByDefault(TestDvc): def test(self): self.assertEqual(self.dvc.cache.local.cache_types, ["reflink", "copy"]) + + +@pytest.mark.skipif(os.name == "nt", reason="Not supported for Windows.") +@pytest.mark.parametrize( + "protected,dir_mode,file_mode", + [(False, 0o775, 0o664), (True, 0o775, 0o444)], +) +def test_shared_cache(repo_dir, dvc_repo, protected, dir_mode, file_mode): + assert main(["config", "cache.shared", "group"]) == 0 + + if protected: + assert main(["config", "cache.protected", "true"]) == 0 + + assert main(["add", repo_dir.FOO]) == 0 + assert main(["add", repo_dir.DATA_DIR]) == 0 + + for root, dnames, fnames in os.walk(dvc_repo.cache.local.cache_dir): + for dname in dnames: + path = os.path.join(root, dname) + assert stat.S_IMODE(os.stat(path).st_mode) == dir_mode + + for fname in fnames: + path = os.path.join(root, fname) + assert stat.S_IMODE(os.stat(path).st_mode) == file_mode From b2fe3f7a7195752d225970e0580f59d5eda3ffad Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Fri, 9 Aug 2019 16:42:00 +0300 Subject: [PATCH 3/8] utils: remove unnecessary else Signed-off-by: Ruslan Kuprieiev --- dvc/utils/__init__.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index c8aeae46ba..0f462cb3dc 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -80,8 +80,8 @@ def file_md5(fname): progress.finish_target(name) return (hash_md5.hexdigest(), hash_md5.digest()) - else: - return (None, None) + + return (None, None) def bytes_md5(byts): @@ -101,14 +101,13 @@ def fix_key(k): if isinstance(d, list): return [dict_filter(e, exclude) for e in d] - elif isinstance(d, dict): + if isinstance(d, dict): items = ((fix_key(k), v) for k, v in d.items()) return { k: dict_filter(v, exclude) for k, v in items if k not in exclude } - else: - return d + return d def dict_md5(d, exclude=()): @@ -272,12 +271,11 @@ def fix_env(env=None): def convert_to_unicode(data): if isinstance(data, builtin_str): return str(data) - elif isinstance(data, dict): + if isinstance(data, dict): return dict(map(convert_to_unicode, data.items())) - elif isinstance(data, list) or isinstance(data, tuple): + if isinstance(data, list) or isinstance(data, tuple): return type(data)(map(convert_to_unicode, data)) - else: - return data + return data def tmp_fname(fname): From 713e288adcde6dd5c7bf700b679ecb12f258cc54 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Fri, 9 Aug 2019 16:45:08 +0300 Subject: [PATCH 4/8] remote: base: checkout: remove unnecessary return Signed-off-by: Ruslan Kuprieiev --- dvc/remote/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index e596336bf6..9608af5a90 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -773,7 +773,7 @@ def checkout( msg = "Checking out '{}' with cache '{}'." logger.debug(msg.format(str(path_info), checksum)) - return self._checkout(path_info, checksum, force, progress_callback) + self._checkout(path_info, checksum, force, progress_callback) def _checkout( self, path_info, checksum, force=False, progress_callback=None From 24428e47c3f9c3269cae2a6e8e933f077f310217 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Fri, 9 Aug 2019 16:47:33 +0300 Subject: [PATCH 5/8] remote: local: cache_exists: match parameters from the BASE Signed-off-by: Ruslan Kuprieiev --- dvc/remote/local/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index fafe8e7521..4876f8b12d 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -250,10 +250,10 @@ def move(self, from_info, to_info): move(from_info, to_info, mode=mode) - def cache_exists(self, md5s, jobs=None): + def cache_exists(self, checksums, jobs=None): return [ checksum - for checksum in progress(md5s) + for checksum in progress(checksums) if not self.changed_cache_file(checksum) ] From 640e73757094faf0c6fa86ec3670f21634883cc9 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Fri, 9 Aug 2019 16:49:14 +0300 Subject: [PATCH 6/8] remote: base: remove unused argument Signed-off-by: Ruslan Kuprieiev --- dvc/remote/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 9608af5a90..d83c64ccf0 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -803,5 +803,5 @@ def extract_used_local_checksums(self, cinfos): def _changed_unpacked_dir(self, checksum): return True - def _update_unpacked_dir(self, checksum, progress_callback=None): + def _update_unpacked_dir(self, checksum): pass From 9955f1c64f1a509546fd04b973b19cf2494fcf84 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Fri, 9 Aug 2019 16:52:22 +0300 Subject: [PATCH 7/8] utils: merge isinstance checks Signed-off-by: Ruslan Kuprieiev --- dvc/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index 0f462cb3dc..af7356575c 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -273,7 +273,7 @@ def convert_to_unicode(data): return str(data) if isinstance(data, dict): return dict(map(convert_to_unicode, data.items())) - if isinstance(data, list) or isinstance(data, tuple): + if isinstance(data, (list, tuple)): return type(data)(map(convert_to_unicode, data)) return data From 2b36d25f492b215b72bc5b182a06d58911974067 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Fri, 9 Aug 2019 16:54:23 +0300 Subject: [PATCH 8/8] remote: base: remove unused argument Signed-off-by: Ruslan Kuprieiev --- dvc/remote/base.py | 2 +- dvc/remote/local/__init__.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index d83c64ccf0..dddd200b67 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -356,7 +356,7 @@ def changed(self, path_info, checksum_info): logger.debug("'{}' hasn't changed.".format(path_info)) return False - def link(self, from_info, to_info, link_types=None): + def link(self, from_info, to_info): self.copy(from_info, to_info) def _save_file(self, path_info, checksum, save_link=True): diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index 4876f8b12d..9c1a242a0b 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -133,7 +133,10 @@ def exists(self, path_info): def makedirs(self, path_info): makedirs(path_info, exist_ok=True, mode=self._dir_mode) - def link(self, from_info, to_info, link_types=None): + def link(self, from_info, to_info): + self._link(from_info, to_info, self.cache_types) + + def _link(self, from_info, to_info, link_types): from_path = from_info.fspath to_path = to_info.fspath @@ -151,9 +154,6 @@ def link(self, from_info, to_info, link_types=None): logger.debug(msg) return - if not link_types: - link_types = self.cache_types - self._try_links(from_info, to_info, link_types) @classmethod @@ -165,7 +165,7 @@ def _get_link_method(cls, link_type): "Cache type: '{}' not supported!".format(link_type) ) - def _link(self, from_info, to_info, link_method): + def _do_link(self, from_info, to_info, link_method): if self.exists(to_info): raise DvcException("Link '{}' already exists!".format(to_info)) else: @@ -188,7 +188,7 @@ def _try_links(self, from_info, to_info, link_types): while i > 0: link_method = self._get_link_method(link_types[0]) try: - self._link(from_info, to_info, link_method) + self._do_link(from_info, to_info, link_method) return except DvcException as exc: @@ -550,7 +550,7 @@ def _create_unpacked_dir(self, checksum, dir_info, unpacked_dir_info): # /proc/sys/fs/protected_hardlinks is disabled, the user is not # allowed to create hardlinks to files that he doesn't own. link_types = ["hardlink", "symlink"] - self.link( + self._link( entry_cache_info, unpacked_dir_info / relative_path, link_types )