From bbd27527ff93a9ef4482513d48c56953d4cd10b8 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Wed, 6 Feb 2019 18:48:12 +0200 Subject: [PATCH 1/7] remote: RFC: remote.save() Signed-off-by: Ruslan Kuprieiev --- dvc/output/base.py | 9 ++++----- dvc/output/local.py | 3 ++- dvc/remote/base.py | 2 +- dvc/remote/hdfs.py | 6 ++---- dvc/remote/local.py | 18 +++++++----------- dvc/remote/s3.py | 6 ++---- dvc/remote/ssh.py | 6 ++---- 7 files changed, 20 insertions(+), 30 deletions(-) diff --git a/dvc/output/base.py b/dvc/output/base.py index abbcaf402a..c841200b4b 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -137,11 +137,10 @@ def status(self): return {} def save(self): - if not self.use_cache: - self.info = self.remote.save_info(self.path_info) - else: - self.info = getattr(self.repo.cache, self.scheme).save( - self.path_info + self.info = self.remote.save_info(self.path_info) + if self.use_cache: + getattr(self.repo.cache, self.scheme).save( + self.path_info, self.info ) def dumpd(self): diff --git a/dvc/output/local.py b/dvc/output/local.py index 1324e13771..ea47f92854 100644 --- a/dvc/output/local.py +++ b/dvc/output/local.py @@ -117,4 +117,5 @@ def save(self): if self.use_cache: self.repo.scm.ignore(self.path) - self.info = self.repo.cache.local.save(self.path_info) + self.info = self.remote.save_info(self.path_info) + self.project.cache.local.save(self.path_info, self.info) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index b08b83986c..5793751e18 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -172,7 +172,7 @@ def changed(self, path_info, checksum_info): logger.debug("'{}' hasn't changed.".format(path_info)) return False - def save(self, path_info): + def save(self, path_info, checksum_info): raise RemoteActionNotImplemented("save", self.scheme) def download( diff --git a/dvc/remote/hdfs.py b/dvc/remote/hdfs.py index 5d36500cb1..287bb66ae6 100644 --- a/dvc/remote/hdfs.py +++ b/dvc/remote/hdfs.py @@ -88,20 +88,18 @@ def save_info(self, path_info): return {self.PARAM_CHECKSUM: self.checksum(path_info)} - def save(self, path_info): + def save(self, path_info, checksum_info): if path_info["scheme"] != "hdfs": raise NotImplementedError assert path_info.get("path") - checksum = self.checksum(path_info) + checksum = checksum_info[self.PARAM_CHECKSUM] dest = path_info.copy() dest["path"] = self.checksum_to_path(checksum) self.copy(path_info, dest) - return {self.PARAM_CHECKSUM: checksum} - def remove(self, path_info): if path_info["scheme"] != "hdfs": raise NotImplementedError diff --git a/dvc/remote/local.py b/dvc/remote/local.py index 6a5414189d..7526bad880 100644 --- a/dvc/remote/local.py +++ b/dvc/remote/local.py @@ -370,9 +370,7 @@ def _move(self, inp, outp): move(inp, tmp) move(tmp, outp) - def _save_file(self, path_info): - path = path_info["path"] - md5 = self.state.update(path) + def _save_file(self, path, md5): assert md5 is not None cache = self.get(md5) @@ -393,9 +391,8 @@ def _save_file(self, path_info): return {self.PARAM_CHECKSUM: md5} - def _save_dir(self, path_info): - path = path_info["path"] - md5, dir_info = self.state.update_info(path) + def _save_dir(self, path, md5): + dir_info = self.load_dir_cache(md5) dir_relpath = os.path.relpath(path) dir_size = len(dir_info) bar = dir_size > LARGE_DIR_SIZE @@ -429,9 +426,7 @@ def _save_dir(self, path_info): if bar: progress.finish_target(dir_relpath) - return {self.PARAM_CHECKSUM: md5} - - def save(self, path_info): + def save(self, path_info, checksum_info): if path_info["scheme"] != "local": raise NotImplementedError @@ -442,10 +437,11 @@ def save(self, path_info): msg.format(os.path.relpath(path), os.path.relpath(self.cache_dir)) ) + md5 = checksum_info[self.PARAM_CHECKSUM] if os.path.isdir(path): - return self._save_dir(path_info) + self._save_dir(path, md5) else: - return self._save_file(path_info) + self._save_file(path, md5) def save_info(self, path_info): if path_info["scheme"] != "local": diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index d7c3d4f19a..ad8349fa6d 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -109,18 +109,16 @@ def copy(self, from_info, to_info, s3=None): source = {"Bucket": from_info["bucket"], "Key": from_info["path"]} self.s3.copy(source, to_info["bucket"], to_info["path"]) - def save(self, path_info): + def save(self, path_info, checksum_info): if path_info["scheme"] != "s3": raise NotImplementedError - etag = self.get_etag(path_info["bucket"], path_info["path"]) + etag = checksum_info[self.PARAM_CHECKSUM] path = self.checksum_to_path(etag) to_info = {"scheme": "s3", "bucket": self.bucket, "path": path} self.copy(path_info, to_info) - return {self.PARAM_CHECKSUM: etag} - def remove(self, path_info): if path_info["scheme"] != "s3": raise NotImplementedError diff --git a/dvc/remote/ssh.py b/dvc/remote/ssh.py index 1a2f6ee5ea..2b175e45c8 100644 --- a/dvc/remote/ssh.py +++ b/dvc/remote/ssh.py @@ -244,18 +244,16 @@ def save_info(self, path_info): return {self.PARAM_CHECKSUM: self.md5(path_info)} - def save(self, path_info): + def save(self, path_info, checksum_info): if path_info["scheme"] != "ssh": raise NotImplementedError - md5 = self.md5(path_info) + md5 = checksum_info[self.PARAM_CHECKSUM] dest = path_info.copy() dest["path"] = self.checksum_to_path(md5) self.copy(path_info, dest) - return {self.PARAM_CHECKSUM: md5} - def remove(self, path_info): if path_info["scheme"] != "ssh": raise NotImplementedError From 0eab7ed29ec96d6260cf02fbc1b3e8a11deda63a Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Wed, 6 Feb 2019 20:24:21 +0200 Subject: [PATCH 2/7] stage: split save() into save() + commit() Signed-off-by: Ruslan Kuprieiev --- dvc/output/base.py | 3 +++ dvc/output/local.py | 1 - dvc/repo/add.py | 1 + dvc/stage.py | 5 +++++ 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/dvc/output/base.py b/dvc/output/base.py index c841200b4b..e9cfd500ae 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -138,6 +138,8 @@ def status(self): def save(self): self.info = self.remote.save_info(self.path_info) + + def commit(self): if self.use_cache: getattr(self.repo.cache, self.scheme).save( self.path_info, self.info @@ -190,6 +192,7 @@ def move(self, out): self.url = out.url self.path_info = out.path_info self.save() + self.commit() if self.scheme == "local" and self.use_cache and self.is_local: self.repo.scm.ignore(self.path) diff --git a/dvc/output/local.py b/dvc/output/local.py index ea47f92854..142d18e168 100644 --- a/dvc/output/local.py +++ b/dvc/output/local.py @@ -118,4 +118,3 @@ def save(self): self.repo.scm.ignore(self.path) self.info = self.remote.save_info(self.path_info) - self.project.cache.local.save(self.path_info, self.info) diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 3e727c1762..6c1138cb18 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -30,6 +30,7 @@ def add(self, fname, recursive=False): continue stage.save() + stage.commit() stages.append(stage) self.check_dag(self.stages() + stages) diff --git a/dvc/stage.py b/dvc/stage.py index ff3b616920..44c88d2d28 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -526,6 +526,10 @@ def save(self): self.md5 = self._compute_md5() + def commit(self): + for out in self.outs: + out.commit() + def _check_missing_deps(self): missing = [dep for dep in self.deps if not dep.exists] @@ -625,6 +629,7 @@ def run(self, dry=False, resume=False): if not dry: self.save() + self.commit() def check_missing_outputs(self): paths = [ From 3bc0b9728bda1de3b3bb4b1ababe10d6a07551bc Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Wed, 6 Feb 2019 20:39:31 +0200 Subject: [PATCH 3/7] run: introduce --no-commit Signed-off-by: Ruslan Kuprieiev --- dvc/command/run.py | 7 +++++++ dvc/repo/run.py | 3 ++- dvc/stage.py | 5 +++-- tests/test_run.py | 11 +++++++++++ 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/dvc/command/run.py b/dvc/command/run.py index cae5696540..8222cbcdab 100644 --- a/dvc/command/run.py +++ b/dvc/command/run.py @@ -42,6 +42,7 @@ def run(self): overwrite=overwrite, ignore_build_cache=self.args.ignore_build_cache, remove_outs=self.args.remove_outs, + no_commit=self.args.no_commit, ) except DvcException: logger.error("failed to run command") @@ -167,6 +168,12 @@ def add_parser(subparsers, parent_parser): default=False, help="Remove outputs before running the command.", ) + run_parser.add_argument( + "--no-commit", + action="store_true", + default=False, + help="Don't put files/directories into cache.", + ) run_parser.add_argument( "command", nargs=argparse.REMAINDER, help="Command to execute." ) diff --git a/dvc/repo/run.py b/dvc/repo/run.py index a2c1426551..b782655cff 100644 --- a/dvc/repo/run.py +++ b/dvc/repo/run.py @@ -15,6 +15,7 @@ def run( overwrite=False, ignore_build_cache=False, remove_outs=False, + no_commit=False, ): from dvc.stage import Stage @@ -53,7 +54,7 @@ def run( self.files_to_git_add = [] with self.state: if not no_exec: - stage.run() + stage.run(no_commit=no_commit) stage.dump() diff --git a/dvc/stage.py b/dvc/stage.py index 44c88d2d28..3f9a77b90f 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -589,7 +589,7 @@ def _run(self): if p.returncode != 0: raise StageCmdFailedError(self) - def run(self, dry=False, resume=False): + def run(self, dry=False, resume=False, no_commit=False): if self.locked: logger.info( "Verifying outputs in locked stage '{stage}'".format( @@ -629,7 +629,8 @@ def run(self, dry=False, resume=False): if not dry: self.save() - self.commit() + if not no_commit: + self.commit() def check_missing_outputs(self): paths = [ diff --git a/tests/test_run.py b/tests/test_run.py index 97d979f288..ed48b9db1e 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -628,3 +628,14 @@ def test(self): self.cmd += " arg" with self.assertRaises(StageFileAlreadyExistsError): self._run() + + +class TestRunNoCommit(TestDvc): + def test(self): + fname = "test" + ret = main( + ["run", "-o", "foo", "--no-commit", "echo", "test", ">", fname] + ) + self.assertEqual(ret, 0) + self.assertTrue(os.path.isfile(fname)) + self.assertEqual(len(os.listdir(self.dvc.cache.local.cache_dir)), 0) From 99f4107d585d23a30ef2d792aa22e543b1b5fc3c Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Wed, 6 Feb 2019 22:58:22 +0200 Subject: [PATCH 4/7] add: introduce --no-commit Signed-off-by: Ruslan Kuprieiev --- dvc/command/add.py | 12 +++++++++++- dvc/repo/add.py | 5 +++-- tests/test_add.py | 8 ++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/dvc/command/add.py b/dvc/command/add.py index 79c523aceb..eb0ac3af48 100644 --- a/dvc/command/add.py +++ b/dvc/command/add.py @@ -9,7 +9,11 @@ class CmdAdd(CmdBase): def run(self): for target in self.args.targets: try: - self.repo.add(target, recursive=self.args.recursive) + self.repo.add( + target, + recursive=self.args.recursive, + no_commit=self.args.no_commit, + ) except DvcException: logger.error("failed to add file") return 1 @@ -28,6 +32,12 @@ def add_parser(subparsers, parent_parser): default=False, help="Recursively add each file under the directory.", ) + add_parser.add_argument( + "--no-commit", + action="store_true", + default=False, + help="Don't put files/directories into cache.", + ) add_parser.add_argument( "targets", nargs="+", help="Input files/directories." ) diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 6c1138cb18..ff82b17aa7 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -1,7 +1,7 @@ import os -def add(self, fname, recursive=False): +def add(self, fname, recursive=False, no_commit=False): from dvc.stage import Stage fnames = [] @@ -30,7 +30,8 @@ def add(self, fname, recursive=False): continue stage.save() - stage.commit() + if not no_commit: + stage.commit() stages.append(stage) self.check_dag(self.stages() + stages) diff --git a/tests/test_add.py b/tests/test_add.py index 3fa99a3265..bceb84aa65 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -264,3 +264,11 @@ def test(self): ) self.assertEqual(ret, 0) self.assertEqual(file_md5_counter.mock.call_count, 3) + + +class TestAddNoCommit(TestDvc): + def test(self): + ret = main(["add", self.FOO, "--no-commit"]) + self.assertEqual(ret, 0) + self.assertTrue(os.path.isfile(self.FOO)) + self.assertEqual(len(os.listdir(self.dvc.cache.local.cache_dir)), 0) From 1f18119b744383dc61ad0fc801553d911923a02a Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Wed, 6 Feb 2019 23:46:42 +0200 Subject: [PATCH 5/7] repro: introduce --no-commit Signed-off-by: Ruslan Kuprieiev --- dvc/command/repro.py | 7 +++++++ dvc/repo/reproduce.py | 28 ++++++++++++++++++++++------ dvc/stage.py | 6 ++++-- tests/test_repro.py | 8 ++++++++ 4 files changed, 41 insertions(+), 8 deletions(-) diff --git a/dvc/command/repro.py b/dvc/command/repro.py index 59727c3d88..05b82e879f 100644 --- a/dvc/command/repro.py +++ b/dvc/command/repro.py @@ -33,6 +33,7 @@ def run(self): pipeline=self.args.pipeline, all_pipelines=self.args.all_pipelines, ignore_build_cache=self.args.ignore_build_cache, + no_commit=self.args.no_commit, ) if len(stages) == 0: @@ -124,4 +125,10 @@ def add_parser(subparsers, parent_parser): help="Reproduce all descendants of a changed stage even if their " "direct dependencies didn't change.", ) + repro_parser.add_argument( + "--no-commit", + action="store_true", + default=False, + help="Don't put files/directories into cache.", + ) repro_parser.set_defaults(func=CmdRepro) diff --git a/dvc/repo/reproduce.py b/dvc/repo/reproduce.py index a367f396f4..0bc6d8656a 100644 --- a/dvc/repo/reproduce.py +++ b/dvc/repo/reproduce.py @@ -6,7 +6,7 @@ from dvc.exceptions import ReproductionError -def _reproduce_stage(stages, node, force, dry, interactive): +def _reproduce_stage(stages, node, force, dry, interactive, no_commit): stage = stages[node] if stage.locked: @@ -15,7 +15,9 @@ def _reproduce_stage(stages, node, force, dry, interactive): " not going to be reproduced.".format(path=stage.relpath) ) - stage = stage.reproduce(force=force, dry=dry, interactive=interactive) + stage = stage.reproduce( + force=force, dry=dry, interactive=interactive, no_commit=no_commit + ) if not stage: return [] @@ -35,6 +37,7 @@ def reproduce( pipeline=False, all_pipelines=False, ignore_build_cache=False, + no_commit=False, ): from dvc.stage import Stage @@ -75,6 +78,7 @@ def reproduce( dry=dry, interactive=interactive, ignore_build_cache=ignore_build_cache, + no_commit=no_commit, ) ret.extend(stages) @@ -91,6 +95,7 @@ def _reproduce( dry=False, interactive=False, ignore_build_cache=False, + no_commit=False, ): import networkx as nx from dvc.stage import Stage @@ -102,23 +107,34 @@ def _reproduce( if recursive: ret = _reproduce_stages( - G, stages, node, force, dry, interactive, ignore_build_cache + G, + stages, + node, + force, + dry, + interactive, + ignore_build_cache, + no_commit, ) else: - ret = _reproduce_stage(stages, node, force, dry, interactive) + ret = _reproduce_stage( + stages, node, force, dry, interactive, no_commit + ) return ret def _reproduce_stages( - G, stages, node, force, dry, interactive, ignore_build_cache + G, stages, node, force, dry, interactive, ignore_build_cache, no_commit ): import networkx as nx result = [] for n in nx.dfs_postorder_nodes(G, node): try: - ret = _reproduce_stage(stages, n, force, dry, interactive) + ret = _reproduce_stage( + stages, n, force, dry, interactive, no_commit + ) if len(ret) != 0 and ignore_build_cache: # NOTE: we are walking our pipeline from the top to the diff --git a/dvc/stage.py b/dvc/stage.py index 3f9a77b90f..6d12e55c55 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -251,7 +251,9 @@ def remove(self): self.remove_outs(ignore_remove=True) os.unlink(self.path) - def reproduce(self, force=False, dry=False, interactive=False): + def reproduce( + self, force=False, dry=False, interactive=False, no_commit=False + ): if not self.changed() and not force: return None @@ -268,7 +270,7 @@ def reproduce(self, force=False, dry=False, interactive=False): logger.info("Reproducing '{stage}'".format(stage=self.relpath)) - self.run(dry=dry) + self.run(dry=dry, no_commit=no_commit) logger.debug("'{stage}' was reproduced".format(stage=self.relpath)) diff --git a/tests/test_repro.py b/tests/test_repro.py index 1748d90d7a..d38b26c517 100644 --- a/tests/test_repro.py +++ b/tests/test_repro.py @@ -1269,3 +1269,11 @@ def test(self): ret = main(["repro", "--all-pipelines"]) self.assertEqual(ret, 0) self.assertEqual(mock_reproduce.call_count, 4) + + +class TestReproNoCommit(TestRepro): + def test(self): + shutil.rmtree(self.dvc.cache.local.cache_dir) + ret = main(["repro", self.file1_stage, "--no-commit"]) + self.assertEqual(ret, 0) + self.assertEqual(len(os.listdir(self.dvc.cache.local.cache_dir)), 0) From 26c9fc1ba450eaee3b7db20fb3c60e6e6d84537c Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Thu, 7 Feb 2019 04:54:46 +0200 Subject: [PATCH 6/7] status: improve message verbosity Signed-off-by: Ruslan Kuprieiev --- dvc/command/status.py | 23 +++++++++++++++++------ dvc/output/base.py | 33 +++++++++++++++++++++------------ dvc/stage.py | 25 ++++++++++++++++--------- 3 files changed, 54 insertions(+), 27 deletions(-) diff --git a/dvc/command/status.py b/dvc/command/status.py index 75bc4f2e45..1e38ed0d82 100644 --- a/dvc/command/status.py +++ b/dvc/command/status.py @@ -2,10 +2,11 @@ import dvc.logger as logger from dvc.command.data_sync import CmdDataBase +from dvc.utils.compat import str class CmdDataStatus(CmdDataBase): - STATUS_LEN = 10 + STATUS_LEN = 20 STATUS_INDENT = "\t" UP_TO_DATE_MSG = "Pipeline is up to date. Nothing to reproduce." @@ -17,13 +18,23 @@ def _normalize(self, s): def _show(self, status, indent=0): ind = indent * self.STATUS_INDENT + if isinstance(status, str): + logger.info("{}{}".format(ind, status)) + return + + if isinstance(status, list): + for entry in status: + self._show(entry, indent) + return + + assert isinstance(status, dict) + for key, value in status.items(): - if isinstance(value, dict): - logger.info("{}{}".format(ind, key)) + if isinstance(value, str): + logger.info("{}{}{}".format(ind, self._normalize(value), key)) + elif value: + logger.info("{}{}:".format(ind, key)) self._show(value, indent + 1) - else: - msg = "{}{}{}".format(ind, self._normalize(value), key) - logger.info(msg) def do_run(self, target=None): indent = 1 if self.args.cloud else 0 diff --git a/dvc/output/base.py b/dvc/output/base.py index e9cfd500ae..9fb8bb1ca2 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -111,15 +111,12 @@ def checksum(self): def exists(self): return self.remote.exists(self.path_info) - def changed(self): - if not self.exists: - return True - - if not self.use_cache: - return self.info != self.remote.save_info(self.path_info) - - return getattr(self.repo.cache, self.scheme).changed( - self.path_info, self.info + def changed_checksum(self): + return ( + self.checksum + != self.remote.save_info(self.path_info)[ + self.remote.PARAM_CHECKSUM + ] ) def changed_cache(self): @@ -131,11 +128,23 @@ def changed_cache(self): return cache.changed_cache(self.checksum) def status(self): - if self.changed(): - # FIXME better msgs - return {str(self): "changed"} + if self.checksum and self.use_cache and self.changed_cache(): + return {str(self): "not in cache"} + + if not self.exists: + return {str(self): "deleted"} + + if self.changed_checksum(): + return {str(self): "modified"} + + if not self.checksum: + return {str(self): "new"} + return {} + def changed(self): + return bool(self.status()) + def save(self): self.info = self.remote.save_info(self.path_info) diff --git a/dvc/stage.py b/dvc/stage.py index 6d12e55c55..187201f458 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -649,26 +649,33 @@ def checkout(self, force=False): out.checkout(force=force) @staticmethod - def _status(entries, name): + def _status(entries): ret = {} for entry in entries: ret.update(entry.status()) - if ret: - return {name: ret} - - return {} + return ret def status(self): - ret = {} + ret = [] if not self.locked: - ret.update(self._status(self.deps, "deps")) + deps_status = self._status(self.deps) + if deps_status: + ret.append({"changed deps": deps_status}) - ret.update(self._status(self.outs, "outs")) + outs_status = self._status(self.outs) + if outs_status: + ret.append({"changed outs": outs_status}) - if ret or self.changed_md5() or self.is_callback: + if self.changed_md5(): + ret.append("changed checksum") + + if self.is_callback: + ret.append("always changed") + + if ret: return {self.relpath: ret} return {} From 01a120f9fc7266f37728c6114e8f706f1cf8b80f Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Sun, 10 Feb 2019 05:46:13 +0200 Subject: [PATCH 7/7] dvc: introduce `dvc commit` Signed-off-by: Ruslan Kuprieiev --- dvc/cli.py | 2 + dvc/command/commit.py | 63 +++++++++++++++++++++++++ dvc/repo/__init__.py | 1 + dvc/repo/commit.py | 11 +++++ dvc/stage.py | 33 +++++++++++++ tests/test_add.py | 7 ++- tests/test_commit.py | 91 ++++++++++++++++++++++++++++++++++++ tests/test_run.py | 9 +++- tests/unit/command/commit.py | 0 9 files changed, 214 insertions(+), 3 deletions(-) create mode 100644 dvc/command/commit.py create mode 100644 dvc/repo/commit.py create mode 100644 tests/test_commit.py create mode 100644 tests/unit/command/commit.py diff --git a/dvc/cli.py b/dvc/cli.py index a82d3a8545..3a0379f2a4 100644 --- a/dvc/cli.py +++ b/dvc/cli.py @@ -29,6 +29,7 @@ import dvc.command.lock as lock import dvc.command.pipeline as pipeline import dvc.command.daemon as daemon +import dvc.command.commit as commit from dvc.exceptions import DvcParserError from dvc import VERSION @@ -56,6 +57,7 @@ lock, pipeline, daemon, + commit, ] diff --git a/dvc/command/commit.py b/dvc/command/commit.py new file mode 100644 index 0000000000..f45763e3ba --- /dev/null +++ b/dvc/command/commit.py @@ -0,0 +1,63 @@ +from __future__ import unicode_literals + +import dvc.logger as logger +from dvc.exceptions import DvcException +from dvc.command.base import CmdBase + + +class CmdCommit(CmdBase): + def run(self): + if not self.args.targets: + self.args.targets = [None] + + for target in self.args.targets: + try: + self.repo.commit( + target, + with_deps=self.args.with_deps, + recursive=self.args.recursive, + force=self.args.force, + ) + except DvcException: + logger.error( + "failed to commit{}".format( + (" " + target) if target else "" + ) + ) + return 1 + return 0 + + +def add_parser(subparsers, parent_parser): + COMMIT_HELP = "Record changes to the repository." + commit_parser = subparsers.add_parser( + "commit", + parents=[parent_parser], + description=COMMIT_HELP, + help=COMMIT_HELP, + ) + commit_parser.add_argument( + "-f", + "--force", + action="store_true", + default=False, + help="Commit even if checksums for dependencies/outputs changed.", + ) + commit_parser.add_argument( + "-d", + "--with-deps", + action="store_true", + default=False, + help="Commit all dependencies of the specified target.", + ) + commit_parser.add_argument( + "-R", + "--recursive", + action="store_true", + default=False, + help="Commit cache for subdirectories of the specified directory.", + ) + commit_parser.add_argument( + "targets", nargs="*", default=None, help="DVC files." + ) + commit_parser.set_defaults(func=CmdCommit) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 7b7a742915..889abc7684 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -28,6 +28,7 @@ class Repo(object): from dvc.repo.pull import pull from dvc.repo.status import status from dvc.repo.gc import gc + from dvc.repo.commit import commit def __init__(self, root_dir=None): from dvc.config import Config diff --git a/dvc/repo/commit.py b/dvc/repo/commit.py new file mode 100644 index 0000000000..ae4b9007cf --- /dev/null +++ b/dvc/repo/commit.py @@ -0,0 +1,11 @@ +def commit(self, target, with_deps=False, recursive=False, force=False): + if target and not recursive: + stages = self.collect(target, with_deps=with_deps) + else: + stages = self.active_stages(target) + + with self.state: + for stage in stages: + stage.check_can_commit(force=force) + stage.commit() + stage.dump() diff --git a/dvc/stage.py b/dvc/stage.py index 187201f458..b3c51d3d28 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -67,6 +67,10 @@ def __init__(self, cwd): super(StageBadCwdError, self).__init__(msg.format(cwd)) +class StageCommitError(DvcException): + pass + + class MissingDep(DvcException): def __init__(self, deps): assert len(deps) > 0 @@ -528,6 +532,35 @@ def save(self): self.md5 = self._compute_md5() + @staticmethod + def _changed_entries(entries): + ret = [] + for entry in entries: + if entry.checksum and entry.changed_checksum(): + ret.append(entry.rel_path) + return ret + + def check_can_commit(self, force): + changed_deps = self._changed_entries(self.deps) + changed_outs = self._changed_entries(self.outs) + + if changed_deps or changed_outs or self.changed_md5(): + msg = ( + "dependencies {}".format(changed_deps) if changed_deps else "" + ) + msg += " and " if (changed_deps and changed_outs) else "" + msg += "outputs {}".format(changed_outs) if changed_outs else "" + msg += "md5" if not (changed_deps or changed_outs) else "" + msg += " of '{}' changed. Are you sure you commit it?".format( + self.relpath + ) + if not force and not prompt.confirm(msg): + raise StageCommitError( + "unable to commit changed '{}'. Use `-f|--force` to " + "force.`".format(self.relpath) + ) + self.save() + def commit(self): for out in self.outs: out.commit() diff --git a/tests/test_add.py b/tests/test_add.py index bceb84aa65..a835fce03d 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -266,9 +266,14 @@ def test(self): self.assertEqual(file_md5_counter.mock.call_count, 3) -class TestAddNoCommit(TestDvc): +class TestAddCommit(TestDvc): def test(self): ret = main(["add", self.FOO, "--no-commit"]) self.assertEqual(ret, 0) self.assertTrue(os.path.isfile(self.FOO)) self.assertEqual(len(os.listdir(self.dvc.cache.local.cache_dir)), 0) + + ret = main(["commit", self.FOO + ".dvc"]) + self.assertEqual(ret, 0) + self.assertTrue(os.path.isfile(self.FOO)) + self.assertEqual(len(os.listdir(self.dvc.cache.local.cache_dir)), 1) diff --git a/tests/test_commit.py b/tests/test_commit.py new file mode 100644 index 0000000000..68581225a4 --- /dev/null +++ b/tests/test_commit.py @@ -0,0 +1,91 @@ +import yaml +import shutil + +from tests.basic_env import TestDvc +from dvc.stage import StageCommitError + + +class TestCommitRecursive(TestDvc): + def test(self): + stages = self.dvc.add(self.DATA_DIR, recursive=True, no_commit=True) + self.assertEqual(len(stages), 2) + + self.assertNotEqual(self.dvc.status(), {}) + + self.dvc.commit(self.DATA_DIR, recursive=True) + + self.assertEqual(self.dvc.status(), {}) + + +class TestCommitForce(TestDvc): + def test(self): + stages = self.dvc.add(self.FOO, no_commit=True) + self.assertEqual(len(stages), 1) + stage = stages[0] + + with self.dvc.state: + self.assertTrue(stage.outs[0].changed_cache()) + + with open(self.FOO, "a") as fobj: + fobj.write(self.FOO_CONTENTS) + + with self.dvc.state: + self.assertTrue(stage.outs[0].changed_cache()) + + with self.assertRaises(StageCommitError): + self.dvc.commit(stage.path) + + with self.dvc.state: + self.assertTrue(stage.outs[0].changed_cache()) + + self.dvc.commit(stage.path, force=True) + + self.assertEqual(self.dvc.status(stage.path), {}) + + +class TestCommitWithDeps(TestDvc): + def test(self): + stages = self.dvc.add(self.FOO, no_commit=True) + self.assertEqual(len(stages), 1) + foo_stage = stages[0] + self.assertTrue(foo_stage is not None) + self.assertEqual(len(foo_stage.outs), 1) + + fname = "file" + stage = self.dvc.run( + cmd="python {} {} {}".format(self.CODE, self.FOO, fname), + outs=[fname], + deps=[self.FOO, self.CODE], + no_commit=True, + ) + self.assertTrue(stage is not None) + self.assertEqual(len(stage.outs), 1) + + with self.dvc.state: + self.assertTrue(foo_stage.outs[0].changed_cache()) + self.assertTrue(stage.outs[0].changed_cache()) + + self.dvc.commit(stage.path, with_deps=True) + with self.dvc.state: + self.assertFalse(foo_stage.outs[0].changed_cache()) + self.assertFalse(stage.outs[0].changed_cache()) + + +class TestCommitChangedMd5(TestDvc): + def test(self): + stages = self.dvc.add(self.FOO, no_commit=True) + self.assertEqual(len(stages), 1) + stage = stages[0] + + st = None + with open(stage.path, "r") as fobj: + st = yaml.safe_load(fobj) + + st["md5"] = "1111111111" + with open(stage.path, "w") as fobj: + yaml.safe_dump(st, fobj, default_flow_style=False) + + with self.assertRaises(StageCommitError): + self.dvc.commit(stage.path) + + self.dvc.commit(stage.path, force=True) diff --git a/tests/test_run.py b/tests/test_run.py index ed48b9db1e..98eaa1f759 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -630,12 +630,17 @@ def test(self): self._run() -class TestRunNoCommit(TestDvc): +class TestRunCommit(TestDvc): def test(self): fname = "test" ret = main( - ["run", "-o", "foo", "--no-commit", "echo", "test", ">", fname] + ["run", "-o", self.FOO, "--no-commit", "echo", "test", ">", fname] ) self.assertEqual(ret, 0) self.assertTrue(os.path.isfile(fname)) self.assertEqual(len(os.listdir(self.dvc.cache.local.cache_dir)), 0) + + ret = main(["commit", self.FOO + ".dvc"]) + self.assertEqual(ret, 0) + self.assertTrue(os.path.isfile(fname)) + self.assertEqual(len(os.listdir(self.dvc.cache.local.cache_dir)), 1) diff --git a/tests/unit/command/commit.py b/tests/unit/command/commit.py new file mode 100644 index 0000000000..e69de29bb2