diff --git a/dvc/cli/parser.py b/dvc/cli/parser.py index c92c8276c6..2a548b1abd 100644 --- a/dvc/cli/parser.py +++ b/dvc/cli/parser.py @@ -39,6 +39,7 @@ move, params, plots, + purge, queue, remote, remove, @@ -90,6 +91,7 @@ move, params, plots, + purge, queue, remote, remove, diff --git a/dvc/commands/purge.py b/dvc/commands/purge.py new file mode 100644 index 0000000000..2600bcb090 --- /dev/null +++ b/dvc/commands/purge.py @@ -0,0 +1,108 @@ +import os + +from dvc.cli import formatter +from dvc.cli.command import CmdBase +from dvc.cli.utils import append_doc_link +from dvc.log import logger +from dvc.ui import ui + +logger = logger.getChild(__name__) + + +class CmdPurge(CmdBase): + def run(self): + if not self.args.dry_run: + msg = "This will permanently remove local DVC-tracked outputs " + else: + msg = "This will show what local DVC-tracked outputs would be removed " + if self.args.targets: + msg += "for the following targets:\n - " + "\n - ".join( + [os.path.abspath(t) for t in self.args.targets] + ) + else: + msg += "for the entire workspace." + + if self.args.recursive: + msg += "\nRecursive purge is enabled." + + if self.args.dry_run: + msg += "\n(dry-run: showing what would be removed, no changes)." + + logger.warning(msg) + + if ( + not self.args.force + and not self.args.dry_run + and not self.args.yes + and not ui.confirm("Are you sure you want to proceed?") + ): + return 1 + + # Call repo API + self.repo.purge( + targets=self.args.targets, + recursive=self.args.recursive, + force=self.args.force, + dry_run=self.args.dry_run, + unused_cache=self.args.unused_cache, + ) + return 0 + + +def add_parser(subparsers, parent_parser): + PURGE_HELP = "Remove tracked outputs and their cache." + PURGE_DESCRIPTION = ( + "Removes cache objects and workspace copies of DVC-tracked outputs.\n" + "Metadata remains intact, and non-DVC files are untouched.\n\n" + "`--unused-cache` mode will clear the cache of any files not checked\n" + "out in the current workspace." 
+ ) + purge_parser = subparsers.add_parser( + "purge", + parents=[parent_parser], + description=append_doc_link(PURGE_DESCRIPTION, "purge"), + help=PURGE_HELP, + formatter_class=formatter.RawDescriptionHelpFormatter, + ) + + purge_parser.add_argument( + "targets", + nargs="*", + help="Optional list of files/directories to purge (default: entire repo).", + ) + purge_parser.add_argument( + "-r", + "--recursive", + action="store_true", + default=False, + help="Recursively purge directories.", + ) + purge_parser.add_argument( + "--dry-run", + dest="dry_run", + action="store_true", + default=False, + help="Only print what would be removed without actually removing.", + ) + purge_parser.add_argument( + "-f", + "--force", + action="store_true", + default=False, + help="Force purge, bypassing safety checks and prompts.", + ) + purge_parser.add_argument( + "-y", + "--yes", + action="store_true", + default=False, + help="Do not prompt for confirmation (respects safety checks).", + ) + purge_parser.add_argument( + "--unused-cache", + action="store_true", + default=False, + help="Remove cache objects not currently checked out in the workspace.", + ) + + purge_parser.set_defaults(func=CmdPurge) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index a7c3d4d7ca..6ddae775e2 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -83,6 +83,7 @@ class Repo: from dvc.repo.ls_url import ls_url as _ls_url # type: ignore[misc] from dvc.repo.move import move # type: ignore[misc] from dvc.repo.pull import pull # type: ignore[misc] + from dvc.repo.purge import purge # type: ignore[misc] from dvc.repo.push import push # type: ignore[misc] from dvc.repo.remove import remove # type: ignore[misc] from dvc.repo.reproduce import reproduce # type: ignore[misc] diff --git a/dvc/repo/purge.py b/dvc/repo/purge.py new file mode 100644 index 0000000000..de2f88b4c4 --- /dev/null +++ b/dvc/repo/purge.py @@ -0,0 +1,234 @@ +from typing import TYPE_CHECKING, Optional + +from dvc.config 
import NoRemoteError, RemoteNotFoundError +from dvc.exceptions import DvcException +from dvc.log import logger + +from . import locked + +if TYPE_CHECKING: + from dvc.output import Output + from dvc.repo import Repo + +logger = logger.getChild(__name__) + + +class PurgeError(DvcException): + """Raised when purge fails due to safety or internal errors.""" + + +def _flatten_stages_or_outs(items) -> list["Output"]: + """Normalize collect() results into a flat list of Output objects.""" + outs = [] + for item in items: + if isinstance(item, list): + outs.extend(_flatten_stages_or_outs(item)) + elif hasattr(item, "outs"): # Stage + outs.extend(item.outs) + elif hasattr(item, "use_cache"): # Already an Output + outs.append(item) + else: + logger.debug("Skipping non-stage item in collect(): %r", item) + return outs + + +def _check_dirty(outs, force: bool) -> None: + dirty = [o for o in outs if o.use_cache and o.changed()] + if dirty and not force: + raise PurgeError( + "Some tracked outputs have uncommitted changes. " + "Use `--force` to purge anyway.\n - " + + "\n - ".join(str(o) for o in dirty) + ) + + +def _get_remote_odb(repo: "Repo"): + try: + return repo.cloud.get_remote_odb(None) + except (RemoteNotFoundError, NoRemoteError): + return None + + +def _check_remote_backup(repo: "Repo", outs, force: bool) -> None: + remote_odb = _get_remote_odb(repo) + + if not remote_odb: + if not force: + raise PurgeError( + "No default remote configured. " + "Cannot safely purge outputs without verifying remote backup.\n" + "Use `--force` to purge anyway." + ) + logger.warning( + "No default remote configured. Proceeding with purge due to --force. " + "Outputs may be permanently lost." 
+ ) + return + + # remote exists, check objects + not_in_remote = [ + str(o) + for o in outs + if o.use_cache + and o.hash_info + and o.hash_info.value + and not remote_odb.exists(o.hash_info.value) + ] + if not_in_remote and not force: + raise PurgeError( + "Some outputs are not present in the remote cache and would be " + "permanently lost if purged:\n - " + + "\n - ".join(not_in_remote) + + "\nUse `--force` to purge anyway." + ) + if not_in_remote and force: + logger.warning( + "Some outputs are not present in the remote cache and may be " + "permanently lost:\n - %s", + "\n - ".join(not_in_remote), + ) + + +def _remove_outs(outs, dry_run: bool) -> int: + removed = 0 + for out in outs: + if dry_run: + logger.info("[dry-run] Would remove %s", out) + continue + + try: + # remove workspace file + if out.exists: + out.remove(ignore_remove=False) + + # remove cache entry + if out.use_cache and out.hash_info: + cache_path = out.cache.oid_to_path(out.hash_info.value) + if out.cache.fs.exists(cache_path): + out.cache.fs.remove(cache_path, recursive=True) + + removed += 1 + except Exception: + logger.exception("Failed to remove %s", out) + return removed + + +def _compute_checked_out_hashes(repo: "Repo"): + # Collect all stages + items = list(repo.index.stages) + + # Flatten to outs + all_outs = [] + for st in items: + all_outs.extend(st.outs) + + # Keep only outs that actually exist in the workspace + used = set() + for out in all_outs: + if out.use_cache and out.exists and out.hash_info and out.hash_info.value: + used.add(out.hash_info.value) + + return used + + +def _remove_unused_cache(repo: "Repo", dry_run: bool) -> int: + """ + Remove cache objects whose outputs are not currently checked out. + A 'used' object is defined as: workspace file exists AND has a hash. 
+ """ + # Compute hashes for outputs that are currently checked out + used_hashes = _compute_checked_out_hashes(repo) + + removed = 0 + + # Iterate through all local cache ODBs + for _scheme, odb in repo.cache.by_scheme(): + if not odb: + continue + + # Iterate through all cached object IDs + for obj_id in list(odb.all()): + if obj_id in used_hashes: + continue + + cache_path = odb.oid_to_path(obj_id) + + if dry_run: + logger.info("[dry-run] Would remove unused cache %s", cache_path) + else: + try: + odb.fs.remove(cache_path, recursive=True) + removed += 1 + except Exception: + logger.exception("Failed to remove unused cache %s", cache_path) + + return removed + + +@locked +def purge( + self: "Repo", + targets: Optional[list[str]] = None, + recursive: bool = False, + force: bool = False, + dry_run: bool = False, + unused_cache: bool = False, +) -> int: + """ + Purge removes local copies of DVC-tracked outputs and their cache. + + - Collects outs from .dvc files and dvc.yaml. + - Ensures safety (no dirty outs unless --force). + - Ensures outputs are backed up to remote (unless --force). + - Removes both workspace copies and cache objects. + - Metadata remains intact. + """ + from dvc.repo.collect import collect + from dvc.stage.exceptions import StageFileDoesNotExistError + + try: + items = ( + collect(self, targets=targets, recursive=recursive) + if targets + else list(self.index.stages) + ) + except StageFileDoesNotExistError as e: + raise PurgeError(str(e)) from e + + outs = _flatten_stages_or_outs(items) + if not outs: + logger.info("No DVC-tracked outputs found to purge.") + return 0 + + # Determine whether we should remove outs. + # If unused_cache mode, don't remove anything. + remove_outs = not unused_cache + if unused_cache and targets is not None: + logger.warning( + "`--unused-cache` mode should be used exclusively," + " other args have been provided but will be ignored." 
+ ) + + removed = 0 + if remove_outs: + # Run safety checks + _check_dirty(outs, force) + _check_remote_backup(self, outs, force) + + # Remove outs + removed = _remove_outs(outs, dry_run) + + # Remove unused cache if requested + if unused_cache: + logger.info("Removing unused cache objects...") + unused_removed = _remove_unused_cache(self, dry_run=dry_run) + if unused_removed: + logger.info("Removed %d unused cache objects.", unused_removed) + else: + logger.info("No unused cache objects to remove.") + + if removed: + logger.info("Removed %d outputs (workspace + cache).", removed) + else: + logger.info("Nothing to purge.") + return 0 diff --git a/tests/func/test_purge.py b/tests/func/test_purge.py new file mode 100644 index 0000000000..195c33fd8e --- /dev/null +++ b/tests/func/test_purge.py @@ -0,0 +1,246 @@ +from pathlib import Path + +import pytest + +from dvc.cli import main +from dvc.repo.purge import PurgeError + + +def test_purge_no_remote_configured_errors(tmp_dir, dvc): + tmp_dir.dvc_gen("foo", "foo") + with pytest.raises(PurgeError): + dvc.purge() + + +def test_purge_no_remote_configured_with_force_warns(tmp_dir, dvc, caplog): + tmp_dir.dvc_gen("foo", "foo") + caplog.clear() + dvc.purge(force=True) + assert ( + "No default remote configured. 
Proceeding with purge due to --force" + in caplog.text + ) + + +def test_purge_api_removes_file_and_cache(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) + (stage,) = tmp_dir.dvc_gen("foo", "foo") + assert (tmp_dir / "foo").exists() + assert Path(stage.outs[0].cache_path).exists() + + dvc.push("foo") # ensure remote has backup + + dvc.purge() + + # workspace file gone, cache gone, metadata remains + assert not (tmp_dir / "foo").exists() + assert not Path(stage.outs[0].cache_path).exists() + assert (tmp_dir / "foo.dvc").exists() + + +def test_purge_cli_removes_file_and_cache(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) + (stage,) = tmp_dir.dvc_gen("bar", "bar") + assert (tmp_dir / "bar").exists() + assert Path(stage.outs[0].cache_path).exists() + + # force will skip check that remote has backup + assert main(["purge", "--force"]) == 0 + + assert not (tmp_dir / "bar").exists() + assert not Path(stage.outs[0].cache_path).exists() + assert (tmp_dir / "bar.dvc").exists() + + +def test_purge_targets_only(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) + tmp_dir.dvc_gen({"dir": {"a.txt": "A", "b.txt": "B"}}) + assert (tmp_dir / "dir" / "a.txt").exists() + assert (tmp_dir / "dir" / "b.txt").exists() + + dvc.purge(targets=[str(tmp_dir / "dir")], force=True) + + assert not (tmp_dir / "dir").exists() + assert (tmp_dir / "dir.dvc").exists() + + +def test_purge_recursive(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) + tmp_dir.dvc_gen({"nested": {"sub": {"file.txt": "content"}}}) + assert (tmp_dir / "nested" / "sub" / "file.txt").exists() + + dvc.purge(targets=["nested"], recursive=True, force=True) + assert not (tmp_dir / "nested" / "sub" / "file.txt").exists() + + +def test_purge_individual_targets(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) + + # Generate two *separate* tracked files + (stage_a,) = tmp_dir.dvc_gen("a.txt", "A") + (stage_b,) = tmp_dir.dvc_gen("b.txt", "B") + + 
assert (tmp_dir / "a.txt").exists() + assert (tmp_dir / "b.txt").exists() + assert Path(stage_a.outs[0].cache_path).exists() + assert Path(stage_b.outs[0].cache_path).exists() + + # Push both so purge passes remote safety + dvc.push() + + # Purge only a.txt + dvc.purge(targets=[str(tmp_dir / "a.txt")]) + + # a.txt and its cache should be gone, but metadata intact + assert not (tmp_dir / "a.txt").exists() + assert not Path(stage_a.outs[0].cache_path).exists() + assert (tmp_dir / "a.txt.dvc").exists() + + # b.txt and its cache should still exist + assert (tmp_dir / "b.txt").exists() + assert Path(stage_b.outs[0].cache_path).exists() + assert (tmp_dir / "b.txt.dvc").exists() + + +def test_purge_dry_run_does_not_delete(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) + (stage,) = tmp_dir.dvc_gen("baz", "baz") + cache_path = Path(stage.outs[0].cache_path) + + dvc.purge(dry_run=True, force=True) + + assert (tmp_dir / "baz").exists() + assert cache_path.exists() + + +def test_purge_dirty_file_requires_force(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) + tmp_dir.dvc_gen("foo", "foo") + (tmp_dir / "foo").write_text("modified") + + with pytest.raises(PurgeError): + dvc.purge() + + dvc.purge(force=True) + assert not (tmp_dir / "foo").exists() + + +def test_purge_missing_remote_object_requires_force(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) + tmp_dir.dvc_gen("foo", "foo") + dvc.push("foo") + + remote = dvc.cloud.get_remote_odb("backup") + remote.fs.remove(remote.path, recursive=True) # wipe remote + + with pytest.raises(PurgeError): + dvc.purge() + + +def test_purge_missing_remote_object_with_force_warns( + tmp_dir, dvc, make_remote, caplog +): + make_remote("backup", default=True) + tmp_dir.dvc_gen("foo", "foo") + dvc.push("foo") + + remote = dvc.cloud.get_remote_odb("backup") + remote.fs.remove(remote.path, recursive=True) # wipe remote + + caplog.clear() + dvc.purge(force=True) + assert "Some outputs are not 
present in the remote cache" in caplog.text + + +def test_purge_unused_cache(tmp_dir, dvc, make_remote): + """Basic behavior for `unused-cache` flag. + + Removes cache for files not checked out""" + make_remote("backup", default=True) + + # tracked & checked out + (stage_a,) = tmp_dir.dvc_gen("a.txt", "A") + + # tracked but workspace file removed + (stage_b,) = tmp_dir.dvc_gen("b.txt", "B") + (tmp_dir / "b.txt").unlink() + + dvc.push() # ensure remote OK so purge doesn't fail + + cache_a = Path(stage_a.outs[0].cache_path) + cache_b = Path(stage_b.outs[0].cache_path) + + assert cache_a.exists() + assert cache_b.exists() + + # Remove unused cache only + dvc.purge(unused_cache=True, force=False) + + # a.txt exists in workspace -> its cache kept + assert cache_a.exists() + + # b.txt removed -> its cache purged + assert not cache_b.exists() + + +def test_purge_unused_cache_does_not_delete_workspace_files(tmp_dir, dvc, make_remote): + """Only cache files should be removed, not workspace files""" + make_remote("backup", default=True) + tmp_dir.dvc_gen("file.txt", "X") + dvc.push() + + # Running --unused-cache alone must not delete the file itself + dvc.purge(unused_cache=True, force=True) + + assert (tmp_dir / "file.txt").exists() + assert (tmp_dir / "file.txt.dvc").exists() + + +def test_purge_unused_cache_dry_run(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) + (stage,) = tmp_dir.dvc_gen("foo", "content") + dvc.push() + + # Delete workspace file -> cache is now unused + (tmp_dir / "foo").unlink() + cache_path = Path(stage.outs[0].cache_path) + assert cache_path.exists() + + dvc.purge(unused_cache=True, dry_run=True, force=True) + + # Dry run must NOT delete anything + assert cache_path.exists() + + +def test_purge_and_unused_cache_together(tmp_dir, dvc, make_remote, caplog): + make_remote("backup", default=True) + + (stage_a,) = tmp_dir.dvc_gen("a.txt", "A") + (stage_b,) = tmp_dir.dvc_gen("b.txt", "B") + dvc.push() + + cache_a = 
Path(stage_a.outs[0].cache_path)
+    cache_b = Path(stage_b.outs[0].cache_path)
+
+    # Purge only a.txt (should be ignored and raise warning)
+    caplog.clear()
+    dvc.purge(targets=[str(tmp_dir / "a.txt")], unused_cache=True, force=True)
+
+    # unused-cache is exclusive; targets ignored
+    assert "other args have been provided but will be ignored" in caplog.text
+
+    # a.txt is NOT removed by purge
+    assert (tmp_dir / "a.txt").exists()
+    assert cache_a.exists()
+
+    # b.txt still exists in workspace -> cache kept
+    assert (tmp_dir / "b.txt").exists()
+    assert cache_b.exists()
+
+
+def test_unused_cache_ignores_dirty_outputs(tmp_dir, dvc, make_remote):
+    """Unused-cache does not concern itself with dirty outputs."""
+    make_remote("backup", default=True)
+    tmp_dir.dvc_gen("foo", "foo")
+    (tmp_dir / "foo").write_text("modified")  # dirty
+    dvc.purge(unused_cache=True)
diff --git a/tests/unit/command/test_purge.py b/tests/unit/command/test_purge.py
new file mode 100644
index 0000000000..e160973bbd
--- /dev/null
+++ b/tests/unit/command/test_purge.py
@@ -0,0 +1,103 @@
+import pytest
+
+from dvc.cli import parse_args
+from dvc.commands.purge import CmdPurge
+from dvc.repo.purge import PurgeError
+
+
+def test_purge_args_and_call(dvc, scm, mocker):
+    cli_args = parse_args(
+        [
+            "purge",
+            "foo",
+            "bar",
+            "--recursive",
+            "--dry-run",
+            "--force",
+        ]
+    )
+    assert cli_args.func == CmdPurge
+
+    cmd = cli_args.func(cli_args)
+    mocker.patch("dvc.ui.ui.confirm", return_value=True)
+    m = mocker.patch("dvc.repo.Repo.purge", return_value=0)
+
+    assert cmd.run() == 0
+
+    m.assert_called_once_with(
+        targets=["foo", "bar"],
+        recursive=True,
+        force=True,
+        dry_run=True,
+        unused_cache=False,
+    )
+
+
+def test_purge_defaults(dvc, mocker):
+    cli_args = parse_args(["purge"])
+    cmd = cli_args.func(cli_args)
+
+    mocker.patch("dvc.ui.ui.confirm", return_value=True)
+    m = mocker.patch("dvc.repo.Repo.purge", return_value=0)
+
+    assert cmd.run() == 0
+
+    m.assert_called_once_with(
+        targets=[],
+        recursive=False,
+        force=False,
+        dry_run=False,
+        unused_cache=False,
+    )
+
+
+def test_purge_safety_error(dvc, mocker):
+    cli_args = parse_args(["purge"])
+    cmd = cli_args.func(cli_args)
+
+    mocker.patch("dvc.ui.ui.confirm", return_value=True)
+    m = mocker.patch("dvc.repo.Repo.purge", side_effect=PurgeError("dirty outs"))
+
+    with pytest.raises(PurgeError):
+        cmd.run()
+
+    m.assert_called_once()
+
+
+def test_purge_yes_skips_confirm(dvc, mocker):
+    cli_args = parse_args(["purge", "-y"])
+    cmd = cli_args.func(cli_args)
+
+    confirm = mocker.patch("dvc.ui.ui.confirm", return_value=True)
+    m = mocker.patch("dvc.repo.Repo.purge", return_value=0)
+
+    assert cmd.run() == 0
+
+    # -y should skip confirmation
+    confirm.assert_not_called()
+    m.assert_called_once()
+
+
+def test_purge_unused_cache_arg(dvc, mocker):
+    cli_args = parse_args(
+        [
+            "purge",
+            "--unused-cache",
+            "--force",
+        ]
+    )
+
+    cmd = cli_args.func(cli_args)
+
+    mocker.patch("dvc.ui.ui.confirm", return_value=True)
+    m = mocker.patch("dvc.repo.Repo.purge", return_value=0)
+
+    assert cmd.run() == 0
+
+    m.assert_called_once_with(
+        targets=[],
+        recursive=False,
+        force=True,
+        dry_run=False,
+        unused_cache=True,
+    )