diff --git a/dvc/ignore.py b/dvc/ignore.py
new file mode 100644
index 0000000000..d989d9a8e6
--- /dev/null
+++ b/dvc/ignore.py
@@ -0,0 +1,149 @@
+"""Support for `.dvcignore` files and built-in ignores when walking trees."""
+
+import os
+
+from dulwich.ignore import match_pattern, read_ignore_patterns
+from dvc.utils.compat import cast_bytes
+from dvc.utils.fs import get_parent_dirs_up_to
+
+
+class DvcIgnoreFileHandler(object):
+    """Reads ignore patterns through a tree object."""
+
+    def __init__(self, tree):
+        self.tree = tree
+
+    def read_patterns(self, path):
+        """Return `(negate_patterns, patterns)` parsed from file `path`."""
+        with self.tree.open(path, binary=True) as stream:
+            return self._read_patterns(stream)
+
+    def get_repo_root(self):
+        return self.tree.tree_root
+
+    def _read_patterns(self, binary_stream):
+        # Patterns starting with "!" are negations; keep them separately.
+        negate_patterns = []
+        patterns = []
+        for pattern in read_ignore_patterns(binary_stream):
+            if pattern.lstrip().startswith(b"!"):
+                negate_patterns.append(pattern)
+            else:
+                patterns.append(pattern)
+
+        return negate_patterns, patterns
+
+
+class DvcIgnore(object):
+    """Base ignore: called as `ignore(root, dirs, files) -> (dirs, files)`."""
+
+    DVCIGNORE_FILE = ".dvcignore"
+
+    def __call__(self, root, dirs, files):
+        raise NotImplementedError
+
+
+class DvcIgnoreFromFile(DvcIgnore):
+    """Ignore driven by the patterns of a single ignore file."""
+
+    def __init__(self, ignore_file_path, ignore_handler):
+        self.ignore_file_path = ignore_file_path
+        self.dirname = os.path.normpath(os.path.dirname(ignore_file_path))
+        self.negate_patterns, self.patterns = ignore_handler.read_patterns(
+            ignore_file_path
+        )
+
+    def __call__(self, root, dirs, files):
+        files = [f for f in files if not self.matches(root, f)]
+        dirs = [d for d in dirs if not self.matches(root, d)]
+
+        return dirs, files
+
+    def get_match(self, abs_path):
+        """Return `(abs_path, pattern, ignore_file_path)` for the first
+        pattern ignoring `abs_path`, or None if the path is not ignored.
+        """
+        # Patterns are matched against "/"-separated bytes relative to the
+        # directory containing the ignore file.
+        rel_path = os.path.relpath(abs_path, self.dirname)
+        if os.name == "nt":
+            rel_path = rel_path.replace("\\", "/")
+        rel_path = cast_bytes(rel_path, "utf-8")
+
+        for pattern in self.patterns:
+            if match_pattern(
+                rel_path, pattern
+            ) and self._no_negate_pattern_matches(rel_path):
+                return (abs_path, pattern, self.ignore_file_path)
+        return None
+
+    def matches(self, dirname, basename):
+        return self.get_match(os.path.join(dirname, basename)) is not None
+
+    def _no_negate_pattern_matches(self, path):
+        return not any(match_pattern(path, p) for p in self.negate_patterns)
+
+    def __hash__(self):
+        return hash(self.ignore_file_path)
+
+
+class DvcIgnoreConstant(DvcIgnore):
+    """Base for ignores that drop entries named exactly `basename`."""
+
+    def __init__(self, basename):
+        self.basename = basename
+
+
+class DvcIgnoreDir(DvcIgnoreConstant):
+    def __call__(self, root, dirs, files):
+        dirs = [d for d in dirs if d != self.basename]
+
+        return dirs, files
+
+
+class DvcIgnoreFile(DvcIgnoreConstant):
+    def __call__(self, root, dirs, files):
+        files = [f for f in files if f != self.basename]
+
+        return dirs, files
+
+
+class DvcIgnoreFilter(object):
+    """Chain of ignores applied to each `(root, dirs, files)` of a walk."""
+
+    def __init__(self, wdir, ignore_file_handler=None):
+        self.ignores = [
+            DvcIgnoreDir(".git"),
+            DvcIgnoreDir(".hg"),
+            DvcIgnoreDir(".dvc"),
+            DvcIgnoreFile(".dvcignore"),
+        ]
+
+        self.ignore_file_handler = ignore_file_handler
+        self._process_ignores_in_parent_dirs(wdir)
+
+    def _process_ignores_in_parent_dirs(self, wdir):
+        if self.ignore_file_handler:
+            wdir = os.path.normpath(os.path.abspath(wdir))
+            ignore_search_end_dir = self.ignore_file_handler.get_repo_root()
+            parent_dirs = get_parent_dirs_up_to(wdir, ignore_search_end_dir)
+            for d in parent_dirs:
+                self.update(d)
+
+    def update(self, wdir):
+        """Register the ignore file located in `wdir`, if there is one."""
+        ignore_file_path = os.path.join(wdir, DvcIgnore.DVCIGNORE_FILE)
+        if os.path.exists(ignore_file_path):
+            file_ignore = DvcIgnoreFromFile(
+                ignore_file_path, ignore_handler=self.ignore_file_handler
+            )
+            self.ignores.append(file_ignore)
+
+    def __call__(self, root, dirs, files):
+        if self.ignore_file_handler:
+            self.update(root)
+
+        for ignore in self.ignores:
+            dirs, files = ignore(root, dirs, files)
+
+        return dirs, files
diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py
index 8a5bc7bc7f..65c204c6d4 100644
--- a/dvc/repo/__init__.py
+++ b/dvc/repo/__init__.py
@@ -11,7 +11,7 @@
     OutputNotFoundError,
     TargetNotDirectoryError,
 )
-
+from dvc.ignore import DvcIgnoreFileHandler
 
 logger = logging.getLogger(__name__)
 
@@ -58,7 +58,7 @@ def __init__(self, root_dir=None):
 
         self.config = Config(self.dvc_dir)
 
-        self.tree = WorkingTree()
+        self.tree = WorkingTree(self.root_dir)
 
         self.scm = SCM(self.root_dir, repo=self)
         self.lock = Lock(self.dvc_dir)
@@ -390,7 +390,11 @@ def stages(self, from_directory=None, check_dag=True):
 
         stages = []
         outs = []
-        for root, dirs, files in self.tree.walk(from_directory):
+
+        ignore_file_handler = DvcIgnoreFileHandler(self.tree)
+        for root, dirs, files in self.tree.walk(
+            from_directory, ignore_file_handler=ignore_file_handler
+        ):
             for fname in files:
                 path = os.path.join(root, fname)
                 if not Stage.is_valid_filename(path):
diff --git a/dvc/repo/brancher.py b/dvc/repo/brancher.py
index bc4f3e38a8..e84fcaf708 100644
--- a/dvc/repo/brancher.py
+++ b/dvc/repo/brancher.py
@@ -27,7 +27,7 @@ def brancher(  # noqa: E302
     if self.scm.is_dirty():
         from dvc.scm.tree import WorkingTree
 
-        self.tree = WorkingTree()
+        self.tree = WorkingTree(self.root_dir)
         yield "Working Tree"
 
     if all_branches:
diff --git a/dvc/scm/git/tree.py b/dvc/scm/git/tree.py
index d6aff9151c..3cb21834be 100644
--- a/dvc/scm/git/tree.py
+++ b/dvc/scm/git/tree.py
@@ -1,7 +1,8 @@
 import errno
 import os
 
-from dvc.utils.compat import StringIO
+from dvc.ignore import DvcIgnoreFilter
+from dvc.utils.compat import StringIO, BytesIO
 from dvc.scm.tree import BaseTree
 
 
@@ -24,7 +25,11 @@ def __init__(self, git, rev):
         self.git = git
         self.rev = rev
 
-    def open(self, path):
+    @property
+    def tree_root(self):
+        return self.git.working_dir
+
+    def open(self, path, binary=False):
 
         relpath = os.path.relpath(path, self.git.working_dir)
 
@@ -39,7 +44,10 @@ def open(self, path):
         # read it immediately, also it needs to be to decoded if we follow
         # the `open()` behavior (since data_stream.read() returns bytes,
         # and `open` with default "r" mode returns str)
-        return StringIO(obj.data_stream.read().decode("utf-8"))
+        data = obj.data_stream.read()
+        if binary:
+            return BytesIO(data)
+        return StringIO(data.decode("utf-8"))
 
     def exists(self, path):
         return self.git_object_by_path(path) is not None
@@ -81,8 +89,13 @@ def git_object_by_path(self, path):
             tree = tree[i]
         return tree
 
-    def _walk(self, tree, topdown=True):
-
+    def _walk(
+        self,
+        tree,
+        topdown=True,
+        ignore_file_handler=None,
+        dvc_ignore_filter=None,
+    ):
         dirs, nondirs = [], []
         for i in tree:
             if i.mode == GIT_MODE_DIR:
@@ -91,14 +104,26 @@ def _walk(self, tree, topdown=True):
                 nondirs.append(i.name)
 
         if topdown:
+            if not dvc_ignore_filter:
+                dvc_ignore_filter = DvcIgnoreFilter(
+                    tree.abspath, ignore_file_handler=ignore_file_handler
+                )
+            dirs, nondirs = dvc_ignore_filter(tree.path, dirs, nondirs)
             yield os.path.normpath(tree.path), dirs, nondirs
+
         for i in dirs:
-            for x in self._walk(tree[i], topdown=True):
+            for x in self._walk(
+                tree[i],
+                topdown=True,
+                ignore_file_handler=ignore_file_handler,
+                dvc_ignore_filter=dvc_ignore_filter,
+            ):
                 yield x
+
         if not topdown:
             yield os.path.normpath(tree.path), dirs, nondirs
 
-    def walk(self, top, topdown=True):
+    def walk(self, top, topdown=True, ignore_file_handler=None):
         """Directory tree generator.
 
         See `os.walk` for the docs. Differences:
diff --git a/dvc/scm/tree.py b/dvc/scm/tree.py
index 3e215ff09b..c411a7076f 100644
--- a/dvc/scm/tree.py
+++ b/dvc/scm/tree.py
@@ -1,12 +1,17 @@
 import os
 
+from dvc.utils import dvc_walk
 from dvc.utils.compat import open
 
 
 class BaseTree(object):
     """Abstract class to represent access to files"""
 
-    def open(self, path):
+    @property
+    def tree_root(self):
+        pass
+
+    def open(self, path, binary=False):
         """Open file and return a stream."""
 
     def exists(self, path):
@@ -18,7 +23,7 @@ def isdir(self, path):
     def isfile(self, path):
         """Test whether a path is a regular file"""
 
-    def walk(self, top, topdown=True):
+    def walk(self, top, topdown=True, ignore_file_handler=None):
         """Directory tree generator.
 
         See `os.walk` for the docs. Differences:
@@ -30,8 +35,19 @@ def walk(self, top, topdown=True):
 class WorkingTree(BaseTree):
     """Proxies the repo file access methods to working tree files"""
 
-    def open(self, path):
+    def __init__(self, repo_root=None):
+        # Look up the cwd per instance: an `os.getcwd()` default argument
+        # is evaluated only once, when this module is imported.
+        self.repo_root = os.getcwd() if repo_root is None else repo_root
+
+    @property
+    def tree_root(self):
+        return self.repo_root
+
+    def open(self, path, binary=False):
         """Open file and return a stream."""
+        if binary:
+            return open(path, "rb")
         return open(path, encoding="utf-8")
 
     def exists(self, path):
@@ -46,7 +62,7 @@ def isfile(self, path):
         """Test whether a path is a regular file"""
         return os.path.isfile(path)
 
-    def walk(self, top, topdown=True):
+    def walk(self, top, topdown=True, ignore_file_handler=None):
         """Directory tree generator.
 
         See `os.walk` for the docs. Differences:
@@ -57,9 +73,10 @@ def walk(self, top, topdown=True):
         def onerror(e):
             raise e
 
-        for root, dirs, files in os.walk(
-            top, topdown=topdown, onerror=onerror
+        for root, dirs, files in dvc_walk(
+            top,
+            topdown=topdown,
+            onerror=onerror,
+            ignore_file_handler=ignore_file_handler,
         ):
-            if topdown:
-                dirs[:] = [i for i in dirs if i not in (".git", ".hg", ".dvc")]
             yield os.path.normpath(root), dirs, files
diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py
index 83faa59e78..5b5f206d8b 100644
--- a/dvc/utils/__init__.py
+++ b/dvc/utils/__init__.py
@@ -257,8 +257,39 @@ def load_stage_file_fobj(fobj, path):
         raise StageFileCorruptedError(path)
 
 
-def walk_files(directory):
-    for root, _, files in os.walk(str(directory)):
+def dvc_walk(
+    top,
+    topdown=True,
+    onerror=None,
+    followlinks=False,
+    ignore_file_handler=None,
+):
+    """
+    Proxy for `os.walk` directory tree generator.
+    Utilizes DvcIgnoreFilter functionality.
+    """
+    ignore_filter = None
+    if topdown:
+        from dvc.ignore import DvcIgnoreFilter
+
+        ignore_filter = DvcIgnoreFilter(
+            top, ignore_file_handler=ignore_file_handler
+        )
+
+    for root, dirs, files in os.walk(
+        top, topdown=topdown, onerror=onerror, followlinks=followlinks
+    ):
+
+        if ignore_filter:
+            dirs[:], files[:] = ignore_filter(root, dirs, files)
+
+        yield root, dirs, files
+
+
+def walk_files(directory, ignore_file_handler=None):
+    for root, _, files in dvc_walk(
+        str(directory), ignore_file_handler=ignore_file_handler
+    ):
         for f in files:
             yield os.path.join(root, f)
 
diff --git a/dvc/utils/compat.py b/dvc/utils/compat.py
index bf0803427d..b32b38e969 100644
--- a/dvc/utils/compat.py
+++ b/dvc/utils/compat.py
@@ -93,6 +93,7 @@ def _makedirs(name, mode=0o777, exist_ok=False):
 if is_py2:
     from urlparse import urlparse, urljoin  # noqa: F401
     from StringIO import StringIO  # noqa: F401
+    from io import BytesIO  # noqa: F401
     from BaseHTTPServer import HTTPServer  # noqa: F401
     from SimpleHTTPServer import SimpleHTTPRequestHandler  # noqa: F401
     import ConfigParser  # noqa: F401
@@ -111,7 +112,7 @@ def _makedirs(name, mode=0o777, exist_ok=False):
 elif is_py3:
     from os import makedirs  # noqa: F401
     from urllib.parse import urlparse, urljoin  # noqa: F401
-    from io import StringIO  # noqa: F401
+    from io import StringIO, BytesIO  # noqa: F401
     from http.server import (  # noqa: F401
         HTTPServer,  # noqa: F401
         SimpleHTTPRequestHandler,  # noqa: F401
diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py
index 7b3c67101d..cecc0a207f 100644
--- a/dvc/utils/fs.py
+++ b/dvc/utils/fs.py
@@ -7,6 +7,7 @@
 
 from dvc.exceptions import DvcException
 from dvc.system import System
+from dvc.utils import dvc_walk
 from dvc.utils.compat import str
 
 
@@ -24,7 +25,7 @@ def get_mtime_and_size(path):
     mtime = os.path.getmtime(path)
 
     if os.path.isdir(path):
-        for root, dirs, files in os.walk(str(path)):
+        for root, dirs, files in dvc_walk(str(path)):
             for name in dirs + files:
                 entry = os.path.join(root, name)
                 try:
@@ -63,3 +64,27 @@ def contains_symlink_up_to(path, base_path):
     if os.path.dirname(path) == path:
         return False
     return contains_symlink_up_to(os.path.dirname(path), base_path)
+
+
+def get_parent_dirs_up_to(wdir, root_dir):
+    """Return `wdir` and each of its parents, up to and including `root_dir`.
+
+    Both paths must be absolute (checked with `assert`). An empty list is
+    returned when `wdir` is not `root_dir` itself or located below it.
+    """
+    assert os.path.isabs(wdir)
+    assert os.path.isabs(root_dir)
+
+    wdir = os.path.normpath(wdir)
+    root_dir = os.path.normpath(root_dir)
+    # Compare whole path components: a plain substring test would accept
+    # e.g. "/foo" for "/bar/foo/x" and then climb past "/" forever.
+    if not (wdir + os.sep).startswith(root_dir.rstrip(os.sep) + os.sep):
+        return []
+
+    dirs = [wdir]
+    while wdir != root_dir:
+        wdir = os.path.dirname(wdir)
+        dirs.append(wdir)
+
+    return dirs
diff --git a/requirements.txt b/requirements.txt
index 374f7baeda..5723655b1c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,3 +28,4 @@ appdirs>=1.4.3
 treelib>=1.5.5
 inflect>=2.1.0
 humanize>=0.5.1
+dulwich>=0.19.11
diff --git a/setup.py b/setup.py
index f394141d28..afe1a6260e 100644
--- a/setup.py
+++ b/setup.py
@@ -58,6 +58,7 @@ def run(self):
     "treelib>=1.5.5",
     "inflect>=2.1.0",
     "humanize>=0.5.1",
+    "dulwich>=0.19.11",
 ]
 
 # Extra dependencies for remote integrations
diff --git a/tests/test_ignore.py b/tests/test_ignore.py
new file mode 100644
index 0000000000..ca90387370
--- /dev/null
+++ b/tests/test_ignore.py
@@ -0,0 +1,68 @@
+import os
+
+from dvc.ignore import DvcIgnoreFromFile, DvcIgnore, DvcIgnoreFileHandler
+from dvc.utils.compat import cast_bytes
+from tests.basic_env import TestDvc
+
+
+class TestDvcIgnore(TestDvc):
+    def setUp(self):
+        super(TestDvcIgnore, self).setUp()
+        self.ignore_file_handler = DvcIgnoreFileHandler(self.dvc.tree)
+
+    def _get_all_paths(self):
+
+        paths = []
+        ignore_file_handler = DvcIgnoreFileHandler(self.dvc.tree)
+        for root, dirs, files in self.dvc.tree.walk(
+            self.dvc.root_dir, ignore_file_handler=ignore_file_handler
+        ):
+            for dname in dirs:
+                paths.append(os.path.join(root, dname))
+
+            for fname in files:
+                paths.append(os.path.join(root, fname))
+
+        return paths
+
+    def test_ignore_comments(self):
+        ignore_file = os.path.join(self.dvc.root_dir, DvcIgnore.DVCIGNORE_FILE)
+        with open(ignore_file, "w") as fobj:
+            fobj.write(os.path.basename(self.DATA))
+            fobj.write(" #this is comment")
+
+        ignore = DvcIgnoreFromFile(ignore_file, self.ignore_file_handler)
+
+        self.assertEqual(1, len(ignore.patterns))
+
+    def test_ignore_in_child_dir(self):
+        ignore_file = os.path.join(self.dvc.root_dir, DvcIgnore.DVCIGNORE_FILE)
+        with open(ignore_file, "w") as fobj:
+            fobj.write("data_dir/data")
+
+        forbidden_path = os.path.join(self.dvc.root_dir, self.DATA)
+        all_paths = self._get_all_paths()
+
+        self.assertNotIn(forbidden_path, all_paths)
+
+    def test_ignore_in_child_dir_unicode(self):
+        ignore_file = os.path.join(self.dvc.root_dir, DvcIgnore.DVCIGNORE_FILE)
+        with open(ignore_file, "wb") as fobj:
+            fobj.write(cast_bytes(self.UNICODE, "utf-8"))
+
+        forbidden_path = os.path.join(self.dvc.root_dir, self.UNICODE)
+        all_paths = self._get_all_paths()
+
+        self.assertNotIn(forbidden_path, all_paths)
+
+    def test_ignore_in_parent_dir(self):
+        ignore_file = os.path.join(self.dvc.root_dir, DvcIgnore.DVCIGNORE_FILE)
+        with open(ignore_file, "w") as fobj:
+            fobj.write("data_dir/data")
+
+        os.chdir(self.DATA_DIR)
+
+        forbidden_path = os.path.join(self.dvc.root_dir, self.DATA)
+        all_paths = self._get_all_paths()
+
+        self.assertNotIn(forbidden_path, all_paths)
diff --git a/tests/test_repo.py b/tests/test_repo.py
index aa33063016..43aced7c4d 100644
--- a/tests/test_repo.py
+++ b/tests/test_repo.py
@@ -1,3 +1,5 @@
+from dvc.main import main
+from dvc.stage import Stage
 from tests.basic_env import TestDvc
 
 from dvc.scm.git import GitTree
@@ -44,3 +46,27 @@ def test(self):
         )
         result = self._check("new_branch", "buzz", False, [["buzz"]])
         self.assertEqual([i.rel_path for i in result[0].deps], ["bar"])
+
+
+class TestIgnore(TestDvc):
+    def _stage_name(self, file):
+        return file + Stage.STAGE_FILE_SUFFIX
+
+    def test_should_not_gather_stage_files_from_ignored_d(self):
+        ret = main(["add", self.FOO, self.BAR, self.DATA, self.DATA_SUB])
+        self.assertEqual(0, ret)
+
+        stages = self.dvc.stages()
+        self.assertEqual(4, len(stages))
+
+        with open(".dvcignore", "w") as fobj:
+            fobj.write("data_dir")
+
+        stages = self.dvc.stages()
+        self.assertEqual(2, len(stages))
+
+        stagenames = [s.relpath for s in stages]
+        self.assertIn(self._stage_name(self.FOO), stagenames)
+        self.assertIn(self._stage_name(self.BAR), stagenames)
+        self.assertNotIn(self._stage_name(self.DATA), stagenames)
+        self.assertNotIn(self._stage_name(self.DATA_SUB), stagenames)
diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py
new file mode 100644
index 0000000000..5c50feb040
--- /dev/null
+++ b/tests/unit/test_ignore.py
@@ -0,0 +1,132 @@
+import os
+
+import pytest
+from dvc.ignore import DvcIgnoreFromFile, DvcIgnoreDir, DvcIgnoreFile
+from mock import patch, Mock
+
+from dvc.utils.compat import cast_bytes
+
+
+def read_pattern(p):
+    return cast_bytes(p, "utf-8")
+
+
+def mock_dvcignore(dvcignore_path, negate_patterns, patterns):
+    negate_patterns = [read_pattern(p) for p in negate_patterns]
+    patterns = [read_pattern(p) for p in patterns]
+
+    mock_ignore_file_handler = Mock()
+    with patch.object(
+        mock_ignore_file_handler,
+        "read_patterns",
+        return_value=(negate_patterns, patterns),
+    ):
+        ignore_file = DvcIgnoreFromFile(
+            dvcignore_path, mock_ignore_file_handler
+        )
+        return ignore_file
+
+
+def test_ignore_from_file_should_filter_dirs_and_files():
+    dvcignore_path = os.path.join(
+        os.path.sep, "full", "path", "to", "ignore", "file", ".dvcignore"
+    )
+
+    negate_patterns = []
+    patterns = ["dir_to_ignore", "file_to_ignore"]
+
+    root = os.path.dirname(dvcignore_path)
+    dirs = ["dir1", "dir2", "dir_to_ignore"]
+    files = ["file1", "file2", "file_to_ignore"]
+
+    ignore = mock_dvcignore(dvcignore_path, negate_patterns, patterns)
+    new_dirs, new_files = ignore(root, dirs, files)
+
+    assert {"dir1", "dir2"} == set(new_dirs)
+    assert {"file1", "file2"} == set(new_files)
+
+
+@pytest.mark.parametrize(
+    "file_to_ignore_relpath, negate_patterns,patterns, expected_match",
+    [
+        ("to_ignore", [], ["to_ignore"], True),
+        ("to_ignore.txt", [], ["to_ignore*"], True),
+        (
+            os.path.join("rel", "p", "p2", "to_ignore"),
+            [],
+            ["rel/**/to_ignore"],
+            True,
+        ),
+        (
+            os.path.join(
+                os.path.sep,
+                "full",
+                "path",
+                "to",
+                "ignore",
+                "file",
+                "to_ignore",
+            ),
+            [],
+            ["to_ignore"],
+            True,
+        ),
+        ("to_ignore.txt", [], ["/*.txt"], True),
+        (
+            os.path.join("rel", "path", "path2", "to_ignore"),
+            [],
+            ["rel/*/to_ignore"],
+            False,
+        ),
+        (os.path.join("path", "to_ignore.txt"), [], ["/*.txt"], False),
+        (
+            os.path.join("rel", "path", "path2", "dont_ignore"),
+            [],
+            ["rel/**/to_ignore"],
+            False,
+        ),
+        ("dont_ignore.txt", [], ["dont_ignore"], False),
+        ("dont_ignore.txt", ["!dont_ignore.txt"], ["dont*"], False),
+    ],
+)
+def test_match_ignore_from_file(
+    file_to_ignore_relpath, negate_patterns, patterns, expected_match
+):
+
+    dvcignore_path = os.path.join(
+        os.path.sep, "full", "path", "to", "ignore", "file", ".dvcignore"
+    )
+    dvcignore_dirname = os.path.dirname(dvcignore_path)
+
+    ignore_file = mock_dvcignore(dvcignore_path, negate_patterns, patterns)
+
+    assert (
+        ignore_file.matches(dvcignore_dirname, file_to_ignore_relpath)
+        == expected_match
+    )
+
+
+@pytest.mark.parametrize("omit_dir", [".git", ".hg", ".dvc"])
+def test_should_ignore_dir(omit_dir):
+    ignore = DvcIgnoreDir(omit_dir)
+
+    root = os.path.join(os.path.sep, "walk", "dir", "root")
+    dirs = [omit_dir, "dir1", "dir2"]
+    files = []
+
+    new_dirs, _ = ignore(root, dirs, files)
+
+    assert set(new_dirs) == {"dir1", "dir2"}
+
+
+def test_should_ignore_file():
+    dvcignore = ".dvcignore"
+    ignore = DvcIgnoreFile(dvcignore)
+
+    root = os.path.join(os.path.sep, "walk", "dir", "root")
+    dirs = []
+    files = ["file1", "file2", dvcignore]
+
+    _, new_files = ignore(root, dirs, files)
+
+    assert set(new_files) == {"file1", "file2"}
diff --git a/tests/unit/utils/test_fs.py b/tests/unit/utils/test_fs.py
index 109422715a..dc2f034aa4 100644
--- a/tests/unit/utils/test_fs.py
+++ b/tests/unit/utils/test_fs.py
@@ -2,12 +2,14 @@
 from unittest import TestCase
 
 import dvc
+import pytest
 from dvc.system import System
 from dvc.utils.compat import str
 from dvc.utils.fs import (
     get_mtime_and_size,
     contains_symlink_up_to,
     BasePathNotInCheckedPathException,
+    get_parent_dirs_up_to,
 )
 from mock import patch
 from tests.basic_env import TestDir
@@ -90,3 +92,51 @@ def base_path_is_symlink(path):
             System, "is_symlink", side_effect=base_path_is_symlink
         ):
             self.assertFalse(contains_symlink_up_to(target_path, base_path))
+
+
+@pytest.mark.parametrize(
+    "path1, path2",
+    [
+        (
+            os.path.join("non", "abs", "path"),
+            os.path.join(os.sep, "full", "path"),
+        ),
+        (
+            os.path.join(os.sep, "full", "path"),
+            os.path.join("non", "abs", "path"),
+        ),
+    ],
+)
+def test_get_parent_dirs_up_to_should_raise_on_no_absolute(path1, path2):
+    with pytest.raises(AssertionError):
+        get_parent_dirs_up_to(path1, path2)
+
+
+@pytest.mark.parametrize(
+    "path1, path2, expected_dirs",
+    [
+        (
+            os.path.join(os.sep, "non", "matching", "path"),
+            os.path.join(os.sep, "other", "path"),
+            [],
+        ),
+        (
+            os.path.join(os.sep, "other", "some", "path"),
+            os.path.join(os.sep, "some"),
+            [],
+        ),
+        (
+            os.path.join(os.sep, "some", "long", "path"),
+            os.path.join(os.sep, "some"),
+            [
+                os.path.join(os.sep, "some"),
+                os.path.join(os.sep, "some", "long"),
+                os.path.join(os.sep, "some", "long", "path"),
+            ],
+        ),
+    ],
+)
+def test_get_parent_dirs_up_to(path1, path2, expected_dirs):
+    result = get_parent_dirs_up_to(path1, path2)
+
+    assert set(result) == set(expected_dirs)