diff --git a/dvc/checkout.py b/dvc/checkout.py index 49f78e06fd..0af6de1a6c 100644 --- a/dvc/checkout.py +++ b/dvc/checkout.py @@ -9,12 +9,14 @@ ConfirmRemoveError, DvcException, ) +from dvc.ignore import DvcIgnoreFilter from dvc.objects import check, load from dvc.objects.errors import ObjectFormatError from dvc.objects.stage import stage from dvc.remote.slow_link_detection import ( # type: ignore[attr-defined] slow_link_guard, ) +from dvc.types import Optional logger = logging.getLogger(__name__) @@ -175,8 +177,13 @@ def _checkout_file( return modified -def _remove_redundant_files(path_info, fs, obj, cache, force): - existing_files = set(fs.walk_files(path_info)) +def _remove_redundant_files( + path_info, fs, obj, cache, force, dvcignore: Optional[DvcIgnoreFilter], +): + if dvcignore: + existing_files = set(dvcignore.walk_files(fs, path_info)) + else: + existing_files = set(fs.walk_files(path_info)) needed_files = {path_info.joinpath(*key) for key, _ in obj} redundant_files = existing_files - needed_files @@ -187,9 +194,16 @@ def _remove_redundant_files(path_info, fs, obj, cache, force): def _checkout_dir( - path_info, fs, obj, cache, force, progress_callback=None, relink=False, + path_info, + fs, + obj, + cache, + force, + progress_callback=None, + relink=False, + dvcignore: Optional[DvcIgnoreFilter] = None, ): - modified = False, False + modified = False # Create dir separately so that dir is created # even if there are no files in it if not fs.exists(path_info): @@ -212,7 +226,10 @@ def _checkout_dir( modified = True modified = ( - _remove_redundant_files(path_info, fs, obj, cache, force) or modified + _remove_redundant_files( + path_info, fs, obj, cache, force, dvcignore=dvcignore + ) + or modified ) fs.repo.state.save(path_info, fs, obj.hash_info) @@ -229,6 +246,7 @@ def _checkout( force=False, progress_callback=None, relink=False, + dvcignore: Optional[DvcIgnoreFilter] = None, ): if not obj.hash_info.isdir: ret = _checkout_file( @@ -236,7 +254,14 @@ def _checkout( ) else: ret = _checkout_dir( - path_info, fs, obj, cache, force, progress_callback, relink, + path_info, + fs, + obj, + cache, + force, + progress_callback, + relink, + dvcignore=dvcignore, ) fs.repo.state.save_link(path_info, fs) @@ -253,6 +278,7 @@ def checkout( progress_callback=None, relink=False, quiet=False, + dvcignore: Optional[DvcIgnoreFilter] = None, ): if path_info.scheme not in ["local", cache.fs.scheme]: raise NotImplementedError @@ -269,7 +295,7 @@ def checkout( failed = path_info elif not relink and not _changed(path_info, fs, obj, cache): - logger.trace("Data '%s' didn't change.", path_info) + logger.trace("Data '%s' didn't change.", path_info) # type: ignore skip = True else: try: @@ -296,5 +322,12 @@ def checkout( logger.debug("Checking out '%s' with cache '%s'.", path_info, obj) return _checkout( - path_info, fs, obj, cache, force, progress_callback, relink, + path_info, + fs, + obj, + cache, + force, + progress_callback, + relink, + dvcignore=dvcignore, ) diff --git a/dvc/command/check_ignore.py b/dvc/command/check_ignore.py index f96e46abca..9666fd012b 100644 --- a/dvc/command/check_ignore.py +++ b/dvc/command/check_ignore.py @@ -10,7 +10,7 @@ class CmdCheckIgnore(CmdBase): def __init__(self, args): super().__init__(args) - self.ignore_filter = self.repo.fs.dvcignore + self.ignore_filter = self.repo.dvcignore def _show_results(self, result): if not result.match and not self.args.non_matching: diff --git a/dvc/config.py b/dvc/config.py index adab7e3878..0ed1a44122 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -154,7 +154,7 @@ def _load_config(self, level): filename = self.files[level] fs = self._get_fs(level) - if fs.exists(filename, use_dvcignore=False): + if fs.exists(filename): with fs.open(filename) as fobj: conf_obj = ConfigObj(fobj) else: diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index 909080a8e9..bdab8e6413 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -56,7 +56,6 @@ def _get_hash(self, locked=True): path_info, repo.repo_fs, self.repo.odb.local.fs.PARAM_CHECKSUM, - follow_subrepos=False, ).hash_info def workspace_status(self): @@ -95,16 +94,11 @@ def download(self, to, jobs=None): except (NoOutputOrStageError, NoRemoteError): pass obj = stage( - odb, - path_info, - repo.repo_fs, - odb.fs.PARAM_CHECKSUM, - jobs=jobs, - follow_subrepos=False, + odb, path_info, repo.repo_fs, odb.fs.PARAM_CHECKSUM, jobs=jobs, ) save(odb, obj, jobs=jobs) - checkout(to.path_info, to.fs, obj, odb) + checkout(to.path_info, to.fs, obj, odb, dvcignore=None) def update(self, rev=None): if rev: diff --git a/dvc/dvcfile.py b/dvc/dvcfile.py index 116594518b..9b2df3a87b 100644 --- a/dvc/dvcfile.py +++ b/dvc/dvcfile.py @@ -124,7 +124,8 @@ def relpath(self): return relpath(self.path) def exists(self): - return self.repo.fs.exists(self.path) + is_ignored = self.repo.dvcignore.is_ignored_file(self.path) + return self.repo.fs.exists(self.path) and not is_ignored def _is_git_ignored(self): return is_git_ignored(self.repo, self.path) @@ -144,8 +145,10 @@ def _load(self): # 3. path doesn't represent a regular file # 4. when the file is git ignored if not self.exists(): - is_ignored = self.repo.fs.exists(self.path, use_dvcignore=False) - raise StageFileDoesNotExistError(self.path, dvc_ignored=is_ignored) + dvc_ignored = self.repo.dvcignore.is_ignored_file(self.path) + raise StageFileDoesNotExistError( + self.path, dvc_ignored=dvc_ignored + ) self._verify_filename() if not self.repo.fs.isfile(self.path): diff --git a/dvc/fs/azure.py b/dvc/fs/azure.py index a25f7a397f..b33afc827a 100644 --- a/dvc/fs/azure.py +++ b/dvc/fs/azure.py @@ -53,7 +53,7 @@ class AzureAuthError(DvcException): pass -class AzureFileSystem(FSSpecWrapper): +class AzureFileSystem(FSSpecWrapper): # pylint:disable=abstract-method scheme = Schemes.AZURE PATH_CLS = CloudURLInfo PARAM_CHECKSUM = "etag" diff --git a/dvc/fs/base.py b/dvc/fs/base.py index 7aa349b303..a7c59aba4c 100644 --- a/dvc/fs/base.py +++ b/dvc/fs/base.py @@ -9,6 +9,7 @@ from dvc.exceptions import DvcException from dvc.path_info import URLInfo from dvc.progress import Tqdm +from dvc.scheme import Schemes from dvc.utils import tmp_fname from dvc.utils.fs import makedirs, move from dvc.utils.http import open_url @@ -92,7 +93,6 @@ def get_missing_deps(cls): return missing def _check_requires(self): - from ..scheme import Schemes from ..utils import format_link from ..utils.pkg import PKG @@ -139,7 +139,7 @@ def open(self, path_info, mode: str = "r", encoding: str = None, **kwargs): raise RemoteActionNotImplemented("open", self.scheme) - def exists(self, path_info, use_dvcignore=True) -> bool: + def exists(self, path_info) -> bool: raise NotImplementedError # pylint: disable=unused-argument @@ -166,6 +166,11 @@ def iscopy(self, path_info): """Check if this file is an independent copy.""" return False # We can't be sure by default + def walk(self, top, topdown=True, onerror=None, **kwargs): + """Return a generator with (root, dirs, files). + """ + raise NotImplementedError + def walk_files(self, path_info, **kwargs): """Return a generator with `PathInfo`s to all the files. diff --git a/dvc/fs/fsspec_wrapper.py b/dvc/fs/fsspec_wrapper.py index 6d4ca84777..08f6f6be89 100644 --- a/dvc/fs/fsspec_wrapper.py +++ b/dvc/fs/fsspec_wrapper.py @@ -2,6 +2,8 @@ import shutil from functools import lru_cache +from funcy import cached_property + from dvc.progress import Tqdm from .base import BaseFileSystem @@ -14,6 +16,7 @@ def __init__(self, repo, config): self.fs_args = {"skip_instance_cache": True} self.fs_args.update(self._prepare_credentials(config)) + @cached_property def fs(self): raise NotImplementedError @@ -88,7 +91,7 @@ def open( def copy(self, from_info, to_info): self.fs.copy(self._with_bucket(from_info), self._with_bucket(to_info)) - def exists(self, path_info, use_dvcignore=False): + def exists(self, path_info) -> bool: return self.fs.exists(self._with_bucket(path_info)) def ls(self, path_info, detail=False): diff --git a/dvc/fs/gdrive.py b/dvc/fs/gdrive.py index f851cf0b3a..cee3982269 100644 --- a/dvc/fs/gdrive.py +++ b/dvc/fs/gdrive.py @@ -85,7 +85,7 @@ def __init__(self, url): self._spath = re.sub("/{2,}", "/", self._spath.rstrip("/")) -class GDriveFileSystem(BaseFileSystem): +class GDriveFileSystem(BaseFileSystem): # pylint:disable=abstract-method scheme = Schemes.GDRIVE PATH_CLS = GDriveURLInfo PARAM_CHECKSUM = "checksum" @@ -517,7 +517,7 @@ def _get_item_id(self, path_info, create=False, use_cache=True, hint=None): assert not create raise FileMissingError(path_info, hint) - def exists(self, path_info, use_dvcignore=True): + def exists(self, path_info) -> bool: try: self._get_item_id(path_info) except FileMissingError: diff --git a/dvc/fs/git.py b/dvc/fs/git.py index 5c1293b406..24276fbce6 100644 --- a/dvc/fs/git.py +++ b/dvc/fs/git.py @@ -1,8 +1,6 @@ import errno import os -from funcy import cached_property - from dvc.utils import is_exec, relpath from .base import BaseFileSystem @@ -11,14 +9,12 @@ class GitFileSystem(BaseFileSystem): # pylint:disable=abstract-method """Proxies the repo file access methods to Git objects""" - def __init__( - self, root_dir, trie, use_dvcignore=False, dvcignore_root=None - ): + scheme = "local" + + def __init__(self, root_dir, trie): super().__init__(None, {}) self._fs_root = root_dir self.trie = trie - self.use_dvcignore = use_dvcignore - self.dvcignore_root = dvcignore_root @property def rev(self): @@ -40,14 +36,6 @@ def _get_key(self, path): return () return tuple(relparts) - @cached_property - def dvcignore(self): - from dvc.ignore import DvcIgnoreFilter, DvcIgnoreFilterNoop - - root = self.dvcignore_root or self.fs_root - cls = DvcIgnoreFilter if self.use_dvcignore else DvcIgnoreFilterNoop - return cls(self, root) - def open( self, path, mode="r", encoding=None ): # pylint: disable=arguments-differ @@ -66,50 +54,27 @@ def open( errno.EISDIR, os.strerror(errno.EISDIR), path ) from exc - def exists( - self, path, use_dvcignore=True - ): # pylint: disable=arguments-differ - def _is_ignored(path): - return self.dvcignore.is_ignored_file( - path - ) or self.dvcignore.is_ignored_dir(path) - - if use_dvcignore and _is_ignored(path): - return False - - key = self._get_key(path) + def exists(self, path_info) -> bool: + key = self._get_key(path_info) return self.trie.exists(key) - def isdir( - self, path, use_dvcignore=True - ): # pylint: disable=arguments-differ - if use_dvcignore and self.dvcignore.is_ignored_dir(path): - return False - key = self._get_key(path) + def isdir(self, path_info) -> bool: + key = self._get_key(path_info) return self.trie.isdir(key) - def isfile(self, path): # pylint: disable=arguments-differ - if self.dvcignore.is_ignored_file(path): - return False - key = self._get_key(path) + def isfile(self, path_info) -> bool: + key = self._get_key(path_info) return self.trie.isfile(key) - def walk( - self, - top, - topdown=True, - onerror=None, - use_dvcignore=True, - ignore_subrepos=True, - ): + def walk(self, top, topdown=True, onerror=None, **kwargs): """Directory tree generator. See `os.walk` for the docs. Differences: - no support for symlinks """ - if not self.isdir(top, use_dvcignore=use_dvcignore): + if not self.isdir(top): if onerror: - if self.exists(top, use_dvcignore=use_dvcignore): + if self.exists(top): exc = NotADirectoryError( errno.ENOTDIR, os.strerror(errno.ENOTDIR), top ) @@ -126,10 +91,6 @@ def walk( root = os.path.join(self.fs_root, os.sep.join(prefix)) else: root = self.fs_root - if use_dvcignore: - dirs[:], files[:] = self.dvcignore( - root, dirs, files, ignore_subrepos=ignore_subrepos, - ) yield root, dirs, files def isexec(self, path_info): @@ -158,11 +119,8 @@ def stat(self, path): errno.ENOENT, os.strerror(errno.ENOENT), path ) - def walk_files(self, top): # pylint: disable=arguments-differ - for root, _, files in self.walk(top): + def walk_files(self, path_info, **kwargs): + for root, _, files in self.walk(path_info, **kwargs): for file in files: # NOTE: os.path.join is ~5.5 times slower yield f"{root}{os.sep}{file}" - - def _reset(self): - return self.__dict__.pop("dvcignore", None) diff --git a/dvc/fs/gs.py b/dvc/fs/gs.py index fd90ef7c38..d68156aa84 100644 --- a/dvc/fs/gs.py +++ b/dvc/fs/gs.py @@ -9,7 +9,7 @@ from .fsspec_wrapper import FSSpecWrapper -class GSFileSystem(FSSpecWrapper): +class GSFileSystem(FSSpecWrapper): # pylint:disable=abstract-method scheme = Schemes.GS PATH_CLS = CloudURLInfo REQUIRES = {"gcsfs": "gcsfs"} diff --git a/dvc/fs/hdfs.py b/dvc/fs/hdfs.py index 28fec1509a..5ee98a312a 100644 --- a/dvc/fs/hdfs.py +++ b/dvc/fs/hdfs.py @@ -123,7 +123,7 @@ def open(self, path_info, mode="r", encoding=None, **kwargs): raise FileNotFoundError(*e.args) raise - def exists(self, path_info, use_dvcignore=True): + def exists(self, path_info) -> bool: assert not isinstance(path_info, list) assert path_info.scheme == "hdfs" with self.hdfs(path_info) as hdfs: @@ -159,15 +159,15 @@ def _walk(self, hdfs, root, topdown=True): if not topdown: yield root, dirs, nondirs - def walk(self, path_info, **kwargs): - if not self.isdir(path_info): + def walk(self, top, topdown=True, onerror=None, **kwargs): + if not self.isdir(top): return - with self.hdfs(path_info) as hdfs: + with self.hdfs(top) as hdfs: for root, dnames, fnames in self._walk( - hdfs, path_info.path, **kwargs + hdfs, top.path, topdown=topdown ): - yield path_info.replace(path=root), dnames, fnames + yield top.replace(path=root), dnames, fnames def walk_files(self, path_info, **kwargs): for root, _, fnames in self.walk(path_info): diff --git a/dvc/fs/http.py b/dvc/fs/http.py index 461b8ea124..a7a5b4ffe2 100644 --- a/dvc/fs/http.py +++ b/dvc/fs/http.py @@ -138,7 +138,7 @@ def _head(self, url): return response - def exists(self, path_info, use_dvcignore=True): + def exists(self, path_info) -> bool: res = self._head(path_info.url) if res.status_code == 404: return False diff --git a/dvc/fs/local.py b/dvc/fs/local.py index 7136c8999d..61eaafdae5 100644 --- a/dvc/fs/local.py +++ b/dvc/fs/local.py @@ -2,8 +2,6 @@ import os import stat -from funcy import cached_property - from dvc.path_info import PathInfo from dvc.scheme import Schemes from dvc.system import System @@ -22,70 +20,39 @@ class LocalFileSystem(BaseFileSystem): PARAM_PATH = "path" TRAVERSE_PREFIX_LEN = 2 - def __init__(self, repo, config, use_dvcignore=False, dvcignore_root=None): + def __init__(self, repo, config): super().__init__(repo, config) url = config.get("url") self.path_info = self.PATH_CLS(url) if url else None - self.use_dvcignore = use_dvcignore - self.dvcignore_root = dvcignore_root @property def fs_root(self): return self.config.get("url") - @cached_property - def dvcignore(self): - from dvc.ignore import DvcIgnoreFilter, DvcIgnoreFilterNoop - - root = self.dvcignore_root or self.fs_root - cls = DvcIgnoreFilter if self.use_dvcignore else DvcIgnoreFilterNoop - return cls(self, root) - @staticmethod def open(path_info, mode="r", encoding=None, **kwargs): return open(path_info, mode=mode, encoding=encoding) - def exists(self, path_info, use_dvcignore=True): + def exists(self, path_info) -> bool: assert isinstance(path_info, str) or path_info.scheme == "local" if self.repo: ret = os.path.lexists(path_info) else: ret = os.path.exists(path_info) - if not ret: - return False - if not use_dvcignore: - return True - - return not self.dvcignore.is_ignored_file( - path_info - ) and not self.dvcignore.is_ignored_dir(path_info) + return ret - def isfile(self, path_info): - if not os.path.isfile(path_info): - return False + def isfile(self, path_info) -> bool: + return os.path.isfile(path_info) - return not self.dvcignore.is_ignored_file(path_info) - - def isdir( - self, path_info, use_dvcignore=True - ): # pylint: disable=arguments-differ - if not os.path.isdir(path_info): - return False - return not (use_dvcignore and self.dvcignore.is_ignored_dir(path_info)) + def isdir(self, path_info) -> bool: + return os.path.isdir(path_info) def iscopy(self, path_info): return not ( System.is_symlink(path_info) or System.is_hardlink(path_info) ) - def walk( - self, - top, - topdown=True, - onerror=None, - use_dvcignore=True, - ignore_subrepos=True, - ): + def walk(self, top, topdown=True, onerror=None, **kwargs): """Directory fs generator. See `os.walk` for the docs. Differences: @@ -94,14 +61,6 @@ def walk( for root, dirs, files in os.walk( top, topdown=topdown, onerror=onerror ): - if use_dvcignore: - dirs[:], files[:] = self.dvcignore( - os.path.abspath(root), - dirs, - files, - ignore_subrepos=ignore_subrepos, - ) - yield os.path.normpath(root), dirs, files def walk_files(self, path_info, **kwargs): @@ -133,9 +92,6 @@ def isexec(self, path_info): return is_exec(mode) def stat(self, path): - if self.dvcignore.is_ignored(path): - raise FileNotFoundError - return os.stat(path) def move(self, from_info, to_info): @@ -227,6 +183,3 @@ def _download( copyfile( from_info, to_file, no_progress_bar=no_progress_bar, name=name ) - - def _reset(self): - return self.__dict__.pop("dvcignore", None) diff --git a/dvc/fs/memory.py b/dvc/fs/memory.py index c9b2ef628a..6bf5fa4d34 100644 --- a/dvc/fs/memory.py +++ b/dvc/fs/memory.py @@ -12,7 +12,7 @@ def __init__(self, repo, config): self.fs = MemFS() - def exists(self, path_info, use_dvcignore=True): + def exists(self, path_info) -> bool: return self.fs.exists(path_info.fspath) def open(self, path_info, mode="r", encoding=None, **kwargs): diff --git a/dvc/fs/oss.py b/dvc/fs/oss.py index e16e8272c3..6cd217720d 100644 --- a/dvc/fs/oss.py +++ b/dvc/fs/oss.py @@ -88,7 +88,7 @@ def _generate_download_url(self, path_info, expires=3600): return self.oss_service.sign_url("GET", path_info.path, expires) - def exists(self, path_info, use_dvcignore=True): + def exists(self, path_info) -> bool: paths = self._list_paths(path_info) return any(path_info.path == path for path in paths) diff --git a/dvc/fs/repo.py b/dvc/fs/repo.py index 2f4dae7626..75821a2316 100644 --- a/dvc/fs/repo.py +++ b/dvc/fs/repo.py @@ -63,7 +63,7 @@ def __init__( if hasattr(repo, "dvc_dir"): self._dvcfss[repo.root_dir] = DvcFileSystem(repo) - def _get_repo(self, path) -> Optional["Repo"]: + def _get_repo(self, path: str) -> Optional["Repo"]: """Returns repo that the path falls in, using prefix. If the path is already tracked/collected, it just returns the repo. @@ -108,8 +108,7 @@ def _is_dvc_repo(self, dir_path): from dvc.repo import Repo repo_path = os.path.join(dir_path, Repo.DVC_DIR) - # dvcignore will ignore subrepos, therefore using `use_dvcignore=False` - return self._main_repo.fs.isdir(repo_path, use_dvcignore=False) + return self._main_repo.fs.isdir(repo_path) def _get_fs_pair( self, path @@ -142,19 +141,20 @@ def open( return dvc_fs.open(path_info, mode=mode, encoding=encoding, **kwargs) - def exists( - self, path, use_dvcignore=True - ): # pylint: disable=arguments-differ - fs, dvc_fs = self._get_fs_pair(path) + def exists(self, path_info) -> bool: + fs, dvc_fs = self._get_fs_pair(path_info) if not dvc_fs: - return fs.exists(path) + return fs.exists(path_info) + + if dvc_fs.repo.dvcignore.is_ignored(fs, path_info): + return False - if fs.exists(path): + if fs.exists(path_info): return True try: - meta = dvc_fs.metadata(path) + meta = dvc_fs.metadata(path_info) except FileNotFoundError: return False @@ -167,6 +167,9 @@ def exists( def isdir(self, path): # pylint: disable=arguments-differ fs, dvc_fs = self._get_fs_pair(path) + if dvc_fs and dvc_fs.repo.dvcignore.is_ignored_dir(path): + return False + try: st = fs.stat(path) return stat.S_ISDIR(st.st_mode) @@ -195,6 +198,9 @@ def isdvc(self, path, **kwargs): def isfile(self, path): # pylint: disable=arguments-differ fs, dvc_fs = self._get_fs_pair(path) + if dvc_fs and dvc_fs.repo.dvcignore.is_ignored_file(path): + return False + try: st = fs.stat(path) return stat.S_ISREG(st.st_mode) @@ -242,9 +248,7 @@ def _subrepo_walk(self, dir_path, **kwargs): ignore_subrepos is set to False. """ fs, dvc_fs = self._get_fs_pair(dir_path) - fs_walk = fs.walk( - dir_path, topdown=True, ignore_subrepos=not self._traverse_subrepos - ) + fs_walk = fs.walk(dir_path, topdown=True) if dvc_fs: dvc_walk = dvc_fs.walk(dir_path, topdown=True, **kwargs) else: @@ -306,15 +310,7 @@ def is_dvc_repo(d): elif dirname in repo_set: yield from self._walk(repo_walk, None, dvcfiles=dvcfiles) - def walk( - self, - top, - topdown=True, - onerror=None, - dvcfiles=False, - follow_subrepos=None, - **kwargs - ): # pylint: disable=arguments-differ + def walk(self, top, topdown=True, onerror=None, **kwargs): """Walk and merge both DVC and repo fss. Args: @@ -340,17 +336,14 @@ def walk( onerror(NotADirectoryError(top)) return - ignore_subrepos = not self._traverse_subrepos - if follow_subrepos is not None: - ignore_subrepos = not follow_subrepos + repo = self._get_repo(os.path.abspath(top)) + dvcfiles = kwargs.pop("dvcfiles", False) fs, dvc_fs = self._get_fs_pair(top) repo_exists = fs.exists(top) - repo_walk = fs.walk( - top, - topdown=topdown, - onerror=onerror, - ignore_subrepos=ignore_subrepos, + + repo_walk = repo.dvcignore.walk( + fs, top, topdown=topdown, onerror=onerror, **kwargs ) if not dvc_fs or (repo_exists and dvc_fs.isdvc(top)): @@ -358,16 +351,20 @@ def walk( return if not repo_exists: - yield from dvc_fs.walk(top, topdown=topdown, **kwargs) + yield from dvc_fs.walk( + top, topdown=topdown, onerror=onerror, **kwargs + ) dvc_walk = None if dvc_fs.exists(top): - dvc_walk = dvc_fs.walk(top, topdown=topdown, **kwargs) + dvc_walk = dvc_fs.walk( + top, topdown=topdown, onerror=onerror, **kwargs + ) yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles) - def walk_files(self, top, **kwargs): # pylint: disable=arguments-differ - for root, _, files in self.walk(top, **kwargs): + def walk_files(self, path_info, **kwargs): + for root, _, files in self.walk(path_info, **kwargs): for fname in files: yield PathInfo(root) / fname diff --git a/dvc/fs/s3.py b/dvc/fs/s3.py index b24e67d319..b8ce9a323e 100644 --- a/dvc/fs/s3.py +++ b/dvc/fs/s3.py @@ -14,7 +14,7 @@ _AWS_CONFIG_PATH = os.path.join(os.path.expanduser("~"), ".aws", "config") -class BaseS3FileSystem(FSSpecWrapper): +class BaseS3FileSystem(FSSpecWrapper): # pylint:disable=abstract-method scheme = Schemes.S3 PATH_CLS = CloudURLInfo REQUIRES = {"s3fs": "s3fs", "boto3": "boto3"} @@ -167,7 +167,7 @@ def wrapper(*args, **kwargs): return wrapper -class S3FileSystem(BaseS3FileSystem): +class S3FileSystem(BaseS3FileSystem): # pylint:disable=abstract-method @wrap_prop(threading.Lock()) @cached_property def s3(self): diff --git a/dvc/fs/ssh/__init__.py b/dvc/fs/ssh/__init__.py index 4ad1fca363..6753d038eb 100644 --- a/dvc/fs/ssh/__init__.py +++ b/dvc/fs/ssh/__init__.py @@ -30,7 +30,7 @@ def ask_password(host, user, port): ) -class SSHFileSystem(BaseFileSystem): +class SSHFileSystem(BaseFileSystem): # pylint:disable=abstract-method scheme = Schemes.SSH REQUIRES = {"paramiko": "paramiko"} _JOBS = 4 @@ -159,7 +159,7 @@ def open(self, path_info, mode="r", encoding=None, **kwargs): else: yield io.TextIOWrapper(fd, encoding=encoding) - def exists(self, path_info, use_dvcignore=True): + def exists(self, path_info) -> bool: with self.ssh(path_info) as ssh: return ssh.exists(path_info.path) diff --git a/dvc/fs/webdav.py b/dvc/fs/webdav.py index cb0954040e..c5b82cca52 100644 --- a/dvc/fs/webdav.py +++ b/dvc/fs/webdav.py @@ -142,7 +142,7 @@ def open(self, path_info, mode="r", encoding=None, **kwargs): return io.TextIOWrapper(fobj, encoding=encoding) # Checks whether file/directory exists at remote - def exists(self, path_info, use_dvcignore=True): + def exists(self, path_info) -> bool: # Use webdav check to test for file existence return self._client.check(path_info.path) diff --git a/dvc/fs/webhdfs.py b/dvc/fs/webhdfs.py index cf6873ae58..74249c017b 100644 --- a/dvc/fs/webhdfs.py +++ b/dvc/fs/webhdfs.py @@ -30,7 +30,7 @@ def update(_, bytes_transfered): return update -class WebHDFSFileSystem(BaseFileSystem): +class WebHDFSFileSystem(BaseFileSystem): # pylint:disable=abstract-method scheme = Schemes.WEBHDFS PATH_CLS = CloudURLInfo REQUIRES = {"hdfs": "hdfs"} @@ -114,7 +114,7 @@ def remove(self, path_info): self.hdfs_client.delete(path_info.path) - def exists(self, path_info, use_dvcignore=True): + def exists(self, path_info) -> bool: assert not isinstance(path_info, list) assert path_info.scheme == "webhdfs" diff --git a/dvc/ignore.py b/dvc/ignore.py index cdb8722b8c..3348e24a70 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -7,9 +7,12 @@ from pathspec.patterns import GitWildMatchPattern from pathspec.util import normalize_file +from dvc.fs.base import BaseFileSystem from dvc.path_info import PathInfo from dvc.pathspec_math import PatternInfo, merge_patterns +from dvc.scheme import Schemes from dvc.system import System +from dvc.types import AnyPath, List, Optional from dvc.utils import relpath from dvc.utils.collections import PathStringTrie @@ -67,7 +70,7 @@ def from_files(cls, ignore_file_path, fs): return cls(path_spec_lines, dirname) - def __call__(self, root, dirs, files): + def __call__(self, root: List[str], dirs: List[str], files: List[str]): files = [f for f in files if not self.matches(root, f)] dirs = [d for d in dirs if not self.matches(root, d, True)] @@ -159,26 +162,6 @@ def _no_match(path): return CheckIgnoreResult(path, False, ["::"]) -class DvcIgnoreFilterNoop: - def __init__(self, fs, root_dir): - pass - - def __call__(self, root, dirs, files, **kwargs): - return dirs, files - - def is_ignored_dir(self, _): - return False - - def is_ignored_file(self, _): - return False - - def check_ignore(self, path): - return _no_match(path) - - def is_ignored(self, _): - return False - - class DvcIgnoreFilter: def __init__(self, fs, root_dir): from dvc.repo import Repo @@ -193,27 +176,35 @@ def __init__(self, fs, root_dir): self.fs = fs self.root_dir = root_dir self.ignores_trie_fs = PathStringTrie() + self._ignores_trie_subrepos = PathStringTrie() self.ignores_trie_fs[root_dir] = DvcIgnorePatterns( default_ignore_patterns, root_dir ) - self._ignored_subrepos = PathStringTrie() - self._update(self.root_dir) - - def _update(self, dirname): - self._update_sub_repo(dirname) + self._ignores_trie_subrepos[root_dir] = self.ignores_trie_fs[root_dir] + self._update( + self.root_dir, + self._ignores_trie_subrepos, + dnames=None, + ignore_subrepos=False, + ) + self._update( + self.root_dir, + self.ignores_trie_fs, + dnames=None, + ignore_subrepos=True, + ) - old_pattern = self.ignores_trie_fs.longest_prefix(dirname).value + def _update_trie(self, dirname: str, trie: PathStringTrie) -> None: + old_pattern = trie.longest_prefix(dirname).value matches = old_pattern.matches(dirname, DvcIgnore.DVCIGNORE_FILE, False) ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE) - if not matches and self.fs.exists( - ignore_file_path, use_dvcignore=False - ): + if not matches and self.fs.exists(ignore_file_path): new_pattern = DvcIgnorePatterns.from_files( ignore_file_path, self.fs ) if old_pattern: - self.ignores_trie_fs[dirname] = DvcIgnorePatterns( + trie[dirname] = DvcIgnorePatterns( *merge_patterns( old_pattern.pattern_list, old_pattern.dirname, @@ -222,11 +213,32 @@ def _update(self, dirname): ) ) else: - self.ignores_trie_fs[dirname] = new_pattern + trie[dirname] = new_pattern elif old_pattern: - self.ignores_trie_fs[dirname] = old_pattern + trie[dirname] = old_pattern + + def _update( + self, + dirname: str, + ignore_trie: PathStringTrie, + dnames: Optional["List"], + ignore_subrepos: bool, + ) -> None: + self._update_trie(dirname, ignore_trie) + + if ignore_subrepos: + if dnames is None: + try: + _, dnames, _ = next(self.fs.walk(dirname)) + except StopIteration: + dnames = [] + + for dname in dnames: + self._update_sub_repo( + os.path.join(dirname, dname), ignore_trie + ) - def _update_sub_repo(self, path): + def _update_sub_repo(self, path, ignore_trie: PathStringTrie): from dvc.repo import Repo if path == self.root_dir: @@ -237,14 +249,11 @@ def _update_sub_repo(self, path): return root, dname = os.path.split(path) - self._ignored_subrepos[root] = self._ignored_subrepos.get( - root, set() - ) | {dname} pattern_info = PatternInfo(f"/{dname}/", f"in sub_repo:{dname}") new_pattern = DvcIgnorePatterns([pattern_info], root) - old_pattern = self.ignores_trie_fs.longest_prefix(root).value + old_pattern = ignore_trie.longest_prefix(root).value if old_pattern: - self.ignores_trie_fs[root] = DvcIgnorePatterns( + ignore_trie[root] = DvcIgnorePatterns( *merge_patterns( old_pattern.pattern_list, old_pattern.dirname, @@ -253,25 +262,50 @@ def _update_sub_repo(self, path): ) ) else: - self.ignores_trie_fs[root] = new_pattern + ignore_trie[root] = new_pattern def __call__(self, root, dirs, files, ignore_subrepos=True): - for dname in dirs: - self._update_sub_repo(os.path.join(root, dname)) - - ignore_pattern = self._get_trie_pattern(root) + abs_root = os.path.abspath(root) + ignore_pattern = self._get_trie_pattern( + abs_root, dnames=dirs, ignore_subrepos=ignore_subrepos + ) if ignore_pattern: - dirs, files = ignore_pattern(root, dirs, files) - if not ignore_subrepos: - dirs.extend(self._ignored_subrepos.get(root, [])) + dirs, files = ignore_pattern(abs_root, dirs, files) return dirs, files - def _get_trie_pattern(self, dirname): - ignore_pattern = self.ignores_trie_fs.get(dirname) + def walk(self, fs: BaseFileSystem, path_info: AnyPath, **kwargs): + ignore_subrepos = kwargs.pop("ignore_subrepos", True) + if fs.scheme == Schemes.LOCAL: + for root, dirs, files in fs.walk(path_info, **kwargs): + dirs[:], files[:] = self( + root, dirs, files, ignore_subrepos=ignore_subrepos + ) + yield root, dirs, files + else: + yield from fs.walk(path_info, **kwargs) + + def walk_files(self, fs: BaseFileSystem, path_info: AnyPath, **kwargs): + if fs.scheme == Schemes.LOCAL: + for root, _, files in self.walk(fs, path_info, **kwargs): + for file in files: + # NOTE: os.path.join is ~5.5 times slower + yield PathInfo(f"{root}{os.sep}{file}") + else: + yield from fs.walk_files(path_info) + + def _get_trie_pattern( + self, dirname, dnames: Optional["List"] = None, ignore_subrepos=True + ) -> Optional["DvcIgnorePatterns"]: + if ignore_subrepos: + ignores_trie = self.ignores_trie_fs + else: + ignores_trie = self._ignores_trie_subrepos + + ignore_pattern = ignores_trie.get(dirname) if ignore_pattern: return ignore_pattern - prefix = self.ignores_trie_fs.longest_prefix(dirname).key + prefix = ignores_trie.longest_prefix(dirname).key if not prefix: # outside of the repo return None @@ -286,33 +320,31 @@ def _get_trie_pattern(self, dirname): dirs.append(dirname) for parent in dirs: - self._update(parent) + self._update(parent, ignores_trie, dnames, ignore_subrepos) - return self.ignores_trie_fs.get(dirname) + return ignores_trie.get(dirname) - def _is_ignored(self, path, is_dir=False): + def _is_ignored( + self, path: str, is_dir: bool = False, ignore_subrepos: bool = True + ): if self._outside_repo(path): return False dirname, basename = os.path.split(os.path.normpath(path)) - ignore_pattern = self._get_trie_pattern(dirname) + ignore_pattern = self._get_trie_pattern(dirname, None, ignore_subrepos) if ignore_pattern: return ignore_pattern.matches(dirname, basename, is_dir) return False - def _is_subrepo(self, path): - dirname, basename = os.path.split(os.path.normpath(path)) - return basename in self._ignored_subrepos.get(dirname, set()) - - def is_ignored_dir(self, path, ignore_subrepos=True): + def is_ignored_dir(self, path: str, ignore_subrepos: bool = True) -> bool: + "Only used in LocalFileSystem" path = os.path.abspath(path) - if not ignore_subrepos: - return not self._is_subrepo(path) if path == self.root_dir: return False - return self._is_ignored(path, True) + return self._is_ignored(path, True, ignore_subrepos=ignore_subrepos) - def is_ignored_file(self, path): + def is_ignored_file(self, path: str) -> bool: + "Only used in LocalFileSystem" path = os.path.abspath(path) return self._is_ignored(path, False) @@ -346,14 +378,20 @@ def check_ignore(self, target): return CheckIgnoreResult(target, True, matches) return _no_match(target) - def is_ignored(self, path): + def is_ignored( + self, fs: BaseFileSystem, path: str, ignore_subrepos: bool = True + ) -> bool: # NOTE: can't use self.check_ignore(path).match for now, see # https://github.com/iterative/dvc/issues/4555 - if os.path.isfile(path): + if fs.scheme != Schemes.LOCAL: + return False + if fs.isfile(path): return self.is_ignored_file(path) - if os.path.isdir(path): - return self.is_ignored_dir(path) - return self.is_ignored_file(path) or self.is_ignored_dir(path) + if fs.isdir(path): + return self.is_ignored_dir(path, ignore_subrepos) + return self.is_ignored_file(path) or self.is_ignored_dir( + path, ignore_subrepos + ) def init(path): diff --git a/dvc/objects/stage.py b/dvc/objects/stage.py index dd64bf9c62..f6e228bdf5 100644 --- a/dvc/objects/stage.py +++ b/dvc/objects/stage.py @@ -77,7 +77,13 @@ def _get_file_obj(path_info, fs, name, odb=None, state=None, upload=False): return path_info, obj -def _build_objects(path_info, fs, name, odb, state, upload, **kwargs): +def _build_objects( + path_info, fs, name, odb, state, upload, dvcignore=None, **kwargs +): + if dvcignore: + walk_iterator = dvcignore.walk_files(fs, path_info) + else: + walk_iterator = fs.walk_files(path_info) with Tqdm( unit="md5", desc="Computing file/dir hashes (only done once)", @@ -96,7 +102,7 @@ def _build_objects(path_info, fs, name, odb, state, upload, **kwargs): with ThreadPoolExecutor( max_workers=kwargs.pop("jobs", fs.hash_jobs) ) as executor: - yield from executor.map(worker, fs.walk_files(path_info, **kwargs)) + yield from executor.map(worker, walk_iterator) def _iter_objects(path_info, fs, name, odb, state, upload, **kwargs): diff --git a/dvc/output/base.py b/dvc/output/base.py index faed0f5b2c..dfec17eca9 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -21,6 +21,7 @@ from dvc.objects.db import NamedCache from dvc.objects.errors import ObjectFormatError from dvc.objects.stage import stage as ostage +from dvc.scheme import Schemes from ..fs.base import BaseFileSystem @@ -117,6 +118,11 @@ def __init__( desc=None, isexec=False, ): + self.repo = stage.repo if stage else None + if fs: + self.fs = fs + else: + self.fs = self.FS_CLS(self.repo, {}) self._validate_output_path(path, stage) # This output (and dependency) objects have too many paths/urls # here is a list and comments: @@ -129,13 +135,8 @@ def __init__( # By resolved path, which contains actual location, # should be absolute and don't contain remote:// refs. self.stage = stage - self.repo = stage.repo if stage else None self.def_path = path self.hash_info = HashInfo.from_dict(info) - if fs: - self.fs = fs - else: - self.fs = self.FS_CLS(self.repo, {}) self.use_cache = False if self.IS_DEPENDENCY else cache self.metric = False if self.IS_DEPENDENCY else metric self.plot = False if self.IS_DEPENDENCY else plot @@ -195,17 +196,31 @@ def get_hash(self): self.path_info, self.fs, self.fs.PARAM_CHECKSUM, + dvcignore=self.dvcignore, ).hash_info return ostage( - self.odb, self.path_info, self.fs, self.odb.fs.PARAM_CHECKSUM + self.odb, + self.path_info, + self.fs, + self.odb.fs.PARAM_CHECKSUM, + dvcignore=self.dvcignore, ).hash_info @property def is_dir_checksum(self): return self.hash_info.isdir + def _is_path_dvcignore(self, path) -> bool: + if not self.IS_DEPENDENCY and self.dvcignore: + if self.dvcignore.is_ignored(self.fs, path, ignore_subrepos=False): + return True + return False + @property def exists(self): + if self._is_path_dvcignore(self.path_info): + return False + return self.fs.exists(self.path_info) def changed_checksum(self): @@ -248,14 +263,22 @@ def changed(self): logger.debug(str(status)) return bool(status) + @property + def dvcignore(self): + return None + @property def is_empty(self): return self.fs.is_empty(self.path_info) def isdir(self): + if self._is_path_dvcignore(self.path_info): + return False return self.fs.isdir(self.path_info) def isfile(self): + if self._is_path_dvcignore(self.path_info): + return False return self.fs.isfile(self.path_info) # pylint: disable=no-member @@ -307,7 +330,11 @@ def save(self): return self.obj = ostage( - self.odb, self.path_info, self.fs, self.odb.fs.PARAM_CHECKSUM + self.odb, + self.path_info, + self.fs, + self.odb.fs.PARAM_CHECKSUM, + dvcignore=self.dvcignore, ) self.hash_info = self.obj.hash_info self.isexec = self.isfile() and self.fs.isexec(self.path_info) @@ -328,6 +355,7 @@ def commit(self, filter_info=None): filter_info or self.path_info, self.fs, self.odb.fs.PARAM_CHECKSUM, + dvcignore=self.dvcignore, ) objects.save(self.odb, obj) checkout( @@ -336,6 +364,7 @@ def commit(self, filter_info=None): obj, self.odb, relink=True, + dvcignore=self.dvcignore, ) self.set_exec() @@ -451,7 +480,7 @@ def checkout( def remove(self, ignore_remove=False): self.fs.remove(self.path_info) - if self.scheme != "local": + if self.scheme != Schemes.LOCAL: return if ignore_remove: @@ -650,18 +679,17 @@ def get_used_cache(self, **kwargs): return ret - @classmethod - def _validate_output_path(cls, path, stage=None): + def _validate_output_path(self, path, stage=None): from dvc.dvcfile import is_valid_filename if is_valid_filename(path): - raise cls.IsStageFileError(path) + raise self.IsStageFileError(path) if stage: abs_path = os.path.join(stage.wdir, path) - if stage.repo.fs.dvcignore.is_ignored(abs_path): - check = stage.repo.fs.dvcignore.check_ignore(abs_path) - raise cls.IsIgnoredError(check) + if self._is_path_dvcignore(abs_path): + check = stage.repo.dvcignore.check_ignore(abs_path) + raise self.IsIgnoredError(check) def _check_can_merge(self, out): if self.scheme != out.scheme: diff --git a/dvc/output/local.py b/dvc/output/local.py index 76fa01ab7d..e06d0990ad 100644 --- a/dvc/output/local.py +++ b/dvc/output/local.py @@ -105,3 +105,7 @@ def verify_metric(self): if not istextfile(path, self.fs): msg = "binary file '{}' cannot be used as {}." raise DvcException(msg.format(self.path_info, name)) + + @property + def dvcignore(self): + return self.repo.dvcignore diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 961a447b23..832b640ba7 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -9,6 +9,7 @@ from dvc.exceptions import FileMissingError from dvc.exceptions import IsADirectoryError as DvcIsADirectoryError from dvc.exceptions import NotDvcRepoError, OutputNotFoundError +from dvc.ignore import DvcIgnoreFilter from dvc.path_info import PathInfo from dvc.utils.fs import path_isin @@ -157,13 +158,10 @@ def __init__( root_dir=root_dir, scm=scm, rev=rev, uninitialized=uninitialized ) - fs_kwargs = {"use_dvcignore": True, "dvcignore_root": self.root_dir} if scm: - self._fs = scm.get_fs(rev, **fs_kwargs) + self._fs = scm.get_fs(rev) else: - self._fs = LocalFileSystem( - self, {"url": self.root_dir}, **fs_kwargs - ) + self._fs = LocalFileSystem(self, {"url": self.root_dir}) self.config = Config(self.dvc_dir, fs=self.fs, config=config) self._uninitialized = uninitialized @@ -190,7 +188,7 @@ def __init__( # NOTE: storing state and link_state in the repository itself to # avoid any possible state corruption in 'shared cache dir' # scenario. - self.state = State(self.root_dir, self.tmp_dir) + self.state = State(self.root_dir, self.tmp_dir, self.dvcignore) self.stage_cache = StageCache(self) self._ignore() @@ -241,6 +239,11 @@ def scm(self): return SCM(self.root_dir, no_scm=True) raise + @cached_property + def dvcignore(self) -> DvcIgnoreFilter: + + return DvcIgnoreFilter(self.fs, self.root_dir) + def get_rev(self): from dvc.fs.local import LocalFileSystem @@ -506,6 +509,7 @@ def _reset(self): self.__dict__.pop("graph", None) self.__dict__.pop("stages", None) self.__dict__.pop("pipelines", None) + self.__dict__.pop("dvcignore", None) def __enter__(self): return self diff --git a/dvc/repo/add.py b/dvc/repo/add.py index f5e6f658d3..73fc48f9eb 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -209,7 +209,7 @@ def _find_all_targets(repo, target, recursive): return [ os.fspath(path) for path in Tqdm( - repo.fs.walk_files(target), + repo.dvcignore.walk_files(repo.fs, target), desc="Searching " + target, bar_format=Tqdm.BAR_FMT_NOTOTAL, unit="file", diff --git a/dvc/repo/brancher.py b/dvc/repo/brancher.py index 63bce9d186..8c2420f7be 100644 --- a/dvc/repo/brancher.py +++ b/dvc/repo/brancher.py @@ -36,7 +36,7 @@ def brancher( # noqa: E302 scm = self.scm - self.fs = LocalFileSystem(self, {"url": self.root_dir}, use_dvcignore=True) + self.fs = LocalFileSystem(self, {"url": self.root_dir}) yield "workspace" if revs and "workspace" in revs: @@ -59,9 +59,7 @@ def brancher( # noqa: E302 try: if revs: for sha, names in group_by(scm.resolve_rev, revs).items(): - self.fs = scm.get_fs( - sha, use_dvcignore=True, dvcignore_root=self.root_dir - ) + self.fs = scm.get_fs(sha) # ignore revs that don't contain repo root # (i.e. revs from before a subdir=True repo was init'ed) if self.fs.exists(self.root_dir): diff --git a/dvc/repo/collect.py b/dvc/repo/collect.py index a099d391d7..1f8a03a881 100644 --- a/dvc/repo/collect.py +++ b/dvc/repo/collect.py @@ -43,7 +43,7 @@ def _collect_paths( for path_info in path_infos: if recursive and fs.isdir(path_info): - target_infos.extend(fs.walk_files(path_info)) + target_infos.extend(repo.dvcignore.walk_files(fs, path_info)) if not fs.exists(path_info): if not recursive: diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index b25627f7b1..a623221c99 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -111,14 +111,7 @@ def cb(result): cb(repo.cloud.pull(used, jobs)) except (NoOutputOrStageError, NoRemoteError): pass - obj = stage( - odb, - path_info, - repo.repo_fs, - "md5", - jobs=jobs, - follow_subrepos=False, - ) + obj = stage(odb, path_info, repo.repo_fs, "md5", jobs=jobs,) save( odb, obj, jobs=jobs, download_callback=cb, ) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index cdd908a820..c81383e7b5 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -52,8 +52,6 @@ def get(url, path, out=None, rev=None, jobs=None): ) as repo: from_info = PathInfo(repo.root_dir) / path to_info = PathInfo(out) - repo.repo_fs.download( - from_info, to_info, jobs=jobs, follow_subrepos=False - ) + repo.repo_fs.download(from_info, to_info, jobs=jobs) finally: remove(tmp_dir) diff --git a/dvc/repo/stage.py b/dvc/repo/stage.py index a45fb682b9..14e00eb290 100644 --- a/dvc/repo/stage.py +++ b/dvc/repo/stage.py @@ -465,9 +465,10 @@ def is_out_or_ignored(root, directory): return dir_path in outs or is_ignored(dir_path) stages = [] - for root, dirs, files in self.fs.walk(self.repo.root_dir): + for root, dirs, files in self.repo.dvcignore.walk( + self.fs, self.repo.root_dir + ): dvcfile_filter = partial(is_dvcfile_and_not_ignored, root) - for file in filter(dvcfile_filter, files): file_path = os.path.join(root, file) try: diff --git a/dvc/scm/git/__init__.py b/dvc/scm/git/__init__.py index e65d3cf1e0..6136a4e868 100644 --- a/dvc/scm/git/__init__.py +++ b/dvc/scm/git/__init__.py @@ -345,7 +345,7 @@ def _backend_func(self, name, *args, **kwargs): pass raise NoGitBackendError(name) - def get_fs(self, rev: str, **kwargs): + def get_fs(self, rev: str): from dvc.fs.git import GitFileSystem from .objects import GitTrie @@ -353,7 +353,7 @@ def get_fs(self, rev: str, **kwargs): resolved = self.resolve_rev(rev) tree_obj = self.pygit2.get_tree_obj(rev=resolved) trie = GitTrie(tree_obj, resolved) - return GitFileSystem(self.root_dir, trie, **kwargs) + return GitFileSystem(self.root_dir, trie) is_ignored = partialmethod(_backend_func, "is_ignored") add = partialmethod(_backend_func, "add") diff --git a/dvc/state.py b/dvc/state.py index 219c4ca7da..040e84c58b 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -64,13 +64,14 @@ def save_link(self, path_info, fs): class State(StateBase): # pylint: disable=too-many-instance-attributes - def __init__(self, root_dir=None, tmp_dir=None): + def __init__(self, root_dir=None, tmp_dir=None, dvcignore=None): from diskcache import Cache super().__init__() self.tmp_dir = tmp_dir self.root_dir = root_dir + self.dvcignore = dvcignore self.fs = LocalFileSystem(None, {"url": self.root_dir}) if not tmp_dir: @@ -103,7 +104,7 @@ def save(self, path_info, fs, hash_info): assert isinstance(hash_info, HashInfo) assert os.path.exists(path_info) - mtime, size = get_mtime_and_size(path_info, self.fs) + mtime, size = get_mtime_and_size(path_info, self.fs, self.dvcignore) inode = get_inode(path_info) logger.debug( @@ -135,7 +136,7 @@ def get(self, path_info, fs): if not os.path.exists(path): return None - mtime, size = get_mtime_and_size(path, self.fs) + mtime, size = get_mtime_and_size(path, self.fs, self.dvcignore) inode = get_inode(path) value = self.md5s.get(inode) @@ -160,7 +161,7 @@ def save_link(self, path_info, fs): if not self.fs.exists(path_info): return - mtime, _ = get_mtime_and_size(path_info, self.fs) + mtime, _ = get_mtime_and_size(path_info, self.fs, self.dvcignore) inode = get_inode(path_info) relative_path = relpath(path_info, self.root_dir) @@ -186,7 +187,7 @@ def get_unused_links(self, used, fs): continue inode = get_inode(path) - mtime, _ = get_mtime_and_size(path, self.fs) + mtime, _ = get_mtime_and_size(path, self.fs, self.dvcignore) if ref[relative_path] == (inode, mtime): logger.debug("Removing '%s' as unused link.", path) diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index d46c736772..78fa36722a 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -30,13 +30,17 @@ def get_inode(path): return inode -def get_mtime_and_size(path, fs): +def get_mtime_and_size(path, fs, dvcignore=None): import nanotime if fs.isdir(path): size = 0 files_mtimes = {} - for file_path in fs.walk_files(path): + if dvcignore: + walk_iterator = dvcignore.walk_files(fs, path) + else: + walk_iterator = fs.walk_files(path) + for file_path in walk_iterator: try: stats = fs.stat(file_path) except OSError as exc: diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index 398363d6bc..41f53b782a 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -206,7 +206,7 @@ def test_subrepos_are_ignored(tmp_dir, erepo_dir): PathInfo(repo.root_dir) / "dir", repo.repo_fs, "md5", - follow_subrepos=False, + dvcignore=repo.dvcignore, ) save(repo.odb.local, obj) assert set(cache_dir.glob("*/*")) == { diff --git a/tests/func/test_fs.py b/tests/func/test_fs.py index 3d74a66061..a96ab147f5 100644 --- a/tests/func/test_fs.py +++ b/tests/func/test_fs.py @@ -144,9 +144,13 @@ def test_subdir(self): class TestWalkInGit(AssertWalkEqualMixin, TestGit): def test_nobranch(self): - fs = LocalFileSystem(None, {"url": self._root_dir}, use_dvcignore=True) + fs = LocalFileSystem(None, {"url": self._root_dir}) + walk_result = [] + for root, dirs, files in fs.walk("."): + dirs[:] = [i for i in dirs if i != ".git"] + walk_result.append((root, dirs, files)) self.assertWalkEqual( - fs.walk("."), + walk_result, [ (".", ["data_dir"], ["bar", "тест", "code.py", "foo"]), (join("data_dir"), ["data_sub_dir"], ["data"]), @@ -196,13 +200,11 @@ def test_cleanfs_subrepo(tmp_dir, dvc, scm, monkeypatch): path = PathInfo(subrepo_dir) - assert dvc.fs.use_dvcignore - assert not dvc.fs.exists(path / "foo") - assert not dvc.fs.isfile(path / "foo") - assert not dvc.fs.exists(path / "dir") - assert not dvc.fs.isdir(path / "dir") + assert dvc.fs.exists(path / "foo") + assert dvc.fs.isfile(path / "foo") + assert dvc.fs.exists(path / "dir") + assert dvc.fs.isdir(path / "dir") - assert subrepo.fs.use_dvcignore assert subrepo.fs.exists(path / "foo") assert subrepo.fs.isfile(path / "foo") assert subrepo.fs.exists(path / "dir") @@ -219,17 +221,13 @@ def test_walk_dont_ignore_subrepos(tmp_dir, scm, dvc): scm.commit("Add subrepo") dvc_fs = dvc.fs - dvc_fs._reset() - scm_fs = scm.get_fs("HEAD", use_dvcignore=True) + dvc._reset() + scm_fs = scm.get_fs("HEAD") path = os.fspath(tmp_dir) get_dirs = itemgetter(1) - assert get_dirs(next(dvc_fs.walk(path))) == [] - assert get_dirs(next(scm_fs.walk(path))) == [] - - kw = {"ignore_subrepos": False} - assert get_dirs(next(dvc_fs.walk(path, **kw))) == ["subdir"] - assert get_dirs(next(scm_fs.walk(path, **kw))) == ["subdir"] + assert set(get_dirs(next(dvc_fs.walk(path)))) == {".dvc", "subdir", ".git"} + assert set(get_dirs(next(scm_fs.walk(path)))) == {".dvc", "subdir"} @pytest.mark.parametrize( diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 4da3bf7778..30e9446c7a 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -7,57 +7,51 @@ from dvc.exceptions import DvcIgnoreInCollectedDirError from dvc.ignore import DvcIgnore, DvcIgnorePatterns from dvc.output.base import OutputIsIgnoredError -from dvc.path_info import PathInfo from dvc.pathspec_math import PatternInfo, merge_patterns from dvc.repo import Repo -from dvc.utils import relpath +from dvc.types import List from dvc.utils.fs import get_mtime_and_size from tests.dir_helpers import TmpDir -def _to_pattern_info_list(str_list): +def _to_pattern_info_list(str_list: List): return [PatternInfo(a, "") for a in str_list] -def test_ignore(tmp_dir, dvc, monkeypatch): - tmp_dir.gen({"dir": {"ignored": "text", "other": "text2"}}) - tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "dir/ignored") - - dvc.fs._reset() - - path = PathInfo(tmp_dir) +@pytest.mark.parametrize("filename", ["ignored", "тест"]) +def test_ignore(tmp_dir, dvc, filename): + tmp_dir.gen({"dir": {filename: filename, "other": "text2"}}) + tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "dir/{}".format(filename)) - assert set(dvc.fs.walk_files(path / "dir")) == {path / "dir" / "other"} + dvc._reset() - -def test_ignore_unicode(tmp_dir, dvc): - tmp_dir.gen({"dir": {"other": "text", "тест": "проверка"}}) - tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "dir/тест") - dvc.fs._reset() - path = PathInfo(tmp_dir) - assert set(dvc.fs.walk_files(path / "dir")) == {path / "dir" / "other"} + result = dvc.dvcignore.walk_files(dvc.fs, tmp_dir) + assert set(result) == { + tmp_dir / DvcIgnore.DVCIGNORE_FILE, + tmp_dir / "dir" / "other", + } def test_rename_ignored_file(tmp_dir, dvc): tmp_dir.gen({"dir": {"ignored": "...", "other": "text"}}) tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "ignored*") - dvc.fs._reset() + dvc._reset() - mtime, size = get_mtime_and_size("dir", dvc.fs) + mtime, size = get_mtime_and_size("dir", dvc.fs, dvc.dvcignore) shutil.move("dir/ignored", "dir/ignored_new") - new_mtime, new_size = get_mtime_and_size("dir", dvc.fs) + new_mtime, new_size = get_mtime_and_size("dir", dvc.fs, dvc.dvcignore) assert new_mtime == mtime and new_size == size def test_rename_file(tmp_dir, dvc): tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}}) - mtime, size = get_mtime_and_size("dir", dvc.fs) + mtime, size = get_mtime_and_size("dir", dvc.fs, dvc.dvcignore) shutil.move("dir/foo", "dir/foo_new") - new_mtime, new_size = get_mtime_and_size("dir", dvc.fs) + new_mtime, new_size = get_mtime_and_size("dir", dvc.fs, dvc.dvcignore) assert new_mtime != mtime and new_size == size @@ -65,22 +59,22 @@ def test_rename_file(tmp_dir, dvc): def test_remove_ignored_file(tmp_dir, dvc): tmp_dir.gen({"dir": {"ignored": "...", "other": "text"}}) tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "dir/ignored") - dvc.fs._reset() + dvc._reset() - mtime, size = get_mtime_and_size("dir", dvc.fs) + mtime, size = get_mtime_and_size("dir", dvc.fs, dvc.dvcignore) os.remove("dir/ignored") - new_mtime, new_size = get_mtime_and_size("dir", dvc.fs) + new_mtime, new_size = get_mtime_and_size("dir", dvc.fs, dvc.dvcignore) assert new_mtime == mtime and new_size == size def test_remove_file(tmp_dir, dvc): tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}}) - mtime, size = get_mtime_and_size("dir", dvc.fs) + mtime, size = get_mtime_and_size("dir", dvc.fs, dvc.dvcignore) os.remove("dir/foo") - new_mtime, new_size = get_mtime_and_size("dir", dvc.fs) + new_mtime, new_size = get_mtime_and_size("dir", dvc.fs, dvc.dvcignore) assert new_mtime != mtime and new_size != size @@ -98,12 +92,12 @@ def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname): top_ignore_file = (tmp_dir / dname).with_name(DvcIgnore.DVCIGNORE_FILE) top_ignore_file.write_text(os.path.basename(dname)) - dvc.fs._reset() + dvc._reset() ignore_file = tmp_dir / dname / DvcIgnore.DVCIGNORE_FILE ignore_file.write_text("foo") - dvcignore = dvc.fs.dvcignore + dvcignore = dvc.dvcignore top_ignore_path = os.path.dirname(os.fspath(top_ignore_file)) @@ -129,20 +123,18 @@ def test_ignore_on_branch(tmp_dir, scm, dvc): with tmp_dir.branch("branch", new=True): tmp_dir.scm_gen(DvcIgnore.DVCIGNORE_FILE, "foo", commit="add ignore") - dvc.fs._reset() - path = PathInfo(tmp_dir) - assert set(dvc.fs.walk_files(path)) == { - path / "foo", - path / "bar", - path / ".dvcignore", - } + dvc._reset() - dvc.fs = scm.get_fs("branch", use_dvcignore=True) - assert set(dvc.fs.walk_files(path)) == { - os.fspath(path / DvcIgnore.DVCIGNORE_FILE), - os.fspath(path / "bar"), + result = dvc.dvcignore.walk_files(dvc.fs, tmp_dir) + assert set(result) == { + tmp_dir / "foo", + tmp_dir / "bar", + tmp_dir / DvcIgnore.DVCIGNORE_FILE, } + dvc.fs = scm.get_fs("branch") + assert dvc.dvcignore.is_ignored_file(tmp_dir / "foo") + def test_match_nested(tmp_dir, dvc): tmp_dir.gen( @@ -153,9 +145,9 @@ def test_match_nested(tmp_dir, dvc): "dir": {"x.backup": "x backup", "tmp": "content"}, } ) - dvc.fs._reset() - result = {os.fspath(os.path.normpath(f)) for f in dvc.fs.walk_files(".")} - assert result == {".dvcignore", "foo"} + dvc._reset() + result = dvc.dvcignore.walk_files(dvc.fs, tmp_dir) + assert set(result) == {tmp_dir / DvcIgnore.DVCIGNORE_FILE, tmp_dir / "foo"} def test_ignore_external(tmp_dir, scm, dvc, tmp_path_factory): @@ -163,12 +155,11 @@ def test_ignore_external(tmp_dir, scm, dvc, tmp_path_factory): ext_dir = TmpDir(os.fspath(tmp_path_factory.mktemp("external_dir"))) ext_dir.gen({"y.backup": "y", "tmp": {"file": "ext tmp"}}) - result = {relpath(f, ext_dir) for f in dvc.fs.walk_files(ext_dir)} - assert result == {"y.backup", os.path.join("tmp", "file")} - assert dvc.fs.dvcignore.is_ignored_dir(os.fspath(ext_dir / "tmp")) is False + result = dvc.dvcignore.walk_files(dvc.fs, ext_dir) + assert set(result) == {ext_dir / "y.backup", ext_dir / "tmp" / "file"} + assert dvc.dvcignore.is_ignored_dir(os.fspath(ext_dir / "tmp")) is False assert ( - dvc.fs.dvcignore.is_ignored_file(os.fspath(ext_dir / "y.backup")) - is False + dvc.dvcignore.is_ignored_file(os.fspath(ext_dir / "y.backup")) is False ) @@ -176,10 +167,12 @@ def test_ignore_subrepo(tmp_dir, scm, dvc): tmp_dir.gen({".dvcignore": "foo", "subdir": {"foo": "foo"}}) scm.add([".dvcignore"]) scm.commit("init parent dvcignore") - dvc.fs._reset() + dvc._reset() subrepo_dir = tmp_dir / "subdir" - assert not dvc.fs.exists(PathInfo(subrepo_dir / "foo")) + + result = dvc.dvcignore.walk_files(dvc.fs, subrepo_dir) + assert set(result) == set() with subrepo_dir.chdir(): subrepo = Repo.init(subdir=True) @@ -187,7 +180,7 @@ def test_ignore_subrepo(tmp_dir, scm, dvc): scm.commit("subrepo init") for _ in subrepo.brancher(all_commits=True): - assert subrepo.fs.exists(PathInfo(subrepo_dir / "foo")) + assert subrepo.fs.exists(subrepo_dir / "foo") def test_ignore_resurface_subrepo(tmp_dir, scm, dvc): @@ -196,28 +189,31 @@ def test_ignore_resurface_subrepo(tmp_dir, scm, dvc): subrepo_dir.mkdir() with subrepo_dir.chdir(): Repo.init(subdir=True) + subrepo_dir.gen({"bar": {"bar": "bar"}}) - dvc.fs._reset() + dvc._reset() - dirs = ["subdir"] files = ["foo"] - assert dvc.fs.dvcignore(os.fspath(tmp_dir), dirs, files) == ([], files) - assert dvc.fs.dvcignore( - os.fspath(tmp_dir), dirs, files, ignore_subrepos=False - ) == (dirs, files) - - assert dvc.fs.dvcignore.is_ignored_dir(os.fspath(subrepo_dir)) - assert not dvc.fs.dvcignore.is_ignored_dir( - os.fspath(subrepo_dir), ignore_subrepos=False + dirs = ["bar"] + root = os.fspath(subrepo_dir) + assert dvc.dvcignore(root, dirs, files, ignore_subrepos=False) == ( + dirs, + files, + ) + assert dvc.dvcignore(root, dirs, files) == ([], []) + + assert dvc.dvcignore.is_ignored_dir(os.fspath(subrepo_dir / "bar")) + assert not dvc.dvcignore.is_ignored_dir( + os.fspath(subrepo_dir / "bar"), ignore_subrepos=False ) def test_ignore_blank_line(tmp_dir, dvc): tmp_dir.gen({"dir": {"ignored": "text", "other": "text2"}}) tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "foo\n\ndir/ignored") - dvc.fs._reset() - path = PathInfo(tmp_dir) - assert set(dvc.fs.walk_files(path / "dir")) == {path / "dir" / "other"} + dvc._reset() + result = dvc.dvcignore.walk_files(dvc.fs, tmp_dir / "dir") + assert set(result) == {tmp_dir / "dir" / "other"} # It is not possible to re-include a file if a parent directory of @@ -250,11 +246,9 @@ def test_ignore_file_in_parent_path( ): tmp_dir.gen(data_struct) tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "\n".join(pattern_list)) - dvc.fs._reset() - path = PathInfo(tmp_dir) - assert set(dvc.fs.walk_files(path / "dir")) == { - path / relpath for relpath in result_set - } + dvc._reset() + result = dvc.dvcignore.walk_files(dvc.fs, tmp_dir / "dir") + assert set(result) == {tmp_dir / relpath for relpath in result_set} # If there is a separator at the end of the pattern then the pattern @@ -273,11 +267,11 @@ def test_ignore_sub_directory(tmp_dir, dvc): ) tmp_dir.gen({"dir": {DvcIgnore.DVCIGNORE_FILE: "doc/fortz"}}) - dvc.fs._reset() - path = PathInfo(tmp_dir) - assert set(dvc.fs.walk_files(path / "dir")) == { - path / "dir" / "a" / "doc" / "fortz" / "a", - path / "dir" / DvcIgnore.DVCIGNORE_FILE, + dvc._reset() + result = dvc.dvcignore.walk_files(dvc.fs, tmp_dir / "dir") + assert set(result) == { + tmp_dir / "dir" / "a" / "doc" / "fortz" / "a", + tmp_dir / "dir" / DvcIgnore.DVCIGNORE_FILE, } @@ -285,10 +279,10 @@ def test_ignore_sub_directory(tmp_dir, dvc): def test_ignore_directory(tmp_dir, dvc): tmp_dir.gen({"dir": {"fortz": {}, "a": {"fortz": {}}}}) tmp_dir.gen({"dir": {DvcIgnore.DVCIGNORE_FILE: "fortz"}}) - dvc.fs._reset() - path = PathInfo(tmp_dir) - assert set(dvc.fs.walk_files(path / "dir")) == { - path / "dir" / DvcIgnore.DVCIGNORE_FILE, + dvc._reset() + result = dvc.dvcignore.walk_files(dvc.fs, tmp_dir / "dir") + assert set(result) == { + tmp_dir / "dir" / DvcIgnore.DVCIGNORE_FILE, } @@ -296,11 +290,11 @@ def test_multi_ignore_file(tmp_dir, dvc, monkeypatch): tmp_dir.gen({"dir": {"subdir": {"should_ignore": "1", "not_ignore": "1"}}}) tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "dir/subdir/*_ignore") tmp_dir.gen({"dir": {DvcIgnore.DVCIGNORE_FILE: "!subdir/not_ignore"}}) - dvc.fs._reset() - path = PathInfo(tmp_dir) - assert set(dvc.fs.walk_files(path / "dir")) == { - path / "dir" / "subdir" / "not_ignore", - path / "dir" / DvcIgnore.DVCIGNORE_FILE, + dvc._reset() + result = dvc.dvcignore.walk_files(dvc.fs, tmp_dir / "dir") + assert set(result) == { + tmp_dir / "dir" / "subdir" / "not_ignore", + tmp_dir / "dir" / DvcIgnore.DVCIGNORE_FILE, } @@ -321,8 +315,8 @@ def test_pattern_trie_fs(tmp_dir, dvc): "other": {DvcIgnore.DVCIGNORE_FILE: "1\n2\n3"}, } ) - dvc.fs._reset() - dvcignore = dvc.fs.dvcignore + dvc._reset() + dvcignore = dvc.dvcignore ignore_pattern_top = dvcignore._get_trie_pattern( os.fspath(tmp_dir / "top") @@ -389,10 +383,11 @@ def test_ignore_in_added_dir(tmp_dir, dvc): ".dvcignore": "**/ignored", } ) - dvc.fs._reset() + dvc._reset() ignored_path = tmp_dir / "dir" / "sub" / "ignored" - assert not dvc.fs.exists(PathInfo(ignored_path)) + result = dvc.dvcignore.walk_files(dvc.fs, ignored_path) + assert set(result) == set() assert ignored_path.exists() dvc.add("dir") @@ -417,3 +412,9 @@ def test_ignored_output_nested(tmp_dir, scm, dvc, run_copy): run_copy("foo", "foo.log", name="copy", wdir="copy") assert Path("copy/foo.log").exists() + + +def test_run_dvcignored_dep(tmp_dir, dvc, run_copy): + tmp_dir.gen({".dvcignore": "dir\n", "dir": {"foo": "foo"}}) + run_copy(os.path.join("dir", "foo"), "bar", name="copy-foo-to-bar") + assert (tmp_dir / "bar").read_text() == "foo" diff --git a/tests/func/test_ls.py b/tests/func/test_ls.py index ff189b7c24..4ccaf62e9d 100644 --- a/tests/func/test_ls.py +++ b/tests/func/test_ls.py @@ -536,8 +536,8 @@ def test_subrepo(dvc_top_level, erepo): if hasattr(repo, "dvc"): repo.dvc_gen(dvc_files, commit=f"dvc track for {repo}") - def _list_files(path=None): - return set(map(itemgetter("path"), Repo.ls(os.fspath(erepo), path))) + def _list_files(repo, path=None): + return set(map(itemgetter("path"), Repo.ls(os.fspath(repo), path))) extras = {".dvcignore", ".gitignore"} git_tracked_outputs = {"bar.txt", "scm_dir"} @@ -547,12 +547,11 @@ def _list_files(path=None): top_level_outputs = ( common_outputs if dvc_top_level else git_tracked_outputs ) - assert _list_files() == top_level_outputs | {"subrepo"} - assert _list_files("subrepo") == common_outputs - - assert _list_files("scm_dir") == {"ipsum"} - assert _list_files("subrepo/scm_dir") == {"ipsum"} - + assert _list_files(erepo) == top_level_outputs + assert _list_files(erepo, "scm_dir") == {"ipsum"} if dvc_top_level: - assert _list_files("dvc_dir") == {"lorem"} - assert _list_files("subrepo/dvc_dir") == {"lorem"} + assert _list_files(erepo, "dvc_dir") == {"lorem"} + + assert _list_files(subrepo, ".") == common_outputs + assert _list_files(subrepo, "scm_dir") == {"ipsum"} + assert _list_files(subrepo, "dvc_dir") == {"lorem"} diff --git a/tests/func/test_stage.py b/tests/func/test_stage.py index 47ba4d93f3..783a1dad9c 100644 --- a/tests/func/test_stage.py +++ b/tests/func/test_stage.py @@ -216,6 +216,8 @@ def test_parent_repo_collect_stages(tmp_dir, scm, dvc): deep_subrepo_dir.gen("subrepo_file", "subrepo file content") deep_subrepo.add("subrepo_file") + dvc._reset() + stages = dvc.stage.collect(None) subrepo_stages = subrepo.stage.collect(None) deep_subrepo_stages = deep_subrepo.stage.collect(None) diff --git a/tests/func/test_state.py b/tests/func/test_state.py index 2a24a0ed5b..166f94d85d 100644 --- a/tests/func/test_state.py +++ b/tests/func/test_state.py @@ -12,7 +12,7 @@ def test_state(tmp_dir, dvc): path_info = PathInfo(path) hash_info = HashInfo("md5", file_md5(path, dvc.fs)) - state = State(dvc.root_dir, dvc.tmp_dir) + state = State(dvc.root_dir, dvc.tmp_dir, dvc.dvcignore) state.save(path_info, dvc.fs, hash_info) assert state.get(path_info, dvc.fs) == hash_info diff --git a/tests/unit/fs/test_repo.py b/tests/unit/fs/test_repo.py index 8f0b33ee26..aaf6acc8e3 100644 --- a/tests/unit/fs/test_repo.py +++ b/tests/unit/fs/test_repo.py @@ -324,7 +324,7 @@ def test_subrepos(tmp_dir, scm, dvc): {"lorem": "lorem", "dir2": {"ipsum": "ipsum"}}, commit="BAR" ) - dvc.fs._reset() + dvc._reset() fs = RepoFileSystem(dvc, subrepos=True) def assert_fs_belongs_to_repo(ret_val): @@ -402,8 +402,8 @@ def test_subrepo_walk(tmp_dir, scm, dvc, dvcfiles, extra_expected): ) # using fs that does not have dvcignore - dvc.fs._reset() - fs = RepoFileSystem(dvc, subrepos=True) + dvc._reset() + fs = RepoFileSystem(dvc) expected = [ PathInfo("dir") / "repo", PathInfo("dir") / "repo.txt", @@ -420,7 +420,9 @@ def test_subrepo_walk(tmp_dir, scm, dvc, dvcfiles, extra_expected): actual = [] for root, dirs, files in fs.walk( - os.path.join(fs.root_dir, "dir"), dvcfiles=dvcfiles + os.path.join(fs.root_dir, "dir"), + dvcfiles=dvcfiles, + ignore_subrepos=False, ): for entry in dirs + files: actual.append(os.path.join(root, entry)) @@ -445,8 +447,8 @@ def test_repo_fs_no_subrepos(tmp_dir, dvc, scm): subrepo.scm_gen({"ipsum": "ipsum"}, commit="BAR") # using fs that does not have dvcignore - dvc.fs._reset() - fs = RepoFileSystem(dvc, subrepos=False) + dvc._reset() + fs = RepoFileSystem(dvc) expected = [ tmp_dir / ".dvcignore", tmp_dir / ".gitignore", @@ -629,7 +631,9 @@ def dvc_structure(suffix): expected[str(tmp_dir / "subrepo1")].add("subrepo3") actual = {} - fs = RepoFileSystem(dvc, subrepos=traverse_subrepos) - for root, dirs, files in fs.walk(str(tmp_dir)): + fs = RepoFileSystem(dvc) + for root, dirs, files in fs.walk( + str(tmp_dir), ignore_subrepos=not traverse_subrepos + ): actual[root] = set(dirs + files) assert expected == actual diff --git a/tests/unit/test_dvcfile.py b/tests/unit/test_dvcfile.py index b9b5b5cbc6..dc578a687c 100644 --- a/tests/unit/test_dvcfile.py +++ b/tests/unit/test_dvcfile.py @@ -102,6 +102,7 @@ def test_stage_load_file_exists_but_dvcignored(tmp_dir, dvc, scm, file): (tmp_dir / file).write_text("") (tmp_dir / ".dvcignore").write_text(file) + dvc._reset() dvcfile = Dvcfile(dvc, file) with pytest.raises(StageFileDoesNotExistError) as exc_info: assert dvcfile.stages.values() diff --git a/tests/unit/test_external_repo.py b/tests/unit/test_external_repo.py index 8fc66621b3..7b34acb550 100644 --- a/tests/unit/test_external_repo.py +++ b/tests/unit/test_external_repo.py @@ -28,7 +28,7 @@ def test_hook_is_called(tmp_dir, erepo_dir, mocker): with external_repo(str(erepo_dir)) as repo: spy = mocker.spy(repo.repo_fs, "repo_factory") - list(repo.repo_fs.walk(repo.root_dir)) # drain + list(repo.repo_fs.walk(repo.root_dir, ignore_subrepos=False)) # drain assert spy.call_count == len(subrepos) paths = [os.path.join(repo.root_dir, path) for path in subrepo_paths] @@ -66,7 +66,7 @@ def test_subrepo_is_constructed_properly( ) as repo: spy = mocker.spy(repo.repo_fs, "repo_factory") - list(repo.repo_fs.walk(repo.root_dir)) # drain + list(repo.repo_fs.walk(repo.root_dir, ignore_subrepos=False)) # drain assert spy.call_count == 1 subrepo = spy.spy_return diff --git a/tests/unit/utils/test_fs.py b/tests/unit/utils/test_fs.py index ba0a53bc1c..9653d0e347 100644 --- a/tests/unit/utils/test_fs.py +++ b/tests/unit/utils/test_fs.py @@ -28,7 +28,7 @@ class TestMtimeAndSize(TestDir): def test(self): - fs = LocalFileSystem(None, {"url": self.root_dir}, use_dvcignore=True) + fs = LocalFileSystem(None, {"url": self.root_dir}) file_time, file_size = get_mtime_and_size(self.DATA, fs) dir_time, dir_size = get_mtime_and_size(self.DATA_DIR, fs) @@ -129,7 +129,7 @@ def test_path_object_and_str_are_valid_types_get_mtime_and_size(tmp_dir): tmp_dir.gen( {"dir": {"dir_file": "dir file content"}, "file": "file_content"} ) - fs = LocalFileSystem(None, {"url": os.fspath(tmp_dir)}, use_dvcignore=True) + fs = LocalFileSystem(None, {"url": os.fspath(tmp_dir)}) time, size = get_mtime_and_size("dir", fs) object_time, object_size = get_mtime_and_size(PathInfo("dir"), fs)