From 47e3498541f069995fca0cbe5b3b801dfdaa071a Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Thu, 10 Jun 2021 15:48:01 +0300 Subject: [PATCH] s3, gs, azure: re-enable prefix-based search optimization --- dvc/fs/base.py | 2 +- dvc/fs/fsspec_wrapper.py | 20 +++++++++++++++----- dvc/fs/gdrive.py | 2 +- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/dvc/fs/base.py b/dvc/fs/base.py index e4c4513f5c..e80c1e4366 100644 --- a/dvc/fs/base.py +++ b/dvc/fs/base.py @@ -170,7 +170,7 @@ def walk_files(self, path_info, **kwargs): def ls(self, path_info, detail=False): raise RemoteActionNotImplemented("ls", self.scheme) - def find(self, path_info, detail=False): + def find(self, path_info, detail=False, prefix=None): raise RemoteActionNotImplemented("find", self.scheme) def is_empty(self, path_info): diff --git a/dvc/fs/fsspec_wrapper.py b/dvc/fs/fsspec_wrapper.py index fc91e95293..8b303f9f5e 100644 --- a/dvc/fs/fsspec_wrapper.py +++ b/dvc/fs/fsspec_wrapper.py @@ -96,7 +96,8 @@ def ls(self, path_info, detail=False): files = self.fs.ls(path, detail=detail) yield from self._strip_buckets(files, detail=detail) - def find(self, path_info, detail=False): + # pylint: disable=unused-argument + def find(self, path_info, detail=False, prefix=None): path = self._with_bucket(path_info) files = self.fs.find(path, detail=detail) if detail: @@ -105,7 +106,7 @@ def find(self, path_info, detail=False): yield from self._strip_buckets(files, detail=detail) def walk_files(self, path_info, **kwargs): - for file in self.find(path_info): + for file in self.find(path_info, **kwargs): yield path_info.replace(path=file) def remove(self, path_info): @@ -155,6 +156,8 @@ def _download( # pylint: disable=abstract-method class ObjectFSWrapper(FSSpecWrapper): + TRAVERSE_PREFIX_LEN = 3 + def _isdir(self, path_info): # Directory in object storages are interpreted differently # among different fsspec providers, so this logic is a temporary @@ -169,9 +172,16 @@ def _isdir(self, path_info): and entry["name"].endswith("/") ) - def find(self, path_info, detail=False): - path = self._with_bucket(path_info) - files = self.fs.find(path, detail=detail) + def find(self, path_info, detail=False, prefix=None): + if prefix is not None: + path = self._with_bucket(path_info.parent) + files = self.fs.find( + path, detail=detail, prefix=path_info.parts[-1] + ) + else: + path = self._with_bucket(path_info) + files = self.fs.find(path, detail=detail) + if detail: files = files.values() diff --git a/dvc/fs/gdrive.py b/dvc/fs/gdrive.py index 3cceee8961..379d7ef6f8 100644 --- a/dvc/fs/gdrive.py +++ b/dvc/fs/gdrive.py @@ -538,7 +538,7 @@ def _gdrive_list_ids(self, query_ids): query = f"({query}) and trashed=false" return self._gdrive_list(query) - def find(self, path_info, detail=False): + def find(self, path_info, detail=False, prefix=None): root_path = path_info.path seen_paths = set()