diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 47515ef8a61..be4fa10f3c8 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -298,7 +298,9 @@ class GcsFileSystem::Impl { Result GetFileInfo(const FileSelector& select) { ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(select.base_dir)); - auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(p.object); + const auto canonical = internal::EnsureTrailingSlash(p.object); + const auto max_depth = internal::Depth(canonical) + select.max_recursion; + auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(canonical); auto delimiter = select.recursive ? gcs::Delimiter() : gcs::Delimiter("/"); bool found_directory = false; FileInfoVector result; @@ -311,8 +313,9 @@ class GcsFileSystem::Impl { return internal::ToArrowStatus(o.status()); } found_directory = true; - // Skip the directory itself from the results - if (o->name() == p.object) { + // Skip the directory itself from the results, and any result that is "too deep" + // into the recursion. + if (o->name() == p.object || internal::Depth(o->name()) > max_depth) { continue; } auto path = internal::ConcatAbstractPath(o->bucket(), o->name()); diff --git a/cpp/src/arrow/filesystem/gcsfs_internal.cc b/cpp/src/arrow/filesystem/gcsfs_internal.cc index e4cb38ade92..0cede754856 100644 --- a/cpp/src/arrow/filesystem/gcsfs_internal.cc +++ b/cpp/src/arrow/filesystem/gcsfs_internal.cc @@ -24,6 +24,7 @@ #include #include +#include "arrow/filesystem/path_util.h" #include "arrow/util/key_value_metadata.h" namespace arrow { @@ -258,6 +259,10 @@ Result> FromObjectMetadata( return result; } +std::int64_t Depth(arrow::util::string_view path) { + return std::count(path.begin(), path.end(), fs::internal::kSep); +} + } // namespace internal } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/filesystem/gcsfs_internal.h b/cpp/src/arrow/filesystem/gcsfs_internal.h index fc7789ff0f5..729a1b13b4d 100644 --- a/cpp/src/arrow/filesystem/gcsfs_internal.h +++ b/cpp/src/arrow/filesystem/gcsfs_internal.h @@ -49,6 +49,8 @@ Result ToObjectMetadata( Result> FromObjectMetadata( google::cloud::storage::ObjectMetadata const& m); +std::int64_t Depth(arrow::util::string_view path); + } // namespace internal } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index a2985ca1d17..b0575134553 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -199,16 +199,26 @@ class GcsIntegrationTest : public ::testing::Test { Result CreateHierarchy(std::shared_ptr fs) { const char* const kTestFolders[] = { - "a/", "a/0/", "a/0/0/", "a/1/", "a/2/", + "b/", + "b/0/", + "b/0/0/", + "b/1/", + "b/2/", + // Create some additional folders that should not appear in any listing of b/ + "aa/", + "ba/", + "c/", }; - constexpr auto kFilesPerFolder = 4; - auto result = Hierarchy{PreexistingBucketPath() + "a/", {}}; + constexpr auto kFilesPerFolder = 2; + auto base_dir = internal::ConcatAbstractPath(PreexistingBucketPath(), "b/"); + auto result = Hierarchy{base_dir, {}}; for (auto const* f : kTestFolders) { - const auto folder = PreexistingBucketPath() + f; + const auto folder = internal::ConcatAbstractPath(PreexistingBucketPath(), f); RETURN_NOT_OK(fs->CreateDir(folder, true)); result.contents.push_back(arrow::fs::Dir(folder)); for (int i = 0; i != kFilesPerFolder; ++i) { - const auto filename = folder + "test-file-" + std::to_string(i); + const auto filename = + internal::ConcatAbstractPath(folder, "test-file-" + std::to_string(i)); CreateFile(fs.get(), filename, filename); result.contents.push_back(arrow::fs::File(filename)); } @@ -483,14 +493,19 @@ TEST_F(GcsIntegrationTest, GetFileInfoSelectorRecursive) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); ASSERT_OK_AND_ASSIGN(auto hierarchy, CreateHierarchy(fs)); std::vector expected; - std::copy_if( - hierarchy.contents.begin(), hierarchy.contents.end(), std::back_inserter(expected), - [&](const arrow::fs::FileInfo& info) { return hierarchy.base_dir != info.path(); }); + std::copy_if(hierarchy.contents.begin(), hierarchy.contents.end(), + std::back_inserter(expected), [&](const arrow::fs::FileInfo& info) { + if (!fs::internal::IsAncestorOf(hierarchy.base_dir, info.path())) { + return false; + } + return hierarchy.base_dir != info.path(); + }); auto selector = FileSelector(); selector.base_dir = hierarchy.base_dir; selector.allow_not_found = false; selector.recursive = true; + selector.max_recursion = 16; ASSERT_OK_AND_ASSIGN(auto results, fs->GetFileInfo(selector)); EXPECT_THAT(results, UnorderedElementsAreArray(expected.begin(), expected.end())); } @@ -515,6 +530,34 @@ TEST_F(GcsIntegrationTest, GetFileInfoSelectorNonRecursive) { EXPECT_THAT(results, UnorderedElementsAreArray(expected.begin(), expected.end())); } +TEST_F(GcsIntegrationTest, GetFileInfoSelectorLimitedRecursion) { + auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); + ASSERT_OK_AND_ASSIGN(auto hierarchy, CreateHierarchy(fs)); + + for (const auto max_recursion : {0, 1, 2, 3}) { + SCOPED_TRACE("Testing with max_recursion=" + std::to_string(max_recursion)); + const auto max_depth = + internal::Depth(internal::EnsureTrailingSlash(hierarchy.base_dir)) + + max_recursion; + std::vector expected; + std::copy_if(hierarchy.contents.begin(), hierarchy.contents.end(), + std::back_inserter(expected), [&](const arrow::fs::FileInfo& info) { + if (info.path() == hierarchy.base_dir) return false; + if (!fs::internal::IsAncestorOf(hierarchy.base_dir, info.path())) { + return false; + } + return internal::Depth(info.path()) <= max_depth; + }); + auto selector = FileSelector(); + selector.base_dir = hierarchy.base_dir; + selector.allow_not_found = true; + selector.recursive = true; + selector.max_recursion = max_recursion; + ASSERT_OK_AND_ASSIGN(auto results, fs->GetFileInfo(selector)); + EXPECT_THAT(results, UnorderedElementsAreArray(expected.begin(), expected.end())); + } +} + TEST_F(GcsIntegrationTest, GetFileInfoSelectorNotFoundTrue) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); @@ -591,7 +634,10 @@ TEST_F(GcsIntegrationTest, DeleteDirSuccess) { arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketPath(), FileType::Directory); arrow::fs::AssertFileInfo(fs.get(), PreexistingObjectPath(), FileType::File); for (auto const& info : hierarchy.contents) { - arrow::fs::AssertFileInfo(fs.get(), info.path(), FileType::NotFound); + const auto expected_type = fs::internal::IsAncestorOf(hierarchy.base_dir, info.path()) + ? FileType::NotFound + : info.type(); + arrow::fs::AssertFileInfo(fs.get(), info.path(), expected_type); } } @@ -604,8 +650,12 @@ TEST_F(GcsIntegrationTest, DeleteDirContentsSuccess) { arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketPath(), FileType::Directory); arrow::fs::AssertFileInfo(fs.get(), PreexistingObjectPath(), FileType::File); for (auto const& info : hierarchy.contents) { - if (info.path() == hierarchy.base_dir) continue; - arrow::fs::AssertFileInfo(fs.get(), info.path(), FileType::NotFound); + auto expected_type = FileType::NotFound; + if (info.path() == hierarchy.base_dir || + !fs::internal::IsAncestorOf(hierarchy.base_dir, info.path())) { + expected_type = info.type(); + } + arrow::fs::AssertFileInfo(fs.get(), info.path(), expected_type); } }