From 0676127153117afde9ef39900919a34810f6fdbe Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Thu, 9 Dec 2021 17:25:28 +0000 Subject: [PATCH 1/3] ARROW-14918: [C++] GcsFileSystem and listing files --- cpp/src/arrow/filesystem/gcsfs.cc | 37 ++++++++++- cpp/src/arrow/filesystem/gcsfs_test.cc | 88 ++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index c45d58b5fdb..0c755fd3bb9 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -285,6 +285,41 @@ class GcsFileSystem::Impl { path.object.back() == '/' ? FileType::Directory : FileType::File); } + Result GetFileInfo(const FileSelector& select) { + ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(select.base_dir)); + auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(p.object); + auto delimiter = select.recursive ? gcs::Delimiter() : gcs::Delimiter("/"); + bool found_directory = false; + FileInfoVector result; + for (auto const& o : client_.ListObjects(p.bucket, prefix, delimiter)) { + if (!o.ok()) { + if (select.allow_not_found && + o.status().code() == google::cloud::StatusCode::kNotFound) { + continue; + } + return internal::ToArrowStatus(o.status()); + } + found_directory = true; + // Skip the directory itself from the results + if (o->name() == p.object) { + continue; + } + auto path = internal::ConcatAbstractPath(o->bucket(), o->name()); + if (o->name().back() == '/') { + result.push_back( + FileInfo(internal::EnsureTrailingSlash(path), FileType::Directory)); + continue; + } + auto info = FileInfo(path, FileType::File); + info.set_size(static_cast(o->size())); + result.push_back(std::move(info)); + } + if (!found_directory && !select.allow_not_found) { + return Status::IOError("no such file or directory " + select.base_dir); + } + return result; + } + // GCS does not have directories or folders. But folders can be emulated (with some // limitations) using marker objects. That and listing with prefixes creates the // illusion of folders. @@ -478,7 +513,7 @@ Result GcsFileSystem::GetFileInfo(const std::string& path) { } Result GcsFileSystem::GetFileInfo(const FileSelector& select) { - return Status::NotImplemented("The GCS FileSystem is not fully implemented"); + return impl_->GetFileInfo(select); } Status GcsFileSystem::CreateDir(const std::string& path, bool recursive) { diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 3e4431cde81..5ee31204014 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -39,6 +39,11 @@ namespace arrow { namespace fs { +/// Custom comparison for FileInfo, we need this to use complex googletest matchers. +inline bool operator==(const FileInfo& a, const FileInfo& b) { + return a.path() == b.path() && a.type() == b.type(); +} + namespace { namespace bp = boost::process; @@ -51,6 +56,7 @@ using ::testing::Not; using ::testing::NotNull; using ::testing::Pair; using ::testing::UnorderedElementsAre; +using ::testing::UnorderedElementsAreArray; auto const* kLoremIpsum = R"""( Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor @@ -187,6 +193,31 @@ class GcsIntegrationTest : public ::testing::Test { std::string RandomFolderName() { return RandomChars(32) + "/"; } + struct Hierarchy { + std::string base_dir; + std::vector contents; + }; + + Result CreateHierarchy(std::shared_ptr fs) { + const char* const kTestFolders[] = { + "a/", "a/0/", "a/0/0/", "a/1/", "a/2/", + }; + auto result = Hierarchy{PreexistingBucketPath() + "a/", {}}; + for (auto const* f : kTestFolders) { + const auto folder = PreexistingBucketPath() + f; + RETURN_NOT_OK(fs->CreateDir(folder, true)); + result.contents.push_back(arrow::fs::Dir(folder)); + for (int i = 0; i != 64; ++i) { + const auto filename = folder + "test-file-" + std::to_string(i); + ARROW_ASSIGN_OR_RAISE(auto w, fs->OpenOutputStream(filename, {})); + RETURN_NOT_OK(w->Write(filename.data(), filename.size())); + RETURN_NOT_OK(w->Close()); + result.contents.push_back(arrow::fs::File(filename)); + } + } + return result; + } + private: std::string RandomChars(std::size_t count) { auto const fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789"); @@ -399,6 +430,63 @@ TEST_F(GcsIntegrationTest, GetFileInfoObject) { arrow::fs::AssertFileInfo(fs.get(), PreexistingObjectPath(), FileType::File); } +TEST_F(GcsIntegrationTest, GetFileInfoSelectorRecursive) { + auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); + ASSERT_OK_AND_ASSIGN(auto hierarchy, CreateHierarchy(fs)); + std::vector expected; + std::copy_if( + hierarchy.contents.begin(), hierarchy.contents.end(), std::back_inserter(expected), + [&](const arrow::fs::FileInfo& info) { return hierarchy.base_dir != info.path(); }); + + auto selector = FileSelector(); + selector.base_dir = hierarchy.base_dir; + selector.allow_not_found = false; + selector.recursive = true; + ASSERT_OK_AND_ASSIGN(auto results, fs->GetFileInfo(selector)); + EXPECT_THAT(results, UnorderedElementsAreArray(expected.begin(), expected.end())); +} + +TEST_F(GcsIntegrationTest, GetFileInfoSelectorNonRecursive) { + auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); + ASSERT_OK_AND_ASSIGN(auto hierarchy, CreateHierarchy(fs)); + std::vector expected; + std::copy_if(hierarchy.contents.begin(), hierarchy.contents.end(), + std::back_inserter(expected), [&](const arrow::fs::FileInfo& info) { + if (info.path() == hierarchy.base_dir) return false; + return internal::EnsureTrailingSlash( + internal::GetAbstractPathParent(info.path()).first) == + hierarchy.base_dir; + }); + + auto selector = FileSelector(); + selector.base_dir = hierarchy.base_dir; + selector.allow_not_found = false; + selector.recursive = false; + ASSERT_OK_AND_ASSIGN(auto results, fs->GetFileInfo(selector)); + EXPECT_THAT(results, UnorderedElementsAreArray(expected.begin(), expected.end())); +} + +TEST_F(GcsIntegrationTest, GetFileInfoSelectorNotFoundTrue) { + auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); + + auto selector = FileSelector(); + selector.base_dir = NotFoundObjectPath() + "/"; + selector.allow_not_found = true; + selector.recursive = true; + ASSERT_OK_AND_ASSIGN(auto results, fs->GetFileInfo(selector)); + EXPECT_THAT(results, IsEmpty()); +} + +TEST_F(GcsIntegrationTest, GetFileInfoSelectorNotFoundFalse) { + auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); + + auto selector = FileSelector(); + selector.base_dir = NotFoundObjectPath() + "/"; + selector.allow_not_found = false; + selector.recursive = false; + ASSERT_RAISES(IOError, fs->GetFileInfo(selector)); +} + TEST_F(GcsIntegrationTest, CreateDirSuccessBucketOnly) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); auto bucket_name = RandomBucketName(); From 29de5060cbcdd0696d37d8ac5a3ec0ea14ce3388 Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Tue, 14 Dec 2021 14:15:05 +0000 Subject: [PATCH 2/3] Address review comments --- cpp/src/arrow/filesystem/gcsfs.cc | 3 ++- cpp/src/arrow/filesystem/gcsfs_test.cc | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 0c755fd3bb9..0f86701740a 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -312,10 +312,11 @@ class GcsFileSystem::Impl { } auto info = FileInfo(path, FileType::File); info.set_size(static_cast(o->size())); + info.set_mtime(o->updated()); result.push_back(std::move(info)); } if (!found_directory && !select.allow_not_found) { - return Status::IOError("no such file or directory " + select.base_dir); + return Status::IOError("No such file or directory '", select.base_dir, "'"); } return result; } diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 5ee31204014..0ffdb22e592 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -99,7 +99,7 @@ class GcsTestbench : public ::testing::Environment { error_ = std::move(error); } - ~GcsTestbench() { + ~GcsTestbench() override { // Brutal shutdown, kill the full process group because the GCS testbench may launch // additional children. group_.terminate(); @@ -209,9 +209,7 @@ class GcsIntegrationTest : public ::testing::Test { result.contents.push_back(arrow::fs::Dir(folder)); for (int i = 0; i != 64; ++i) { const auto filename = folder + "test-file-" + std::to_string(i); - ARROW_ASSIGN_OR_RAISE(auto w, fs->OpenOutputStream(filename, {})); - RETURN_NOT_OK(w->Write(filename.data(), filename.size())); - RETURN_NOT_OK(w->Close()); + CreateFile(fs.get(), filename, filename); result.contents.push_back(arrow::fs::File(filename)); } } From 5ab652bbb72fce585398ea82119d8c5e6776483b Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Tue, 14 Dec 2021 15:14:18 +0000 Subject: [PATCH 3/3] Address review comments --- cpp/src/arrow/filesystem/gcsfs.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 0f86701740a..02bed8b9f54 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -312,7 +312,10 @@ class GcsFileSystem::Impl { } auto info = FileInfo(path, FileType::File); info.set_size(static_cast(o->size())); - info.set_mtime(o->updated()); + // An object has multiple "time" attributes, including the time when its data was + // created, and the time when its metadata was last updated. We use the object + // creation time because the data for an object cannot be changed once created. + info.set_mtime(o->time_created()); result.push_back(std::move(info)); } if (!found_directory && !select.allow_not_found) {