From 3f85930158757690a0acec5ec0eeb0e627203382 Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Thu, 16 Dec 2021 15:58:09 +0000 Subject: [PATCH 1/5] ARROW-15114: [C++] GcsFileSystem uses metadata for directory markers --- cpp/src/arrow/filesystem/gcsfs.cc | 193 ++++++++++++++++--------- cpp/src/arrow/filesystem/gcsfs.h | 9 +- cpp/src/arrow/filesystem/gcsfs_test.cc | 61 ++++---- 3 files changed, 164 insertions(+), 99 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index abc24c0a1ad..81b2898f6aa 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -42,6 +42,7 @@ struct GcsCredentials { namespace { namespace gcs = google::cloud::storage; +using GcsCode = google::cloud::StatusCode; // Change the default upload buffer size. In general, sending larger buffers is more // efficient with GCS, as each buffer requires a roundtrip to the service. With formatted @@ -75,6 +76,13 @@ struct GcsPath { return path; } + GcsPath parent() const { + auto object_parent = internal::GetAbstractPathParent(object).first; + if (object_parent.empty()) return GcsPath{bucket, bucket, ""}; + return GcsPath{internal::ConcatAbstractPath(bucket, object_parent), bucket, + object_parent}; + } + bool empty() const { return bucket.empty() && object.empty(); } bool operator==(const GcsPath& other) const { @@ -310,93 +318,107 @@ class GcsFileSystem::Impl { Result GetFileInfo(const GcsPath& path) { if (path.object.empty()) { auto meta = client_.GetBucketMetadata(path.bucket); - return GetFileInfoDirectory(path, std::move(meta).status()); + return GetFileInfoBucket(path, std::move(meta).status()); } auto meta = client_.GetObjectMetadata(path.bucket, path.object); - if (path.object.back() == '/') { - return GetFileInfoDirectory(path, std::move(meta).status()); - } - return GetFileInfoFile(path, meta); + return GetFileInfoObject(path, meta); } Result GetFileInfo(const FileSelector& select) { ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(select.base_dir)); + // Adding the trailing '/' avoids problems with files named 'a', 'ab', 'ac' which + // where GCS would return all of them if the prefix is 'a'. const auto canonical = internal::EnsureTrailingSlash(p.object); const auto max_depth = internal::Depth(canonical) + select.max_recursion; auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(canonical); auto delimiter = select.recursive ? gcs::Delimiter() : gcs::Delimiter("/"); - bool found_directory = false; FileInfoVector result; for (auto const& o : client_.ListObjects(p.bucket, prefix, delimiter)) { if (!o.ok()) { if (select.allow_not_found && o.status().code() == google::cloud::StatusCode::kNotFound) { - continue; + return result; } return internal::ToArrowStatus(o.status()); } - found_directory = true; // Skip the directory itself from the results, and any result that is "too deep" // into the recursion. if (o->name() == p.object || internal::Depth(o->name()) > max_depth) { continue; } auto path = internal::ConcatAbstractPath(o->bucket(), o->name()); - if (o->name().back() == '/') { - result.push_back( - FileInfo(internal::EnsureTrailingSlash(path), FileType::Directory)); - continue; - } result.push_back(ToFileInfo(path, *o)); } - if (!found_directory && !select.allow_not_found) { - return Status::IOError("No such file or directory '", select.base_dir, "'"); + // Finding any elements indicates the directory was found. + if (!result.empty() || select.allow_not_found) { + return result; } - return result; + // To find out if the directory exists we need to perform an additional query. + ARROW_ASSIGN_OR_RAISE(auto directory, GetFileInfo(p)); + if (directory.IsDirectory()) return result; + return Status::IOError("No such file or directory '", select.base_dir, "'"); } // GCS does not have directories or folders. But folders can be emulated (with some // limitations) using marker objects. That and listing with prefixes creates the // illusion of folders. - google::cloud::Status CreateDirMarker(const std::string& bucket, - util::string_view name) { + google::cloud::StatusOr CreateDirMarker(const std::string& bucket, + util::string_view name) { // Make the name canonical. - const auto canonical = internal::EnsureTrailingSlash(name); - return client_ - .InsertObject(bucket, canonical, std::string(), - gcs::WithObjectMetadata(gcs::ObjectMetadata().upsert_metadata( - "arrow/gcsfs", "directory"))) - .status(); + const auto canonical = internal::RemoveTrailingSlash(name).to_string(); + auto object = client_.InsertObject( + bucket, canonical, std::string(), + gcs::WithObjectMetadata( + gcs::ObjectMetadata().upsert_metadata("arrow/gcsfs", "directory")), + gcs::IfGenerationMatch(0)); + if (object) return object; + if (object.status().code() == GcsCode::kFailedPrecondition) { + // The marker already exists, find out if it is a directory or a file. + return client_.GetObjectMetadata(bucket, canonical); + } + return object; } - google::cloud::Status CreateDirMarkerRecursive(const std::string& bucket, - const std::string& object) { - using GcsCode = google::cloud::StatusCode; + static Status NotDirectoryError(const gcs::ObjectMetadata& o) { + return Status::IOError( + "Cannot create directory, it conflicts with an existing file '", + internal::ConcatAbstractPath(o.bucket(), o.name()), "'"); + } + + Status CreateDirMarkerRecursive(const std::string& bucket, const std::string& name) { auto get_parent = [](std::string const& path) { return std::move(internal::GetAbstractPathParent(path).first); }; - // Maybe counterintuitively we create the markers from the most nested and up. Because - // GCS does not have directories creating `a/b/c` will succeed, even if `a/` or `a/b/` - // does not exist. In the common case, where `a/b/` may already exist, it is more - // efficient to just create `a/b/c/` and then find out that `a/b/` was already there. - // In the case where none exists, it does not matter which order we follow. - auto status = CreateDirMarker(bucket, object); - if (status.code() == GcsCode::kAlreadyExists) return {}; - if (status.code() == GcsCode::kNotFound) { - // Missing bucket, create it first ... - status = client_.CreateBucket(bucket, gcs::BucketMetadata()).status(); - if (status.code() != GcsCode::kOk && status.code() != GcsCode::kAlreadyExists) { - return status; + // Find the list of missing parents. In the process we discover if any elements in + // the path are files, this is unavoidable as GCS does not really have directories. + std::vector missing_parents; + auto dir = name; + for (; !dir.empty(); dir = get_parent(dir)) { + auto o = client_.GetObjectMetadata(bucket, dir); + if (o) { + if (IsDirectory(*o)) break; + return NotDirectoryError(*o); } + missing_parents.push_back(dir); } - - for (auto parent = get_parent(object); !parent.empty(); parent = get_parent(parent)) { - status = CreateDirMarker(bucket, parent); - if (status.code() == GcsCode::kAlreadyExists) { - break; + if (dir.empty()) { + // We could not find any of the parent directories in the bucket, the last step is + // to find out if the bucket exists, and if necessary, create it + auto b = client_.GetBucketMetadata(bucket); + if (!b) { + if (b.status().code() == GcsCode::kNotFound) { + b = client_.CreateBucket(bucket, gcs::BucketMetadata()); + } + if (!b) return internal::ToArrowStatus(b.status()); } - if (!status.ok()) { - return status; + } + for (auto d = missing_parents.rbegin(); d != missing_parents.rend(); ++d) { + auto o = CreateDirMarker(bucket, *d); + if (o) { + if (IsDirectory(*o)) continue; + // This is probably a race condition, something created a file before we managed + // to create the directories. + return NotDirectoryError(*o); } } return {}; @@ -407,11 +429,16 @@ class GcsFileSystem::Impl { return internal::ToArrowStatus( client_.CreateBucket(p.bucket, gcs::BucketMetadata()).status()); } - return internal::ToArrowStatus(CreateDirMarker(p.bucket, p.object)); + auto parent = p.parent(); + if (!parent.object.empty()) { + auto o = client_.GetObjectMetadata(p.bucket, parent.object); + if (!IsDirectory(*o)) return NotDirectoryError(*o); + } + return internal::ToArrowStatus(CreateDirMarker(p.bucket, p.object).status()); } Status CreateDirRecursive(const GcsPath& p) { - return internal::ToArrowStatus(CreateDirMarkerRecursive(p.bucket, p.object)); + return CreateDirMarkerRecursive(p.bucket, p.object); } Status DeleteDir(const GcsPath& p, const io::IOContext& io_context) { @@ -423,20 +450,26 @@ class GcsFileSystem::Impl { } Status DeleteDirContents(const GcsPath& p, const io::IOContext& io_context) { + // If the directory marker exists, it better be a directory. + auto dir = client_.GetObjectMetadata(p.bucket, p.object); + if (dir && !IsDirectory(*dir)) return NotDirectoryError(*dir); + // Deleting large directories can be fairly slow, we need to parallelize the // operation. + const auto& canonical = + p.object.empty() ? p.object : internal::EnsureTrailingSlash(p.object); auto async_delete = - [&p, this](const google::cloud::StatusOr& o) -> Status { + [&, this](const google::cloud::StatusOr& o) -> Status { if (!o) return internal::ToArrowStatus(o.status()); // The list includes the directory, skip it. DeleteDir() takes care of it. - if (o->bucket() == p.bucket && o->name() == p.object) return {}; + if (o->bucket() == p.bucket && o->name() == canonical) return {}; return internal::ToArrowStatus( client_.DeleteObject(o->bucket(), o->name(), gcs::Generation(o->generation()))); }; std::vector> submitted; // This iterates over all the objects, and schedules parallel deletes. - auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(p.object); + auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(canonical); for (const auto& o : client_.ListObjects(p.bucket, prefix)) { submitted.push_back(DeferNotOk(io_context.executor()->Submit(async_delete, o))); } @@ -445,29 +478,46 @@ class GcsFileSystem::Impl { } Status DeleteFile(const GcsPath& p) { - if (!p.object.empty() && p.object.back() == '/') { - return Status::IOError("The given path (" + p.full_path + - ") is a directory, use DeleteDir"); + if (!p.object.empty()) { + auto stat = client_.GetObjectMetadata(p.bucket, p.object); + if (!stat) return internal::ToArrowStatus(stat.status()); + if (IsDirectory(*stat)) { + return Status::IOError("The given path '", p.full_path, + "' is a directory, use DeleteDir"); + } } return internal::ToArrowStatus(client_.DeleteObject(p.bucket, p.object)); } Status Move(const GcsPath& src, const GcsPath& dest) { - if (src.full_path.empty() || src.object.empty() || - src.object.back() == internal::kSep) { + if (src == dest) return {}; + if (src.object.empty()) { return Status::IOError( - "Moving directories or buckets cannot be implemented in GCS. You provided (" + - src.full_path + ") as a source for Move()"); + "Moving directories or buckets cannot be implemented in GCS. You provided (", + src.full_path, ") as a source for Move()"); } ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(dest)); if (info.IsDirectory()) { return Status::IOError("Attempting to Move() to an existing directory"); } + ARROW_ASSIGN_OR_RAISE(auto src_info, GetFileInfo(src)); + if (!src_info.IsFile()) { + return Status::IOError("Cannot move source '", src.full_path, + "' the object does not exist or does not represent a file"); + } RETURN_NOT_OK(CopyFile(src, dest)); return DeleteFile(src); } Status CopyFile(const GcsPath& src, const GcsPath& dest) { + auto parent = dest.parent(); + if (!parent.object.empty()) { + ARROW_ASSIGN_OR_RAISE(auto parent_info, GetFileInfo(parent)); + if (parent_info.IsFile()) { + return Status::IOError("Cannot use file '", parent.full_path, + "' as a destination directory"); + } + } auto metadata = client_.RewriteObjectBlocking(src.bucket, src.object, dest.bucket, dest.object); return internal::ToArrowStatus(metadata.status()); @@ -505,20 +555,23 @@ class GcsFileSystem::Impl { } private: - static Result GetFileInfoDirectory(const GcsPath& path, - const google::cloud::Status& status) { - using ::google::cloud::StatusCode; - auto canonical = internal::EnsureTrailingSlash(path.full_path); + static bool IsDirectory(gcs::ObjectMetadata const& o) { + return o.has_metadata("arrow/gcsfs") && o.metadata("arrow/gcsfs") == "directory"; + } + + static Result GetFileInfoBucket(const GcsPath& path, + const google::cloud::Status& status) { if (status.ok()) { - return FileInfo(canonical, FileType::Directory); + return FileInfo(path.bucket, FileType::Directory); } + using ::google::cloud::StatusCode; if (status.code() == StatusCode::kNotFound) { - return FileInfo(canonical, FileType::NotFound); + return FileInfo(path.bucket, FileType::NotFound); } return internal::ToArrowStatus(status); } - static Result GetFileInfoFile( + static Result GetFileInfoObject( const GcsPath& path, const google::cloud::StatusOr& meta) { if (meta.ok()) { return ToFileInfo(path.full_path, *meta); @@ -532,6 +585,9 @@ class GcsFileSystem::Impl { static FileInfo ToFileInfo(const std::string& full_path, const gcs::ObjectMetadata& meta) { + if (IsDirectory(meta)) { + return FileInfo(full_path, FileType::Directory); + } auto info = FileInfo(full_path, FileType::File); info.set_size(static_cast(meta.size())); // An object has multiple "time" attributes, including the time when its data was @@ -659,8 +715,8 @@ Result> GcsFileSystem::OpenInputStream( Result> GcsFileSystem::OpenInputStream( const FileInfo& info) { - if (!info.IsFile()) { - return Status::IOError("Only files can be opened as input streams"); + if (info.IsDirectory()) { + return Status::IOError("Cannot open a directory as an input stream"); } ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(info.path())); return impl_->OpenInputStream(p.bucket, p.object, gcs::Generation(), @@ -688,6 +744,9 @@ Result> GcsFileSystem::OpenInputFile( Result> GcsFileSystem::OpenInputFile( const FileInfo& info) { + if (info.IsDirectory()) { + return Status::IOError("Cannot open a directory as an input stream"); + } ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(info.path())); auto metadata = impl_->GetObjectMetadata(p); ARROW_GCS_RETURN_NOT_OK(metadata.status()); diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index 029660e512e..df20466fcde 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -96,7 +96,6 @@ struct ARROW_EXPORT GcsOptions { static GcsOptions FromServiceAccountCredentials(const std::string& json_object); }; -// - TODO(ARROW-1231) - review this documentation before closing the bug. /// \brief GCS-backed FileSystem implementation. /// /// GCS (Google Cloud Storage - https://cloud.google.com/storage) is a scalable object @@ -120,12 +119,8 @@ struct ARROW_EXPORT GcsOptions { /// - All buckets are treated as directories at the "root" /// - Creating a root directory results in a new bucket being created, this may be slower /// than most GCS operations. -/// - Any object with a name ending with a slash (`/`) character is treated as a -/// directory. -/// - The class creates marker objects for a directory, using a trailing slash in the -/// marker names. For debugging purposes, the metadata of these marker objects indicate -/// that they are markers created by this class. The class does not rely on this -/// annotation. +/// - The class creates marker objects for a directory, using a metadata attribute to +/// annotate the file. /// - GCS can list all the objects with a given prefix, this is used to emulate listing /// of directories. /// - In object lists GCS can summarize all the objects with a common prefix as a single diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 1190969bba8..be7efb2f26e 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -190,7 +190,7 @@ class GcsIntegrationTest : public ::testing::Test { std::string RandomBucketName() { return RandomChars(32); } - std::string RandomFolderName() { return RandomChars(32) + "/"; } + std::string RandomFolderName() { return RandomChars(32); } struct Hierarchy { std::string base_dir; @@ -199,18 +199,18 @@ class GcsIntegrationTest : public ::testing::Test { Result CreateHierarchy(std::shared_ptr fs) { const char* const kTestFolders[] = { - "b/", - "b/0/", - "b/0/0/", - "b/1/", - "b/2/", + "b", + "b/0", + "b/0/0", + "b/1", + "b/2", // Create some additional folders that should not appear in any listing of b/ - "aa/", - "ba/", - "c/", + "aa", + "ba", + "c", }; constexpr auto kFilesPerFolder = 2; - auto base_dir = internal::ConcatAbstractPath(PreexistingBucketPath(), "b/"); + auto base_dir = internal::ConcatAbstractPath(PreexistingBucketPath(), "b"); auto result = Hierarchy{base_dir, {}}; for (auto const* f : kTestFolders) { const auto folder = internal::ConcatAbstractPath(PreexistingBucketPath(), f); @@ -477,10 +477,10 @@ TEST(GcsFileSystem, ObjectMetadataRoundtrip) { TEST_F(GcsIntegrationTest, GetFileInfoBucket) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); - arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketPath(), FileType::Directory); + arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketName(), FileType::Directory); // URI - ASSERT_RAISES(Invalid, fs->GetFileInfo("gs://" + PreexistingBucketPath())); + ASSERT_RAISES(Invalid, fs->GetFileInfo("gs://" + PreexistingBucketName())); } TEST_F(GcsIntegrationTest, GetFileInfoObject) { @@ -506,6 +506,12 @@ TEST_F(GcsIntegrationTest, GetFileInfoSelectorRecursive) { } return hierarchy.base_dir != info.path(); }); + // Directories must appear without a trailing slash in the results. + std::transform(expected.begin(), expected.end(), expected.begin(), + [](FileInfo const& info) { + if (!info.IsDirectory()) return info; + return Dir(internal::RemoveTrailingSlash(info.path()).to_string()); + }); auto selector = FileSelector(); selector.base_dir = hierarchy.base_dir; @@ -527,8 +533,7 @@ TEST_F(GcsIntegrationTest, GetFileInfoSelectorNonRecursive) { std::copy_if(hierarchy.contents.begin(), hierarchy.contents.end(), std::back_inserter(expected), [&](const arrow::fs::FileInfo& info) { if (info.path() == hierarchy.base_dir) return false; - return internal::EnsureTrailingSlash( - internal::GetAbstractPathParent(info.path()).first) == + return internal::GetAbstractPathParent(info.path()).first == hierarchy.base_dir; }); @@ -558,6 +563,12 @@ TEST_F(GcsIntegrationTest, GetFileInfoSelectorLimitedRecursion) { } return internal::Depth(info.path()) <= max_depth; }); + // Directories must appear without a trailing slash in the results. + std::transform(expected.begin(), expected.end(), expected.begin(), + [](FileInfo const& info) { + if (!info.IsDirectory()) return info; + return Dir(internal::RemoveTrailingSlash(info.path()).to_string()); + }); auto selector = FileSelector(); selector.base_dir = hierarchy.base_dir; selector.allow_not_found = true; @@ -593,7 +604,7 @@ TEST_F(GcsIntegrationTest, CreateDirSuccessBucketOnly) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); auto bucket_name = RandomBucketName(); ASSERT_OK(fs->CreateDir(bucket_name, false)); - arrow::fs::AssertFileInfo(fs.get(), bucket_name + "/", FileType::Directory); + arrow::fs::AssertFileInfo(fs.get(), bucket_name, FileType::Directory); } TEST_F(GcsIntegrationTest, CreateDirSuccessBucketAndFolder) { @@ -605,7 +616,7 @@ TEST_F(GcsIntegrationTest, CreateDirSuccessBucketAndFolder) { TEST_F(GcsIntegrationTest, CreateDirFailureFolderWithMissingBucket) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); - const auto path = std::string("not-a-bucket/new-folder/"); + const auto path = std::string("not-a-bucket/new-folder"); ASSERT_RAISES(IOError, fs->CreateDir(path, false)); } @@ -613,13 +624,13 @@ TEST_F(GcsIntegrationTest, CreateDirRecursiveBucketOnly) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); auto bucket_name = RandomBucketName(); ASSERT_OK(fs->CreateDir(bucket_name, true)); - arrow::fs::AssertFileInfo(fs.get(), bucket_name + "/", FileType::Directory); + arrow::fs::AssertFileInfo(fs.get(), bucket_name, FileType::Directory); } TEST_F(GcsIntegrationTest, CreateDirRecursiveFolderOnly) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); const auto parent = PreexistingBucketPath() + RandomFolderName(); - const auto path = parent + "new-sub/"; + const auto path = internal::ConcatAbstractPath(parent, "new-sub"); ASSERT_OK(fs->CreateDir(path, true)); arrow::fs::AssertFileInfo(fs.get(), path, FileType::Directory); arrow::fs::AssertFileInfo(fs.get(), parent, FileType::Directory); @@ -628,12 +639,12 @@ TEST_F(GcsIntegrationTest, CreateDirRecursiveFolderOnly) { TEST_F(GcsIntegrationTest, CreateDirRecursiveBucketAndFolder) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); auto bucket_name = RandomBucketName(); - const auto parent = bucket_name + "/" + RandomFolderName(); - const auto path = parent + "new-sub/"; + const auto parent = internal::ConcatAbstractPath(bucket_name, RandomFolderName()); + const auto path = internal::ConcatAbstractPath(parent, "new-sub"); ASSERT_OK(fs->CreateDir(path, true)); arrow::fs::AssertFileInfo(fs.get(), path, FileType::Directory); arrow::fs::AssertFileInfo(fs.get(), parent, FileType::Directory); - arrow::fs::AssertFileInfo(fs.get(), bucket_name + "/", FileType::Directory); + arrow::fs::AssertFileInfo(fs.get(), bucket_name, FileType::Directory); } TEST_F(GcsIntegrationTest, CreateDirUri) { @@ -646,7 +657,7 @@ TEST_F(GcsIntegrationTest, DeleteDirSuccess) { ASSERT_OK_AND_ASSIGN(auto hierarchy, CreateHierarchy(fs)); ASSERT_OK(fs->DeleteDir(hierarchy.base_dir)); - arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketPath(), FileType::Directory); + arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketName(), FileType::Directory); arrow::fs::AssertFileInfo(fs.get(), PreexistingObjectPath(), FileType::File); for (auto const& info : hierarchy.contents) { const auto expected_type = fs::internal::IsAncestorOf(hierarchy.base_dir, info.path()) @@ -667,7 +678,7 @@ TEST_F(GcsIntegrationTest, DeleteDirContentsSuccess) { ASSERT_OK(fs->DeleteDirContents(hierarchy.base_dir)); arrow::fs::AssertFileInfo(fs.get(), hierarchy.base_dir, FileType::Directory); - arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketPath(), FileType::Directory); + arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketName(), FileType::Directory); arrow::fs::AssertFileInfo(fs.get(), PreexistingObjectPath(), FileType::File); for (auto const& info : hierarchy.contents) { auto expected_type = FileType::NotFound; @@ -728,9 +739,9 @@ TEST_F(GcsIntegrationTest, MoveFileCannotRenameDirectories) { TEST_F(GcsIntegrationTest, MoveFileCannotRenameToDirectory) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); - ASSERT_OK(fs->CreateDir(PreexistingBucketPath() + "destination/", false)); + ASSERT_OK(fs->CreateDir(PreexistingBucketPath() + "destination", false)); ASSERT_RAISES(IOError, fs->Move(PreexistingObjectPath(), - PreexistingBucketPath() + "destination/")); + PreexistingBucketPath() + "destination")); } TEST_F(GcsIntegrationTest, MoveFileUri) { From 43c14e5b52adcb7cdb152848e4d0f2daca225792 Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Fri, 7 Jan 2022 18:04:57 +0000 Subject: [PATCH 2/5] Address review comments --- cpp/src/arrow/filesystem/gcsfs.cc | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 81b2898f6aa..cbd5376384a 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -366,7 +366,7 @@ class GcsFileSystem::Impl { util::string_view name) { // Make the name canonical. const auto canonical = internal::RemoveTrailingSlash(name).to_string(); - auto object = client_.InsertObject( + google::cloud::StatusOr object = client_.InsertObject( bucket, canonical, std::string(), gcs::WithObjectMetadata( gcs::ObjectMetadata().upsert_metadata("arrow/gcsfs", "directory")), @@ -404,7 +404,7 @@ class GcsFileSystem::Impl { if (dir.empty()) { // We could not find any of the parent directories in the bucket, the last step is // to find out if the bucket exists, and if necessary, create it - auto b = client_.GetBucketMetadata(bucket); + google::cloud::StatusOr b = client_.GetBucketMetadata(bucket); if (!b) { if (b.status().code() == GcsCode::kNotFound) { b = client_.CreateBucket(bucket, gcs::BucketMetadata()); @@ -421,7 +421,7 @@ class GcsFileSystem::Impl { return NotDirectoryError(*o); } } - return {}; + return Status::OK(); } Status CreateDir(const GcsPath& p) { @@ -462,7 +462,7 @@ class GcsFileSystem::Impl { [&, this](const google::cloud::StatusOr& o) -> Status { if (!o) return internal::ToArrowStatus(o.status()); // The list includes the directory, skip it. DeleteDir() takes care of it. - if (o->bucket() == p.bucket && o->name() == canonical) return {}; + if (o->bucket() == p.bucket && o->name() == canonical) return Status::OK(); return internal::ToArrowStatus( client_.DeleteObject(o->bucket(), o->name(), gcs::Generation(o->generation()))); }; @@ -490,7 +490,7 @@ class GcsFileSystem::Impl { } Status Move(const GcsPath& src, const GcsPath& dest) { - if (src == dest) return {}; + if (src == dest) return Status::OK(); if (src.object.empty()) { return Status::IOError( "Moving directories or buckets cannot be implemented in GCS. You provided (", @@ -498,7 +498,8 @@ class GcsFileSystem::Impl { } ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(dest)); if (info.IsDirectory()) { - return Status::IOError("Attempting to Move() to an existing directory"); + return Status::IOError("Attempting to Move() '", info.path(), + "' to an existing directory"); } ARROW_ASSIGN_OR_RAISE(auto src_info, GetFileInfo(src)); if (!src_info.IsFile()) { @@ -555,7 +556,7 @@ class GcsFileSystem::Impl { } private: - static bool IsDirectory(gcs::ObjectMetadata const& o) { + static bool IsDirectory(const gcs::ObjectMetadata& o) { return o.has_metadata("arrow/gcsfs") && o.metadata("arrow/gcsfs") == "directory"; } @@ -716,7 +717,8 @@ Result> GcsFileSystem::OpenInputStream( Result> GcsFileSystem::OpenInputStream( const FileInfo& info) { if (info.IsDirectory()) { - return Status::IOError("Cannot open a directory as an input stream"); + return Status::IOError("Cannot open directory '", info.path(), + "' as an input stream"); } ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(info.path())); return impl_->OpenInputStream(p.bucket, p.object, gcs::Generation(), @@ -745,7 +747,8 @@ Result> GcsFileSystem::OpenInputFile( Result> GcsFileSystem::OpenInputFile( const FileInfo& info) { if (info.IsDirectory()) { - return Status::IOError("Cannot open a directory as an input stream"); + return Status::IOError("Cannot open directory '", info.path(), + "' as an input stream"); } ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(info.path())); auto metadata = impl_->GetObjectMetadata(p); From b88b90e4f712ba7ab326e9be848bf1993de88516 Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Mon, 10 Jan 2022 23:35:48 +0000 Subject: [PATCH 3/5] Address review comments --- cpp/src/arrow/filesystem/gcsfs.cc | 9 +++++++-- cpp/src/arrow/filesystem/gcsfs_test.cc | 26 ++++++++++++++------------ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index cbd5376384a..8cfc2531579 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -326,8 +326,8 @@ class GcsFileSystem::Impl { Result GetFileInfo(const FileSelector& select) { ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(select.base_dir)); - // Adding the trailing '/' avoids problems with files named 'a', 'ab', 'ac' which - // where GCS would return all of them if the prefix is 'a'. + // Adding the trailing '/' avoids problems with files named 'a', 'ab', 'ac' where GCS + // would return all of them if the prefix is 'a'. const auto canonical = internal::EnsureTrailingSlash(p.object); const auto max_depth = internal::Depth(canonical) + select.max_recursion; auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(canonical); @@ -356,6 +356,9 @@ class GcsFileSystem::Impl { // To find out if the directory exists we need to perform an additional query. ARROW_ASSIGN_OR_RAISE(auto directory, GetFileInfo(p)); if (directory.IsDirectory()) return result; + if (directory.IsFile()) { + return Status::IOError("Cannot use file '", select.base_dir, "' as a directory"); + } return Status::IOError("No such file or directory '", select.base_dir, "'"); } @@ -412,6 +415,8 @@ class GcsFileSystem::Impl { if (!b) return internal::ToArrowStatus(b.status()); } } + // Iterate in reverse order, if `a/b/c` is found, there is no need to create `a/b/` + // and `a/` for (auto d = missing_parents.rbegin(); d != missing_parents.rend(); ++d) { auto o = CreateDirMarker(bucket, *d); if (o) { diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index be7efb2f26e..4105ad42d84 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -226,6 +226,18 @@ class GcsIntegrationTest : public ::testing::Test { return result; } + // Directories must appear without a trailing slash in the results. + std::vector static CleanupDirectoryNames( + std::vector expected) { + std::transform(expected.begin(), expected.end(), expected.begin(), + [](FileInfo const& info) { + if (!info.IsDirectory()) return info; + return Dir(internal::RemoveTrailingSlash(info.path()).to_string()); + }); + return expected; + } + + private: std::string RandomChars(std::size_t count) { auto const fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789"); @@ -506,12 +518,7 @@ TEST_F(GcsIntegrationTest, GetFileInfoSelectorRecursive) { } return hierarchy.base_dir != info.path(); }); - // Directories must appear without a trailing slash in the results. - std::transform(expected.begin(), expected.end(), expected.begin(), - [](FileInfo const& info) { - if (!info.IsDirectory()) return info; - return Dir(internal::RemoveTrailingSlash(info.path()).to_string()); - }); + expected = CleanupDirectoryNames(std::move(expected)); auto selector = FileSelector(); selector.base_dir = hierarchy.base_dir; @@ -563,12 +570,7 @@ TEST_F(GcsIntegrationTest, GetFileInfoSelectorLimitedRecursion) { } return internal::Depth(info.path()) <= max_depth; }); - // Directories must appear without a trailing slash in the results. - std::transform(expected.begin(), expected.end(), expected.begin(), - [](FileInfo const& info) { - if (!info.IsDirectory()) return info; - return Dir(internal::RemoveTrailingSlash(info.path()).to_string()); - }); + expected = CleanupDirectoryNames(std::move(expected)); auto selector = FileSelector(); selector.base_dir = hierarchy.base_dir; selector.allow_not_found = true; From 1ec19e071ce5aaa52b468e9b87289183d32f1002 Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Tue, 11 Jan 2022 13:39:27 +0000 Subject: [PATCH 4/5] Doh! wrong comment, thanks to @pitrou for catching that --- cpp/src/arrow/filesystem/gcsfs.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 8cfc2531579..1767b50d414 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -415,10 +415,11 @@ class GcsFileSystem::Impl { if (!b) return internal::ToArrowStatus(b.status()); } } - // Iterate in reverse order, if `a/b/c` is found, there is no need to create `a/b/` - // and `a/` - for (auto d = missing_parents.rbegin(); d != missing_parents.rend(); ++d) { - auto o = CreateDirMarker(bucket, *d); + + // Note that the list of parents are sorted from deepest to most shallow, this is + // convenient because as soon as we find a directory we can stop the iteration. + for (auto const& d : missing_parents) { + auto o = CreateDirMarker(bucket, d); if (o) { if (IsDirectory(*o)) continue; // This is probably a race condition, something created a file before we managed From aba03e0760a5e332801002b9c51f54bfd4316dc2 Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Tue, 11 Jan 2022 13:41:42 +0000 Subject: [PATCH 5/5] Fix lints --- cpp/src/arrow/filesystem/gcsfs_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 4105ad42d84..85f7ca2030b 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -237,7 +237,6 @@ class GcsIntegrationTest : public ::testing::Test { return expected; } - private: std::string RandomChars(std::size_t count) { auto const fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789");