From ced6a7a0eaf8b57e62f76173ccc4625abebc93ca Mon Sep 17 00:00:00 2001 From: Haocheng Liu Date: Fri, 24 May 2024 18:14:40 -0400 Subject: [PATCH 1/3] GH-41493: [C++] Add a new S3option to check existence before CreateDir --- cpp/src/arrow/filesystem/s3fs.cc | 43 +++++++++++++++++++++++---- cpp/src/arrow/filesystem/s3fs.h | 11 +++++++ cpp/src/arrow/filesystem/s3fs_test.cc | 31 ++++++++++++++++++- 3 files changed, 78 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 640888e1c4f..c98ae845674 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -2859,6 +2859,7 @@ Status S3FileSystem::CreateDir(const std::string& s, bool recursive) { return impl_->CreateBucket(path.bucket); } + FileInfo file_info; // Create object if (recursive) { // Ensure bucket exists @@ -2866,10 +2867,33 @@ Status S3FileSystem::CreateDir(const std::string& s, bool recursive) { if (!bucket_exists) { RETURN_NOT_OK(impl_->CreateBucket(path.bucket)); } + + auto key_i = path.key_parts.begin(); + std::string parent_key{}; + if (options().check_directory_existence_before_creation) { + // Walk up the directory first to find the first existing parent + for (const auto& part : path.key_parts) { + parent_key += part; + parent_key += kSep; + } + for (key_i = path.key_parts.end(); key_i-- != path.key_parts.begin();) { + ARROW_ASSIGN_OR_RAISE(file_info, + this->GetFileInfo(path.bucket + kSep + parent_key)); + if (file_info.type() != FileType::NotFound) { + /// Found! + break; + } else { + // remove the kSep and the part + parent_key.pop_back(); + parent_key.erase(parent_key.end() - key_i->size(), parent_key.end()); + } + } + key_i++; // Above for loop moves one extra iterator at the end + } // Ensure that all parents exist, then the directory itself - std::string parent_key; - for (const auto& part : path.key_parts) { - parent_key += part; + // Create all missing directories + for (; key_i < path.key_parts.end(); ++key_i) { + parent_key += *key_i; parent_key += kSep; RETURN_NOT_OK(impl_->CreateEmptyDir(path.bucket, parent_key)); } @@ -2887,11 +2911,18 @@ Status S3FileSystem::CreateDir(const std::string& s, bool recursive) { "': parent directory does not exist"); } } + } - // XXX Should we check that no non-directory entry exists? - // Minio does it for us, not sure about other S3 implementations. - return impl_->CreateEmptyDir(path.bucket, path.key); + // Check if the directory exists already + if (options().check_directory_existence_before_creation) { + ARROW_ASSIGN_OR_RAISE(file_info, this->GetFileInfo(path.full_path)); + if (file_info.type() != FileType::NotFound) { + return Status::OK(); + } } + // XXX Should we check that no non-directory entry exists? + // Minio does it for us, not sure about other S3 implementations. + return impl_->CreateEmptyDir(path.bucket, path.key); } Status S3FileSystem::DeleteDir(const std::string& s) { diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index 82d08bc5ea8..fbbe9d0b3f4 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -166,6 +166,17 @@ struct ARROW_EXPORT S3Options { /// Whether to allow deletion of buckets bool allow_bucket_deletion = false; + /// Whether to allow pessimistic directory creation in CreateDir function + /// + /// By default, CreateDir function will try to create the directory without checking its + /// existence. It's an optimization to try directory creation and catch the error, + /// rather than issue two dependent I/O calls. + /// Though for key/value storage like Google Cloud Storage, too many creation calls will + /// breach the rate limit for object mutation operations and cause serious consequences. + /// It's also possible you don't have creation access for the parent directory. Set it + /// to be true to address these scenarios. + bool check_directory_existence_before_creation = false; + /// \brief Default metadata for OpenOutputStream. /// /// This will be ignored if non-empty metadata is passed to OpenOutputStream. diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 88cc96956e3..371f73c1f7e 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -922,9 +922,13 @@ TEST_F(TestS3FS, CreateDir) { // New "directory" AssertFileInfo(fs_.get(), "bucket/newdir", FileType::NotFound); - ASSERT_OK(fs_->CreateDir("bucket/newdir")); + ASSERT_OK(fs_->CreateDir("bucket/newdir", false)); AssertFileInfo(fs_.get(), "bucket/newdir", FileType::Directory); + // By default CreateDir uses recursvie mode, make it explictly to be false + ASSERT_RAISES(IOError, + fs_->CreateDir("bucket/newdir/newsub/newsubsub", /*recursive=*/false)); + // New "directory", recursive ASSERT_OK(fs_->CreateDir("bucket/newdir/newsub/newsubsub", /*recursive=*/true)); AssertFileInfo(fs_.get(), "bucket/newdir/newsub", FileType::Directory); @@ -939,6 +943,31 @@ TEST_F(TestS3FS, CreateDir) { // Extraneous slashes ASSERT_RAISES(Invalid, fs_->CreateDir("bucket//somedir")); ASSERT_RAISES(Invalid, fs_->CreateDir("bucket/somedir//newdir")); + + // check existing before creation + options_.check_directory_existence_before_creation = true; + MakeFileSystem(); + /// New "directory" again + AssertFileInfo(fs_.get(), "bucket/checknewdir", FileType::NotFound); + ASSERT_OK(fs_->CreateDir("bucket/checknewdir")); + AssertFileInfo(fs_.get(), "bucket/checknewdir", FileType::Directory); + + ASSERT_RAISES(IOError, fs_->CreateDir("bucket/checknewdir/newsub/newsubsub/newsubsub/", + /*recursive=*/false)); + + /// New "directory" again, recursive + ASSERT_OK(fs_->CreateDir("bucket/checknewdir/newsub/newsubsub", /*recursive=*/true)); + AssertFileInfo(fs_.get(), "bucket/checknewdir/newsub", FileType::Directory); + AssertFileInfo(fs_.get(), "bucket/checknewdir/newsub/newsubsub", FileType::Directory); + AssertFileInfo(fs_.get(), "bucket/checknewdir/newsub/newsubsub/newsubsub", + FileType::NotFound); + //// Try creation with the same name + ASSERT_OK(fs_->CreateDir("bucket/checknewdir/newsub/newsubsub/newsubsub/", + /*recursive=*/true)); + AssertFileInfo(fs_.get(), "bucket/checknewdir/newsub", FileType::Directory); + AssertFileInfo(fs_.get(), "bucket/checknewdir/newsub/newsubsub", FileType::Directory); + AssertFileInfo(fs_.get(), "bucket/checknewdir/newsub/newsubsub/newsubsub", + FileType::Directory); } TEST_F(TestS3FS, DeleteFile) { From 73b9b39ab19de53890052d912f9fab519ec1fdc8 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 4 Jun 2024 14:50:56 +0200 Subject: [PATCH 2/3] Various nits --- cpp/src/arrow/filesystem/s3fs.cc | 2 +- cpp/src/arrow/filesystem/s3fs_test.cc | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index c98ae845674..3eb8a96d61f 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -2880,7 +2880,7 @@ Status S3FileSystem::CreateDir(const std::string& s, bool recursive) { ARROW_ASSIGN_OR_RAISE(file_info, this->GetFileInfo(path.bucket + kSep + parent_key)); if (file_info.type() != FileType::NotFound) { - /// Found! + // Found! break; } else { // remove the kSep and the part diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 371f73c1f7e..7904ef6be7d 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -922,7 +922,7 @@ TEST_F(TestS3FS, CreateDir) { // New "directory" AssertFileInfo(fs_.get(), "bucket/newdir", FileType::NotFound); - ASSERT_OK(fs_->CreateDir("bucket/newdir", false)); + ASSERT_OK(fs_->CreateDir("bucket/newdir", /*recursive=*/ false)); AssertFileInfo(fs_.get(), "bucket/newdir", FileType::Directory); // By default CreateDir uses recursvie mode, make it explictly to be false @@ -947,7 +947,7 @@ TEST_F(TestS3FS, CreateDir) { // check existing before creation options_.check_directory_existence_before_creation = true; MakeFileSystem(); - /// New "directory" again + // New "directory" again AssertFileInfo(fs_.get(), "bucket/checknewdir", FileType::NotFound); ASSERT_OK(fs_->CreateDir("bucket/checknewdir")); AssertFileInfo(fs_.get(), "bucket/checknewdir", FileType::Directory); @@ -955,13 +955,13 @@ TEST_F(TestS3FS, CreateDir) { ASSERT_RAISES(IOError, fs_->CreateDir("bucket/checknewdir/newsub/newsubsub/newsubsub/", /*recursive=*/false)); - /// New "directory" again, recursive + // New "directory" again, recursive ASSERT_OK(fs_->CreateDir("bucket/checknewdir/newsub/newsubsub", /*recursive=*/true)); AssertFileInfo(fs_.get(), "bucket/checknewdir/newsub", FileType::Directory); AssertFileInfo(fs_.get(), "bucket/checknewdir/newsub/newsubsub", FileType::Directory); AssertFileInfo(fs_.get(), "bucket/checknewdir/newsub/newsubsub/newsubsub", FileType::NotFound); - //// Try creation with the same name + // Try creation with the same name ASSERT_OK(fs_->CreateDir("bucket/checknewdir/newsub/newsubsub/newsubsub/", /*recursive=*/true)); AssertFileInfo(fs_.get(), "bucket/checknewdir/newsub", FileType::Directory); From dddf68ffd88be404e6c872419d81a9cd435e4d73 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 4 Jun 2024 16:42:05 +0200 Subject: [PATCH 3/3] Fix lint (sorry!) --- cpp/src/arrow/filesystem/s3fs_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 7904ef6be7d..7bfa120eda6 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -922,7 +922,7 @@ TEST_F(TestS3FS, CreateDir) { // New "directory" AssertFileInfo(fs_.get(), "bucket/newdir", FileType::NotFound); - ASSERT_OK(fs_->CreateDir("bucket/newdir", /*recursive=*/ false)); + ASSERT_OK(fs_->CreateDir("bucket/newdir", /*recursive=*/false)); AssertFileInfo(fs_.get(), "bucket/newdir", FileType::Directory); // By default CreateDir uses recursvie mode, make it explictly to be false