From abf641241cda79ad177f97dff4197e16362e8e3e Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 28 Oct 2023 19:04:27 +0100 Subject: [PATCH 01/45] Paste in s3 GetFileInfo tests --- cpp/src/arrow/filesystem/azurefs_test.cc | 63 ++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index f57fc4d8140..84b04cd49f1 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -226,6 +226,69 @@ class TestAzureFileSystem : public ::testing::Test { } }; +TEST_F(GcsIntegrationTest, GetFileInfoBucket) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketName(), FileType::Directory); + + // URI + ASSERT_RAISES(Invalid, fs->GetFileInfo("gs://" + PreexistingBucketName())); +} + +TEST_F(GcsIntegrationTest, GetFileInfoObjectWithNestedStructure) { + // Adds detailed tests to handle cases of different edge cases + // with directory naming conventions (e.g. with and without slashes). + auto fs = GcsFileSystem::Make(TestGcsOptions()); + constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; + ASSERT_OK_AND_ASSIGN( + auto output, + fs->OpenOutputStream(PreexistingBucketPath() + kObjectName, /*metadata=*/{})); + const auto data = std::string(kLoremIpsum); + ASSERT_OK(output->Write(data.data(), data.size())); + ASSERT_OK(output->Close()); + + // 0 is immediately after "/" lexicographically, ensure that this doesn't + // cause unexpected issues. + ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(PreexistingBucketPath() + + "test-object-dir/some_other_dir0", + /*metadata=*/{})); + ASSERT_OK(output->Write(data.data(), data.size())); + ASSERT_OK(output->Close()); + ASSERT_OK_AND_ASSIGN( + output, + fs->OpenOutputStream(PreexistingBucketPath() + kObjectName + "0", /*metadata=*/{})); + ASSERT_OK(output->Write(data.data(), data.size())); + ASSERT_OK(output->Close()); + + AssertFileInfo(fs.get(), PreexistingBucketPath() + kObjectName, FileType::File); + AssertFileInfo(fs.get(), PreexistingBucketPath() + kObjectName + "/", + FileType::NotFound); + AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir", + FileType::Directory); + AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir/", + FileType::Directory); + AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir/some_other_dir", + FileType::Directory); + AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir/some_other_dir/", + FileType::Directory); + + AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-di", + FileType::NotFound); + AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir/some_other_di", + FileType::NotFound); +} + +TEST_F(GcsIntegrationTest, GetFileInfoObjectNoExplicitObject) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + auto object = + GcsClient().GetObjectMetadata(PreexistingBucketName(), PreexistingObjectName()); + ASSERT_TRUE(object.ok()) << "status=" << object.status(); + arrow::fs::AssertFileInfo(fs.get(), PreexistingObjectPath(), FileType::File, + object->time_created(), static_cast(object->size())); + + // URI + ASSERT_RAISES(Invalid, fs->GetFileInfo("gs://" + PreexistingObjectName())); +} + TEST_F(TestAzureFileSystem, OpenInputStreamString) { std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); From 80b12ee21441b34198407a850677c7347163f8a0 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 28 Oct 2023 19:40:20 +0100 Subject: [PATCH 02/45] Tests build correctly --- cpp/src/arrow/filesystem/azurefs_test.cc | 63 +++++++++++++----------- 1 file changed, 34 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 84b04cd49f1..dea04e2b650 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -47,6 +47,7 @@ #include #include +#include "arrow/filesystem/test_util.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/util/io_util.h" @@ -226,67 +227,71 @@ class TestAzureFileSystem : public ::testing::Test { } }; -TEST_F(GcsIntegrationTest, GetFileInfoBucket) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketName(), FileType::Directory); +TEST_F(TestAzureFileSystem, GetFileInfoContainer) { + arrow::fs::AssertFileInfo(fs_.get(), PreexistingContainerName(), FileType::Directory); // URI - ASSERT_RAISES(Invalid, fs->GetFileInfo("gs://" + PreexistingBucketName())); + ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingContainerName())); } -TEST_F(GcsIntegrationTest, GetFileInfoObjectWithNestedStructure) { +TEST_F(TestAzureFileSystem, GetFileInfoObjectWithNestedStructure) { // Adds detailed tests to handle cases of different edge cases // with directory naming conventions (e.g. with and without slashes). - auto fs = GcsFileSystem::Make(TestGcsOptions()); constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; ASSERT_OK_AND_ASSIGN( auto output, - fs->OpenOutputStream(PreexistingBucketPath() + kObjectName, /*metadata=*/{})); + fs_->OpenOutputStream(PreexistingContainerPath() + kObjectName, /*metadata=*/{})); const auto data = std::string(kLoremIpsum); ASSERT_OK(output->Write(data.data(), data.size())); ASSERT_OK(output->Close()); // 0 is immediately after "/" lexicographically, ensure that this doesn't // cause unexpected issues. - ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(PreexistingBucketPath() + - "test-object-dir/some_other_dir0", - /*metadata=*/{})); + ASSERT_OK_AND_ASSIGN(output, + fs_->OpenOutputStream( + PreexistingContainerPath() + "test-object-dir/some_other_dir0", + /*metadata=*/{})); ASSERT_OK(output->Write(data.data(), data.size())); ASSERT_OK(output->Close()); ASSERT_OK_AND_ASSIGN( - output, - fs->OpenOutputStream(PreexistingBucketPath() + kObjectName + "0", /*metadata=*/{})); + output, fs_->OpenOutputStream(PreexistingContainerPath() + kObjectName + "0", + /*metadata=*/{})); ASSERT_OK(output->Write(data.data(), data.size())); ASSERT_OK(output->Close()); - AssertFileInfo(fs.get(), PreexistingBucketPath() + kObjectName, FileType::File); - AssertFileInfo(fs.get(), PreexistingBucketPath() + kObjectName + "/", + AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName, FileType::File); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName + "/", FileType::NotFound); - AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir", + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir", FileType::Directory); - AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir/", + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/", FileType::Directory); - AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir/some_other_dir", + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/some_other_dir", FileType::Directory); - AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir/some_other_dir/", + AssertFileInfo(fs_.get(), + PreexistingContainerPath() + "test-object-dir/some_other_dir/", FileType::Directory); - AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-di", + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-di", FileType::NotFound); - AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir/some_other_di", + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/some_other_di", FileType::NotFound); } -TEST_F(GcsIntegrationTest, GetFileInfoObjectNoExplicitObject) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - auto object = - GcsClient().GetObjectMetadata(PreexistingBucketName(), PreexistingObjectName()); - ASSERT_TRUE(object.ok()) << "status=" << object.status(); - arrow::fs::AssertFileInfo(fs.get(), PreexistingObjectPath(), FileType::File, - object->time_created(), static_cast(object->size())); +TEST_F(TestAzureFileSystem, GetFileInfoObjectNoExplicitObject) { + auto object_properties = + service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlobClient(PreexistingObjectName()) + .GetProperties() + .Value; + + arrow::fs::AssertFileInfo( + fs_.get(), PreexistingObjectPath(), FileType::File, + std::chrono::system_clock::time_point(object_properties.LastModified), + static_cast(object_properties.BlobSize)); // URI - ASSERT_RAISES(Invalid, fs->GetFileInfo("gs://" + PreexistingObjectName())); + ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingObjectName())); } TEST_F(TestAzureFileSystem, OpenInputStreamString) { @@ -343,7 +348,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamNotFound) { TEST_F(TestAzureFileSystem, OpenInputStreamInfoInvalid) { // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(PreexistingBucketPath())); + // fs->GetFileInfo(PreexistingContainerPath())); arrow::fs::FileInfo info(PreexistingContainerPath(), FileType::Directory); ASSERT_RAISES(IOError, fs_->OpenInputStream(info)); From 16b6b9ab23f93ae5b8e9bbda35690eacf901c080 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 28 Oct 2023 19:41:16 +0100 Subject: [PATCH 03/45] Add test for root of storage account --- cpp/src/arrow/filesystem/azurefs_test.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index dea04e2b650..252831693aa 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -227,6 +227,13 @@ class TestAzureFileSystem : public ::testing::Test { } }; +TEST_F(TestAzureFileSystem, GetFileInfoAccount) { + arrow::fs::AssertFileInfo(fs_.get(), "", FileType::Directory); + + // URI + ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://")); +} + TEST_F(TestAzureFileSystem, GetFileInfoContainer) { arrow::fs::AssertFileInfo(fs_.get(), PreexistingContainerName(), FileType::Directory); From fa869ea09634984bebc448ed87886ac20920eb17 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 28 Oct 2023 19:43:29 +0100 Subject: [PATCH 04/45] Partially complete GetFileInfo --- cpp/src/arrow/filesystem/azurefs.cc | 37 ++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index a04338d999d..c2d7528cc5e 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -467,6 +467,40 @@ class AzureFileSystem::Impl { const AzureOptions& options() const { return options_; } + Result GetFileInfo(const AzurePath& path) { + FileInfo info; + info.set_path(path.full_path); + + if (path.container.empty()) { + DCHECK(path.path_to_file.empty()); // The path is invalid if the container is empty + // but not path_to_file. + // path must refer to the root of the Azure storage account. This is a directory, + // and there isn't any extra metadata to fetch. + // TODO: Confirm if we need to check that the storage account exists. I think there + // would have been an earlier failure. + return FileInfo(path.full_path, FileType::Directory); + } + if (path.path_to_file.empty()) { + // path refers to a container. This is a directory if it exists. + auto container_client = service_client_->GetBlobContainerClient(path.container); + try { + auto properties = container_client.GetProperties(); + auto info = FileInfo(path.full_path, FileType::Directory); + info.set_mtime( + std::chrono::system_clock::time_point(properties.Value.LastModified)); + return info; + } catch (const Azure::Storage::StorageException& exception) { + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + return FileInfo(path.full_path, FileType::NotFound); + } + return ErrorToStatus( + "When fetching properties for '" + container_client.GetUrl() + "': ", + exception); + } + } + return info; + } + Result> OpenInputFile(const std::string& s, AzureFileSystem* fs) { ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(s)); @@ -518,7 +552,8 @@ bool AzureFileSystem::Equals(const FileSystem& other) const { } Result AzureFileSystem::GetFileInfo(const std::string& path) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + ARROW_ASSIGN_OR_RAISE(auto p, AzurePath::FromString(path)); + return impl_->GetFileInfo(p); } Result AzureFileSystem::GetFileInfo(const FileSelector& select) { From b3f715799b06aa2467b2d39b3984c49384a67aa4 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 29 Oct 2023 16:11:56 +0000 Subject: [PATCH 05/45] Avoid relying on filesystem writes in tests --- cpp/src/arrow/filesystem/azurefs_test.cc | 33 ++++++++++++------------ 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 252831693aa..580f2f88363 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -237,6 +237,8 @@ TEST_F(TestAzureFileSystem, GetFileInfoAccount) { TEST_F(TestAzureFileSystem, GetFileInfoContainer) { arrow::fs::AssertFileInfo(fs_.get(), PreexistingContainerName(), FileType::Directory); + arrow::fs::AssertFileInfo(fs_.get(), "non-existent-container", FileType::NotFound); + // URI ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingContainerName())); } @@ -245,26 +247,21 @@ TEST_F(TestAzureFileSystem, GetFileInfoObjectWithNestedStructure) { // Adds detailed tests to handle cases of different edge cases // with directory naming conventions (e.g. with and without slashes). constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; - ASSERT_OK_AND_ASSIGN( - auto output, - fs_->OpenOutputStream(PreexistingContainerPath() + kObjectName, /*metadata=*/{})); - const auto data = std::string(kLoremIpsum); - ASSERT_OK(output->Write(data.data(), data.size())); - ASSERT_OK(output->Close()); + // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. + service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(kObjectName) + .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); // 0 is immediately after "/" lexicographically, ensure that this doesn't // cause unexpected issues. - ASSERT_OK_AND_ASSIGN(output, - fs_->OpenOutputStream( - PreexistingContainerPath() + "test-object-dir/some_other_dir0", - /*metadata=*/{})); - ASSERT_OK(output->Write(data.data(), data.size())); - ASSERT_OK(output->Close()); - ASSERT_OK_AND_ASSIGN( - output, fs_->OpenOutputStream(PreexistingContainerPath() + kObjectName + "0", - /*metadata=*/{})); - ASSERT_OK(output->Write(data.data(), data.size())); - ASSERT_OK(output->Close()); + // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. + service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient("test-object-dir/some_other_dir0") + .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + + service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(std::string(kObjectName) + "0") + .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName, FileType::File); AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName + "/", @@ -285,6 +282,8 @@ TEST_F(TestAzureFileSystem, GetFileInfoObjectWithNestedStructure) { FileType::NotFound); } +// TODO: Add ADLS Gen2 directory tests. + TEST_F(TestAzureFileSystem, GetFileInfoObjectNoExplicitObject) { auto object_properties = service_client_->GetBlobContainerClient(PreexistingContainerName()) From 7ccafbf6aea1e4a56056cc2b5c874cac8ae17c5d Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 29 Oct 2023 18:13:44 +0000 Subject: [PATCH 06/45] Implement get info for objects, avoiding HNS specific APIs --- cpp/src/arrow/filesystem/azurefs.cc | 32 ++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index c2d7528cc5e..49923f5ad80 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -151,6 +151,13 @@ Status ErrorToStatus(const std::string& prefix, return Status::IOError(prefix, " Azure Error: ", exception.what()); } +// Copy of private function from Azure SDK +// https://github.com/Azure/azure-sdk-for-cpp/blob/dd236311193c6a3debf3b12c47f14e49a20c72c7/sdk/storage/azure-storage-files-datalake/src/datalake_utilities.cpp#L86-L90 +bool MetadataIncidatesIsDirectory(const Azure::Storage::Metadata& metadata) { + auto ite = metadata.find("hdi_isFolder"); + return ite != metadata.end() && ite->second == "true"; +} + template std::string FormatValue(typename TypeTraits::CType value) { struct StringAppender { @@ -498,7 +505,30 @@ class AzureFileSystem::Impl { exception); } } - return info; + auto blob_client = service_client_->GetBlobContainerClient(path.container) + .GetBlobClient(path.path_to_file); + try { + auto properties = blob_client.GetProperties(); + auto info = FileInfo(path.full_path, FileType::File); + if (MetadataIncidatesIsDirectory(properties.Value.Metadata)) { + info.set_type(FileType::Directory); + } else { + info.set_type(FileType::File); + info.set_size(properties.Value.BlobSize); + } + info.set_mtime( + std::chrono::system_clock::time_point(properties.Value.LastModified)); + return info; + } catch (const Azure::Storage::StorageException& exception) { + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + blob_service_client_->GetBlobContainerClient(path.container) + // TODO: List the `path_to_file/` prefix and consider it a directory if there + // is at least one result. + return FileInfo(path.full_path, FileType::NotFound); + } + return ErrorToStatus( + "When fetching properties for '" + blob_client.GetUrl() + "': ", exception); + } } Result> OpenInputFile(const std::string& s, From fe87c1785185dfb86a6e3024c62e56f407c45938 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 29 Oct 2023 21:19:45 +0000 Subject: [PATCH 07/45] Create a datalake service client and start logic for detecting HNS --- cpp/src/arrow/filesystem/azurefs.cc | 45 ++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 49923f5ad80..20f487c60f7 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -18,6 +18,7 @@ #include "arrow/filesystem/azurefs.h" #include +#include #include "arrow/buffer.h" #include "arrow/filesystem/path_util.h" @@ -460,20 +461,42 @@ class ObjectInputFile final : public io::RandomAccessFile { class AzureFileSystem::Impl { public: io::IOContext io_context_; - std::shared_ptr service_client_; + std::shared_ptr + datalake_service_client_; + std::shared_ptr blob_service_client_; AzureOptions options_; + std::optional is_hierarchical_namespace_enabled_; explicit Impl(AzureOptions options, io::IOContext io_context) - : io_context_(io_context), options_(std::move(options)) {} + : io_context_(io_context), + options_(std::move(options)), + is_hierarchical_namespace_enabled_(std::nullopt) {} Status Init() { - service_client_ = std::make_shared( + blob_service_client_ = std::make_shared( options_.account_blob_url, options_.storage_credentials_provider); + datalake_service_client_ = + std::make_shared( + options_.account_blob_url, options_.storage_credentials_provider); return Status::OK(); } const AzureOptions& options() const { return options_; } + public: + const bool get_is_hierarchical_namespace_enabled(const std::string& container) { + if (!is_hierarchical_namespace_enabled_.has_value()) { + auto path_client = datalake_service_client_->GetFileSystemClient(container); + // try { + path_client.GetAccessPolicy(); + is_hierarchical_namespace_enabled_ = true; + // } catch () { + // is_hierarchical_namespace_enabled_ = false; + // } + } + return is_hierarchical_namespace_enabled_.value(); + } + Result GetFileInfo(const AzurePath& path) { FileInfo info; info.set_path(path.full_path); @@ -489,7 +512,8 @@ class AzureFileSystem::Impl { } if (path.path_to_file.empty()) { // path refers to a container. This is a directory if it exists. - auto container_client = service_client_->GetBlobContainerClient(path.container); + auto container_client = + blob_service_client_->GetBlobContainerClient(path.container); try { auto properties = container_client.GetProperties(); auto info = FileInfo(path.full_path, FileType::Directory); @@ -505,7 +529,7 @@ class AzureFileSystem::Impl { exception); } } - auto blob_client = service_client_->GetBlobContainerClient(path.container) + auto blob_client = blob_service_client_->GetBlobContainerClient(path.container) .GetBlobClient(path.path_to_file); try { auto properties = blob_client.GetProperties(); @@ -521,10 +545,9 @@ class AzureFileSystem::Impl { return info; } catch (const Azure::Storage::StorageException& exception) { if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { - blob_service_client_->GetBlobContainerClient(path.container) - // TODO: List the `path_to_file/` prefix and consider it a directory if there - // is at least one result. - return FileInfo(path.full_path, FileType::NotFound); + // TODO: List the `path_to_file/` prefix and consider it a directory if there + // is at least one result. + return FileInfo(path.full_path, FileType::NotFound); } return ErrorToStatus( "When fetching properties for '" + blob_client.GetUrl() + "': ", exception); @@ -537,7 +560,7 @@ class AzureFileSystem::Impl { ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(s)); RETURN_NOT_OK(ValidateFilePath(path)); auto blob_client = std::make_shared( - service_client_->GetBlobContainerClient(path.container) + blob_service_client_->GetBlobContainerClient(path.container) .GetBlobClient(path.path_to_file)); auto ptr = @@ -558,7 +581,7 @@ class AzureFileSystem::Impl { ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(info.path())); RETURN_NOT_OK(ValidateFilePath(path)); auto blob_client = std::make_shared( - service_client_->GetBlobContainerClient(path.container) + blob_service_client_->GetBlobContainerClient(path.container) .GetBlobClient(path.path_to_file)); auto ptr = std::make_shared(blob_client, fs->io_context(), From 5067a6e663028174cb5df76c24a2c81a5ff94840 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 29 Oct 2023 22:06:10 +0000 Subject: [PATCH 08/45] Mostly correct directory detection --- cpp/src/arrow/filesystem/azurefs.cc | 56 +++++++++++++++++++---------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 20f487c60f7..f3aa962b1d2 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -152,13 +152,6 @@ Status ErrorToStatus(const std::string& prefix, return Status::IOError(prefix, " Azure Error: ", exception.what()); } -// Copy of private function from Azure SDK -// https://github.com/Azure/azure-sdk-for-cpp/blob/dd236311193c6a3debf3b12c47f14e49a20c72c7/sdk/storage/azure-storage-files-datalake/src/datalake_utilities.cpp#L86-L90 -bool MetadataIncidatesIsDirectory(const Azure::Storage::Metadata& metadata) { - auto ite = metadata.find("hdi_isFolder"); - return ite != metadata.end() && ite->second == "true"; -} - template std::string FormatValue(typename TypeTraits::CType value) { struct StringAppender { @@ -483,13 +476,13 @@ class AzureFileSystem::Impl { const AzureOptions& options() const { return options_; } - public: + public: const bool get_is_hierarchical_namespace_enabled(const std::string& container) { if (!is_hierarchical_namespace_enabled_.has_value()) { auto path_client = datalake_service_client_->GetFileSystemClient(container); // try { - path_client.GetAccessPolicy(); - is_hierarchical_namespace_enabled_ = true; + path_client.GetAccessPolicy(); + is_hierarchical_namespace_enabled_ = true; // } catch () { // is_hierarchical_namespace_enabled_ = false; // } @@ -529,28 +522,53 @@ class AzureFileSystem::Impl { exception); } } - auto blob_client = blob_service_client_->GetBlobContainerClient(path.container) - .GetBlobClient(path.path_to_file); + auto file_client = datalake_service_client_->GetFileSystemClient(path.container) + .GetFileClient(path.path_to_file); try { - auto properties = blob_client.GetProperties(); + auto properties = file_client.GetProperties(); auto info = FileInfo(path.full_path, FileType::File); - if (MetadataIncidatesIsDirectory(properties.Value.Metadata)) { + if (properties.Value.IsDirectory) { info.set_type(FileType::Directory); } else { info.set_type(FileType::File); - info.set_size(properties.Value.BlobSize); + info.set_size(properties.Value.FileSize); } info.set_mtime( std::chrono::system_clock::time_point(properties.Value.LastModified)); return info; } catch (const Azure::Storage::StorageException& exception) { if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { - // TODO: List the `path_to_file/` prefix and consider it a directory if there - // is at least one result. - return FileInfo(path.full_path, FileType::NotFound); + if (get_is_hierarchical_namespace_enabled(path.container)) { + // If the hierarchical namespace is enabled, then the storage account will have + // explicit directories. Neither a file nor a directory was found. + return FileInfo(path.full_path, FileType::NotFound); + } + // If the hierarchical namespace is not enabled, then there are no real + // directories. Directories are only implied by using `/` in the blob name. + // We need to detect implied directories by listing. + Azure::Storage::Blobs::ListBlobsOptions list_blob_options; + + // AzurePath::FromString() ensures that path_to_file does not end with a slash. + // If listing the prefix `path.path_to_file + "/"` returns at least one result + // then path refers to an implied directory. + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path.path_to_file)); + list_blob_options.Prefix = path.path_to_file + "/"; + // We only need to know if there is at least one result, so minimise page size + // for efficiency. + list_blob_options.PageSizeHint = 1; + + // TODO: Confirm this will only fetch a single page, for efficiency + // TODO: Return appropriate status if there is an exception. + if (blob_service_client_->GetBlobContainerClient(path.container) + .ListBlobs(list_blob_options) + .HasPage()) { + return FileInfo(path.full_path, FileType::Directory); + } else { + return FileInfo(path.full_path, FileType::NotFound); + } } return ErrorToStatus( - "When fetching properties for '" + blob_client.GetUrl() + "': ", exception); + "When fetching properties for '" + file_client.GetUrl() + "': ", exception); } } From 1416e5719bada1f081240e3fbd512f3dfbca6e75 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 29 Oct 2023 22:25:10 +0000 Subject: [PATCH 09/45] Working directory detection for flat namespace accounts --- cpp/src/arrow/filesystem/azurefs.cc | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index f3aa962b1d2..7f11fc4c306 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -80,18 +80,17 @@ struct AzurePath { "Expected an Azure object path of the form 'container/path...', got a URI: '", s, "'"); } - const auto src = internal::RemoveTrailingSlash(s); - auto first_sep = src.find_first_of(internal::kSep); + auto first_sep = s.find_first_of(internal::kSep); if (first_sep == 0) { return Status::Invalid("Path cannot start with a separator ('", s, "')"); } if (first_sep == std::string::npos) { - return AzurePath{std::string(src), std::string(src), "", {}}; + return AzurePath{std::string(s), std::string(s), "", {}}; } AzurePath path; - path.full_path = std::string(src); - path.container = std::string(src.substr(0, first_sep)); - path.path_to_file = std::string(src.substr(first_sep + 1)); + path.full_path = std::string(s); + path.container = std::string(s.substr(0, first_sep)); + path.path_to_file = std::string(s.substr(first_sep + 1)); path.path_to_file_parts = internal::SplitAbstractPath(path.path_to_file); RETURN_NOT_OK(Validate(path)); return path; @@ -482,7 +481,8 @@ class AzureFileSystem::Impl { auto path_client = datalake_service_client_->GetFileSystemClient(container); // try { path_client.GetAccessPolicy(); - is_hierarchical_namespace_enabled_ = true; + is_hierarchical_namespace_enabled_ = false; + // is_hierarchical_namespace_enabled_ = true; // } catch () { // is_hierarchical_namespace_enabled_ = false; // } @@ -548,20 +548,19 @@ class AzureFileSystem::Impl { // We need to detect implied directories by listing. Azure::Storage::Blobs::ListBlobsOptions list_blob_options; - // AzurePath::FromString() ensures that path_to_file does not end with a slash. - // If listing the prefix `path.path_to_file + "/"` returns at least one result - // then path refers to an implied directory. - ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path.path_to_file)); - list_blob_options.Prefix = path.path_to_file + "/"; + // If listing the prefix `path.path_to_file` with trailing slash returns at least + // one result then path refers to an implied directory. + list_blob_options.Prefix = internal::EnsureTrailingSlash(path.path_to_file); // We only need to know if there is at least one result, so minimise page size // for efficiency. list_blob_options.PageSizeHint = 1; // TODO: Confirm this will only fetch a single page, for efficiency // TODO: Return appropriate status if there is an exception. - if (blob_service_client_->GetBlobContainerClient(path.container) - .ListBlobs(list_blob_options) - .HasPage()) { + auto paged_list_result = + blob_service_client_->GetBlobContainerClient(path.container) + .ListBlobs(list_blob_options); + if (paged_list_result.Blobs.size() > 0) { return FileInfo(path.full_path, FileType::Directory); } else { return FileInfo(path.full_path, FileType::NotFound); From 34598361c7ad3ea22ceb3668040852f9901b8f8f Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 31 Oct 2023 19:26:14 +0000 Subject: [PATCH 10/45] Tidy up --- cpp/src/arrow/filesystem/azurefs.cc | 30 +++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 7f11fc4c306..96fbbd382e7 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -543,27 +543,29 @@ class AzureFileSystem::Impl { // explicit directories. Neither a file nor a directory was found. return FileInfo(path.full_path, FileType::NotFound); } - // If the hierarchical namespace is not enabled, then there are no real - // directories. Directories are only implied by using `/` in the blob name. - // We need to detect implied directories by listing. + // On flat namespace accounts there are no real directories. Directories are only + // implied by using `/` in the blob name. Azure::Storage::Blobs::ListBlobsOptions list_blob_options; // If listing the prefix `path.path_to_file` with trailing slash returns at least - // one result then path refers to an implied directory. - list_blob_options.Prefix = internal::EnsureTrailingSlash(path.path_to_file); + // one result then `path` refers to an implied directory. + auto prefix = internal::EnsureTrailingSlash(path.path_to_file); + list_blob_options.Prefix = prefix; // We only need to know if there is at least one result, so minimise page size // for efficiency. list_blob_options.PageSizeHint = 1; - // TODO: Confirm this will only fetch a single page, for efficiency - // TODO: Return appropriate status if there is an exception. - auto paged_list_result = - blob_service_client_->GetBlobContainerClient(path.container) - .ListBlobs(list_blob_options); - if (paged_list_result.Blobs.size() > 0) { - return FileInfo(path.full_path, FileType::Directory); - } else { - return FileInfo(path.full_path, FileType::NotFound); + try { + auto paged_list_result = + blob_service_client_->GetBlobContainerClient(path.container) + .ListBlobs(list_blob_options); + if (paged_list_result.Blobs.size() > 0) { + return FileInfo(path.full_path, FileType::Directory); + } else { + return FileInfo(path.full_path, FileType::NotFound); + } + } catch (const Azure::Storage::StorageException& exception) { + return ErrorToStatus("When listing blobs for '" + prefix + "': ", exception); } } return ErrorToStatus( From d51b4e6d9d952156ace2b8f2990adee0cf624dfa Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 31 Oct 2023 19:27:04 +0000 Subject: [PATCH 11/45] Add first test against real blob storage for HNS tests --- cpp/src/arrow/filesystem/azurefs_test.cc | 43 +++++++++++++++++++----- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 580f2f88363..d7b53164ad2 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -149,19 +149,21 @@ class TestAzureFileSystem : public ::testing::Test { TestAzureFileSystem() : generator_(std::random_device()()) {} - AzureOptions MakeOptions() { - const std::string& account_name = GetAzuriteEnv()->account_name(); - const std::string& account_key = GetAzuriteEnv()->account_key(); + // virtual std::string GetAccountName() const { return GetAzuriteEnv()->account_name(); + // } virtual std::string GetAccountKey() const { return GetAzuriteEnv()->account_key(); + // } + + virtual AzureOptions MakeOptions() { + EXPECT_THAT(GetAzuriteEnv(), NotNull()); + ARROW_EXPECT_OK(GetAzuriteEnv()->status()); AzureOptions options; options.backend = AzureBackend::Azurite; - ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( + GetAzuriteEnv()->account_name(), GetAzuriteEnv()->account_key())); return options; } void SetUp() override { - ASSERT_THAT(GetAzuriteEnv(), NotNull()); - ASSERT_OK(GetAzuriteEnv()->status()); - container_name_ = RandomChars(32); auto options = MakeOptions(); service_client_ = std::make_shared( @@ -227,6 +229,15 @@ class TestAzureFileSystem : public ::testing::Test { } }; +class TestAzureHNSFileSystem : public TestAzureFileSystem { + virtual AzureOptions MakeOptions() override { + AzureOptions options; + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( + std::getenv("AZURE_HNS_ACCOUNT_NAME"), std::getenv("AZURE_HNS_ACCOUNT_KEY"))); + return options; + } +}; + TEST_F(TestAzureFileSystem, GetFileInfoAccount) { arrow::fs::AssertFileInfo(fs_.get(), "", FileType::Directory); @@ -282,7 +293,7 @@ TEST_F(TestAzureFileSystem, GetFileInfoObjectWithNestedStructure) { FileType::NotFound); } -// TODO: Add ADLS Gen2 directory tests. +// TODO: Add ADLS Gen2 directory tests. TEST_F(TestAzureFileSystem, GetFileInfoObjectNoExplicitObject) { auto object_properties = @@ -300,6 +311,22 @@ TEST_F(TestAzureFileSystem, GetFileInfoObjectNoExplicitObject) { ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingObjectName())); } +TEST_F(TestAzureHNSFileSystem, GetFileInfoObjectNoExplicitObject) { + auto object_properties = + service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlobClient(PreexistingObjectName()) + .GetProperties() + .Value; + + arrow::fs::AssertFileInfo( + fs_.get(), PreexistingObjectPath(), FileType::File, + std::chrono::system_clock::time_point(object_properties.LastModified), + static_cast(object_properties.BlobSize)); + + // URI + ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingObjectName())); +} + TEST_F(TestAzureFileSystem, OpenInputStreamString) { std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); From 4adc5e756bacd6d2a13b6bb7f578f4c235457a7f Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 1 Nov 2023 09:47:24 +0000 Subject: [PATCH 12/45] Main directory detection test for HNS accounts --- cpp/src/arrow/filesystem/azurefs_test.cc | 98 ++++++++++++++++++------ 1 file changed, 75 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index d7b53164ad2..aa15cef9084 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -46,6 +46,7 @@ #include #include #include +#include #include "arrow/filesystem/test_util.h" #include "arrow/testing/gtest_util.h" @@ -143,16 +144,13 @@ TEST(AzureFileSystem, OptionsCompare) { class TestAzureFileSystem : public ::testing::Test { public: std::shared_ptr fs_; - std::shared_ptr service_client_; + std::shared_ptr blob_service_client_; + AzureOptions options_; std::mt19937_64 generator_; std::string container_name_; TestAzureFileSystem() : generator_(std::random_device()()) {} - // virtual std::string GetAccountName() const { return GetAzuriteEnv()->account_name(); - // } virtual std::string GetAccountKey() const { return GetAzuriteEnv()->account_key(); - // } - virtual AzureOptions MakeOptions() { EXPECT_THAT(GetAzuriteEnv(), NotNull()); ARROW_EXPECT_OK(GetAzuriteEnv()->status()); @@ -165,11 +163,11 @@ class TestAzureFileSystem : public ::testing::Test { void SetUp() override { container_name_ = RandomChars(32); - auto options = MakeOptions(); - service_client_ = std::make_shared( - options.account_blob_url, options.storage_credentials_provider); - ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options)); - auto container_client = service_client_->GetBlobContainerClient(container_name_); + options_ = MakeOptions(); + blob_service_client_ = std::make_shared( + options_.account_blob_url, options_.storage_credentials_provider); + ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); + auto container_client = blob_service_client_->GetBlobContainerClient(container_name_); container_client.CreateIfNotExists(); auto blob_client = container_client.GetBlockBlobClient(PreexistingObjectName()); @@ -178,9 +176,10 @@ class TestAzureFileSystem : public ::testing::Test { } void TearDown() override { - auto containers = service_client_->ListBlobContainers(); + auto containers = blob_service_client_->ListBlobContainers(); for (auto container : containers.BlobContainers) { - auto container_client = service_client_->GetBlobContainerClient(container.Name); + auto container_client = + blob_service_client_->GetBlobContainerClient(container.Name); container_client.DeleteIfExists(); } } @@ -221,8 +220,9 @@ class TestAzureFileSystem : public ::testing::Test { void UploadLines(const std::vector& lines, const char* path_to_file, int total_size) { // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. - auto blob_client = service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(path_to_file); + auto blob_client = + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(path_to_file); std::string all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); blob_client.UploadFrom(reinterpret_cast(all_lines.data()), total_size); @@ -230,12 +230,22 @@ class TestAzureFileSystem : public ::testing::Test { }; class TestAzureHNSFileSystem : public TestAzureFileSystem { - virtual AzureOptions MakeOptions() override { + public: + std::shared_ptr + datalake_service_client_; + AzureOptions MakeOptions() override { AzureOptions options; ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( std::getenv("AZURE_HNS_ACCOUNT_NAME"), std::getenv("AZURE_HNS_ACCOUNT_KEY"))); return options; } + + void SetUp() override { + TestAzureFileSystem::SetUp(); + datalake_service_client_ = + std::make_shared( + options_.account_dfs_url, options_.storage_credentials_provider); + } }; TEST_F(TestAzureFileSystem, GetFileInfoAccount) { @@ -259,18 +269,57 @@ TEST_F(TestAzureFileSystem, GetFileInfoObjectWithNestedStructure) { // with directory naming conventions (e.g. with and without slashes). constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. - service_client_->GetBlobContainerClient(PreexistingContainerName()) + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(kObjectName) + .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + + // 0 is immediately after "/" lexicographically, ensure that this doesn't + // cause unexpected issues. + // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient("test-object-dir/some_other_dir0") + .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(std::string(kObjectName) + "0") + .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + + AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName, FileType::File); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName + "/", + FileType::NotFound); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir", + FileType::Directory); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/", + FileType::Directory); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/some_other_dir", + FileType::Directory); + AssertFileInfo(fs_.get(), + PreexistingContainerPath() + "test-object-dir/some_other_dir/", + FileType::Directory); + + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-di", + FileType::NotFound); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/some_other_di", + FileType::NotFound); +} + +TEST_F(TestAzureHNSFileSystem, GetFileInfoObjectWithNestedStructure) { + // Adds detailed tests to handle cases of different edge cases + // with directory naming conventions (e.g. with and without slashes). + constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; + // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlockBlobClient(kObjectName) .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); // 0 is immediately after "/" lexicographically, ensure that this doesn't // cause unexpected issues. // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. - service_client_->GetBlobContainerClient(PreexistingContainerName()) + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlockBlobClient("test-object-dir/some_other_dir0") .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); - service_client_->GetBlobContainerClient(PreexistingContainerName()) + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlockBlobClient(std::string(kObjectName) + "0") .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); @@ -295,9 +344,11 @@ TEST_F(TestAzureFileSystem, GetFileInfoObjectWithNestedStructure) { // TODO: Add ADLS Gen2 directory tests. +// TODO: Check I haven't accidentally changed the purpose of this test. The name seems a +// bit strange. TEST_F(TestAzureFileSystem, GetFileInfoObjectNoExplicitObject) { auto object_properties = - service_client_->GetBlobContainerClient(PreexistingContainerName()) + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlobClient(PreexistingObjectName()) .GetProperties() .Value; @@ -313,7 +364,7 @@ TEST_F(TestAzureFileSystem, GetFileInfoObjectNoExplicitObject) { TEST_F(TestAzureHNSFileSystem, GetFileInfoObjectNoExplicitObject) { auto object_properties = - service_client_->GetBlobContainerClient(PreexistingContainerName()) + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlobClient(PreexistingObjectName()) .GetProperties() .Value; @@ -364,7 +415,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamInfo) { TEST_F(TestAzureFileSystem, OpenInputStreamEmpty) { const auto path_to_file = "empty-object.txt"; const auto path = PreexistingContainerPath() + path_to_file; - service_client_->GetBlobContainerClient(PreexistingContainerName()) + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlockBlobClient(path_to_file) .UploadFrom(nullptr, 0); @@ -552,8 +603,9 @@ TEST_F(TestAzureFileSystem, OpenInputFileIoContext) { const auto path = PreexistingContainerPath() + path_to_file; const std::string contents = "The quick brown fox jumps over the lazy dog"; - auto blob_client = service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(path_to_file); + auto blob_client = + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(path_to_file); blob_client.UploadFrom(reinterpret_cast(contents.data()), contents.length()); From 8de8ddf1c4539a0106e100c26a003f786735aca9 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 1 Nov 2023 09:47:49 +0000 Subject: [PATCH 13/45] Fix handling of file with trailing slash on HNS account --- cpp/src/arrow/filesystem/azurefs.cc | 3 +++ cpp/src/arrow/filesystem/path_util.cc | 9 ++++++++- cpp/src/arrow/filesystem/path_util.h | 3 +++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 96fbbd382e7..ed5bc580de1 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -529,6 +529,9 @@ class AzureFileSystem::Impl { auto info = FileInfo(path.full_path, FileType::File); if (properties.Value.IsDirectory) { info.set_type(FileType::Directory); + } else if (internal::HasTrailingSlash(path.path_to_file)) { + info.set_type(FileType::NotFound); + return info; } else { info.set_type(FileType::File); info.set_size(properties.Value.FileSize); diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index 90af3c66ff8..e3fb655437f 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -191,12 +191,19 @@ std::string_view RemoveLeadingSlash(std::string_view key) { } Status AssertNoTrailingSlash(std::string_view key) { - if (key.back() == '/') { + if (HasTrailingSlash(key)) { return NotAFile(key); } return Status::OK(); } +bool HasTrailingSlash(std::string_view key) { + if (key.back() != '/') { + return false; + } + return true; +} + bool HasLeadingSlash(std::string_view key) { if (key.front() != '/') { return false; diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index 13a74b7fa12..2c8c123e779 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -94,6 +94,9 @@ std::string_view RemoveTrailingSlash(std::string_view s, bool preserve_root = fa ARROW_EXPORT Status AssertNoTrailingSlash(std::string_view s); +ARROW_EXPORT +bool HasTrailingSlash(std::string_view s); + ARROW_EXPORT bool HasLeadingSlash(std::string_view s); From 655ac3b920a3fda6189669744a5e7eef176a920d Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 1 Nov 2023 20:24:14 +0000 Subject: [PATCH 14/45] Test empty directory on HNS account --- cpp/src/arrow/filesystem/azurefs_test.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index aa15cef9084..1847f5a6f81 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -323,6 +323,10 @@ TEST_F(TestAzureHNSFileSystem, GetFileInfoObjectWithNestedStructure) { .GetBlockBlobClient(std::string(kObjectName) + "0") .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + datalake_service_client_->GetFileSystemClient(PreexistingContainerName()) + .GetDirectoryClient("test-empty-object-dir") + .Create(); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName, FileType::File); AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName + "/", FileType::NotFound); @@ -340,6 +344,11 @@ TEST_F(TestAzureHNSFileSystem, GetFileInfoObjectWithNestedStructure) { FileType::NotFound); AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/some_other_di", FileType::NotFound); + + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-empty-object-dir", + FileType::Directory); + + // TODO: Add an assert that it did not use ListBlobs to detect directories. } // TODO: Add ADLS Gen2 directory tests. From 49a7c698666fe679e05703b17e1767cc0e2f8169 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 2 Nov 2023 08:55:41 +0000 Subject: [PATCH 15/45] Working handling of HNS --- cpp/src/arrow/filesystem/azurefs.cc | 35 +++++++++++++++--------- cpp/src/arrow/filesystem/azurefs_test.cc | 11 +++++++- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index ed5bc580de1..ad4a0e21794 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -151,6 +151,10 @@ Status ErrorToStatus(const std::string& prefix, return Status::IOError(prefix, " Azure Error: ", exception.what()); } +bool ContainerOrBlobNotFound(const Azure::Storage::StorageException& exception) { + return exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound; +} + template std::string FormatValue(typename TypeTraits::CType value) { struct StringAppender { @@ -315,8 +319,7 @@ class ObjectInputFile final : public io::RandomAccessFile { metadata_ = PropertiesToMetadata(properties.Value); return Status::OK(); } catch (const Azure::Storage::StorageException& exception) { - if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { - // Could be either container or blob not found. + if (ContainerOrBlobNotFound(exception)) { return PathNotFound(path_); } return ErrorToStatus( @@ -469,7 +472,7 @@ class AzureFileSystem::Impl { options_.account_blob_url, options_.storage_credentials_provider); datalake_service_client_ = std::make_shared( - options_.account_blob_url, options_.storage_credentials_provider); + options_.account_dfs_url, options_.storage_credentials_provider); return Status::OK(); } @@ -478,14 +481,20 @@ class AzureFileSystem::Impl { public: const bool get_is_hierarchical_namespace_enabled(const std::string& container) { if (!is_hierarchical_namespace_enabled_.has_value()) { - auto path_client = datalake_service_client_->GetFileSystemClient(container); - // try { - path_client.GetAccessPolicy(); - is_hierarchical_namespace_enabled_ = false; - // is_hierarchical_namespace_enabled_ = true; - // } catch () { - // is_hierarchical_namespace_enabled_ = false; - // } + try { + datalake_service_client_->GetFileSystemClient(container) + .GetDirectoryClient("/") + .GetAccessControlList(); + is_hierarchical_namespace_enabled_ = true; + } catch (const Azure::Storage::StorageException& exception) { + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::BadRequest) { + // The container exists, but the access control list (ACL) endpoint responded + // bad request because this storage account does not support hierarchical namespace. + is_hierarchical_namespace_enabled_ = false; + } else { + throw exception; + } + } } return is_hierarchical_namespace_enabled_.value(); } @@ -514,7 +523,7 @@ class AzureFileSystem::Impl { std::chrono::system_clock::time_point(properties.Value.LastModified)); return info; } catch (const Azure::Storage::StorageException& exception) { - if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + if (ContainerOrBlobNotFound(exception)) { return FileInfo(path.full_path, FileType::NotFound); } return ErrorToStatus( @@ -540,7 +549,7 @@ class AzureFileSystem::Impl { std::chrono::system_clock::time_point(properties.Value.LastModified)); return info; } catch (const Azure::Storage::StorageException& exception) { - if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + if (ContainerOrBlobNotFound(exception)) { if (get_is_hierarchical_namespace_enabled(path.container)) { // If the hierarchical namespace is enabled, then the storage account will have // explicit directories. Neither a file nor a directory was found. diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 1847f5a6f81..3c8b49d437c 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -229,6 +229,15 @@ class TestAzureFileSystem : public ::testing::Test { } }; +class TestAzureFlatFileSystem : public TestAzureFileSystem { + AzureOptions MakeOptions() override { + AzureOptions options; + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( + std::getenv("AZURE_FLAT_ACCOUNT_NAME"), std::getenv("AZURE_FLAT_ACCOUNT_KEY"))); + return options; + } +}; + class TestAzureHNSFileSystem : public TestAzureFileSystem { public: std::shared_ptr @@ -264,7 +273,7 @@ TEST_F(TestAzureFileSystem, GetFileInfoContainer) { ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingContainerName())); } -TEST_F(TestAzureFileSystem, GetFileInfoObjectWithNestedStructure) { +TEST_F(TestAzureFlatFileSystem, GetFileInfoObjectWithNestedStructure) { // Adds detailed tests to handle cases of different edge cases // with directory naming conventions (e.g. with and without slashes). constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; From cc6f156fe7c77f5bc7dc39f14dbc46164d2dc12f Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 2 Nov 2023 10:17:01 +0000 Subject: [PATCH 16/45] Naive move hierarchical namespace detection --- cpp/src/arrow/filesystem/azurefs.cc | 65 ++++++++++++++++++----------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index ad4a0e21794..2046c4b03cd 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -448,6 +448,40 @@ class ObjectInputFile final : public io::RandomAccessFile { std::shared_ptr metadata_; }; +class HierachicalNamespaceDetecter { + public: + HierachicalNamespaceDetecter( + Azure::Storage::Files::DataLake::DataLakeServiceClient datalake_service_client) + : datalake_service_client_(datalake_service_client) {} + bool Enabled(const std::string& container) { + if (is_hierachical_namespace_enabled_.has_value()) { + return is_hierachical_namespace_enabled_.value(); + } + try { + datalake_service_client_.GetFileSystemClient(container) + .GetDirectoryClient("/") + .GetAccessControlList(); + is_hierachical_namespace_enabled_ = true; + } catch (const Azure::Storage::StorageException& exception) { + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::BadRequest) { + // The container exists, but the access control list (ACL) endpoint responded + // bad request because this storage account does not support hierarchical + // namespace. + is_hierachical_namespace_enabled_ = false; + } else { + // Something else failed so we can't tell if the storage account has hierachical + // namespace. + throw exception; + } + } + return is_hierachical_namespace_enabled_.value(); + } + + private: + Azure::Storage::Files::DataLake::DataLakeServiceClient datalake_service_client_; + std::optional is_hierachical_namespace_enabled_; +}; + } // namespace // ----------------------------------------------------------------------- @@ -460,12 +494,10 @@ class AzureFileSystem::Impl { datalake_service_client_; std::shared_ptr blob_service_client_; AzureOptions options_; - std::optional is_hierarchical_namespace_enabled_; + std::shared_ptr hierarchical_namespace = nullptr; explicit Impl(AzureOptions options, io::IOContext io_context) - : io_context_(io_context), - options_(std::move(options)), - is_hierarchical_namespace_enabled_(std::nullopt) {} + : io_context_(io_context), options_(std::move(options)) {} Status Init() { blob_service_client_ = std::make_shared( @@ -473,32 +505,15 @@ class AzureFileSystem::Impl { datalake_service_client_ = std::make_shared( options_.account_dfs_url, options_.storage_credentials_provider); + hierarchical_namespace = std::make_shared( + Azure::Storage::Files::DataLake::DataLakeServiceClient( + options_.account_dfs_url, options_.storage_credentials_provider)); return Status::OK(); } const AzureOptions& options() const { return options_; } public: - const bool get_is_hierarchical_namespace_enabled(const std::string& container) { - if (!is_hierarchical_namespace_enabled_.has_value()) { - try { - datalake_service_client_->GetFileSystemClient(container) - .GetDirectoryClient("/") - .GetAccessControlList(); - is_hierarchical_namespace_enabled_ = true; - } catch (const Azure::Storage::StorageException& exception) { - if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::BadRequest) { - // The container exists, but the access control list (ACL) endpoint responded - // bad request because this storage account does not support hierarchical namespace. - is_hierarchical_namespace_enabled_ = false; - } else { - throw exception; - } - } - } - return is_hierarchical_namespace_enabled_.value(); - } - Result GetFileInfo(const AzurePath& path) { FileInfo info; info.set_path(path.full_path); @@ -550,7 +565,7 @@ class AzureFileSystem::Impl { return info; } catch (const Azure::Storage::StorageException& exception) { if (ContainerOrBlobNotFound(exception)) { - if (get_is_hierarchical_namespace_enabled(path.container)) { + if (hierarchical_namespace->Enabled(path.container)) { // If the hierarchical namespace is enabled, then the storage account will have // explicit directories. Neither a file nor a directory was found. return FileInfo(path.full_path, FileType::NotFound); From b3ea9154cf4db4bf7d5e2fbf49adb851fd58ab02 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 2 Nov 2023 10:20:18 +0000 Subject: [PATCH 17/45] Avoid copying the datalake client --- cpp/src/arrow/filesystem/azurefs.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 2046c4b03cd..1f6e6329327 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -451,14 +451,15 @@ class ObjectInputFile final : public io::RandomAccessFile { class HierachicalNamespaceDetecter { public: HierachicalNamespaceDetecter( - Azure::Storage::Files::DataLake::DataLakeServiceClient datalake_service_client) + std::shared_ptr + datalake_service_client) : datalake_service_client_(datalake_service_client) {} bool Enabled(const std::string& container) { if (is_hierachical_namespace_enabled_.has_value()) { return is_hierachical_namespace_enabled_.value(); } try { - datalake_service_client_.GetFileSystemClient(container) + datalake_service_client_->GetFileSystemClient(container) .GetDirectoryClient("/") .GetAccessControlList(); is_hierachical_namespace_enabled_ = true; @@ -478,7 +479,8 @@ class HierachicalNamespaceDetecter { } private: - Azure::Storage::Files::DataLake::DataLakeServiceClient datalake_service_client_; + std::shared_ptr + datalake_service_client_; std::optional is_hierachical_namespace_enabled_; }; @@ -505,9 +507,8 @@ class AzureFileSystem::Impl { datalake_service_client_ = std::make_shared( options_.account_dfs_url, options_.storage_credentials_provider); - hierarchical_namespace = std::make_shared( - Azure::Storage::Files::DataLake::DataLakeServiceClient( - options_.account_dfs_url, options_.storage_credentials_provider)); + hierarchical_namespace = + std::make_shared(datalake_service_client_); return Status::OK(); } From 379090a6c46b28fcfeab286b3b6854de87c0b57d Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 2 Nov 2023 20:32:31 +0000 Subject: [PATCH 18/45] Tests for HierachicalNamespaceDetecter --- cpp/src/arrow/filesystem/azurefs.cc | 74 ++++++++++++------------ cpp/src/arrow/filesystem/azurefs_test.cc | 25 ++++---- 2 files changed, 52 insertions(+), 47 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 1f6e6329327..12a2a00b8b7 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -60,6 +60,43 @@ Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& account_n credentials_kind = AzureCredentialsKind::StorageCredentials; return Status::OK(); } + +class HierachicalNamespaceDetecter { + public: + HierachicalNamespaceDetecter( + std::shared_ptr + datalake_service_client) + : datalake_service_client_(datalake_service_client) {} + bool Enabled(const std::string& container) { + if (is_hierachical_namespace_enabled_.has_value()) { + return is_hierachical_namespace_enabled_.value(); + } + try { + datalake_service_client_->GetFileSystemClient(container) + .GetDirectoryClient("/") + .GetAccessControlList(); + is_hierachical_namespace_enabled_ = true; + } catch (const Azure::Storage::StorageException& exception) { + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::BadRequest) { + // The container exists, but the access control list (ACL) endpoint responded + // bad request because this storage account does not support hierarchical + // namespace. + is_hierachical_namespace_enabled_ = false; + } else { + // Something else failed so we can't tell if the storage account has hierachical + // namespace. + throw exception; + } + } + return is_hierachical_namespace_enabled_.value(); + } + + private: + std::shared_ptr + datalake_service_client_; + std::optional is_hierachical_namespace_enabled_; +}; + namespace { // An AzureFileSystem represents a single Azure storage account. AzurePath describes a @@ -447,43 +484,6 @@ class ObjectInputFile final : public io::RandomAccessFile { int64_t content_length_ = kNoSize; std::shared_ptr metadata_; }; - -class HierachicalNamespaceDetecter { - public: - HierachicalNamespaceDetecter( - std::shared_ptr - datalake_service_client) - : datalake_service_client_(datalake_service_client) {} - bool Enabled(const std::string& container) { - if (is_hierachical_namespace_enabled_.has_value()) { - return is_hierachical_namespace_enabled_.value(); - } - try { - datalake_service_client_->GetFileSystemClient(container) - .GetDirectoryClient("/") - .GetAccessControlList(); - is_hierachical_namespace_enabled_ = true; - } catch (const Azure::Storage::StorageException& exception) { - if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::BadRequest) { - // The container exists, but the access control list (ACL) endpoint responded - // bad request because this storage account does not support hierarchical - // namespace. - is_hierachical_namespace_enabled_ = false; - } else { - // Something else failed so we can't tell if the storage account has hierachical - // namespace. - throw exception; - } - } - return is_hierachical_namespace_enabled_.value(); - } - - private: - std::shared_ptr - datalake_service_client_; - std::optional is_hierachical_namespace_enabled_; -}; - } // namespace // ----------------------------------------------------------------------- diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 3c8b49d437c..2c12bb116bf 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -34,6 +34,7 @@ #include #include "arrow/filesystem/azurefs.h" +#include "arrow/filesystem/azurefs.cc" #include #include @@ -145,6 +146,7 @@ class TestAzureFileSystem : public ::testing::Test { public: std::shared_ptr fs_; std::shared_ptr blob_service_client_; + std::shared_ptr datalake_service_client_; AzureOptions options_; std::mt19937_64 generator_; std::string container_name_; @@ -166,6 +168,9 @@ class TestAzureFileSystem : public ::testing::Test { options_ = MakeOptions(); blob_service_client_ = std::make_shared( options_.account_blob_url, options_.storage_credentials_provider); + datalake_service_client_ = + std::make_shared( + options_.account_dfs_url, options_.storage_credentials_provider); ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); auto container_client = blob_service_client_->GetBlobContainerClient(container_name_); container_client.CreateIfNotExists(); @@ -239,24 +244,24 @@ class TestAzureFlatFileSystem : public TestAzureFileSystem { }; class TestAzureHNSFileSystem : public TestAzureFileSystem { - public: - std::shared_ptr - datalake_service_client_; AzureOptions MakeOptions() override { AzureOptions options; ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( std::getenv("AZURE_HNS_ACCOUNT_NAME"), std::getenv("AZURE_HNS_ACCOUNT_KEY"))); return options; } - - void SetUp() override { - TestAzureFileSystem::SetUp(); - datalake_service_client_ = - std::make_shared( - options_.account_dfs_url, options_.storage_credentials_provider); - } }; +TEST_F(TestAzureFlatFileSystem, DetectHierarchicalNamespace) { + auto hierarchical_namespace = HierachicalNamespaceDetecter(datalake_service_client_); + EXPECT_FALSE(hierarchical_namespace.Enabled(PreexistingContainerName())); +} + +TEST_F(TestAzureHNSFileSystem, DetectHierarchicalNamespace) { + auto hierarchical_namespace = HierachicalNamespaceDetecter(datalake_service_client_); + EXPECT_TRUE(hierarchical_namespace.Enabled(PreexistingContainerName())); +} + TEST_F(TestAzureFileSystem, GetFileInfoAccount) { arrow::fs::AssertFileInfo(fs_.get(), "", FileType::Directory); From 2f5918364413cd2bab3157004336d95ad386d93d Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 2 Nov 2023 20:41:53 +0000 Subject: [PATCH 19/45] Working HNS detection vs flat with soft delete --- cpp/src/arrow/filesystem/azurefs.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 12a2a00b8b7..439db00f093 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -77,10 +77,12 @@ class HierachicalNamespaceDetecter { .GetAccessControlList(); is_hierachical_namespace_enabled_ = true; } catch (const Azure::Storage::StorageException& exception) { - if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::BadRequest) { - // The container exists, but the access control list (ACL) endpoint responded - // bad request because this storage account does not support hierarchical - // namespace. + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::BadRequest || + exception.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) { + // GetAccessControlList will fail on storage accounts without hierarchical + // namespace enabled. If soft delete blobs is enabled it returns Conflict, + // otherwise it returns BadRequest. + // On Azureite it returns NotFound. is_hierachical_namespace_enabled_ = false; } else { // Something else failed so we can't tell if the storage account has hierachical From 5906a6a43392f8820a680ec1ea651715b6f58b48 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 2 Nov 2023 23:06:52 +0000 Subject: [PATCH 20/45] HierachicalNamespaceDetecter.Enable returns Status, supports azurite and more tests --- cpp/src/arrow/filesystem/azurefs.cc | 58 +++++++++++++++++------- cpp/src/arrow/filesystem/azurefs_test.cc | 19 ++++++-- 2 files changed, 57 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 439db00f093..7e38f8fc03e 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -61,33 +61,62 @@ Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& account_n return Status::OK(); } +Status ErrorToStatus(const std::string& prefix, + const Azure::Storage::StorageException& exception) { + return Status::IOError(prefix, " Azure Error: ", exception.what()); +} + class HierachicalNamespaceDetecter { public: HierachicalNamespaceDetecter( std::shared_ptr datalake_service_client) : datalake_service_client_(datalake_service_client) {} - bool Enabled(const std::string& container) { + Result Enabled(const std::string& container) { if (is_hierachical_namespace_enabled_.has_value()) { return is_hierachical_namespace_enabled_.value(); } + + // This approach is inspired by hadoop-azure + // https://github.com/apache/hadoop/blob/7c6af6a5f626d18d68b656d085cc23e4c1f7a1ef/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystemStore.java#L356. + // Unfortunately `blob_service_client->GetAccountInfo()` requires significantly + // elevated permissions. + // https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-service-properties?tabs=azure-ad#authorization + auto filesystem_client = datalake_service_client_->GetFileSystemClient(container); + auto directory_client = filesystem_client.GetDirectoryClient("/"); try { - datalake_service_client_->GetFileSystemClient(container) - .GetDirectoryClient("/") - .GetAccessControlList(); + directory_client.GetAccessControlList(); is_hierachical_namespace_enabled_ = true; } catch (const Azure::Storage::StorageException& exception) { + // GetAccessControlList will fail on storage accounts without hierarchical + // namespace enabled. + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::BadRequest || exception.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) { - // GetAccessControlList will fail on storage accounts without hierarchical - // namespace enabled. If soft delete blobs is enabled it returns Conflict, - // otherwise it returns BadRequest. - // On Azureite it returns NotFound. + // Flat namespace storage accounts with soft delete enabled return + // Conflict - This endpoint does not support BlobStorageEvents or SoftDelete + // otherwise it returns: BadRequest - This operation is only supported on a + // hierarchical namespace account. is_hierachical_namespace_enabled_ = false; + } else if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + // Azurite returns NotFound. + try { + // Ensure sure that the directory exists by checking its properties. If it + // doesn't then the GetAccessControlList check was invalid and we can't tell + // if the storage account has hierachical namespace. + filesystem_client.GetProperties(); + is_hierachical_namespace_enabled_ = false; + } catch (const Azure::Storage::StorageException& exception) { + return ErrorToStatus( + "When getting properties '" + filesystem_client.GetUrl() + "': ", + exception); + } } else { - // Something else failed so we can't tell if the storage account has hierachical + // Unexpected error so we can't tell if the storage account has hierachical // namespace. - throw exception; + return ErrorToStatus( + "When getting access control list '" + directory_client.GetUrl() + "': ", + exception); } } return is_hierachical_namespace_enabled_.value(); @@ -185,11 +214,6 @@ Status ValidateFilePath(const AzurePath& path) { return Status::OK(); } -Status ErrorToStatus(const std::string& prefix, - const Azure::Storage::StorageException& exception) { - return Status::IOError(prefix, " Azure Error: ", exception.what()); -} - bool ContainerOrBlobNotFound(const Azure::Storage::StorageException& exception) { return exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound; } @@ -568,7 +592,9 @@ class AzureFileSystem::Impl { return info; } catch (const Azure::Storage::StorageException& exception) { if (ContainerOrBlobNotFound(exception)) { - if (hierarchical_namespace->Enabled(path.container)) { + ARROW_ASSIGN_OR_RAISE(bool hierarchical_namespace_enabled, + hierarchical_namespace->Enabled(path.container)); + if (hierarchical_namespace_enabled) { // If the hierarchical namespace is enabled, then the storage account will have // explicit directories. Neither a file nor a directory was found. return FileInfo(path.full_path, FileType::NotFound); diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 2c12bb116bf..09d444d6ea7 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -33,8 +33,8 @@ // cpp/cmake_modules/ThirdpartyToolchain.cmake for details. #include -#include "arrow/filesystem/azurefs.h" #include "arrow/filesystem/azurefs.cc" +#include "arrow/filesystem/azurefs.h" #include #include @@ -146,7 +146,8 @@ class TestAzureFileSystem : public ::testing::Test { public: std::shared_ptr fs_; std::shared_ptr blob_service_client_; - std::shared_ptr datalake_service_client_; + std::shared_ptr + datalake_service_client_; AzureOptions options_; std::mt19937_64 generator_; std::string container_name_; @@ -254,12 +255,22 @@ class TestAzureHNSFileSystem : public TestAzureFileSystem { TEST_F(TestAzureFlatFileSystem, DetectHierarchicalNamespace) { auto hierarchical_namespace = HierachicalNamespaceDetecter(datalake_service_client_); - EXPECT_FALSE(hierarchical_namespace.Enabled(PreexistingContainerName())); + ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); } TEST_F(TestAzureHNSFileSystem, DetectHierarchicalNamespace) { auto hierarchical_namespace = HierachicalNamespaceDetecter(datalake_service_client_); - EXPECT_TRUE(hierarchical_namespace.Enabled(PreexistingContainerName())); + ASSERT_OK_AND_EQ(true, hierarchical_namespace.Enabled(PreexistingContainerName())); +} + +TEST_F(TestAzureFileSystem, DetectHierarchicalNamespace) { + auto hierarchical_namespace = HierachicalNamespaceDetecter(datalake_service_client_); + ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); +} + +TEST_F(TestAzureFileSystem, DetectHierarchicalNamespaceFailsWithMissingContainer) { + auto hierarchical_namespace = HierachicalNamespaceDetecter(datalake_service_client_); + ASSERT_NOT_OK(hierarchical_namespace.Enabled("non-existent-container")); } TEST_F(TestAzureFileSystem, GetFileInfoAccount) { From a030a998ed7f31b7ecfc5399a02e76a06d1c36d7 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 2 Nov 2023 23:06:06 +0000 Subject: [PATCH 21/45] Addresss TODO(GH-38335)s for using GetFileInfo in tests --- cpp/src/arrow/filesystem/azurefs_test.cc | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 09d444d6ea7..5d4ab67fb46 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -435,9 +435,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamStringBuffers) { } TEST_F(TestAzureFileSystem, OpenInputStreamInfo) { - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(PreexistingObjectPath())); - arrow::fs::FileInfo info(PreexistingObjectPath(), FileType::File); + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingObjectPath())); std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(info)); @@ -465,14 +463,10 @@ TEST_F(TestAzureFileSystem, OpenInputStreamNotFound) { } TEST_F(TestAzureFileSystem, OpenInputStreamInfoInvalid) { - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(PreexistingContainerPath())); - arrow::fs::FileInfo info(PreexistingContainerPath(), FileType::Directory); + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingContainerPath())); ASSERT_RAISES(IOError, fs_->OpenInputStream(info)); - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(NotFoundObjectPath())); - arrow::fs::FileInfo info2(PreexistingContainerPath(), FileType::NotFound); + ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(NotFoundObjectPath())); ASSERT_RAISES(IOError, fs_->OpenInputStream(info2)); } @@ -649,9 +643,7 @@ TEST_F(TestAzureFileSystem, OpenInputFileIoContext) { } TEST_F(TestAzureFileSystem, OpenInputFileInfo) { - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(PreexistingObjectPath())); - arrow::fs::FileInfo info(PreexistingObjectPath(), FileType::File); + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingObjectPath())); std::shared_ptr file; ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(info)); @@ -670,14 +662,10 @@ TEST_F(TestAzureFileSystem, OpenInputFileNotFound) { } TEST_F(TestAzureFileSystem, OpenInputFileInfoInvalid) { - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(PreexistingContainerPath())); - arrow::fs::FileInfo info(PreexistingContainerPath(), FileType::File); + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingContainerPath())); ASSERT_RAISES(IOError, fs_->OpenInputFile(info)); - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(NotFoundObjectPath())); - arrow::fs::FileInfo info2(NotFoundObjectPath(), FileType::NotFound); + ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(NotFoundObjectPath())); ASSERT_RAISES(IOError, fs_->OpenInputFile(info2)); } From 147b9a00c949637ed609ce776a9b25c8c5b2e234 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 3 Nov 2023 09:28:58 +0000 Subject: [PATCH 22/45] Avoid holding reference to service client in HierachicalNamespaceDetecter --- cpp/src/arrow/filesystem/azurefs.cc | 20 +++++++------------- cpp/src/arrow/filesystem/azurefs_test.cc | 23 +++++++++++++++-------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 7e38f8fc03e..8a2b9dc07f7 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -68,11 +68,8 @@ Status ErrorToStatus(const std::string& prefix, class HierachicalNamespaceDetecter { public: - HierachicalNamespaceDetecter( - std::shared_ptr - datalake_service_client) - : datalake_service_client_(datalake_service_client) {} - Result Enabled(const std::string& container) { + Result Enabled( + Azure::Storage::Files::DataLake::DataLakeFileSystemClient filesystem_client) { if (is_hierachical_namespace_enabled_.has_value()) { return is_hierachical_namespace_enabled_.value(); } @@ -82,7 +79,6 @@ class HierachicalNamespaceDetecter { // Unfortunately `blob_service_client->GetAccountInfo()` requires significantly // elevated permissions. // https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-service-properties?tabs=azure-ad#authorization - auto filesystem_client = datalake_service_client_->GetFileSystemClient(container); auto directory_client = filesystem_client.GetDirectoryClient("/"); try { directory_client.GetAccessControlList(); @@ -123,8 +119,6 @@ class HierachicalNamespaceDetecter { } private: - std::shared_ptr - datalake_service_client_; std::optional is_hierachical_namespace_enabled_; }; @@ -522,7 +516,7 @@ class AzureFileSystem::Impl { datalake_service_client_; std::shared_ptr blob_service_client_; AzureOptions options_; - std::shared_ptr hierarchical_namespace = nullptr; + HierachicalNamespaceDetecter hierarchical_namespace_; explicit Impl(AzureOptions options, io::IOContext io_context) : io_context_(io_context), options_(std::move(options)) {} @@ -533,8 +527,6 @@ class AzureFileSystem::Impl { datalake_service_client_ = std::make_shared( options_.account_dfs_url, options_.storage_credentials_provider); - hierarchical_namespace = - std::make_shared(datalake_service_client_); return Status::OK(); } @@ -592,8 +584,10 @@ class AzureFileSystem::Impl { return info; } catch (const Azure::Storage::StorageException& exception) { if (ContainerOrBlobNotFound(exception)) { - ARROW_ASSIGN_OR_RAISE(bool hierarchical_namespace_enabled, - hierarchical_namespace->Enabled(path.container)); + ARROW_ASSIGN_OR_RAISE( + bool hierarchical_namespace_enabled, + hierarchical_namespace_.Enabled( + datalake_service_client_->GetFileSystemClient(path.container))); if (hierarchical_namespace_enabled) { // If the hierarchical namespace is enabled, then the storage account will have // explicit directories. Neither a file nor a directory was found. diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 5d4ab67fb46..0c99ec8f21c 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -254,23 +254,30 @@ class TestAzureHNSFileSystem : public TestAzureFileSystem { }; TEST_F(TestAzureFlatFileSystem, DetectHierarchicalNamespace) { - auto hierarchical_namespace = HierachicalNamespaceDetecter(datalake_service_client_); - ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); + auto hierarchical_namespace = HierachicalNamespaceDetecter(); + ASSERT_OK_AND_EQ( + false, hierarchical_namespace.Enabled(datalake_service_client_->GetFileSystemClient( + PreexistingContainerName()))); } TEST_F(TestAzureHNSFileSystem, DetectHierarchicalNamespace) { - auto hierarchical_namespace = HierachicalNamespaceDetecter(datalake_service_client_); - ASSERT_OK_AND_EQ(true, hierarchical_namespace.Enabled(PreexistingContainerName())); + auto hierarchical_namespace = HierachicalNamespaceDetecter(); + ASSERT_OK_AND_EQ( + true, hierarchical_namespace.Enabled(datalake_service_client_->GetFileSystemClient( + PreexistingContainerName()))); } TEST_F(TestAzureFileSystem, DetectHierarchicalNamespace) { - auto hierarchical_namespace = HierachicalNamespaceDetecter(datalake_service_client_); - ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); + auto hierarchical_namespace = HierachicalNamespaceDetecter(); + ASSERT_OK_AND_EQ( + false, hierarchical_namespace.Enabled(datalake_service_client_->GetFileSystemClient( + PreexistingContainerName()))); } TEST_F(TestAzureFileSystem, DetectHierarchicalNamespaceFailsWithMissingContainer) { - auto hierarchical_namespace = HierachicalNamespaceDetecter(datalake_service_client_); - ASSERT_NOT_OK(hierarchical_namespace.Enabled("non-existent-container")); + auto hierarchical_namespace = HierachicalNamespaceDetecter(); + ASSERT_NOT_OK(hierarchical_namespace.Enabled( + datalake_service_client_->GetFileSystemClient("non-existent-container"))); } TEST_F(TestAzureFileSystem, GetFileInfoAccount) { From d94f961842d75748d227063cc00a8b5d04d3ce31 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 5 Nov 2023 12:58:04 +0000 Subject: [PATCH 23/45] Tidy --- cpp/src/arrow/filesystem/azurefs.cc | 2 -- cpp/src/arrow/filesystem/azurefs_test.cc | 10 ++-------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 8a2b9dc07f7..f40d7290d74 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -542,8 +542,6 @@ class AzureFileSystem::Impl { // but not path_to_file. // path must refer to the root of the Azure storage account. This is a directory, // and there isn't any extra metadata to fetch. - // TODO: Confirm if we need to check that the storage account exists. I think there - // would have been an earlier failure. return FileInfo(path.full_path, FileType::Directory); } if (path.path_to_file.empty()) { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 0c99ec8f21c..6b1320bb908 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -379,15 +379,9 @@ TEST_F(TestAzureHNSFileSystem, GetFileInfoObjectWithNestedStructure) { AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-empty-object-dir", FileType::Directory); - - // TODO: Add an assert that it did not use ListBlobs to detect directories. } -// TODO: Add ADLS Gen2 directory tests. - -// TODO: Check I haven't accidentally changed the purpose of this test. The name seems a -// bit strange. -TEST_F(TestAzureFileSystem, GetFileInfoObjectNoExplicitObject) { +TEST_F(TestAzureFileSystem, GetFileInfoObject) { auto object_properties = blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlobClient(PreexistingObjectName()) @@ -403,7 +397,7 @@ TEST_F(TestAzureFileSystem, GetFileInfoObjectNoExplicitObject) { ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingObjectName())); } -TEST_F(TestAzureHNSFileSystem, GetFileInfoObjectNoExplicitObject) { +TEST_F(TestAzureHNSFileSystem, GetFileInfoObject) { auto object_properties = blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlobClient(PreexistingObjectName()) From b9e8991f7c61151ce240fad20d5617f3bd0469e8 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 5 Nov 2023 13:51:19 +0000 Subject: [PATCH 24/45] Move some logic to azurefs_internal --- cpp/src/arrow/filesystem/azurefs.cc | 81 +++----------------- cpp/src/arrow/filesystem/azurefs_internal.cc | 67 ++++++++++++++++ cpp/src/arrow/filesystem/azurefs_internal.h | 26 +++++++ cpp/src/arrow/filesystem/azurefs_test.cc | 10 +-- 4 files changed, 109 insertions(+), 75 deletions(-) create mode 100644 cpp/src/arrow/filesystem/azurefs_internal.cc create mode 100644 cpp/src/arrow/filesystem/azurefs_internal.h diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index f40d7290d74..5b41ff8bb88 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -16,6 +16,7 @@ // under the License. #include "arrow/filesystem/azurefs.h" +#include "arrow/filesystem/azurefs_internal.h" #include #include @@ -61,67 +62,6 @@ Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& account_n return Status::OK(); } -Status ErrorToStatus(const std::string& prefix, - const Azure::Storage::StorageException& exception) { - return Status::IOError(prefix, " Azure Error: ", exception.what()); -} - -class HierachicalNamespaceDetecter { - public: - Result Enabled( - Azure::Storage::Files::DataLake::DataLakeFileSystemClient filesystem_client) { - if (is_hierachical_namespace_enabled_.has_value()) { - return is_hierachical_namespace_enabled_.value(); - } - - // This approach is inspired by hadoop-azure - // https://github.com/apache/hadoop/blob/7c6af6a5f626d18d68b656d085cc23e4c1f7a1ef/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystemStore.java#L356. - // Unfortunately `blob_service_client->GetAccountInfo()` requires significantly - // elevated permissions. - // https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-service-properties?tabs=azure-ad#authorization - auto directory_client = filesystem_client.GetDirectoryClient("/"); - try { - directory_client.GetAccessControlList(); - is_hierachical_namespace_enabled_ = true; - } catch (const Azure::Storage::StorageException& exception) { - // GetAccessControlList will fail on storage accounts without hierarchical - // namespace enabled. - - if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::BadRequest || - exception.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) { - // Flat namespace storage accounts with soft delete enabled return - // Conflict - This endpoint does not support BlobStorageEvents or SoftDelete - // otherwise it returns: BadRequest - This operation is only supported on a - // hierarchical namespace account. - is_hierachical_namespace_enabled_ = false; - } else if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { - // Azurite returns NotFound. - try { - // Ensure sure that the directory exists by checking its properties. If it - // doesn't then the GetAccessControlList check was invalid and we can't tell - // if the storage account has hierachical namespace. - filesystem_client.GetProperties(); - is_hierachical_namespace_enabled_ = false; - } catch (const Azure::Storage::StorageException& exception) { - return ErrorToStatus( - "When getting properties '" + filesystem_client.GetUrl() + "': ", - exception); - } - } else { - // Unexpected error so we can't tell if the storage account has hierachical - // namespace. - return ErrorToStatus( - "When getting access control list '" + directory_client.GetUrl() + "': ", - exception); - } - } - return is_hierachical_namespace_enabled_.value(); - } - - private: - std::optional is_hierachical_namespace_enabled_; -}; - namespace { // An AzureFileSystem represents a single Azure storage account. AzurePath describes a @@ -379,7 +319,7 @@ class ObjectInputFile final : public io::RandomAccessFile { if (ContainerOrBlobNotFound(exception)) { return PathNotFound(path_); } - return ErrorToStatus( + return internal::ErrorToStatus( "When fetching properties for '" + blob_client_->GetUrl() + "': ", exception); } } @@ -457,10 +397,10 @@ class ObjectInputFile final : public io::RandomAccessFile { ->DownloadTo(reinterpret_cast(out), nbytes, download_options) .Value.ContentRange.Length.Value(); } catch (const Azure::Storage::StorageException& exception) { - return ErrorToStatus("When reading from '" + blob_client_->GetUrl() + - "' at position " + std::to_string(position) + " for " + - std::to_string(nbytes) + " bytes: ", - exception); + return internal::ErrorToStatus("When reading from '" + blob_client_->GetUrl() + + "' at position " + std::to_string(position) + + " for " + std::to_string(nbytes) + " bytes: ", + exception); } } @@ -516,7 +456,7 @@ class AzureFileSystem::Impl { datalake_service_client_; std::shared_ptr blob_service_client_; AzureOptions options_; - HierachicalNamespaceDetecter hierarchical_namespace_; + internal::HierachicalNamespaceDetecter hierarchical_namespace_; explicit Impl(AzureOptions options, io::IOContext io_context) : io_context_(io_context), options_(std::move(options)) {} @@ -558,7 +498,7 @@ class AzureFileSystem::Impl { if (ContainerOrBlobNotFound(exception)) { return FileInfo(path.full_path, FileType::NotFound); } - return ErrorToStatus( + return internal::ErrorToStatus( "When fetching properties for '" + container_client.GetUrl() + "': ", exception); } @@ -613,10 +553,11 @@ class AzureFileSystem::Impl { return FileInfo(path.full_path, FileType::NotFound); } } catch (const Azure::Storage::StorageException& exception) { - return ErrorToStatus("When listing blobs for '" + prefix + "': ", exception); + return internal::ErrorToStatus("When listing blobs for '" + prefix + "': ", + exception); } } - return ErrorToStatus( + return internal::ErrorToStatus( "When fetching properties for '" + file_client.GetUrl() + "': ", exception); } } diff --git a/cpp/src/arrow/filesystem/azurefs_internal.cc b/cpp/src/arrow/filesystem/azurefs_internal.cc new file mode 100644 index 00000000000..8792c1b549d --- /dev/null +++ b/cpp/src/arrow/filesystem/azurefs_internal.cc @@ -0,0 +1,67 @@ +#include "arrow/filesystem/azurefs_internal.h" + +#include + +#include "arrow/result.h" + +namespace arrow { +namespace fs { +namespace internal { + +Status ErrorToStatus(const std::string& prefix, + const Azure::Storage::StorageException& exception) { + return Status::IOError(prefix, " Azure Error: ", exception.what()); +} + +Result HierachicalNamespaceDetecter::Enabled( + Azure::Storage::Files::DataLake::DataLakeFileSystemClient filesystem_client) { + if (is_hierachical_namespace_enabled_.has_value()) { + return is_hierachical_namespace_enabled_.value(); + } + + // This approach is inspired by hadoop-azure + // https://github.com/apache/hadoop/blob/7c6af6a5f626d18d68b656d085cc23e4c1f7a1ef/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystemStore.java#L356. + // Unfortunately `blob_service_client->GetAccountInfo()` requires significantly + // elevated permissions. + // https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-service-properties?tabs=azure-ad#authorization + auto directory_client = filesystem_client.GetDirectoryClient("/"); + try { + directory_client.GetAccessControlList(); + is_hierachical_namespace_enabled_ = true; + } catch (const Azure::Storage::StorageException& exception) { + // GetAccessControlList will fail on storage accounts without hierarchical + // namespace enabled. + + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::BadRequest || + exception.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) { + // Flat namespace storage accounts with soft delete enabled return + // Conflict - This endpoint does not support BlobStorageEvents or SoftDelete + // otherwise it returns: BadRequest - This operation is only supported on a + // hierarchical namespace account. + is_hierachical_namespace_enabled_ = false; + } else if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + // Azurite returns NotFound. + try { + // Ensure sure that the directory exists by checking its properties. If it + // doesn't then the GetAccessControlList check was invalid and we can't tell + // if the storage account has hierachical namespace. + filesystem_client.GetProperties(); + is_hierachical_namespace_enabled_ = false; + } catch (const Azure::Storage::StorageException& exception) { + return ErrorToStatus( + "When getting properties '" + filesystem_client.GetUrl() + "': ", exception); + } + } else { + // Unexpected error so we can't tell if the storage account has hierachical + // namespace. + return ErrorToStatus( + "When getting access control list '" + directory_client.GetUrl() + "': ", + exception); + } + } + return is_hierachical_namespace_enabled_.value(); +} + +} // namespace internal +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h new file mode 100644 index 00000000000..7f081872211 --- /dev/null +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -0,0 +1,26 @@ + +#include + +#include "arrow/result.h" + +namespace arrow { +namespace fs { +namespace internal { + +Status ErrorToStatus(const std::string& prefix, + const Azure::Storage::StorageException& exception); + +class HierachicalNamespaceDetecter { + public: + // TODO: Switch back to holding a reference to the datalake service client. That way we + // can avoid instantiating a filesystem client if the result is cached. + Result Enabled( + Azure::Storage::Files::DataLake::DataLakeFileSystemClient filesystem_client); + + private: + std::optional is_hierachical_namespace_enabled_; +}; + +} // namespace internal +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 6b1320bb908..d5ddc94ded2 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -33,8 +33,8 @@ // cpp/cmake_modules/ThirdpartyToolchain.cmake for details. #include -#include "arrow/filesystem/azurefs.cc" #include "arrow/filesystem/azurefs.h" +#include "arrow/filesystem/azurefs_internal.h" #include #include @@ -254,28 +254,28 @@ class TestAzureHNSFileSystem : public TestAzureFileSystem { }; TEST_F(TestAzureFlatFileSystem, DetectHierarchicalNamespace) { - auto hierarchical_namespace = HierachicalNamespaceDetecter(); + auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); ASSERT_OK_AND_EQ( false, hierarchical_namespace.Enabled(datalake_service_client_->GetFileSystemClient( PreexistingContainerName()))); } TEST_F(TestAzureHNSFileSystem, DetectHierarchicalNamespace) { - auto hierarchical_namespace = HierachicalNamespaceDetecter(); + auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); ASSERT_OK_AND_EQ( true, hierarchical_namespace.Enabled(datalake_service_client_->GetFileSystemClient( PreexistingContainerName()))); } TEST_F(TestAzureFileSystem, DetectHierarchicalNamespace) { - auto hierarchical_namespace = HierachicalNamespaceDetecter(); + auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); ASSERT_OK_AND_EQ( false, hierarchical_namespace.Enabled(datalake_service_client_->GetFileSystemClient( PreexistingContainerName()))); } TEST_F(TestAzureFileSystem, DetectHierarchicalNamespaceFailsWithMissingContainer) { - auto hierarchical_namespace = HierachicalNamespaceDetecter(); + auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); ASSERT_NOT_OK(hierarchical_namespace.Enabled( datalake_service_client_->GetFileSystemClient("non-existent-container"))); } From 4c897296541e35e47aabb817b686e0e9c08ed22f Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 5 Nov 2023 14:25:39 +0000 Subject: [PATCH 25/45] Avoid unnecessary creation of filesystem clients for hierachical namespace check in the case that the result is cached. --- cpp/src/arrow/filesystem/azurefs.cc | 7 +++---- cpp/src/arrow/filesystem/azurefs_internal.cc | 14 ++++++++++++-- cpp/src/arrow/filesystem/azurefs_internal.h | 11 ++++++----- cpp/src/arrow/filesystem/azurefs_test.cc | 19 ++++++++----------- 4 files changed, 29 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 5b41ff8bb88..e59784ece22 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -467,6 +467,7 @@ class AzureFileSystem::Impl { datalake_service_client_ = std::make_shared( options_.account_dfs_url, options_.storage_credentials_provider); + RETURN_NOT_OK(hierarchical_namespace_.Init(datalake_service_client_)); return Status::OK(); } @@ -522,10 +523,8 @@ class AzureFileSystem::Impl { return info; } catch (const Azure::Storage::StorageException& exception) { if (ContainerOrBlobNotFound(exception)) { - ARROW_ASSIGN_OR_RAISE( - bool hierarchical_namespace_enabled, - hierarchical_namespace_.Enabled( - datalake_service_client_->GetFileSystemClient(path.container))); + ARROW_ASSIGN_OR_RAISE(bool hierarchical_namespace_enabled, + hierarchical_namespace_.Enabled(path.container)); if (hierarchical_namespace_enabled) { // If the hierarchical namespace is enabled, then the storage account will have // explicit directories. Neither a file nor a directory was found. diff --git a/cpp/src/arrow/filesystem/azurefs_internal.cc b/cpp/src/arrow/filesystem/azurefs_internal.cc index 8792c1b549d..5190151b213 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.cc +++ b/cpp/src/arrow/filesystem/azurefs_internal.cc @@ -13,8 +13,17 @@ Status ErrorToStatus(const std::string& prefix, return Status::IOError(prefix, " Azure Error: ", exception.what()); } -Result HierachicalNamespaceDetecter::Enabled( - Azure::Storage::Files::DataLake::DataLakeFileSystemClient filesystem_client) { +Status HierachicalNamespaceDetecter::Init( + std::shared_ptr + datalake_service_client) { + datalake_service_client_ = datalake_service_client; + return Status::OK(); +}; + +Result HierachicalNamespaceDetecter::Enabled(const std::string& container_name) { + // Hierarchical namespace can't easily be changed after the storage account is created + // and its common across all containers in the storage account. Do nothing until we've + // checked for a cached result. if (is_hierachical_namespace_enabled_.has_value()) { return is_hierachical_namespace_enabled_.value(); } @@ -24,6 +33,7 @@ Result HierachicalNamespaceDetecter::Enabled( // Unfortunately `blob_service_client->GetAccountInfo()` requires significantly // elevated permissions. // https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-service-properties?tabs=azure-ad#authorization + auto filesystem_client = datalake_service_client_->GetFileSystemClient(container_name); auto directory_client = filesystem_client.GetDirectoryClient("/"); try { directory_client.GetAccessControlList(); diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h index 7f081872211..5e6c6464595 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.h +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -8,16 +8,17 @@ namespace fs { namespace internal { Status ErrorToStatus(const std::string& prefix, - const Azure::Storage::StorageException& exception); + const Azure::Storage::StorageException& exception); class HierachicalNamespaceDetecter { public: - // TODO: Switch back to holding a reference to the datalake service client. That way we - // can avoid instantiating a filesystem client if the result is cached. - Result Enabled( - Azure::Storage::Files::DataLake::DataLakeFileSystemClient filesystem_client); + Status Init(std::shared_ptr + datalake_service_client); + Result Enabled(const std::string& container_name); private: + std::shared_ptr + datalake_service_client_; std::optional is_hierachical_namespace_enabled_; }; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index d5ddc94ded2..4123ecc48e1 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -255,29 +255,26 @@ class TestAzureHNSFileSystem : public TestAzureFileSystem { TEST_F(TestAzureFlatFileSystem, DetectHierarchicalNamespace) { auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); - ASSERT_OK_AND_EQ( - false, hierarchical_namespace.Enabled(datalake_service_client_->GetFileSystemClient( - PreexistingContainerName()))); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); + ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); } TEST_F(TestAzureHNSFileSystem, DetectHierarchicalNamespace) { auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); - ASSERT_OK_AND_EQ( - true, hierarchical_namespace.Enabled(datalake_service_client_->GetFileSystemClient( - PreexistingContainerName()))); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); + ASSERT_OK_AND_EQ(true, hierarchical_namespace.Enabled(PreexistingContainerName())); } TEST_F(TestAzureFileSystem, DetectHierarchicalNamespace) { auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); - ASSERT_OK_AND_EQ( - false, hierarchical_namespace.Enabled(datalake_service_client_->GetFileSystemClient( - PreexistingContainerName()))); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); + ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); } TEST_F(TestAzureFileSystem, DetectHierarchicalNamespaceFailsWithMissingContainer) { auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); - ASSERT_NOT_OK(hierarchical_namespace.Enabled( - datalake_service_client_->GetFileSystemClient("non-existent-container"))); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); + ASSERT_NOT_OK(hierarchical_namespace.Enabled("non-existent-container")); } TEST_F(TestAzureFileSystem, GetFileInfoAccount) { From 218312b1ca7106fe7908864ab0ef7b8019e5689d Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 5 Nov 2023 15:51:55 +0000 Subject: [PATCH 26/45] Rename test fixtures --- cpp/src/arrow/filesystem/azurefs_test.cc | 86 +++++++++++++----------- 1 file changed, 46 insertions(+), 40 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 4123ecc48e1..34d917bfc20 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -50,6 +50,7 @@ #include #include "arrow/filesystem/test_util.h" +#include "arrow/result.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/util/io_util.h" @@ -142,7 +143,7 @@ TEST(AzureFileSystem, OptionsCompare) { EXPECT_TRUE(options.Equals(options)); } -class TestAzureFileSystem : public ::testing::Test { +class AzureFileSystemTest : public ::testing::Test { public: std::shared_ptr fs_; std::shared_ptr blob_service_client_; @@ -152,17 +153,9 @@ class TestAzureFileSystem : public ::testing::Test { std::mt19937_64 generator_; std::string container_name_; - TestAzureFileSystem() : generator_(std::random_device()()) {} + AzureFileSystemTest() : generator_(std::random_device()()) {} - virtual AzureOptions MakeOptions() { - EXPECT_THAT(GetAzuriteEnv(), NotNull()); - ARROW_EXPECT_OK(GetAzuriteEnv()->status()); - AzureOptions options; - options.backend = AzureBackend::Azurite; - ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( - GetAzuriteEnv()->account_name(), GetAzuriteEnv()->account_key())); - return options; - } + virtual AzureOptions MakeOptions() = 0; void SetUp() override { container_name_ = RandomChars(32); @@ -172,6 +165,7 @@ class TestAzureFileSystem : public ::testing::Test { datalake_service_client_ = std::make_shared( options_.account_dfs_url, options_.storage_credentials_provider); + GTEST_SKIP(); ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); auto container_client = blob_service_client_->GetBlobContainerClient(container_name_); container_client.CreateIfNotExists(); @@ -235,7 +229,19 @@ class TestAzureFileSystem : public ::testing::Test { } }; -class TestAzureFlatFileSystem : public TestAzureFileSystem { +class AzuriteFileSystemTest : public AzureFileSystemTest { + AzureOptions MakeOptions() { + EXPECT_THAT(GetAzuriteEnv(), NotNull()); + ARROW_EXPECT_OK(GetAzuriteEnv()->status()); + AzureOptions options; + options.backend = AzureBackend::Azurite; + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( + GetAzuriteEnv()->account_name(), GetAzuriteEnv()->account_key())); + return options; + } +}; + +class AzureFlatNamespaceFileSystemTest : public AzureFileSystemTest { AzureOptions MakeOptions() override { AzureOptions options; ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( @@ -244,7 +250,7 @@ class TestAzureFlatFileSystem : public TestAzureFileSystem { } }; -class TestAzureHNSFileSystem : public TestAzureFileSystem { +class AzureHierarchicalNamespaceFileSystemTest : public AzureFileSystemTest { AzureOptions MakeOptions() override { AzureOptions options; ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( @@ -253,38 +259,38 @@ class TestAzureHNSFileSystem : public TestAzureFileSystem { } }; -TEST_F(TestAzureFlatFileSystem, DetectHierarchicalNamespace) { +TEST_F(AzureFlatNamespaceFileSystemTest, DetectHierarchicalNamespace) { auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); } -TEST_F(TestAzureHNSFileSystem, DetectHierarchicalNamespace) { +TEST_F(AzureHierarchicalNamespaceFileSystemTest, DetectHierarchicalNamespace) { auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); ASSERT_OK_AND_EQ(true, hierarchical_namespace.Enabled(PreexistingContainerName())); } -TEST_F(TestAzureFileSystem, DetectHierarchicalNamespace) { +TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespace) { auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); } -TEST_F(TestAzureFileSystem, DetectHierarchicalNamespaceFailsWithMissingContainer) { +TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespaceFailsWithMissingContainer) { auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); ASSERT_NOT_OK(hierarchical_namespace.Enabled("non-existent-container")); } -TEST_F(TestAzureFileSystem, GetFileInfoAccount) { +TEST_F(AzuriteFileSystemTest, GetFileInfoAccount) { arrow::fs::AssertFileInfo(fs_.get(), "", FileType::Directory); // URI ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://")); } -TEST_F(TestAzureFileSystem, GetFileInfoContainer) { +TEST_F(AzuriteFileSystemTest, GetFileInfoContainer) { arrow::fs::AssertFileInfo(fs_.get(), PreexistingContainerName(), FileType::Directory); arrow::fs::AssertFileInfo(fs_.get(), "non-existent-container", FileType::NotFound); @@ -293,7 +299,7 @@ TEST_F(TestAzureFileSystem, GetFileInfoContainer) { ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingContainerName())); } -TEST_F(TestAzureFlatFileSystem, GetFileInfoObjectWithNestedStructure) { +TEST_F(AzuriteFileSystemTest, GetFileInfoObjectWithNestedStructure) { // Adds detailed tests to handle cases of different edge cases // with directory naming conventions (e.g. with and without slashes). constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; @@ -332,7 +338,7 @@ TEST_F(TestAzureFlatFileSystem, GetFileInfoObjectWithNestedStructure) { FileType::NotFound); } -TEST_F(TestAzureHNSFileSystem, GetFileInfoObjectWithNestedStructure) { +TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObjectWithNestedStructure) { // Adds detailed tests to handle cases of different edge cases // with directory naming conventions (e.g. with and without slashes). constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; @@ -378,7 +384,7 @@ TEST_F(TestAzureHNSFileSystem, GetFileInfoObjectWithNestedStructure) { FileType::Directory); } -TEST_F(TestAzureFileSystem, GetFileInfoObject) { +TEST_F(AzuriteFileSystemTest, GetFileInfoObject) { auto object_properties = blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlobClient(PreexistingObjectName()) @@ -394,7 +400,7 @@ TEST_F(TestAzureFileSystem, GetFileInfoObject) { ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingObjectName())); } -TEST_F(TestAzureHNSFileSystem, GetFileInfoObject) { +TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObject) { auto object_properties = blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlobClient(PreexistingObjectName()) @@ -410,7 +416,7 @@ TEST_F(TestAzureHNSFileSystem, GetFileInfoObject) { ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingObjectName())); } -TEST_F(TestAzureFileSystem, OpenInputStreamString) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamString) { std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); @@ -418,7 +424,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamString) { EXPECT_EQ(buffer->ToString(), kLoremIpsum); } -TEST_F(TestAzureFileSystem, OpenInputStreamStringBuffers) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamStringBuffers) { std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); @@ -432,7 +438,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamStringBuffers) { EXPECT_EQ(contents, kLoremIpsum); } -TEST_F(TestAzureFileSystem, OpenInputStreamInfo) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamInfo) { ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingObjectPath())); std::shared_ptr stream; @@ -442,7 +448,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamInfo) { EXPECT_EQ(buffer->ToString(), kLoremIpsum); } -TEST_F(TestAzureFileSystem, OpenInputStreamEmpty) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamEmpty) { const auto path_to_file = "empty-object.txt"; const auto path = PreexistingContainerPath() + path_to_file; blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) @@ -456,11 +462,11 @@ TEST_F(TestAzureFileSystem, OpenInputStreamEmpty) { EXPECT_EQ(size, 0); } -TEST_F(TestAzureFileSystem, OpenInputStreamNotFound) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamNotFound) { ASSERT_RAISES(IOError, fs_->OpenInputStream(NotFoundObjectPath())); } -TEST_F(TestAzureFileSystem, OpenInputStreamInfoInvalid) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamInfoInvalid) { ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingContainerPath())); ASSERT_RAISES(IOError, fs_->OpenInputStream(info)); @@ -468,11 +474,11 @@ TEST_F(TestAzureFileSystem, OpenInputStreamInfoInvalid) { ASSERT_RAISES(IOError, fs_->OpenInputStream(info2)); } -TEST_F(TestAzureFileSystem, OpenInputStreamUri) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamUri) { ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfss://" + PreexistingObjectPath())); } -TEST_F(TestAzureFileSystem, OpenInputStreamTrailingSlash) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamTrailingSlash) { ASSERT_RAISES(IOError, fs_->OpenInputStream(PreexistingObjectPath() + '/')); } @@ -510,7 +516,7 @@ std::shared_ptr NormalizerKeyValueMetadata( } }; // namespace -TEST_F(TestAzureFileSystem, OpenInputStreamReadMetadata) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamReadMetadata) { std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); @@ -540,7 +546,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamReadMetadata) { NormalizerKeyValueMetadata(actual)->ToString()); } -TEST_F(TestAzureFileSystem, OpenInputStreamClosed) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamClosed) { ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(PreexistingObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; @@ -549,7 +555,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamClosed) { ASSERT_RAISES(Invalid, stream->Tell()); } -TEST_F(TestAzureFileSystem, OpenInputFileMixedReadVsReadAt) { +TEST_F(AzuriteFileSystemTest, OpenInputFileMixedReadVsReadAt) { // Create a file large enough to make the random access tests non-trivial. auto constexpr kLineWidth = 100; auto constexpr kLineCount = 4096; @@ -595,7 +601,7 @@ TEST_F(TestAzureFileSystem, OpenInputFileMixedReadVsReadAt) { } } -TEST_F(TestAzureFileSystem, OpenInputFileRandomSeek) { +TEST_F(AzuriteFileSystemTest, OpenInputFileRandomSeek) { // Create a file large enough to make the random access tests non-trivial. auto constexpr kLineWidth = 100; auto constexpr kLineCount = 4096; @@ -623,7 +629,7 @@ TEST_F(TestAzureFileSystem, OpenInputFileRandomSeek) { } } -TEST_F(TestAzureFileSystem, OpenInputFileIoContext) { +TEST_F(AzuriteFileSystemTest, OpenInputFileIoContext) { // Create a test file. const auto path_to_file = "OpenInputFileIoContext/object-name"; const auto path = PreexistingContainerPath() + path_to_file; @@ -640,7 +646,7 @@ TEST_F(TestAzureFileSystem, OpenInputFileIoContext) { EXPECT_EQ(fs_->io_context().external_id(), file->io_context().external_id()); } -TEST_F(TestAzureFileSystem, OpenInputFileInfo) { +TEST_F(AzuriteFileSystemTest, OpenInputFileInfo) { ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingObjectPath())); std::shared_ptr file; @@ -655,11 +661,11 @@ TEST_F(TestAzureFileSystem, OpenInputFileInfo) { EXPECT_EQ(std::string(buffer.data(), size), expected); } -TEST_F(TestAzureFileSystem, OpenInputFileNotFound) { +TEST_F(AzuriteFileSystemTest, OpenInputFileNotFound) { ASSERT_RAISES(IOError, fs_->OpenInputFile(NotFoundObjectPath())); } -TEST_F(TestAzureFileSystem, OpenInputFileInfoInvalid) { +TEST_F(AzuriteFileSystemTest, OpenInputFileInfoInvalid) { ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingContainerPath())); ASSERT_RAISES(IOError, fs_->OpenInputFile(info)); @@ -667,7 +673,7 @@ TEST_F(TestAzureFileSystem, OpenInputFileInfoInvalid) { ASSERT_RAISES(IOError, fs_->OpenInputFile(info2)); } -TEST_F(TestAzureFileSystem, OpenInputFileClosed) { +TEST_F(AzuriteFileSystemTest, OpenInputFileClosed) { ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputFile(PreexistingObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; From 22d64d3c7070e3f1a1cbbfa16c9d136aa4569c99 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 5 Nov 2023 16:23:19 +0000 Subject: [PATCH 27/45] Skip tests against real Azure blob storage if credentials are not provided --- cpp/src/arrow/filesystem/azurefs_test.cc | 41 ++++++++++++++++++------ 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 34d917bfc20..568413211ce 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -152,20 +152,27 @@ class AzureFileSystemTest : public ::testing::Test { AzureOptions options_; std::mt19937_64 generator_; std::string container_name_; + bool suite_skipped_ = false; AzureFileSystemTest() : generator_(std::random_device()()) {} - virtual AzureOptions MakeOptions() = 0; + virtual Result MakeOptions() = 0; + virtual void Skip(){}; void SetUp() override { + auto options = MakeOptions(); + if (options.ok()) { + options_ = options.ValueOrDie(); + } else { + suite_skipped_ = true; + GTEST_SKIP() << options.status().message(); + } container_name_ = RandomChars(32); - options_ = MakeOptions(); blob_service_client_ = std::make_shared( options_.account_blob_url, options_.storage_credentials_provider); datalake_service_client_ = std::make_shared( options_.account_dfs_url, options_.storage_credentials_provider); - GTEST_SKIP(); ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); auto container_client = blob_service_client_->GetBlobContainerClient(container_name_); container_client.CreateIfNotExists(); @@ -176,11 +183,13 @@ class AzureFileSystemTest : public ::testing::Test { } void TearDown() override { + if (!suite_skipped_) { auto containers = blob_service_client_->ListBlobContainers(); for (auto container : containers.BlobContainers) { auto container_client = blob_service_client_->GetBlobContainerClient(container.Name); container_client.DeleteIfExists(); + } } } @@ -230,7 +239,7 @@ class AzureFileSystemTest : public ::testing::Test { }; class AzuriteFileSystemTest : public AzureFileSystemTest { - AzureOptions MakeOptions() { + Result MakeOptions() { EXPECT_THAT(GetAzuriteEnv(), NotNull()); ARROW_EXPECT_OK(GetAzuriteEnv()->status()); AzureOptions options; @@ -242,20 +251,32 @@ class AzuriteFileSystemTest : public AzureFileSystemTest { }; class AzureFlatNamespaceFileSystemTest : public AzureFileSystemTest { - AzureOptions MakeOptions() override { + Result MakeOptions() override { AzureOptions options; - ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( - std::getenv("AZURE_FLAT_ACCOUNT_NAME"), std::getenv("AZURE_FLAT_ACCOUNT_KEY"))); + if (char* account_name = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_NAME")) { + char* account_key = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_KEY"); + EXPECT_THAT(account_key, NotNull()); + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); return options; + } + return Status::Cancelled( + "Connection details not provided for a real flat namespace " + "account."); } }; class AzureHierarchicalNamespaceFileSystemTest : public AzureFileSystemTest { - AzureOptions MakeOptions() override { + Result MakeOptions() override { AzureOptions options; - ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( - std::getenv("AZURE_HNS_ACCOUNT_NAME"), std::getenv("AZURE_HNS_ACCOUNT_KEY"))); + if (char* account_name = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_NAME")) { + char* account_key = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_KEY"); + EXPECT_THAT(account_key, NotNull()); + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); return options; + } + return Status::Cancelled( + "Connection details not provided for a real hierachical namespace " + "account."); } }; From 80d0a55686f8445b51b7ea99ee7012197ad6bcb6 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 5 Nov 2023 16:23:24 +0000 Subject: [PATCH 28/45] Spelling --- cpp/src/arrow/filesystem/azurefs.cc | 2 +- cpp/src/arrow/filesystem/azurefs_internal.cc | 16 +++++++------- cpp/src/arrow/filesystem/azurefs_internal.h | 4 ++-- cpp/src/arrow/filesystem/azurefs_test.cc | 22 ++++++++++---------- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index e59784ece22..2550fa33118 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -456,7 +456,7 @@ class AzureFileSystem::Impl { datalake_service_client_; std::shared_ptr blob_service_client_; AzureOptions options_; - internal::HierachicalNamespaceDetecter hierarchical_namespace_; + internal::HierarchicalNamespaceDetector hierarchical_namespace_; explicit Impl(AzureOptions options, io::IOContext io_context) : io_context_(io_context), options_(std::move(options)) {} diff --git a/cpp/src/arrow/filesystem/azurefs_internal.cc b/cpp/src/arrow/filesystem/azurefs_internal.cc index 5190151b213..c14415f64c5 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.cc +++ b/cpp/src/arrow/filesystem/azurefs_internal.cc @@ -13,19 +13,19 @@ Status ErrorToStatus(const std::string& prefix, return Status::IOError(prefix, " Azure Error: ", exception.what()); } -Status HierachicalNamespaceDetecter::Init( +Status HierarchicalNamespaceDetector::Init( std::shared_ptr datalake_service_client) { datalake_service_client_ = datalake_service_client; return Status::OK(); }; -Result HierachicalNamespaceDetecter::Enabled(const std::string& container_name) { +Result HierarchicalNamespaceDetector::Enabled(const std::string& container_name) { // Hierarchical namespace can't easily be changed after the storage account is created // and its common across all containers in the storage account. Do nothing until we've // checked for a cached result. - if (is_hierachical_namespace_enabled_.has_value()) { - return is_hierachical_namespace_enabled_.value(); + if (is_hierarchical_namespace_enabled_.has_value()) { + return is_hierarchical_namespace_enabled_.value(); } // This approach is inspired by hadoop-azure @@ -37,7 +37,7 @@ Result HierachicalNamespaceDetecter::Enabled(const std::string& container_ auto directory_client = filesystem_client.GetDirectoryClient("/"); try { directory_client.GetAccessControlList(); - is_hierachical_namespace_enabled_ = true; + is_hierarchical_namespace_enabled_ = true; } catch (const Azure::Storage::StorageException& exception) { // GetAccessControlList will fail on storage accounts without hierarchical // namespace enabled. @@ -48,7 +48,7 @@ Result HierachicalNamespaceDetecter::Enabled(const std::string& container_ // Conflict - This endpoint does not support BlobStorageEvents or SoftDelete // otherwise it returns: BadRequest - This operation is only supported on a // hierarchical namespace account. - is_hierachical_namespace_enabled_ = false; + is_hierarchical_namespace_enabled_ = false; } else if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { // Azurite returns NotFound. try { @@ -56,7 +56,7 @@ Result HierachicalNamespaceDetecter::Enabled(const std::string& container_ // doesn't then the GetAccessControlList check was invalid and we can't tell // if the storage account has hierachical namespace. filesystem_client.GetProperties(); - is_hierachical_namespace_enabled_ = false; + is_hierarchical_namespace_enabled_ = false; } catch (const Azure::Storage::StorageException& exception) { return ErrorToStatus( "When getting properties '" + filesystem_client.GetUrl() + "': ", exception); @@ -69,7 +69,7 @@ Result HierachicalNamespaceDetecter::Enabled(const std::string& container_ exception); } } - return is_hierachical_namespace_enabled_.value(); + return is_hierarchical_namespace_enabled_.value(); } } // namespace internal diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h index 5e6c6464595..53863f61df0 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.h +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -10,7 +10,7 @@ namespace internal { Status ErrorToStatus(const std::string& prefix, const Azure::Storage::StorageException& exception); -class HierachicalNamespaceDetecter { +class HierarchicalNamespaceDetector { public: Status Init(std::shared_ptr datalake_service_client); @@ -19,7 +19,7 @@ class HierachicalNamespaceDetecter { private: std::shared_ptr datalake_service_client_; - std::optional is_hierachical_namespace_enabled_; + std::optional is_hierarchical_namespace_enabled_; }; } // namespace internal diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 568413211ce..7f2473f2497 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -184,11 +184,11 @@ class AzureFileSystemTest : public ::testing::Test { void TearDown() override { if (!suite_skipped_) { - auto containers = blob_service_client_->ListBlobContainers(); - for (auto container : containers.BlobContainers) { - auto container_client = - blob_service_client_->GetBlobContainerClient(container.Name); - container_client.DeleteIfExists(); + auto containers = blob_service_client_->ListBlobContainers(); + for (auto container : containers.BlobContainers) { + auto container_client = + blob_service_client_->GetBlobContainerClient(container.Name); + container_client.DeleteIfExists(); } } } @@ -257,7 +257,7 @@ class AzureFlatNamespaceFileSystemTest : public AzureFileSystemTest { char* account_key = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_KEY"); EXPECT_THAT(account_key, NotNull()); ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); - return options; + return options; } return Status::Cancelled( "Connection details not provided for a real flat namespace " @@ -272,7 +272,7 @@ class AzureHierarchicalNamespaceFileSystemTest : public AzureFileSystemTest { char* account_key = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_KEY"); EXPECT_THAT(account_key, NotNull()); ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); - return options; + return options; } return Status::Cancelled( "Connection details not provided for a real hierachical namespace " @@ -281,25 +281,25 @@ class AzureHierarchicalNamespaceFileSystemTest : public AzureFileSystemTest { }; TEST_F(AzureFlatNamespaceFileSystemTest, DetectHierarchicalNamespace) { - auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); + auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); } TEST_F(AzureHierarchicalNamespaceFileSystemTest, DetectHierarchicalNamespace) { - auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); + auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); ASSERT_OK_AND_EQ(true, hierarchical_namespace.Enabled(PreexistingContainerName())); } TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespace) { - auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); + auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); } TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespaceFailsWithMissingContainer) { - auto hierarchical_namespace = internal::HierachicalNamespaceDetecter(); + auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); ASSERT_NOT_OK(hierarchical_namespace.Enabled("non-existent-container")); } From d0f33119629cfbb27430dcb9fe0797f8b3b3bd28 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 5 Nov 2023 16:34:05 +0000 Subject: [PATCH 29/45] Tidy --- cpp/src/arrow/filesystem/azurefs.cc | 15 +++++++++------ cpp/src/arrow/filesystem/azurefs_internal.cc | 17 +++++++++++++++++ cpp/src/arrow/filesystem/azurefs_internal.h | 18 ++++++++++++++++++ 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 2550fa33118..3760ce200ca 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -491,13 +491,14 @@ class AzureFileSystem::Impl { blob_service_client_->GetBlobContainerClient(path.container); try { auto properties = container_client.GetProperties(); - auto info = FileInfo(path.full_path, FileType::Directory); + info.set_type(FileType::Directory); info.set_mtime( std::chrono::system_clock::time_point(properties.Value.LastModified)); return info; } catch (const Azure::Storage::StorageException& exception) { if (ContainerOrBlobNotFound(exception)) { - return FileInfo(path.full_path, FileType::NotFound); + info.set_type(FileType::NotFound); + return info; } return internal::ErrorToStatus( "When fetching properties for '" + container_client.GetUrl() + "': ", @@ -508,7 +509,6 @@ class AzureFileSystem::Impl { .GetFileClient(path.path_to_file); try { auto properties = file_client.GetProperties(); - auto info = FileInfo(path.full_path, FileType::File); if (properties.Value.IsDirectory) { info.set_type(FileType::Directory); } else if (internal::HasTrailingSlash(path.path_to_file)) { @@ -528,7 +528,8 @@ class AzureFileSystem::Impl { if (hierarchical_namespace_enabled) { // If the hierarchical namespace is enabled, then the storage account will have // explicit directories. Neither a file nor a directory was found. - return FileInfo(path.full_path, FileType::NotFound); + info.set_type(FileType::NotFound); + return info; } // On flat namespace accounts there are no real directories. Directories are only // implied by using `/` in the blob name. @@ -547,9 +548,11 @@ class AzureFileSystem::Impl { blob_service_client_->GetBlobContainerClient(path.container) .ListBlobs(list_blob_options); if (paged_list_result.Blobs.size() > 0) { - return FileInfo(path.full_path, FileType::Directory); + info.set_type(FileType::Directory); + return info; } else { - return FileInfo(path.full_path, FileType::NotFound); + info.set_type(FileType::NotFound); + return info; } } catch (const Azure::Storage::StorageException& exception) { return internal::ErrorToStatus("When listing blobs for '" + prefix + "': ", diff --git a/cpp/src/arrow/filesystem/azurefs_internal.cc b/cpp/src/arrow/filesystem/azurefs_internal.cc index c14415f64c5..b55ebfdcf60 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.cc +++ b/cpp/src/arrow/filesystem/azurefs_internal.cc @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + #include "arrow/filesystem/azurefs_internal.h" #include diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h index 53863f61df0..c53ce14bad2 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.h +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -1,3 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once #include From 8912dbfa81688b1d95d3d0b1932ce885039aa108 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 5 Nov 2023 17:24:33 +0000 Subject: [PATCH 30/45] Update error messages --- cpp/src/arrow/filesystem/azurefs.cc | 21 +++++++++++++++----- cpp/src/arrow/filesystem/azurefs_internal.cc | 15 +++++++------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 3760ce200ca..a96d590c879 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -501,7 +501,9 @@ class AzureFileSystem::Impl { return info; } return internal::ErrorToStatus( - "When fetching properties for '" + container_client.GetUrl() + "': ", + "GetProperties for '" + container_client.GetUrl() + + "' failed with an unexpected Azure error. GetFileInfo is unable to " + "determine whether the container exists.", exception); } } @@ -512,6 +514,9 @@ class AzureFileSystem::Impl { if (properties.Value.IsDirectory) { info.set_type(FileType::Directory); } else if (internal::HasTrailingSlash(path.path_to_file)) { + // For a path with a trailing slash a hierarchical namespace may return a blob + // with that trailing slash removed. For consistency with flat namespace and + // other filesystems we chose to return NotFound. info.set_type(FileType::NotFound); return info; } else { @@ -548,19 +553,25 @@ class AzureFileSystem::Impl { blob_service_client_->GetBlobContainerClient(path.container) .ListBlobs(list_blob_options); if (paged_list_result.Blobs.size() > 0) { - info.set_type(FileType::Directory); + info.set_type(FileType::Directory); return info; } else { info.set_type(FileType::NotFound); return info; } } catch (const Azure::Storage::StorageException& exception) { - return internal::ErrorToStatus("When listing blobs for '" + prefix + "': ", - exception); + return internal::ErrorToStatus( + "ListBlobs for '" + prefix + + "' failed with an unexpected Azure error. GetFileInfo is unable to " + "determine whether the path should be considered an implied directory.", + exception); } } return internal::ErrorToStatus( - "When fetching properties for '" + file_client.GetUrl() + "': ", exception); + "GetProperties for '" + file_client.GetUrl() + + "' failed with an unexpected " + "Azure error. GetFileInfo is unable to determine whether the path exists.", + exception); } } diff --git a/cpp/src/arrow/filesystem/azurefs_internal.cc b/cpp/src/arrow/filesystem/azurefs_internal.cc index b55ebfdcf60..d299a606ee9 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.cc +++ b/cpp/src/arrow/filesystem/azurefs_internal.cc @@ -69,20 +69,19 @@ Result HierarchicalNamespaceDetector::Enabled(const std::string& container } else if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { // Azurite returns NotFound. try { - // Ensure sure that the directory exists by checking its properties. If it - // doesn't then the GetAccessControlList check was invalid and we can't tell - // if the storage account has hierachical namespace. filesystem_client.GetProperties(); is_hierarchical_namespace_enabled_ = false; } catch (const Azure::Storage::StorageException& exception) { - return ErrorToStatus( - "When getting properties '" + filesystem_client.GetUrl() + "': ", exception); + return ErrorToStatus("Failed to confirm '" + filesystem_client.GetUrl() + + "' is an accessible container. Therefore the " + "hierarchical namespace check was invalid.", + exception); } } else { - // Unexpected error so we can't tell if the storage account has hierachical - // namespace. return ErrorToStatus( - "When getting access control list '" + directory_client.GetUrl() + "': ", + "GetAccessControlList for '" + directory_client.GetUrl() + + "' failed with an unexpected Azure error, while checking " + "whether the storage account has hierarchical namespace enabled.", exception); } } From 3b920f125ff758b01c6383e923d25766aa3db7b7 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 5 Nov 2023 17:24:39 +0000 Subject: [PATCH 31/45] Add missing include --- cpp/src/arrow/filesystem/azurefs_internal.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h index c53ce14bad2..f762f8c1add 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.h +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -17,6 +17,8 @@ #pragma once +#include + #include #include "arrow/result.h" From 6ab232aefed01903cf50f5d9561b396c7b4076ba Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 5 Nov 2023 17:39:01 +0000 Subject: [PATCH 32/45] Lint --- cpp/src/arrow/filesystem/azurefs_internal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs_internal.cc b/cpp/src/arrow/filesystem/azurefs_internal.cc index d299a606ee9..4e8173233b7 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.cc +++ b/cpp/src/arrow/filesystem/azurefs_internal.cc @@ -35,7 +35,7 @@ Status HierarchicalNamespaceDetector::Init( datalake_service_client) { datalake_service_client_ = datalake_service_client; return Status::OK(); -}; +} Result HierarchicalNamespaceDetector::Enabled(const std::string& container_name) { // Hierarchical namespace can't easily be changed after the storage account is created From 89320c04ee63a32645212eee1ad3a65a5b0d5e98 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 5 Nov 2023 17:55:52 +0000 Subject: [PATCH 33/45] Adjust some more error messges --- cpp/src/arrow/filesystem/azurefs.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index a96d590c879..934e0479216 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -320,7 +320,10 @@ class ObjectInputFile final : public io::RandomAccessFile { return PathNotFound(path_); } return internal::ErrorToStatus( - "When fetching properties for '" + blob_client_->GetUrl() + "': ", exception); + "GetProperties failed for '" + blob_client_->GetUrl() + + "' with an unexpected Azure error. Can not initialise an ObjectInputFile " + "without knowing the file size. ", + exception); } } @@ -397,9 +400,11 @@ class ObjectInputFile final : public io::RandomAccessFile { ->DownloadTo(reinterpret_cast(out), nbytes, download_options) .Value.ContentRange.Length.Value(); } catch (const Azure::Storage::StorageException& exception) { - return internal::ErrorToStatus("When reading from '" + blob_client_->GetUrl() + + return internal::ErrorToStatus("DownloadTo from '" + blob_client_->GetUrl() + "' at position " + std::to_string(position) + - " for " + std::to_string(nbytes) + " bytes: ", + " for " + std::to_string(nbytes) + + " bytes failed with an Azure error. ReadAt " + "failed to read the required byte range.", exception); } } From e86812564e1269941b1211ac2dd6b3358439d1e8 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 5 Nov 2023 18:26:11 +0000 Subject: [PATCH 34/45] Reduce duplication in tests --- cpp/src/arrow/filesystem/azurefs_test.cc | 65 +++++------------------- 1 file changed, 13 insertions(+), 52 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 7f2473f2497..12b6006aea6 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -236,6 +236,9 @@ class AzureFileSystemTest : public ::testing::Test { blob_client.UploadFrom(reinterpret_cast(all_lines.data()), total_size); } + + void RunGetFileInfoObjectWithNestedStructureTest(); + void RunGetFileInfoObjectTest(); }; class AzuriteFileSystemTest : public AzureFileSystemTest { @@ -320,7 +323,7 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoContainer) { ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingContainerName())); } -TEST_F(AzuriteFileSystemTest, GetFileInfoObjectWithNestedStructure) { +void AzureFileSystemTest::RunGetFileInfoObjectWithNestedStructureTest() { // Adds detailed tests to handle cases of different edge cases // with directory naming conventions (e.g. with and without slashes). constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; @@ -359,53 +362,21 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoObjectWithNestedStructure) { FileType::NotFound); } -TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObjectWithNestedStructure) { - // Adds detailed tests to handle cases of different edge cases - // with directory naming conventions (e.g. with and without slashes). - constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; - // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. - blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(kObjectName) - .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); - - // 0 is immediately after "/" lexicographically, ensure that this doesn't - // cause unexpected issues. - // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. - blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient("test-object-dir/some_other_dir0") - .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); - - blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(std::string(kObjectName) + "0") - .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); +TEST_F(AzuriteFileSystemTest, GetFileInfoObjectWithNestedStructure) { + RunGetFileInfoObjectWithNestedStructureTest(); +} +TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObjectWithNestedStructure) { + RunGetFileInfoObjectWithNestedStructureTest(); datalake_service_client_->GetFileSystemClient(PreexistingContainerName()) .GetDirectoryClient("test-empty-object-dir") .Create(); - AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName, FileType::File); - AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName + "/", - FileType::NotFound); - AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir", - FileType::Directory); - AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/", - FileType::Directory); - AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/some_other_dir", - FileType::Directory); - AssertFileInfo(fs_.get(), - PreexistingContainerPath() + "test-object-dir/some_other_dir/", - FileType::Directory); - - AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-di", - FileType::NotFound); - AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/some_other_di", - FileType::NotFound); - AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-empty-object-dir", FileType::Directory); } -TEST_F(AzuriteFileSystemTest, GetFileInfoObject) { +void AzureFileSystemTest::RunGetFileInfoObjectTest() { auto object_properties = blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlobClient(PreexistingObjectName()) @@ -421,20 +392,10 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoObject) { ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingObjectName())); } -TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObject) { - auto object_properties = - blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlobClient(PreexistingObjectName()) - .GetProperties() - .Value; - - arrow::fs::AssertFileInfo( - fs_.get(), PreexistingObjectPath(), FileType::File, - std::chrono::system_clock::time_point(object_properties.LastModified), - static_cast(object_properties.BlobSize)); +TEST_F(AzuriteFileSystemTest, GetFileInfoObject) { RunGetFileInfoObjectTest(); } - // URI - ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingObjectName())); +TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObject) { + RunGetFileInfoObjectTest(); } TEST_F(AzuriteFileSystemTest, OpenInputStreamString) { From 2cd51c1c31e89cae26b44025f6ab0ead5c642d7a Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 5 Nov 2023 18:28:21 +0000 Subject: [PATCH 35/45] Lint --- cpp/src/arrow/filesystem/azurefs_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 12b6006aea6..8e8af019762 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -157,7 +157,6 @@ class AzureFileSystemTest : public ::testing::Test { AzureFileSystemTest() : generator_(std::random_device()()) {} virtual Result MakeOptions() = 0; - virtual void Skip(){}; void SetUp() override { auto options = MakeOptions(); From 6ece0c9edccf2ac948e4ba7ca0675fc5a7685e0f Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 5 Nov 2023 20:32:36 +0000 Subject: [PATCH 36/45] Fix clang linker error --- cpp/src/arrow/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 9a611701153..101b089ba83 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -502,8 +502,8 @@ if(ARROW_FILESYSTEM) filesystem/util_internal.cc) if(ARROW_AZURE) - list(APPEND ARROW_SRCS filesystem/azurefs.cc) - set_source_files_properties(filesystem/azurefs.cc + list(APPEND ARROW_SRCS filesystem/azurefs.cc filesystem/azurefs_internal.cc) + set_source_files_properties(filesystem/azurefs.cc filesystem/azurefs_internal.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON SKIP_UNITY_BUILD_INCLUSION ON) endif() From c562c7ebc1e8cb2195df09d3ed215cbc67eca211 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Mon, 6 Nov 2023 09:51:14 +0000 Subject: [PATCH 37/45] PR comments 1 --- cpp/src/arrow/filesystem/azurefs.cc | 25 +++++++------- cpp/src/arrow/filesystem/azurefs_internal.cc | 34 +++++++++----------- cpp/src/arrow/filesystem/azurefs_internal.h | 14 +++----- cpp/src/arrow/filesystem/path_util.cc | 14 ++------ 4 files changed, 35 insertions(+), 52 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 934e0479216..3c8aa62b733 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -319,10 +319,10 @@ class ObjectInputFile final : public io::RandomAccessFile { if (ContainerOrBlobNotFound(exception)) { return PathNotFound(path_); } - return internal::ErrorToStatus( + return internal::ExceptionToStatus( "GetProperties failed for '" + blob_client_->GetUrl() + "' with an unexpected Azure error. Can not initialise an ObjectInputFile " - "without knowing the file size. ", + "without knowing the file size.", exception); } } @@ -400,12 +400,12 @@ class ObjectInputFile final : public io::RandomAccessFile { ->DownloadTo(reinterpret_cast(out), nbytes, download_options) .Value.ContentRange.Length.Value(); } catch (const Azure::Storage::StorageException& exception) { - return internal::ErrorToStatus("DownloadTo from '" + blob_client_->GetUrl() + - "' at position " + std::to_string(position) + - " for " + std::to_string(nbytes) + - " bytes failed with an Azure error. ReadAt " - "failed to read the required byte range.", - exception); + return internal::ExceptionToStatus("DownloadTo from '" + blob_client_->GetUrl() + + "' at position " + std::to_string(position) + + " for " + std::to_string(nbytes) + + " bytes failed with an Azure error. ReadAt " + "failed to read the required byte range.", + exception); } } @@ -488,7 +488,8 @@ class AzureFileSystem::Impl { // but not path_to_file. // path must refer to the root of the Azure storage account. This is a directory, // and there isn't any extra metadata to fetch. - return FileInfo(path.full_path, FileType::Directory); + info.set_type(FileType::Directory); + return info; } if (path.path_to_file.empty()) { // path refers to a container. This is a directory if it exists. @@ -505,7 +506,7 @@ class AzureFileSystem::Impl { info.set_type(FileType::NotFound); return info; } - return internal::ErrorToStatus( + return internal::ExceptionToStatus( "GetProperties for '" + container_client.GetUrl() + "' failed with an unexpected Azure error. GetFileInfo is unable to " "determine whether the container exists.", @@ -565,14 +566,14 @@ class AzureFileSystem::Impl { return info; } } catch (const Azure::Storage::StorageException& exception) { - return internal::ErrorToStatus( + return internal::ExceptionToStatus( "ListBlobs for '" + prefix + "' failed with an unexpected Azure error. GetFileInfo is unable to " "determine whether the path should be considered an implied directory.", exception); } } - return internal::ErrorToStatus( + return internal::ExceptionToStatus( "GetProperties for '" + file_client.GetUrl() + "' failed with an unexpected " "Azure error. GetFileInfo is unable to determine whether the path exists.", diff --git a/cpp/src/arrow/filesystem/azurefs_internal.cc b/cpp/src/arrow/filesystem/azurefs_internal.cc index 4e8173233b7..cc3632afc78 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.cc +++ b/cpp/src/arrow/filesystem/azurefs_internal.cc @@ -21,12 +21,10 @@ #include "arrow/result.h" -namespace arrow { -namespace fs { -namespace internal { +namespace arrow::fs::internal { -Status ErrorToStatus(const std::string& prefix, - const Azure::Storage::StorageException& exception) { +Status ExceptionToStatus(const std::string& prefix, + const Azure::Storage::StorageException& exception) { return Status::IOError(prefix, " Azure Error: ", exception.what()); } @@ -41,8 +39,8 @@ Result HierarchicalNamespaceDetector::Enabled(const std::string& container // Hierarchical namespace can't easily be changed after the storage account is created // and its common across all containers in the storage account. Do nothing until we've // checked for a cached result. - if (is_hierarchical_namespace_enabled_.has_value()) { - return is_hierarchical_namespace_enabled_.value(); + if (enabled_.has_value()) { + return enabled_.value(); } // This approach is inspired by hadoop-azure @@ -54,7 +52,7 @@ Result HierarchicalNamespaceDetector::Enabled(const std::string& container auto directory_client = filesystem_client.GetDirectoryClient("/"); try { directory_client.GetAccessControlList(); - is_hierarchical_namespace_enabled_ = true; + enabled_ = true; } catch (const Azure::Storage::StorageException& exception) { // GetAccessControlList will fail on storage accounts without hierarchical // namespace enabled. @@ -65,29 +63,27 @@ Result HierarchicalNamespaceDetector::Enabled(const std::string& container // Conflict - This endpoint does not support BlobStorageEvents or SoftDelete // otherwise it returns: BadRequest - This operation is only supported on a // hierarchical namespace account. - is_hierarchical_namespace_enabled_ = false; + enabled_ = false; } else if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { // Azurite returns NotFound. try { filesystem_client.GetProperties(); - is_hierarchical_namespace_enabled_ = false; + enabled_ = false; } catch (const Azure::Storage::StorageException& exception) { - return ErrorToStatus("Failed to confirm '" + filesystem_client.GetUrl() + - "' is an accessible container. Therefore the " - "hierarchical namespace check was invalid.", - exception); + return ExceptionToStatus("Failed to confirm '" + filesystem_client.GetUrl() + + "' is an accessible container. Therefore the " + "hierarchical namespace check was invalid.", + exception); } } else { - return ErrorToStatus( + return ExceptionToStatus( "GetAccessControlList for '" + directory_client.GetUrl() + "' failed with an unexpected Azure error, while checking " "whether the storage account has hierarchical namespace enabled.", exception); } } - return is_hierarchical_namespace_enabled_.value(); + return enabled_.value(); } -} // namespace internal -} // namespace fs -} // namespace arrow +} // namespace arrow::fs::internal diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h index f762f8c1add..e238c198e51 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.h +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -23,12 +23,10 @@ #include "arrow/result.h" -namespace arrow { -namespace fs { -namespace internal { +namespace arrow::fs::internal { -Status ErrorToStatus(const std::string& prefix, - const Azure::Storage::StorageException& exception); +Status ExceptionToStatus(const std::string& prefix, + const Azure::Storage::StorageException& exception); class HierarchicalNamespaceDetector { public: @@ -39,9 +37,7 @@ class HierarchicalNamespaceDetector { private: std::shared_ptr datalake_service_client_; - std::optional is_hierarchical_namespace_enabled_; + std::optional enabled_; }; -} // namespace internal -} // namespace fs -} // namespace arrow +} // namespace arrow::fs::internal diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index e3fb655437f..46ea436a9f3 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -197,19 +197,9 @@ Status AssertNoTrailingSlash(std::string_view key) { return Status::OK(); } -bool HasTrailingSlash(std::string_view key) { - if (key.back() != '/') { - return false; - } - return true; -} +bool HasTrailingSlash(std::string_view key) { return key.back() == '/'; } -bool HasLeadingSlash(std::string_view key) { - if (key.front() != '/') { - return false; - } - return true; -} +bool HasLeadingSlash(std::string_view key) { return key.front() == '/'; } Result MakeAbstractPathRelative(const std::string& base, const std::string& path) { From 2486c86353de6cf10a6089e3950eb461b0b51906 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Mon, 6 Nov 2023 10:25:08 +0000 Subject: [PATCH 38/45] PR comments 2 --- cpp/src/arrow/filesystem/azurefs_test.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 8e8af019762..cb205915e2f 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -255,8 +255,8 @@ class AzuriteFileSystemTest : public AzureFileSystemTest { class AzureFlatNamespaceFileSystemTest : public AzureFileSystemTest { Result MakeOptions() override { AzureOptions options; - if (char* account_name = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_NAME")) { - char* account_key = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_KEY"); + if (const auto account_name = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_NAME")) { + const auto account_key = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_KEY"); EXPECT_THAT(account_key, NotNull()); ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); return options; @@ -270,8 +270,8 @@ class AzureFlatNamespaceFileSystemTest : public AzureFileSystemTest { class AzureHierarchicalNamespaceFileSystemTest : public AzureFileSystemTest { Result MakeOptions() override { AzureOptions options; - if (char* account_name = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_NAME")) { - char* account_key = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_KEY"); + if (const auto account_name = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_NAME")) { + const auto account_key = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_KEY"); EXPECT_THAT(account_key, NotNull()); ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); return options; @@ -307,16 +307,16 @@ TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespaceFailsWithMissingContain } TEST_F(AzuriteFileSystemTest, GetFileInfoAccount) { - arrow::fs::AssertFileInfo(fs_.get(), "", FileType::Directory); + AssertFileInfo(fs_.get(), "", FileType::Directory); // URI ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://")); } TEST_F(AzuriteFileSystemTest, GetFileInfoContainer) { - arrow::fs::AssertFileInfo(fs_.get(), PreexistingContainerName(), FileType::Directory); + AssertFileInfo(fs_.get(), PreexistingContainerName(), FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), "non-existent-container", FileType::NotFound); + AssertFileInfo(fs_.get(), "non-existent-container", FileType::NotFound); // URI ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingContainerName())); @@ -382,7 +382,7 @@ void AzureFileSystemTest::RunGetFileInfoObjectTest() { .GetProperties() .Value; - arrow::fs::AssertFileInfo( + AssertFileInfo( fs_.get(), PreexistingObjectPath(), FileType::File, std::chrono::system_clock::time_point(object_properties.LastModified), static_cast(object_properties.BlobSize)); From fb03dd86005421930e4787158a2d6663feaf6f73 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Mon, 6 Nov 2023 21:18:13 +0000 Subject: [PATCH 39/45] PR comments: shared_ptr -> unique_ptr --- cpp/src/arrow/filesystem/azurefs.cc | 4 ++-- cpp/src/arrow/filesystem/azurefs_test.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 3c8aa62b733..0df054c8388 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -459,7 +459,7 @@ class AzureFileSystem::Impl { io::IOContext io_context_; std::shared_ptr datalake_service_client_; - std::shared_ptr blob_service_client_; + std::unique_ptr blob_service_client_; AzureOptions options_; internal::HierarchicalNamespaceDetector hierarchical_namespace_; @@ -467,7 +467,7 @@ class AzureFileSystem::Impl { : io_context_(io_context), options_(std::move(options)) {} Status Init() { - blob_service_client_ = std::make_shared( + blob_service_client_ = std::make_unique( options_.account_blob_url, options_.storage_credentials_provider); datalake_service_client_ = std::make_shared( diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index cb205915e2f..47ff83101ec 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -146,7 +146,7 @@ TEST(AzureFileSystem, OptionsCompare) { class AzureFileSystemTest : public ::testing::Test { public: std::shared_ptr fs_; - std::shared_ptr blob_service_client_; + std::unique_ptr blob_service_client_; std::shared_ptr datalake_service_client_; AzureOptions options_; @@ -167,7 +167,7 @@ class AzureFileSystemTest : public ::testing::Test { GTEST_SKIP() << options.status().message(); } container_name_ = RandomChars(32); - blob_service_client_ = std::make_shared( + blob_service_client_ = std::make_unique( options_.account_blob_url, options_.storage_credentials_provider); datalake_service_client_ = std::make_shared( From c8370b36a582b314b9161626e2413ec687f8ab1e Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Mon, 6 Nov 2023 22:36:11 +0000 Subject: [PATCH 40/45] Adjust handling of missing credentials --- cpp/src/arrow/filesystem/azurefs_test.cc | 27 ++++++++++++------------ 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 47ff83101ec..4f32cd9d7f8 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -161,7 +161,7 @@ class AzureFileSystemTest : public ::testing::Test { void SetUp() override { auto options = MakeOptions(); if (options.ok()) { - options_ = options.ValueOrDie(); + options_ = *options; } else { suite_skipped_ = true; GTEST_SKIP() << options.status().message(); @@ -255,10 +255,10 @@ class AzuriteFileSystemTest : public AzureFileSystemTest { class AzureFlatNamespaceFileSystemTest : public AzureFileSystemTest { Result MakeOptions() override { AzureOptions options; - if (const auto account_name = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_NAME")) { - const auto account_key = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_KEY"); - EXPECT_THAT(account_key, NotNull()); - ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); + const auto account_key = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_KEY"); + const auto account_name = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_NAME"); + if (account_key && account_name) { + RETURN_NOT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); return options; } return Status::Cancelled( @@ -270,14 +270,14 @@ class AzureFlatNamespaceFileSystemTest : public AzureFileSystemTest { class AzureHierarchicalNamespaceFileSystemTest : public AzureFileSystemTest { Result MakeOptions() override { AzureOptions options; - if (const auto account_name = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_NAME")) { - const auto account_key = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_KEY"); - EXPECT_THAT(account_key, NotNull()); - ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); + const auto account_key = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_KEY"); + const auto account_name = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_NAME"); + if (account_key && account_name) { + RETURN_NOT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); return options; } return Status::Cancelled( - "Connection details not provided for a real hierachical namespace " + "Connection details not provided for a real hierarchical namespace " "account."); } }; @@ -382,10 +382,9 @@ void AzureFileSystemTest::RunGetFileInfoObjectTest() { .GetProperties() .Value; - AssertFileInfo( - fs_.get(), PreexistingObjectPath(), FileType::File, - std::chrono::system_clock::time_point(object_properties.LastModified), - static_cast(object_properties.BlobSize)); + AssertFileInfo(fs_.get(), PreexistingObjectPath(), FileType::File, + std::chrono::system_clock::time_point(object_properties.LastModified), + static_cast(object_properties.BlobSize)); // URI ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingObjectName())); From 25a0c7682f8fb71ddb206898b9f0134d90c49e1e Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 7 Nov 2023 09:52:38 +0000 Subject: [PATCH 41/45] Fix namespacing issue after rebase --- cpp/src/arrow/filesystem/azurefs_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 4f32cd9d7f8..cbf05ec524d 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -485,7 +485,7 @@ std::shared_ptr NormalizerKeyValueMetadata( value = "2023-10-31T08:15:20Z"; } } else if (key == "ETag") { - if (internal::StartsWith(value, "\"") && internal::EndsWith(value, "\"")) { + if (arrow::internal::StartsWith(value, "\"") && arrow::internal::EndsWith(value, "\"")) { // Valid value value = "\"ETagValue\""; } From 5540c70235d58e80887f2861dd3e285905b12aa7 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 7 Nov 2023 12:47:42 +0000 Subject: [PATCH 42/45] Auto-formatting --- cpp/src/arrow/filesystem/azurefs_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index cbf05ec524d..8b656184155 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -485,7 +485,8 @@ std::shared_ptr NormalizerKeyValueMetadata( value = "2023-10-31T08:15:20Z"; } } else if (key == "ETag") { - if (arrow::internal::StartsWith(value, "\"") && arrow::internal::EndsWith(value, "\"")) { + if (arrow::internal::StartsWith(value, "\"") && + arrow::internal::EndsWith(value, "\"")) { // Valid value value = "\"ETagValue\""; } From 3fd35c6c3790c5307b0cfa2bf1e668a7820878be Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 8 Nov 2023 09:04:30 +0000 Subject: [PATCH 43/45] PR comments --- cpp/src/arrow/filesystem/azurefs.cc | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 0df054c8388..ace358c7817 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -148,10 +148,6 @@ Status ValidateFilePath(const AzurePath& path) { return Status::OK(); } -bool ContainerOrBlobNotFound(const Azure::Storage::StorageException& exception) { - return exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound; -} - template std::string FormatValue(typename TypeTraits::CType value) { struct StringAppender { @@ -316,7 +312,7 @@ class ObjectInputFile final : public io::RandomAccessFile { metadata_ = PropertiesToMetadata(properties.Value); return Status::OK(); } catch (const Azure::Storage::StorageException& exception) { - if (ContainerOrBlobNotFound(exception)) { + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { return PathNotFound(path_); } return internal::ExceptionToStatus( @@ -502,7 +498,7 @@ class AzureFileSystem::Impl { std::chrono::system_clock::time_point(properties.Value.LastModified)); return info; } catch (const Azure::Storage::StorageException& exception) { - if (ContainerOrBlobNotFound(exception)) { + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { info.set_type(FileType::NotFound); return info; } @@ -533,7 +529,7 @@ class AzureFileSystem::Impl { std::chrono::system_clock::time_point(properties.Value.LastModified)); return info; } catch (const Azure::Storage::StorageException& exception) { - if (ContainerOrBlobNotFound(exception)) { + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { ARROW_ASSIGN_OR_RAISE(bool hierarchical_namespace_enabled, hierarchical_namespace_.Enabled(path.container)); if (hierarchical_namespace_enabled) { @@ -560,11 +556,10 @@ class AzureFileSystem::Impl { .ListBlobs(list_blob_options); if (paged_list_result.Blobs.size() > 0) { info.set_type(FileType::Directory); - return info; } else { info.set_type(FileType::NotFound); - return info; } + return info; } catch (const Azure::Storage::StorageException& exception) { return internal::ExceptionToStatus( "ListBlobs for '" + prefix + From 44f6aa570e2cf2dc82dec9314774d676fd859c2d Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 8 Nov 2023 09:42:51 +0000 Subject: [PATCH 44/45] Use unique_ptr for datalake_service_client --- cpp/src/arrow/filesystem/azurefs.cc | 8 ++++---- cpp/src/arrow/filesystem/azurefs_internal.cc | 3 +-- cpp/src/arrow/filesystem/azurefs_internal.h | 7 +++---- cpp/src/arrow/filesystem/azurefs_test.cc | 12 ++++++------ 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index ace358c7817..6cbc3d22ddb 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -453,7 +453,7 @@ class ObjectInputFile final : public io::RandomAccessFile { class AzureFileSystem::Impl { public: io::IOContext io_context_; - std::shared_ptr + std::unique_ptr datalake_service_client_; std::unique_ptr blob_service_client_; AzureOptions options_; @@ -466,9 +466,9 @@ class AzureFileSystem::Impl { blob_service_client_ = std::make_unique( options_.account_blob_url, options_.storage_credentials_provider); datalake_service_client_ = - std::make_shared( + std::make_unique( options_.account_dfs_url, options_.storage_credentials_provider); - RETURN_NOT_OK(hierarchical_namespace_.Init(datalake_service_client_)); + RETURN_NOT_OK(hierarchical_namespace_.Init(datalake_service_client_.get())); return Status::OK(); } @@ -559,7 +559,7 @@ class AzureFileSystem::Impl { } else { info.set_type(FileType::NotFound); } - return info; + return info; } catch (const Azure::Storage::StorageException& exception) { return internal::ExceptionToStatus( "ListBlobs for '" + prefix + diff --git a/cpp/src/arrow/filesystem/azurefs_internal.cc b/cpp/src/arrow/filesystem/azurefs_internal.cc index cc3632afc78..3e545d670cb 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.cc +++ b/cpp/src/arrow/filesystem/azurefs_internal.cc @@ -29,8 +29,7 @@ Status ExceptionToStatus(const std::string& prefix, } Status HierarchicalNamespaceDetector::Init( - std::shared_ptr - datalake_service_client) { + Azure::Storage::Files::DataLake::DataLakeServiceClient* datalake_service_client) { datalake_service_client_ = datalake_service_client; return Status::OK(); } diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h index e238c198e51..c3da96239a1 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.h +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -30,13 +30,12 @@ Status ExceptionToStatus(const std::string& prefix, class HierarchicalNamespaceDetector { public: - Status Init(std::shared_ptr - datalake_service_client); + Status Init( + Azure::Storage::Files::DataLake::DataLakeServiceClient* datalake_service_client); Result Enabled(const std::string& container_name); private: - std::shared_ptr - datalake_service_client_; + Azure::Storage::Files::DataLake::DataLakeServiceClient* datalake_service_client_; std::optional enabled_; }; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 8b656184155..c08a4b50b77 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -147,7 +147,7 @@ class AzureFileSystemTest : public ::testing::Test { public: std::shared_ptr fs_; std::unique_ptr blob_service_client_; - std::shared_ptr + std::unique_ptr datalake_service_client_; AzureOptions options_; std::mt19937_64 generator_; @@ -170,7 +170,7 @@ class AzureFileSystemTest : public ::testing::Test { blob_service_client_ = std::make_unique( options_.account_blob_url, options_.storage_credentials_provider); datalake_service_client_ = - std::make_shared( + std::make_unique( options_.account_dfs_url, options_.storage_credentials_provider); ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); auto container_client = blob_service_client_->GetBlobContainerClient(container_name_); @@ -284,25 +284,25 @@ class AzureHierarchicalNamespaceFileSystemTest : public AzureFileSystemTest { TEST_F(AzureFlatNamespaceFileSystemTest, DetectHierarchicalNamespace) { auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); - ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); } TEST_F(AzureHierarchicalNamespaceFileSystemTest, DetectHierarchicalNamespace) { auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); - ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); ASSERT_OK_AND_EQ(true, hierarchical_namespace.Enabled(PreexistingContainerName())); } TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespace) { auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); - ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); } TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespaceFailsWithMissingContainer) { auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); - ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_)); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); ASSERT_NOT_OK(hierarchical_namespace.Enabled("non-existent-container")); } From 0659a392ef09925030d4c64721450aee3e273d1c Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 9 Nov 2023 11:14:10 +0900 Subject: [PATCH 45/45] Use auto --- cpp/src/arrow/filesystem/azurefs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 6cbc3d22ddb..6359183d90b 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -530,7 +530,7 @@ class AzureFileSystem::Impl { return info; } catch (const Azure::Storage::StorageException& exception) { if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { - ARROW_ASSIGN_OR_RAISE(bool hierarchical_namespace_enabled, + ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, hierarchical_namespace_.Enabled(path.container)); if (hierarchical_namespace_enabled) { // If the hierarchical namespace is enabled, then the storage account will have