From 1f57b8ce869fab961c865549c82e7cc1afcd4e10 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 19 Dec 2023 22:43:33 -0300 Subject: [PATCH 1/9] Replace shared_ptr parameter with & parameter --- cpp/src/arrow/filesystem/azurefs.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 032cd034e7a..7315455cf86 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -571,12 +571,12 @@ class ObjectInputFile final : public io::RandomAccessFile { std::shared_ptr metadata_; }; -Status CreateEmptyBlockBlob(std::shared_ptr block_blob_client) { +Status CreateEmptyBlockBlob(Blobs::BlockBlobClient& block_blob_client) { try { - block_blob_client->UploadFrom(nullptr, 0); + block_blob_client.UploadFrom(nullptr, 0); } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "UploadFrom failed for '" + block_blob_client->GetUrl() + + "UploadFrom failed for '" + block_blob_client.GetUrl() + "'. There is no existing blob at this location or the existing blob must be " "replaced so ObjectAppendStream must create a new empty block blob.", exception); @@ -659,7 +659,7 @@ class ObjectAppendStream final : public io::OutputStream { pos_ = content_length_; } catch (const Storage::StorageException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { - RETURN_NOT_OK(CreateEmptyBlockBlob(block_blob_client_)); + RETURN_NOT_OK(CreateEmptyBlockBlob(*block_blob_client_)); } else { return ExceptionToStatus( "GetProperties failed for '" + block_blob_client_->GetUrl() + @@ -1349,7 +1349,7 @@ class AzureFileSystem::Impl { std::shared_ptr stream; if (truncate) { - RETURN_NOT_OK(CreateEmptyBlockBlob(block_blob_client)); + RETURN_NOT_OK(CreateEmptyBlockBlob(*block_blob_client)); stream = std::make_shared(block_blob_client, fs->io_context(), location, metadata, options_, 0); } else { From 02c3146ae8aacbe8a47bb0201c8c05349484450d Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 18 Dec 2023 22:31:43 -0300 Subject: [PATCH 2/9] Move function closer to the metadata handling code --- cpp/src/arrow/filesystem/azurefs.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 7315455cf86..614dd94e1bb 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -418,6 +418,15 @@ std::shared_ptr PropertiesToMetadata( return metadata; } +Storage::Metadata ArrowMetadataToAzureMetadata( + const std::shared_ptr& arrow_metadata) { + Storage::Metadata azure_metadata; + for (auto key_value : arrow_metadata->sorted_pairs()) { + azure_metadata[key_value.first] = key_value.second; + } + return azure_metadata; +} + class ObjectInputFile final : public io::RandomAccessFile { public: ObjectInputFile(std::shared_ptr blob_client, @@ -596,15 +605,6 @@ Result GetBlockList( } } -Storage::Metadata ArrowMetadataToAzureMetadata( - const std::shared_ptr& arrow_metadata) { - Storage::Metadata azure_metadata; - for (auto key_value : arrow_metadata->sorted_pairs()) { - azure_metadata[key_value.first] = key_value.second; - } - return azure_metadata; -} - Status CommitBlockList(std::shared_ptr block_blob_client, const std::vector& block_ids, const Storage::Metadata& metadata) { From 67602dc328d575c1c442795966573188b9eff75e Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 16 Dec 2023 01:29:01 -0300 Subject: [PATCH 3/9] CreateDir,DeleteDir: Change the error message for empty container name in AzureLocation --- cpp/src/arrow/filesystem/azurefs.cc | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 614dd94e1bb..e22b36aa019 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1241,7 +1241,7 @@ class AzureFileSystem::Impl { Status CreateDir(const AzureLocation& location) { if (location.container.empty()) { - return Status::Invalid("Cannot create an empty container"); + return Status::Invalid("CreateDir requires a non-empty path."); } auto container_client = @@ -1249,13 +1249,9 @@ class AzureFileSystem::Impl { if (location.path.empty()) { try { auto response = container_client.Create(); - if (response.Value.Created) { - return Status::OK(); - } else { - return StatusFromErrorResponse( - container_client.GetUrl(), *response.RawResponse, - "Failed to create a container: " + location.container); - } + return response.Value.Created + ? Status::OK() + : Status::AlreadyExists("Directory already exists: " + location.all); } catch (const Storage::StorageException& exception) { return ExceptionToStatus("Failed to create a container: " + location.container + ": " + container_client.GetUrl(), @@ -1299,7 +1295,7 @@ class AzureFileSystem::Impl { Status CreateDirRecursive(const AzureLocation& location) { if (location.container.empty()) { - return Status::Invalid("Cannot create an empty container"); + return Status::Invalid("CreateDir requires a non-empty path."); } auto container_client = @@ -1434,7 +1430,7 @@ class AzureFileSystem::Impl { public: Status DeleteDir(const AzureLocation& location) { if (location.container.empty()) { - return Status::Invalid("Cannot delete an empty container"); + return Status::Invalid("DeleteDir requires a non-empty path."); } auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); From 5de71a3768b23d2064395bbc55bfea5549088545 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 20 Dec 2023 00:42:55 -0300 Subject: [PATCH 4/9] Forwards arguments to ExceptionToStatus->..->Status::FromArgs This simplifies the creation of long error messages and leads to the use of a string builder to construct the error message. --- cpp/src/arrow/filesystem/azurefs.cc | 150 +++++++++++++--------------- 1 file changed, 70 insertions(+), 80 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index e22b36aa019..fe175916db5 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -217,9 +217,11 @@ struct AzureLocation { } }; -Status ExceptionToStatus(const std::string& prefix, - const Azure::Storage::StorageException& exception) { - return Status::IOError(prefix, " Azure Error: ", exception.what()); +template +Status ExceptionToStatus(const Storage::StorageException& exception, + PrefixArgs&&... prefix_args) { + return Status::IOError(std::forward(prefix_args)..., + " Azure Error: ", exception.what()); } Status PathNotFound(const AzureLocation& location) { @@ -452,9 +454,8 @@ class ObjectInputFile final : public io::RandomAccessFile { return PathNotFound(location_); } return ExceptionToStatus( - "GetProperties failed for '" + blob_client_->GetUrl() + - "'. Cannot initialise an ObjectInputFile without knowing the file size.", - exception); + exception, "GetProperties failed for '", blob_client_->GetUrl(), + "'. Cannot initialise an ObjectInputFile without knowing the file size."); } } @@ -532,10 +533,9 @@ class ObjectInputFile final : public io::RandomAccessFile { .Value.ContentRange.Length.Value(); } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "DownloadTo from '" + blob_client_->GetUrl() + "' at position " + - std::to_string(position) + " for " + std::to_string(nbytes) + - " bytes failed. ReadAt failed to read the required byte range.", - exception); + exception, "DownloadTo from '", blob_client_->GetUrl(), "' at position ", + position, " for ", nbytes, + " bytes failed. ReadAt failed to read the required byte range."); } } @@ -585,10 +585,9 @@ Status CreateEmptyBlockBlob(Blobs::BlockBlobClient& block_blob_client) { block_blob_client.UploadFrom(nullptr, 0); } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "UploadFrom failed for '" + block_blob_client.GetUrl() + - "'. There is no existing blob at this location or the existing blob must be " - "replaced so ObjectAppendStream must create a new empty block blob.", - exception); + exception, "UploadFrom failed for '", block_blob_client.GetUrl(), + "'. There is no existing blob at this location or the existing blob must be " + "replaced so ObjectAppendStream must create a new empty block blob."); } return Status::OK(); } @@ -599,9 +598,8 @@ Result GetBlockList( return block_blob_client->GetBlockList().Value; } catch (Storage::StorageException& exception) { return ExceptionToStatus( - "GetBlockList failed for '" + block_blob_client->GetUrl() + - "'. Cannot write to a file without first fetching the existing block list.", - exception); + exception, "GetBlockList failed for '", block_blob_client->GetUrl(), + "'. Cannot write to a file without first fetching the existing block list."); } } @@ -618,9 +616,8 @@ Status CommitBlockList(std::shared_ptr block_bl block_blob_client->CommitBlockList(block_ids, options); } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "CommitBlockList failed for '" + block_blob_client->GetUrl() + - "'. Committing is required to flush an output/append stream.", - exception); + exception, "CommitBlockList failed for '", block_blob_client->GetUrl(), + "'. Committing is required to flush an output/append stream."); } return Status::OK(); } @@ -662,10 +659,9 @@ class ObjectAppendStream final : public io::OutputStream { RETURN_NOT_OK(CreateEmptyBlockBlob(*block_blob_client_)); } else { return ExceptionToStatus( - "GetProperties failed for '" + block_blob_client_->GetUrl() + - "'. Cannot initialise an ObjectAppendStream without knowing whether a " - "file already exists at this path, and if it exists, its size.", - exception); + exception, "GetProperties failed for '", block_blob_client_->GetUrl(), + "'. Cannot initialise an ObjectAppendStream without knowing whether a " + "file already exists at this path, and if it exists, its size."); } content_length_ = 0; } @@ -760,10 +756,9 @@ class ObjectAppendStream final : public io::OutputStream { block_blob_client_->StageBlock(new_block_id, block_content); } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "StageBlock failed for '" + block_blob_client_->GetUrl() + "' new_block_id: '" + - new_block_id + - "'. Staging new blocks is fundamental to streaming writes to blob storage.", - exception); + exception, "StageBlock failed for '", block_blob_client_->GetUrl(), + "' new_block_id: '", new_block_id, + "'. Staging new blocks is fundamental to streaming writes to blob storage."); } block_ids_.push_back(new_block_id); pos_ += nbytes; @@ -835,9 +830,9 @@ Result CheckIfHierarchicalNamespaceIsEnabled( if (exception.ErrorCode == "HierarchicalNamespaceNotEnabled") { return HNSSupport::kDisabled; } - return ExceptionToStatus("Check for Hierarchical Namespace support on '" + - adlfs_client.GetUrl() + "' failed.", - exception); + return ExceptionToStatus(exception, + "Check for Hierarchical Namespace support on '", + adlfs_client.GetUrl(), "' failed."); } } } @@ -855,6 +850,8 @@ namespace { // "filesystem". Creating a container using the Blob Storage API will make it // accessible using the Data Lake Storage Gen 2 API and vice versa. +const std::string kDelimiter = std::string{internal::kSep}; + template Result GetContainerPropsAsFileInfo(const std::string& container_name, ContainerClient& container_client) { @@ -869,8 +866,8 @@ Result GetContainerPropsAsFileInfo(const std::string& container_name, info.set_type(FileType::NotFound); return info; } - return ExceptionToStatus( - "GetProperties for '" + container_client.GetUrl() + "' failed.", exception); + return ExceptionToStatus(exception, "GetProperties for '", container_client.GetUrl(), + "' failed."); } } @@ -1011,16 +1008,14 @@ class AzureFileSystem::Impl { return info; } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "ListBlobs failed for prefix='" + *list_blob_options.Prefix + - "' failed. GetFileInfo is unable to determine whether the path should " - "be considered an implied directory.", - exception); + exception, "ListBlobs failed for prefix='", *list_blob_options.Prefix, + "' failed. GetFileInfo is unable to determine whether the path should " + "be considered an implied directory."); } } return ExceptionToStatus( - "GetProperties failed for '" + file_client.GetUrl() + - "' GetFileInfo is unable to determine whether the path exists.", - exception); + exception, "GetProperties failed for '", file_client.GetUrl(), + "' GetFileInfo is unable to determine whether the path exists."); } } @@ -1038,7 +1033,7 @@ class AzureFileSystem::Impl { } } } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to list account containers.", exception); + return ExceptionToStatus(exception, "Failed to list account containers."); } return Status::OK(); } @@ -1153,9 +1148,9 @@ class AzureFileSystem::Impl { if (IsContainerNotFound(exception)) { found = false; } else { - return ExceptionToStatus("Failed to list blobs in a directory: " + - select.base_dir + ": " + container_client.GetUrl(), - exception); + return ExceptionToStatus(exception, + "Failed to list blobs in a directory: ", select.base_dir, + ": ", container_client.GetUrl()); } } @@ -1253,9 +1248,9 @@ class AzureFileSystem::Impl { ? Status::OK() : Status::AlreadyExists("Directory already exists: " + location.all); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to create a container: " + location.container + - ": " + container_client.GetUrl(), - exception); + return ExceptionToStatus(exception, + "Failed to create a container: ", location.container, + ": ", container_client.GetUrl()); } } @@ -1287,9 +1282,8 @@ class AzureFileSystem::Impl { "Failed to create a directory: " + location.path); } } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to create a directory: " + location.path + ": " + - directory_client.GetUrl(), - exception); + return ExceptionToStatus(exception, "Failed to create a directory: ", location.path, + ": ", directory_client.GetUrl()); } } @@ -1303,9 +1297,9 @@ class AzureFileSystem::Impl { try { container_client.CreateIfNotExists(); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to create a container: " + location.container + - " (" + container_client.GetUrl() + ")", - exception); + return ExceptionToStatus(exception, + "Failed to create a container: ", location.container, " (", + container_client.GetUrl(), ")"); } auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); @@ -1324,9 +1318,9 @@ class AzureFileSystem::Impl { try { directory_client.CreateIfNotExists(); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to create a directory: " + location.path + " (" + - directory_client.GetUrl() + ")", - exception); + return ExceptionToStatus(exception, + "Failed to create a directory: ", location.path, " (", + directory_client.GetUrl(), ")"); } } @@ -1389,9 +1383,8 @@ class AzureFileSystem::Impl { try { container_client.SubmitBatch(batch); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to delete blobs in a directory: " + - location.path + ": " + container_client.GetUrl(), - exception); + return ExceptionToStatus(exception, "Failed to delete blobs in a directory: ", + location.path, ": ", container_client.GetUrl()); } std::vector failed_blob_names; for (size_t i = 0; i < deferred_responses.size(); ++i) { @@ -1420,9 +1413,9 @@ class AzureFileSystem::Impl { } } } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to list blobs in a directory: " + location.path + - ": " + container_client.GetUrl(), - exception); + return ExceptionToStatus(exception, + "Failed to list blobs in a directory: ", location.path, + ": ", container_client.GetUrl()); } return Status::OK(); } @@ -1452,9 +1445,9 @@ class AzureFileSystem::Impl { "Failed to delete a container: " + location.container); } } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to delete a container: " + location.container + - ": " + container_client.GetUrl(), - exception); + return ExceptionToStatus(exception, + "Failed to delete a container: ", location.container, + ": ", container_client.GetUrl()); } } @@ -1470,9 +1463,9 @@ class AzureFileSystem::Impl { "Failed to delete a directory: " + location.path); } } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to delete a directory: " + location.path + ": " + - directory_client.GetUrl(), - exception); + return ExceptionToStatus(exception, + "Failed to delete a directory: ", location.path, ": ", + directory_client.GetUrl()); } } else { return DeleteDirContentsWithoutHierarchicalNamespace(location, @@ -1503,9 +1496,8 @@ class AzureFileSystem::Impl { sub_directory_client.DeleteRecursive(); } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "Failed to delete a sub directory: " + location.container + - internal::kSep + path.Name + ": " + sub_directory_client.GetUrl(), - exception); + exception, "Failed to delete a sub directory: ", location.container, + kDelimiter, path.Name, ": ", sub_directory_client.GetUrl()); } } else { auto sub_file_client = adlfs_client.GetFileClient(path.Name); @@ -1513,9 +1505,8 @@ class AzureFileSystem::Impl { sub_file_client.Delete(); } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "Failed to delete a sub file: " + location.container + - internal::kSep + path.Name + ": " + sub_file_client.GetUrl(), - exception); + exception, "Failed to delete a sub file: ", location.container, + kDelimiter, path.Name, ": ", sub_file_client.GetUrl()); } } } @@ -1524,9 +1515,9 @@ class AzureFileSystem::Impl { if (missing_dir_ok && exception.StatusCode == Http::HttpStatusCode::NotFound) { return Status::OK(); } else { - return ExceptionToStatus("Failed to delete directory contents: " + - location.path + ": " + directory_client.GetUrl(), - exception); + return ExceptionToStatus(exception, + "Failed to delete directory contents: ", location.path, + ": ", directory_client.GetUrl()); } } return Status::OK(); @@ -1549,9 +1540,8 @@ class AzureFileSystem::Impl { try { dest_blob_client.CopyFromUri(src_url); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus( - "Failed to copy a blob. (" + src_url + " -> " + dest_blob_client.GetUrl() + ")", - exception); + return ExceptionToStatus(exception, "Failed to copy a blob. (", src_url, " -> ", + dest_blob_client.GetUrl(), ")"); } return Status::OK(); } From 57ec6caa38f587380a4d424715bc361f5c572cb6 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 20 Dec 2023 12:15:38 -0300 Subject: [PATCH 5/9] Move CheckIfHierarchicalNamespaceIsEnabled() signature to azurefs_internal.h But keep implementation in azurefs.cc to avoid having to expose currently azurefs.cc-private symbols in headers. --- cpp/src/arrow/filesystem/azurefs.cc | 3 +- cpp/src/arrow/filesystem/azurefs.h | 49 +------------ cpp/src/arrow/filesystem/azurefs_internal.h | 78 +++++++++++++++++++++ cpp/src/arrow/filesystem/azurefs_test.cc | 12 ++-- 4 files changed, 88 insertions(+), 54 deletions(-) create mode 100644 cpp/src/arrow/filesystem/azurefs_internal.h diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index fe175916db5..dbb22cfd287 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -16,6 +16,7 @@ // under the License. #include "arrow/filesystem/azurefs.h" +#include "arrow/filesystem/azurefs_internal.h" #include #include @@ -41,7 +42,7 @@ namespace DataLake = Azure::Storage::Files::DataLake; namespace Http = Azure::Core::Http; namespace Storage = Azure::Storage; -using internal::HNSSupport; +using HNSSupport = internal::HierarchicalNamespaceSupport; // ----------------------------------------------------------------------- // AzureOptions Implementation diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index b7ef2bb3130..f97cfb423f2 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -21,6 +21,7 @@ #include #include +#include "arrow/filesystem/azurefs_internal.h" #include "arrow/filesystem/filesystem.h" #include "arrow/util/macros.h" #include "arrow/util/uri.h" @@ -118,54 +119,6 @@ struct ARROW_EXPORT AzureOptions { MakeDataLakeServiceClient() const; }; -namespace internal { - -enum class HNSSupport { - kUnknown = 0, - kContainerNotFound = 1, - kDisabled = 2, - kEnabled = 3, -}; - -/// \brief Performs a request to check if the storage account has Hierarchical -/// Namespace support enabled. -/// -/// This check requires a DataLakeFileSystemClient for any container of the -/// storage account. If the container doesn't exist yet, we just forward that -/// error to the caller (kContainerNotFound) since that's a proper error to the operation -/// on that container anyways -- no need to try again with or without the knowledge of -/// Hierarchical Namespace support. -/// -/// Hierarchical Namespace support can't easily be changed after the storage account is -/// created and the feature is shared by all containers in the storage account. -/// This means the result of this check can (and should!) be cached as soon as -/// it returns a successful result on any container of the storage account (see -/// AzureFileSystem::Impl). -/// -/// The check consists of a call to DataLakeFileSystemClient::GetAccessControlList() -/// on the root directory of the container. An approach taken by the Hadoop Azure -/// project [1]. A more obvious approach would be to call -/// BlobServiceClient::GetAccountInfo(), but that endpoint requires elevated -/// permissions [2] that we can't generally rely on. -/// -/// [1]: -/// https://github.com/apache/hadoop/blob/7c6af6a5f626d18d68b656d085cc23e4c1f7a1ef/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystemStore.java#L356. -/// [2]: -/// https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-service-properties?tabs=azure-ad#authorization -/// -/// IMPORTANT: If the result is kEnabled or kDisabled, it doesn't necessarily mean that -/// the container exists. -/// -/// \param adlfs_client A DataLakeFileSystemClient for a container of the storage -/// account. -/// \return kEnabled/kDisabled/kContainerNotFound (kUnknown is never -/// returned). -Result CheckIfHierarchicalNamespaceIsEnabled( - Azure::Storage::Files::DataLake::DataLakeFileSystemClient& adlfs_client, - const AzureOptions& options); - -} // namespace internal - /// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and /// Azure Data Lake Storage Gen2 (ADLS Gen2) [2]. /// diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h new file mode 100644 index 00000000000..13d84c9b542 --- /dev/null +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/result.h" + +namespace Azure::Storage::Files::DataLake { +class DataLakeFileSystemClient; +class DataLakeServiceClient; +} // namespace Azure::Storage::Files::DataLake + +namespace arrow::fs { + +struct AzureOptions; + +namespace internal { + +enum class HierarchicalNamespaceSupport { + kUnknown = 0, + kContainerNotFound = 1, + kDisabled = 2, + kEnabled = 3, +}; + +/// \brief Performs a request to check if the storage account has Hierarchical +/// Namespace support enabled. +/// +/// This check requires a DataLakeFileSystemClient for any container of the +/// storage account. If the container doesn't exist yet, we just forward that +/// error to the caller (kContainerNotFound) since that's a proper error to the operation +/// on that container anyways -- no need to try again with or without the knowledge of +/// Hierarchical Namespace support. +/// +/// Hierarchical Namespace support can't easily be changed after the storage account is +/// created and the feature is shared by all containers in the storage account. +/// This means the result of this check can (and should!) be cached as soon as +/// it returns a successful result on any container of the storage account (see +/// AzureFileSystem::Impl). +/// +/// The check consists of a call to DataLakeFileSystemClient::GetAccessControlList() +/// on the root directory of the container. An approach taken by the Hadoop Azure +/// project [1]. A more obvious approach would be to call +/// BlobServiceClient::GetAccountInfo(), but that endpoint requires elevated +/// permissions [2] that we can't generally rely on. +/// +/// [1]: +/// https://github.com/apache/hadoop/blob/7c6af6a5f626d18d68b656d085cc23e4c1f7a1ef/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystemStore.java#L356. +/// [2]: +/// https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-service-properties?tabs=azure-ad#authorization +/// +/// IMPORTANT: If the result is kEnabled or kDisabled, it doesn't necessarily mean that +/// the container exists. +/// +/// \param adlfs_client A DataLakeFileSystemClient for a container of the storage +/// account. +/// \return kEnabled/kDisabled/kContainerNotFound (kUnknown is never +/// returned). +Result CheckIfHierarchicalNamespaceIsEnabled( + Azure::Storage::Files::DataLake::DataLakeFileSystemClient& adlfs_client, + const arrow::fs::AzureOptions& options); + +} // namespace internal +} // namespace arrow::fs diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index db0e133e0d4..a3dd898755e 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -72,6 +72,8 @@ namespace Blobs = Azure::Storage::Blobs; namespace Core = Azure::Core; namespace DataLake = Azure::Storage::Files::DataLake; +using HNSSupport = internal::HierarchicalNamespaceSupport; + enum class AzureBackend { /// \brief Official Azure Remote Backend kAzure, @@ -629,9 +631,9 @@ void TestAzureFileSystem::TestDetectHierarchicalNamespace(bool trip_up_azurite) ASSERT_OK_AND_ASSIGN(auto hns_support, internal::CheckIfHierarchicalNamespaceIsEnabled( adlfs_client, options_)); if (env->WithHierarchicalNamespace()) { - ASSERT_EQ(hns_support, internal::HNSSupport::kEnabled); + ASSERT_EQ(hns_support, HNSSupport::kEnabled); } else { - ASSERT_EQ(hns_support, internal::HNSSupport::kDisabled); + ASSERT_EQ(hns_support, HNSSupport::kDisabled); } } @@ -643,13 +645,13 @@ void TestAzureFileSystem::TestDetectHierarchicalNamespaceOnMissingContainer() { EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); switch (env->backend()) { case AzureBackend::kAzurite: - ASSERT_EQ(hns_support, internal::HNSSupport::kDisabled); + ASSERT_EQ(hns_support, HNSSupport::kDisabled); break; case AzureBackend::kAzure: if (env->WithHierarchicalNamespace()) { - ASSERT_EQ(hns_support, internal::HNSSupport::kContainerNotFound); + ASSERT_EQ(hns_support, HNSSupport::kContainerNotFound); } else { - ASSERT_EQ(hns_support, internal::HNSSupport::kDisabled); + ASSERT_EQ(hns_support, HNSSupport::kDisabled); } break; } From 6de47621afabfa5af069944160ab650efb65d510 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 20 Dec 2023 13:42:40 -0300 Subject: [PATCH 6/9] Remove include of azurefs_internal.h --- cpp/src/arrow/filesystem/azurefs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index f97cfb423f2..0c41c429281 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -21,7 +21,6 @@ #include #include -#include "arrow/filesystem/azurefs_internal.h" #include "arrow/filesystem/filesystem.h" #include "arrow/util/macros.h" #include "arrow/util/uri.h" From cf253c1bf52a733ba26c710f4079ac20cbba4e79 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 20 Dec 2023 13:43:21 -0300 Subject: [PATCH 7/9] Make & const --- cpp/src/arrow/filesystem/azurefs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index dbb22cfd287..6e99189b288 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -581,7 +581,7 @@ class ObjectInputFile final : public io::RandomAccessFile { std::shared_ptr metadata_; }; -Status CreateEmptyBlockBlob(Blobs::BlockBlobClient& block_blob_client) { +Status CreateEmptyBlockBlob(const Blobs::BlockBlobClient& block_blob_client) { try { block_blob_client.UploadFrom(nullptr, 0); } catch (const Storage::StorageException& exception) { From 9d0d9690917b55a7425fa6b84b159bc259765c46 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 20 Dec 2023 13:44:50 -0300 Subject: [PATCH 8/9] Include internal header in tests --- cpp/src/arrow/filesystem/azurefs_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index a3dd898755e..53e71f3658d 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -34,6 +34,7 @@ #include #include "arrow/filesystem/azurefs.h" +#include "arrow/filesystem/azurefs_internal.h" #include #include From fa65f6decbda9e5a5eba9af24dac307cf1b6d90d Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 20 Dec 2023 14:37:26 -0300 Subject: [PATCH 9/9] Fix lint warning --- cpp/src/arrow/filesystem/azurefs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 6e99189b288..a9795e40a6c 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -851,7 +851,7 @@ namespace { // "filesystem". Creating a container using the Blob Storage API will make it // accessible using the Data Lake Storage Gen 2 API and vice versa. -const std::string kDelimiter = std::string{internal::kSep}; +const char kDelimiter[] = {internal::kSep, '\0'}; template Result GetContainerPropsAsFileInfo(const std::string& container_name,