-
Notifications
You must be signed in to change notification settings - Fork 4k
GH-38597: [C++] Implement GetFileInfo(selector) for Azure filesystem #39009
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
fa8303f
a4f0703
6d9e49e
3aabdf4
32f2e77
7cbe442
73ece5a
a55cc5f
a33821e
2ab1d10
7c9ff2f
378e868
e248400
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -39,7 +39,7 @@ namespace fs { | |||||
| // ----------------------------------------------------------------------- | ||||||
| // AzureOptions Implementation | ||||||
|
|
||||||
| AzureOptions::AzureOptions() {} | ||||||
| AzureOptions::AzureOptions() = default; | ||||||
|
|
||||||
| bool AzureOptions::Equals(const AzureOptions& other) const { | ||||||
| return (account_dfs_url == other.account_dfs_url && | ||||||
|
|
@@ -820,6 +820,209 @@ class AzureFileSystem::Impl { | |||||
| } | ||||||
| } | ||||||
|
|
||||||
| private: | ||||||
| /// \brief Call `on_container` once for every blob container in the account. | ||||||
| /// | ||||||
| /// Walks every page returned by ListBlobContainers. Iteration ends early, | ||||||
| /// propagating the failure, as soon as RETURN_NOT_OK sees a non-OK result | ||||||
| /// from the callback. A StorageException raised while listing is converted | ||||||
| /// into a Status through internal::ExceptionToStatus. | ||||||
| template <typename OnContainer> | ||||||
| Status VisitContainers(const Azure::Core::Context& context, | ||||||
|                        OnContainer&& on_container) const { | ||||||
|   Azure::Storage::Blobs::ListBlobContainersOptions list_options; | ||||||
|   try { | ||||||
|     auto pages = blob_service_client_->ListBlobContainers(list_options, context); | ||||||
|     while (pages.HasPage()) { | ||||||
|       for (const auto& item : pages.BlobContainers) { | ||||||
|         RETURN_NOT_OK(on_container(item)); | ||||||
|       } | ||||||
|       // Fetching the next page may throw; it stays inside the try block. | ||||||
|       pages.MoveToNextPage(context); | ||||||
|     } | ||||||
|   } catch (const Azure::Storage::StorageException& exception) { | ||||||
|     return internal::ExceptionToStatus("Failed to list account containers.", exception); | ||||||
|   } | ||||||
|   return Status::OK(); | ||||||
| } | ||||||
|
|
||||||
| /// \brief Build the FileInfo describing a blob listed inside `container`. | ||||||
| /// | ||||||
| /// A blob whose name ends with a slash is reported as a directory. Any other | ||||||
| /// blob is reported as a file carrying the blob's size and last-modified time. | ||||||
| static FileInfo FileInfoFromBlob(const std::string& container, | ||||||
|                                  const Azure::Storage::Blobs::Models::BlobItem& blob) { | ||||||
|   auto full_path = internal::ConcatAbstractPath(container, blob.Name); | ||||||
|   const bool names_a_directory = internal::HasTrailingSlash(blob.Name); | ||||||
|   if (names_a_directory) { | ||||||
|     return DirectoryFileInfoFromPath(full_path); | ||||||
|   } | ||||||
|   FileInfo file_info{std::move(full_path), FileType::File}; | ||||||
|   file_info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified}); | ||||||
|   file_info.set_size(blob.BlobSize); | ||||||
|   return file_info; | ||||||
| } | ||||||
|
|
||||||
| /// \brief Build a directory FileInfo for `path`, with any trailing slash removed. | ||||||
| static FileInfo DirectoryFileInfoFromPath(const std::string& path) { | ||||||
|   std::string trimmed_path{internal::RemoveTrailingSlash(path)}; | ||||||
|   return FileInfo{std::move(trimmed_path), FileType::Directory}; | ||||||
| } | ||||||
|
|
||||||
| /// \brief Return the last segment of a path as a view into `s`. | ||||||
| /// | ||||||
| /// \pre `s` is non-empty and has no trailing slash (both checked with DCHECK). | ||||||
| /// \return the characters after the last separator, or all of `s` when it | ||||||
| /// contains no separator. The returned view refers to the same characters | ||||||
| /// as `s`; nothing is copied. | ||||||
| static std::string_view BasenameView(std::string_view s) { | ||||||
|   DCHECK(!internal::HasTrailingSlash(s)); | ||||||
|   const auto offset = s.find_last_of(internal::kSep); | ||||||
|   // Start one past the separator so the separator is not part of the basename | ||||||
|   // ("a/b" -> "b", not "/b"). | ||||||
|   auto result = (offset == std::string_view::npos) ? s : s.substr(offset + 1); | ||||||
|   DCHECK(!result.empty() && result.back() != internal::kSep); | ||||||
|   return result; | ||||||
| } | ||||||
|
|
||||||
| /// \brief List the blobs at the root of a container or some dir in a container. | ||||||
| /// | ||||||
| /// \pre container_client is the client for the container named like the first | ||||||
| /// segment of select.base_dir. | ||||||
| Status GetFileInfoWithSelectorFromContainer( | ||||||
| const Azure::Storage::Blobs::BlobContainerClient& container_client, | ||||||
| const Azure::Core::Context& context, Azure::Nullable<int32_t> page_size_hint, | ||||||
| const FileSelector& select, FileInfoVector* acc_results) { | ||||||
| ARROW_ASSIGN_OR_RAISE(auto base_location, AzureLocation::FromString(select.base_dir)); | ||||||
|
|
||||||
| bool found = false; | ||||||
| Azure::Storage::Blobs::ListBlobsOptions options; | ||||||
| if (internal::IsEmptyPath(base_location.path)) { | ||||||
| // If the base_dir is the root of the container, then we want to list all blobs in | ||||||
| // the container and the Prefix should be empty and not even include the trailing | ||||||
| // slash because the container itself represents the `<container>/` directory. | ||||||
| options.Prefix = {}; | ||||||
felipecrv marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| found = true; // Unless the container itself is not found later! | ||||||
| } else { | ||||||
| options.Prefix = internal::EnsureTrailingSlash(base_location.path); | ||||||
| } | ||||||
| options.PageSizeHint = page_size_hint; | ||||||
| options.Include = Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata; | ||||||
|
||||||
|
|
||||||
| auto recurse = [&](const std::string& blob_prefix) noexcept -> Status { | ||||||
| if (select.recursive && select.max_recursion > 0) { | ||||||
| FileSelector sub_select; | ||||||
| sub_select.base_dir = internal::ConcatAbstractPath( | ||||||
| base_location.container, internal::RemoveTrailingSlash(blob_prefix)); | ||||||
| sub_select.allow_not_found = true; | ||||||
| sub_select.recursive = true; | ||||||
| sub_select.max_recursion = select.max_recursion - 1; | ||||||
| return GetFileInfoWithSelectorFromContainer( | ||||||
| container_client, context, page_size_hint, sub_select, acc_results); | ||||||
| } | ||||||
| return Status::OK(); | ||||||
| }; | ||||||
|
|
||||||
| auto process_blob = | ||||||
| [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept { | ||||||
| // blob.Name has trailing slash only when Prefix is an empty | ||||||
| // directory marker blob for the directory we're listing | ||||||
| // from, and we should skip it. | ||||||
| if (!internal::HasTrailingSlash(blob.Name)) { | ||||||
| acc_results->push_back(FileInfoFromBlob(base_location.container, blob)); | ||||||
| } | ||||||
| }; | ||||||
| auto process_prefix = [&](const std::string& prefix) noexcept -> Status { | ||||||
| const auto path = internal::ConcatAbstractPath(base_location.container, prefix); | ||||||
| acc_results->push_back(DirectoryFileInfoFromPath(path)); | ||||||
| return recurse(prefix); | ||||||
| }; | ||||||
|
|
||||||
| try { | ||||||
| auto list_response = | ||||||
| container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options, context); | ||||||
felipecrv marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| for (; list_response.HasPage(); list_response.MoveToNextPage(context)) { | ||||||
| if (list_response.Blobs.empty() && list_response.BlobPrefixes.empty()) { | ||||||
| continue; | ||||||
| } | ||||||
| found = true; | ||||||
| // Blob and BlobPrefixes are sorted by name, so we can merge-iterate | ||||||
| // them to ensure returned results are all sorted. | ||||||
| size_t blob_index = 0; | ||||||
| size_t blob_prefix_index = 0; | ||||||
| while (blob_index < list_response.Blobs.size() && | ||||||
| blob_prefix_index < list_response.BlobPrefixes.size()) { | ||||||
| const auto& blob = list_response.Blobs[blob_index]; | ||||||
| const auto& prefix = list_response.BlobPrefixes[blob_prefix_index]; | ||||||
| const int cmp = blob.Name.compare(prefix); | ||||||
felipecrv marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| if (cmp < 0) { | ||||||
| process_blob(blob); | ||||||
| blob_index += 1; | ||||||
|
||||||
| blob_index += 1; | |
| ++blob_index; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Both styles are found in the codebase and the += 1 is the only way to do it in modern languages like Rust and Swift.
i++ is OK in for loops, but ++blob_index doesn't stand out enough when it is on a line by itself.
felipecrv marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
felipecrv marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't object to them, but why do you want to specify them explicitly?
Our other methods don't specify them explicitly. (They use the default value.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I want all the wiring to be in place in case we need to specify the page_size_hint, which I'm pretty sure we will need to tweak when this is used in practice.
Uh oh!
There was an error while loading. Please reload this page.