From fa8303fe6f0af08791762c5a900ffd0f6f377404 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 25 Nov 2023 01:02:58 -0300 Subject: [PATCH 01/13] path_util.cc: Protect against int overflow `length` could come from `FileSelector::max_recursion` which is default initialized to `INT32_MAX` and cause an overflow on the `offset+length` sum here. --- cpp/src/arrow/filesystem/path_util.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index 46ea436a9f3..fc4739b1cf4 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -72,15 +72,12 @@ std::string SliceAbstractPath(const std::string& s, int offset, int length, char return ""; } std::vector components = SplitAbstractPath(s, sep); - std::stringstream combined; if (offset >= static_cast(components.size())) { return ""; } - int end = offset + length; - if (end > static_cast(components.size())) { - end = static_cast(components.size()); - } - for (int i = offset; i < end; i++) { + const size_t end = std::min(static_cast(offset) + length, components.size()); + std::stringstream combined; + for (auto i = static_cast(offset); i < end; i++) { combined << components[i]; if (i < end - 1) { combined << sep; From a4f070386a64a77c4edaff37e22ebd0ca95e3466 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 29 Nov 2023 20:51:57 -0300 Subject: [PATCH 02/13] clang-tidy fixes in path_util.cc --- cpp/src/arrow/filesystem/path_util.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index fc4739b1cf4..8931ff7f884 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -52,7 +52,7 @@ std::vector SplitAbstractPath(const std::string& path, char sep) { } auto append_part = [&parts, &v](size_t start, size_t end) { - parts.push_back(std::string(v.substr(start, end - start))); + parts.emplace_back(v.substr(start, end - start)); }; size_t start = 0; @@ -380,7 +380,7 @@ struct Globber::Impl { Globber::Globber(std::string pattern) : impl_(new Impl(pattern)) {} -Globber::~Globber() {} +Globber::~Globber() = default; bool Globber::Matches(const std::string& path) { return regex_match(path, impl_->pattern_); From 6d9e49e9fb3b0c226eb034a02be3819ae5ae6c8e Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 25 Nov 2023 01:05:26 -0300 Subject: [PATCH 03/13] Implement AzureFileSystem::GetFileInfoWithSelector() --- cpp/src/arrow/filesystem/azurefs.cc | 205 ++++++++++++++++++++++- cpp/src/arrow/filesystem/azurefs_test.cc | 172 ++++++++++++++++++- 2 files changed, 375 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 9bd2b0ae9d8..56e7e053783 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -820,6 +820,204 @@ class AzureFileSystem::Impl { } } + private: + template + Status ListContainers(const Azure::Core::Context& context, + OnContainer&& on_container) const { + Azure::Storage::Blobs::ListBlobContainersOptions options; + // Deleted containers are not returned. + options.Include = Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None; + try { + auto container_list_response = + blob_service_client_->ListBlobContainers(options, context); + for (; container_list_response.HasPage(); + container_list_response.MoveToNextPage(context)) { + for (const auto& container : container_list_response.BlobContainers) { + RETURN_NOT_OK(on_container(container)); + } + } + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus("Failed to list account containers.", exception); + } + return Status::OK(); + } + + /// \brief The last result name from the same container as the base_location. + /// + /// The "/" prefix is stripped from the path, so it can be + /// easily compared with blob names in a listing loop. + std::optional LastResultBlobName(const AzureLocation& base_location, + const std::vector& results) { + if (results.empty()) { + return {}; + } + const auto& path = results.back().path(); + // if path starts with "/" + if (path.size() > base_location.container.size() && + arrow::internal::StartsWith(path, base_location.container) && + path[base_location.container.size()] == internal::kSep) { + auto blob_name = path.substr(base_location.container.size() + 1); + if (!blob_name.empty()) { + return std::move(blob_name); + } + } + return {}; + } + + Status GetFileInfoWithSelectorFromContainer( + const Azure::Storage::Blobs::BlobContainerClient& container_client, + const Azure::Core::Context& context, Azure::Nullable page_size_hint, + const FileSelector& select, FileInfoVector* acc_results) { + ARROW_ASSIGN_OR_RAISE(auto base_location, AzureLocation::FromString(select.base_dir)); + + bool found = false; + Azure::Storage::Blobs::ListBlobsOptions options; + if (internal::GetAbstractPathDepth(base_location.path) == 0) { + // If the base_dir is the root of the container, then we want to list all blobs in + // the container and the Prefix should be empty and not even include the trailing + // slash because the container itself represents the `/` directory. + options.Prefix = {}; + found = true; // Unless the container itself is not found later! + } else { + options.Prefix = internal::EnsureTrailingSlash(base_location.path); + } + options.PageSizeHint = page_size_hint; + options.Include = Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata; + + std::set dir_markers_listed; + try { + auto list_response = + container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options, context); + for (; list_response.HasPage(); list_response.MoveToNextPage(context)) { + if (list_response.Blobs.empty() && list_response.BlobPrefixes.empty()) { + continue; + } + found = true; + // Emit the blobs first, so that potential BlobPrefix recursions are at the + // function tail. + for (const auto& blob : list_response.Blobs) { + if (ARROW_PREDICT_FALSE(options.Prefix.HasValue() && + blob.Name == options.Prefix.Value())) { + // Prefix.Value() contains a trailing slash, so if we found a blob that + // matches it completely, then it is an empty directory marker blob for the + // directory we're listing from, so we should skip it. + continue; + } + if (blob.Name.back() == internal::kSep) { + // Defense against empty '/'-separated segments in the blob name. + if (ARROW_PREDICT_FALSE(blob.Name == "/")) { + continue; + } + // We include these now as they might be empty directories and not appear in + // the BlobPrefixes listing. + std::string out_path; + out_path.reserve(base_location.container.size() + blob.Name.size()); + out_path += base_location.container; + out_path += internal::kSep; + out_path += internal::RemoveTrailingSlash(blob.Name, false); + FileInfo info{std::move(out_path), FileType::Directory}; + acc_results->push_back(std::move(info)); + // Remember the listed directory markers, so we don't duplicate + // them in the BlobPrefixes processing loop. + dir_markers_listed.emplace_hint(dir_markers_listed.end(), blob.Name); + continue; + } + // Add file path to results. + std::string out_path; + out_path.reserve(base_location.container.size() + 1 + blob.Name.size()); + out_path += base_location.container; + out_path += internal::kSep; + out_path += blob.Name; + FileInfo info{std::move(out_path), FileType::File}; + info.set_size(blob.BlobSize); + info.set_mtime( + std::chrono::system_clock::time_point{blob.Details.LastModified}); + acc_results->push_back(std::move(info)); + } + // Process the BlobPrefixes. + for (const auto& blob_prefix : list_response.BlobPrefixes) { + // Defense against empty '/'-separated segments in the blob name. + if (ARROW_PREDICT_FALSE(blob_prefix == "/")) { + continue; + } + auto path = base_location.container + internal::kSep + blob_prefix; + if (dir_markers_listed.find(path) == dir_markers_listed.end()) { + // Directory not added to the results yet, add it now. + FileInfo info{std::string{internal::RemoveTrailingSlash(path)}, + FileType::Directory}; + acc_results->push_back(std::move(info)); + } + // Recurse into the directory. + if (select.recursive && select.max_recursion > 0) { + FileSelector sub_select; + sub_select.base_dir = internal::RemoveTrailingSlash(path); + sub_select.allow_not_found = true; + sub_select.recursive = true; + sub_select.max_recursion = select.max_recursion - 1; + RETURN_NOT_OK(GetFileInfoWithSelectorFromContainer( + container_client, context, page_size_hint, sub_select, acc_results)); + } + } + } + } catch (const Azure::Storage::StorageException& exception) { + if (exception.ErrorCode == "ContainerNotFound") { + found = false; + } else { + return internal::ExceptionToStatus( + "Failed to list blobs in a directory: " + select.base_dir + ": " + + container_client.GetUrl(), + exception); + } + } + + return found || select.allow_not_found + ? Status::OK() + : ::arrow::fs::internal::PathNotFound(select.base_dir); + } + + public: + Status GetFileInfoWithSelector(const Azure::Core::Context& context, + Azure::Nullable page_size_hint, + const FileSelector& select, + FileInfoVector* acc_results) { + ARROW_ASSIGN_OR_RAISE(auto base_location, AzureLocation::FromString(select.base_dir)); + + if (base_location.container.empty()) { + // Without a container, the base_location is equivalent to the filesystem + // root -- `/`. FileSelector::allow_not_found doesn't matter in this case + // because the root always exists. + auto on_container = + [&](const Azure::Storage::Blobs::Models::BlobContainerItem& container) { + // Deleted containers are not listed by ListContainers. + DCHECK(!container.IsDeleted); + + // Every container is considered a directory. + FileInfo info{container.Name, FileType::Directory}; + info.set_mtime( + std::chrono::system_clock::time_point{container.Details.LastModified}); + acc_results->push_back(std::move(info)); + + // Recurse into containers (subdirectories) if requested. + if (select.recursive && select.max_recursion > 0) { + FileSelector sub_select; + sub_select.base_dir = container.Name; + sub_select.allow_not_found = true; + sub_select.recursive = true; + sub_select.max_recursion = select.max_recursion - 1; + ARROW_RETURN_NOT_OK(GetFileInfoWithSelector(context, page_size_hint, + sub_select, acc_results)); + } + return Status::OK(); + }; + return ListContainers(context, std::move(on_container)); + } + + auto container_client = + blob_service_client_->GetBlobContainerClient(base_location.container); + return GetFileInfoWithSelectorFromContainer(container_client, context, page_size_hint, + select, acc_results); + } + Result> OpenInputFile(const AzureLocation& location, AzureFileSystem* fs) { RETURN_NOT_OK(ValidateFileLocation(location)); @@ -1196,7 +1394,12 @@ Result AzureFileSystem::GetFileInfo(const std::string& path) { } Result AzureFileSystem::GetFileInfo(const FileSelector& select) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + Azure::Core::Context context; + Azure::Nullable page_size_hint; // unspecified + FileInfoVector results; + RETURN_NOT_OK( + impl_->GetFileInfoWithSelector(context, page_size_hint, select, &results)); + return std::move(results); } Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 41f1663114f..fe4a56bad1b 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -213,7 +213,7 @@ class AzureFileSystemTest : public ::testing::Test { suite_skipped_ = true; GTEST_SKIP() << options.status().message(); } - container_name_ = RandomChars(32); + container_name_ = "z" + RandomChars(31); blob_service_client_ = std::make_unique( options_.account_blob_url, options_.storage_credentials_provider); datalake_service_client_ = @@ -326,6 +326,49 @@ class AzureFileSystemTest : public ::testing::Test { top_blob_path, }; } + + void SetUpSmallFileSystemTree() { + // Set up test containers + blob_service_client_->GetBlobContainerClient("empty-container").CreateIfNotExists(); + + auto container_client = blob_service_client_->GetBlobContainerClient("container"); + container_client.CreateIfNotExists(); + + auto blob_client = container_client.GetBlockBlobClient("emptydir/"); + blob_client.UploadFrom(reinterpret_cast(""), 0); + + blob_client = container_client.GetBlockBlobClient("somedir/subdir/subfile"); + const char* sub_data = "sub data"; + blob_client.UploadFrom(reinterpret_cast(sub_data), strlen(sub_data)); + + blob_client = container_client.GetBlockBlobClient("somefile"); + const char* some_data = "some data"; + blob_client.UploadFrom(reinterpret_cast(some_data), + strlen(some_data)); + + blob_client = container_client.GetBlockBlobClient("otherdir/1/2/3/otherfile"); + const char* other_data = "other data"; + blob_client.UploadFrom(reinterpret_cast(other_data), + strlen(other_data)); + } + + void AssertInfoAllContainersRecursive(const std::vector& infos) { + ASSERT_EQ(infos.size(), 14); + AssertFileInfo(infos[0], "container", FileType::Directory); + AssertFileInfo(infos[1], "container/emptydir", FileType::Directory); + AssertFileInfo(infos[2], "container/otherdir", FileType::Directory); + AssertFileInfo(infos[3], "container/otherdir/1", FileType::Directory); + AssertFileInfo(infos[4], "container/otherdir/1/2", FileType::Directory); + AssertFileInfo(infos[5], "container/otherdir/1/2/3", FileType::Directory); + AssertFileInfo(infos[6], "container/otherdir/1/2/3/otherfile", FileType::File, 10); + AssertFileInfo(infos[7], "container/somedir", FileType::Directory); + AssertFileInfo(infos[8], "container/somedir/subdir", FileType::Directory); + AssertFileInfo(infos[9], "container/somedir/subdir/subfile", FileType::File, 8); + AssertFileInfo(infos[10], "container/somefile", FileType::File, 9); + AssertFileInfo(infos[11], "empty-container", FileType::Directory); + AssertFileInfo(infos[12], PreexistingContainerName(), FileType::Directory); + AssertFileInfo(infos[13], PreexistingObjectPath(), FileType::File); + } }; class AzuriteFileSystemTest : public AzureFileSystemTest { @@ -518,6 +561,133 @@ TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObject) { RunGetFileInfoObjectTest(); } +TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) { + SetUpSmallFileSystemTree(); + + FileSelector select; + std::vector infos; + + // Root dir + select.base_dir = ""; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 3); + SortInfos(&infos); + AssertFileInfo(infos[0], "container", FileType::Directory); + AssertFileInfo(infos[1], "empty-container", FileType::Directory); + AssertFileInfo(infos[2], container_name_, FileType::Directory); + + // Empty container + select.base_dir = "empty-container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + // Nonexistent container + select.base_dir = "nonexistent-container"; + ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + select.allow_not_found = true; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + select.allow_not_found = false; + // Non-empty container + select.base_dir = "container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + SortInfos(&infos); + ASSERT_EQ(infos.size(), 4); + AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); + AssertFileInfo(infos[1], "container/otherdir", FileType::Directory); + AssertFileInfo(infos[2], "container/somedir", FileType::Directory); + AssertFileInfo(infos[3], "container/somefile", FileType::File, 9); + + // Empty "directory" + select.base_dir = "container/emptydir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + // Non-empty "directories" + select.base_dir = "container/somedir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); + select.base_dir = "container/somedir/subdir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + AssertFileInfo(infos[0], "container/somedir/subdir/subfile", FileType::File, 8); + // Nonexistent + select.base_dir = "container/nonexistent"; + ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + select.allow_not_found = true; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + select.allow_not_found = false; + + // Trailing slashes + select.base_dir = "empty-container/"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + select.base_dir = "nonexistent-container/"; + ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + select.base_dir = "container/"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + SortInfos(&infos); + ASSERT_EQ(infos.size(), 4); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) { + SetUpSmallFileSystemTree(); + + FileSelector select; + select.recursive = true; + + std::vector infos; + // Root dir + select.base_dir = ""; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 14); + SortInfos(&infos); + AssertInfoAllContainersRecursive(infos); + + // Empty container + select.base_dir = "empty-container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + + // Non-empty container + select.base_dir = "container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + SortInfos(&infos); + ASSERT_EQ(infos.size(), 10); + AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); + AssertFileInfo(infos[1], "container/otherdir", FileType::Directory); + AssertFileInfo(infos[2], "container/otherdir/1", FileType::Directory); + AssertFileInfo(infos[3], "container/otherdir/1/2", FileType::Directory); + AssertFileInfo(infos[4], "container/otherdir/1/2/3", FileType::Directory); + AssertFileInfo(infos[5], "container/otherdir/1/2/3/otherfile", FileType::File, 10); + AssertFileInfo(infos[6], "container/somedir", FileType::Directory); + AssertFileInfo(infos[7], "container/somedir/subdir", FileType::Directory); + AssertFileInfo(infos[8], "container/somedir/subdir/subfile", FileType::File, 8); + AssertFileInfo(infos[9], "container/somefile", FileType::File, 9); + + // Empty "directory" + select.base_dir = "container/emptydir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + + // Non-empty "directories" + select.base_dir = "container/somedir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + SortInfos(&infos); + ASSERT_EQ(infos.size(), 2); + AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); + AssertFileInfo(infos[1], "container/somedir/subdir/subfile", FileType::File, 8); + + select.base_dir = "container/otherdir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + SortInfos(&infos); + ASSERT_EQ(infos.size(), 4); + AssertFileInfo(infos[0], "container/otherdir/1", FileType::Directory); + AssertFileInfo(infos[1], "container/otherdir/1/2", FileType::Directory); + AssertFileInfo(infos[2], "container/otherdir/1/2/3", FileType::Directory); + AssertFileInfo(infos[3], "container/otherdir/1/2/3/otherfile", FileType::File, 10); +} + TEST_F(AzuriteFileSystemTest, CreateDirFailureNoContainer) { ASSERT_RAISES(Invalid, fs_->CreateDir("", false)); } From 3aabdf42348ad40735a7381202d0fe0cf13dee3c Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 30 Nov 2023 14:50:22 -0300 Subject: [PATCH 04/13] AzureFS: Guarantee listed FileInfos already come in sorted --- cpp/src/arrow/filesystem/azurefs.cc | 193 +++++++++++++---------- cpp/src/arrow/filesystem/azurefs_test.cc | 14 +- cpp/src/arrow/filesystem/test_util.cc | 6 + cpp/src/arrow/filesystem/test_util.h | 4 + 4 files changed, 128 insertions(+), 89 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 56e7e053783..549c664c13d 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -842,28 +842,37 @@ class AzureFileSystem::Impl { return Status::OK(); } - /// \brief The last result name from the same container as the base_location. - /// - /// The "/" prefix is stripped from the path, so it can be - /// easily compared with blob names in a listing loop. - std::optional LastResultBlobName(const AzureLocation& base_location, - const std::vector& results) { - if (results.empty()) { - return {}; - } - const auto& path = results.back().path(); - // if path starts with "/" - if (path.size() > base_location.container.size() && - arrow::internal::StartsWith(path, base_location.container) && - path[base_location.container.size()] == internal::kSep) { - auto blob_name = path.substr(base_location.container.size() + 1); - if (!blob_name.empty()) { - return std::move(blob_name); - } + static FileInfo FileInfoFromBlob(const std::string& container, + const Azure::Storage::Blobs::Models::BlobItem& blob) { + if (blob.Name.back() == internal::kSep) { + return DirectoryFileInfoFromPath(container + internal::kSep + blob.Name); } - return {}; + std::string path; + path.reserve(container.size() + 1 + blob.Name.size()); + path += container; + path += internal::kSep; + path += blob.Name; + FileInfo info{std::move(path), FileType::File}; + info.set_size(blob.BlobSize); + info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified}); + return info; + } + + static FileInfo DirectoryFileInfoFromPath(const std::string& path) { + return FileInfo{std::string{internal::RemoveTrailingSlash(path)}, + FileType::Directory}; + } + + static std::string_view BasenameView(std::string_view s) { + auto offset = s.find_last_of(internal::kSep); + auto tail = (offset == std::string_view::npos) ? s : s.substr(offset); + return internal::RemoveTrailingSlash(tail, /*preserve_root=*/false); } + /// \brief List the blobs at the root of a container or some dir in a container. + /// + /// \pre container_client is the client for the container named like the first + /// segment of select.base_dir. Status GetFileInfoWithSelectorFromContainer( const Azure::Storage::Blobs::BlobContainerClient& container_client, const Azure::Core::Context& context, Azure::Nullable page_size_hint, @@ -884,7 +893,63 @@ class AzureFileSystem::Impl { options.PageSizeHint = page_size_hint; options.Include = Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata; - std::set dir_markers_listed; + // When Prefix.Value() contains a trailing slash and we find a blob that + // matches it completely, it is an empty directory marker blob for the + // directory we're listing from, and we should skip it. + auto is_empty_dir_marker = + [&options](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept -> bool { + return options.Prefix.HasValue() && blob.Name == options.Prefix.Value(); + }; + + auto recurse = [&](const std::string& blob_prefix) noexcept -> Status { + if (select.recursive && select.max_recursion > 0) { + FileSelector sub_select; + sub_select.base_dir = base_location.container; + sub_select.base_dir += internal::kSep; + sub_select.base_dir += internal::RemoveTrailingSlash(blob_prefix); + sub_select.allow_not_found = true; + sub_select.recursive = true; + sub_select.max_recursion = select.max_recursion - 1; + return GetFileInfoWithSelectorFromContainer( + container_client, context, page_size_hint, sub_select, acc_results); + } + return Status::OK(); + }; + + // (*acc_results)[*last_dir_reported] is the last FileType::Directory in the results + // produced through this loop over the response pages. + std::optional last_dir_reported{}; + auto matches_last_dir_reported = [&last_dir_reported, + acc_results](const FileInfo& info) noexcept { + if (!last_dir_reported.has_value() || info.type() != FileType::Directory) { + return false; + } + const auto& last_dir = (*acc_results)[*last_dir_reported]; + return BasenameView(info.path()) == BasenameView(last_dir.path()); + }; + + auto process_blob = + [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept { + if (!is_empty_dir_marker(blob)) { + const auto& info = acc_results->emplace_back( + FileInfoFromBlob(base_location.container, blob)); + if (info.type() == FileType::Directory) { + last_dir_reported = acc_results->size() - 1; + } + } + }; + auto process_prefix = [&](const std::string& prefix) noexcept -> Status { + const std::string path = base_location.container + internal::kSep + prefix; + const auto& info = acc_results->emplace_back(DirectoryFileInfoFromPath(path)); + if (ARROW_PREDICT_FALSE(matches_last_dir_reported(info))) { + acc_results->pop_back(); + } else { + last_dir_reported = acc_results->size() - 1; + return recurse(prefix); + } + return Status::OK(); + }; + try { auto list_response = container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options, context); @@ -893,70 +958,34 @@ class AzureFileSystem::Impl { continue; } found = true; - // Emit the blobs first, so that potential BlobPrefix recursions are at the - // function tail. - for (const auto& blob : list_response.Blobs) { - if (ARROW_PREDICT_FALSE(options.Prefix.HasValue() && - blob.Name == options.Prefix.Value())) { - // Prefix.Value() contains a trailing slash, so if we found a blob that - // matches it completely, then it is an empty directory marker blob for the - // directory we're listing from, so we should skip it. - continue; - } - if (blob.Name.back() == internal::kSep) { - // Defense against empty '/'-separated segments in the blob name. - if (ARROW_PREDICT_FALSE(blob.Name == "/")) { - continue; - } - // We include these now as they might be empty directories and not appear in - // the BlobPrefixes listing. - std::string out_path; - out_path.reserve(base_location.container.size() + blob.Name.size()); - out_path += base_location.container; - out_path += internal::kSep; - out_path += internal::RemoveTrailingSlash(blob.Name, false); - FileInfo info{std::move(out_path), FileType::Directory}; - acc_results->push_back(std::move(info)); - // Remember the listed directory markers, so we don't duplicate - // them in the BlobPrefixes processing loop. - dir_markers_listed.emplace_hint(dir_markers_listed.end(), blob.Name); - continue; + // Blob and BlobPrefixes are sorted by name, so we can merge-iterate + // them to ensure returned results are all sorted. + size_t blob_index = 0; + size_t blob_prefix_index = 0; + while (blob_index < list_response.Blobs.size() && + blob_prefix_index < list_response.BlobPrefixes.size()) { + const auto& blob = list_response.Blobs[blob_index]; + const auto& prefix = list_response.BlobPrefixes[blob_prefix_index]; + const int cmp = blob.Name.compare(prefix); + if (cmp < 0) { + process_blob(blob); + blob_index += 1; + } else if (cmp > 0) { + RETURN_NOT_OK(process_prefix(prefix)); + blob_prefix_index += 1; + } else { // there is a blob (empty dir marker) and a prefix with the same name + DCHECK_EQ(blob.Name, prefix); + RETURN_NOT_OK(process_prefix(prefix)); + blob_index += 1; + blob_prefix_index += 1; } - // Add file path to results. - std::string out_path; - out_path.reserve(base_location.container.size() + 1 + blob.Name.size()); - out_path += base_location.container; - out_path += internal::kSep; - out_path += blob.Name; - FileInfo info{std::move(out_path), FileType::File}; - info.set_size(blob.BlobSize); - info.set_mtime( - std::chrono::system_clock::time_point{blob.Details.LastModified}); - acc_results->push_back(std::move(info)); } - // Process the BlobPrefixes. - for (const auto& blob_prefix : list_response.BlobPrefixes) { - // Defense against empty '/'-separated segments in the blob name. - if (ARROW_PREDICT_FALSE(blob_prefix == "/")) { - continue; - } - auto path = base_location.container + internal::kSep + blob_prefix; - if (dir_markers_listed.find(path) == dir_markers_listed.end()) { - // Directory not added to the results yet, add it now. - FileInfo info{std::string{internal::RemoveTrailingSlash(path)}, - FileType::Directory}; - acc_results->push_back(std::move(info)); - } - // Recurse into the directory. - if (select.recursive && select.max_recursion > 0) { - FileSelector sub_select; - sub_select.base_dir = internal::RemoveTrailingSlash(path); - sub_select.allow_not_found = true; - sub_select.recursive = true; - sub_select.max_recursion = select.max_recursion - 1; - RETURN_NOT_OK(GetFileInfoWithSelectorFromContainer( - container_client, context, page_size_hint, sub_select, acc_results)); - } + for (; blob_index < list_response.Blobs.size(); blob_index++) { + process_blob(list_response.Blobs[blob_index]); + } + for (; blob_prefix_index < list_response.BlobPrefixes.size(); + blob_prefix_index++) { + RETURN_NOT_OK(process_prefix(list_response.BlobPrefixes[blob_prefix_index])); } } } catch (const Azure::Storage::StorageException& exception) { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index fe4a56bad1b..b641707cf50 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -571,7 +571,7 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) { select.base_dir = ""; ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); ASSERT_EQ(infos.size(), 3); - SortInfos(&infos); + ASSERT_EQ(infos, SortedInfos(infos)); AssertFileInfo(infos[0], "container", FileType::Directory); AssertFileInfo(infos[1], "empty-container", FileType::Directory); AssertFileInfo(infos[2], container_name_, FileType::Directory); @@ -590,7 +590,7 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) { // Non-empty container select.base_dir = "container"; ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); - SortInfos(&infos); + ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 4); AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); AssertFileInfo(infos[1], "container/otherdir", FileType::Directory); @@ -626,7 +626,7 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) { ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); select.base_dir = "container/"; ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); - SortInfos(&infos); + ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 4); } @@ -641,7 +641,7 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) { select.base_dir = ""; ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); ASSERT_EQ(infos.size(), 14); - SortInfos(&infos); + ASSERT_EQ(infos, SortedInfos(infos)); AssertInfoAllContainersRecursive(infos); // Empty container @@ -652,7 +652,7 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) { // Non-empty container select.base_dir = "container"; ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); - SortInfos(&infos); + ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 10); AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); AssertFileInfo(infos[1], "container/otherdir", FileType::Directory); @@ -673,14 +673,14 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) { // Non-empty "directories" select.base_dir = "container/somedir"; ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); - SortInfos(&infos); + ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 2); AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); AssertFileInfo(infos[1], "container/somedir/subdir/subfile", FileType::File, 8); select.base_dir = "container/otherdir"; ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); - SortInfos(&infos); + ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 4); AssertFileInfo(infos[0], "container/otherdir/1", FileType::Directory); AssertFileInfo(infos[1], "container/otherdir/1/2", FileType::Directory); diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index 6c5dda8e659..040917dcd21 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -126,6 +126,12 @@ void SortInfos(std::vector* infos) { std::sort(infos->begin(), infos->end(), FileInfo::ByPath{}); } +std::vector SortedInfos(const std::vector& infos) { + auto sorted = infos; + SortInfos(&sorted); + return sorted; +} + void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* out_infos) { auto fut = CollectAsyncGenerator(gen); ASSERT_FINISHES_OK_AND_ASSIGN(auto nested_infos, fut); diff --git a/cpp/src/arrow/filesystem/test_util.h b/cpp/src/arrow/filesystem/test_util.h index c4d846fd31b..90240228568 100644 --- a/cpp/src/arrow/filesystem/test_util.h +++ b/cpp/src/arrow/filesystem/test_util.h @@ -74,6 +74,10 @@ void CreateFile(FileSystem* fs, const std::string& path, const std::string& data ARROW_TESTING_EXPORT void SortInfos(FileInfoVector* infos); +// Create a copy of a FileInfo vector sorted by lexicographic path order +ARROW_TESTING_EXPORT +std::vector SortedInfos(const std::vector& infos); + ARROW_TESTING_EXPORT void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* out_infos); From 32f2e77badf70bd5dbeef515afeb10f9f5525693 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 4 Dec 2023 22:38:44 -0300 Subject: [PATCH 05/13] Fix unsafe access and make Has(Trailing|Leading)Slash inline These functions were added recently so no risk of ABI changes. --- cpp/src/arrow/filesystem/path_util.cc | 8 ++------ cpp/src/arrow/filesystem/path_util.h | 10 ++++++---- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index 8931ff7f884..180e5a61169 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -146,7 +146,7 @@ std::string ConcatAbstractPath(const std::string& base, const std::string& stem) } std::string EnsureTrailingSlash(std::string_view v) { - if (v.length() > 0 && v.back() != kSep) { + if (!v.empty() && !HasTrailingSlash(v)) { // XXX How about "C:" on Windows? We probably don't want to turn it into "C:/"... // Unless the local filesystem always uses absolute paths return std::string(v) + kSep; @@ -156,7 +156,7 @@ std::string EnsureTrailingSlash(std::string_view v) { } std::string EnsureLeadingSlash(std::string_view v) { - if (v.length() == 0 || v.front() != kSep) { + if (!HasLeadingSlash(v)) { // XXX How about "C:" on Windows? We probably don't want to turn it into "/C:"... return kSep + std::string(v); } else { @@ -194,10 +194,6 @@ Status AssertNoTrailingSlash(std::string_view key) { return Status::OK(); } -bool HasTrailingSlash(std::string_view key) { return key.back() == '/'; } - -bool HasLeadingSlash(std::string_view key) { return key.front() == '/'; } - Result MakeAbstractPathRelative(const std::string& base, const std::string& path) { if (base.empty() || base.front() != kSep) { diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index 2c8c123e779..da2a9318f39 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -94,11 +94,13 @@ std::string_view RemoveTrailingSlash(std::string_view s, bool preserve_root = fa ARROW_EXPORT Status AssertNoTrailingSlash(std::string_view s); -ARROW_EXPORT -bool HasTrailingSlash(std::string_view s); +inline bool HasTrailingSlash(std::string_view s) { + return !s.empty() && s.back() == kSep; +} -ARROW_EXPORT -bool HasLeadingSlash(std::string_view s); +inline bool HasLeadingSlash(std::string_view s) { + return !s.empty() && s.front() == kSep; +} ARROW_EXPORT bool IsAncestorOf(std::string_view ancestor, std::string_view descendant); From 7cbe4420094c26163098e194365ddd1de95f4f97 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 4 Dec 2023 22:39:19 -0300 Subject: [PATCH 06/13] Feedback changes --- cpp/src/arrow/filesystem/azurefs.cc | 26 +++++++++++--------------- cpp/src/arrow/filesystem/path_util.cc | 8 ++++++-- cpp/src/arrow/filesystem/test_util.h | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 549c664c13d..3ceef133fe1 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -822,11 +822,9 @@ class AzureFileSystem::Impl { private: template - Status ListContainers(const Azure::Core::Context& context, - OnContainer&& on_container) const { + Status VisitContainers(const Azure::Core::Context& context, + OnContainer&& on_container) const { Azure::Storage::Blobs::ListBlobContainersOptions options; - // Deleted containers are not returned. - options.Include = Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None; try { auto container_list_response = blob_service_client_->ListBlobContainers(options, context); @@ -844,14 +842,10 @@ class AzureFileSystem::Impl { static FileInfo FileInfoFromBlob(const std::string& container, const Azure::Storage::Blobs::Models::BlobItem& blob) { + auto path = internal::ConcatAbstractPath(container, blob.Name); if (blob.Name.back() == internal::kSep) { - return DirectoryFileInfoFromPath(container + internal::kSep + blob.Name); + return DirectoryFileInfoFromPath(path); } - std::string path; - path.reserve(container.size() + 1 + blob.Name.size()); - path += container; - path += internal::kSep; - path += blob.Name; FileInfo info{std::move(path), FileType::File}; info.set_size(blob.BlobSize); info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified}); @@ -864,9 +858,11 @@ class AzureFileSystem::Impl { } static std::string_view BasenameView(std::string_view s) { + DCHECK(!internal::HasTrailingSlash(s)); auto offset = s.find_last_of(internal::kSep); - auto tail = (offset == std::string_view::npos) ? s : s.substr(offset); - return internal::RemoveTrailingSlash(tail, /*preserve_root=*/false); + auto result = (offset == std::string_view::npos) ? s : s.substr(offset); + DCHECK(!result.empty() && result.back() != internal::kSep); + return result; } /// \brief List the blobs at the root of a container or some dir in a container. @@ -939,7 +935,7 @@ class AzureFileSystem::Impl { } }; auto process_prefix = [&](const std::string& prefix) noexcept -> Status { - const std::string path = base_location.container + internal::kSep + prefix; + const auto path = internal::ConcatAbstractPath(base_location.container, prefix); const auto& info = acc_results->emplace_back(DirectoryFileInfoFromPath(path)); if (ARROW_PREDICT_FALSE(matches_last_dir_reported(info))) { acc_results->pop_back(); @@ -1038,7 +1034,7 @@ class AzureFileSystem::Impl { } return Status::OK(); }; - return ListContainers(context, std::move(on_container)); + return VisitContainers(context, std::move(on_container)); } auto container_client = @@ -1428,7 +1424,7 @@ Result AzureFileSystem::GetFileInfo(const FileSelector& select) FileInfoVector results; RETURN_NOT_OK( impl_->GetFileInfoWithSelector(context, page_size_hint, select, &results)); - return std::move(results); + return {std::move(results)}; } Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) { diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index 180e5a61169..bf0ccbb066d 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -75,7 +75,7 @@ std::string SliceAbstractPath(const std::string& s, int offset, int length, char if (offset >= static_cast(components.size())) { return ""; } - const size_t end = std::min(static_cast(offset) + length, components.size()); + const auto end = std::min(static_cast(offset) + length, components.size()); std::stringstream combined; for (auto i = static_cast(offset); i < end; i++) { combined << components[i]; @@ -142,7 +142,11 @@ std::string ConcatAbstractPath(const std::string& base, const std::string& stem) if (base.empty()) { return stem; } - return EnsureTrailingSlash(base) + std::string(RemoveLeadingSlash(stem)); + std::string result; + result.reserve(base.length() + stem.length() + 1); // extra 1 is for potential kSep + result += EnsureTrailingSlash(base); + result += RemoveLeadingSlash(stem); + return result; } std::string EnsureTrailingSlash(std::string_view v) { diff --git a/cpp/src/arrow/filesystem/test_util.h b/cpp/src/arrow/filesystem/test_util.h index 90240228568..62b488e159a 100644 --- a/cpp/src/arrow/filesystem/test_util.h +++ b/cpp/src/arrow/filesystem/test_util.h @@ -76,7 +76,7 @@ void SortInfos(FileInfoVector* infos); // Create a copy of a FileInfo vector sorted by lexicographic path order ARROW_TESTING_EXPORT -std::vector SortedInfos(const std::vector& infos); +FileInfoVector SortedInfos(const FileInfoVector& infos); ARROW_TESTING_EXPORT void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* out_infos); From 73ece5af1c896854e7d646cdf55f9c2f4a93c6c0 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 6 Dec 2023 16:20:20 -0300 Subject: [PATCH 07/13] Mention issue and replace magic constants --- cpp/src/arrow/filesystem/azurefs_test.cc | 26 ++++++++++++++---------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index b641707cf50..fe3c7c9eb17 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -213,6 +213,7 @@ class AzureFileSystemTest : public ::testing::Test { suite_skipped_ = true; GTEST_SKIP() << options.status().message(); } + // Stop-gap solution before GH-39119 is fixed. container_name_ = "z" + RandomChars(31); blob_service_client_ = std::make_unique( options_.account_blob_url, options_.storage_credentials_provider); @@ -327,6 +328,10 @@ class AzureFileSystemTest : public ::testing::Test { }; } + char const* kSubData = "sub data"; + char const* kSomeData = "some data"; + char const* kOtherData = "other data"; + void SetUpSmallFileSystemTree() { // Set up test containers blob_service_client_->GetBlobContainerClient("empty-container").CreateIfNotExists(); @@ -338,18 +343,15 @@ class AzureFileSystemTest : public ::testing::Test { blob_client.UploadFrom(reinterpret_cast(""), 0); blob_client = container_client.GetBlockBlobClient("somedir/subdir/subfile"); - const char* sub_data = "sub data"; - blob_client.UploadFrom(reinterpret_cast(sub_data), strlen(sub_data)); + blob_client.UploadFrom(reinterpret_cast(kSubData), strlen(kSubData)); blob_client = container_client.GetBlockBlobClient("somefile"); - const char* some_data = "some data"; - blob_client.UploadFrom(reinterpret_cast(some_data), - strlen(some_data)); + blob_client.UploadFrom(reinterpret_cast(kSomeData), + strlen(kSomeData)); blob_client = container_client.GetBlockBlobClient("otherdir/1/2/3/otherfile"); - const char* other_data = "other data"; - blob_client.UploadFrom(reinterpret_cast(other_data), - strlen(other_data)); + blob_client.UploadFrom(reinterpret_cast(kOtherData), + strlen(kOtherData)); } void AssertInfoAllContainersRecursive(const std::vector& infos) { @@ -360,11 +362,13 @@ class AzureFileSystemTest : public ::testing::Test { AssertFileInfo(infos[3], "container/otherdir/1", FileType::Directory); AssertFileInfo(infos[4], "container/otherdir/1/2", FileType::Directory); AssertFileInfo(infos[5], "container/otherdir/1/2/3", FileType::Directory); - AssertFileInfo(infos[6], "container/otherdir/1/2/3/otherfile", FileType::File, 10); + AssertFileInfo(infos[6], "container/otherdir/1/2/3/otherfile", FileType::File, + strlen(kOtherData)); AssertFileInfo(infos[7], "container/somedir", FileType::Directory); AssertFileInfo(infos[8], "container/somedir/subdir", FileType::Directory); - AssertFileInfo(infos[9], "container/somedir/subdir/subfile", FileType::File, 8); - AssertFileInfo(infos[10], "container/somefile", FileType::File, 9); + AssertFileInfo(infos[9], "container/somedir/subdir/subfile", FileType::File, + strlen(kSubData)); + AssertFileInfo(infos[10], "container/somefile", FileType::File, strlen(kSomeData)); AssertFileInfo(infos[11], "empty-container", FileType::Directory); AssertFileInfo(infos[12], PreexistingContainerName(), FileType::Directory); AssertFileInfo(infos[13], PreexistingObjectPath(), FileType::File); From a55cc5f4ba60281df35c5b625a92e7c01ed627b4 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 6 Dec 2023 17:21:49 -0300 Subject: [PATCH 08/13] Remove unnecessary explicit --- cpp/src/arrow/filesystem/azurefs.cc | 2 +- cpp/src/arrow/filesystem/azurefs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 3ceef133fe1..accb1e4fcb7 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -39,7 +39,7 @@ namespace fs { // ----------------------------------------------------------------------- // AzureOptions Implementation -AzureOptions::AzureOptions() {} +AzureOptions::AzureOptions() = default; bool AzureOptions::Equals(const AzureOptions& other) const { return (account_dfs_url == other.account_dfs_url && diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 9f980ee8baa..b2865b059ef 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -157,7 +157,7 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { const AzureOptions& options, const io::IOContext& = io::default_io_context()); private: - explicit AzureFileSystem(const AzureOptions& options, const io::IOContext& io_context); + AzureFileSystem(const AzureOptions& options, const io::IOContext& io_context); class Impl; std::unique_ptr impl_; From a33821e46e4ccca877d8212e3896c21ff65dc5db Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 6 Dec 2023 21:19:06 -0300 Subject: [PATCH 09/13] A small cleanup in the test setup code --- cpp/src/arrow/filesystem/azurefs_test.cc | 55 +++++++++++++----------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index fe3c7c9eb17..70f7f4ecfc5 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -70,6 +70,9 @@ using ::testing::IsEmpty; using ::testing::Not; using ::testing::NotNull; +namespace Blobs = Azure::Storage::Blobs; +namespace Files = Azure::Storage::Files; + auto const* kLoremIpsum = R"""( Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis @@ -193,9 +196,8 @@ TEST(AzureFileSystem, OptionsCompare) { class AzureFileSystemTest : public ::testing::Test { public: std::shared_ptr fs_; - std::unique_ptr blob_service_client_; - std::unique_ptr - datalake_service_client_; + std::unique_ptr blob_service_client_; + std::unique_ptr datalake_service_client_; AzureOptions options_; std::mt19937_64 generator_; std::string container_name_; @@ -215,14 +217,12 @@ class AzureFileSystemTest : public ::testing::Test { } // Stop-gap solution before GH-39119 is fixed. container_name_ = "z" + RandomChars(31); - blob_service_client_ = std::make_unique( + blob_service_client_ = std::make_unique( options_.account_blob_url, options_.storage_credentials_provider); - datalake_service_client_ = - std::make_unique( - options_.account_dfs_url, options_.storage_credentials_provider); + datalake_service_client_ = std::make_unique( + options_.account_dfs_url, options_.storage_credentials_provider); ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); - auto container_client = blob_service_client_->GetBlobContainerClient(container_name_); - container_client.CreateIfNotExists(); + auto container_client = CreateContainer(container_name_); auto blob_client = container_client.GetBlockBlobClient(PreexistingObjectName()); blob_client.UploadFrom(reinterpret_cast(kLoremIpsum), @@ -240,6 +240,20 @@ class AzureFileSystemTest : public ::testing::Test { } } + Blobs::BlobContainerClient CreateContainer(const std::string& name) { + auto container_client = blob_service_client_->GetBlobContainerClient(name); + (void)container_client.CreateIfNotExists(); + return container_client; + } + + Blobs::BlobClient CreateBlob(Blobs::BlobContainerClient& container_client, + const std::string& name, const std::string& data = "") { + auto blob_client = container_client.GetBlockBlobClient(name); + (void)blob_client.UploadFrom(reinterpret_cast(data.data()), + data.size()); + return blob_client; + } + std::string PreexistingContainerName() const { return container_name_; } std::string PreexistingContainerPath() const { @@ -334,24 +348,13 @@ class AzureFileSystemTest : public ::testing::Test { void SetUpSmallFileSystemTree() { // Set up test containers - blob_service_client_->GetBlobContainerClient("empty-container").CreateIfNotExists(); - - auto container_client = blob_service_client_->GetBlobContainerClient("container"); - container_client.CreateIfNotExists(); - - auto blob_client = container_client.GetBlockBlobClient("emptydir/"); - blob_client.UploadFrom(reinterpret_cast(""), 0); - - blob_client = container_client.GetBlockBlobClient("somedir/subdir/subfile"); - blob_client.UploadFrom(reinterpret_cast(kSubData), strlen(kSubData)); - - blob_client = container_client.GetBlockBlobClient("somefile"); - blob_client.UploadFrom(reinterpret_cast(kSomeData), - strlen(kSomeData)); + CreateContainer("empty-container"); + auto container = CreateContainer("container"); - blob_client = container_client.GetBlockBlobClient("otherdir/1/2/3/otherfile"); - blob_client.UploadFrom(reinterpret_cast(kOtherData), - strlen(kOtherData)); + CreateBlob(container, "emptydir/"); + CreateBlob(container, "somedir/subdir/subfile", kSubData); + CreateBlob(container, "somefile", kSomeData); + CreateBlob(container, "otherdir/1/2/3/otherfile", kOtherData); } void AssertInfoAllContainersRecursive(const std::vector& infos) { From 2ab1d1083a560986288a85eb479db6110089c101 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 6 Dec 2023 22:33:24 -0300 Subject: [PATCH 10/13] Remove de-duplication code and improve the test cases --- cpp/src/arrow/filesystem/azurefs.cc | 50 +++++++++-------------- cpp/src/arrow/filesystem/azurefs_test.cc | 51 ++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index accb1e4fcb7..faf0542a606 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -843,7 +843,7 @@ class AzureFileSystem::Impl { static FileInfo FileInfoFromBlob(const std::string& container, const Azure::Storage::Blobs::Models::BlobItem& blob) { auto path = internal::ConcatAbstractPath(container, blob.Name); - if (blob.Name.back() == internal::kSep) { + if (internal::HasTrailingSlash(blob.Name)) { return DirectoryFileInfoFromPath(path); } FileInfo info{std::move(path), FileType::File}; @@ -889,10 +889,10 @@ class AzureFileSystem::Impl { options.PageSizeHint = page_size_hint; options.Include = Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata; - // When Prefix.Value() contains a trailing slash and we find a blob that - // matches it completely, it is an empty directory marker blob for the - // directory we're listing from, and we should skip it. - auto is_empty_dir_marker = + // When Prefix contains a value and we find a blob that matches it completely, it is + // an empty directory marker blob for the directory we're listing from, and we should + // skip it. + auto is_the_root_empty_dir_marker = [&options](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept -> bool { return options.Prefix.HasValue() && blob.Name == options.Prefix.Value(); }; @@ -912,38 +912,16 @@ class AzureFileSystem::Impl { return Status::OK(); }; - // (*acc_results)[*last_dir_reported] is the last FileType::Directory in the results - // produced through this loop over the response pages. - std::optional last_dir_reported{}; - auto matches_last_dir_reported = [&last_dir_reported, - acc_results](const FileInfo& info) noexcept { - if (!last_dir_reported.has_value() || info.type() != FileType::Directory) { - return false; - } - const auto& last_dir = (*acc_results)[*last_dir_reported]; - return BasenameView(info.path()) == BasenameView(last_dir.path()); - }; - auto process_blob = [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept { - if (!is_empty_dir_marker(blob)) { - const auto& info = acc_results->emplace_back( - FileInfoFromBlob(base_location.container, blob)); - if (info.type() == FileType::Directory) { - last_dir_reported = acc_results->size() - 1; - } + if (!is_the_root_empty_dir_marker(blob)) { + acc_results->push_back(FileInfoFromBlob(base_location.container, blob)); } }; auto process_prefix = [&](const std::string& prefix) noexcept -> Status { const auto path = internal::ConcatAbstractPath(base_location.container, prefix); - const auto& info = acc_results->emplace_back(DirectoryFileInfoFromPath(path)); - if (ARROW_PREDICT_FALSE(matches_last_dir_reported(info))) { - acc_results->pop_back(); - } else { - last_dir_reported = acc_results->size() - 1; - return recurse(prefix); - } - return Status::OK(); + acc_results->push_back(DirectoryFileInfoFromPath(path)); + return recurse(prefix); }; try { @@ -969,11 +947,19 @@ class AzureFileSystem::Impl { } else if (cmp > 0) { RETURN_NOT_OK(process_prefix(prefix)); blob_prefix_index += 1; - } else { // there is a blob (empty dir marker) and a prefix with the same name + } else { DCHECK_EQ(blob.Name, prefix); RETURN_NOT_OK(process_prefix(prefix)); blob_index += 1; blob_prefix_index += 1; + // If the container has an empty dir marker blob and another blob starting + // with this blob name as a prefix, the blob doesn't appear in the listing + // that also contains the prefix, so AFAICT this branch in unreachable. The + // code above is kept just in case, but if this DCHECK(false) is ever reached, + // we should refactor this loop to ensure no duplicate entries are ever + // reported. + DCHECK(false) + << "Unexpected blob/prefix name collision on the same listing request"; } } for (; blob_index < list_response.Blobs.size(); blob_index++) { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 70f7f4ecfc5..9cc62e13d17 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -354,6 +354,10 @@ class AzureFileSystemTest : public ::testing::Test { CreateBlob(container, "emptydir/"); CreateBlob(container, "somedir/subdir/subfile", kSubData); CreateBlob(container, "somefile", kSomeData); + // Add an explicit marker for a non-empty directory. + CreateBlob(container, "otherdir/1/2/"); + // otherdir/{1/,2/,3/} are implicitly assumed to exist because of + // the otherdir/1/2/3/otherfile blob. CreateBlob(container, "otherdir/1/2/3/otherfile", kOtherData); } @@ -695,6 +699,53 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) { AssertFileInfo(infos[3], "container/otherdir/1/2/3/otherfile", FileType::File, 10); } +TEST_F(AzuriteFileSystemTest, TestExplicitImplicitDirDeduplication) { + { + auto container = CreateContainer("container"); + CreateBlob(container, "mydir/emptydir1/"); + CreateBlob(container, "mydir/emptydir2/"); + CreateBlob(container, "mydir/nonemptydir1/"); // explicit dir marker + CreateBlob(container, "mydir/nonemptydir1/somefile", kSomeData); + CreateBlob(container, "mydir/nonemptydir2/somefile", kSomeData); + } + std::vector infos; + + FileSelector select; // non-recursive + select.base_dir = "container"; + + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + ASSERT_EQ(infos, SortedInfos(infos)); + AssertFileInfo(infos[0], "container/mydir", FileType::Directory); + + select.base_dir = "container/mydir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 4); + ASSERT_EQ(infos, SortedInfos(infos)); + AssertFileInfo(infos[0], "container/mydir/emptydir1", FileType::Directory); + AssertFileInfo(infos[1], "container/mydir/emptydir2", FileType::Directory); + AssertFileInfo(infos[2], "container/mydir/nonemptydir1", FileType::Directory); + AssertFileInfo(infos[3], "container/mydir/nonemptydir2", FileType::Directory); + + select.base_dir = "container/mydir/emptydir1"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + + select.base_dir = "container/mydir/emptydir2"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + + select.base_dir = "container/mydir/nonemptydir1"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + AssertFileInfo(infos[0], "container/mydir/nonemptydir1/somefile", FileType::File); + + select.base_dir = "container/mydir/nonemptydir2"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + AssertFileInfo(infos[0], "container/mydir/nonemptydir2/somefile", FileType::File); +} + TEST_F(AzuriteFileSystemTest, CreateDirFailureNoContainer) { ASSERT_RAISES(Invalid, fs_->CreateDir("", false)); } From 7c9ff2fc556c4afed9be0623183953cb6a936f26 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 7 Dec 2023 18:24:55 -0300 Subject: [PATCH 11/13] Change ConcatAbstractPath to take string_views --- cpp/src/arrow/filesystem/azurefs.cc | 5 ++--- cpp/src/arrow/filesystem/filesystem.cc | 2 +- cpp/src/arrow/filesystem/path_util.cc | 4 ++-- cpp/src/arrow/filesystem/path_util.h | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index faf0542a606..6be1938a90c 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -900,9 +900,8 @@ class AzureFileSystem::Impl { auto recurse = [&](const std::string& blob_prefix) noexcept -> Status { if (select.recursive && select.max_recursion > 0) { FileSelector sub_select; - sub_select.base_dir = base_location.container; - sub_select.base_dir += internal::kSep; - sub_select.base_dir += internal::RemoveTrailingSlash(blob_prefix); + sub_select.base_dir = internal::ConcatAbstractPath( + base_location.container, internal::RemoveTrailingSlash(blob_prefix)); sub_select.allow_not_found = true; sub_select.recursive = true; sub_select.max_recursion = select.max_recursion - 1; diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index 9ecc4610f38..2414e214183 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -655,7 +655,7 @@ Status CopyFiles(const std::shared_ptr& source_fs, } auto destination_path = - internal::ConcatAbstractPath(destination_base_dir, std::string(*relative)); + internal::ConcatAbstractPath(destination_base_dir, *relative); if (source_info.IsDirectory()) { dirs.push_back(destination_path); diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index bf0ccbb066d..9c895ae76c7 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -137,10 +137,10 @@ Status ValidateAbstractPathParts(const std::vector& parts) { return Status::OK(); } -std::string ConcatAbstractPath(const std::string& base, const std::string& stem) { +std::string ConcatAbstractPath(std::string_view base, std::string_view stem) { DCHECK(!stem.empty()); if (base.empty()) { - return stem; + return std::string{stem}; } std::string result; result.reserve(base.length() + stem.length() + 1); // extra 1 is for potential kSep diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index da2a9318f39..1da7afd3f93 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -69,7 +69,7 @@ Status ValidateAbstractPathParts(const std::vector& parts); // Append a non-empty stem to an abstract path. ARROW_EXPORT -std::string ConcatAbstractPath(const std::string& base, const std::string& stem); +std::string ConcatAbstractPath(std::string_view base, std::string_view stem); // Make path relative to base, if it starts with base. Otherwise error out. ARROW_EXPORT From 378e868e7ed377fcc014d04077dd41fd87930dff Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 7 Dec 2023 18:41:59 -0300 Subject: [PATCH 12/13] Latest feedback changes --- cpp/src/arrow/filesystem/azurefs.cc | 15 +++++---------- cpp/src/arrow/filesystem/azurefs_test.cc | 2 +- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 6be1938a90c..daababb04c1 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -877,7 +877,7 @@ class AzureFileSystem::Impl { bool found = false; Azure::Storage::Blobs::ListBlobsOptions options; - if (internal::GetAbstractPathDepth(base_location.path) == 0) { + if (internal::IsEmptyPath(base_location.path)) { // If the base_dir is the root of the container, then we want to list all blobs in // the container and the Prefix should be empty and not even include the trailing // slash because the container itself represents the `/` directory. @@ -889,14 +889,6 @@ class AzureFileSystem::Impl { options.PageSizeHint = page_size_hint; options.Include = Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata; - // When Prefix contains a value and we find a blob that matches it completely, it is - // an empty directory marker blob for the directory we're listing from, and we should - // skip it. - auto is_the_root_empty_dir_marker = - [&options](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept -> bool { - return options.Prefix.HasValue() && blob.Name == options.Prefix.Value(); - }; - auto recurse = [&](const std::string& blob_prefix) noexcept -> Status { if (select.recursive && select.max_recursion > 0) { FileSelector sub_select; @@ -913,7 +905,10 @@ class AzureFileSystem::Impl { auto process_blob = [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept { - if (!is_the_root_empty_dir_marker(blob)) { + // blob.Name has trailing slash only when Prefix is an empty + // directory marker blob for the directory we're listing + // from, and we should skip it. + if (!internal::HasTrailingSlash(blob.Name)) { acc_results->push_back(FileInfoFromBlob(base_location.container, blob)); } }; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 9cc62e13d17..fc9167e18fd 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -699,7 +699,7 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) { AssertFileInfo(infos[3], "container/otherdir/1/2/3/otherfile", FileType::File, 10); } -TEST_F(AzuriteFileSystemTest, TestExplicitImplicitDirDeduplication) { +TEST_F(AzuriteFileSystemTest, TestGetFileInfoSelectorExplicitImplicitDirDedup) { { auto container = CreateContainer("container"); CreateBlob(container, "mydir/emptydir1/"); From e24840090b8550834a276785c8f2f912d7c5209e Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 8 Dec 2023 08:37:54 -0300 Subject: [PATCH 13/13] Fix lint failure and test name --- cpp/src/arrow/filesystem/azurefs_test.cc | 2 +- cpp/src/arrow/filesystem/filesystem.cc | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index fc9167e18fd..792c63b2094 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -699,7 +699,7 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) { AssertFileInfo(infos[3], "container/otherdir/1/2/3/otherfile", FileType::File, 10); } -TEST_F(AzuriteFileSystemTest, TestGetFileInfoSelectorExplicitImplicitDirDedup) { +TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorExplicitImplicitDirDedup) { { auto container = CreateContainer("container"); CreateBlob(container, "mydir/emptydir1/"); diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index 2414e214183..810e9c179b1 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -654,8 +654,7 @@ Status CopyFiles(const std::shared_ptr& source_fs, "', which is outside base dir '", source_sel.base_dir, "'"); } - auto destination_path = - internal::ConcatAbstractPath(destination_base_dir, *relative); + auto destination_path = internal::ConcatAbstractPath(destination_base_dir, *relative); if (source_info.IsDirectory()) { dirs.push_back(destination_path);