From 3d543f08a07f35b313ab9e22eb63491799e22c8f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 12 Feb 2024 17:25:13 -0300 Subject: [PATCH 01/15] Remove default argument value from AcquireBlobLease --- cpp/src/arrow/filesystem/azurefs.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index d4bb4457014..b8f5cc528f6 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1992,7 +1992,7 @@ class AzureFileSystem::Impl { /// optional (nullptr denotes blob not found) Result> AcquireBlobLease( const AzureLocation& location, std::chrono::seconds lease_duration, - bool allow_missing = false, bool retry_allowed = true) { + bool allow_missing, bool retry_allowed = true) { DCHECK(!location.container.empty() && !location.path.empty()); auto path = std::string{internal::RemoveTrailingSlash(location.path)}; auto blob_client = GetBlobClient(location.container, std::move(path)); @@ -2236,7 +2236,8 @@ class AzureFileSystem::Impl { const auto dest_path = std::string{internal::RemoveTrailingSlash(dest.path)}; // Ensure that src exists and, if path has a trailing slash, that it's a directory. - ARROW_ASSIGN_OR_RAISE(auto src_lease_client, AcquireBlobLease(src, kLeaseDuration)); + ARROW_ASSIGN_OR_RAISE(auto src_lease_client, + AcquireBlobLease(src, kLeaseDuration, /*allow_missing=*/false)); LeaseGuard src_lease_guard{std::move(src_lease_client), kLeaseDuration}; // It might be necessary to check src is a directory 0-3 times in this function, // so we use a lazy evaluation function to avoid redundant calls to GetFileInfo(). From f14d5fce4a93250aecf60d349d4244c64a59b3da Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 13 Feb 2024 22:18:58 -0300 Subject: [PATCH 02/15] Implement DeleteFile on HNS and flat-namespace storage accounts Co-authored-by: jerry.adair --- cpp/src/arrow/filesystem/azurefs.cc | 153 +++++++++++++++++--- cpp/src/arrow/filesystem/azurefs_test.cc | 177 ++++++++++++++++------- 2 files changed, 258 insertions(+), 72 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index b8f5cc528f6..05184845d83 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1924,26 +1924,6 @@ class AzureFileSystem::Impl { } } - Status DeleteFile(const AzureLocation& location) { - RETURN_NOT_OK(ValidateFileLocation(location)); - auto file_client = datalake_service_client_->GetFileSystemClient(location.container) - .GetFileClient(location.path); - try { - auto response = file_client.Delete(); - // Only the "*IfExists" functions ever set Deleted to false. - // All the others either succeed or throw an exception. - DCHECK(response.Value.Deleted); - } catch (const Storage::StorageException& exception) { - if (exception.ErrorCode == "FilesystemNotFound" || - exception.ErrorCode == "PathNotFound") { - return PathNotFound(location); - } - return ExceptionToStatus(exception, "Failed to delete a file: ", location.path, - ": ", file_client.GetUrl()); - } - return Status::OK(); - } - private: /// \brief Create a BlobLeaseClient and acquire a lease on the container. /// @@ -2055,6 +2035,116 @@ class AzureFileSystem::Impl { static constexpr auto kTimeNeededForFileOrDirectoryRename = std::chrono::seconds{3}; public: + /// \pre location.container is not empty. + /// \pre location.path is not empty. + /// \pre location.path does not end with a trailing slash. + Status DeleteFileOnFileSystem(const DataLake::DataLakeFileSystemClient& adlfs_client, + const AzureLocation& location, bool require_file_to_exist, + Azure::Nullable lease_id = {}) { + DCHECK(!location.container.empty()); + DCHECK(!location.path.empty()); + DCHECK(!internal::HasTrailingSlash(location.path)); + auto file_client = adlfs_client.GetFileClient(location.path); + DataLake::GetPathPropertiesOptions get_options; + get_options.AccessConditions.LeaseId = lease_id; + DataLake::DeleteFileOptions delete_options; + delete_options.AccessConditions.LeaseId = std::move(lease_id); + try { + // This is necessary to avoid deletion of directories via DeleteFile. + auto properties = file_client.GetProperties(get_options); + if (properties.Value.IsDirectory) { + return internal::NotAFile(location.all); + } + auto response = file_client.Delete(delete_options); + // Only the "*IfExists" functions ever set Deleted to false. + // All the others either succeed or throw an exception. + DCHECK(response.Value.Deleted); + } catch (const Storage::StorageException& exception) { + if (exception.StatusCode == Http::HttpStatusCode::NotFound) { + // ErrorCode can be "FilesystemNotFound", "PathNotFound"... + if (require_file_to_exist) { + return PathNotFound(location); + } + return Status::OK(); + } + return ExceptionToStatus(exception, "Failed to delete a file: ", location.path, + ": ", file_client.GetUrl()); + } + return Status::OK(); + } + + /// \pre location.container is not empty. + /// \pre location.path is not empty. + /// \pre location.path does not end with a trailing slash. + Status DeleteFileOnContainer(const Blobs::BlobContainerClient& container_client, + const AzureLocation& location, bool require_file_to_exist, + const char* operation) { + DCHECK(!location.container.empty()); + DCHECK(!location.path.empty()); + DCHECK(!internal::HasTrailingSlash(location.path)); + constexpr auto kFileBlobLeaseTime = std::chrono::seconds{15}; + + // If the parent directory of a file is not the container itself, there is a + // risk that deleting the file also deletes the *implied directory* -- the + // directory that is implied by the existence of the file path. + // + // In this case, we must ensure that the deletion is not semantically + // equivalent to also deleting the directory. This is done by ensuring the + // directory marker blob exists before the file is deleted. + std::optional file_blob_lease_guard; + const auto parent = location.parent(); + if (!parent.path.empty()) { + // We have to check the existence of the file before checking the + // existence of the parent directory marker, so we acquire a lease on the + // file first. + ARROW_ASSIGN_OR_RAISE(auto file_blob_lease_client, + AcquireBlobLease(location, kFileBlobLeaseTime, + /*allow_missing=*/!require_file_to_exist)); + if (file_blob_lease_client) { + file_blob_lease_guard.emplace(std::move(file_blob_lease_client), + kFileBlobLeaseTime); + // Ensure the empty directory marker blob exists before the file is deleted. + // + // There is not need to hold a lease on the directory marker because if + // a concurrent client deletes the directory marker right after we + // create it, the file deletion itself won't be the cause of the directory + // deletion. Additionally, the fact that a lease is held on the blob path + // semantically preserves the directory -- its existence is implied + // until the blob representing the file is deleted -- even if another + // client deletes the directory marker. + RETURN_NOT_OK(EnsureEmptyDirExists(container_client, parent, operation)); + } else { + // File blob doesn't exist and require_file_to_exists is false, so just return OK. + // Trying to delete a file that doesn't exist, costs only one request to the + // server. + DCHECK(!require_file_to_exist); + return Status::OK(); + } + } + + auto blob_client = container_client.GetBlobClient(location.path); + Blobs::DeleteBlobOptions options; + if (file_blob_lease_guard) { + options.AccessConditions.LeaseId = file_blob_lease_guard->LeaseId(); + } + try { + auto response = blob_client.Delete(options); + // Only the "*IfExists" functions ever set Deleted to false. + // All the others either succeed or throw an exception. + DCHECK(response.Value.Deleted); + } catch (const Storage::StorageException& exception) { + if (exception.StatusCode == Http::HttpStatusCode::NotFound) { + if (require_file_to_exist) { + return PathNotFound(location); + } + return Status::OK(); + } + return ExceptionToStatus(exception, "Failed to delete a file: ", location.all, ": ", + blob_client.GetUrl()); + } + return Status::OK(); + } + /// The conditions for a successful container rename are derived from the /// conditions for a successful `Move("/$src.container", "/$dest.container")`. /// The numbers here match the list in `Move`. @@ -2545,7 +2635,28 @@ Status AzureFileSystem::DeleteRootDirContents() { Status AzureFileSystem::DeleteFile(const std::string& path) { ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); - return impl_->DeleteFile(location); + if (location.container.empty()) { + return Status::Invalid("DeleteFile requires a non-empty path."); + } + if (internal::HasTrailingSlash(location.path) || location.path.empty()) { + // Container paths (locations w/o path) or paths ending with a slash are not file + // paths. + return NotAFile(location); + } + auto adlfs_client = impl_->GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, + impl_->HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + return PathNotFound(location); + } + if (hns_support == HNSSupport::kEnabled) { + return impl_->DeleteFileOnFileSystem(adlfs_client, location, + /*require_file_to_exist=*/true); + } + auto container_client = impl_->GetBlobContainerClient(location.container); + return impl_->DeleteFileOnContainer(container_client, location, + /*require_file_to_exist=*/true, + /*operation=*/"DeleteFile"); } Status AzureFileSystem::Move(const std::string& src, const std::string& dest) { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index c39a5b7d22b..b58db39a8f3 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -889,6 +889,117 @@ class TestAzureFileSystem : public ::testing::Test { ASSERT_RAISES(IOError, fs()->DeleteDirContents(directory_path, false)); } + void TestDeleteFileAtRoot() { + ASSERT_RAISES(Invalid, fs()->DeleteFile("file0")); + ASSERT_RAISES(Invalid, fs()->DeleteFile("file1/")); + const auto container_name = PreexistingData::RandomContainerName(rng_); + if (WithHierarchicalNamespace()) { + ARROW_UNUSED(CreateFilesystem(container_name)); + } else { + ARROW_UNUSED(CreateContainer(container_name)); + } + arrow::fs::AssertFileInfo(fs(), container_name, FileType::Directory); + ASSERT_RAISES(IOError, fs()->DeleteFile(container_name)); + ASSERT_RAISES(IOError, fs()->DeleteFile(container_name + "/")); + } + + void TestDeleteFileAtContainerRoot() { + auto data = SetUpPreexistingData(); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, + ::testing::HasSubstr("Path does not exist '" + data.Path("nonexistent-file") + + "'"), + fs()->DeleteFile(data.Path("nonexistent-file"))); + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, + ::testing::HasSubstr("Not a regular file: '" + data.Path("nonexistent-file/") + + "'"), + fs()->DeleteFile(data.Path("nonexistent-file/"))); + + arrow::fs::AssertFileInfo(fs(), data.ObjectPath(), FileType::File); + ASSERT_OK(fs()->DeleteFile(data.ObjectPath())); + arrow::fs::AssertFileInfo(fs(), data.ObjectPath(), FileType::NotFound); + + if (WithHierarchicalNamespace()) { + auto adlfs_client = + datalake_service_client_->GetFileSystemClient(data.container_name); + CreateFile(adlfs_client, data.kObjectName, PreexistingData::kLoremIpsum); + } else { + auto container_client = CreateContainer(data.container_name); + CreateBlob(container_client, data.kObjectName, PreexistingData::kLoremIpsum); + } + + arrow::fs::AssertFileInfo(fs(), data.ObjectPath(), FileType::File); + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Not a regular file: '" + data.ObjectPath() + "/'"), + fs()->DeleteFile(data.ObjectPath() + "/")); + ASSERT_RAISES(IOError, fs()->DeleteFile(data.ObjectPath() + "/")); + } + + void TestDeleteFileAtSubdirectory(bool create_empty_dir_marker_first) { + auto data = SetUpPreexistingData(); + + auto setup_dir_file0 = [this, create_empty_dir_marker_first, &data]() { + if (WithHierarchicalNamespace()) { + ASSERT_FALSE(create_empty_dir_marker_first); + auto adlfs_client = + datalake_service_client_->GetFileSystemClient(data.container_name); + CreateFile(adlfs_client, "dir/file0", PreexistingData::kLoremIpsum); + } else { + auto container_client = CreateContainer(data.container_name); + if (create_empty_dir_marker_first) { + CreateBlob(container_client, "dir/", ""); + } + CreateBlob(container_client, "dir/file0", PreexistingData::kLoremIpsum); + } + }; + setup_dir_file0(); + + // Trying to delete a non-existing file in an existing directory should fail + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, + ::testing::HasSubstr("Path does not exist '" + data.Path("dir/nonexistent-file") + + "'"), + fs()->DeleteFile(data.Path("dir/nonexistent-file"))); + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, + ::testing::HasSubstr("Not a regular file: '" + + data.Path("dir/nonexistent-file/") + "'"), + fs()->DeleteFile(data.Path("dir/nonexistent-file/"))); + + // Trying to delete the directory with DeleteFile should fail + if (WithHierarchicalNamespace()) { + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Not a regular file: '" + data.Path("dir") + "'"), + fs()->DeleteFile(data.Path("dir"))); + } else { + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + data.Path("dir") + "'"), + fs()->DeleteFile(data.Path("dir"))); + } + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Not a regular file: '" + data.Path("dir/") + "'"), + fs()->DeleteFile(data.Path("dir/"))); + + arrow::fs::AssertFileInfo(fs(), data.Path("dir"), FileType::Directory); + arrow::fs::AssertFileInfo(fs(), data.Path("dir/"), FileType::Directory); + arrow::fs::AssertFileInfo(fs(), data.Path("dir/file0"), FileType::File); + ASSERT_OK(fs()->DeleteFile(data.Path("dir/file0"))); + arrow::fs::AssertFileInfo(fs(), data.Path("dir"), FileType::Directory); + arrow::fs::AssertFileInfo(fs(), data.Path("dir/"), FileType::Directory); + arrow::fs::AssertFileInfo(fs(), data.Path("dir/file0"), FileType::NotFound); + + // Recreating the file on the same path gurantees leases were properly released/broken + setup_dir_file0(); + + arrow::fs::AssertFileInfo(fs(), data.Path("dir/file0"), FileType::File); + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, + ::testing::HasSubstr("Not a regular file: '" + data.Path("dir/file0/") + "'"), + fs()->DeleteFile(data.Path("dir/file0/"))); + } + private: using StringMatcher = ::testing::PolymorphicMatcher<::testing::internal::HasSubstrMatcher>; @@ -1528,6 +1639,21 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirContentsFailureNonexisten this->TestDeleteDirContentsFailureNonexistent(); } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteFileAtRoot) { + void TestDeleteFileAtRoot(); +} + +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteFileAtContainerRoot) { + this->TestDeleteFileAtContainerRoot(); +} + +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteFileAtSubdirectory) { + this->TestDeleteFileAtSubdirectory(/*create_empty_dir_marker_first=*/false); + if (!this->WithHierarchicalNamespace()) { + this->TestDeleteFileAtSubdirectory(/*create_empty_dir_marker_first=*/true); + } +} + TYPED_TEST(TestAzureFileSystemOnAllScenarios, RenameContainer) { this->TestRenameContainer(); } @@ -1816,57 +1942,6 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsFailureNonexistent) { this->TestDeleteDirContentsFailureNonexistent(); } -TEST_F(TestAzuriteFileSystem, DeleteFileSuccess) { - const auto container_name = PreexistingData::RandomContainerName(rng_); - const auto file_name = ConcatAbstractPath(container_name, "filename"); - if (WithHierarchicalNamespace()) { - auto adlfs_client = CreateFilesystem(container_name); - CreateFile(adlfs_client, "filename", "data"); - } else { - auto container = CreateContainer(container_name); - CreateBlob(container, "filename", "data"); - } - arrow::fs::AssertFileInfo(fs(), file_name, FileType::File); - ASSERT_OK(fs()->DeleteFile(file_name)); - arrow::fs::AssertFileInfo(fs(), file_name, FileType::NotFound); -} - -TEST_F(TestAzuriteFileSystem, DeleteFileFailureNonexistent) { - const auto container_name = PreexistingData::RandomContainerName(rng_); - const auto nonexistent_file_name = ConcatAbstractPath(container_name, "nonexistent"); - if (WithHierarchicalNamespace()) { - ARROW_UNUSED(CreateFilesystem(container_name)); - } else { - ARROW_UNUSED(CreateContainer(container_name)); - } - ASSERT_RAISES(IOError, fs()->DeleteFile(nonexistent_file_name)); -} - -TEST_F(TestAzuriteFileSystem, DeleteFileFailureContainer) { - const auto container_name = PreexistingData::RandomContainerName(rng_); - if (WithHierarchicalNamespace()) { - ARROW_UNUSED(CreateFilesystem(container_name)); - } else { - ARROW_UNUSED(CreateContainer(container_name)); - } - arrow::fs::AssertFileInfo(fs(), container_name, FileType::Directory); - ASSERT_RAISES(IOError, fs()->DeleteFile(container_name)); -} - -TEST_F(TestAzuriteFileSystem, DeleteFileFailureDirectory) { - auto container_name = PreexistingData::RandomContainerName(rng_); - if (WithHierarchicalNamespace()) { - auto adlfs_client = CreateFilesystem(container_name); - CreateDirectory(adlfs_client, "directory"); - } else { - auto container = CreateContainer(container_name); - CreateBlob(container, "directory/"); - } - auto directory_path = ConcatAbstractPath(container_name, "directory"); - arrow::fs::AssertFileInfo(fs(), directory_path, FileType::Directory); - ASSERT_RAISES(IOError, fs()->DeleteFile(directory_path)); -} - TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) { auto data = SetUpPreexistingData(); const auto destination_path = data.ContainerPath("copy-destionation"); From edfc5ba89900bf6607d83ab39640c7d0c0a12a02 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 15 Feb 2024 14:58:07 -0300 Subject: [PATCH 03/15] Fixes from PR feedback --- cpp/src/arrow/filesystem/azurefs_test.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index b58db39a8f3..bddb752e8a3 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -908,14 +908,14 @@ class TestAzureFileSystem : public ::testing::Test { EXPECT_RAISES_WITH_MESSAGE_THAT( IOError, - ::testing::HasSubstr("Path does not exist '" + data.Path("nonexistent-file") + + ::testing::HasSubstr("Path does not exist '" + data.Path("nonexistent-path") + "'"), - fs()->DeleteFile(data.Path("nonexistent-file"))); + fs()->DeleteFile(data.Path("nonexistent-path"))); EXPECT_RAISES_WITH_MESSAGE_THAT( IOError, - ::testing::HasSubstr("Not a regular file: '" + data.Path("nonexistent-file/") + + ::testing::HasSubstr("Not a regular file: '" + data.Path("nonexistent-path/") + "'"), - fs()->DeleteFile(data.Path("nonexistent-file/"))); + fs()->DeleteFile(data.Path("nonexistent-path/"))); arrow::fs::AssertFileInfo(fs(), data.ObjectPath(), FileType::File); ASSERT_OK(fs()->DeleteFile(data.ObjectPath())); @@ -959,14 +959,14 @@ class TestAzureFileSystem : public ::testing::Test { // Trying to delete a non-existing file in an existing directory should fail EXPECT_RAISES_WITH_MESSAGE_THAT( IOError, - ::testing::HasSubstr("Path does not exist '" + data.Path("dir/nonexistent-file") + + ::testing::HasSubstr("Path does not exist '" + data.Path("dir/nonexistent-path") + "'"), - fs()->DeleteFile(data.Path("dir/nonexistent-file"))); + fs()->DeleteFile(data.Path("dir/nonexistent-path"))); EXPECT_RAISES_WITH_MESSAGE_THAT( IOError, ::testing::HasSubstr("Not a regular file: '" + - data.Path("dir/nonexistent-file/") + "'"), - fs()->DeleteFile(data.Path("dir/nonexistent-file/"))); + data.Path("dir/nonexistent-path/") + "'"), + fs()->DeleteFile(data.Path("dir/nonexistent-path/"))); // Trying to delete the directory with DeleteFile should fail if (WithHierarchicalNamespace()) { @@ -993,11 +993,11 @@ class TestAzureFileSystem : public ::testing::Test { // Recreating the file on the same path gurantees leases were properly released/broken setup_dir_file0(); - arrow::fs::AssertFileInfo(fs(), data.Path("dir/file0"), FileType::File); EXPECT_RAISES_WITH_MESSAGE_THAT( IOError, ::testing::HasSubstr("Not a regular file: '" + data.Path("dir/file0/") + "'"), fs()->DeleteFile(data.Path("dir/file0/"))); + arrow::fs::AssertFileInfo(fs(), data.Path("dir/file0"), FileType::File); } private: From 7a822fe38e98224f1fa7a440e09e8cd54cf1d8fe Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 12 Feb 2024 18:02:21 -0300 Subject: [PATCH 04/15] AzureLocation: Add RemoveTrailingSlash() method --- cpp/src/arrow/filesystem/azurefs.cc | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 05184845d83..dc4fe84bc43 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -266,6 +266,24 @@ struct AzureLocation { return parent; } + /// \brief Create a copy of this location with the trailing slash removed from the + /// path. + /// + /// \param[in] preserve_original If true, the original string that was parsed + /// into this location is preserved as is. + AzureLocation RemoveTrailingSlash(bool preserve_original) const { + AzureLocation result; + if (preserve_original) { + result.all = all; + } else { + result.all = internal::RemoveTrailingSlash(all); + } + result.container = container; + result.path = internal::RemoveTrailingSlash(path); + result.path_parts = path_parts; + return result; + } + Result join(const std::string& stem) const { return FromString(internal::ConcatAbstractPath(all, stem)); } From 0a69c6b62348e857189c08d6444cfd531ecfcdb6 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 15 Feb 2024 22:01:25 -0300 Subject: [PATCH 05/15] DeleteFile: Be more detailed in the type of errors returned --- cpp/src/arrow/filesystem/azurefs.cc | 89 ++++++++++++++++-------- cpp/src/arrow/filesystem/azurefs_test.cc | 69 +++++++++--------- 2 files changed, 93 insertions(+), 65 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index dc4fe84bc43..0e87d9d27e8 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -2055,25 +2055,24 @@ class AzureFileSystem::Impl { public: /// \pre location.container is not empty. /// \pre location.path is not empty. - /// \pre location.path does not end with a trailing slash. Status DeleteFileOnFileSystem(const DataLake::DataLakeFileSystemClient& adlfs_client, - const AzureLocation& location, bool require_file_to_exist, - Azure::Nullable lease_id = {}) { + const AzureLocation& location, + bool require_file_to_exist) { DCHECK(!location.container.empty()); DCHECK(!location.path.empty()); - DCHECK(!internal::HasTrailingSlash(location.path)); - auto file_client = adlfs_client.GetFileClient(location.path); - DataLake::GetPathPropertiesOptions get_options; - get_options.AccessConditions.LeaseId = lease_id; - DataLake::DeleteFileOptions delete_options; - delete_options.AccessConditions.LeaseId = std::move(lease_id); + auto path_no_trailing_slash = + std::string{internal::RemoveTrailingSlash(location.path)}; + auto file_client = adlfs_client.GetFileClient(path_no_trailing_slash); try { // This is necessary to avoid deletion of directories via DeleteFile. - auto properties = file_client.GetProperties(get_options); + auto properties = file_client.GetProperties(); if (properties.Value.IsDirectory) { return internal::NotAFile(location.all); } - auto response = file_client.Delete(delete_options); + if (internal::HasTrailingSlash(location.path)) { + return internal::NotADir(location.all); + } + auto response = file_client.Delete(); // Only the "*IfExists" functions ever set Deleted to false. // All the others either succeed or throw an exception. DCHECK(response.Value.Deleted); @@ -2093,14 +2092,36 @@ class AzureFileSystem::Impl { /// \pre location.container is not empty. /// \pre location.path is not empty. - /// \pre location.path does not end with a trailing slash. Status DeleteFileOnContainer(const Blobs::BlobContainerClient& container_client, const AzureLocation& location, bool require_file_to_exist, const char* operation) { DCHECK(!location.container.empty()); DCHECK(!location.path.empty()); - DCHECK(!internal::HasTrailingSlash(location.path)); constexpr auto kFileBlobLeaseTime = std::chrono::seconds{15}; + auto no_trailing_slash_location = location.RemoveTrailingSlash( + /*preserve_original=*/true); + + std::optional location_type_opt; + auto location_type_lazy = [&]() -> Result { + if (location_type_opt.has_value()) { + return location_type_opt.value(); + } + ARROW_ASSIGN_OR_RAISE(auto info, + GetFileInfo(container_client, no_trailing_slash_location)); + location_type_opt = info.type(); + return info.type(); + }; + + // Paths ending with trailing slashes are never leading to a deletion, + // but the correct error message requires a check of the path. + if (internal::HasTrailingSlash(location.path)) { + ARROW_ASSIGN_OR_RAISE(auto file_type, location_type_lazy()); + if (file_type == FileType::NotFound) { + return require_file_to_exist ? PathNotFound(location) : Status::OK(); + } + return file_type == FileType::Directory ? internal::NotAFile(location.all) + : internal::NotADir(location.all); + } // If the parent directory of a file is not the container itself, there is a // risk that deleting the file also deletes the *implied directory* -- the @@ -2117,11 +2138,16 @@ class AzureFileSystem::Impl { // file first. ARROW_ASSIGN_OR_RAISE(auto file_blob_lease_client, AcquireBlobLease(location, kFileBlobLeaseTime, - /*allow_missing=*/!require_file_to_exist)); + /*allow_missing=*/true)); if (file_blob_lease_client) { file_blob_lease_guard.emplace(std::move(file_blob_lease_client), kFileBlobLeaseTime); - // Ensure the empty directory marker blob exists before the file is deleted. + // If the path without a trailing slash exists, we can be sure it's a file. + // Cache it in location_type_opt so we don't have to check it again later + // in case location_type_lazy is called. + location_type_opt = FileType::File; + // Ensure the empty directory marker blob of the parent exists before the file is + // deleted. // // There is not need to hold a lease on the directory marker because if // a concurrent client deletes the directory marker right after we @@ -2132,14 +2158,17 @@ class AzureFileSystem::Impl { // client deletes the directory marker. RETURN_NOT_OK(EnsureEmptyDirExists(container_client, parent, operation)); } else { - // File blob doesn't exist and require_file_to_exists is false, so just return OK. - // Trying to delete a file that doesn't exist, costs only one request to the - // server. - DCHECK(!require_file_to_exist); - return Status::OK(); + // The blob representing a file doesn't exist, but the path may exist as + // a directory so we must check here. + ARROW_ASSIGN_OR_RAISE(auto file_type, location_type_lazy()); + if (file_type == FileType::Directory) { + return internal::NotAFile(location.all); + } + return require_file_to_exist ? PathNotFound(location) : Status::OK(); } } + DCHECK(!internal::HasTrailingSlash(location.path)); auto blob_client = container_client.GetBlobClient(location.path); Blobs::DeleteBlobOptions options; if (file_blob_lease_guard) { @@ -2152,10 +2181,13 @@ class AzureFileSystem::Impl { DCHECK(response.Value.Deleted); } catch (const Storage::StorageException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { - if (require_file_to_exist) { - return PathNotFound(location); + // The blob representing a file doesn't exist, but the path may exist as + // a directory so we must check here. + ARROW_ASSIGN_OR_RAISE(auto file_type, location_type_lazy()); + if (file_type == FileType::Directory) { + return internal::NotAFile(location.all); } - return Status::OK(); + return require_file_to_exist ? PathNotFound(location) : Status::OK(); } return ExceptionToStatus(exception, "Failed to delete a file: ", location.all, ": ", blob_client.GetUrl()); @@ -2656,10 +2688,12 @@ Status AzureFileSystem::DeleteFile(const std::string& path) { if (location.container.empty()) { return Status::Invalid("DeleteFile requires a non-empty path."); } - if (internal::HasTrailingSlash(location.path) || location.path.empty()) { - // Container paths (locations w/o path) or paths ending with a slash are not file - // paths. - return NotAFile(location); + auto container_client = impl_->GetBlobContainerClient(location.container); + if (location.path.empty()) { + // Container paths (locations w/o path) are either not found or represent directories. + ARROW_ASSIGN_OR_RAISE(auto container_info, + GetContainerPropsAsFileInfo(location, container_client)); + return container_info.IsDirectory() ? NotAFile(location) : PathNotFound(location); } auto adlfs_client = impl_->GetFileSystemClient(location.container); ARROW_ASSIGN_OR_RAISE(auto hns_support, @@ -2671,7 +2705,6 @@ Status AzureFileSystem::DeleteFile(const std::string& path) { return impl_->DeleteFileOnFileSystem(adlfs_client, location, /*require_file_to_exist=*/true); } - auto container_client = impl_->GetBlobContainerClient(location.container); return impl_->DeleteFileOnContainer(container_client, location, /*require_file_to_exist=*/true, /*operation=*/"DeleteFile"); diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index bddb752e8a3..1664823ff34 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -641,6 +641,18 @@ class TestAzureFileSystem : public ::testing::Test { #endif } + static bool WithErrno(const Status& status, int expected_errno) { + auto* detail = status.detail().get(); + return detail && + arrow::internal::ErrnoFromStatusDetail(*detail).value_or(-1) == expected_errno; + } + +#define ASSERT_RAISES_ERRNO(expr, expected_errno) \ + for (::arrow::Status _st = ::arrow::internal::GenericToStatus((expr)); \ + !WithErrno(_st, (expected_errno));) \ + FAIL() << "'" ARROW_STRINGIFY(expr) "' did not fail with errno=" << #expected_errno \ + << ": " << _st.ToString() + // Tests that are called from more than one implementation of TestAzureFileSystem void TestDetectHierarchicalNamespace(bool trip_up_azurite); @@ -890,8 +902,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteFileAtRoot() { - ASSERT_RAISES(Invalid, fs()->DeleteFile("file0")); - ASSERT_RAISES(Invalid, fs()->DeleteFile("file1/")); + ASSERT_RAISES_ERRNO(fs()->DeleteFile("file0"), ENOENT); + ASSERT_RAISES_ERRNO(fs()->DeleteFile("file1/"), ENOENT); const auto container_name = PreexistingData::RandomContainerName(rng_); if (WithHierarchicalNamespace()) { ARROW_UNUSED(CreateFilesystem(container_name)); @@ -899,23 +911,19 @@ class TestAzureFileSystem : public ::testing::Test { ARROW_UNUSED(CreateContainer(container_name)); } arrow::fs::AssertFileInfo(fs(), container_name, FileType::Directory); - ASSERT_RAISES(IOError, fs()->DeleteFile(container_name)); - ASSERT_RAISES(IOError, fs()->DeleteFile(container_name + "/")); + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Not a regular file: '" + container_name + "'"), + fs()->DeleteFile(container_name)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Not a regular file: '" + container_name + "/'"), + fs()->DeleteFile(container_name + "/")); } void TestDeleteFileAtContainerRoot() { auto data = SetUpPreexistingData(); - EXPECT_RAISES_WITH_MESSAGE_THAT( - IOError, - ::testing::HasSubstr("Path does not exist '" + data.Path("nonexistent-path") + - "'"), - fs()->DeleteFile(data.Path("nonexistent-path"))); - EXPECT_RAISES_WITH_MESSAGE_THAT( - IOError, - ::testing::HasSubstr("Not a regular file: '" + data.Path("nonexistent-path/") + - "'"), - fs()->DeleteFile(data.Path("nonexistent-path/"))); + ASSERT_RAISES_ERRNO(fs()->DeleteFile(data.Path("nonexistent-path")), ENOENT); + ASSERT_RAISES_ERRNO(fs()->DeleteFile(data.Path("nonexistent-path/")), ENOENT); arrow::fs::AssertFileInfo(fs(), data.ObjectPath(), FileType::File); ASSERT_OK(fs()->DeleteFile(data.ObjectPath())); @@ -929,12 +937,11 @@ class TestAzureFileSystem : public ::testing::Test { auto container_client = CreateContainer(data.container_name); CreateBlob(container_client, data.kObjectName, PreexistingData::kLoremIpsum); } - arrow::fs::AssertFileInfo(fs(), data.ObjectPath(), FileType::File); - EXPECT_RAISES_WITH_MESSAGE_THAT( - IOError, ::testing::HasSubstr("Not a regular file: '" + data.ObjectPath() + "/'"), - fs()->DeleteFile(data.ObjectPath() + "/")); - ASSERT_RAISES(IOError, fs()->DeleteFile(data.ObjectPath() + "/")); + + ASSERT_RAISES_ERRNO(fs()->DeleteFile(data.ObjectPath() + "/"), ENOTDIR); + ASSERT_OK(fs()->DeleteFile(data.ObjectPath())); + arrow::fs::AssertFileInfo(fs(), data.ObjectPath(), FileType::NotFound); } void TestDeleteFileAtSubdirectory(bool create_empty_dir_marker_first) { @@ -964,20 +971,14 @@ class TestAzureFileSystem : public ::testing::Test { fs()->DeleteFile(data.Path("dir/nonexistent-path"))); EXPECT_RAISES_WITH_MESSAGE_THAT( IOError, - ::testing::HasSubstr("Not a regular file: '" + + ::testing::HasSubstr("Path does not exist '" + data.Path("dir/nonexistent-path/") + "'"), fs()->DeleteFile(data.Path("dir/nonexistent-path/"))); // Trying to delete the directory with DeleteFile should fail - if (WithHierarchicalNamespace()) { - EXPECT_RAISES_WITH_MESSAGE_THAT( - IOError, ::testing::HasSubstr("Not a regular file: '" + data.Path("dir") + "'"), - fs()->DeleteFile(data.Path("dir"))); - } else { - EXPECT_RAISES_WITH_MESSAGE_THAT( - IOError, ::testing::HasSubstr("Path does not exist '" + data.Path("dir") + "'"), - fs()->DeleteFile(data.Path("dir"))); - } + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Not a regular file: '" + data.Path("dir") + "'"), + fs()->DeleteFile(data.Path("dir"))); EXPECT_RAISES_WITH_MESSAGE_THAT( IOError, ::testing::HasSubstr("Not a regular file: '" + data.Path("dir/") + "'"), fs()->DeleteFile(data.Path("dir/"))); @@ -995,7 +996,7 @@ class TestAzureFileSystem : public ::testing::Test { EXPECT_RAISES_WITH_MESSAGE_THAT( IOError, - ::testing::HasSubstr("Not a regular file: '" + data.Path("dir/file0/") + "'"), + ::testing::HasSubstr("Not a directory: '" + data.Path("dir/file0/") + "'"), fs()->DeleteFile(data.Path("dir/file0/"))); arrow::fs::AssertFileInfo(fs(), data.Path("dir/file0"), FileType::File); } @@ -1157,12 +1158,6 @@ class TestAzureFileSystem : public ::testing::Test { AssertFileInfo(fs(), dest, type); } - static bool WithErrno(const Status& status, int expected_errno) { - auto* detail = status.detail().get(); - return detail && - arrow::internal::ErrnoFromStatusDetail(*detail).value_or(-1) == expected_errno; - } - std::optional MoveErrorMessageMatcher(const FileInfo& src_info, const std::string& src, const std::string& dest, @@ -1640,7 +1635,7 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirContentsFailureNonexisten } TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteFileAtRoot) { - void TestDeleteFileAtRoot(); + this->TestDeleteFileAtRoot(); } TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteFileAtContainerRoot) { From a36ce5241dd338e82ba70e9162467346a634699a Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 15 Feb 2024 23:03:14 -0300 Subject: [PATCH 06/15] Note about the need to break leases only when using the DataLake API --- cpp/src/arrow/filesystem/azurefs.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 0e87d9d27e8..fe74ed02c3d 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1103,7 +1103,11 @@ class LeaseGuard { return Status::OK(); } - /// \brief Break the lease before deleting or renaming the resource. + /// \brief Break the lease before deleting or renaming the resource via the + /// DataLakeFileSystemClient API. + /// + /// NOTE: When using the Blobs API, this is not necessary -- you can release a + /// lease on a path after it's deleted with a lease on it. /// /// Calling this is recommended when the resource for which the lease was acquired is /// about to be deleted as there is no way of releasing the lease after that, we can From cf037242250ddd02af5050cf87e58bee7e6afc4f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 16 Feb 2024 11:42:49 -0300 Subject: [PATCH 07/15] Rename AzureLocation::RemoveTrailingSlash{->FromPath}() ...instead of taking a bool parameter. --- cpp/src/arrow/filesystem/azurefs.cc | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index fe74ed02c3d..0aeac8adaec 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -269,15 +269,11 @@ struct AzureLocation { /// \brief Create a copy of this location with the trailing slash removed from the /// path. /// - /// \param[in] preserve_original If true, the original string that was parsed - /// into this location is preserved as is. - AzureLocation RemoveTrailingSlash(bool preserve_original) const { + /// The original string that was parsed into this location -- all -- is preserved as is + /// to make error messages show what the user originally provided. + AzureLocation RemoveTrailingSlashFromPath() const { AzureLocation result; - if (preserve_original) { - result.all = all; - } else { - result.all = internal::RemoveTrailingSlash(all); - } + result.all = all; result.container = container; result.path = internal::RemoveTrailingSlash(path); result.path_parts = path_parts; @@ -2102,8 +2098,7 @@ class AzureFileSystem::Impl { DCHECK(!location.container.empty()); DCHECK(!location.path.empty()); constexpr auto kFileBlobLeaseTime = std::chrono::seconds{15}; - auto no_trailing_slash_location = location.RemoveTrailingSlash( - /*preserve_original=*/true); + auto no_trailing_slash_location = location.RemoveTrailingSlashFromPath(); std::optional location_type_opt; auto location_type_lazy = [&]() -> Result { From 965fcbadebb14eb85b1015bf9fb73884e7deaaf3 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 16 Feb 2024 11:51:29 -0300 Subject: [PATCH 08/15] DeleteFile: Simplify how path is tested for being a directory when not found as a file --- cpp/src/arrow/filesystem/azurefs.cc | 49 ++++++++++------------------- 1 file changed, 16 insertions(+), 33 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 0aeac8adaec..0487cb8d427 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -2098,28 +2098,27 @@ class AzureFileSystem::Impl { DCHECK(!location.container.empty()); DCHECK(!location.path.empty()); constexpr auto kFileBlobLeaseTime = std::chrono::seconds{15}; - auto no_trailing_slash_location = location.RemoveTrailingSlashFromPath(); - std::optional location_type_opt; - auto location_type_lazy = [&]() -> Result { - if (location_type_opt.has_value()) { - return location_type_opt.value(); - } - ARROW_ASSIGN_OR_RAISE(auto info, + // When it's known that the blob doesn't exist as a file, check if it exists as a + // directory to generate the appropriate error message. + auto check_if_location_exists_as_dir = [&]() -> Status { + auto no_trailing_slash_location = location.RemoveTrailingSlashFromPath(); + ARROW_ASSIGN_OR_RAISE(auto file_info, GetFileInfo(container_client, no_trailing_slash_location)); - location_type_opt = info.type(); - return info.type(); + if (file_info.type() == FileType::NotFound) { + return require_file_to_exist ? PathNotFound(location) : Status::OK(); + } + if (file_info.type() == FileType::Directory) { + return internal::NotAFile(location.all); + } + return internal::HasTrailingSlash(location.path) ? internal::NotADir(location.all) + : internal::NotAFile(location.all); }; // Paths ending with trailing slashes are never leading to a deletion, // but the correct error message requires a check of the path. if (internal::HasTrailingSlash(location.path)) { - ARROW_ASSIGN_OR_RAISE(auto file_type, location_type_lazy()); - if (file_type == FileType::NotFound) { - return require_file_to_exist ? PathNotFound(location) : Status::OK(); - } - return file_type == FileType::Directory ? internal::NotAFile(location.all) - : internal::NotADir(location.all); + return check_if_location_exists_as_dir(); } // If the parent directory of a file is not the container itself, there is a @@ -2141,10 +2140,6 @@ class AzureFileSystem::Impl { if (file_blob_lease_client) { file_blob_lease_guard.emplace(std::move(file_blob_lease_client), kFileBlobLeaseTime); - // If the path without a trailing slash exists, we can be sure it's a file. - // Cache it in location_type_opt so we don't have to check it again later - // in case location_type_lazy is called. - location_type_opt = FileType::File; // Ensure the empty directory marker blob of the parent exists before the file is // deleted. // @@ -2157,13 +2152,7 @@ class AzureFileSystem::Impl { // client deletes the directory marker. RETURN_NOT_OK(EnsureEmptyDirExists(container_client, parent, operation)); } else { - // The blob representing a file doesn't exist, but the path may exist as - // a directory so we must check here. - ARROW_ASSIGN_OR_RAISE(auto file_type, location_type_lazy()); - if (file_type == FileType::Directory) { - return internal::NotAFile(location.all); - } - return require_file_to_exist ? PathNotFound(location) : Status::OK(); + return check_if_location_exists_as_dir(); } } @@ -2180,13 +2169,7 @@ class AzureFileSystem::Impl { DCHECK(response.Value.Deleted); } catch (const Storage::StorageException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { - // The blob representing a file doesn't exist, but the path may exist as - // a directory so we must check here. - ARROW_ASSIGN_OR_RAISE(auto file_type, location_type_lazy()); - if (file_type == FileType::Directory) { - return internal::NotAFile(location.all); - } - return require_file_to_exist ? PathNotFound(location) : Status::OK(); + return check_if_location_exists_as_dir(); } return ExceptionToStatus(exception, "Failed to delete a file: ", location.all, ": ", blob_client.GetUrl()); From a37272b49d5298a32ff4a1684e8d30689abdd252 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 19 Feb 2024 09:48:51 -0300 Subject: [PATCH 09/15] Improve GetFileInfo error messages --- cpp/src/arrow/filesystem/azurefs.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 0487cb8d427..d37773a3006 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1292,7 +1292,8 @@ class AzureFileSystem::Impl { } return ExceptionToStatus( exception, "GetProperties for '", file_client.GetUrl(), - "' failed. GetFileInfo is unable to determine whether the path exists."); + "' failed. GetFileInfo is unable to determine whether the path exists: ", + location.all); } } @@ -1346,7 +1347,9 @@ class AzureFileSystem::Impl { } return ExceptionToStatus( exception, "ListBlobsByHierarchy failed for prefix='", *options.Prefix, - "'. GetFileInfo is unable to determine whether the path exists."); + "' on '", container_client.GetUrl(), + "'. GetFileInfo is unable to determine whether the path exists: '", + location.all, "'"); } } From 31b997d712209878bfa5eb0113a1c5e2336909ee Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 19 Feb 2024 09:50:04 -0300 Subject: [PATCH 10/15] Revert "Improve GetFileInfo error messages" This reverts commit a37272b49d5298a32ff4a1684e8d30689abdd252. --- cpp/src/arrow/filesystem/azurefs.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index d37773a3006..0487cb8d427 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1292,8 +1292,7 @@ class AzureFileSystem::Impl { } return ExceptionToStatus( exception, "GetProperties for '", file_client.GetUrl(), - "' failed. GetFileInfo is unable to determine whether the path exists: ", - location.all); + "' failed. GetFileInfo is unable to determine whether the path exists."); } } @@ -1347,9 +1346,7 @@ class AzureFileSystem::Impl { } return ExceptionToStatus( exception, "ListBlobsByHierarchy failed for prefix='", *options.Prefix, - "' on '", container_client.GetUrl(), - "'. GetFileInfo is unable to determine whether the path exists: '", - location.all, "'"); + "'. GetFileInfo is unable to determine whether the path exists."); } } From 839d3a7493d0c0df4570edc343594bf0c44e1ff5 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 19 Feb 2024 09:49:32 -0300 Subject: [PATCH 11/15] Don't use AzureLocation::RemoveTrailingSlashFromPath --- cpp/src/arrow/filesystem/azurefs.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 0487cb8d427..9959b1955f2 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -2102,9 +2102,7 @@ class AzureFileSystem::Impl { // When it's known that the blob doesn't exist as a file, check if it exists as a // directory to generate the appropriate error message. auto check_if_location_exists_as_dir = [&]() -> Status { - auto no_trailing_slash_location = location.RemoveTrailingSlashFromPath(); - ARROW_ASSIGN_OR_RAISE(auto file_info, - GetFileInfo(container_client, no_trailing_slash_location)); + ARROW_ASSIGN_OR_RAISE(auto file_info, GetFileInfo(container_client, location)); if (file_info.type() == FileType::NotFound) { return require_file_to_exist ? PathNotFound(location) : Status::OK(); } From 5e3c61c58b0110b5642092acd3adc6b2fd96a9af Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 19 Feb 2024 10:00:30 -0300 Subject: [PATCH 12/15] Manually copy and remove trailing slashes from location --- cpp/src/arrow/filesystem/azurefs.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 9959b1955f2..fe112366322 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -2102,7 +2102,11 @@ class AzureFileSystem::Impl { // When it's known that the blob doesn't exist as a file, check if it exists as a // directory to generate the appropriate error message. auto check_if_location_exists_as_dir = [&]() -> Status { - ARROW_ASSIGN_OR_RAISE(auto file_info, GetFileInfo(container_client, location)); + auto no_trailing_slash = location; + no_trailing_slash.path = internal::RemoveTrailingSlash(location.path); + no_trailing_slash.all = internal::RemoveTrailingSlash(location.all); + ARROW_ASSIGN_OR_RAISE(auto file_info, + GetFileInfo(container_client, no_trailing_slash)); if (file_info.type() == FileType::NotFound) { return require_file_to_exist ? PathNotFound(location) : Status::OK(); } From 066fa7ba15696b6eca81e87a6fe93c56bc6b947d Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 19 Feb 2024 10:05:21 -0300 Subject: [PATCH 13/15] Revert "Rename AzureLocation::RemoveTrailingSlash{->FromPath}()" This reverts commit cf037242250ddd02af5050cf87e58bee7e6afc4f. --- cpp/src/arrow/filesystem/azurefs.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index fe112366322..4e136277f24 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -269,11 +269,15 @@ struct AzureLocation { /// \brief Create a copy of this location with the trailing slash removed from the /// path. /// - /// The original string that was parsed into this location -- all -- is preserved as is - /// to make error messages show what the user originally provided. - AzureLocation RemoveTrailingSlashFromPath() const { + /// \param[in] preserve_original If true, the original string that was parsed + /// into this location is preserved as is. + AzureLocation RemoveTrailingSlash(bool preserve_original) const { AzureLocation result; - result.all = all; + if (preserve_original) { + result.all = all; + } else { + result.all = internal::RemoveTrailingSlash(all); + } result.container = container; result.path = internal::RemoveTrailingSlash(path); result.path_parts = path_parts; From 57e96473a03fd81118f6cec7862a0cd739e35dfa Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 19 Feb 2024 10:05:24 -0300 Subject: [PATCH 14/15] Revert "AzureLocation: Add RemoveTrailingSlash() method" This reverts commit 7a822fe38e98224f1fa7a440e09e8cd54cf1d8fe. --- cpp/src/arrow/filesystem/azurefs.cc | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 4e136277f24..d55fef8fafa 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -266,24 +266,6 @@ struct AzureLocation { return parent; } - /// \brief Create a copy of this location with the trailing slash removed from the - /// path. - /// - /// \param[in] preserve_original If true, the original string that was parsed - /// into this location is preserved as is. - AzureLocation RemoveTrailingSlash(bool preserve_original) const { - AzureLocation result; - if (preserve_original) { - result.all = all; - } else { - result.all = internal::RemoveTrailingSlash(all); - } - result.container = container; - result.path = internal::RemoveTrailingSlash(path); - result.path_parts = path_parts; - return result; - } - Result join(const std::string& stem) const { return FromString(internal::ConcatAbstractPath(all, stem)); } From e72afc9bfe5f75281a39c33de92ceff8f5a5a852 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 20 Feb 2024 22:03:42 -0300 Subject: [PATCH 15/15] Remove redundant DCHECK --- cpp/src/arrow/filesystem/azurefs.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index d55fef8fafa..75219828992 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -2144,7 +2144,6 @@ class AzureFileSystem::Impl { } } - DCHECK(!internal::HasTrailingSlash(location.path)); auto blob_client = container_client.GetBlobClient(location.path); Blobs::DeleteBlobOptions options; if (file_blob_lease_guard) {