From 1abba8e067fbca8230f3797edf799c9ddb457edf Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 11 Dec 2024 18:28:13 +0000 Subject: [PATCH 01/22] Working with ConfigureSasCredential --- cpp/src/arrow/filesystem/azurefs.cc | 17 +++++++++++++++++ cpp/src/arrow/filesystem/azurefs.h | 3 +++ cpp/src/arrow/filesystem/azurefs_test.cc | 8 ++++++++ 3 files changed, 28 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 78f4ad1edd9..f044eab26c8 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -243,6 +243,8 @@ bool AzureOptions::Equals(const AzureOptions& other) const { case CredentialKind::kStorageSharedKey: return storage_shared_key_credential_->AccountName == other.storage_shared_key_credential_->AccountName; + case CredentialKind::kSasToken: + return sas_token_ == other.sas_token_; case CredentialKind::kClientSecret: case CredentialKind::kCLI: case CredentialKind::kManagedIdentity: @@ -311,6 +313,15 @@ Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_ke return Status::OK(); } +Status AzureOptions::ConfigureSasCredential(const std::string& sas_token) { + credential_kind_ = CredentialKind::kSasToken; + if (account_name.empty()) { + return Status::Invalid("AzureOptions doesn't contain a valid account name"); + } + sas_token_ = sas_token; + return Status::OK(); +} + Status AzureOptions::ConfigureClientSecretCredential(const std::string& tenant_id, const std::string& client_id, const std::string& client_secret) { @@ -372,6 +383,9 @@ Result> AzureOptions::MakeBlobServiceC case CredentialKind::kStorageSharedKey: return std::make_unique(AccountBlobUrl(account_name), storage_shared_key_credential_); + case CredentialKind::kSasToken: + return std::make_unique(AccountBlobUrl(account_name) + + "?" + sas_token_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } @@ -404,6 +418,9 @@ AzureOptions::MakeDataLakeServiceClient() const { case CredentialKind::kStorageSharedKey: return std::make_unique( AccountDfsUrl(account_name), storage_shared_key_credential_); + case CredentialKind::kSasToken: + return std::make_unique( + AccountBlobUrl(account_name) + "?" + sas_token_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index c5e50912569..4f46e75f811 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -120,6 +120,7 @@ struct ARROW_EXPORT AzureOptions { kDefault, kAnonymous, kStorageSharedKey, + kSasToken, kClientSecret, kManagedIdentity, kCLI, @@ -129,6 +130,7 @@ struct ARROW_EXPORT AzureOptions { std::shared_ptr storage_shared_key_credential_; + std::string sas_token_; mutable std::shared_ptr token_credential_; public: @@ -189,6 +191,7 @@ struct ARROW_EXPORT AzureOptions { Status ConfigureDefaultCredential(); Status ConfigureAnonymousCredential(); Status ConfigureAccountKeyCredential(const std::string& account_key); + Status ConfigureSasCredential(const std::string& sas_token); Status ConfigureClientSecretCredential(const std::string& tenant_id, const std::string& client_id, const std::string& client_secret); diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index a04977bdee0..c8c15ef9012 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -520,6 +520,14 @@ TEST(AzureFileSystem, InitializeWithWorkloadIdentityCredential) { EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } +TEST(AzureFileSystem, InitializeWithSasCredential) { + AzureOptions options; + options.account_name = "dummy-account-name"; + // This SAS token is not secret - its taken from an example in the Azure docs. + ARROW_EXPECT_OK(options.ConfigureSasCredential("sv=2015-02-21&st=2015-07-01T08%3a49Z&se=2015-07-02T08%3a49Z&sr=c&sp=w&si=YWJjZGVmZw%3d%3d&sig=Rcp6gQRfV7WDlURdVTqCa%2bqEArnfJxDgE%2bKH3TCChIs%3d")); + EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); +} + TEST(AzureFileSystem, InitializeWithEnvironmentCredential) { AzureOptions options; options.account_name = "dummy-account-name"; From ea091b469deb2f714b584e6e9b63c428fb1a68a0 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 12 Dec 2024 11:49:15 +0000 Subject: [PATCH 02/22] Real access test working against azurite --- cpp/src/arrow/filesystem/azurefs.cc | 4 +-- cpp/src/arrow/filesystem/azurefs_test.cc | 39 +++++++++++++++++++----- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index f044eab26c8..f257161ae65 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -385,7 +385,7 @@ Result> AzureOptions::MakeBlobServiceC storage_shared_key_credential_); case CredentialKind::kSasToken: return std::make_unique(AccountBlobUrl(account_name) + - "?" + sas_token_); + sas_token_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } @@ -420,7 +420,7 @@ AzureOptions::MakeDataLakeServiceClient() const { AccountDfsUrl(account_name), storage_shared_key_credential_); case CredentialKind::kSasToken: return std::make_unique( - AccountBlobUrl(account_name) + "?" + sas_token_); + AccountBlobUrl(account_name) + sas_token_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index c8c15ef9012..6d8f7ef91d2 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -520,14 +520,6 @@ TEST(AzureFileSystem, InitializeWithWorkloadIdentityCredential) { EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } -TEST(AzureFileSystem, InitializeWithSasCredential) { - AzureOptions options; - options.account_name = "dummy-account-name"; - // This SAS token is not secret - its taken from an example in the Azure docs. - ARROW_EXPECT_OK(options.ConfigureSasCredential("sv=2015-02-21&st=2015-07-01T08%3a49Z&se=2015-07-02T08%3a49Z&sr=c&sp=w&si=YWJjZGVmZw%3d%3d&sig=Rcp6gQRfV7WDlURdVTqCa%2bqEArnfJxDgE%2bKH3TCChIs%3d")); - EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); -} - TEST(AzureFileSystem, InitializeWithEnvironmentCredential) { AzureOptions options; options.account_name = "dummy-account-name"; @@ -920,6 +912,17 @@ class TestAzureFileSystem : public ::testing::Test { .Value; } + Result GetContainerSasToken(const std::string& container_name) { + std::string sas_token; + Azure::Storage::Sas::BlobSasBuilder builder; + std::chrono::seconds available_period(60); + builder.ExpiresOn = std::chrono::system_clock::now() + available_period; + builder.BlobContainerName = container_name; + builder.Resource = Azure::Storage::Sas::BlobSasResource::BlobContainer; + builder.SetPermissions(Azure::Storage::Sas::BlobContainerSasPermissions::All); + return options_.GenerateSASToken(&builder, blob_service_client_.get()); + } + void UploadLines(const std::vector& lines, const std::string& path, int total_size) { ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); @@ -1625,6 +1628,22 @@ class TestAzureFileSystem : public ::testing::Test { AssertObjectContents(fs.get(), path, payload); } + void TestSasCredential() { + ASSERT_OK_AND_ASSIGN(auto env, AzuriteEnv::GetInstance()); + ASSERT_OK_AND_ASSIGN(auto options, MakeOptions(env)); + // constexpr auto container_name = "sas-container"; + auto data = SetUpPreexistingData(); + // const std::string path = data.ContainerPath("test-write-object"); + + ASSERT_OK_AND_ASSIGN(auto sas_token, GetContainerSasToken(data.container_name)); + ARROW_EXPECT_OK(options.ConfigureSasCredential(sas_token)); + EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); + + AssertFileInfo(fs.get(), data.ObjectPath(), FileType::File); + // ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenOutputStream(path)); + // ASSERT_OK(stream->Write(payload)); + } + private: using StringMatcher = ::testing::PolymorphicMatcher<::testing::internal::HasSubstrMatcher>; @@ -2336,6 +2355,10 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateContainerFromPath) { TYPED_TEST(TestAzureFileSystemOnAllScenarios, MovePath) { this->TestMovePath(); } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, SasCredential) { + this->TestSasCredential(); +} + // Tests using Azurite (the local Azure emulator) TEST_F(TestAzuriteFileSystem, CheckIfHierarchicalNamespaceIsEnabledRuntimeError) { From 6df79f70177f03c1befa61d47778f024e73b776a Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 12 Dec 2024 12:33:58 +0000 Subject: [PATCH 03/22] Fix creating getting Azure env in test --- cpp/src/arrow/filesystem/azurefs_test.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 6d8f7ef91d2..fc31fe7c62d 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -1629,19 +1629,20 @@ class TestAzureFileSystem : public ::testing::Test { } void TestSasCredential() { - ASSERT_OK_AND_ASSIGN(auto env, AzuriteEnv::GetInstance()); - ASSERT_OK_AND_ASSIGN(auto options, MakeOptions(env)); - // constexpr auto container_name = "sas-container"; auto data = SetUpPreexistingData(); - // const std::string path = data.ContainerPath("test-write-object"); + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + ASSERT_OK_AND_ASSIGN(auto options, MakeOptions(env)); ASSERT_OK_AND_ASSIGN(auto sas_token, GetContainerSasToken(data.container_name)); ARROW_EXPECT_OK(options.ConfigureSasCredential(sas_token)); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); AssertFileInfo(fs.get(), data.ObjectPath(), FileType::File); - // ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenOutputStream(path)); - // ASSERT_OK(stream->Write(payload)); + + // Test copying because it has the extra complexity that it requires generating + // a sas token internally. + ASSERT_OK(fs->CopyFile(data.ObjectPath(), data.ObjectPath() + "_copy")); + AssertFileInfo(fs.get(), data.ObjectPath() + "_copy", FileType::File); } private: From 567488b5bf06cd62f06c7c74c7c84c2e9e8b0e14 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 12 Dec 2024 12:42:48 +0000 Subject: [PATCH 04/22] Use credential_kind_ to detect credential type inside GenerateSASToken. This avoids cheating by using the account key again to generate SAS tokens in tests --- cpp/src/arrow/filesystem/azurefs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index f257161ae65..ea47b4dfcdf 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -430,7 +430,7 @@ Result AzureOptions::GenerateSASToken( using SasProtocol = Storage::Sas::SasProtocol; builder->Protocol = blob_storage_scheme == "http" ? SasProtocol::HttpsAndHttp : SasProtocol::HttpsOnly; - if (storage_shared_key_credential_) { + if (credential_kind_ == CredentialKind::kStorageSharedKey) { return builder->GenerateSasToken(*storage_shared_key_credential_); } else { // GH-39344: This part isn't tested. This may not work. From 68690678831a64cf74cd778ca3a1a40870ea193c Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 12 Dec 2024 15:41:03 +0000 Subject: [PATCH 05/22] Use original sas token inside `CopyFile` instead of generating a new one on the fly. --- cpp/src/arrow/filesystem/azurefs.cc | 11 ++++++++++- cpp/src/arrow/filesystem/azurefs.h | 2 ++ cpp/src/arrow/filesystem/azurefs_test.cc | 4 ++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index ea47b4dfcdf..4a27fb810d7 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -444,6 +444,13 @@ Result AzureOptions::GenerateSASToken( } } +std::optional AzureOptions::GetSASToken() const { + if (credential_kind_ == CredentialKind::kSasToken) { + return sas_token_; + } + return {}; +} + namespace { // An AzureFileSystem represents an Azure storage account. An AzureLocation describes a @@ -3179,7 +3186,9 @@ class AzureFileSystem::Impl { return Status::OK(); } std::string sas_token; - { + if (auto token = options_.GetSASToken()) { + sas_token = token.value(); + } else { Storage::Sas::BlobSasBuilder builder; std::chrono::seconds available_period(60); builder.ExpiresOn = std::chrono::system_clock::now() + available_period; diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 4f46e75f811..1776ae6573b 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -214,6 +214,8 @@ struct ARROW_EXPORT AzureOptions { Result GenerateSASToken( Azure::Storage::Sas::BlobSasBuilder* builder, Azure::Storage::Blobs::BlobServiceClient* client) const; + + std::optional GetSASToken() const; }; /// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index fc31fe7c62d..cb25ba7299a 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -1639,8 +1639,8 @@ class TestAzureFileSystem : public ::testing::Test { AssertFileInfo(fs.get(), data.ObjectPath(), FileType::File); - // Test copying because it has the extra complexity that it requires generating - // a sas token internally. + // Test copying because it follows a different code path to other authentications + // because it usually requires generating a SAS token at runtime. ASSERT_OK(fs->CopyFile(data.ObjectPath(), data.ObjectPath() + "_copy")); AssertFileInfo(fs.get(), data.ObjectPath() + "_copy", FileType::File); } From 0ffd832467a1734f3814e59659e2fb812b5c528f Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 12 Dec 2024 18:39:19 +0000 Subject: [PATCH 06/22] Fix appending SAS token twice - all tests pass --- cpp/src/arrow/filesystem/azurefs.cc | 13 ++++--------- cpp/src/arrow/filesystem/azurefs.h | 2 +- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 4a27fb810d7..ec37c27e1d0 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -444,13 +444,6 @@ Result AzureOptions::GenerateSASToken( } } -std::optional AzureOptions::GetSASToken() const { - if (credential_kind_ == CredentialKind::kSasToken) { - return sas_token_; - } - return {}; -} - namespace { // An AzureFileSystem represents an Azure storage account. An AzureLocation describes a @@ -3186,8 +3179,10 @@ class AzureFileSystem::Impl { return Status::OK(); } std::string sas_token; - if (auto token = options_.GetSASToken()) { - sas_token = token.value(); + if (options_.UsesSasCredential()) { + // When authenticated with SAS token GetUrl() automatically provides an + // authenticated URL with the SAS token appended. + sas_token = ""; } else { Storage::Sas::BlobSasBuilder builder; std::chrono::seconds available_period(60); diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 1776ae6573b..5e048435583 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -215,7 +215,7 @@ struct ARROW_EXPORT AzureOptions { Azure::Storage::Sas::BlobSasBuilder* builder, Azure::Storage::Blobs::BlobServiceClient* client) const; - std::optional GetSASToken() const; + bool UsesSasCredential() const { return credential_kind_ == CredentialKind::kSasToken; } }; /// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and From a2b826c791e5dc0fa7f3e46128ef04ef78dda1b0 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 12 Dec 2024 20:17:08 +0000 Subject: [PATCH 07/22] Support SAS token in AzureOptions::FromUri --- cpp/src/arrow/filesystem/azurefs.cc | 14 ++++++-- cpp/src/arrow/filesystem/azurefs_test.cc | 46 ++++++++++++++++++------ 2 files changed, 46 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index ec37c27e1d0..0fd49e311cf 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -148,8 +148,10 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { ARROW_ASSIGN_OR_RAISE(background_writes, ::arrow::internal::ParseBoolean(kv.second)); } else { - return Status::Invalid( - "Unexpected query parameter in Azure Blob File System URI: '", kv.first, "'"); + // Assume these are part of a SAS token. Its not ideal to make such an assumption + // but given that a SAS token is a complex set of URI parameters, that could be + // tricky to enumerate I think its the best option. + credential_kind = CredentialKind::kSasToken; } } @@ -180,6 +182,13 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { case CredentialKind::kEnvironment: RETURN_NOT_OK(ConfigureEnvironmentCredential()); break; + case CredentialKind::kSasToken: + // Reconstructing the SAS token without the other URI query parameters is awkward + // because some parts are URI escaped and some parts are not. Instead we just + // pass through the entire query string and Azure ignores the extra query + // parameters. + RETURN_NOT_OK(ConfigureSasCredential("?" + uri.query_string())); + break; default: // Default credential break; @@ -225,7 +234,6 @@ Result AzureOptions::FromUri(const std::string& uri_string, } bool AzureOptions::Equals(const AzureOptions& other) const { - // TODO(GH-38598): update here when more auth methods are added. const bool equals = blob_storage_authority == other.blob_storage_authority && dfs_storage_authority == other.dfs_storage_authority && blob_storage_scheme == other.blob_storage_scheme && diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index cb25ba7299a..4c74a5af054 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -690,6 +690,36 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kEnvironment); } + void TestFromUriCredentialSasToken() { + const std::string sas_token = + "?se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%2B1SqLxPK%" + "2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04"; + ASSERT_OK_AND_ASSIGN( + auto options, + AzureOptions::FromUri( + "abfs://file_system@account.dfs.core.windows.net/" + sas_token, nullptr)); + ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSasToken); + ASSERT_EQ(options.sas_token_, sas_token); + } + + void TestFromUriCredentialSasTokenWithOtherParameters() { + const std::string uri_query_string = + "?enable_tls=false&se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%" + "2B1SqLxPK%" + "2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04"; + ASSERT_OK_AND_ASSIGN( + auto options, + AzureOptions::FromUri( + "abfs://account@127.0.0.1:10000/container/dir/blob" + uri_query_string, + nullptr)); + ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSasToken); + ASSERT_EQ(options.sas_token_, uri_query_string); + ASSERT_EQ(options.blob_storage_authority, "127.0.0.1:10000"); + ASSERT_EQ(options.dfs_storage_authority, "127.0.0.1:10000"); + ASSERT_EQ(options.blob_storage_scheme, "http"); + ASSERT_EQ(options.dfs_storage_scheme, "http"); + } + void TestFromUriCredentialInvalid() { ASSERT_RAISES(Invalid, AzureOptions::FromUri( "abfs://file_system@account.dfs.core.windows.net/dir/file?" @@ -714,13 +744,6 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.dfs_storage_authority, ".dfs.local"); } - void TestFromUriInvalidQueryParameter() { - ASSERT_RAISES(Invalid, AzureOptions::FromUri( - "abfs://file_system@account.dfs.core.windows.net/dir/file?" - "unknown=invalid", - nullptr)); - } - void TestMakeBlobServiceClientInvalidAccountName() { AzureOptions options; ASSERT_RAISES_WITH_MESSAGE( @@ -777,14 +800,15 @@ TEST_F(TestAzureOptions, FromUriCredentialWorkloadIdentity) { TEST_F(TestAzureOptions, FromUriCredentialEnvironment) { TestFromUriCredentialEnvironment(); } +TEST_F(TestAzureOptions, FromUriCredentialSasToken) { TestFromUriCredentialSasToken(); } +TEST_F(TestAzureOptions, FromUriCredentialSasTokenWithOtherParameters) { + TestFromUriCredentialSasTokenWithOtherParameters(); +} TEST_F(TestAzureOptions, FromUriCredentialInvalid) { TestFromUriCredentialInvalid(); } TEST_F(TestAzureOptions, FromUriBlobStorageAuthority) { TestFromUriBlobStorageAuthority(); } TEST_F(TestAzureOptions, FromUriDfsStorageAuthority) { TestFromUriDfsStorageAuthority(); } -TEST_F(TestAzureOptions, FromUriInvalidQueryParameter) { - TestFromUriInvalidQueryParameter(); -} TEST_F(TestAzureOptions, MakeBlobServiceClientInvalidAccountName) { TestMakeBlobServiceClientInvalidAccountName(); } @@ -1639,7 +1663,7 @@ class TestAzureFileSystem : public ::testing::Test { AssertFileInfo(fs.get(), data.ObjectPath(), FileType::File); - // Test copying because it follows a different code path to other authentications + // Test copying because it follows a different code path to other authentications // because it usually requires generating a SAS token at runtime. ASSERT_OK(fs->CopyFile(data.ObjectPath(), data.ObjectPath() + "_copy")); AssertFileInfo(fs.get(), data.ObjectPath() + "_copy", FileType::File); From 9c6ec0e073c797b7d6a121dde2e1505533c0a093 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 12 Dec 2024 20:22:46 +0000 Subject: [PATCH 08/22] Update SasCredential test to cover extra query parmaters --- cpp/src/arrow/filesystem/azurefs_test.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 4c74a5af054..6f747496385 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -1658,7 +1658,11 @@ class TestAzureFileSystem : public ::testing::Test { ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); ASSERT_OK_AND_ASSIGN(auto options, MakeOptions(env)); ASSERT_OK_AND_ASSIGN(auto sas_token, GetContainerSasToken(data.container_name)); - ARROW_EXPECT_OK(options.ConfigureSasCredential(sas_token)); + // AzureOptions::FromUri will not cut off extra query parameters that it consumes, so + // make sure these don't cause problems. + ARROW_EXPECT_OK(options.ConfigureSasCredential( + "?blob_storage_authority=dummy_value0&" + sas_token.substr(1) + + "&credential_kind=dummy-value1")); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); AssertFileInfo(fs.get(), data.ObjectPath(), FileType::File); From f4177bebb50f8e09ef4d68262e7d335c6ab237f9 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 13 Dec 2024 16:32:29 +0000 Subject: [PATCH 09/22] Adjust comment and increase SAS expiry time --- cpp/src/arrow/filesystem/azurefs.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 0fd49e311cf..b3d7304fee2 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -441,7 +441,7 @@ Result AzureOptions::GenerateSASToken( if (credential_kind_ == CredentialKind::kStorageSharedKey) { return builder->GenerateSasToken(*storage_shared_key_credential_); } else { - // GH-39344: This part isn't tested. This may not work. + // GH-39344: This part isn't tested. try { auto delegation_key_response = client->GetUserDelegationKey(builder->ExpiresOn); return builder->GenerateSasToken(delegation_key_response.Value, account_name); @@ -3193,7 +3193,7 @@ class AzureFileSystem::Impl { sas_token = ""; } else { Storage::Sas::BlobSasBuilder builder; - std::chrono::seconds available_period(60); + std::chrono::seconds available_period(600); builder.ExpiresOn = std::chrono::system_clock::now() + available_period; builder.BlobContainerName = src.container; builder.BlobName = src.path; From d19e87f68362176d30bec03e7fb0e1e5ee3d600c Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 13 Dec 2024 17:01:42 +0000 Subject: [PATCH 10/22] Update comments --- cpp/src/arrow/filesystem/azurefs.cc | 2 +- cpp/src/arrow/filesystem/azurefs.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index b3d7304fee2..549cabb6111 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -150,7 +150,7 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { } else { // Assume these are part of a SAS token. Its not ideal to make such an assumption // but given that a SAS token is a complex set of URI parameters, that could be - // tricky to enumerate I think its the best option. + // tricky to exhaustively list I think its the best option. credential_kind = CredentialKind::kSasToken; } } diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 5e048435583..fd7bb91a02a 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -182,6 +182,9 @@ struct ARROW_EXPORT AzureOptions { /// AzureOptions::ConfigureClientSecretCredential() is called. /// * client_secret: You must specify "tenant_id" and "client_id" /// too. AzureOptions::ConfigureClientSecretCredential() is called. + /// * A SAS token is made up of several query parameters. Appending a SAS + /// token to the URI configures SAS token auth by calling + /// AzureOptions::ConfigureSasCredential(). /// /// [1]: /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri From 15fe2c6e76a9ee38365ea6975a9b163138695132 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 13 Dec 2024 17:05:52 +0000 Subject: [PATCH 11/22] Autoformat --- cpp/src/arrow/filesystem/azurefs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index fd7bb91a02a..9e6dab6d14b 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -182,9 +182,9 @@ struct ARROW_EXPORT AzureOptions { /// AzureOptions::ConfigureClientSecretCredential() is called. /// * client_secret: You must specify "tenant_id" and "client_id" /// too. AzureOptions::ConfigureClientSecretCredential() is called. - /// * A SAS token is made up of several query parameters. Appending a SAS - /// token to the URI configures SAS token auth by calling - /// AzureOptions::ConfigureSasCredential(). + /// * A SAS token is made up of several query parameters. Appending a SAS + /// token to the URI configures SAS token auth by calling + /// AzureOptions::ConfigureSasCredential(). /// /// [1]: /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri From b57ebeae8b9902a8238e16cf6cf20affa37cfc2d Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 14 Dec 2024 12:04:29 +0000 Subject: [PATCH 12/22] PR comments --- cpp/src/arrow/filesystem/azurefs.cc | 74 ++++++++++++++---------- cpp/src/arrow/filesystem/azurefs.h | 4 +- cpp/src/arrow/filesystem/azurefs_test.cc | 4 +- 3 files changed, 45 insertions(+), 37 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 549cabb6111..eba92e3c905 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -151,7 +151,7 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { // Assume these are part of a SAS token. Its not ideal to make such an assumption // but given that a SAS token is a complex set of URI parameters, that could be // tricky to exhaustively list I think its the best option. - credential_kind = CredentialKind::kSasToken; + credential_kind = CredentialKind::kSASToken; } } @@ -182,7 +182,7 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { case CredentialKind::kEnvironment: RETURN_NOT_OK(ConfigureEnvironmentCredential()); break; - case CredentialKind::kSasToken: + case CredentialKind::kSASToken: // Reconstructing the SAS token without the other URI query parameters is awkward // because some parts are URI escaped and some parts are not. Instead we just // pass through the entire query string and Azure ignores the extra query @@ -251,7 +251,7 @@ bool AzureOptions::Equals(const AzureOptions& other) const { case CredentialKind::kStorageSharedKey: return storage_shared_key_credential_->AccountName == other.storage_shared_key_credential_->AccountName; - case CredentialKind::kSasToken: + case CredentialKind::kSASToken: return sas_token_ == other.sas_token_; case CredentialKind::kClientSecret: case CredentialKind::kCLI: @@ -322,7 +322,7 @@ Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_ke } Status AzureOptions::ConfigureSasCredential(const std::string& sas_token) { - credential_kind_ = CredentialKind::kSasToken; + credential_kind_ = CredentialKind::kSASToken; if (account_name.empty()) { return Status::Invalid("AzureOptions doesn't contain a valid account name"); } @@ -391,7 +391,7 @@ Result> AzureOptions::MakeBlobServiceC case CredentialKind::kStorageSharedKey: return std::make_unique(AccountBlobUrl(account_name), storage_shared_key_credential_); - case CredentialKind::kSasToken: + case CredentialKind::kSASToken: return std::make_unique(AccountBlobUrl(account_name) + sas_token_); } @@ -426,7 +426,7 @@ AzureOptions::MakeDataLakeServiceClient() const { case CredentialKind::kStorageSharedKey: return std::make_unique( AccountDfsUrl(account_name), storage_shared_key_credential_); - case CredentialKind::kSasToken: + case CredentialKind::kSASToken: return std::make_unique( AccountBlobUrl(account_name) + sas_token_); } @@ -438,18 +438,31 @@ Result AzureOptions::GenerateSASToken( using SasProtocol = Storage::Sas::SasProtocol; builder->Protocol = blob_storage_scheme == "http" ? SasProtocol::HttpsAndHttp : SasProtocol::HttpsOnly; - if (credential_kind_ == CredentialKind::kStorageSharedKey) { - return builder->GenerateSasToken(*storage_shared_key_credential_); - } else { - // GH-39344: This part isn't tested. - try { - auto delegation_key_response = client->GetUserDelegationKey(builder->ExpiresOn); - return builder->GenerateSasToken(delegation_key_response.Value, account_name); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, "GetUserDelegationKey failed for '", - client->GetUrl(), "'."); - } + switch (credential_kind_) { + case CredentialKind::kAnonymous: + // Anonymous is for public storage accounts so no SAS token needed. + case CredentialKind::kSASToken: + // When using SAS token auth BlobClient::GetUrl() will return a URL with the + // original SAS token, so we don't need to generate a new one. + return ""; + case CredentialKind::kStorageSharedKey: + return builder->GenerateSasToken(*storage_shared_key_credential_); + case CredentialKind::kClientSecret: + case CredentialKind::kManagedIdentity: + case CredentialKind::kCLI: + case CredentialKind::kWorkloadIdentity: + case CredentialKind::kEnvironment: + case CredentialKind::kDefault: + // GH-39344: This part isn't tested. + try { + auto delegation_key_response = client->GetUserDelegationKey(builder->ExpiresOn); + return builder->GenerateSasToken(delegation_key_response.Value, account_name); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception, "GetUserDelegationKey failed for '", + client->GetUrl(), "'."); + } } + return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } namespace { @@ -3187,21 +3200,18 @@ class AzureFileSystem::Impl { return Status::OK(); } std::string sas_token; - if (options_.UsesSasCredential()) { - // When authenticated with SAS token GetUrl() automatically provides an - // authenticated URL with the SAS token appended. - sas_token = ""; - } else { - Storage::Sas::BlobSasBuilder builder; - std::chrono::seconds available_period(600); - builder.ExpiresOn = std::chrono::system_clock::now() + available_period; - builder.BlobContainerName = src.container; - builder.BlobName = src.path; - builder.Resource = Storage::Sas::BlobSasResource::Blob; - builder.SetPermissions(Storage::Sas::BlobSasPermissions::Read); - ARROW_ASSIGN_OR_RAISE( - sas_token, options_.GenerateSASToken(&builder, blob_service_client_.get())); - } + Storage::Sas::BlobSasBuilder builder; + // Shorter lived SAS tokens are more secure but we need it to last comfortably long + // enough for the copy to complete, including possible retries. Copying a single 1GiB + // file in Azure can take over a minute. + std::chrono::seconds available_period(600); + builder.ExpiresOn = std::chrono::system_clock::now() + available_period; + builder.BlobContainerName = src.container; + builder.BlobName = src.path; + builder.Resource = Storage::Sas::BlobSasResource::Blob; + builder.SetPermissions(Storage::Sas::BlobSasPermissions::Read); + ARROW_ASSIGN_OR_RAISE( + sas_token, options_.GenerateSASToken(&builder, blob_service_client_.get())); auto src_url = GetBlobClient(src.container, src.path).GetUrl() + sas_token; auto dest_blob_client = GetBlobClient(dest.container, dest.path); if (!dest.path.empty()) { diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 9e6dab6d14b..581324af30e 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -120,7 +120,7 @@ struct ARROW_EXPORT AzureOptions { kDefault, kAnonymous, kStorageSharedKey, - kSasToken, + kSASToken, kClientSecret, kManagedIdentity, kCLI, @@ -217,8 +217,6 @@ struct ARROW_EXPORT AzureOptions { Result GenerateSASToken( Azure::Storage::Sas::BlobSasBuilder* builder, Azure::Storage::Blobs::BlobServiceClient* client) const; - - bool UsesSasCredential() const { return credential_kind_ == CredentialKind::kSasToken; } }; /// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 6f747496385..5bda6c3c9a6 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -698,7 +698,7 @@ class TestAzureOptions : public ::testing::Test { auto options, AzureOptions::FromUri( "abfs://file_system@account.dfs.core.windows.net/" + sas_token, nullptr)); - ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSasToken); + ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSASToken); ASSERT_EQ(options.sas_token_, sas_token); } @@ -712,7 +712,7 @@ class TestAzureOptions : public ::testing::Test { AzureOptions::FromUri( "abfs://account@127.0.0.1:10000/container/dir/blob" + uri_query_string, nullptr)); - ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSasToken); + ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSASToken); ASSERT_EQ(options.sas_token_, uri_query_string); ASSERT_EQ(options.blob_storage_authority, "127.0.0.1:10000"); ASSERT_EQ(options.dfs_storage_authority, "127.0.0.1:10000"); From e200b8eb3fca51f10ad22850627b55fbbc69be51 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 14 Dec 2024 13:39:29 +0000 Subject: [PATCH 13/22] Avoid generating SAS tokens on the fly. --- cpp/src/arrow/filesystem/azurefs.cc | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index eba92e3c905..027ceb223e7 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -3199,20 +3199,7 @@ class AzureFileSystem::Impl { if (src == dest) { return Status::OK(); } - std::string sas_token; - Storage::Sas::BlobSasBuilder builder; - // Shorter lived SAS tokens are more secure but we need it to last comfortably long - // enough for the copy to complete, including possible retries. Copying a single 1GiB - // file in Azure can take over a minute. - std::chrono::seconds available_period(600); - builder.ExpiresOn = std::chrono::system_clock::now() + available_period; - builder.BlobContainerName = src.container; - builder.BlobName = src.path; - builder.Resource = Storage::Sas::BlobSasResource::Blob; - builder.SetPermissions(Storage::Sas::BlobSasPermissions::Read); - ARROW_ASSIGN_OR_RAISE( - sas_token, options_.GenerateSASToken(&builder, blob_service_client_.get())); - auto src_url = GetBlobClient(src.container, src.path).GetUrl() + sas_token; + auto src_url = GetBlobClient(src.container, src.path).GetUrl(); auto dest_blob_client = GetBlobClient(dest.container, dest.path); if (!dest.path.empty()) { auto dest_parent = dest.parent(); @@ -3225,9 +3212,19 @@ class AzureFileSystem::Impl { } } try { - dest_blob_client.CopyFromUri(src_url); + // We use StartCopyFromUri instead of CopyFromUri because it supports blobs larger + // than 256 MiB and it doesn't require generating a SAS token to authenticate + // reading the source blob. + auto copy_operation = dest_blob_client.StartCopyFromUri(src_url); + copy_operation.PollUntilDone(std::chrono::milliseconds(1000)); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, "Failed to copy a blob. (", src_url, " -> ", + // StartCopyFromUri failed or a GetProperties call inside PollUntilDone failed. + return ExceptionToStatus( + exception, "Failed to start blob copy or poll status of ongoing copy. (", + src_url, " -> ", dest_blob_client.GetUrl(), ")"); + } catch (const Azure::Core::RequestFailedException& exception) { + // A GetProperties call inside PollUntilDone returned a failed CopyStatus. + return ExceptionToStatus(exception, "Failed to copy blob. (", src_url, " -> ", dest_blob_client.GetUrl(), ")"); } return Status::OK(); From b0fd1d384b75f7ac360d4687b9b3033e2f2e6e82 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 14 Dec 2024 13:40:47 +0000 Subject: [PATCH 14/22] Add an extra test copying to a different container --- cpp/src/arrow/filesystem/azurefs.cc | 2 +- cpp/src/arrow/filesystem/azurefs_test.cc | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 027ceb223e7..88f1ce15b40 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -3214,7 +3214,7 @@ class AzureFileSystem::Impl { try { // We use StartCopyFromUri instead of CopyFromUri because it supports blobs larger // than 256 MiB and it doesn't require generating a SAS token to authenticate - // reading the source blob. + // reading a source blob in the same storage account. auto copy_operation = dest_blob_client.StartCopyFromUri(src_url); copy_operation.PollUntilDone(std::chrono::milliseconds(1000)); } catch (const Storage::StorageException& exception) { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 5bda6c3c9a6..b573caff696 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -2694,6 +2694,17 @@ TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) { EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString()); } +TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationDifferentContainer) { + auto data = SetUpPreexistingData(); + auto data2 = SetUpPreexistingData(); + const auto destination_path = data2.ContainerPath("copy-destionation"); + ASSERT_OK(fs()->CopyFile(data.ObjectPath(), destination_path)); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(destination_path)); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(info)); + ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); + EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString()); +} + TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationSame) { auto data = SetUpPreexistingData(); ASSERT_OK(fs()->CopyFile(data.ObjectPath(), data.ObjectPath())); From 744e1118ea18755bffc8a7a6e9389272cb4ea574 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 14 Dec 2024 13:41:30 +0000 Subject: [PATCH 15/22] Update comment --- cpp/src/arrow/filesystem/azurefs_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index b573caff696..e61f3fba393 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -1667,8 +1667,8 @@ class TestAzureFileSystem : public ::testing::Test { AssertFileInfo(fs.get(), data.ObjectPath(), FileType::File); - // Test copying because it follows a different code path to other authentications - // because it usually requires generating a SAS token at runtime. + // Test copying because the most obvious implementation requires generating a SAS + // token at runtime which doesn't work when the original auth is SAS token. ASSERT_OK(fs->CopyFile(data.ObjectPath(), data.ObjectPath() + "_copy")); AssertFileInfo(fs.get(), data.ObjectPath() + "_copy", FileType::File); } From c8fd94d9571ddb956d35d956f05d8a752144a901 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 14 Dec 2024 14:04:56 +0000 Subject: [PATCH 16/22] Remove AzureOptions::GenerateSASToken --- cpp/src/arrow/filesystem/azurefs.cc | 32 ------------------------ cpp/src/arrow/filesystem/azurefs.h | 4 --- cpp/src/arrow/filesystem/azurefs_test.cc | 17 ++++++++++--- 3 files changed, 13 insertions(+), 40 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 88f1ce15b40..ed966cb47a0 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -433,38 +433,6 @@ AzureOptions::MakeDataLakeServiceClient() const { return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } -Result AzureOptions::GenerateSASToken( - Storage::Sas::BlobSasBuilder* builder, Blobs::BlobServiceClient* client) const { - using SasProtocol = Storage::Sas::SasProtocol; - builder->Protocol = - blob_storage_scheme == "http" ? SasProtocol::HttpsAndHttp : SasProtocol::HttpsOnly; - switch (credential_kind_) { - case CredentialKind::kAnonymous: - // Anonymous is for public storage accounts so no SAS token needed. - case CredentialKind::kSASToken: - // When using SAS token auth BlobClient::GetUrl() will return a URL with the - // original SAS token, so we don't need to generate a new one. - return ""; - case CredentialKind::kStorageSharedKey: - return builder->GenerateSasToken(*storage_shared_key_credential_); - case CredentialKind::kClientSecret: - case CredentialKind::kManagedIdentity: - case CredentialKind::kCLI: - case CredentialKind::kWorkloadIdentity: - case CredentialKind::kEnvironment: - case CredentialKind::kDefault: - // GH-39344: This part isn't tested. - try { - auto delegation_key_response = client->GetUserDelegationKey(builder->ExpiresOn); - return builder->GenerateSasToken(delegation_key_response.Value, account_name); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, "GetUserDelegationKey failed for '", - client->GetUrl(), "'."); - } - } - return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); -} - namespace { // An AzureFileSystem represents an Azure storage account. An AzureLocation describes a diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 581324af30e..6561bc53cdd 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -213,10 +213,6 @@ struct ARROW_EXPORT AzureOptions { Result> MakeDataLakeServiceClient() const; - - Result GenerateSASToken( - Azure::Storage::Sas::BlobSasBuilder* builder, - Azure::Storage::Blobs::BlobServiceClient* client) const; }; /// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index e61f3fba393..1404336d56c 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -936,7 +936,9 @@ class TestAzureFileSystem : public ::testing::Test { .Value; } - Result GetContainerSasToken(const std::string& container_name) { + Result GetContainerSasToken( + const std::string& container_name, + Azure::Storage::StorageSharedKeyCredential storage_shared_key_credential) { std::string sas_token; Azure::Storage::Sas::BlobSasBuilder builder; std::chrono::seconds available_period(60); @@ -944,7 +946,8 @@ class TestAzureFileSystem : public ::testing::Test { builder.BlobContainerName = container_name; builder.Resource = Azure::Storage::Sas::BlobSasResource::BlobContainer; builder.SetPermissions(Azure::Storage::Sas::BlobContainerSasPermissions::All); - return options_.GenerateSASToken(&builder, blob_service_client_.get()); + builder.Protocol = Azure::Storage::Sas::SasProtocol::HttpsAndHttp; + return builder.GenerateSasToken(storage_shared_key_credential); } void UploadLines(const std::vector& lines, const std::string& path, @@ -964,6 +967,8 @@ class TestAzureFileSystem : public ::testing::Test { } else { auto container_client = CreateContainer(data.container_name); CreateBlob(container_client, data.kObjectName, PreexistingData::kLoremIpsum); + // CreateBlob(container_client, data.kObjectName, data.RandomChars((1 << 30), + // rng_)); } return data; } @@ -1657,7 +1662,11 @@ class TestAzureFileSystem : public ::testing::Test { ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); ASSERT_OK_AND_ASSIGN(auto options, MakeOptions(env)); - ASSERT_OK_AND_ASSIGN(auto sas_token, GetContainerSasToken(data.container_name)); + ASSERT_OK_AND_ASSIGN( + auto sas_token, + GetContainerSasToken(data.container_name, + Azure::Storage::StorageSharedKeyCredential( + env->account_name(), env->account_key()))); // AzureOptions::FromUri will not cut off extra query parameters that it consumes, so // make sure these don't cause problems. ARROW_EXPECT_OK(options.ConfigureSasCredential( @@ -1667,7 +1676,7 @@ class TestAzureFileSystem : public ::testing::Test { AssertFileInfo(fs.get(), data.ObjectPath(), FileType::File); - // Test copying because the most obvious implementation requires generating a SAS + // Test CopyFile because the most obvious implementation requires generating a SAS // token at runtime which doesn't work when the original auth is SAS token. ASSERT_OK(fs->CopyFile(data.ObjectPath(), data.ObjectPath() + "_copy")); AssertFileInfo(fs.get(), data.ObjectPath() + "_copy", FileType::File); From 83a0b406fe41b30a236bce103154768f0df2f3c6 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 14 Dec 2024 14:19:43 +0000 Subject: [PATCH 17/22] Remove debug change --- cpp/src/arrow/filesystem/azurefs_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 1404336d56c..3bffdba8647 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -967,8 +967,6 @@ class TestAzureFileSystem : public ::testing::Test { } else { auto container_client = CreateContainer(data.container_name); CreateBlob(container_client, data.kObjectName, PreexistingData::kLoremIpsum); - // CreateBlob(container_client, data.kObjectName, data.RandomChars((1 << 30), - // rng_)); } return data; } From d60742bcf1d5fa86da06865e548c359862922d5b Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Dec 2024 09:42:51 +0000 Subject: [PATCH 18/22] Rename Sas -> SAS --- cpp/src/arrow/filesystem/azurefs.cc | 4 ++-- cpp/src/arrow/filesystem/azurefs.h | 8 ++------ cpp/src/arrow/filesystem/azurefs_test.cc | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index ed966cb47a0..675c0359f56 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -187,7 +187,7 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { // because some parts are URI escaped and some parts are not. Instead we just // pass through the entire query string and Azure ignores the extra query // parameters. - RETURN_NOT_OK(ConfigureSasCredential("?" + uri.query_string())); + RETURN_NOT_OK(ConfigureSASCredential("?" + uri.query_string())); break; default: // Default credential @@ -321,7 +321,7 @@ Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_ke return Status::OK(); } -Status AzureOptions::ConfigureSasCredential(const std::string& sas_token) { +Status AzureOptions::ConfigureSASCredential(const std::string& sas_token) { credential_kind_ = CredentialKind::kSASToken; if (account_name.empty()) { return Status::Invalid("AzureOptions doesn't contain a valid account name"); diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 6561bc53cdd..ee0956afdd7 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -37,10 +37,6 @@ namespace Azure::Storage::Blobs { class BlobServiceClient; } -namespace Azure::Storage::Sas { -struct BlobSasBuilder; -} - namespace Azure::Storage::Files::DataLake { class DataLakeFileSystemClient; class DataLakeServiceClient; @@ -184,7 +180,7 @@ struct ARROW_EXPORT AzureOptions { /// too. AzureOptions::ConfigureClientSecretCredential() is called. /// * A SAS token is made up of several query parameters. Appending a SAS /// token to the URI configures SAS token auth by calling - /// AzureOptions::ConfigureSasCredential(). + /// AzureOptions::ConfigureSASCredential(). /// /// [1]: /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri @@ -194,7 +190,7 @@ struct ARROW_EXPORT AzureOptions { Status ConfigureDefaultCredential(); Status ConfigureAnonymousCredential(); Status ConfigureAccountKeyCredential(const std::string& account_key); - Status ConfigureSasCredential(const std::string& sas_token); + Status ConfigureSASCredential(const std::string& sas_token); Status ConfigureClientSecretCredential(const std::string& tenant_id, const std::string& client_id, const std::string& client_secret); diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 3bffdba8647..7abbca7d1d8 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -1667,7 +1667,7 @@ class TestAzureFileSystem : public ::testing::Test { env->account_name(), env->account_key()))); // AzureOptions::FromUri will not cut off extra query parameters that it consumes, so // make sure these don't cause problems. - ARROW_EXPECT_OK(options.ConfigureSasCredential( + ARROW_EXPECT_OK(options.ConfigureSASCredential( "?blob_storage_authority=dummy_value0&" + sas_token.substr(1) + "&credential_kind=dummy-value1")); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); From 63f9773a9c16781662b6b397a1e3578ef26a0a09 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Dec 2024 09:37:24 +0000 Subject: [PATCH 19/22] Add a comment about the polling interval --- cpp/src/arrow/filesystem/azurefs.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 675c0359f56..297edd480b1 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -3184,6 +3184,8 @@ class AzureFileSystem::Impl { // than 256 MiB and it doesn't require generating a SAS token to authenticate // reading a source blob in the same storage account. auto copy_operation = dest_blob_client.StartCopyFromUri(src_url); + // For large blobs, the copy operation may be slow so we need to poll until it + // completes. We use a polling interval of 1 second. copy_operation.PollUntilDone(std::chrono::milliseconds(1000)); } catch (const Storage::StorageException& exception) { // StartCopyFromUri failed or a GetProperties call inside PollUntilDone failed. From d55510ab92684fb74f8f055cc03f0694c93c2eba Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Dec 2024 11:10:35 +0000 Subject: [PATCH 20/22] Only allow query parameters we know can be part of a sas token --- cpp/src/arrow/filesystem/azurefs.cc | 22 ++++++++++++++++++---- cpp/src/arrow/filesystem/azurefs_test.cc | 10 ++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 297edd480b1..cf238647a33 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -106,6 +106,18 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { std::string tenant_id; std::string client_id; std::string client_secret; + + // These query parameters are the union of the following docs: + // https://learn.microsoft.com/en-us/rest/api/storageservices/create-account-sas#specify-the-account-sas-parameters + // https://learn.microsoft.com/en-us/rest/api/storageservices/create-service-sas#construct-a-service-sas + // (excluding parameters for table storage only) + // https://learn.microsoft.com/en-us/rest/api/storageservices/create-user-delegation-sas#construct-a-user-delegation-sas + constexpr std::array sas_token_query_parameters = { + "sv", "ss", "sr", "st", "se", "sp", "si", "sip", "spr", + "skoid", "sktid", "srt", "skt", "ske", "skv", "sks", "saoid", "suoid", + "scid", "sdd", "ses", "sig", "rscc", "rscd", "rsce", "rscl", "rsct", + }; + ARROW_ASSIGN_OR_RAISE(const auto options_items, uri.query_items()); for (const auto& kv : options_items) { if (kv.first == "blob_storage_authority") { @@ -147,11 +159,13 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { } else if (kv.first == "background_writes") { ARROW_ASSIGN_OR_RAISE(background_writes, ::arrow::internal::ParseBoolean(kv.second)); - } else { - // Assume these are part of a SAS token. Its not ideal to make such an assumption - // but given that a SAS token is a complex set of URI parameters, that could be - // tricky to exhaustively list I think its the best option. + } else if (std::find(sas_token_query_parameters.begin(), + sas_token_query_parameters.end(), + kv.first) != sas_token_query_parameters.end()) { credential_kind = CredentialKind::kSASToken; + } else { + return Status::Invalid( + "Unexpected query parameter in Azure Blob File System URI: '", kv.first, "'"); } } diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 7abbca7d1d8..5d649e5084b 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -744,6 +744,13 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.dfs_storage_authority, ".dfs.local"); } + void TestFromUriInvalidQueryParameter() { + ASSERT_RAISES(Invalid, AzureOptions::FromUri( + "abfs://file_system@account.dfs.core.windows.net/dir/file?" + "unknown=invalid", + nullptr)); + } + void TestMakeBlobServiceClientInvalidAccountName() { AzureOptions options; ASSERT_RAISES_WITH_MESSAGE( @@ -809,6 +816,9 @@ TEST_F(TestAzureOptions, FromUriBlobStorageAuthority) { TestFromUriBlobStorageAuthority(); } TEST_F(TestAzureOptions, FromUriDfsStorageAuthority) { TestFromUriDfsStorageAuthority(); } +TEST_F(TestAzureOptions, FromUriInvalidQueryParameter) { + TestFromUriInvalidQueryParameter(); +} TEST_F(TestAzureOptions, MakeBlobServiceClientInvalidAccountName) { TestMakeBlobServiceClientInvalidAccountName(); } From 7576c4c3fe92323cd94b356335e8eb96637b08e4 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Dec 2024 11:15:47 +0000 Subject: [PATCH 21/22] More renaming Sas -> SAS --- cpp/src/arrow/filesystem/azurefs_test.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 5d649e5084b..8f25bf06e65 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -690,7 +690,7 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kEnvironment); } - void TestFromUriCredentialSasToken() { + void TestFromUriCredentialSASToken() { const std::string sas_token = "?se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%2B1SqLxPK%" "2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04"; @@ -702,7 +702,7 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.sas_token_, sas_token); } - void TestFromUriCredentialSasTokenWithOtherParameters() { + void TestFromUriCredentialSASTokenWithOtherParameters() { const std::string uri_query_string = "?enable_tls=false&se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%" "2B1SqLxPK%" @@ -807,9 +807,9 @@ TEST_F(TestAzureOptions, FromUriCredentialWorkloadIdentity) { TEST_F(TestAzureOptions, FromUriCredentialEnvironment) { TestFromUriCredentialEnvironment(); } -TEST_F(TestAzureOptions, FromUriCredentialSasToken) { TestFromUriCredentialSasToken(); } -TEST_F(TestAzureOptions, FromUriCredentialSasTokenWithOtherParameters) { - TestFromUriCredentialSasTokenWithOtherParameters(); +TEST_F(TestAzureOptions, FromUriCredentialSASToken) { TestFromUriCredentialSASToken(); } +TEST_F(TestAzureOptions, FromUriCredentialSASTokenWithOtherParameters) { + TestFromUriCredentialSASTokenWithOtherParameters(); } TEST_F(TestAzureOptions, FromUriCredentialInvalid) { TestFromUriCredentialInvalid(); } TEST_F(TestAzureOptions, FromUriBlobStorageAuthority) { @@ -946,7 +946,7 @@ class TestAzureFileSystem : public ::testing::Test { .Value; } - Result GetContainerSasToken( + Result GetContainerSASToken( const std::string& container_name, Azure::Storage::StorageSharedKeyCredential storage_shared_key_credential) { std::string sas_token; @@ -1665,14 +1665,14 @@ class TestAzureFileSystem : public ::testing::Test { AssertObjectContents(fs.get(), path, payload); } - void TestSasCredential() { + void TestSASCredential() { auto data = SetUpPreexistingData(); ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); ASSERT_OK_AND_ASSIGN(auto options, MakeOptions(env)); ASSERT_OK_AND_ASSIGN( auto sas_token, - GetContainerSasToken(data.container_name, + GetContainerSASToken(data.container_name, Azure::Storage::StorageSharedKeyCredential( env->account_name(), env->account_key()))); // AzureOptions::FromUri will not cut off extra query parameters that it consumes, so @@ -2401,8 +2401,8 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateContainerFromPath) { TYPED_TEST(TestAzureFileSystemOnAllScenarios, MovePath) { this->TestMovePath(); } -TYPED_TEST(TestAzureFileSystemOnAllScenarios, SasCredential) { - this->TestSasCredential(); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, SASCredential) { + this->TestSASCredential(); } // Tests using Azurite (the local Azure emulator) From eece05e8856fcd02e8103cd2ce5c079373d11cc2 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 17 Dec 2024 11:20:59 +0000 Subject: [PATCH 22/22] Give up on `contexpr`. Make it a `static const std::set` --- cpp/src/arrow/filesystem/azurefs.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index cf238647a33..4638bb12c78 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -112,7 +112,7 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { // https://learn.microsoft.com/en-us/rest/api/storageservices/create-service-sas#construct-a-service-sas // (excluding parameters for table storage only) // https://learn.microsoft.com/en-us/rest/api/storageservices/create-user-delegation-sas#construct-a-user-delegation-sas - constexpr std::array sas_token_query_parameters = { + static const std::set sas_token_query_parameters = { "sv", "ss", "sr", "st", "se", "sp", "si", "sip", "spr", "skoid", "sktid", "srt", "skt", "ske", "skv", "sks", "saoid", "suoid", "scid", "sdd", "ses", "sig", "rscc", "rscd", "rsce", "rscl", "rsct", @@ -159,9 +159,8 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { } else if (kv.first == "background_writes") { ARROW_ASSIGN_OR_RAISE(background_writes, ::arrow::internal::ParseBoolean(kv.second)); - } else if (std::find(sas_token_query_parameters.begin(), - sas_token_query_parameters.end(), - kv.first) != sas_token_query_parameters.end()) { + } else if (sas_token_query_parameters.find(kv.first) != + sas_token_query_parameters.end()) { credential_kind = CredentialKind::kSASToken; } else { return Status::Invalid(