diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index 125f1f48d48..08dda6cf50a 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -99,6 +99,7 @@ RUN apt-get update -y -q && \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ + libxml2-dev \ libzstd-dev \ make \ ninja-build \ @@ -172,6 +173,7 @@ ENV absl_SOURCE=BUNDLED \ ARROW_WITH_ZSTD=ON \ ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-${llvm}/bin/llvm-symbolizer \ AWSSDK_SOURCE=BUNDLED \ + Azure_SOURCE=BUNDLED \ google_cloud_cpp_storage_SOURCE=BUNDLED \ gRPC_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index 0840b3fa5c6..dedeedd979f 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -98,6 +98,7 @@ RUN apt-get update -y -q && \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ + libxml2-dev \ libzstd-dev \ make \ ninja-build \ @@ -196,6 +197,7 @@ ENV absl_SOURCE=BUNDLED \ ARROW_WITH_ZSTD=ON \ ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-${llvm}/bin/llvm-symbolizer \ AWSSDK_SOURCE=BUNDLED \ + Azure_SOURCE=BUNDLED \ google_cloud_cpp_storage_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 5a89fafc601..d73f4ad2308 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -152,6 +152,7 @@ cmake \ -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-OFF} \ -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-OFF} \ -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \ + -DAzure_SOURCE=${Azure_SOURCE:-} \ -Dbenchmark_SOURCE=${benchmark_SOURCE:-} \ -DBOOST_SOURCE=${BOOST_SOURCE:-} \ -DBrotli_SOURCE=${Brotli_SOURCE:-} \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 27d7097a290..6ab5800fd88 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -795,6 +795,11 @@ if(ARROW_WITH_OPENTELEMETRY) list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS CURL::libcurl) endif() +if(ARROW_WITH_AZURE_SDK) + list(APPEND ARROW_SHARED_LINK_LIBS ${AZURE_SDK_LINK_LIBRARIES}) + list(APPEND ARROW_STATIC_LINK_LIBS ${AZURE_SDK_LINK_LIBRARIES}) +endif() + if(ARROW_WITH_UTF8PROC) list(APPEND ARROW_SHARED_LINK_LIBS utf8proc::utf8proc) list(APPEND ARROW_STATIC_LINK_LIBS utf8proc::utf8proc) diff --git a/cpp/cmake_modules/FindAzure.cmake b/cpp/cmake_modules/FindAzure.cmake new file mode 100644 index 00000000000..fdf354b724e --- /dev/null +++ b/cpp/cmake_modules/FindAzure.cmake @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if(Azure_FOUND) + return() +endif() + +set(find_package_args) +list(APPEND find_package_args CONFIG) +if(Azure_FIND_QUIETLY) + list(APPEND find_package_args QUIET) +endif() + +if(Azure_FIND_REQUIRED) + list(APPEND find_package_args REQUIRED) +endif() + +find_package(azure-core-cpp ${find_package_args}) +find_package(azure-identity-cpp ${find_package_args}) +find_package(azure-storage-blobs-cpp ${find_package_args}) +find_package(azure-storage-common-cpp ${find_package_args}) +find_package(azure-storage-files-datalake-cpp ${find_package_args}) + +find_package_handle_standard_args( + Azure + REQUIRED_VARS azure-core-cpp_FOUND + azure-identity-cpp_FOUND + azure-storage-blobs-cpp_FOUND + azure-storage-common-cpp_FOUND + azure-storage-files-datalake-cpp_FOUND + VERSION_VAR azure-core-cpp_VERSION) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 1767c05b5ee..1abc5504130 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -46,6 +46,7 @@ set(ARROW_RE2_LINKAGE set(ARROW_THIRDPARTY_DEPENDENCIES absl AWSSDK + Azure benchmark Boost Brotli @@ -159,6 +160,8 @@ macro(build_dependency DEPENDENCY_NAME) build_absl() elseif("${DEPENDENCY_NAME}" STREQUAL "AWSSDK") build_awssdk() + elseif("${DEPENDENCY_NAME}" STREQUAL "Azure") + build_azure_sdk() elseif("${DEPENDENCY_NAME}" STREQUAL "benchmark") build_benchmark() elseif("${DEPENDENCY_NAME}" STREQUAL "Boost") @@ -368,6 +371,10 @@ if(ARROW_GCS) set(ARROW_WITH_ZLIB ON) endif() +if(ARROW_AZURE) + set(ARROW_WITH_AZURE_SDK ON) +endif() + if(ARROW_JSON) set(ARROW_WITH_RAPIDJSON ON) endif() @@ -548,6 +555,14 @@ else() "${THIRDPARTY_MIRROR_URL}/aws-sdk-cpp-${ARROW_AWSSDK_BUILD_VERSION}.tar.gz") endif() +if(DEFINED ENV{ARROW_AZURE_SDK_URL}) + set(ARROW_AZURE_SDK_URL "$ENV{ARROW_AZURE_SDK_URL}") +else() + set_urls(ARROW_AZURE_SDK_URL + "https://github.com/Azure/azure-sdk-for-cpp/archive/${ARROW_AZURE_SDK_BUILD_VERSION}.tar.gz" + ) +endif() + if(DEFINED ENV{ARROW_BOOST_URL}) set(BOOST_SOURCE_URL "$ENV{ARROW_BOOST_URL}") else() @@ -960,6 +975,8 @@ else() set(MAKE_BUILD_ARGS "-j${NPROC}") endif() +include(FetchContent) + # ---------------------------------------------------------------------- # Find pthreads @@ -1367,6 +1384,7 @@ endif() set(ARROW_OPENSSL_REQUIRED_VERSION "1.0.2") set(ARROW_USE_OPENSSL OFF) if(PARQUET_REQUIRE_ENCRYPTION + OR ARROW_AZURE OR ARROW_FLIGHT OR ARROW_GANDIVA OR ARROW_GCS @@ -5049,6 +5067,56 @@ if(ARROW_S3) endif() endif() +# ---------------------------------------------------------------------- +# Azure SDK for C++ + +function(build_azure_sdk) + message(STATUS "Building Azure SDK for C++ from source") + fetchcontent_declare(azure_sdk + URL ${ARROW_AZURE_SDK_URL} + URL_HASH "SHA256=${ARROW_AZURE_SDK_BUILD_SHA256_CHECKSUM}") + set(BUILD_PERFORMANCE_TESTS FALSE) + set(BUILD_SAMPLES FALSE) + set(BUILD_TESTING FALSE) + set(BUILD_WINDOWS_UWP TRUE) + set(CMAKE_EXPORT_NO_PACKAGE_REGISTRY TRUE) + set(DISABLE_AZURE_CORE_OPENTELEMETRY TRUE) + set(ENV{AZURE_SDK_DISABLE_AUTO_VCPKG} TRUE) + set(WARNINGS_AS_ERRORS FALSE) + # TODO: Configure flags in a better way. FetchContent builds inherit + # global flags but we want to disable -Werror for Azure SDK for C++ builds. + if(MSVC) + string(REPLACE "/WX" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") + string(REPLACE "/WX" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") + else() + string(REPLACE "-Werror" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") + string(REPLACE "-Werror" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") + endif() + fetchcontent_makeavailable(azure_sdk) + set(AZURE_SDK_VENDORED + TRUE + PARENT_SCOPE) + list(APPEND + ARROW_BUNDLED_STATIC_LIBS + Azure::azure-core + Azure::azure-identity + Azure::azure-storage-blobs + Azure::azure-storage-common + Azure::azure-storage-files-datalake) + set(ARROW_BUNDLED_STATIC_LIBS + ${ARROW_BUNDLED_STATIC_LIBS} + PARENT_SCOPE) +endfunction() + +if(ARROW_WITH_AZURE_SDK) + resolve_dependency(Azure REQUIRED_VERSION 1.10.2) + set(AZURE_SDK_LINK_LIBRARIES + Azure::azure-storage-files-datalake + Azure::azure-storage-common + Azure::azure-storage-blobs + Azure::azure-identity + Azure::azure-core) +endif() # ---------------------------------------------------------------------- # ucx - communication framework for modern, high-bandwidth and low-latency networks diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 0158c0cec74..fcbae332d23 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -17,6 +17,9 @@ #include "arrow/filesystem/azurefs.h" +#include +#include + #include "arrow/result.h" #include "arrow/util/checked_cast.h" @@ -47,6 +50,12 @@ class AzureFileSystem::Impl { : io_context_(io_context), options_(std::move(options)) {} Status Init() { + // TODO: GH-18014 Delete this once we have a proper implementation. This just + // initializes a pointless Azure blob service client with a fake endpoint to ensure + // the build will fail if the Azure SDK build is broken. + auto default_credential = std::make_shared(); + auto service_client = Azure::Storage::Blobs::BlobServiceClient( + "http://fake-blob-storage-endpoint", default_credential); if (options_.backend == AzureBackend::Azurite) { // gen1Client_->GetAccountInfo().Value.IsHierarchicalNamespaceEnabled // throws error in azurite diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index e940c5bd1bc..9bf7cb8e753 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -45,6 +45,12 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" +#include +#include +#include +#include +#include + namespace arrow { using internal::TemporaryDir; namespace fs { @@ -105,15 +111,42 @@ AzuriteEnv* GetAzuriteEnv() { return ::arrow::internal::checked_cast(azurite_env); } -// Placeholder tests for file structure +// Placeholder tests // TODO: GH-18014 Remove once a proper test is added -TEST(AzureFileSystem, InitialiseAzurite) { +TEST(AzureFileSystem, UploadThenDownload) { + const std::string container_name = "sample-container"; + const std::string blob_name = "sample-blob.txt"; + const std::string blob_content = "Hello Azure!"; + const std::string& account_name = GetAzuriteEnv()->account_name(); const std::string& account_key = GetAzuriteEnv()->account_key(); - EXPECT_EQ(account_name, "devstoreaccount1"); - EXPECT_EQ(account_key, - "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/" - "K1SZFPTOtr/KBHBeksoGMGw=="); + + auto credential = std::make_shared( + account_name, account_key); + + auto service_client = Azure::Storage::Blobs::BlobServiceClient( + std::string("http://127.0.0.1:10000/") + account_name, credential); + auto container_client = service_client.GetBlobContainerClient(container_name); + container_client.CreateIfNotExists(); + auto blob_client = container_client.GetBlockBlobClient(blob_name); + + std::vector buffer(blob_content.begin(), blob_content.end()); + blob_client.UploadFrom(buffer.data(), buffer.size()); + + std::vector downloaded_content(blob_content.size()); + blob_client.DownloadTo(downloaded_content.data(), downloaded_content.size()); + + EXPECT_EQ(std::string(downloaded_content.begin(), downloaded_content.end()), + blob_content); +} + +TEST(AzureFileSystem, InitializeCredentials) { + auto default_credential = std::make_shared(); + auto managed_identity_credential = + std::make_shared(); + auto service_principal_credential = + std::make_shared("tenant_id", "client_id", + "client_secret"); } TEST(AzureFileSystem, OptionsCompare) { diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 8edaa422b3d..52d302592b5 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -53,6 +53,9 @@ ARROW_AWS_LC_BUILD_VERSION=v1.3.0 ARROW_AWS_LC_BUILD_SHA256_CHECKSUM=ae96a3567161552744fc0cae8b4d68ed88b1ec0f3d3c98700070115356da5a37 ARROW_AWSSDK_BUILD_VERSION=1.10.55 ARROW_AWSSDK_BUILD_SHA256_CHECKSUM=2d552fb1a84bef4a9b65e34aa7031851ed2aef5319e02cc6e4cb735c48aa30de +# Despite the confusing version name this is still the whole Azure SDK for C++ including core, keyvault, storage-common, etc. +ARROW_AZURE_SDK_BUILD_VERSION=azure-core_1.10.2 +ARROW_AZURE_SDK_BUILD_SHA256_CHECKSUM=36557dae87de4cdd257d9b441d9a7f043290eae6666fb1065e0fa486ae3e58a0 ARROW_BOOST_BUILD_VERSION=1.81.0 ARROW_BOOST_BUILD_SHA256_CHECKSUM=9e0ffae35528c35f90468997bc8d99500bf179cbae355415a89a600c38e13574 ARROW_BROTLI_BUILD_VERSION=v1.0.9