From c520a1d407a53d3f27c6ce9510057acf9cdfb965 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 7 Feb 2024 11:03:16 +0000 Subject: [PATCH 01/36] Check in patch file of python bindings from my custom build --- azure.patch | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 azure.patch diff --git a/azure.patch b/azure.patch new file mode 100644 index 00000000000..ce095bb3737 --- /dev/null +++ b/azure.patch @@ -0,0 +1,220 @@ +diff --git a/cpp/src/arrow/filesystem/api.h b/cpp/src/arrow/filesystem/api.h +index 732be5f92..adadf6a51 100644 +--- a/cpp/src/arrow/filesystem/api.h ++++ b/cpp/src/arrow/filesystem/api.h +@@ -20,6 +20,9 @@ + #include "arrow/util/config.h" // IWYU pragma: export + + #include "arrow/filesystem/filesystem.h" // IWYU pragma: export ++#ifdef ARROW_AZURE ++#include "arrow/filesystem/azurefs.h" // IWYU pragma: export ++#endif + #include "arrow/filesystem/hdfs.h" // IWYU pragma: export + #ifdef ARROW_GCS + #include "arrow/filesystem/gcsfs.h" // IWYU pragma: export +diff --git a/cpp/src/arrow/filesystem/type_fwd.h b/cpp/src/arrow/filesystem/type_fwd.h +index 892f7ad2e..c56baf79f 100644 +--- a/cpp/src/arrow/filesystem/type_fwd.h ++++ b/cpp/src/arrow/filesystem/type_fwd.h +@@ -47,6 +47,7 @@ class SlowFileSystem; + class LocalFileSystem; + class S3FileSystem; + class GcsFileSystem; ++class AzureBlobFileSystem; + + } // namespace fs + } // namespace arrow +diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt +index bad5e926a..08840740d 100644 +--- a/python/CMakeLists.txt ++++ b/python/CMakeLists.txt +@@ -424,6 +424,10 @@ if(PYARROW_BUILD_S3) + set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _s3fs) + endif() + ++if(PYARROW_BUILD_AZURE) ++ set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _azurefs) ++endif() ++ + if(PYARROW_BUILD_HDFS) + set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _hdfs) + endif() +diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py +index bcb6b30e7..a02739e00 100644 +--- a/python/pyarrow/__init__.py ++++ b/python/pyarrow/__init__.py +@@ -145,7 +145,7 @@ def show_info(): + print(f" {module: <20}: {status: <8}") + + print("\nFilesystems:") +- filesystems = ["GcsFileSystem", "HadoopFileSystem", "S3FileSystem"] ++ filesystems = ["GcsFileSystem", "HadoopFileSystem", "S3FileSystem", "AzureBlobFileSystem"] + for fs in filesystems: + status = "Enabled" if _filesystem_is_available(fs) else "-" + print(f" {fs: <20}: {status: <8}") +diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx +new file mode 100644 +index 000000000..0e0f5ff2b +--- /dev/null ++++ b/python/pyarrow/_azurefs.pyx +@@ -0,0 +1,86 @@ ++# Licensed to the Apache Software Foundation (ASF) under one ++# or more contributor license agreements. See the NOTICE file ++# distributed with this work for additional information ++# regarding copyright ownership. The ASF licenses this file ++# to you under the Apache License, Version 2.0 (the ++# "License"); you may not use this file except in compliance ++# with the License. You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, ++# software distributed under the License is distributed on an ++# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY ++# KIND, either express or implied. See the License for the ++# specific language governing permissions and limitations ++# under the License. 
++ ++# cython: language_level = 3 ++ ++from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata, ++ pyarrow_unwrap_metadata) ++from pyarrow.lib import frombytes, tobytes, KeyValueMetadata, ensure_metadata ++from pyarrow.includes.common cimport * ++from pyarrow.includes.libarrow cimport * ++from pyarrow.includes.libarrow_fs cimport * ++from pyarrow._fs cimport FileSystem, TimePoint_to_ns, PyDateTime_to_TimePoint ++from cython.operator cimport dereference as deref ++ ++from datetime import datetime, timedelta, timezone ++ ++ ++cdef class AzureBlobFileSystem(FileSystem): ++ cdef: ++ CAzureBlobFileSystem* azurefs ++ ++ def __init__(self, *, uri=None): ++ cdef: ++ CAzureOptions options ++ shared_ptr[CAzureBlobFileSystem] wrapped ++ ++ options = GetResultValue(CAzureOptions.FromUriString(tobytes(uri))) ++ ++ with nogil: ++ wrapped = GetResultValue(CAzureBlobFileSystem.Make(options)) ++ ++ self.init( wrapped) ++ ++ cdef init(self, const shared_ptr[CFileSystem]& wrapped): ++ FileSystem.init(self, wrapped) ++ self.azurefs = wrapped.get() ++ ++ @classmethod ++ def _reconstruct(cls, kwargs): ++ return cls(**kwargs) ++ ++ def __reduce__(self): ++ cdef CAzureOptions opts = self.azurefs.options() ++ return ( ++ AzureBlobFileSystem._reconstruct, (dict( ++ uri=frombytes(opts.account_blob_url), ++ ),)) +diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx +index 557c08149..6e840dabe 100644 +--- a/python/pyarrow/_fs.pyx ++++ b/python/pyarrow/_fs.pyx +@@ -493,6 +493,9 @@ cdef class FileSystem(_Weakrefable): + elif typ == 'hdfs': + from pyarrow._hdfs import HadoopFileSystem + self = HadoopFileSystem.__new__(HadoopFileSystem) ++ elif typ == 'abfs': ++ from pyarrow._azurefs import AzureBlobFileSystem ++ self = AzureBlobFileSystem.__new__(AzureBlobFileSystem) + elif typ.startswith('py::'): + self = PyFileSystem.__new__(PyFileSystem) + else: +diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py +index c6f44ccbb..c198e61b8 100644 +--- a/python/pyarrow/fs.py ++++ b/python/pyarrow/fs.py +@@ -40,6 +40,11 @@ FileStats = FileInfo + + _not_imported = [] + ++try: ++ from pyarrow._azurefs import AzureBlobFileSystem # noqa ++except ImportError: ++ _not_imported.append("AzureBlobFileSystem") ++ + try: + from pyarrow._hdfs import HadoopFileSystem # noqa + except ImportError: +diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd +index bf22ead83..a56a68816 100644 +--- a/python/pyarrow/includes/libarrow_fs.pxd ++++ b/python/pyarrow/includes/libarrow_fs.pxd +@@ -246,6 +246,18 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: + CResult[shared_ptr[CGcsFileSystem]] Make(const CGcsOptions& options) + CGcsOptions options() + ++ cdef cppclass CAzureOptions "arrow::fs::AzureOptions": ++ c_string account_blob_url ++ c_bool Equals(const CAzureOptions& other) ++ ++ @staticmethod ++ CResult[CAzureOptions] FromUriString "FromUri"(const c_string& uri_string) ++ ++ cdef cppclass CAzureBlobFileSystem "arrow::fs::AzureBlobFileSystem": ++ @staticmethod ++ CResult[shared_ptr[CAzureBlobFileSystem]] Make(const CAzureOptions& options) ++ CAzureOptions options() ++ + cdef cppclass CHdfsOptions "arrow::fs::HdfsOptions": + HdfsConnectionConfig connection_config + int32_t buffer_size +diff --git a/python/setup.py b/python/setup.py +index 2a7e5f7a3..d8b1d565c 100755 +--- a/python/setup.py ++++ b/python/setup.py +@@ -115,6 +115,7 @@ class build_ext(_build_ext): + ('with-parquet', None, 'build the Parquet extension'), + ('with-parquet-encryption', None, + 'build the Parquet 
encryption extension'), ++ ('with-azure', None, 'build the Azure Blob Storage extension'), + ('with-gcs', None, + 'build the Google Cloud Storage (GCS) extension'), + ('with-s3', None, 'build the Amazon S3 extension'), +@@ -163,6 +164,8 @@ class build_ext(_build_ext): + os.environ.get('PYARROW_WITH_GCS', '0')) + self.with_s3 = strtobool( + os.environ.get('PYARROW_WITH_S3', '0')) ++ self.with_azure = strtobool( ++ os.environ.get('PYARROW_WITH_AZURE', '0')) + self.with_hdfs = strtobool( + os.environ.get('PYARROW_WITH_HDFS', '0')) + self.with_cuda = strtobool( +@@ -224,6 +227,7 @@ class build_ext(_build_ext): + '_orc', + '_plasma', + '_gcsfs', ++ '_azurefs', + '_s3fs', + '_substrait', + '_hdfs', +@@ -369,6 +373,7 @@ class build_ext(_build_ext): + append_cmake_bool(self.with_plasma, 'PYARROW_BUILD_PLASMA') + append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS') + append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3') ++ append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE') + append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS') + append_cmake_bool(self.with_tensorflow, 'PYARROW_USE_TENSORFLOW') + append_cmake_bool(self.bundle_arrow_cpp, +@@ -557,6 +562,8 @@ class build_ext(_build_ext): + return True + if name == '_s3fs' and not self.with_s3: + return True ++ if name == '_azurefs' and not self.with_azure: ++ return True + if name == '_hdfs' and not self.with_hdfs: + return True + if name == '_dataset' and not self.with_dataset: From aa0a1b805954bab9feb5c3239f67ca1ac99e50db Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 7 Feb 2024 20:32:18 +0000 Subject: [PATCH 02/36] Complete bindings --- python/pyarrow/_azurefs.pyx | 62 +++++++++++++++++++++++++ python/pyarrow/includes/libarrow_fs.pxd | 15 ++++++ 2 files changed, 77 insertions(+) create mode 100644 python/pyarrow/_azurefs.pyx diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx new file mode 100644 index 00000000000..1b134919e30 --- /dev/null +++ b/python/pyarrow/_azurefs.pyx @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+# cython: language_level = 3
+
+from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata,
+                          pyarrow_unwrap_metadata)
+from pyarrow.lib import frombytes, tobytes, KeyValueMetadata, ensure_metadata
+from pyarrow.includes.common cimport *
+from pyarrow.includes.libarrow cimport *
+from pyarrow.includes.libarrow_fs cimport *
+from pyarrow._fs cimport FileSystem, TimePoint_to_ns, PyDateTime_to_TimePoint
+from cython.operator cimport dereference as deref
+
+from datetime import datetime, timedelta, timezone
+
+
+cdef class AzureFileSystem(FileSystem):
+    cdef:
+        CAzureFileSystem* azurefs
+
+    def __init__(self, *, account_name):
+        cdef:
+            CAzureOptions options
+            shared_ptr[CAzureFileSystem] wrapped
+
+        options.account_name
+        options.ConfigureDefaultCredential()
+
+        with nogil:
+            wrapped = GetResultValue(CAzureFileSystem.Make(options))
+
+        self.init(<shared_ptr[CFileSystem]>wrapped)
+
+    cdef init(self, const shared_ptr[CFileSystem]& wrapped):
+        FileSystem.init(self, wrapped)
+        self.azurefs = <CAzureFileSystem*> wrapped.get()
+
+    @classmethod
+    def _reconstruct(cls, kwargs):
+        return cls(**kwargs)
+
+    def __reduce__(self):
+        cdef CAzureOptions opts = self.azurefs.options()
+        return (
+            AzureFileSystem._reconstruct, (dict(
+                account_name=frombytes(opts.account_name),
+            ),))
diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd
index 7876fb0f966..0368dc516ed 100644
--- a/python/pyarrow/includes/libarrow_fs.pxd
+++ b/python/pyarrow/includes/libarrow_fs.pxd
@@ -251,6 +251,21 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil:
         CResult[shared_ptr[CGcsFileSystem]] Make(const CGcsOptions& options)
         CGcsOptions options()
 
+    cdef cppclass CAzureOptions "arrow::fs::AzureOptions":
+        c_string account_name
+        c_string blob_storage_authority
+        c_string dfs_storage_authority
+        c_string blob_storage_scheme
+        c_string dfs_storage_scheme
+
+        c_bool Equals(const CAzureOptions& other)
+        CStatus ConfigureDefaultCredential()
+
+    cdef cppclass CAzureFileSystem "arrow::fs::AzureFileSystem":
+        @staticmethod
+        CResult[shared_ptr[CAzureFileSystem]] Make(const CAzureOptions& options)
+        CAzureOptions options()
+
     cdef cppclass CHdfsOptions "arrow::fs::HdfsOptions":
         HdfsConnectionConfig connection_config
         int32_t buffer_size

From ffd15345b1f002bd7ffaaa2bd8797499e62765eb Mon Sep 17 00:00:00 2001
From: Thomas Newton
Date: Wed, 7 Feb 2024 20:40:05 +0000
Subject: [PATCH 03/36] Python side boilerplate

---
 python/pyarrow/__init__.py | 2 +-
 python/pyarrow/_fs.pyx     | 3 +++
 python/pyarrow/fs.py       | 4 ++++
 python/setup.py            | 6 ++++++
 4 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 7ede69da665..e72e3bace30 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -145,7 +145,7 @@ def print_entry(label, value):
         print(f"  {module: <20}: {status: <8}")
 
     print("\nFilesystems:")
-    filesystems = ["GcsFileSystem", "HadoopFileSystem", "S3FileSystem"]
+    filesystems = ["AzureFileSystem", "GcsFileSystem", "HadoopFileSystem", "S3FileSystem"]
     for fs in filesystems:
         status = "Enabled" if _filesystem_is_available(fs) else "-"
         print(f"  {fs: <20}: {status: <8}")
diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx
index 395f4881443..86cf39e993c 100644
--- a/python/pyarrow/_fs.pyx
+++ b/python/pyarrow/_fs.pyx
@@ -491,6 +491,9 @@ cdef class FileSystem(_Weakrefable):
         elif typ == 'gcs':
             from pyarrow._gcsfs import GcsFileSystem
             self = GcsFileSystem.__new__(GcsFileSystem)
+        elif typ == 'abfs':
+            from pyarrow._azurefs import AzureFileSystem
+            self
= AzureFileSystem.__new__(AzureFileSystem) elif typ == 'hdfs': from pyarrow._hdfs import HadoopFileSystem self = HadoopFileSystem.__new__(HadoopFileSystem) diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index a256cc540f7..abdd1a99575 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -39,6 +39,10 @@ FileStats = FileInfo _not_imported = [] +try: + from pyarrow._azurefs import AzureFileSystem # noqa +except ImportError: + _not_imported.append("AzureFileSystem") try: from pyarrow._hdfs import HadoopFileSystem # noqa diff --git a/python/setup.py b/python/setup.py index 798bd6b05fd..b2eac40e3b0 100755 --- a/python/setup.py +++ b/python/setup.py @@ -105,6 +105,8 @@ def run(self): 'build type (debug or release), default release'), ('boost-namespace=', None, 'namespace of boost (default: boost)'), + ('with-azure', None, + 'build the Azure Blob Storage extension'), ('with-cuda', None, 'build the Cuda extension'), ('with-flight', None, 'build the Flight extension'), ('with-substrait', None, 'build the Substrait extension'), @@ -150,6 +152,8 @@ def initialize_options(self): if not hasattr(sys, 'gettotalrefcount'): self.build_type = 'release' + self.with_azure = strtobool( + os.environ.get('PYARROW_WITH_AZURE', '0')) self.with_gcs = strtobool( os.environ.get('PYARROW_WITH_GCS', '0')) self.with_s3 = strtobool( @@ -348,6 +352,8 @@ def _failure_permitted(self, name): return True if name == '_s3fs' and not self.with_s3: return True + if name == '_azurefs' and not self.with_azure: + return True if name == '_hdfs' and not self.with_hdfs: return True if name == '_dataset' and not self.with_dataset: From 3cdbc2e3e0d1b50023deb3751c8eb35fe8c57c73 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 8 Feb 2024 00:20:31 +0000 Subject: [PATCH 04/36] Start tests --- python/pyarrow/conftest.py | 2 ++ python/pyarrow/tests/conftest.py | 26 +++++++++++++++++++++++ python/pyarrow/tests/test_fs.py | 36 ++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+) diff --git a/python/pyarrow/conftest.py b/python/pyarrow/conftest.py index 6f6807e907d..00f2c7d3ded 100644 --- a/python/pyarrow/conftest.py +++ b/python/pyarrow/conftest.py @@ -24,6 +24,7 @@ groups = [ 'acero', + 'azure', 'brotli', 'bz2', 'cython', @@ -54,6 +55,7 @@ defaults = { 'acero': False, + 'azure': False, 'brotli': Codec.is_available('brotli'), 'bz2': Codec.is_available('bz2'), 'cython': False, diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 0da757a4bc5..470a855d712 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -256,6 +256,32 @@ def gcs_server(): proc.wait() +@pytest.fixture(scope='session') +def azure_server(tmpdir_factory): + port = find_free_port() + env = os.environ.copy() + tmpdir = tmpdir_factory.getbasetemp() + args = ['azurite', '--quiet', "--location", tmpdir] + proc = None + try: + proc = subprocess.Popen(args, env=env) + # Make sure the server is alive. 
+ if proc.poll() is not None: + pytest.skip(f"Command {args} did not start server successfully!") + except (ModuleNotFoundError, OSError) as e: + pytest.skip(f"Command {args} failed to execute: {e}") + else: + yield { + 'connection': ('localhost', port), + 'process': proc, + 'tempdir': tmpdir, + } + finally: + if proc is not None: + proc.kill() + proc.wait() + + @pytest.fixture( params=[ 'builtin_pickle', diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 543c4399ddb..f7af34d069e 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -291,6 +291,28 @@ def subtree_s3fs(request, s3fs): }""" +@pytest.fixture +def azurefs(request, azure_server): + request.config.pyarrow.requires('azure') + from pyarrow.fs import AzureFileSystem + + host, port = azure_server['connection'] + container = 'pyarrow-filesystem/' + + fs = AzureFileSystem(account_name='devstoreaccount1') + try: + fs.create_dir(container) + except OSError as e: + pytest.skip(f"Could not create directory in {fs}: {e}") + + yield dict( + fs=fs, + pathfn=container.__add__, + allow_move_dir=False, # TODO(GH-38704): Switch this to True when AzureFileSystem adds support for it. + allow_append_to_file=True, + ) + fs.delete_dir(container) + @pytest.fixture def hdfs(request, hdfs_connection): request.config.pyarrow.requires('hdfs') @@ -383,6 +405,11 @@ def py_fsspec_s3fs(request, s3_server): id='GcsFileSystem', marks=pytest.mark.gcs ), + pytest.param( + 'azurefs', + id='AzureFileSystem', + marks=pytest.mark.azure + ), pytest.param( 'hdfs', id='HadoopFileSystem', @@ -413,6 +440,11 @@ def py_fsspec_s3fs(request, s3_server): id='PyFileSystem(FSSpecHandler(s3fs.S3FileSystem()))', marks=pytest.mark.s3 ), + pytest.param( + 'py_fsspec_azurefs', + id='PyFileSystem(FSSpecHandler(azurefs.AzureFileSystem()))', + marks=pytest.mark.azure, + ), ]) def filesystem_config(request): return request.getfixturevalue(request.param) @@ -1379,6 +1411,10 @@ def test_s3fs_wrong_region(): fs.get_file_info("voltrondata-labs-datasets") +def test_azurefs_options(): + # TODO(tomnewton) + pass + @pytest.mark.hdfs def test_hdfs_options(hdfs_connection, pickle_module): from pyarrow.fs import HadoopFileSystem From 46d91357cf60784933b4c64fe5adc5c40abf9a77 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 8 Feb 2024 00:23:25 +0000 Subject: [PATCH 05/36] Update patch --- azure.patch | 166 ---------------------------------------------------- 1 file changed, 166 deletions(-) diff --git a/azure.patch b/azure.patch index ce095bb3737..4a61c138514 100644 --- a/azure.patch +++ b/azure.patch @@ -52,169 +52,3 @@ index bcb6b30e7..a02739e00 100644 for fs in filesystems: status = "Enabled" if _filesystem_is_available(fs) else "-" print(f" {fs: <20}: {status: <8}") -diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx -new file mode 100644 -index 000000000..0e0f5ff2b ---- /dev/null -+++ b/python/pyarrow/_azurefs.pyx -@@ -0,0 +1,86 @@ -+# Licensed to the Apache Software Foundation (ASF) under one -+# or more contributor license agreements. See the NOTICE file -+# distributed with this work for additional information -+# regarding copyright ownership. The ASF licenses this file -+# to you under the Apache License, Version 2.0 (the -+# "License"); you may not use this file except in compliance -+# with the License. 
You may obtain a copy of the License at -+# -+# http://www.apache.org/licenses/LICENSE-2.0 -+# -+# Unless required by applicable law or agreed to in writing, -+# software distributed under the License is distributed on an -+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -+# KIND, either express or implied. See the License for the -+# specific language governing permissions and limitations -+# under the License. -+ -+# cython: language_level = 3 -+ -+from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata, -+ pyarrow_unwrap_metadata) -+from pyarrow.lib import frombytes, tobytes, KeyValueMetadata, ensure_metadata -+from pyarrow.includes.common cimport * -+from pyarrow.includes.libarrow cimport * -+from pyarrow.includes.libarrow_fs cimport * -+from pyarrow._fs cimport FileSystem, TimePoint_to_ns, PyDateTime_to_TimePoint -+from cython.operator cimport dereference as deref -+ -+from datetime import datetime, timedelta, timezone -+ -+ -+cdef class AzureBlobFileSystem(FileSystem): -+ cdef: -+ CAzureBlobFileSystem* azurefs -+ -+ def __init__(self, *, uri=None): -+ cdef: -+ CAzureOptions options -+ shared_ptr[CAzureBlobFileSystem] wrapped -+ -+ options = GetResultValue(CAzureOptions.FromUriString(tobytes(uri))) -+ -+ with nogil: -+ wrapped = GetResultValue(CAzureBlobFileSystem.Make(options)) -+ -+ self.init( wrapped) -+ -+ cdef init(self, const shared_ptr[CFileSystem]& wrapped): -+ FileSystem.init(self, wrapped) -+ self.azurefs = wrapped.get() -+ -+ @classmethod -+ def _reconstruct(cls, kwargs): -+ return cls(**kwargs) -+ -+ def __reduce__(self): -+ cdef CAzureOptions opts = self.azurefs.options() -+ return ( -+ AzureBlobFileSystem._reconstruct, (dict( -+ uri=frombytes(opts.account_blob_url), -+ ),)) -diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx -index 557c08149..6e840dabe 100644 ---- a/python/pyarrow/_fs.pyx -+++ b/python/pyarrow/_fs.pyx -@@ -493,6 +493,9 @@ cdef class FileSystem(_Weakrefable): - elif typ == 'hdfs': - from pyarrow._hdfs import HadoopFileSystem - self = HadoopFileSystem.__new__(HadoopFileSystem) -+ elif typ == 'abfs': -+ from pyarrow._azurefs import AzureBlobFileSystem -+ self = AzureBlobFileSystem.__new__(AzureBlobFileSystem) - elif typ.startswith('py::'): - self = PyFileSystem.__new__(PyFileSystem) - else: -diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py -index c6f44ccbb..c198e61b8 100644 ---- a/python/pyarrow/fs.py -+++ b/python/pyarrow/fs.py -@@ -40,6 +40,11 @@ FileStats = FileInfo - - _not_imported = [] - -+try: -+ from pyarrow._azurefs import AzureBlobFileSystem # noqa -+except ImportError: -+ _not_imported.append("AzureBlobFileSystem") -+ - try: - from pyarrow._hdfs import HadoopFileSystem # noqa - except ImportError: -diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd -index bf22ead83..a56a68816 100644 ---- a/python/pyarrow/includes/libarrow_fs.pxd -+++ b/python/pyarrow/includes/libarrow_fs.pxd -@@ -246,6 +246,18 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: - CResult[shared_ptr[CGcsFileSystem]] Make(const CGcsOptions& options) - CGcsOptions options() - -+ cdef cppclass CAzureOptions "arrow::fs::AzureOptions": -+ c_string account_blob_url -+ c_bool Equals(const CAzureOptions& other) -+ -+ @staticmethod -+ CResult[CAzureOptions] FromUriString "FromUri"(const c_string& uri_string) -+ -+ cdef cppclass CAzureBlobFileSystem "arrow::fs::AzureBlobFileSystem": -+ @staticmethod -+ CResult[shared_ptr[CAzureBlobFileSystem]] Make(const CAzureOptions& options) -+ 
CAzureOptions options() -+ - cdef cppclass CHdfsOptions "arrow::fs::HdfsOptions": - HdfsConnectionConfig connection_config - int32_t buffer_size -diff --git a/python/setup.py b/python/setup.py -index 2a7e5f7a3..d8b1d565c 100755 ---- a/python/setup.py -+++ b/python/setup.py -@@ -115,6 +115,7 @@ class build_ext(_build_ext): - ('with-parquet', None, 'build the Parquet extension'), - ('with-parquet-encryption', None, - 'build the Parquet encryption extension'), -+ ('with-azure', None, 'build the Azure Blob Storage extension'), - ('with-gcs', None, - 'build the Google Cloud Storage (GCS) extension'), - ('with-s3', None, 'build the Amazon S3 extension'), -@@ -163,6 +164,8 @@ class build_ext(_build_ext): - os.environ.get('PYARROW_WITH_GCS', '0')) - self.with_s3 = strtobool( - os.environ.get('PYARROW_WITH_S3', '0')) -+ self.with_azure = strtobool( -+ os.environ.get('PYARROW_WITH_AZURE', '0')) - self.with_hdfs = strtobool( - os.environ.get('PYARROW_WITH_HDFS', '0')) - self.with_cuda = strtobool( -@@ -224,6 +227,7 @@ class build_ext(_build_ext): - '_orc', - '_plasma', - '_gcsfs', -+ '_azurefs', - '_s3fs', - '_substrait', - '_hdfs', -@@ -369,6 +373,7 @@ class build_ext(_build_ext): - append_cmake_bool(self.with_plasma, 'PYARROW_BUILD_PLASMA') - append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS') - append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3') -+ append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE') - append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS') - append_cmake_bool(self.with_tensorflow, 'PYARROW_USE_TENSORFLOW') - append_cmake_bool(self.bundle_arrow_cpp, -@@ -557,6 +562,8 @@ class build_ext(_build_ext): - return True - if name == '_s3fs' and not self.with_s3: - return True -+ if name == '_azurefs' and not self.with_azure: -+ return True - if name == '_hdfs' and not self.with_hdfs: - return True - if name == '_dataset' and not self.with_dataset: From e51af7b138fd648e18745c79e94509cd735de27f Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 8 Feb 2024 10:10:37 +0000 Subject: [PATCH 06/36] Working build with azure bindings --- azure.patch | 54 ----------------------------- cpp/src/arrow/filesystem/api.h | 3 ++ cpp/src/arrow/filesystem/type_fwd.h | 1 + cpp/src/arrow/util/config.h.cmake | 1 + python/CMakeLists.txt | 4 +++ python/setup.py | 2 ++ 6 files changed, 11 insertions(+), 54 deletions(-) delete mode 100644 azure.patch diff --git a/azure.patch b/azure.patch deleted file mode 100644 index 4a61c138514..00000000000 --- a/azure.patch +++ /dev/null @@ -1,54 +0,0 @@ -diff --git a/cpp/src/arrow/filesystem/api.h b/cpp/src/arrow/filesystem/api.h -index 732be5f92..adadf6a51 100644 ---- a/cpp/src/arrow/filesystem/api.h -+++ b/cpp/src/arrow/filesystem/api.h -@@ -20,6 +20,9 @@ - #include "arrow/util/config.h" // IWYU pragma: export - - #include "arrow/filesystem/filesystem.h" // IWYU pragma: export -+#ifdef ARROW_AZURE -+#include "arrow/filesystem/azurefs.h" // IWYU pragma: export -+#endif - #include "arrow/filesystem/hdfs.h" // IWYU pragma: export - #ifdef ARROW_GCS - #include "arrow/filesystem/gcsfs.h" // IWYU pragma: export -diff --git a/cpp/src/arrow/filesystem/type_fwd.h b/cpp/src/arrow/filesystem/type_fwd.h -index 892f7ad2e..c56baf79f 100644 ---- a/cpp/src/arrow/filesystem/type_fwd.h -+++ b/cpp/src/arrow/filesystem/type_fwd.h -@@ -47,6 +47,7 @@ class SlowFileSystem; - class LocalFileSystem; - class S3FileSystem; - class GcsFileSystem; -+class AzureBlobFileSystem; - - } // namespace fs - } // namespace arrow -diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt -index 
bad5e926a..08840740d 100644 ---- a/python/CMakeLists.txt -+++ b/python/CMakeLists.txt -@@ -424,6 +424,10 @@ if(PYARROW_BUILD_S3) - set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _s3fs) - endif() - -+if(PYARROW_BUILD_AZURE) -+ set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _azurefs) -+endif() -+ - if(PYARROW_BUILD_HDFS) - set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _hdfs) - endif() -diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py -index bcb6b30e7..a02739e00 100644 ---- a/python/pyarrow/__init__.py -+++ b/python/pyarrow/__init__.py -@@ -145,7 +145,7 @@ def show_info(): - print(f" {module: <20}: {status: <8}") - - print("\nFilesystems:") -- filesystems = ["GcsFileSystem", "HadoopFileSystem", "S3FileSystem"] -+ filesystems = ["GcsFileSystem", "HadoopFileSystem", "S3FileSystem", "AzureBlobFileSystem"] - for fs in filesystems: - status = "Enabled" if _filesystem_is_available(fs) else "-" - print(f" {fs: <20}: {status: <8}") diff --git a/cpp/src/arrow/filesystem/api.h b/cpp/src/arrow/filesystem/api.h index 732be5f928f..d1441f7016c 100644 --- a/cpp/src/arrow/filesystem/api.h +++ b/cpp/src/arrow/filesystem/api.h @@ -21,6 +21,9 @@ #include "arrow/filesystem/filesystem.h" // IWYU pragma: export #include "arrow/filesystem/hdfs.h" // IWYU pragma: export +#ifdef ARROW_AZURE +#include "arrow/filesystem/azurefs.h" // IWYU pragma: export +#endif #ifdef ARROW_GCS #include "arrow/filesystem/gcsfs.h" // IWYU pragma: export #endif diff --git a/cpp/src/arrow/filesystem/type_fwd.h b/cpp/src/arrow/filesystem/type_fwd.h index 892f7ad2e1b..7238d3b8040 100644 --- a/cpp/src/arrow/filesystem/type_fwd.h +++ b/cpp/src/arrow/filesystem/type_fwd.h @@ -47,6 +47,7 @@ class SlowFileSystem; class LocalFileSystem; class S3FileSystem; class GcsFileSystem; +class AzureFileSystem; } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/util/config.h.cmake b/cpp/src/arrow/util/config.h.cmake index fb42a53139f..f93c888de27 100644 --- a/cpp/src/arrow/util/config.h.cmake +++ b/cpp/src/arrow/util/config.h.cmake @@ -53,6 +53,7 @@ #cmakedefine ARROW_SUBSTRAIT #cmakedefine ARROW_ENABLE_THREADING +#cmakedefine ARROW_AZURE #cmakedefine ARROW_GCS #cmakedefine ARROW_HDFS #cmakedefine ARROW_S3 diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 8c98e269d6f..af65ea7d614 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -554,6 +554,10 @@ set_source_files_properties(pyarrow/lib.pyx PROPERTIES CYTHON_API TRUE) set(LINK_LIBS arrow_python) +if(PYARROW_BUILD_AZURE) + list(APPEND CYTHON_EXTENSIONS _azurefs) +endif() + if(PYARROW_BUILD_GCS) list(APPEND CYTHON_EXTENSIONS _gcsfs) endif() diff --git a/python/setup.py b/python/setup.py index b2eac40e3b0..07032ad23c8 100755 --- a/python/setup.py +++ b/python/setup.py @@ -206,6 +206,7 @@ def initialize_options(self): '_dataset_orc', '_dataset_parquet', '_acero', + '_azurefs', '_feather', '_parquet', '_parquet_encryption', @@ -284,6 +285,7 @@ def append_cmake_bool(value, varname): 'PYARROW_BUILD_PARQUET_ENCRYPTION') append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS') append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3') + append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE') append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS') append_cmake_bool(self.bundle_arrow_cpp, 'PYARROW_BUNDLE_ARROW_CPP') From 6f6cf33daa6cebd39e8b1e7cb6a50f753265b901 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 8 Feb 2024 11:01:30 +0000 Subject: [PATCH 07/36] Fix account_name configuration --- python/pyarrow/_azurefs.pyx | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx index 1b134919e30..d22cca09b4e 100644 --- a/python/pyarrow/_azurefs.pyx +++ b/python/pyarrow/_azurefs.pyx @@ -38,7 +38,7 @@ cdef class AzureFileSystem(FileSystem): CAzureOptions options shared_ptr[CAzureFileSystem] wrapped - options.account_name + options.account_name = tobytes(account_name) options.ConfigureDefaultCredential() with nogil: From 632193735e8a3dec532348a80fc983ebff422b8a Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 8 Feb 2024 21:18:40 +0000 Subject: [PATCH 08/36] Sufficient pybinds to connect to azurite --- python/pyarrow/_azurefs.pyx | 18 ++++++++++++++++-- python/pyarrow/includes/libarrow_fs.pxd | 1 + 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx index d22cca09b4e..cbf53c49494 100644 --- a/python/pyarrow/_azurefs.pyx +++ b/python/pyarrow/_azurefs.pyx @@ -33,13 +33,27 @@ cdef class AzureFileSystem(FileSystem): cdef: CAzureFileSystem* azurefs - def __init__(self, *, account_name): + def __init__(self, *, account_name, account_key=None, blob_storage_authority=None, + dfs_storage_authority=None, blob_storage_scheme=None, + dfs_storage_scheme=None): cdef: CAzureOptions options shared_ptr[CAzureFileSystem] wrapped options.account_name = tobytes(account_name) - options.ConfigureDefaultCredential() + if blob_storage_authority: + options.blob_storage_authority = tobytes(blob_storage_authority) + if dfs_storage_authority: + options.dfs_storage_authority = tobytes(dfs_storage_authority) + if blob_storage_scheme: + options.blob_storage_scheme = tobytes(blob_storage_scheme) + if dfs_storage_scheme: + options.dfs_storage_scheme = tobytes(dfs_storage_scheme) + + if account_key: + options.ConfigureAccountKeyCredential(tobytes(account_key)) + else: + options.ConfigureDefaultCredential() with nogil: wrapped = GetResultValue(CAzureFileSystem.Make(options)) diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index 0368dc516ed..8c55bf7bc41 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -260,6 +260,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: c_bool Equals(const CAzureOptions& other) CStatus ConfigureDefaultCredential() + CStatus ConfigureAccountKeyCredential(c_string account_key) cdef cppclass CAzureFileSystem "arrow::fs::AzureFileSystem": @staticmethod From dedeea88a64ba2bbd1f0aa42fefd165826c175da Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 8 Feb 2024 22:42:34 +0000 Subject: [PATCH 09/36] Somewhat working azurite tests --- python/pyarrow/tests/conftest.py | 9 +++++++-- python/pyarrow/tests/test_fs.py | 13 ++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 470a855d712..5df11833ad5 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -261,7 +261,12 @@ def azure_server(tmpdir_factory): port = find_free_port() env = os.environ.copy() tmpdir = tmpdir_factory.getbasetemp() - args = ['azurite', '--quiet', "--location", tmpdir] + # Port 0 means azurite will select any free port. We don't need to connect + # to the queue or table services, we just need them to not conflict with + # other ports. 
+    # TODO(tomnewton): Get a suitable debug tmpdir
+    args = ['azurite', "--location", tmpdir, "--blobPort", str(port),
+            "--queuePort", "0", "--tablePort", "0", "--debug", "/tmp/azurite_debug/"]
     proc = None
     try:
         proc = subprocess.Popen(args, env=env)
@@ -272,7 +277,7 @@ def azure_server(tmpdir_factory):
         pytest.skip(f"Command {args} failed to execute: {e}")
     else:
         yield {
-            'connection': ('localhost', port),
+            'connection': ('127.0.0.1', port),
             'process': proc,
             'tempdir': tmpdir,
         }
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index f7af34d069e..282f466c0ce 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -297,13 +297,16 @@ def azurefs(request, azure_server):
     from pyarrow.fs import AzureFileSystem
 
     host, port = azure_server['connection']
+    azurite_authority = f"{host}:{port}"
+    azurite_scheme = "http"
+
     container = 'pyarrow-filesystem/'
 
-    fs = AzureFileSystem(account_name='devstoreaccount1')
-    try:
-        fs.create_dir(container)
-    except OSError as e:
-        pytest.skip(f"Could not create directory in {fs}: {e}")
+    fs = AzureFileSystem(account_name='devstoreaccount1', account_key='Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==',
+                         blob_storage_authority=azurite_authority, dfs_storage_authority=azurite_authority,
+                         blob_storage_scheme=azurite_scheme, dfs_storage_scheme=azurite_scheme)
+
+    fs.create_dir(container)
 
     yield dict(
         fs=fs,

From f7c650aae71beec10102030452a421046ffd23de Mon Sep 17 00:00:00 2001
From: Thomas Newton
Date: Fri, 9 Feb 2024 23:09:55 +0000
Subject: [PATCH 10/36] Skip move tests which are not yet implemented

---
 python/pyarrow/tests/test_fs.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index 282f466c0ce..5988d01a21a 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -311,7 +311,7 @@ def azurefs(request, azure_server):
     yield dict(
         fs=fs,
         pathfn=container.__add__,
-        allow_move_dir=False,  # TODO(GH-38704): Switch this to True when AzureFileSystem adds support for it.
+        allow_move_dir=False,  # AzureFileSystem will only support this in hierarchical namespace accounts.
         allow_append_to_file=True,
     )
     fs.delete_dir(container)
@@ -443,11 +443,6 @@ def py_fsspec_s3fs(request, s3_server):
         id='PyFileSystem(FSSpecHandler(s3fs.S3FileSystem()))',
         marks=pytest.mark.s3
     ),
-    pytest.param(
-        'py_fsspec_azurefs',
-        id='PyFileSystem(FSSpecHandler(azurefs.AzureFileSystem()))',
-        marks=pytest.mark.azure,
-    ),
 ])
 def filesystem_config(request):
     return request.getfixturevalue(request.param)
@@ -501,6 +496,10 @@ def skip_fsspec_s3fs(fs):
     if fs.type_name == "py::fsspec+('s3', 's3a')":
         pytest.xfail(reason="Not working with fsspec's s3fs")
 
+def skip_azure(fs):
+    if fs.type_name == "abfs":
+        pytest.xfail(reason="Not implemented yet in abfs.
See GH-18014") + @pytest.mark.s3 def test_s3fs_limited_permissions_create_bucket(s3_server): @@ -892,6 +891,9 @@ def test_copy_file(fs, pathfn): def test_move_directory(fs, pathfn, allow_move_dir): + # TODO(GH-38704): Stop skipping this test once AzureFileSystem add support + skip_azure(fs) + # move directory (doesn't work with S3) s = pathfn('source-dir/') t = pathfn('target-dir/') @@ -912,6 +914,9 @@ def test_move_file(fs, pathfn): # s3fs moving a file with recursive=True on latest 0.5 version # (https://github.com/dask/s3fs/issues/394) skip_fsspec_s3fs(fs) + + # TODO(GH-38704): Stop skipping this test once AzureFileSystem add support + skip_azure(fs) s = pathfn('test-move-source-file') t = pathfn('test-move-target-file') From 995d787bbda25d660e94d6625937de0167149bef Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 9 Feb 2024 23:32:47 +0000 Subject: [PATCH 11/36] Update skipped tests --- python/pyarrow/tests/test_fs.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 5988d01a21a..cdfb6662dfc 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -496,9 +496,9 @@ def skip_fsspec_s3fs(fs): if fs.type_name == "py::fsspec+('s3', 's3a')": pytest.xfail(reason="Not working with fsspec's s3fs") -def skip_azure(fs): +def skip_azure(fs, reason): if fs.type_name == "abfs": - pytest.xfail(reason="Not implemented yet in abfs. See GH-18014") + pytest.xfail(reason=reason) @pytest.mark.s3 @@ -892,7 +892,7 @@ def test_copy_file(fs, pathfn): def test_move_directory(fs, pathfn, allow_move_dir): # TODO(GH-38704): Stop skipping this test once AzureFileSystem add support - skip_azure(fs) + skip_azure(fs, "Not implemented yet in abfs. See GH-38704") # move directory (doesn't work with S3) s = pathfn('source-dir/') @@ -914,9 +914,9 @@ def test_move_file(fs, pathfn): # s3fs moving a file with recursive=True on latest 0.5 version # (https://github.com/dask/s3fs/issues/394) skip_fsspec_s3fs(fs) - + # TODO(GH-38704): Stop skipping this test once AzureFileSystem add support - skip_azure(fs) + skip_azure(fs, "Not implemented yet in abfs. See GH-38704") s = pathfn('test-move-source-file') t = pathfn('test-move-target-file') @@ -1069,7 +1069,9 @@ def test_open_output_stream_metadata(fs, pathfn): assert f.read() == data got_metadata = f.metadata() - if fs.type_name in ['s3', 'gcs'] or 'mock' in fs.type_name: + if fs.type_name in ['s3', 'gcs', 'abfs'] or 'mock' in fs.type_name: + # TODO(tomnewton): Create a Github issue for this. 
+ skip_azure(fs, "Azure filesystem currently only returns system metadata not user metadata") for k, v in metadata.items(): assert got_metadata[k] == v.encode() else: From 7af5a422d3b8f994811f70c83883165b7191bd35 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 10 Feb 2024 12:53:53 +0000 Subject: [PATCH 12/36] Working pickling tests --- python/pyarrow/_azurefs.pyx | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx index cbf53c49494..cbc3f520de6 100644 --- a/python/pyarrow/_azurefs.pyx +++ b/python/pyarrow/_azurefs.pyx @@ -17,6 +17,8 @@ # cython: language_level = 3 +from cython cimport binding + from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata, pyarrow_unwrap_metadata) from pyarrow.lib import frombytes, tobytes, KeyValueMetadata, ensure_metadata @@ -32,6 +34,7 @@ from datetime import datetime, timedelta, timezone cdef class AzureFileSystem(FileSystem): cdef: CAzureFileSystem* azurefs + c_string account_key def __init__(self, *, account_name, account_key=None, blob_storage_authority=None, dfs_storage_authority=None, blob_storage_scheme=None, @@ -52,6 +55,7 @@ cdef class AzureFileSystem(FileSystem): if account_key: options.ConfigureAccountKeyCredential(tobytes(account_key)) + self.account_key = tobytes(account_key) else: options.ConfigureDefaultCredential() @@ -64,13 +68,22 @@ cdef class AzureFileSystem(FileSystem): FileSystem.init(self, wrapped) self.azurefs = wrapped.get() - @classmethod - def _reconstruct(cls, kwargs): - return cls(**kwargs) + @staticmethod + @binding(True) # Required for cython < 3 + def _reconstruct(kwargs): + # __reduce__ doesn't allow passing named arguments directly to the + # reconstructor, hence this wrapper. + return AzureFileSystem(**kwargs) def __reduce__(self): cdef CAzureOptions opts = self.azurefs.options() return ( AzureFileSystem._reconstruct, (dict( account_name=frombytes(opts.account_name), + # TODO(tomnewton): Check if pickling still works if account_key is None + account_key=frombytes(self.account_key), + blob_storage_authority=frombytes(opts.blob_storage_authority), + dfs_storage_authority=frombytes(opts.dfs_storage_authority), + blob_storage_scheme=frombytes(opts.blob_storage_scheme), + dfs_storage_scheme=frombytes(opts.dfs_storage_scheme) ),)) From 85d9de37213465b60bd9d3716997364b7550901f Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 10 Feb 2024 18:03:35 +0000 Subject: [PATCH 13/36] Update TODO comments with references to relevant Github issues --- cpp/src/arrow/filesystem/azurefs_test.cc | 2 ++ python/pyarrow/tests/test_fs.py | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 0ce84043a53..8e0d421b64a 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -2458,6 +2458,7 @@ TEST_F(TestAzuriteFileSystem, WriteMetadata) { ASSERT_OK(output->Close()); // Verify the metadata has been set. + // TODO(GH-40025): Use `AzureFileSystem` to fetch metadata for this assertion. 
  auto blob_metadata = blob_service_client_->GetBlobContainerClient(data.container_name)
                           .GetBlockBlobClient(blob_path)
                           .GetProperties()
@@ -2470,6 +2471,7 @@ TEST_F(TestAzuriteFileSystem, WriteMetadata) {
       full_path, /*metadata=*/arrow::key_value_metadata({{"bar", "foo"}})));
   ASSERT_OK(output->Write(expected));
   ASSERT_OK(output->Close());
+  // TODO(GH-40025): Use `AzureFileSystem` to fetch metadata for this assertion.
   blob_metadata = blob_service_client_->GetBlobContainerClient(data.container_name)
                       .GetBlockBlobClient(blob_path)
                       .GetProperties()
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index cdfb6662dfc..0883a194e3d 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -891,8 +891,8 @@ def test_copy_file(fs, pathfn):
 
 
 def test_move_directory(fs, pathfn, allow_move_dir):
-    # TODO(GH-38704): Stop skipping this test once AzureFileSystem adds support
-    skip_azure(fs, "Not implemented yet in abfs. See GH-38704")
+    # TODO(GH-40025): Stop skipping this test
+    skip_azure(fs, "Not implemented yet for Azure. See GH-40025")
 
     # move directory (doesn't work with S3)
     s = pathfn('source-dir/')
@@ -915,8 +915,8 @@ def test_move_file(fs, pathfn):
     # (https://github.com/dask/s3fs/issues/394)
     skip_fsspec_s3fs(fs)
 
-    # TODO(GH-38704): Stop skipping this test once AzureFileSystem adds support
-    skip_azure(fs, "Not implemented yet in abfs. See GH-38704")
+    # TODO(GH-40025): Stop skipping this test
+    skip_azure(fs, "Not implemented yet for Azure. See GH-40025")
 
     s = pathfn('test-move-source-file')
     t = pathfn('test-move-target-file')
@@ -1070,8 +1070,8 @@ def test_open_output_stream_metadata(fs, pathfn):
     got_metadata = f.metadata()
 
     if fs.type_name in ['s3', 'gcs', 'abfs'] or 'mock' in fs.type_name:
-        # TODO(tomnewton): Create a Github issue for this.
-        skip_azure(fs, "Azure filesystem currently only returns system metadata not user metadata")
+        # TODO(GH-40026): Stop skipping this test
+        skip_azure(fs, "Azure filesystem currently only returns system metadata not user metadata. See GH-40026")
         for k, v in metadata.items():
             assert got_metadata[k] == v.encode()
     else:

From 9bb2c1b3dede2c6c89ced0a671bc395eb1813c52 Mon Sep 17 00:00:00 2001
From: Thomas Newton
Date: Sat, 10 Feb 2024 18:35:18 +0000
Subject: [PATCH 14/36] Add test_azurefs_options

---
 python/pyarrow/_azurefs.pyx     |  1 -
 python/pyarrow/tests/test_fs.py | 27 ++++++++++++++++++++++++---
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx
index cbc3f520de6..11ebb9c95a2 100644
--- a/python/pyarrow/_azurefs.pyx
+++ b/python/pyarrow/_azurefs.pyx
@@ -80,7 +80,6 @@ cdef class AzureFileSystem(FileSystem):
         return (
             AzureFileSystem._reconstruct, (dict(
                 account_name=frombytes(opts.account_name),
-                # TODO(tomnewton): Check if pickling still works if account_key is None
                 account_key=frombytes(self.account_key),
                 blob_storage_authority=frombytes(opts.blob_storage_authority),
                 dfs_storage_authority=frombytes(opts.dfs_storage_authority),
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index 0883a194e3d..53ae13b0164 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -1421,9 +1421,30 @@ def test_s3fs_wrong_region():
         fs.get_file_info("voltrondata-labs-datasets")
 
 
-def test_azurefs_options():
-    # TODO(tomnewton)
-    pass
+@pytest.mark.azure
+def test_azurefs_options(pickle_module):
+    from pyarrow.fs import AzureFileSystem
+
+    fs1 = AzureFileSystem(account_name='fake-account-name')
+    assert isinstance(fs1, AzureFileSystem)
+    assert pickle_module.loads(pickle_module.dumps(fs1)) == fs1
+
+    fs2 = AzureFileSystem(account_name='fake-account-name', account_key='fakeaccountkey')
+    assert isinstance(fs2, AzureFileSystem)
+    assert pickle_module.loads(pickle_module.dumps(fs2)) == fs2
+    assert fs2 != fs1
+
+    fs3 = AzureFileSystem(account_name='fake-account', account_key='fakeaccount',
+                          blob_storage_authority='fake-blob-authority',
+                          dfs_storage_authority='fake-dfs-authority',
+                          blob_storage_scheme='fake-blob-scheme', dfs_storage_scheme='fake-dfs-scheme')
+    assert isinstance(fs3, AzureFileSystem)
+    assert pickle_module.loads(pickle_module.dumps(fs3)) == fs3
+    assert fs3 != fs2
+
+    with pytest.raises(TypeError):
+        AzureFileSystem()
+
 
 @pytest.mark.hdfs
 def test_hdfs_options(hdfs_connection, pickle_module):
     from pyarrow.fs import HadoopFileSystem

From 4641fc7770ad2d69f66467dfd8a6b76af97a79d1 Mon Sep 17 00:00:00 2001
From: Thomas Newton
Date: Sat, 10 Feb 2024 18:35:29 +0000
Subject: [PATCH 15/36] Tidy azure_server

---
 python/pyarrow/tests/conftest.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index 5df11833ad5..b756363d2a9 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -261,12 +261,11 @@ def azure_server(tmpdir_factory):
     port = find_free_port()
     env = os.environ.copy()
     tmpdir = tmpdir_factory.getbasetemp()
-    # Port 0 means azurite will select any free port. We don't need to connect
-    # to the queue or table services, we just need them to not conflict with
-    # other ports.
-    # TODO(tomnewton): Get a suitable debug tmpdir
+    # Port 0 means azurite will select any free port. We don't need to connect
+    # to the queue or table services, we just need them to not conflict with
+    # other ports.
args = ['azurite', "--location", tmpdir, "--blobPort", str(port), - "--queuePort", "0", "--tablePort", "0", "--debug", "/tmp/azurite_debug/"] + "--queuePort", "0", "--tablePort", "0"] proc = None try: proc = subprocess.Popen(args, env=env) From 9d11166b6a8bd273700fad448f09f0e577919380 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 10 Feb 2024 19:02:09 +0000 Subject: [PATCH 16/36] A bit of alphabetical ordering --- cpp/src/arrow/filesystem/api.h | 2 +- cpp/src/arrow/filesystem/type_fwd.h | 8 ++++---- python/pyarrow/tests/conftest.py | 2 +- python/setup.py | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/filesystem/api.h b/cpp/src/arrow/filesystem/api.h index d1441f7016c..082f8180d8e 100644 --- a/cpp/src/arrow/filesystem/api.h +++ b/cpp/src/arrow/filesystem/api.h @@ -20,13 +20,13 @@ #include "arrow/util/config.h" // IWYU pragma: export #include "arrow/filesystem/filesystem.h" // IWYU pragma: export -#include "arrow/filesystem/hdfs.h" // IWYU pragma: export #ifdef ARROW_AZURE #include "arrow/filesystem/azurefs.h" // IWYU pragma: export #endif #ifdef ARROW_GCS #include "arrow/filesystem/gcsfs.h" // IWYU pragma: export #endif +#include "arrow/filesystem/hdfs.h" // IWYU pragma: export #include "arrow/filesystem/localfs.h" // IWYU pragma: export #include "arrow/filesystem/mockfs.h" // IWYU pragma: export #ifdef ARROW_S3 diff --git a/cpp/src/arrow/filesystem/type_fwd.h b/cpp/src/arrow/filesystem/type_fwd.h index 7238d3b8040..92c70799be1 100644 --- a/cpp/src/arrow/filesystem/type_fwd.h +++ b/cpp/src/arrow/filesystem/type_fwd.h @@ -42,12 +42,12 @@ struct FileInfo; struct FileSelector; class FileSystem; -class SubTreeFileSystem; -class SlowFileSystem; +class AzureFileSystem; +class GcsFileSystem; class LocalFileSystem; class S3FileSystem; -class GcsFileSystem; -class AzureFileSystem; +class SlowFileSystem; +class SubTreeFileSystem; } // namespace fs } // namespace arrow diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index b756363d2a9..0cf261fc170 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -263,7 +263,7 @@ def azure_server(tmpdir_factory): tmpdir = tmpdir_factory.getbasetemp() # Port 0 means azurite will select any free port. We don't need to connect # to the queue or table services, we just need them to not conflict with - # other ports. + # in use ports. 
     args = ['azurite', "--location", tmpdir, "--blobPort", str(port),
             "--queuePort", "0", "--tablePort", "0"]
     proc = None
diff --git a/python/setup.py b/python/setup.py
index 07032ad23c8..bc4ad2b3c92 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -283,9 +283,9 @@ def append_cmake_bool(value, varname):
         append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET')
         append_cmake_bool(self.with_parquet_encryption,
                           'PYARROW_BUILD_PARQUET_ENCRYPTION')
+        append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE')
         append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS')
         append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3')
-        append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE')
         append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS')
         append_cmake_bool(self.bundle_arrow_cpp,
                           'PYARROW_BUNDLE_ARROW_CPP')
@@ -350,12 +350,12 @@ def _failure_permitted(self, name):
             return True
         if name == '_substrait' and not self.with_substrait:
             return True
+        if name == '_azurefs' and not self.with_azure:
+            return True
         if name == '_gcsfs' and not self.with_gcs:
             return True
         if name == '_s3fs' and not self.with_s3:
             return True
-        if name == '_azurefs' and not self.with_azure:
-            return True
         if name == '_hdfs' and not self.with_hdfs:
             return True
         if name == '_dataset' and not self.with_dataset:

From 2b68dff5f461a3326917f87953f115e0a67582d8 Mon Sep 17 00:00:00 2001
From: Thomas Newton
Date: Sat, 10 Feb 2024 19:15:16 +0000
Subject: [PATCH 17/36] Archery lint

---
 python/pyarrow/__init__.py              |  3 ++-
 python/pyarrow/_azurefs.pyx             | 18 ++++++-----------
 python/pyarrow/includes/libarrow_fs.pxd |  2 +-
 python/pyarrow/tests/conftest.py        |  2 +-
 python/pyarrow/tests/test_fs.py         | 27 +++++++++++++++++--------
 python/setup.py                         |  2 +-
 6 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index e72e3bace30..936f4736977 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -145,7 +145,8 @@ def print_entry(label, value):
         print(f"  {module: <20}: {status: <8}")
 
     print("\nFilesystems:")
-    filesystems = ["AzureFileSystem", "GcsFileSystem", "HadoopFileSystem", "S3FileSystem"]
+    filesystems = ["AzureFileSystem", "GcsFileSystem",
+                   "HadoopFileSystem", "S3FileSystem"]
     for fs in filesystems:
         status = "Enabled" if _filesystem_is_available(fs) else "-"
         print(f"  {fs: <20}: {status: <8}")
diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx
index 11ebb9c95a2..be277dfc65f 100644
--- a/python/pyarrow/_azurefs.pyx
+++ b/python/pyarrow/_azurefs.pyx
@@ -19,16 +19,10 @@
 
 from cython cimport binding
 
-from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata,
-                          pyarrow_unwrap_metadata)
-from pyarrow.lib import frombytes, tobytes, KeyValueMetadata, ensure_metadata
-from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport *
-from pyarrow.includes.libarrow_fs cimport *
-from pyarrow._fs cimport FileSystem, TimePoint_to_ns, PyDateTime_to_TimePoint
-from cython.operator cimport dereference as deref
-
-from datetime import datetime, timedelta, timezone
+
+from pyarrow.lib import frombytes, tobytes
+from pyarrow.includes.libarrow_fs cimport *
+from pyarrow._fs cimport FileSystem
 
 
 cdef class AzureFileSystem(FileSystem):
@@ -36,8 +30,8 @@ cdef class AzureFileSystem(FileSystem):
         CAzureFileSystem* azurefs
         c_string account_key
 
-    def __init__(self, *, account_name, account_key=None, blob_storage_authority=None,
-                 dfs_storage_authority=None, blob_storage_scheme=None,
-                 dfs_storage_scheme=None):
+    def __init__(self, *, account_name, account_key=None, blob_storage_authority=None,
+                 dfs_storage_authority=None, blob_storage_scheme=None,
+                 dfs_storage_scheme=None):
         cdef:
             CAzureOptions options
             shared_ptr[CAzureFileSystem] wrapped
@@ -52,7 +46,7 @@ cdef class AzureFileSystem(FileSystem):
             options.blob_storage_scheme = tobytes(blob_storage_scheme)
         if dfs_storage_scheme:
             options.dfs_storage_scheme = tobytes(dfs_storage_scheme)
-
+
         if account_key:
             options.ConfigureAccountKeyCredential(tobytes(account_key))
             self.account_key = tobytes(account_key)
diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd
index 8c55bf7bc41..328b426a498 100644
--- a/python/pyarrow/includes/libarrow_fs.pxd
+++ b/python/pyarrow/includes/libarrow_fs.pxd
@@ -257,7 +257,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil:
         c_string dfs_storage_authority
         c_string blob_storage_scheme
         c_string dfs_storage_scheme
-
+
         c_bool Equals(const CAzureOptions& other)
         CStatus ConfigureDefaultCredential()
         CStatus ConfigureAccountKeyCredential(c_string account_key)
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index 0cf261fc170..61c3635ca5d 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -264,7 +264,7 @@ def azure_server(tmpdir_factory):
     # Port 0 means azurite will select any free port. We don't need to connect
     # to the queue or table services, we just need them to not conflict with
     # in use ports.
-    args = ['azurite', "--location", tmpdir, "--blobPort", str(port),
+    args = ['azurite', "--location", tmpdir, "--blobPort", str(port),
             "--queuePort", "0", "--tablePort", "0"]
     proc = None
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index 53ae13b0164..1a7aa9d0f5b 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -302,20 +302,26 @@ def azurefs(request, azure_server):
 
     container = 'pyarrow-filesystem/'
 
-    fs = AzureFileSystem(account_name='devstoreaccount1', account_key='Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==',
-                         blob_storage_authority=azurite_authority, dfs_storage_authority=azurite_authority,
-                         blob_storage_scheme=azurite_scheme, dfs_storage_scheme=azurite_scheme)
-
+    fs = AzureFileSystem(account_name='devstoreaccount1',
+                         account_key='Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuF'
+                                     'q2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==',
+                         blob_storage_authority=azurite_authority,
+                         dfs_storage_authority=azurite_authority,
+                         blob_storage_scheme=azurite_scheme,
+                         dfs_storage_scheme=azurite_scheme)
+
     fs.create_dir(container)
 
     yield dict(
         fs=fs,
         pathfn=container.__add__,
-        allow_move_dir=False,  # AzureFileSystem will only support this in hierarchical namespace accounts.
+        # AzureFileSystem will only support this in hierarchical namespace accounts.
+        allow_move_dir=False,
         allow_append_to_file=True,
     )
     fs.delete_dir(container)
 
+
 @pytest.fixture
 def hdfs(request, hdfs_connection):
     request.config.pyarrow.requires('hdfs')
@@ -496,6 +502,7 @@ def skip_fsspec_s3fs(fs):
     if fs.type_name == "py::fsspec+('s3', 's3a')":
         pytest.xfail(reason="Not working with fsspec's s3fs")
 
+
 def skip_azure(fs, reason):
     if fs.type_name == "abfs":
         pytest.xfail(reason=reason)
@@ -1071,7 +1078,9 @@ def test_open_output_stream_metadata(fs, pathfn):
     got_metadata = f.metadata()
 
     if fs.type_name in ['s3', 'gcs', 'abfs'] or 'mock' in fs.type_name:
         # TODO(GH-40026): Stop skipping this test
-        skip_azure(fs, "Azure filesystem currently only returns system metadata not user metadata. See GH-40026")
+        skip_azure(
+            fs, "Azure filesystem currently only returns system metadata not user "
+            "metadata.
See GH-40026") + skip_azure( + fs, "Azure filesystem currently only returns system metadata not user " + "metadata. See GH-40026") for k, v in metadata.items(): assert got_metadata[k] == v.encode() else: @@ -1429,7 +1438,8 @@ def test_azurefs_options(pickle_module): assert isinstance(fs1, AzureFileSystem) assert pickle_module.loads(pickle_module.dumps(fs1)) == fs1 - fs2 = AzureFileSystem(account_name='fake-account-name', account_key='fakeaccountkey') + fs2 = AzureFileSystem(account_name='fake-account-name', + account_key='fakeaccountkey') assert isinstance(fs2, AzureFileSystem) assert pickle_module.loads(pickle_module.dumps(fs2)) == fs2 assert fs2 != fs1 @@ -1437,7 +1447,8 @@ def test_azurefs_options(pickle_module): fs3 = AzureFileSystem(account_name='fake-account', account_key='fakeaccount', blob_storage_authority='fake-blob-authority', dfs_storage_authority='fake-dfs-authority', - blob_storage_scheme='fake-blob-scheme', dfs_storage_scheme='fake-dfs-scheme') + blob_storage_scheme='fake-blob-scheme', + dfs_storage_scheme='fake-dfs-scheme') assert isinstance(fs3, AzureFileSystem) assert pickle_module.loads(pickle_module.dumps(fs3)) == fs3 assert fs3 != fs2 diff --git a/python/setup.py b/python/setup.py index bc4ad2b3c92..869d102a754 100755 --- a/python/setup.py +++ b/python/setup.py @@ -105,7 +105,7 @@ def run(self): 'build type (debug or release), default release'), ('boost-namespace=', None, 'namespace of boost (default: boost)'), - ('with-azure', None, + ('with-azure', None, 'build the Azure Blob Storage extension'), ('with-cuda', None, 'build the Cuda extension'), ('with-flight', None, 'build the Flight extension'), From c5cb74a1523095674f96bbacaffb15380209b14b Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 10 Feb 2024 21:00:28 +0000 Subject: [PATCH 18/36] C++ autoformat --- cpp/src/arrow/filesystem/api.h | 2 +- cpp/src/arrow/filesystem/azurefs_test.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/api.h b/cpp/src/arrow/filesystem/api.h index 082f8180d8e..562b7c1808e 100644 --- a/cpp/src/arrow/filesystem/api.h +++ b/cpp/src/arrow/filesystem/api.h @@ -26,7 +26,7 @@ #ifdef ARROW_GCS #include "arrow/filesystem/gcsfs.h" // IWYU pragma: export #endif -#include "arrow/filesystem/hdfs.h" // IWYU pragma: export +#include "arrow/filesystem/hdfs.h" // IWYU pragma: export #include "arrow/filesystem/localfs.h" // IWYU pragma: export #include "arrow/filesystem/mockfs.h" // IWYU pragma: export #ifdef ARROW_S3 diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 8e0d421b64a..bd741fde8cc 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -2458,7 +2458,7 @@ TEST_F(TestAzuriteFileSystem, WriteMetadata) { ASSERT_OK(output->Close()); // Verify the metadata has been set. - // TODO(GH-40025): Use `AzureFileSystem` to fetch metadata for this assertion. + // TODO(GH-40025): Use `AzureFileSystem` to fetch metadata for this assertion. auto blob_metadata = blob_service_client_->GetBlobContainerClient(data.container_name) .GetBlockBlobClient(blob_path) .GetProperties() @@ -2471,7 +2471,7 @@ TEST_F(TestAzuriteFileSystem, WriteMetadata) { full_path, /*metadata=*/arrow::key_value_metadata({{"bar", "foo"}}))); ASSERT_OK(output->Write(expected)); ASSERT_OK(output->Close()); - // TODO(GH-40025): Use `AzureFileSystem` to fetch metadata for this assertion. + // TODO(GH-40025): Use `AzureFileSystem` to fetch metadata for this assertion. 
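The TODO lines above exist because AzureFileSystem cannot yet surface user metadata (GH-40025 on the C++ side, GH-40026 on the Python side), which is also why the Python test above skips its read-back assertion. The write path itself already goes through the generic filesystem API; a minimal Python sketch, where `fs` is assumed to be an AzureFileSystem and the path is illustrative only:

    # Sketch only: `fs` is assumed to be a pyarrow AzureFileSystem and the
    # 'pyarrow-filesystem' container is assumed to already exist.
    metadata = {'Content-Type': 'x-pyarrow/test'}
    with fs.open_output_stream('pyarrow-filesystem/example',
                               metadata=metadata) as f:
        f.write(b'some data')
    # Reading user metadata back (f.metadata() on an input stream) is what
    # GH-40026 tracks; currently only system metadata is returned.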
blob_metadata = blob_service_client_->GetBlobContainerClient(data.container_name) .GetBlockBlobClient(blob_path) .GetProperties() From d9cf2eab46097e6f78c785df4d45e4482a0ac3a7 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 10 Feb 2024 21:13:54 +0000 Subject: [PATCH 19/36] Update cpp/src/arrow/util/config.h.cmake Co-authored-by: Sutou Kouhei --- cpp/src/arrow/util/config.h.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/config.h.cmake b/cpp/src/arrow/util/config.h.cmake index f93c888de27..9fbd685084f 100644 --- a/cpp/src/arrow/util/config.h.cmake +++ b/cpp/src/arrow/util/config.h.cmake @@ -52,8 +52,8 @@ #cmakedefine ARROW_PARQUET #cmakedefine ARROW_SUBSTRAIT -#cmakedefine ARROW_ENABLE_THREADING #cmakedefine ARROW_AZURE +#cmakedefine ARROW_ENABLE_THREADING #cmakedefine ARROW_GCS #cmakedefine ARROW_HDFS #cmakedefine ARROW_S3 From 26a4632f6e4253bbbc5a9d94482a22282211a1ce Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 11 Feb 2024 21:34:46 +0000 Subject: [PATCH 20/36] Update python/pyarrow/_azurefs.pyx Co-authored-by: Sutou Kouhei --- python/pyarrow/_azurefs.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx index be277dfc65f..de0fedae0f9 100644 --- a/python/pyarrow/_azurefs.pyx +++ b/python/pyarrow/_azurefs.pyx @@ -30,7 +30,7 @@ cdef class AzureFileSystem(FileSystem): CAzureFileSystem* azurefs c_string account_key - def __init__(self, *, account_name, account_key=None, blob_storage_authority=None, + def __init__(self, account_name, *, account_key=None, blob_storage_authority=None, dfs_storage_authority=None, blob_storage_scheme=None, dfs_storage_scheme=None): cdef: From cb0aefd54c7179efad0807e1628af4b29512c91d Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Mon, 12 Feb 2024 00:26:39 +0000 Subject: [PATCH 21/36] Write a docstring --- python/pyarrow/_azurefs.pyx | 51 +++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx index de0fedae0f9..61b91949e19 100644 --- a/python/pyarrow/_azurefs.pyx +++ b/python/pyarrow/_azurefs.pyx @@ -26,6 +26,57 @@ from pyarrow._fs cimport FileSystem cdef class AzureFileSystem(FileSystem): + """ + Azure Blob Storage backed FileSystem implementation + + This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a. + Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific + features will be used when they provide a performance advantage. + + The storage account is considered the root of the filesystem. When enabled containers + will be created or deleted during relevant directory operations. Obviously, this also + requires authentication with the additional permissions. + + By default [DefaultAzureCredential](https://github.com/Azure/azure-sdk-for-cpp/blob/main/sdk/identity/azure-identity/README.md#defaultazurecredential) + is used for authentication. This means it will try several types of authentication + and go with the first one that works. If any auth paramters are provided when + initialising the FileSysem, they will be used instead of the default credential. + + Parameters + ---------- + account_name : str + Azure Blob Storage account name. This is the globally unique identifier for the + storage account. + account_key : str, default None + Account key of the storage account. Pass None to use default credential. 
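The account_name and account_key parameters documented above select the authentication mode described earlier: omitting all credentials falls back to the DefaultAzureCredential chain, while passing a key switches to shared-key auth, as shown in the sketch below (account name and key are placeholders, not part of the patch):

    from pyarrow.fs import AzureFileSystem

    # No authentication parameters: the DefaultAzureCredential chain is tried.
    fs_default = AzureFileSystem(account_name='myaccount')

    # Passing account_key switches to shared-key authentication.
    fs_shared_key = AzureFileSystem(account_name='myaccount',
                                    account_key='<base64-encoded key>')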
+ blob_storage_authority : str, default None + hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful + for connecting to a local emulator, like azurite. + dfs_storage_authority : str, default None + hostname[:port] of the Data Lake Gen 2 Service. Defaults to + `.dfs.core.windows.net`. Useful for connecting to a local emulator, like azurite. + blob_storage_authority : str, default None + Either `http` or `https`. Defaults to `https`. Useful for connecting to a local + emulator, like azurite. + dfs_storage_authority : str, default None + Either `http` or `https`. Defaults to `https`. Useful for connecting to a local + emulator, like azurite. + + Examples + -------- + >>> from pyarrow import fs + >>> azure_fs = fs.AzureFileSystem(account_name='myaccount') + >>> azurite_fs = fs.AzureFileSystem( + ... account_name='devstoreaccount1', + ... account_key='Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', + ... blob_storage_authority='127.0.0.1:10000', + ... dfs_storage_authority='127.0.0.1:10000', + ... blob_storage_scheme='http', + ... dfs_storage_scheme='http', + ... ) + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. + """ cdef: CAzureFileSystem* azurefs c_string account_key From 02a2233010a3219248f995a20ab0e462b7d3f8a1 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Mon, 12 Feb 2024 18:40:08 +0000 Subject: [PATCH 22/36] Update docstring to mention `/` as only supported delimiter --- python/pyarrow/_azurefs.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx index 61b91949e19..1e558d51c81 100644 --- a/python/pyarrow/_azurefs.pyx +++ b/python/pyarrow/_azurefs.pyx @@ -31,7 +31,8 @@ cdef class AzureFileSystem(FileSystem): This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a. Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific - features will be used when they provide a performance advantage. + features will be used when they provide a performance advantage. Note: `/` is the + only supported delimiter. The storage account is considered the root of the filesystem. When enabled containers will be created or deleted during relevant directory operations. Obviously, this also From ca9737024a42e786a556dced0e67e1ad83a70323 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Mon, 12 Feb 2024 21:10:13 +0000 Subject: [PATCH 23/36] Capitalise Azurite --- python/pyarrow/_azurefs.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx index 1e558d51c81..a479c126646 100644 --- a/python/pyarrow/_azurefs.pyx +++ b/python/pyarrow/_azurefs.pyx @@ -52,16 +52,16 @@ cdef class AzureFileSystem(FileSystem): Account key of the storage account. Pass None to use default credential. blob_storage_authority : str, default None hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful - for connecting to a local emulator, like azurite. + for connecting to a local emulator, like Azurite. dfs_storage_authority : str, default None hostname[:port] of the Data Lake Gen 2 Service. Defaults to - `.dfs.core.windows.net`. Useful for connecting to a local emulator, like azurite. + `.dfs.core.windows.net`. Useful for connecting to a local emulator, like Azurite. blob_storage_authority : str, default None Either `http` or `https`. Defaults to `https`. 
Useful for connecting to a local - emulator, like azurite. + emulator, like Azurite. dfs_storage_authority : str, default None Either `http` or `https`. Defaults to `https`. Useful for connecting to a local - emulator, like azurite. + emulator, like Azurite. Examples -------- From 744b119c03d364695b58b9401360180a83ece45a Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 13 Feb 2024 13:44:28 +0000 Subject: [PATCH 24/36] Update docstring with PR comments Co-authored-by: Joris Van den Bossche --- python/pyarrow/_azurefs.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx index a479c126646..6d81dfd03a7 100644 --- a/python/pyarrow/_azurefs.pyx +++ b/python/pyarrow/_azurefs.pyx @@ -34,13 +34,13 @@ cdef class AzureFileSystem(FileSystem): features will be used when they provide a performance advantage. Note: `/` is the only supported delimiter. - The storage account is considered the root of the filesystem. When enabled containers + The storage account is considered the root of the filesystem. When enabled, containers will be created or deleted during relevant directory operations. Obviously, this also requires authentication with the additional permissions. - By default [DefaultAzureCredential](https://github.com/Azure/azure-sdk-for-cpp/blob/main/sdk/identity/azure-identity/README.md#defaultazurecredential) + By default `DefaultAzureCredential <https://github.com/Azure/azure-sdk-for-cpp/blob/main/sdk/identity/azure-identity/README.md#defaultazurecredential>`__ is used for authentication. This means it will try several types of authentication - and go with the first one that works. If any auth paramters are provided when + and go with the first one that works. If any authentication parameters are provided when initialising the FileSysem, they will be used instead of the default credential. Parameters ---------- From fc0940a289a6855fe7dbf4d6fb00465fbe980a98 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 13 Feb 2024 13:51:08 +0000 Subject: [PATCH 25/36] Add comment about azurite credentials --- python/pyarrow/tests/test_fs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 1a7aa9d0f5b..ba7b8d347a8 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -302,6 +302,8 @@ def azurefs(request, azure_server): container = 'pyarrow-filesystem/' + # Use the standard azurite account_name and account_key. 
+ # https://learn.microsoft.com/en-us/azure/storage/common/storage-use-emulator#authorize-with-shared-key-credentials fs = AzureFileSystem(account_name='devstoreaccount1', account_key='Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuF' 'q2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', From 6b83cca3c756a112e7c9d3e9c546114599a33094 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 13 Feb 2024 13:56:12 +0000 Subject: [PATCH 26/36] Enable Azure tests whenever AzureFileSystem can be imported --- python/pyarrow/conftest.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/conftest.py b/python/pyarrow/conftest.py index 00f2c7d3ded..2ac8427de17 100644 --- a/python/pyarrow/conftest.py +++ b/python/pyarrow/conftest.py @@ -144,13 +144,18 @@ except ImportError: pass +try: + from pyarrow.fs import AzureFileSystem # noqa + defaults['azure'] = True +except ImportError: + pass + try: from pyarrow.fs import GcsFileSystem # noqa defaults['gcs'] = True except ImportError: pass - try: from pyarrow.fs import S3FileSystem # noqa defaults['s3'] = True From 7c842256228c308e16306c5b261daef3b1b98683 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 14 Feb 2024 01:12:36 +0000 Subject: [PATCH 27/36] Docstring correction --- python/pyarrow/_azurefs.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx index 6d81dfd03a7..f29cceaf46c 100644 --- a/python/pyarrow/_azurefs.pyx +++ b/python/pyarrow/_azurefs.pyx @@ -56,10 +56,10 @@ cdef class AzureFileSystem(FileSystem): dfs_storage_authority : str, default None hostname[:port] of the Data Lake Gen 2 Service. Defaults to `.dfs.core.windows.net`. Useful for connecting to a local emulator, like Azurite. - blob_storage_authority : str, default None + blob_storage_scheme : str, default None Either `http` or `https`. Defaults to `https`. Useful for connecting to a local emulator, like Azurite. - dfs_storage_authority : str, default None + dfs_storage_scheme : str, default None Either `http` or `https`. Defaults to `https`. Useful for connecting to a local emulator, like Azurite. From b8ae75a2df2bd765d5421119cd349ccaad756313 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 14 Feb 2024 08:58:08 +0000 Subject: [PATCH 28/36] Move account_name and account_key to `azure_server` fixture --- python/pyarrow/tests/conftest.py | 6 +++++- python/pyarrow/tests/test_fs.py | 9 +++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 61c3635ca5d..0d3d22968a8 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -276,7 +276,11 @@ def azure_server(tmpdir_factory): pytest.skip(f"Command {args} failed to execute: {e}") else: yield { - 'connection': ('127.0.0.1', port), + # Use the standard azurite account_name and account_key. 
+ # https://learn.microsoft.com/en-us/azure/storage/common/storage-use-emulator#authorize-with-shared-key-credentials + 'connection': ('127.0.0.1', port, 'devstoreaccount1', + 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2' + 'UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=='), 'process': proc, 'tempdir': tmpdir, } diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index ba7b8d347a8..a65f05a588d 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -296,17 +296,14 @@ def azurefs(request, azure_server): request.config.pyarrow.requires('azure') from pyarrow.fs import AzureFileSystem - host, port = azure_server['connection'] + host, port, account_name, account_key = azure_server['connection'] azureite_authority = f"{host}:{port}" azureite_scheme = "http" container = 'pyarrow-filesystem/' - # Use the standard azurite account_name and account_key. - # https://learn.microsoft.com/en-us/azure/storage/common/storage-use-emulator#authorize-with-shared-key-credentials - fs = AzureFileSystem(account_name='devstoreaccount1', - account_key='Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuF' - 'q2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', + fs = AzureFileSystem(account_name=account_name, + account_key=account_key, blob_storage_authority=azureite_authority, dfs_storage_authority=azureite_authority, blob_storage_scheme=azureite_scheme, From 3d7717a21138a684878e76fa8ecddfa1b2b35205 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Mon, 19 Feb 2024 00:06:44 +0000 Subject: [PATCH 29/36] Only run blob emulator not queue or table --- python/pyarrow/tests/conftest.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 0d3d22968a8..57bc3c8fc66 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -261,11 +261,8 @@ def azure_server(tmpdir_factory): port = find_free_port() env = os.environ.copy() tmpdir = tmpdir_factory.getbasetemp() - # Port 0 means azurite will select any free port. We don't need to connect - # to the queue or table services, we just need them to not conflict with - # in use ports. - args = ['azurite', "--location", tmpdir, "--blobPort", str(port), - "--queuePort", "0", "--tablePort", "0"] + # We only need blob service emulator, not queue or table. + args = ['azurite-blob', "--location", tmpdir, "--blobPort", str(port)] proc = None try: proc = subprocess.Popen(args, env=env) From 0174285adbcbefcaec18a420c712a0830a331b2c Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Mon, 19 Feb 2024 00:06:53 +0000 Subject: [PATCH 30/36] Mention azurite support in docstring --- python/pyarrow/_azurefs.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx index f29cceaf46c..a670e45f66a 100644 --- a/python/pyarrow/_azurefs.pyx +++ b/python/pyarrow/_azurefs.pyx @@ -31,8 +31,8 @@ cdef class AzureFileSystem(FileSystem): This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a. Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific - features will be used when they provide a performance advantage. Note: `/` is the - only supported delimiter. + features will be used when they provide a performance advantage. Azurite emulator is + also supported. Note: `/` is the only supported delimiter. The storage account is considered the root of the filesystem. 
When enabled, containers will be created or deleted during relevant directory operations. Obviously, this also requires authentication with the additional permissions. From 514c5ddfea9669918fc21ea27ca1fa7866860a4b Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 22 Feb 2024 23:20:15 +0000 Subject: [PATCH 31/36] Set allow_move_dir=True on Azure tests --- python/pyarrow/tests/test_fs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index a65f05a588d..5285c92d1b4 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -314,8 +314,7 @@ def azurefs(request, azure_server): yield dict( fs=fs, pathfn=container.__add__, - # AzureFileSystem will only support this in hierachical namespace accounts. - allow_move_dir=False, + allow_move_dir=True, allow_append_to_file=True, ) fs.delete_dir(container) From 30fae587b1f801b7e905d91cf7fb9919278ac073 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 27 Feb 2024 16:24:42 +0000 Subject: [PATCH 32/36] Spelling --- python/pyarrow/_azurefs.pyx | 2 +- python/pyarrow/tests/test_fs.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx index a670e45f66a..5cd6300c18c 100644 --- a/python/pyarrow/_azurefs.pyx +++ b/python/pyarrow/_azurefs.pyx @@ -41,7 +41,7 @@ cdef class AzureFileSystem(FileSystem): By default `DefaultAzureCredential <https://github.com/Azure/azure-sdk-for-cpp/blob/main/sdk/identity/azure-identity/README.md#defaultazurecredential>`__ is used for authentication. This means it will try several types of authentication and go with the first one that works. If any authentication parameters are provided when - initialising the FileSysem, they will be used instead of the default credential. + initialising the FileSystem, they will be used instead of the default credential. Parameters ---------- diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 5285c92d1b4..6a58743efa1 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -297,17 +297,17 @@ def azurefs(request, azure_server): from pyarrow.fs import AzureFileSystem host, port, account_name, account_key = azure_server['connection'] - azureite_authority = f"{host}:{port}" - azureite_scheme = "http" + azurite_authority = f"{host}:{port}" + azurite_scheme = "http" container = 'pyarrow-filesystem/' fs = AzureFileSystem(account_name=account_name, account_key=account_key, - blob_storage_authority=azureite_authority, - dfs_storage_authority=azureite_authority, - blob_storage_scheme=azureite_scheme, - dfs_storage_scheme=azureite_scheme) + blob_storage_authority=azurite_authority, + dfs_storage_authority=azurite_authority, + blob_storage_scheme=azurite_scheme, + dfs_storage_scheme=azurite_scheme) fs.create_dir(container) From 480193398fbc38cac9a4f52d9159d1eb6812a348 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 27 Feb 2024 16:26:43 +0000 Subject: [PATCH 33/36] Re-order list --- python/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/setup.py b/python/setup.py index 869d102a754..b8b1ff2776a 100755 --- a/python/setup.py +++ b/python/setup.py @@ -105,8 +105,6 @@ def run(self): 'build type (debug or release), default release'), ('boost-namespace=', None, 'namespace of boost (default: boost)'), - ('with-azure', None, - 'build the Azure Blob Storage extension'), ('with-cuda', None, 'build the Cuda extension'), ('with-flight', None, 'build the Flight extension'), ('with-substrait', None, 'build the Substrait extension'), @@ -115,6 +113,8 @@ def run(self): ('with-parquet', 
None, 'build the Parquet extension'), ('with-parquet-encryption', None, 'build the Parquet encryption extension'), + ('with-azure', None, + 'build the Azure Blob Storage extension'), ('with-gcs', None, 'build the Google Cloud Storage (GCS) extension'), ('with-s3', None, 'build the Amazon S3 extension'), From 0bb64bfc492d9554c80426e7f051266a91175501 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 27 Feb 2024 17:18:37 +0000 Subject: [PATCH 34/36] Use skip instead of xfail --- python/pyarrow/tests/test_fs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 6a58743efa1..845f1eccecc 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -503,7 +503,7 @@ def skip_fsspec_s3fs(fs): def skip_azure(fs, reason): if fs.type_name == "abfs": - pytest.xfail(reason=reason) + pytest.skip(reason=reason) @pytest.mark.s3 From 7ad9287b41d4673dacda05e19a20ae030d64a22d Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 28 Feb 2024 17:38:45 +0000 Subject: [PATCH 35/36] Re-order list --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index b8b1ff2776a..ce6996dc1d4 100755 --- a/python/setup.py +++ b/python/setup.py @@ -206,12 +206,12 @@ def initialize_options(self): '_dataset_orc', '_dataset_parquet', '_acero', - '_azurefs', '_feather', '_parquet', '_parquet_encryption', '_pyarrow_cpp_tests', '_orc', + '_azurefs', '_gcsfs', '_s3fs', '_substrait', From 20e7a31f1991dd5e5d1ef416026a13ca020e24c6 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 29 Feb 2024 10:24:36 +0000 Subject: [PATCH 36/36] Explicitly set ARROW_AZURE=OFF instead of leaving default --- ci/docker/alpine-linux-3.16-cpp.dockerfile | 1 + ci/docker/fedora-39-cpp.dockerfile | 1 + ci/docker/linux-apt-docs.dockerfile | 1 + ci/docker/ubuntu-20.04-cpp-minimal.dockerfile | 1 + ci/docker/ubuntu-22.04-cpp-minimal.dockerfile | 1 + 5 files changed, 5 insertions(+) diff --git a/ci/docker/alpine-linux-3.16-cpp.dockerfile b/ci/docker/alpine-linux-3.16-cpp.dockerfile index 8828e717a53..72489c6eae1 100644 --- a/ci/docker/alpine-linux-3.16-cpp.dockerfile +++ b/ci/docker/alpine-linux-3.16-cpp.dockerfile @@ -74,6 +74,7 @@ COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_gcs_testbench.sh default ENV ARROW_ACERO=ON \ + ARROW_AZURE=OFF \ ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ ARROW_DEPENDENCY_SOURCE=SYSTEM \ diff --git a/ci/docker/fedora-39-cpp.dockerfile b/ci/docker/fedora-39-cpp.dockerfile index c8e98bdd00b..59db84034be 100644 --- a/ci/docker/fedora-39-cpp.dockerfile +++ b/ci/docker/fedora-39-cpp.dockerfile @@ -80,6 +80,7 @@ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin # Python process explicitly if we use LLVM 17 or later. 
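The CI images in this patch pin ARROW_AZURE explicitly rather than relying on the default. Whether a given pyarrow build actually shipped the Azure bindings can be checked at runtime with the same import probe these patches add to conftest.py; a small sketch:

    # Probe for Azure support the way pyarrow/conftest.py does: the extension
    # module fails to import when pyarrow was built without PYARROW_BUILD_AZURE.
    try:
        from pyarrow.fs import AzureFileSystem  # noqa
        azure_enabled = True
    except ImportError:
        azure_enabled = False
    print('AzureFileSystem:', 'Enabled' if azure_enabled else '-')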
ENV absl_SOURCE=BUNDLED \ ARROW_ACERO=ON \ + ARROW_AZURE=OFF \ ARROW_BUILD_TESTS=ON \ ARROW_DEPENDENCY_SOURCE=SYSTEM \ ARROW_DATASET=ON \ diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index 93412ca81cd..c424d04653d 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -96,6 +96,7 @@ RUN /arrow/ci/scripts/r_deps.sh /arrow && \ R -e "install.packages('pkgdown')" ENV ARROW_ACERO=ON \ + ARROW_AZURE=OFF \ ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ ARROW_BUILD_UTILITIES=OFF \ diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile index 3df895b427c..ae2ba9421cd 100644 --- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile @@ -76,6 +76,7 @@ COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin ENV ARROW_ACERO=ON \ + ARROW_AZURE=OFF \ ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ ARROW_FLIGHT=ON \ diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile index 7eba541a63a..dd887a6d00c 100644 --- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile @@ -76,6 +76,7 @@ COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin ENV ARROW_ACERO=ON \ + ARROW_AZURE=OFF \ ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ ARROW_FLIGHT=ON \
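Taken together, these patches wire the Azure bindings from the C++ filesystem through the Cython layer, setup.py, the test suite, and CI. As a closing illustration, a self-contained sketch of the Azurite-backed flow that the azure_server and azurefs fixtures implement; the fixed port and the sleep-based readiness wait are simplifications, and the credentials are the well-known Azurite development defaults quoted in conftest.py:

    import subprocess
    import tempfile
    import time

    from pyarrow.fs import AzureFileSystem

    # Start the Azurite blob emulator, as the azure_server fixture does.
    tmpdir = tempfile.mkdtemp()
    port = 10000
    proc = subprocess.Popen(['azurite-blob', '--location', tmpdir,
                             '--blobPort', str(port)])
    time.sleep(2)  # crude stand-in for waiting until the emulator is ready

    try:
        fs = AzureFileSystem(
            account_name='devstoreaccount1',
            account_key='Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuF'
                        'q2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==',
            blob_storage_authority=f'127.0.0.1:{port}',
            dfs_storage_authority=f'127.0.0.1:{port}',
            blob_storage_scheme='http',
            dfs_storage_scheme='http')

        # The storage account is the filesystem root, so directory
        # operations at the top level create and delete containers.
        fs.create_dir('pyarrow-filesystem/')
        with fs.open_output_stream('pyarrow-filesystem/hello.txt') as f:
            f.write(b'hello azure')
        fs.delete_dir('pyarrow-filesystem/')
    finally:
        proc.terminate()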