diff --git a/cpp/cmake_modules/FindLLVMAlt.cmake b/cpp/cmake_modules/FindLLVMAlt.cmake
index ab399171c1f..7695c09ae8c 100644
--- a/cpp/cmake_modules/FindLLVMAlt.cmake
+++ b/cpp/cmake_modules/FindLLVMAlt.cmake
@@ -68,9 +68,11 @@ endif()
 mark_as_advanced(CLANG_EXECUTABLE LLVM_LINK_EXECUTABLE)
 
-find_package_handle_standard_args(LLVMAlt REQUIRED_VARS
-                                  # The first variable is used for display.
-                                  LLVM_PACKAGE_VERSION CLANG_EXECUTABLE LLVM_FOUND
+find_package_handle_standard_args(LLVMAlt
+                                  REQUIRED_VARS # The first variable is used for display.
+                                                LLVM_PACKAGE_VERSION
+                                                CLANG_EXECUTABLE
+                                                LLVM_FOUND
                                   LLVM_LINK_EXECUTABLE)
 
 if(LLVMAlt_FOUND)
   message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 88e7f2927cb..ca227b7030e 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -694,7 +694,7 @@ set(Boost_ADDITIONAL_VERSIONS
     "1.60.0"
     "1.60")
 
-# Thrift needs Boost if we're building the bundled version,
+# Thrift needs Boost if we're building a bundled version older than 0.13,
 # so we first need to determine whether we're building it
 if(ARROW_WITH_THRIFT AND Thrift_SOURCE STREQUAL "AUTO")
   find_package(Thrift 0.11.0 MODULE COMPONENTS ${ARROW_THRIFT_REQUIRED_COMPONENTS})
@@ -703,19 +703,34 @@ if(ARROW_WITH_THRIFT AND Thrift_SOURCE STREQUAL "AUTO")
   endif()
 endif()
 
-# - Parquet requires boost only with gcc 4.8 (because of missing std::regex).
+# Thrift < 0.13 has a compile-time header dependency on Boost
+if(Thrift_SOURCE STREQUAL "BUNDLED" AND ARROW_THRIFT_BUILD_VERSION VERSION_LESS "0.13")
+  set(THRIFT_REQUIRES_BOOST TRUE)
+elseif(THRIFT_VERSION VERSION_LESS "0.13")
+  set(THRIFT_REQUIRES_BOOST TRUE)
+else()
+  set(THRIFT_REQUIRES_BOOST FALSE)
+endif()
+
+# Parquet requires Boost only with gcc 4.8 (because of missing std::regex).
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9")
+  set(PARQUET_REQUIRES_BOOST TRUE)
+else()
+  set(PARQUET_REQUIRES_BOOST FALSE)
+endif()
+
 # - Gandiva has a compile-time (header-only) dependency on Boost, not runtime.
 # - Tests need Boost at runtime.
 # - S3FS and Flight benchmarks need Boost at runtime.
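+# The feature checks above are combined into a single ARROW_BOOST_REQUIRED
+# flag below.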
 if(ARROW_BUILD_INTEGRATION
    OR ARROW_BUILD_TESTS
+   OR ARROW_GANDIVA
    OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS)
    OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS)
-   OR ARROW_GANDIVA
-   OR (ARROW_WITH_THRIFT AND Thrift_SOURCE STREQUAL "BUNDLED")
-   OR (ARROW_PARQUET
-       AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU"
-       AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9"))
+   OR (ARROW_WITH_THRIFT AND THRIFT_REQUIRES_BOOST)
+   OR (ARROW_PARQUET AND PARQUET_REQUIRES_BOOST))
   set(ARROW_BOOST_REQUIRED TRUE)
 else()
   set(ARROW_BOOST_REQUIRED FALSE)
diff --git a/dev/tasks/python-wheels/manylinux-test.sh b/dev/tasks/python-wheels/manylinux-test.sh
index bd751df7b46..fd5e2284b07 100755
--- a/dev/tasks/python-wheels/manylinux-test.sh
+++ b/dev/tasks/python-wheels/manylinux-test.sh
@@ -20,6 +20,7 @@ set -e
 
 export ARROW_TEST_DATA=/arrow/testing/data
+export PYARROW_TEST_CYTHON=OFF
 
 python --version
 
 # Install built wheel
@@ -44,5 +45,4 @@ import pyarrow.fs
 import pyarrow._hdfs
 import pyarrow.dataset
 import pyarrow.flight
-import pyarrow.gandiva
 "
diff --git a/dev/tasks/python-wheels/osx-build.sh b/dev/tasks/python-wheels/osx-build.sh
index aea4c62e00b..af54a02c016 100755
--- a/dev/tasks/python-wheels/osx-build.sh
+++ b/dev/tasks/python-wheels/osx-build.sh
@@ -52,8 +52,9 @@ function build_wheel {
 
     pip install $(pip_opts) -r python/requirements-wheel-build.txt
 
-    export PYARROW_WITH_GANDIVA=1
-    export BUILD_ARROW_GANDIVA=ON
+    export PYARROW_INSTALL_TESTS=1
+    export PYARROW_WITH_GANDIVA=0
+    export BUILD_ARROW_GANDIVA=OFF
 
     git submodule update --init
     export ARROW_TEST_DATA=`pwd`/testing/data
@@ -137,6 +138,8 @@ function install_wheel {
 
 function run_unit_tests {
     pushd $1
 
+    export PYARROW_TEST_CYTHON=OFF
+
     # Install test dependencies
     pip install $(pip_opts) -r python/requirements-wheel-test.txt
@@ -156,6 +159,5 @@ import pyarrow.fs
 import pyarrow._hdfs
 import pyarrow.dataset
 import pyarrow.flight
-import pyarrow.gandiva
 "
 }
diff --git a/dev/tasks/python-wheels/win-build.bat b/dev/tasks/python-wheels/win-build.bat
index b7bd25faefb..00e69642351 100644
--- a/dev/tasks/python-wheels/win-build.bat
+++ b/dev/tasks/python-wheels/win-build.bat
@@ -17,10 +17,14 @@
 
 @echo on
 
+@rem Building Gandiva in the wheels is disabled for now to make the wheels
+@rem smaller.
+
+@rem --file=%ARROW_SRC%\ci\conda_env_gandiva.yml ^
+
 @rem create conda environment for compiling
 call conda create -n wheel-build -q -y -c conda-forge ^
     --file=%ARROW_SRC%\ci\conda_env_cpp.yml ^
-    --file=%ARROW_SRC%\ci\conda_env_gandiva.yml ^
     "vs2015_runtime<14.16" ^
     python=%PYTHON_VERSION% || exit /B
 
@@ -61,7 +65,7 @@ cmake -G "%GENERATOR%" ^
       -DARROW_FLIGHT=ON ^
       -DARROW_PYTHON=ON ^
       -DARROW_PARQUET=ON ^
-      -DARROW_GANDIVA=ON ^
+      -DARROW_GANDIVA=OFF ^
      -DARROW_MIMAllOC=ON ^
       -DZSTD_SOURCE=BUNDLED ^
       .. || exit /B
@@ -70,9 +74,10 @@ popd
 
 set PYARROW_BUILD_TYPE=Release
 set PYARROW_PARALLEL=8
+set PYARROW_INSTALL_TESTS=1
 set PYARROW_WITH_DATASET=1
 set PYARROW_WITH_FLIGHT=1
-set PYARROW_WITH_GANDIVA=1
+set PYARROW_WITH_GANDIVA=0
 set PYARROW_WITH_PARQUET=1
 set PYARROW_WITH_STATIC_BOOST=1
 set PYARROW_BUNDLE_ARROW_CPP=1
@@ -96,7 +101,6 @@ set ARROW_TEST_DATA=%ARROW_SRC%\testing\data
 %PYTHON_INTERPRETER% -c "import pyarrow" || exit /B
 %PYTHON_INTERPRETER% -c "import pyarrow.parquet" || exit /B
 %PYTHON_INTERPRETER% -c "import pyarrow.flight" || exit /B
-%PYTHON_INTERPRETER% -c "import pyarrow.gandiva" || exit /B
 %PYTHON_INTERPRETER% -c "import pyarrow.dataset" || exit /B
 
 @rem run the python tests, but disable the cython because there is a linking
diff --git a/docs/source/python/extending.rst b/docs/source/python/extending.rst
index 284e0488e0c..6737b8d8633 100644
--- a/docs/source/python/extending.rst
+++ b/docs/source/python/extending.rst
@@ -401,3 +401,31 @@ Compile the extension:
 .. code-block:: bash
 
     python setup.py build_ext --inplace
+
+Building Extensions against PyPI Wheels
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Python wheels have the Arrow C++ libraries bundled in the top-level
+``pyarrow/`` install directory. On Linux and macOS, these libraries have an
+embedded ABI version like ``libarrow.so.17``, which means that linking with
+``-larrow`` using the linker path provided by ``pyarrow.get_library_dirs()``
+will not work right out of the box. To fix this, run
+``pyarrow.create_library_symlinks()`` once as a user with write access to the
+directory where pyarrow is installed. This function will attempt to create
+symlinks like ``pyarrow/libarrow.so``. For example:
+
+.. code-block:: bash
+
+    pip install pyarrow
+    python -c "import pyarrow; pyarrow.create_library_symlinks()"
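+
+Once the symlinks exist, the usual link flags work. As a rough sketch (the
+``my_extension`` file names are hypothetical; ``get_include()`` and
+``get_library_dirs()`` are existing pyarrow helpers):
+
+.. code-block:: bash
+
+    g++ my_extension.cc -o my_extension.so -shared -std=c++11 -fPIC \
+        -I$(python -c "import pyarrow; print(pyarrow.get_include())") \
+        -L$(python -c "import pyarrow; print(pyarrow.get_library_dirs()[0])") \
+        -larrow -larrow_python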
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 99a08df91f4..2a43b165d7e 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -225,16 +225,20 @@
 function(bundle_arrow_lib library_path)
   get_filename_component(LIBRARY_DIR ${${library_path}} DIRECTORY)
   get_filename_component(LIBRARY_NAME ${${library_path}} NAME_WE)
 
-  configure_file(
-    ${${library_path}}
-    ${BUILD_OUTPUT_ROOT_DIRECTORY}/${LIBRARY_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX} COPYONLY)
-  if(APPLE)
+  # Only copy the shared library with the ABI version on Linux and macOS
+
+  if(MSVC)
+    configure_file(
+      ${${library_path}}
+      ${BUILD_OUTPUT_ROOT_DIRECTORY}/${LIBRARY_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}
+      COPYONLY)
+  elseif(APPLE)
     configure_file(
       ${LIBRARY_DIR}/${LIBRARY_NAME}.${ARG_SO_VERSION}${CMAKE_SHARED_LIBRARY_SUFFIX}
       ${BUILD_OUTPUT_ROOT_DIRECTORY}/${LIBRARY_NAME}.${ARG_SO_VERSION}${CMAKE_SHARED_LIBRARY_SUFFIX}
       COPYONLY)
-  elseif(NOT MSVC)
+  else()
     configure_file(
       ${${library_path}}.${ARG_SO_VERSION}
       ${BUILD_OUTPUT_ROOT_DIRECTORY}/${LIBRARY_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}.${ARG_SO_VERSION}
diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh
index 7664d2b016b..2f1f4cc4fb4 100755
--- a/python/manylinux1/build_arrow.sh
+++ b/python/manylinux1/build_arrow.sh
@@ -75,12 +75,13 @@ touch ${CPYTHON_PATH}/lib/${py_libname}
 echo "=== (${PYTHON_VERSION}) Install the wheel build dependencies ==="
 $PIP install -r requirements-wheel-build.txt
 
+export PYARROW_INSTALL_TESTS=1
 export PYARROW_WITH_DATASET=1
 export PYARROW_WITH_FLIGHT=1
-export PYARROW_WITH_GANDIVA=1
+export PYARROW_WITH_GANDIVA=0
 export BUILD_ARROW_DATASET=ON
 export BUILD_ARROW_FLIGHT=ON
-export BUILD_ARROW_GANDIVA=ON
+export BUILD_ARROW_GANDIVA=OFF
 
 # ARROW-3052(wesm): ORC is being bundled until it can be added to the
 # manylinux1 image
@@ -120,7 +121,6 @@ cmake \
     -DCMAKE_INSTALL_PREFIX=/arrow-dist \
     -DOPENSSL_USE_STATIC_LIBS=ON \
     -DORC_SOURCE=BUNDLED \
-    -DPythonInterp_FIND_VERSION=${PYTHON_VERSION} \
     -GNinja /arrow/cpp
 ninja
 ninja install
@@ -158,7 +158,6 @@ import sys
 import pyarrow
 import pyarrow.dataset
 import pyarrow.flight
-import pyarrow.gandiva
 import pyarrow.fs
 import pyarrow._hdfs
 import pyarrow.parquet
diff --git a/python/manylinux201x/build_arrow.sh b/python/manylinux201x/build_arrow.sh
index 5c162c00181..a783998f80c 100755
--- a/python/manylinux201x/build_arrow.sh
+++ b/python/manylinux201x/build_arrow.sh
@@ -74,12 +74,13 @@ touch ${CPYTHON_PATH}/lib/${py_libname}
 echo "=== (${PYTHON_VERSION}) Install the wheel build dependencies ==="
 $PIP install -r requirements-wheel-build.txt
 
+export PYARROW_INSTALL_TESTS=1
 export PYARROW_WITH_DATASET=1
 export PYARROW_WITH_FLIGHT=1
-export PYARROW_WITH_GANDIVA=1
+export PYARROW_WITH_GANDIVA=0
 export BUILD_ARROW_DATASET=ON
 export BUILD_ARROW_FLIGHT=ON
-export BUILD_ARROW_GANDIVA=ON
+export BUILD_ARROW_GANDIVA=OFF
 
 # ARROW-3052(wesm): ORC is being bundled until it can be added to the
 # manylinux1 image
@@ -120,7 +121,6 @@ PATH="${CPYTHON_PATH}/bin:${PATH}" cmake \
     -DCMAKE_INSTALL_PREFIX=/arrow-dist \
     -DOPENSSL_USE_STATIC_LIBS=ON \
     -DORC_SOURCE=BUNDLED \
-    -DPythonInterp_FIND_VERSION=${PYTHON_VERSION} \
     -DZLIB_ROOT=/usr/local \
     -GNinja /arrow/cpp
 ninja install
@@ -153,7 +153,6 @@ $PYTHON_INTERPRETER -c "
 import pyarrow
 import pyarrow.dataset
 import pyarrow.flight
-import pyarrow.gandiva
 import pyarrow.fs
 import pyarrow._hdfs
 import pyarrow.parquet
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index e8b8c6eae2d..2a3d9131c07 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -287,6 +287,50 @@ def get_libraries():
     return ['arrow', 'arrow_python']
 
 
+def create_library_symlinks():
+    """
+    With Linux and macOS wheels, the bundled shared libraries have an embedded
+    ABI version like libarrow.so.17 or libarrow.17.dylib, so linking to them
+    with -larrow won't work unless we create symlinks at locations like
+    site-packages/pyarrow/libarrow.so. This unfortunate workaround addresses
+    prior problems we had with shipping two copies of the shared libraries to
+    permit third-party projects like turbodbc to build their C++ extensions
+    against the pyarrow wheels.
+
+    This function needs to be invoked only once and only when the shared
+    libraries are bundled with the Python package, which should only apply to
+    wheel-based installs. It requires write access to the site-packages/pyarrow
+    directory and so, depending on your system, may need to be run as root.
+    """
+    import glob
+    if _sys.platform == 'win32':
+        return
+    package_cwd = _os.path.dirname(__file__)
+
+    if _sys.platform == 'linux':
+        bundled_libs = glob.glob(_os.path.join(package_cwd, '*.so.*'))
+
+        def get_symlink_path(hard_path):
+            return hard_path.rsplit('.', 1)[0]
+    else:
+        bundled_libs = glob.glob(_os.path.join(package_cwd, '*.*.dylib'))
+
+        def get_symlink_path(hard_path):
+            return '.'.join((hard_path.split('.')[0], 'dylib'))
+
+    for lib_hard_path in bundled_libs:
+        symlink_path = get_symlink_path(lib_hard_path)
+        if _os.path.exists(symlink_path):
+            continue
+        try:
+            _os.symlink(lib_hard_path, symlink_path)
+        except PermissionError:
+            print("Tried creating symlink {}. If you need to link to "
+                  "bundled shared libraries, run "
+                  "pyarrow.create_library_symlinks() as root"
+                  .format(symlink_path))
+
+
 def get_library_dirs():
     """
     Return lists of directories likely to contain Arrow C++ libraries for
diff --git a/python/setup.py b/python/setup.py
index 453cc38e65e..3a699fe1093 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -119,6 +119,9 @@ def run(self):
                      'enable Cython code coverage'),
                     ('bundle-boost', None,
                      'bundle the (shared) Boost libraries'),
+                    ('bundle-cython-cpp', None,
+                     'bundle generated Cython C++ code '
+                     '(used for code coverage)'),
                     ('bundle-arrow-cpp', None,
                      'bundle the Arrow C++ libraries')] +
                    _build_ext.user_options)
@@ -170,6 +173,8 @@ def initialize_options(self):
             os.environ.get('PYARROW_GENERATE_COVERAGE', '0'))
         self.bundle_arrow_cpp = strtobool(
             os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0'))
+        self.bundle_cython_cpp = strtobool(
+            os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0'))
         self.bundle_boost = strtobool(
             os.environ.get('PYARROW_BUNDLE_BOOST', '0'))
@@ -279,7 +284,6 @@ def append_cmake_bool(value, varname):
             self.spawn(['cmake'] + extra_cmake_args + cmake_options + [source])
             print("-- Finished cmake for pyarrow")
 
-            # Do the build
             print("-- Running cmake --build for pyarrow")
             self.spawn(['cmake', '--build', '.', '--config', self.build_type] +
                        build_tool_args)
@@ -320,27 +324,15 @@ def append_cmake_bool(value, varname):
                     raise RuntimeError('pyarrow C-extension failed to build:',
                                        os.path.abspath(built_path))
 
-                cpp_generated_path = self.get_ext_generated_cpp_source(name)
-                if not os.path.exists(cpp_generated_path):
-                    raise RuntimeError('expected to find generated C++ file '
-                                       'in {0!r}'.format(cpp_generated_path))
-
-                # The destination path to move the generated C++ source to
-                # (for Cython source coverage)
-                cpp_path = pjoin(build_lib, self._get_build_dir(),
-                                 os.path.basename(cpp_generated_path))
-                if os.path.exists(cpp_path):
-                    os.remove(cpp_path)
-
                 # The destination path to move the built C extension to
                 ext_path = pjoin(build_lib, self._get_cmake_ext_path(name))
                 if os.path.exists(ext_path):
                     os.remove(ext_path)
                 self.mkpath(os.path.dirname(ext_path))
 
-                print('Moving generated C++ source', cpp_generated_path,
-                      'to build path', cpp_path)
-                shutil.move(cpp_generated_path, cpp_path)
+                if self.bundle_cython_cpp:
+                    self._bundle_cython_cpp(name, build_lib)
+
                 print('Moving built C-extension', built_path,
                       'to build path', ext_path)
                 shutil.move(built_path, ext_path)
@@ -352,40 +344,7 @@ def append_cmake_bool(value, varname):
                                     name + '_api.h'))
 
                 if self.bundle_arrow_cpp:
-                    print(pjoin(build_lib, 'pyarrow'))
-                    move_shared_libs(build_prefix, build_lib, "arrow")
-                    move_shared_libs(build_prefix, build_lib, "arrow_python")
-                    if self.with_cuda:
-                        move_shared_libs(build_prefix, build_lib, "arrow_cuda")
-                    if self.with_flight:
-                        move_shared_libs(build_prefix, build_lib, "arrow_flight")
-                        move_shared_libs(build_prefix, build_lib,
-                                         "arrow_python_flight")
-                    if self.with_dataset:
-                        move_shared_libs(build_prefix, build_lib, "arrow_dataset")
-                    if self.with_plasma:
-                        move_shared_libs(build_prefix, build_lib, "plasma")
-                    if self.with_gandiva:
-                        move_shared_libs(build_prefix, build_lib, "gandiva")
-                    if self.with_parquet and not self.with_static_parquet:
-                        move_shared_libs(build_prefix, build_lib, "parquet")
-                    if not self.with_static_boost and self.bundle_boost:
-                        move_shared_libs(
-                            build_prefix, build_lib,
-                            "{}_regex".format(self.boost_namespace),
-                            implib_required=False)
-                    if sys.platform == 'win32':
-                        # zlib uses zlib.dll for Windows
-                        zlib_lib_name = 'zlib'
-                        move_shared_libs(build_prefix, build_lib, zlib_lib_name,
-                                         implib_required=False)
-                        if self.with_flight:
-                            # DLL dependencies for gRPC / Flight
-                            for lib_name in ['cares', 'libprotobuf',
-                                             'libcrypto-1_1-x64',
-                                             'libssl-1_1-x64']:
-                                move_shared_libs(build_prefix, build_lib, lib_name,
-                                                 implib_required=False)
+                    self._bundle_arrow_cpp(build_prefix, build_lib)
 
                 if self.with_plasma:
                     # Move the plasma store
@@ -395,6 +354,58 @@ def append_cmake_bool(value, varname):
                                     "plasma-store-server")
                     shutil.move(source, target)
 
+    def _bundle_arrow_cpp(self, build_prefix, build_lib):
+        print(pjoin(build_lib, 'pyarrow'))
+        move_shared_libs(build_prefix, build_lib, "arrow")
+        move_shared_libs(build_prefix, build_lib, "arrow_python")
+        if self.with_cuda:
+            move_shared_libs(build_prefix, build_lib, "arrow_cuda")
+        if self.with_flight:
+            move_shared_libs(build_prefix, build_lib, "arrow_flight")
+            move_shared_libs(build_prefix, build_lib,
+                             "arrow_python_flight")
+        if self.with_dataset:
+            move_shared_libs(build_prefix, build_lib, "arrow_dataset")
+        if self.with_plasma:
+            move_shared_libs(build_prefix, build_lib, "plasma")
+        if self.with_gandiva:
+            move_shared_libs(build_prefix, build_lib, "gandiva")
+        if self.with_parquet and not self.with_static_parquet:
+            move_shared_libs(build_prefix, build_lib, "parquet")
+        if not self.with_static_boost and self.bundle_boost:
+            move_shared_libs(
+                build_prefix, build_lib,
+                "{}_regex".format(self.boost_namespace),
+                implib_required=False)
+        if sys.platform == 'win32':
+            # zlib uses zlib.dll for Windows
+            zlib_lib_name = 'zlib'
+            move_shared_libs(build_prefix, build_lib, zlib_lib_name,
+                             implib_required=False)
+            if self.with_flight:
+                # DLL dependencies for gRPC / Flight
+                for lib_name in ['cares', 'libprotobuf',
+                                 'libcrypto-1_1-x64',
+                                 'libssl-1_1-x64']:
+                    move_shared_libs(build_prefix, build_lib, lib_name,
+                                     implib_required=False)
+
+    def _bundle_cython_cpp(self, name, lib_path):
+        cpp_generated_path = self.get_ext_generated_cpp_source(name)
+        if not os.path.exists(cpp_generated_path):
+            raise RuntimeError('expected to find generated C++ file '
+                               'in {0!r}'.format(cpp_generated_path))
+
+        # The destination path to move the generated C++ source to
+        # (for Cython source coverage)
+        cpp_path = pjoin(lib_path, self._get_build_dir(),
+                         os.path.basename(cpp_generated_path))
+        if os.path.exists(cpp_path):
+            os.remove(cpp_path)
+        print('Moving generated C++ source', cpp_generated_path,
+              'to build path', cpp_path)
+        shutil.move(cpp_generated_path, cpp_path)
+
     def _failure_permitted(self, name):
         if name == '_parquet' and not self.with_parquet:
             return True
@@ -497,17 +508,15 @@ def _move_shared_libs_unix(build_prefix, build_lib, lib_name):
         raise Exception('Could not find library:' + lib_filename +
                         ' in ' + build_prefix)
 
-    # Longest suffix library should be copied, all others symlinked
+    # Longest suffix library should be copied; all others are ignored and can
+    # be symlinked later, after the library has been installed
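+    # (e.g. libarrow.so.17 is moved into the wheel; the plain libarrow.so
+    # link can be recreated after installation by create_library_symlinks())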
     libs.sort(key=lambda s: -len(s))
     print(libs, libs[0])
     lib_filename = os.path.basename(libs[0])
     shutil.move(pjoin(build_prefix, lib_filename),
                 pjoin(build_lib, 'pyarrow', lib_filename))
-    for lib in libs[1:]:
-        filename = os.path.basename(lib)
-        link_name = pjoin(build_lib, 'pyarrow', filename)
-        if not os.path.exists(link_name):
-            os.symlink(lib_filename, link_name)
 
 
 # If the event of not running from a git clone (e.g. from a git archive
@@ -560,9 +567,17 @@ def has_ext_modules(foo):
 
 setup_requires = []
 
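+# Ship the pyarrow.tests package only when PYARROW_INSTALL_TESTS is enabled;
+# it defaults to on, and the wheel build scripts set it explicitly.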
+if strtobool(os.environ.get('PYARROW_INSTALL_TESTS', '1')):
+    packages = ['pyarrow', 'pyarrow.tests']
+else:
+    packages = ['pyarrow']
+
+
 setup(
     name='pyarrow',
-    packages=['pyarrow', 'pyarrow.tests'],
+    packages=packages,
     zip_safe=False,
     package_data={'pyarrow': ['*.pxd', '*.pyx', 'includes/*.pxd']},
     include_package_data=True,