diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7d1f9e167d4..22c6e9a7acb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -88,6 +88,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Build the Arrow jemalloc-based allocator" ON) + option(ARROW_JEMALLOC_USE_SHARED + "Rely on jemalloc shared libraries where relevant" + ON) + option(ARROW_BOOST_USE_SHARED "Rely on boost shared libraries where relevant" ON) @@ -103,6 +107,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(ARROW_BUILD_UTILITIES "Build Arrow commandline utilities" ON) + + option(ARROW_RPATH_ORIGIN + "Build Arrow libraries with RPATH set to \$ORIGIN" + OFF) endif() if(ARROW_BUILD_TESTS) diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 9de9de516f9..2da8a05c9c4 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -53,11 +53,21 @@ function(ADD_ARROW_LIB LIB_NAME) LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}" LINK_FLAGS "${ARG_SHARED_LINK_FLAGS}" OUTPUT_NAME ${LIB_NAME}) - target_link_libraries(${LIB_NAME}_shared + target_link_libraries(${LIB_NAME}_shared LINK_PUBLIC ${ARG_SHARED_LINK_LIBS} LINK_PRIVATE ${ARG_SHARED_PRIVATE_LINK_LIBS}) + + if (ARROW_RPATH_ORIGIN) + if (APPLE) + set(_lib_install_rpath "@loader_path") + else() + set(_lib_install_rpath "\$ORIGIN") + endif() + set_target_properties(${LIB_NAME}_shared PROPERTIES + INSTALL_RPATH ${_lib_install_rpath}) + endif() - install(TARGETS ${LIB_NAME}_shared + install(TARGETS ${LIB_NAME}_shared LIBRARY DESTINATION lib ARCHIVE DESTINATION lib) endif() diff --git a/cpp/src/arrow/jemalloc/CMakeLists.txt b/cpp/src/arrow/jemalloc/CMakeLists.txt index 7caa74a3ebb..5d5482ab653 100644 --- a/cpp/src/arrow/jemalloc/CMakeLists.txt +++ b/cpp/src/arrow/jemalloc/CMakeLists.txt @@ -40,10 +40,29 @@ if (NOT APPLE) set(ARROW_JEMALLOC_STATIC_LINK_LIBS ${ARROW_JEMALLOC_STATIC_LINK_LIBS} pthread) endif() 
-set(ARROW_JEMALLOC_SHARED_LINK_LIBS - arrow_shared - jemalloc_shared -) +if (ARROW_JEMALLOC_USE_SHARED) + set(ARROW_JEMALLOC_SHARED_LINK_LIBS + arrow_shared + jemalloc_shared + ) +else() + if (CMAKE_COMPILER_IS_GNUCXX) + set(ARROW_JEMALLOC_SHARED_LINK_LIBS + arrow_shared + jemalloc_static + # For glibc <2.17 we need to link to librt. + # As we compile with --as-needed by default, the linker will omit this + # dependency if not required. + rt + ) + else() + set(ARROW_JEMALLOC_SHARED_LINK_LIBS + arrow_shared + jemalloc_static + ) + endif() +endif() + if (ARROW_BUILD_STATIC) set(ARROW_JEMALLOC_TEST_LINK_LIBS diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index ba26692b32b..6e6d609b000 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -56,6 +56,9 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(PYARROW_BUILD_JEMALLOC "Build the PyArrow jemalloc integration" OFF) + option(PYARROW_BUNDLE_ARROW_CPP + "Bundle the Arrow C++ libraries" + OFF) endif() if(NOT PYARROW_BUILD_TESTS) @@ -332,6 +335,25 @@ endif() ## Arrow find_package(Arrow REQUIRED) include_directories(SYSTEM ${ARROW_INCLUDE_DIR}) + +if (PYARROW_BUNDLE_ARROW_CPP) + configure_file(${ARROW_SHARED_LIB} + ${BUILD_OUTPUT_ROOT_DIRECTORY}/libarrow${CMAKE_SHARED_LIBRARY_SUFFIX} + COPYONLY) + SET(ARROW_SHARED_LIB + ${BUILD_OUTPUT_ROOT_DIRECTORY}/libarrow${CMAKE_SHARED_LIBRARY_SUFFIX}) + configure_file(${ARROW_IO_SHARED_LIB} + ${BUILD_OUTPUT_ROOT_DIRECTORY}/libarrow_io${CMAKE_SHARED_LIBRARY_SUFFIX} + COPYONLY) + SET(ARROW_IO_SHARED_LIB + ${BUILD_OUTPUT_ROOT_DIRECTORY}/libarrow_io${CMAKE_SHARED_LIBRARY_SUFFIX}) + configure_file(${ARROW_IPC_SHARED_LIB} + ${BUILD_OUTPUT_ROOT_DIRECTORY}/libarrow_ipc${CMAKE_SHARED_LIBRARY_SUFFIX} + COPYONLY) + SET(ARROW_IPC_SHARED_LIB + ${BUILD_OUTPUT_ROOT_DIRECTORY}/libarrow_ipc${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + ADD_THIRDPARTY_LIB(arrow SHARED_LIB ${ARROW_SHARED_LIB}) ADD_THIRDPARTY_LIB(arrow_io @@ -440,6 +462,18 @@ if 
(PYARROW_BUILD_PARQUET) if(NOT (PARQUET_FOUND AND PARQUET_ARROW_FOUND)) message(FATAL_ERROR "Unable to locate Parquet libraries") endif() + if (PYARROW_BUNDLE_ARROW_CPP) + configure_file(${PARQUET_SHARED_LIB} + ${BUILD_OUTPUT_ROOT_DIRECTORY}/libparquet${CMAKE_SHARED_LIBRARY_SUFFIX} + COPYONLY) + SET(PARQUET_SHARED_LIB + ${BUILD_OUTPUT_ROOT_DIRECTORY}/libparquet${CMAKE_SHARED_LIBRARY_SUFFIX}) + configure_file(${PARQUET_ARROW_SHARED_LIB} + ${BUILD_OUTPUT_ROOT_DIRECTORY}/libparquet_arrow${CMAKE_SHARED_LIBRARY_SUFFIX} + COPYONLY) + SET(PARQUET_ARROW_SHARED_LIB + ${BUILD_OUTPUT_ROOT_DIRECTORY}/libparquet_arrow${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() ADD_THIRDPARTY_LIB(parquet_arrow SHARED_LIB ${PARQUET_ARROW_SHARED_LIB}) set(LINK_LIBS @@ -451,6 +485,13 @@ if (PYARROW_BUILD_PARQUET) endif() if (PYARROW_BUILD_JEMALLOC) + if (PYARROW_BUNDLE_ARROW_CPP) + configure_file(${ARROW_JEMALLOC_SHARED_LIB} + ${BUILD_OUTPUT_ROOT_DIRECTORY}/libarrow_jemalloc${CMAKE_SHARED_LIBRARY_SUFFIX} + COPYONLY) + SET(ARROW_JEMALLOC_SHARED_LIB + ${BUILD_OUTPUT_ROOT_DIRECTORY}/libarrow_jemalloc${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() ADD_THIRDPARTY_LIB(arrow_jemalloc SHARED_LIB ${ARROW_JEMALLOC_SHARED_LIB}) set(LINK_LIBS @@ -463,6 +504,10 @@ endif() add_library(pyarrow SHARED ${PYARROW_SRCS}) +if (PYARROW_BUNDLE_ARROW_CPP) + set_target_properties(pyarrow PROPERTIES + INSTALL_RPATH "\$ORIGIN") +endif() target_link_libraries(pyarrow ${LINK_LIBS}) if(APPLE) diff --git a/python/manylinux1/Dockerfile-parquet_arrow-base-x86_64 b/python/manylinux1/Dockerfile-parquet_arrow-base-x86_64 deleted file mode 100644 index dcc9321c322..00000000000 --- a/python/manylinux1/Dockerfile-parquet_arrow-base-x86_64 +++ /dev/null @@ -1,19 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. See accompanying LICENSE file. - -FROM arrow-base-x86_64 - -WORKDIR / -RUN git clone https://github.com/apache/parquet-cpp.git -WORKDIR /parquet-cpp -RUN ARROW_HOME=/usr cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DPARQUET_BUILD_TESTS=OFF -DPARQUET_ARROW=ON . -RUN make -j5 install diff --git a/python/manylinux1/Dockerfile-x86_64 b/python/manylinux1/Dockerfile-x86_64 index ac47108c84a..820b94e306a 100644 --- a/python/manylinux1/Dockerfile-x86_64 +++ b/python/manylinux1/Dockerfile-x86_64 @@ -13,14 +13,23 @@ FROM quay.io/pypa/manylinux1_x86_64:latest # Install dependencies -RUN yum install -y flex openssl-devel +RUN yum install -y flex zlib-devel + +# Build a newer OpenSSL version to support Thrift 0.10.0, note that we don't trigger the SSL code in Arrow. 
+WORKDIR / +RUN wget --no-check-certificate https://www.openssl.org/source/openssl-1.0.2k.tar.gz -O openssl-1.0.2k.tar.gz +RUN tar xf openssl-1.0.2k.tar.gz +WORKDIR openssl-1.0.2k +RUN ./config -fpic shared --prefix=/usr +RUN make -j5 +RUN make install WORKDIR / RUN wget --no-check-certificate http://downloads.sourceforge.net/project/boost/boost/1.60.0/boost_1_60_0.tar.gz -O /boost_1_60_0.tar.gz RUN tar xf boost_1_60_0.tar.gz WORKDIR /boost_1_60_0 RUN ./bootstrap.sh -RUN ./bjam cxxflags=-fPIC cflags=-fPIC --prefix=/usr --with-filesystem --with-date_time --with-system install +RUN ./bjam cxxflags=-fPIC cflags=-fPIC --prefix=/usr --with-filesystem --with-date_time --with-system --with-regex install WORKDIR / RUN wget https://github.com/jemalloc/jemalloc/releases/download/4.4.0/jemalloc-4.4.0.tar.bz2 -O jemalloc-4.4.0.tar.bz2 @@ -43,5 +52,11 @@ RUN git checkout ffe59955ad8690c2f8bb74766cb7e9b0d0ee3963 ADD arrow /arrow WORKDIR /arrow/cpp -RUN cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DARROW_HDFS=ON -DARROW_BUILD_TESTS=OFF -DARROW_BUILD_SHARED=ON -DARROW_BOOST_USE_SHARED=OFF -DARROW_JEMALLOC=ON . +RUN cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/arrow-dist -DARROW_HDFS=ON -DARROW_BUILD_TESTS=OFF -DARROW_BUILD_SHARED=ON -DARROW_BOOST_USE_SHARED=OFF -DARROW_JEMALLOC=ON -DARROW_RPATH_ORIGIN=ON -DARROW_JEMALLOC_USE_SHARED=OFF . +RUN make -j5 install + +WORKDIR / +RUN git clone https://github.com/apache/parquet-cpp.git +WORKDIR /parquet-cpp +RUN ARROW_HOME=/arrow-dist cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DPARQUET_BUILD_TESTS=OFF -DPARQUET_ARROW=ON -DPARQUET_BOOST_USE_SHARED=OFF . RUN make -j5 install diff --git a/python/manylinux1/README.md b/python/manylinux1/README.md index 8cd9f6db004..32af6f31da2 100644 --- a/python/manylinux1/README.md +++ b/python/manylinux1/README.md @@ -31,10 +31,8 @@ for all supported Python versions and place them in the `dist` folder. 
git clone ../../ arrow # Build the native baseimage docker build -t arrow-base-x86_64 -f Dockerfile-x86_64 . -# (optionally) build parquet-cpp -docker build -t parquet_arrow-base-x86_64 -f Dockerfile-parquet_arrow-base-x86_64 . # Build the python packages -docker run --rm -v $PWD:/io parquet_arrow-base-x86_64 /io/build_arrow.sh +docker run --rm -v $PWD:/io arrow-base-x86_64 /io/build_arrow.sh # Now the new packages are located in the dist/ folder ls -l dist/ ``` diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh index cce5cd2b4d4..576a983b11c 100755 --- a/python/manylinux1/build_arrow.sh +++ b/python/manylinux1/build_arrow.sh @@ -29,38 +29,19 @@ source /multibuild/manylinux_utils.sh cd /arrow/python -export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/lib" # PyArrow build configuration export PYARROW_BUILD_TYPE='release' export PYARROW_CMAKE_OPTIONS='-DPYARROW_BUILD_TESTS=ON' +export PYARROW_WITH_PARQUET=1 +export PYARROW_WITH_JEMALLOC=1 +export PYARROW_BUNDLE_ARROW_CPP=1 # Need as otherwise arrow_io is sometimes not linked export LDFLAGS="-Wl,--no-as-needed" -export ARROW_HOME="/usr" +export ARROW_HOME="/arrow-dist" export PARQUET_HOME="/usr" # Ensure the target directory exists mkdir -p /io/dist -# Temporary directory to store the wheels that should be sent through auditwheel -rm_mkdir unfixed_wheels - -PY35_BIN=/opt/python/cp35-cp35m/bin -$PY35_BIN/pip install 'pyelftools<0.24' -$PY35_BIN/pip install 'git+https://github.com/xhochy/auditwheel.git@pyarrow-fixes' - -# Override repair_wheelhouse function -function repair_wheelhouse { - local in_dir=$1 - local out_dir=$2 - for whl in $in_dir/*.whl; do - if [[ $whl == *none-any.whl ]]; then - cp $whl $out_dir - else - # Store libraries directly in . not .libs to fix problems with libpyarrow.so linkage. - $PY35_BIN/auditwheel -v repair -L . 
$whl -w $out_dir/ - fi - done - chmod -R a+rwX $out_dir -} for PYTHON in ${PYTHON_VERSIONS}; do PYTHON_INTERPRETER="$(cpython_path $PYTHON)/bin/python" @@ -68,17 +49,36 @@ for PYTHON in ${PYTHON_VERSIONS}; do PIPI_IO="$PIP install -f $MANYLINUX_URL" PATH="$PATH:$(cpython_path $PYTHON)" + echo "=== (${PYTHON}) Installing build dependencies ===" $PIPI_IO "numpy==1.9.0" $PIPI_IO "cython==0.24" - PATH="$PATH:$(cpython_path $PYTHON)/bin" $PYTHON_INTERPRETER setup.py build_ext --inplace --with-parquet --with-jemalloc + # Clear output directory + rm -rf dist/ + echo "=== (${PYTHON}) Building wheel ===" + PATH="$PATH:$(cpython_path $PYTHON)/bin" $PYTHON_INTERPRETER setup.py build_ext --inplace --with-parquet --with-jemalloc --bundle-arrow-cpp PATH="$PATH:$(cpython_path $PYTHON)/bin" $PYTHON_INTERPRETER setup.py bdist_wheel - # Test for optional modules + echo "=== (${PYTHON}) Test the existence of optional modules ===" $PIPI_IO -r requirements.txt PATH="$PATH:$(cpython_path $PYTHON)/bin" $PYTHON_INTERPRETER -c "import pyarrow.parquet" PATH="$PATH:$(cpython_path $PYTHON)/bin" $PYTHON_INTERPRETER -c "import pyarrow.jemalloc" - repair_wheelhouse dist /io/dist + echo "=== (${PYTHON}) Tag the wheel with manylinux1 ===" + mkdir -p repaired_wheels/ + auditwheel -v repair -L . 
dist/pyarrow-*.whl -w repaired_wheels/ + + echo "=== (${PYTHON}) Testing manylinux1 wheel ===" + # Fix version to keep build reproducible + $PIPI_IO "virtualenv==15.1.0" + rm -rf venv + "$(cpython_path $PYTHON)/bin/virtualenv" -p ${PYTHON_INTERPRETER} --no-download venv + source ./venv/bin/activate + pip install repaired_wheels/*.whl + pip install pytest pandas + py.test venv/lib/*/site-packages/pyarrow + deactivate + + mv repaired_wheels/*.whl /io/dist done diff --git a/python/setup.py b/python/setup.py index 54d1cd3af48..b0f29be4c1b 100644 --- a/python/setup.py +++ b/python/setup.py @@ -34,6 +34,7 @@ from os.path import join as pjoin from distutils.command.clean import clean as _clean +from distutils.util import strtobool from distutils import sysconfig # Check if we're running 64-bit Python @@ -81,15 +82,17 @@ def run(self): user_options = ([('extra-cmake-args=', None, 'extra arguments for CMake'), ('build-type=', None, 'build type (debug or release)'), ('with-parquet', None, 'build the Parquet extension'), - ('with-jemalloc', None, 'build the jemalloc extension')] + + ('with-jemalloc', None, 'build the jemalloc extension'), + ('bundle-arrow-cpp', None, 'bundle the Arrow C++ libraries')] + _build_ext.user_options) def initialize_options(self): _build_ext.initialize_options(self) self.extra_cmake_args = os.environ.get('PYARROW_CMAKE_OPTIONS', '') self.build_type = os.environ.get('PYARROW_BUILD_TYPE', 'debug').lower() - self.with_parquet = False - self.with_jemalloc = False + self.with_parquet = strtobool(os.environ.get('PYARROW_WITH_PARQUET', '0')) + self.with_jemalloc = strtobool(os.environ.get('PYARROW_WITH_JEMALLOC', '0')) + self.bundle_arrow_cpp = strtobool(os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0')) CYTHON_MODULE_NAMES = [ 'array', @@ -142,6 +145,9 @@ def _run_cmake(self): if self.with_jemalloc: cmake_options.append('-DPYARROW_BUILD_JEMALLOC=on') + if self.bundle_arrow_cpp: + cmake_options.append('-DPYARROW_BUNDLE_ARROW_CPP=ON') + if sys.platform != 
'win32': cmake_options.append('-DCMAKE_BUILD_TYPE={0}' .format(self.build_type)) @@ -181,17 +187,35 @@ def _run_cmake(self): # Move the built libpyarrow library to the place expected by the Python # build - if sys.platform != 'win32': - name, = glob.glob(pjoin(self.build_type, 'libpyarrow.*')) - try: - os.makedirs(pjoin(build_lib, 'pyarrow')) - except OSError: - pass - shutil.move(name, - pjoin(build_lib, 'pyarrow', os.path.split(name)[1])) + shared_library_prefix = 'lib' + if sys.platform == 'darwin': + shared_library_suffix = '.dylib' + elif sys.platform == 'win32': + shared_library_suffix = '.dll' + shared_library_prefix = '' else: - shutil.move(pjoin(self.build_type, 'pyarrow.dll'), - pjoin(build_lib, 'pyarrow', 'pyarrow.dll')) + shared_library_suffix = '.so' + + try: + os.makedirs(pjoin(build_lib, 'pyarrow')) + except OSError: + pass + + def move_lib(lib_name): + lib_filename = shared_library_prefix + lib_name + shared_library_suffix + shutil.move(pjoin(self.build_type, lib_filename), + pjoin(build_lib, 'pyarrow', lib_filename)) + + move_lib("pyarrow") + if self.bundle_arrow_cpp: + move_lib("arrow") + move_lib("arrow_io") + move_lib("arrow_ipc") + if self.with_jemalloc: + move_lib("arrow_jemalloc") + if self.with_parquet: + move_lib("parquet") + move_lib("parquet_arrow") # Move the built C-extension to the place expected by the Python build self._found_names = []