diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh
index 08551f3b009..2f02ef247af 100755
--- a/ci/travis_before_script_cpp.sh
+++ b/ci/travis_before_script_cpp.sh
@@ -25,8 +25,8 @@ echo $GTEST_HOME
 
 CMAKE_COMMON_FLAGS="\
 -DARROW_BUILD_BENCHMARKS=ON \
--DARROW_PARQUET=ON \
--DARROW_HDFS=on \
+-DARROW_PARQUET=OFF \
+-DARROW_HDFS=ON \
 -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL"
 
 if [ $TRAVIS_OS_NAME == "linux" ]; then
diff --git a/ci/travis_install_conda.sh b/ci/travis_install_conda.sh
index 3a8f57bf8f1..e9225259e6d 100644
--- a/ci/travis_install_conda.sh
+++ b/ci/travis_install_conda.sh
@@ -9,7 +9,9 @@ else
 fi
 
 wget -O miniconda.sh $MINICONDA_URL
-export MINICONDA=$TRAVIS_BUILD_DIR/miniconda
+
+export MINICONDA=$HOME/miniconda
+
 bash miniconda.sh -b -p $MINICONDA
 export PATH="$MINICONDA/bin:$PATH"
 conda update -y -q conda
diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh
index 4a377428ae4..61c8e444361 100755
--- a/ci/travis_script_python.sh
+++ b/ci/travis_script_python.sh
@@ -5,7 +5,7 @@ set -e
 
 PYTHON_DIR=$TRAVIS_BUILD_DIR/python
 
 # Re-use conda installation from C++
-export MINICONDA=$TRAVIS_BUILD_DIR/miniconda
+export MINICONDA=$HOME/miniconda
 export PATH="$MINICONDA/bin:$PATH"
 export PARQUET_HOME=$MINICONDA
@@ -31,7 +31,9 @@ python_version_tests() {
   # Expensive dependencies install from Continuum package repo
   conda install -y pip numpy pandas cython
 
-  conda install -y parquet-cpp arrow-cpp -c apache/channel/dev
+  # conda install -y parquet-cpp
+
+  conda install -y arrow-cpp -c apache/channel/dev
 
   # Other stuff
   pip install -r requirements.txt
diff --git a/cpp/cmake_modules/FindParquet.cmake b/cpp/cmake_modules/FindParquet.cmake
index e3350d6e13d..36f4828a999 100644
--- a/cpp/cmake_modules/FindParquet.cmake
+++ b/cpp/cmake_modules/FindParquet.cmake
@@ -72,6 +72,7 @@ else ()
 endif ()
 
 mark_as_advanced(
+  PARQUET_FOUND
   PARQUET_INCLUDE_DIR
   PARQUET_LIBS
   PARQUET_LIBRARIES
diff --git a/cpp/src/arrow/util/memory-pool-test.cc b/cpp/src/arrow/util/memory-pool-test.cc
index deb7ffd03ba..e767e955524 100644
--- a/cpp/src/arrow/util/memory-pool-test.cc
+++ b/cpp/src/arrow/util/memory-pool-test.cc
@@ -46,6 +46,10 @@ TEST(DefaultMemoryPool, OOM) {
   ASSERT_RAISES(OutOfMemory, pool->Allocate(to_alloc, &data));
 }
 
+// Death tests and valgrind are known to not play well 100% of the time; see
+// the googletest documentation.
+#ifndef ARROW_VALGRIND
+
 TEST(DefaultMemoryPoolDeathTest, FreeLargeMemory) {
   MemoryPool* pool = default_memory_pool();
 
@@ -60,4 +64,6 @@ TEST(DefaultMemoryPoolDeathTest, FreeLargeMemory) {
   pool->Free(data, 100);
 }
 
+#endif  // ARROW_VALGRIND
+
 }  // namespace arrow
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index fdbfce99656..522895808de 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -340,8 +340,10 @@ if (PYARROW_BUILD_TESTS)
 endif()
 
 ## Parquet
-find_package(Parquet REQUIRED)
-include_directories(SYSTEM ${PARQUET_INCLUDE_DIR})
+find_package(Parquet)
+if(PARQUET_FOUND)
+  include_directories(SYSTEM ${PARQUET_INCLUDE_DIR})
+endif()
 
 ## Arrow
 find_package(Arrow REQUIRED)
@@ -350,8 +352,6 @@ ADD_THIRDPARTY_LIB(arrow
   SHARED_LIB ${ARROW_SHARED_LIB})
 ADD_THIRDPARTY_LIB(arrow_io
   SHARED_LIB ${ARROW_IO_SHARED_LIB})
-ADD_THIRDPARTY_LIB(arrow_parquet
-  SHARED_LIB ${ARROW_PARQUET_SHARED_LIB})
 
 ############################################################
 # Linker setup
@@ -418,6 +418,16 @@ endif()
 add_subdirectory(src/pyarrow)
 add_subdirectory(src/pyarrow/util)
 
+set(CYTHON_EXTENSIONS
+  array
+  config
+  error
+  io
+  scalar
+  schema
+  table
+)
+
 set(PYARROW_SRCS
   src/pyarrow/common.cc
   src/pyarrow/config.cc
@@ -431,9 +441,19 @@ set(PYARROW_SRCS
 set(LINK_LIBS
   arrow
   arrow_io
-  arrow_parquet
 )
 
+if(PARQUET_FOUND AND ARROW_PARQUET_FOUND)
+  ADD_THIRDPARTY_LIB(arrow_parquet
+    SHARED_LIB ${ARROW_PARQUET_SHARED_LIB})
+  set(LINK_LIBS
+    ${LINK_LIBS}
+    arrow_parquet)
+  set(CYTHON_EXTENSIONS
+    ${CYTHON_EXTENSIONS}
+    parquet)
+endif()
+
 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 
 add_library(pyarrow SHARED
@@ -448,17 +468,6 @@ endif()
 ############################################################
 # Setup and build Cython modules
 ############################################################
 
-set(CYTHON_EXTENSIONS
-  array
-  config
-  error
-  io
-  parquet
-  scalar
-  schema
-  table
-)
-
 foreach(module ${CYTHON_EXTENSIONS})
   string(REPLACE "." ";" directories ${module})
   list(GET directories -1 module_name)
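With the CMakeLists.txt change above, `parquet` is appended to CYTHON_EXTENSIONS only when both PARQUET_FOUND and ARROW_PARQUET_FOUND are set, so Python code can no longer assume that `pyarrow.parquet` imports successfully. A minimal sketch of the resulting import-guard pattern, the same one the test_parquet.py changes below adopt:

    try:
        import pyarrow.parquet as pq
        HAVE_PARQUET = True
    except ImportError:
        # The parquet extension module is absent when pyarrow was built
        # without Parquet support.
        HAVE_PARQUET = False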
";" directories ${module}) list(GET directories -1 module_name) diff --git a/python/cmake_modules/FindArrow.cmake b/python/cmake_modules/FindArrow.cmake index 6bd305615fc..5d5efc431a4 100644 --- a/python/cmake_modules/FindArrow.cmake +++ b/python/cmake_modules/FindArrow.cmake @@ -52,7 +52,7 @@ find_library(ARROW_IO_LIB_PATH NAMES arrow_io ${ARROW_SEARCH_LIB_PATH} NO_DEFAULT_PATH) -if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH AND ARROW_PARQUET_LIB_PATH) +if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH) set(ARROW_FOUND TRUE) set(ARROW_LIB_NAME libarrow) set(ARROW_IO_LIB_NAME libarrow_io) @@ -64,18 +64,9 @@ if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH AND ARROW_PARQUET_LIB_PATH) set(ARROW_IO_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_IO_LIB_NAME}.a) set(ARROW_IO_SHARED_LIB ${ARROW_LIBS}/${ARROW_IO_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) - - set(ARROW_PARQUET_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_PARQUET_LIB_NAME}.a) - set(ARROW_PARQUET_SHARED_LIB ${ARROW_LIBS}/${ARROW_PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) -else () - set(ARROW_FOUND FALSE) -endif () - -if (ARROW_FOUND) if (NOT Arrow_FIND_QUIETLY) message(STATUS "Found the Arrow core library: ${ARROW_LIB_PATH}") message(STATUS "Found the Arrow IO library: ${ARROW_IO_LIB_PATH}") - message(STATUS "Found the Arrow Parquet library: ${ARROW_PARQUET_LIB_PATH}") endif () else () if (NOT Arrow_FIND_QUIETLY) @@ -88,8 +79,23 @@ else () message(STATUS "${ARROW_ERR_MSG}") endif (Arrow_FIND_REQUIRED) endif () + set(ARROW_FOUND FALSE) endif () +if(ARROW_PARQUET_LIB_PATH) + set(ARROW_PARQUET_FOUND TRUE) + set(ARROW_PARQUET_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_PARQUET_LIB_NAME}.a) + set(ARROW_PARQUET_SHARED_LIB ${ARROW_LIBS}/${ARROW_PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + if (NOT Arrow_FIND_QUIETLY) + message(STATUS "Found the Arrow Parquet library: ${ARROW_PARQUET_LIB_PATH}") + endif () +else() + if (NOT Arrow_FIND_QUIETLY) + message(STATUS "Could not find Arrow Parquet library") + endif() + set(ARROW_PARQUET_FOUND FALSE) +endif() + mark_as_advanced( ARROW_INCLUDE_DIR ARROW_LIBS diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index 328e923b941..eb92e8ea93a 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -46,6 +46,7 @@ def hdfs_test_client(): HDFS_TMP_PATH = '/tmp/pyarrow-test-{0}'.format(random.randint(0, 1000)) + @pytest.fixture(scope='session') def hdfs(request): fixture = hdfs_test_client() diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index d89d947b7b6..8a2d8cab572 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -15,33 +15,45 @@ # specific language governing permissions and limitations # under the License. 
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index d89d947b7b6..8a2d8cab572 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -15,33 +15,45 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from pyarrow.compat import unittest
-import pyarrow as arrow
-import pyarrow.parquet
+import pytest
 
-A = arrow
+import pyarrow as A
 import numpy as np
-import os.path
 import pandas as pd
 
 import pandas.util.testing as pdt
 
+try:
+    import pyarrow.parquet as pq
+    HAVE_PARQUET = True
+except ImportError:
+    HAVE_PARQUET = False
+
+# XXX: Make Parquet tests opt-in rather than skip-if-not-built
+parquet = pytest.mark.skipif(not HAVE_PARQUET,
+                             reason='Parquet support not built')
+
 
+@parquet
 def test_single_pylist_column_roundtrip(tmpdir):
     for dtype in [int, float]:
-        filename = tmpdir.join('single_{}_column.parquet'.format(dtype.__name__))
+        filename = tmpdir.join('single_{}_column.parquet'
+                               .format(dtype.__name__))
         data = [A.from_pylist(list(map(dtype, range(5))))]
         table = A.Table.from_arrays(('a', 'b'), data, 'table_name')
         A.parquet.write_table(table, filename.strpath)
-        table_read = pyarrow.parquet.read_table(filename.strpath)
-        for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()):
+        table_read = pq.read_table(filename.strpath)
+        for col_written, col_read in zip(table.itercolumns(),
+                                         table_read.itercolumns()):
             assert col_written.name == col_read.name
             assert col_read.data.num_chunks == 1
             data_written = col_written.data.chunk(0)
             data_read = col_read.data.chunk(0)
             assert data_written.equals(data_read)
 
+
+@parquet
 def test_pandas_parquet_2_0_rountrip(tmpdir):
     size = 10000
     np.random.seed(0)
@@ -58,17 +70,20 @@ def test_pandas_parquet_2_0_rountrip(tmpdir):
         'float64': np.arange(size, dtype=np.float64),
         'bool': np.random.randn(size) > 0,
         # Pandas only support ns resolution, Arrow at the moment only ms
-        'datetime': np.arange("2016-01-01T00:00:00.001", size, dtype='datetime64[ms]'),
+        'datetime': np.arange("2016-01-01T00:00:00.001", size,
+                              dtype='datetime64[ms]'),
         'str': [str(x) for x in range(size)],
         'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None]
     })
     filename = tmpdir.join('pandas_rountrip.parquet')
     arrow_table = A.from_pandas_dataframe(df, timestamps_to_ms=True)
     A.parquet.write_table(arrow_table, filename.strpath, version="2.0")
-    table_read = pyarrow.parquet.read_table(filename.strpath)
+    table_read = pq.read_table(filename.strpath)
     df_read = table_read.to_pandas()
     pdt.assert_frame_equal(df, df_read)
 
+
+@parquet
 def test_pandas_parquet_1_0_rountrip(tmpdir):
     size = 10000
     np.random.seed(0)
@@ -88,11 +103,10 @@ def test_pandas_parquet_1_0_rountrip(tmpdir):
     filename = tmpdir.join('pandas_rountrip.parquet')
     arrow_table = A.from_pandas_dataframe(df)
     A.parquet.write_table(arrow_table, filename.strpath, version="1.0")
-    table_read = pyarrow.parquet.read_table(filename.strpath)
+    table_read = pq.read_table(filename.strpath)
     df_read = table_read.to_pandas()
 
     # We pass uint32_t as int64_t if we write Parquet version 1.0
     df['uint32'] = df['uint32'].values.astype(np.int64)
 
     pdt.assert_frame_equal(df, df_read)
-
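For reference, the roundtrip these tests exercise reduces to a handful of calls; a minimal sketch using only APIs that appear above (the file name is illustrative, and a Parquet-enabled build is assumed):

    import numpy as np
    import pandas as pd
    import pandas.util.testing as pdt
    import pyarrow as A
    import pyarrow.parquet as pq

    df = pd.DataFrame({'a': np.arange(100, dtype=np.int64)})
    table = A.from_pandas_dataframe(df)       # pandas -> Arrow table
    pq.write_table(table, 'example.parquet')  # fails without Parquet support
    df_read = pq.read_table('example.parquet').to_pandas()
    pdt.assert_frame_equal(df, df_read)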
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 83fcbb8faff..abf143199fe 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -16,11 +16,7 @@
 # under the License.
 
 from pyarrow.compat import unittest
-import pyarrow as arrow
-
-A = arrow
-
-import pandas as pd
+import pyarrow as A
 
 
 class TestRowBatch(unittest.TestCase):
@@ -76,4 +72,3 @@ def test_pandas(self):
         assert set(df.columns) == set(('a', 'b'))
         assert df.shape == (5, 2)
         assert df.ix[0, 'b'] == -10
-
diff --git a/python/setup.py b/python/setup.py
index 59410d75a61..a5db2b025e6 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -97,6 +97,18 @@ def initialize_options(self):
         _build_ext.initialize_options(self)
         self.extra_cmake_args = ''
 
+    CYTHON_MODULE_NAMES = [
+        'array',
+        'config',
+        'error',
+        'io',
+        'parquet',
+        'scalar',
+        'schema',
+        'table']
+
+    CYTHON_ALLOWED_FAILURES = ['parquet']
+
     def _run_cmake(self):
         # The directory containing this setup.py
         source = osp.dirname(osp.abspath(__file__))
@@ -172,10 +184,13 @@ def _run_cmake(self):
 
         # Move the built C-extension to the place expected by the Python build
         self._found_names = []
-        for name in self.get_cmake_cython_names():
+        for name in self.CYTHON_MODULE_NAMES:
            built_path = self.get_ext_built(name)
            if not os.path.exists(built_path):
                print(built_path)
+               if name in self.CYTHON_ALLOWED_FAILURES:
+                   print('Cython module {0} failure permitted'.format(name))
+                   continue
                raise RuntimeError('libpyarrow C-extension failed to build:',
                                   os.path.abspath(built_path))
 
@@ -213,16 +228,6 @@ def get_ext_built(self, name):
             suffix = sysconfig.get_config_var('SO')
             return name + suffix
 
-    def get_cmake_cython_names(self):
-        return ['array',
-                'config',
-                'error',
-                'io',
-                'parquet',
-                'scalar',
-                'schema',
-                'table']
-
     def get_names(self):
         return self._found_names
 
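The setup.py change encodes a simple policy: every Cython module must build except those explicitly allowed to fail, which keeps parquet optional without masking real build breakage. A sketch of that policy as a standalone helper (the function and its arguments are illustrative, not part of the patch):

    import os

    def filter_built_extensions(names, allowed_failures, get_built_path):
        # Return the extensions that actually built; tolerate a missing
        # module only if it is explicitly allowed to fail.
        found = []
        for name in names:
            path = get_built_path(name)
            if not os.path.exists(path):
                if name in allowed_failures:
                    print('Cython module {0} failure permitted'.format(name))
                    continue
                raise RuntimeError('C-extension failed to build: ' + path)
            found.append(name)
        return found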