From 3a840e8d7c039e32e2458b4bc0bb9b7942c84757 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 13 Apr 2018 11:09:36 +0200 Subject: [PATCH 01/16] ARROW-2300: [C++/Python] Integration test for HDFS --- dev/docker-compose.yml | 57 +++++++++++++---- dev/hdfs_integration/Dockerfile | 47 ++++++++++++++ dev/hdfs_integration/hdfs_integration.sh | 78 ++++++++++++++++++++++++ dev/run_docker_compose.sh | 8 +++ 4 files changed, 179 insertions(+), 11 deletions(-) create mode 100644 dev/hdfs_integration/Dockerfile create mode 100755 dev/hdfs_integration/hdfs_integration.sh diff --git a/dev/docker-compose.yml b/dev/docker-compose.yml index b1e593cf480..74d45c63bbd 100644 --- a/dev/docker-compose.yml +++ b/dev/docker-compose.yml @@ -16,25 +16,60 @@ version: '3' services: - gen_apidocs: + + hdfs-namenode: + image: gelog/hadoop + ports: + - "9000:9000" + - "50070:50070" + command: hdfs namenode + hostname: hdfs-namenode + + hdfs-datanode: + image: gelog/hadoop + command: hdfs datanode + ports: + # The host port is randomly assigned by Docker, to allow scaling + # to multiple DataNodes on the same host + - "50075" + links: + - hdfs-namenode:hdfs-namenode + + hdfs_integration: + links: + - hdfs-namenode:hdfs-namenode + - hdfs-datanode:hdfs-datanode + environment: + - ARROW_HDFS_TEST_HOST=hdfs-namenode + - ARROW_HDFS_TEST_PORT=9000 + - ARROW_HDFS_TEST_USER=root build: - context: gen_apidocs + context: hdfs_integration volumes: - ../..:/apache-arrow - run_site: + + spark_integration: build: - context: run_site - ports: - - "4000:4000" + context: spark_integration volumes: - - ../..:/apache-arrow + - ../..:/apache-arrow + dask_integration: build: context: dask_integration volumes: - - ../..:/apache-arrow - spark_integration: - build: - context: spark_integration + - ../..:/apache-arrow + + gen_apidocs: + build: + context: gen_apidocs volumes: - ../..:/apache-arrow + + run_site: + build: + context: run_site + ports: + - "4000:4000" + volumes: + - ../..:/apache-arrow diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile new file mode 100644 index 00000000000..1d01b9c792e --- /dev/null +++ b/dev/hdfs_integration/Dockerfile @@ -0,0 +1,47 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
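+#
+# A rough usage sketch, assuming arrow and parquet-cpp are checked out side
+# by side on the host: this image is meant to be built and run through the
+# compose file patched above (dev/docker-compose.yml), as dev/run_docker_compose.sh
+# does, rather than invoked directly:
+#
+#   docker-compose -f arrow/dev/docker-compose.yml build hdfs_integration
+#   docker-compose -f arrow/dev/docker-compose.yml run --rm hdfs_integration
+#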
+# +FROM ubuntu:16.04 + +# Basic OS utilities +RUN apt-get update && \ + apt-get install -y \ + git \ + wget \ + gcc-4.9 \ + g++-4.9 \ + build-essential + +# install conda in /home/ubuntu/miniconda +RUN wget -O /tmp/miniconda.sh \ + https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash /tmp/miniconda.sh -b -p /opt/conda && \ + rm /tmp/miniconda.sh + +ENV PATH=/opt/conda/bin:$PATH CONDA_PREFIX=/opt/conda + +# Create Conda environment +RUN conda update conda -y && \ + conda install -c conda-forge \ + python=3.6 numpy six setuptools cython pandas pytest \ + cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib \ + gflags brotli jemalloc lz4-c zstd hdfs3 libhdfs3 && \ + conda clean --all + +ADD . /apache-arrow +WORKDIR /apache-arrow + +CMD arrow/dev/hdfs_integration/hdfs_integration.sh diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh new file mode 100755 index 00000000000..6b774d175b3 --- /dev/null +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Exit on any error +set -e + +# cwd is mounted from host machine to +# and contains both arrow and parquet-cpp + +export ARROW_BUILD_TYPE=debug +export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX +export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX +export ARROW_HOME=$CONDA_PREFIX +export PARQUET_HOME=$CONDA_PREFIX + +export CC=gcc-4.9 +export CXX=g++-4.9 + +# install arrow +mkdir -p arrow/cpp/build +pushd arrow/cpp/build + +rm -rf ./* + +cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ + -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ + -DARROW_PYTHON=on \ + -DARROW_PLASMA=on \ + -DARROW_HDFS=on \ + .. +make -j4 +make install +popd + +# install parquet-cpp +mkdir -p parquet-cpp/build +pushd parquet-cpp/build + +rm -rf ./* + +cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ + -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \ + -DPARQUET_BUILD_BENCHMARKS=off \ + -DPARQUET_BUILD_EXECUTABLES=off \ + -DPARQUET_BUILD_TESTS=on \ + .. + +make -j4 +make install +popd + +# install pyarrow +pushd arrow/python + +python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ + --with-parquet --with-plasma --inplace + +popd + + +arrow/cpp/build/debug/io-hdfs-test + +python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs diff --git a/dev/run_docker_compose.sh b/dev/run_docker_compose.sh index 503efd5e1aa..040369ef9f4 100755 --- a/dev/run_docker_compose.sh +++ b/dev/run_docker_compose.sh @@ -35,5 +35,13 @@ if [ ! 
-d parquet-cpp ]; then exit 1 fi +<<<<<<< HEAD docker-compose -f arrow/dev/docker-compose.yml build "${@}" docker-compose -f arrow/dev/docker-compose.yml run --rm "${@}" +======= +GID=$(id -g ${USERNAME}) +docker-compose -f arrow/dev/docker-compose.yml run \ + --rm "${1}" + +#-u "${UID}:${GID}" "${1}" +>>>>>>> ARROW-2300: [C++/Python] Integration test for HDFS From e8cb7dcaa0b9a9ccf71cac45fabee743abfe3bbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 13 Apr 2018 12:45:56 +0200 Subject: [PATCH 02/16] reproduced segfault --- dev/hdfs_integration/Dockerfile | 4 +--- dev/hdfs_integration/hdfs_integration.sh | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile index 1d01b9c792e..0194db7feb0 100644 --- a/dev/hdfs_integration/Dockerfile +++ b/dev/hdfs_integration/Dockerfile @@ -14,15 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. # -FROM ubuntu:16.04 +FROM ubuntu:14.04 # Basic OS utilities RUN apt-get update && \ apt-get install -y \ git \ wget \ - gcc-4.9 \ - g++-4.9 \ build-essential # install conda in /home/ubuntu/miniconda diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index 6b774d175b3..f4cebded587 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -28,8 +28,8 @@ export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX export ARROW_HOME=$CONDA_PREFIX export PARQUET_HOME=$CONDA_PREFIX -export CC=gcc-4.9 -export CXX=g++-4.9 +# export CC=gcc-4.9 +# export CXX=g++-4.9 # install arrow mkdir -p arrow/cpp/build From e08d98902c6e37976861fc5b32a592b9ccf2dd3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 13 Apr 2018 12:47:59 +0200 Subject: [PATCH 03/16] ccache --- dev/hdfs_integration/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile index 0194db7feb0..1546b64da2a 100644 --- a/dev/hdfs_integration/Dockerfile +++ b/dev/hdfs_integration/Dockerfile @@ -21,6 +21,7 @@ RUN apt-get update && \ apt-get install -y \ git \ wget \ + ccache \ build-essential # install conda in /home/ubuntu/miniconda From 6cbfdad6d818be553653dabe065d176881b60acb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 24 May 2018 11:43:13 +0200 Subject: [PATCH 04/16] remove conflict garbage --- dev/run_docker_compose.sh | 8 -------- 1 file changed, 8 deletions(-) diff --git a/dev/run_docker_compose.sh b/dev/run_docker_compose.sh index 040369ef9f4..503efd5e1aa 100755 --- a/dev/run_docker_compose.sh +++ b/dev/run_docker_compose.sh @@ -35,13 +35,5 @@ if [ ! 
-d parquet-cpp ]; then exit 1 fi -<<<<<<< HEAD docker-compose -f arrow/dev/docker-compose.yml build "${@}" docker-compose -f arrow/dev/docker-compose.yml run --rm "${@}" -======= -GID=$(id -g ${USERNAME}) -docker-compose -f arrow/dev/docker-compose.yml run \ - --rm "${1}" - -#-u "${UID}:${GID}" "${1}" ->>>>>>> ARROW-2300: [C++/Python] Integration test for HDFS From 25becee8e0a45a4669b56ae3c5ecf0065eb3936c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 5 Jul 2018 16:48:49 +0200 Subject: [PATCH 05/16] build with newer gcc and ninja --- dev/hdfs_integration/Dockerfile | 52 +++++++++++---------- dev/hdfs_integration/hdfs_integration.sh | 59 ++++++++++++++---------- 2 files changed, 62 insertions(+), 49 deletions(-) diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile index 1546b64da2a..a4253601a11 100644 --- a/dev/hdfs_integration/Dockerfile +++ b/dev/hdfs_integration/Dockerfile @@ -14,31 +14,33 @@ # See the License for the specific language governing permissions and # limitations under the License. # -FROM ubuntu:14.04 - -# Basic OS utilities -RUN apt-get update && \ - apt-get install -y \ - git \ - wget \ - ccache \ - build-essential - -# install conda in /home/ubuntu/miniconda -RUN wget -O /tmp/miniconda.sh \ - https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash /tmp/miniconda.sh -b -p /opt/conda && \ - rm /tmp/miniconda.sh - -ENV PATH=/opt/conda/bin:$PATH CONDA_PREFIX=/opt/conda - -# Create Conda environment -RUN conda update conda -y && \ - conda install -c conda-forge \ - python=3.6 numpy six setuptools cython pandas pytest \ - cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib \ - gflags brotli jemalloc lz4-c zstd hdfs3 libhdfs3 && \ - conda clean --all + +FROM ubuntu:18.04 + +RUN apt-get update -y \ + && apt-get install -y \ + gcc-8 \ + g++-8 \ + git \ + wget \ + ninja-build + +ENV CC=gcc-8 +ENV CXX=g++-8 + +# Miniconda - Python 3.6, 64-bit, x86, latest +RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O conda.sh \ + && /bin/bash conda.sh -b -p /opt/conda \ + && rm conda.sh + +ENV PATH="/opt/conda/bin:$PATH" + +# create conda env with deps +RUN conda create -y -q -c conda-forge -n pyarrow-dev \ + python=3.6 numpy six setuptools cython pandas pytest \ + cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib \ + gflags brotli jemalloc lz4-c zstd nomkl libhdfs3 hdfs3 \ + && conda clean --all ADD . 
/apache-arrow WORKDIR /apache-arrow diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index f4cebded587..488e846ae4c 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -22,57 +22,68 @@ set -e # cwd is mounted from host machine to # and contains both arrow and parquet-cpp +# Activate conda environment +source activate pyarrow-dev + +# Set environment variable export ARROW_BUILD_TYPE=debug export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX export ARROW_HOME=$CONDA_PREFIX export PARQUET_HOME=$CONDA_PREFIX -# export CC=gcc-4.9 -# export CXX=g++-4.9 +# For newer GCC per https://arrow.apache.org/docs/python/development.html#known-issues +export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" +export PYARROW_CXXFLAGS=$CXXFLAGS +export PYARROW_CMAKE_GENERATOR=Ninja -# install arrow +# Install arrow-cpp mkdir -p arrow/cpp/build pushd arrow/cpp/build -rm -rf ./* - -cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ +cmake -GNinja \ + -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ - -DARROW_PYTHON=on \ - -DARROW_PLASMA=on \ - -DARROW_HDFS=on \ + -DARROW_PYTHON=ON \ + -DARROW_PLASMA=ON \ + -DARROW_HDFS=ON \ + -DARROW_BUILD_TESTS=ON \ + -DCMAKE_CXX_FLAGS=$CXXFLAGS \ .. -make -j4 -make install +ninja +ninja install + popd -# install parquet-cpp +# Install parquet-cpp mkdir -p parquet-cpp/build pushd parquet-cpp/build -rm -rf ./* - -cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ +cmake -GNinja \ + -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \ - -DPARQUET_BUILD_BENCHMARKS=off \ - -DPARQUET_BUILD_EXECUTABLES=off \ - -DPARQUET_BUILD_TESTS=on \ + -DPARQUET_BUILD_BENCHMARKS=OFF \ + -DPARQUET_BUILD_EXECUTABLES=OFF \ + -DPARQUET_BUILD_TESTS=ON \ + -DCMAKE_CXX_FLAGS=$CXXFLAGS \ .. +ninja +ninja install -make -j4 -make install popd -# install pyarrow +# Install pyarrow pushd arrow/python -python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ - --with-parquet --with-plasma --inplace +python setup.py build_ext \ + --build-type=$ARROW_BUILD_TYPE \ + --with-parquet \ + --with-plasma \ + --inplace popd - +# Run tests arrow/cpp/build/debug/io-hdfs-test python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs From bcbd1f35e0252d7e50988b2052082c86faad2fb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 5 Jul 2018 16:54:03 +0200 Subject: [PATCH 06/16] dockerfile format --- dev/hdfs_integration/Dockerfile | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile index a4253601a11..ef2c005938d 100644 --- a/dev/hdfs_integration/Dockerfile +++ b/dev/hdfs_integration/Dockerfile @@ -21,7 +21,7 @@ RUN apt-get update -y \ && apt-get install -y \ gcc-8 \ g++-8 \ - git \ + git \ wget \ ninja-build @@ -37,9 +37,28 @@ ENV PATH="/opt/conda/bin:$PATH" # create conda env with deps RUN conda create -y -q -c conda-forge -n pyarrow-dev \ - python=3.6 numpy six setuptools cython pandas pytest \ - cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib \ - gflags brotli jemalloc lz4-c zstd nomkl libhdfs3 hdfs3 \ + python=3.6 \ + numpy \ + six \ + setuptools \ + cython \ + pandas \ + pytest \ + cmake \ + flatbuffers \ + rapidjson \ + boost-cpp \ + thrift-cpp \ + snappy \ + zlib \ + gflags \ + brotli \ + jemalloc \ + lz4-c \ + zstd \ + nomkl \ + libhdfs3 \ + hdfs3 \ && conda clean --all ADD . 
/apache-arrow From 2eb227f926968c513d72d39808ab713d901fed8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 9 Jul 2018 10:06:20 +0200 Subject: [PATCH 07/16] install pinned libhdfs3; compile successfully --- dev/hdfs_integration/Dockerfile | 14 +++++++++++--- dev/hdfs_integration/hdfs_integration.sh | 4 +++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile index ef2c005938d..a086966488b 100644 --- a/dev/hdfs_integration/Dockerfile +++ b/dev/hdfs_integration/Dockerfile @@ -35,9 +35,10 @@ RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - ENV PATH="/opt/conda/bin:$PATH" -# create conda env with deps +# create conda env with the required dependences RUN conda create -y -q -c conda-forge -n pyarrow-dev \ python=3.6 \ + nomkl \ numpy \ six \ setuptools \ @@ -56,9 +57,16 @@ RUN conda create -y -q -c conda-forge -n pyarrow-dev \ jemalloc \ lz4-c \ zstd \ - nomkl \ - libhdfs3 \ + && conda clean --all + +# installing in the previous step boost=1.60 and boost-cpp=1.67 gets installed, +# cmake finds 1.60 and parquet fails to compile +# installing it in a separate step, boost=1.60 and boost-cpp=1.64 gets +# installed, cmake finds 1.64 +# libhdfs3 needs to be pinned,see ARROW-1465 and ARROW-1445 +RUN conda install -y -q -n pyarrow-dev -c conda-forge \ hdfs3 \ + libhdfs3=2.2.31 \ && conda clean --all ADD . /apache-arrow diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index 488e846ae4c..29f322c264c 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -37,6 +37,8 @@ export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" export PYARROW_CXXFLAGS=$CXXFLAGS export PYARROW_CMAKE_GENERATOR=Ninja +export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} + # Install arrow-cpp mkdir -p arrow/cpp/build pushd arrow/cpp/build @@ -86,4 +88,4 @@ popd # Run tests arrow/cpp/build/debug/io-hdfs-test -python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs +#python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs From 891ea36c0189607d2a7641a8d0ebe679515499c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 9 Jul 2018 16:07:18 +0200 Subject: [PATCH 08/16] configure libhdfs3 --- dev/hdfs_integration/hdfs_integration.sh | 6 +- .../libhdfs3-client-config.xml | 332 ++++++++++++++++++ 2 files changed, 337 insertions(+), 1 deletion(-) create mode 100644 dev/hdfs_integration/libhdfs3-client-config.xml diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index 29f322c264c..6e6ef938095 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -86,6 +86,10 @@ python setup.py build_ext \ popd # Run tests +export LIBHDFS3_CONF=arrow/dev/hdfs_integration/libhdfs3-client-config.xml + +# C++ arrow/cpp/build/debug/io-hdfs-test -#python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs +# Python +# python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs diff --git a/dev/hdfs_integration/libhdfs3-client-config.xml b/dev/hdfs_integration/libhdfs3-client-config.xml new file mode 100644 index 00000000000..f929929b386 --- /dev/null +++ b/dev/hdfs_integration/libhdfs3-client-config.xml @@ -0,0 +1,332 @@ + + + + + + + + + + + + + + + rpc.client.timeout + 3600000 + + timeout interval of a RPC invocation in millisecond. default is 3600000. 
+ + + + rpc.client.connect.tcpnodelay + true + + whether set socket TCP_NODELAY to true when connect to RPC server. default is true. + + + + + rpc.client.max.idle + 10000 + + the max idle time of a RPC connection in millisecond. default is 10000. + + + + + rpc.client.ping.interval + 10000 + + the interval which the RPC client send a heart beat to server. 0 means disable, default is 10000. + + + + + rpc.client.connect.timeout + 600000 + + the timeout interval in millisecond when the RPC client is trying to setup the connection. default is 600000. + + + + + rpc.client.connect.retry + 10 + + the max retry times if the RPC client fail to setup the connection to server. default is 10. + + + + + rpc.client.read.timeout + 3600000 + + the timeout interval in millisecond when the RPC client is trying to read from server. default is 3600000. + + + + + rpc.client.write.timeout + 3600000 + + the timeout interval in millisecond when the RPC client is trying to write to server. default is 3600000. + + + + + rpc.client.socket.linger.timeout + -1 + + set value to socket SO_LINGER when connect to RPC server. -1 means default OS value. default is -1. + + + + + + dfs.client.read.shortcircuit + false + + whether reading block file bypass datanode if the block and the client are on the same node. default is true. + + + + + dfs.default.replica + 1 + + the default number of replica. default is 3. + + + + + dfs.prefetchsize + 10 + + the default number of blocks which information will be prefetched. default is 10. + + + + + dfs.client.failover.max.attempts + 15 + + if multiply namenodes are configured, it is the max retry times when the dfs client try to issue a RPC call. default is 15. + + + + + dfs.default.blocksize + 134217728 + + default block size. default is 134217728. + + + + + dfs.client.log.severity + INFO + + the minimal log severity level, valid values include FATAL, ERROR, INFO, DEBUG1, DEBUG2, DEBUG3. default is INFO. + + + + + + input.connect.timeout + 600000 + + the timeout interval in millisecond when the input stream is trying to setup the connection to datanode. default is 600000. + + + + + input.read.timeout + 3600000 + + the timeout interval in millisecond when the input stream is trying to read from datanode. default is 3600000. + + + + + input.write.timeout + 3600000 + + the timeout interval in millisecond when the input stream is trying to write to datanode. default is 3600000. + + + + + input.localread.default.buffersize + 2097152 + + number of bytes of the buffer which is used to hold the data from block file and verify checksum. + it is only used when "dfs.client.read.shortcircuit" is set to true. default is 1048576. + + + + + input.localread.blockinfo.cachesize + 1000 + + the size of block file path information cache. default is 1000. + + + + + input.read.getblockinfo.retry + 3 + + the max retry times when the client fail to get block information from namenode. default is 3. + + + + + + output.replace-datanode-on-failure + false + + whether the client add new datanode into pipeline if the number of nodes in pipeline is less the specified number of replicas. default is false. + + + + + output.default.chunksize + 512 + + the number of bytes of a chunk in pipeline. default is 512. + + + + + output.default.packetsize + 65536 + + the number of bytes of a packet in pipeline. default is 65536. + + + + + output.default.write.retry + 10 + + the max retry times when the client fail to setup the pipeline. default is 10. 
+ + + + + output.connect.timeout + 600000 + + the timeout interval in millisecond when the output stream is trying to setup the connection to datanode. default is 600000. + + + + + output.read.timeout + 3600000 + + the timeout interval in millisecond when the output stream is trying to read from datanode. default is 3600000. + + + + + output.write.timeout + 3600000 + + the timeout interval in millisecond when the output stream is trying to write to datanode. default is 3600000. + + + + + output.packetpool.size + 1024 + + the max number of packets in a file's packet pool. default is 1024. + + + + + output.close.timeout + 900000 + + the timeout interval in millisecond when close an output stream. default is 900000. + + + + + dfs.domain.socket.path + /var/lib/hadoop-hdfs/dn_socket + + Optional. This is a path to a UNIX domain socket that will be used for + communication between the DataNode and local HDFS clients. + If the string "_PORT" is present in this path, it will be replaced by the + TCP port of the DataNode. + + + + + dfs.client.use.legacy.blockreader.local + false + + Legacy short-circuit reader implementation based on HDFS-2246 is used + if this configuration parameter is true. + This is for the platforms other than Linux + where the new implementation based on HDFS-347 is not available. + + + + From 7e3c03328fd5395accbe73b1c7119efe9debf3b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 9 Jul 2018 20:55:28 +0200 Subject: [PATCH 09/16] deepend on hadoop image to have libhdfs preinstalled --- dev/hdfs_integration/Dockerfile | 16 ++++++++-------- dev/hdfs_integration/hdfs_integration.sh | 5 ++--- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile index a086966488b..df45338bd9b 100644 --- a/dev/hdfs_integration/Dockerfile +++ b/dev/hdfs_integration/Dockerfile @@ -15,26 +15,26 @@ # limitations under the License. 
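#
# Why gelog/hadoop: building on the same image that backs the hdfs-namenode
# and hdfs-datanode compose services gives this container a matching Hadoop
# install, so libhdfs (the JNI client) and the Hadoop jars are already present.
# A sketch of what the integration script is then expected to export (the
# exact paths depend on the base image's HADOOP_HOME):
#
#   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/
#   export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
#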
# -FROM ubuntu:18.04 +# FROM ubuntu:18.04 +FROM gelog/hadoop + +ENV CC=gcc \ + CXX=g++ \ + PATH=/opt/conda/bin:$PATH RUN apt-get update -y \ && apt-get install -y \ - gcc-8 \ - g++-8 \ + gcc \ + g++ \ git \ wget \ ninja-build -ENV CC=gcc-8 -ENV CXX=g++-8 - # Miniconda - Python 3.6, 64-bit, x86, latest RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O conda.sh \ && /bin/bash conda.sh -b -p /opt/conda \ && rm conda.sh -ENV PATH="/opt/conda/bin:$PATH" - # create conda env with the required dependences RUN conda create -y -q -c conda-forge -n pyarrow-dev \ python=3.6 \ diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index 6e6ef938095..7ac0afe377d 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -37,8 +37,6 @@ export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" export PYARROW_CXXFLAGS=$CXXFLAGS export PYARROW_CMAKE_GENERATOR=Ninja -export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} - # Install arrow-cpp mkdir -p arrow/cpp/build pushd arrow/cpp/build @@ -86,10 +84,11 @@ python setup.py build_ext \ popd # Run tests +export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` export LIBHDFS3_CONF=arrow/dev/hdfs_integration/libhdfs3-client-config.xml # C++ arrow/cpp/build/debug/io-hdfs-test # Python -# python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs +python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs From c910538f9ff7bb83c4dcbb559fd1ac0493491712 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 23 Jul 2018 11:51:22 +0200 Subject: [PATCH 10/16] use filesystem.read_parquet for both directory and single file path --- python/pyarrow/parquet.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 85cec6712bb..2831c41c1d3 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1004,9 +1004,8 @@ def read_table(source, columns=None, nthreads=1, metadata=None, use_pandas_metadata=False): if is_path(source): fs = _get_fs_from_path(source) - - if fs.isdir(source): - return fs.read_parquet(source, columns=columns, metadata=metadata) + return fs.read_parquet(source, columns=columns, metadata=metadata, + use_pandas_metadata=use_pandas_metadata) pf = ParquetFile(source, metadata=metadata) return pf.read(columns=columns, nthreads=nthreads, From cde939a9ae1e6f4850ffe932be5b5f4e310efb38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 23 Jul 2018 11:55:30 +0200 Subject: [PATCH 11/16] fix path_or_paths checking in _make_manifest --- python/pyarrow/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 2831c41c1d3..70c70b62aee 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -944,7 +944,7 @@ def _make_manifest(path_or_paths, fs, pathsep='/'): common_metadata_path = None metadata_path = None - if len(path_or_paths) == 1: + if isinstance(path_or_paths, list) and len(path_or_paths) == 1: # Dask passes a directory as a list of length 1 path_or_paths = path_or_paths[0] From 3f8318e1a40d6ef5ae4d1ca53e9db836549cd3ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 23 Jul 2018 14:24:06 +0200 Subject: [PATCH 12/16] display errno in error msg; add pkgconfig; export hadoop env vars --- cpp/src/arrow/io/hdfs.cc | 7 ++++--- dev/hdfs_integration/Dockerfile | 12 
++++++------ dev/hdfs_integration/hdfs_integration.sh | 13 ++++++++----- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 73201325023..cf793e64f98 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -434,16 +434,17 @@ class HadoopFileSystem::HadoopFileSystemImpl { // If the directory is empty, entries is NULL but errno is 0. Non-zero // errno indicates error // - // Note: errno is thread-locala + // Note: errno is thread-local if (errno == 0) { num_entries = 0; } else { - return Status::IOError("HDFS: list directory failed"); + std::stringstream ss; + ss << "HDFS list directory failed, errno: " << errno; + return Status::IOError(ss.str()); } } // Allocate additional space for elements - int vec_offset = static_cast(listing->size()); listing->resize(vec_offset + num_entries); diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile index df45338bd9b..71dcbe3aa2c 100644 --- a/dev/hdfs_integration/Dockerfile +++ b/dev/hdfs_integration/Dockerfile @@ -15,7 +15,6 @@ # limitations under the License. # -# FROM ubuntu:18.04 FROM gelog/hadoop ENV CC=gcc \ @@ -24,11 +23,12 @@ ENV CC=gcc \ RUN apt-get update -y \ && apt-get install -y \ - gcc \ - g++ \ - git \ - wget \ - ninja-build + gcc \ + g++ \ + git \ + wget \ + pkg-config \ + ninja-build # Miniconda - Python 3.6, 64-bit, x86, latest RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O conda.sh \ diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index 7ac0afe377d..c699263918c 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -25,13 +25,17 @@ set -e # Activate conda environment source activate pyarrow-dev -# Set environment variable +# Arrow build variables export ARROW_BUILD_TYPE=debug export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX export ARROW_HOME=$CONDA_PREFIX export PARQUET_HOME=$CONDA_PREFIX +# Hadoop variables +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/ +export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` + # For newer GCC per https://arrow.apache.org/docs/python/development.html#known-issues export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" export PYARROW_CXXFLAGS=$CXXFLAGS @@ -84,11 +88,10 @@ python setup.py build_ext \ popd # Run tests -export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` export LIBHDFS3_CONF=arrow/dev/hdfs_integration/libhdfs3-client-config.xml -# C++ -arrow/cpp/build/debug/io-hdfs-test - # Python python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs + +# C++ +arrow/cpp/build/debug/io-hdfs-test From 78857353face7ec7c63443106f41a07a76106fd9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 23 Jul 2018 14:52:16 -0400 Subject: [PATCH 13/16] Remove python/testing/*hdfs*, add instructions to dev/README.md for running hdfs tests Change-Id: I8e2908c9ad2d81596f427858f2af7e2d151bfb1c --- dev/README.md | 9 +- python/testing/README.md | 6 - python/testing/hdfs/Dockerfile | 50 --- python/testing/hdfs/libhdfs3-hdfs-client.xml | 332 ------------------ .../testing/hdfs/restart_docker_container.sh | 38 -- python/testing/hdfs/run_tests.sh | 41 --- 6 files changed, 8 insertions(+), 468 deletions(-) delete mode 100644 python/testing/hdfs/Dockerfile delete mode 100644 python/testing/hdfs/libhdfs3-hdfs-client.xml delete mode 100644 python/testing/hdfs/restart_docker_container.sh delete mode 100755 
python/testing/hdfs/run_tests.sh diff --git a/dev/README.md b/dev/README.md index 62ffb0a8f14..971fb5f1ddb 100644 --- a/dev/README.md +++ b/dev/README.md @@ -120,4 +120,11 @@ For JavaScript-specific releases, use a different verification script: ```shell bash dev/release/js-verify-release-candidate.sh 0.7.0 0 -``` \ No newline at end of file +``` +# Integration testing + +## HDFS C++ / Python support + +```shell +run_docker_compose.sh hdfs_integration +``` diff --git a/python/testing/README.md b/python/testing/README.md index 0ebeec4a1c3..d7d0ff0bb7f 100644 --- a/python/testing/README.md +++ b/python/testing/README.md @@ -19,12 +19,6 @@ # Testing tools for odds and ends -## Testing HDFS file interface - -```shell -./test_hdfs.sh -``` - ## Testing Dask integration Initial integration testing with Dask has been Dockerized. diff --git a/python/testing/hdfs/Dockerfile b/python/testing/hdfs/Dockerfile deleted file mode 100644 index 97355137ff3..00000000000 --- a/python/testing/hdfs/Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# TODO Replace this with a complete clean image build -FROM cpcloud86/impala:metastore - -USER root - -RUN apt-add-repository -y ppa:ubuntu-toolchain-r/test && \ - apt-get update && \ - apt-get install -y \ - gcc-4.9 \ - g++-4.9 \ - build-essential \ - autotools-dev \ - autoconf \ - gtk-doc-tools \ - autoconf-archive \ - libgirepository1.0-dev \ - libtool \ - libjemalloc-dev \ - ccache \ - valgrind \ - gdb - -RUN wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key|sudo apt-key add - && \ - apt-add-repository -y \ - "deb http://llvm.org/apt/trusty/ llvm-toolchain-trusty-4.0 main" && \ - apt-get update && \ - apt-get install -y clang-4.0 clang-format-4.0 clang-tidy-4.0 - -USER ubuntu - -RUN wget -O /tmp/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ - rm /tmp/miniconda.sh diff --git a/python/testing/hdfs/libhdfs3-hdfs-client.xml b/python/testing/hdfs/libhdfs3-hdfs-client.xml deleted file mode 100644 index f929929b386..00000000000 --- a/python/testing/hdfs/libhdfs3-hdfs-client.xml +++ /dev/null @@ -1,332 +0,0 @@ - - - - - - - - - - - - - - - rpc.client.timeout - 3600000 - - timeout interval of a RPC invocation in millisecond. default is 3600000. - - - - rpc.client.connect.tcpnodelay - true - - whether set socket TCP_NODELAY to true when connect to RPC server. default is true. - - - - - rpc.client.max.idle - 10000 - - the max idle time of a RPC connection in millisecond. default is 10000. - - - - - rpc.client.ping.interval - 10000 - - the interval which the RPC client send a heart beat to server. 0 means disable, default is 10000. 
- - - - - rpc.client.connect.timeout - 600000 - - the timeout interval in millisecond when the RPC client is trying to setup the connection. default is 600000. - - - - - rpc.client.connect.retry - 10 - - the max retry times if the RPC client fail to setup the connection to server. default is 10. - - - - - rpc.client.read.timeout - 3600000 - - the timeout interval in millisecond when the RPC client is trying to read from server. default is 3600000. - - - - - rpc.client.write.timeout - 3600000 - - the timeout interval in millisecond when the RPC client is trying to write to server. default is 3600000. - - - - - rpc.client.socket.linger.timeout - -1 - - set value to socket SO_LINGER when connect to RPC server. -1 means default OS value. default is -1. - - - - - - dfs.client.read.shortcircuit - false - - whether reading block file bypass datanode if the block and the client are on the same node. default is true. - - - - - dfs.default.replica - 1 - - the default number of replica. default is 3. - - - - - dfs.prefetchsize - 10 - - the default number of blocks which information will be prefetched. default is 10. - - - - - dfs.client.failover.max.attempts - 15 - - if multiply namenodes are configured, it is the max retry times when the dfs client try to issue a RPC call. default is 15. - - - - - dfs.default.blocksize - 134217728 - - default block size. default is 134217728. - - - - - dfs.client.log.severity - INFO - - the minimal log severity level, valid values include FATAL, ERROR, INFO, DEBUG1, DEBUG2, DEBUG3. default is INFO. - - - - - - input.connect.timeout - 600000 - - the timeout interval in millisecond when the input stream is trying to setup the connection to datanode. default is 600000. - - - - - input.read.timeout - 3600000 - - the timeout interval in millisecond when the input stream is trying to read from datanode. default is 3600000. - - - - - input.write.timeout - 3600000 - - the timeout interval in millisecond when the input stream is trying to write to datanode. default is 3600000. - - - - - input.localread.default.buffersize - 2097152 - - number of bytes of the buffer which is used to hold the data from block file and verify checksum. - it is only used when "dfs.client.read.shortcircuit" is set to true. default is 1048576. - - - - - input.localread.blockinfo.cachesize - 1000 - - the size of block file path information cache. default is 1000. - - - - - input.read.getblockinfo.retry - 3 - - the max retry times when the client fail to get block information from namenode. default is 3. - - - - - - output.replace-datanode-on-failure - false - - whether the client add new datanode into pipeline if the number of nodes in pipeline is less the specified number of replicas. default is false. - - - - - output.default.chunksize - 512 - - the number of bytes of a chunk in pipeline. default is 512. - - - - - output.default.packetsize - 65536 - - the number of bytes of a packet in pipeline. default is 65536. - - - - - output.default.write.retry - 10 - - the max retry times when the client fail to setup the pipeline. default is 10. - - - - - output.connect.timeout - 600000 - - the timeout interval in millisecond when the output stream is trying to setup the connection to datanode. default is 600000. - - - - - output.read.timeout - 3600000 - - the timeout interval in millisecond when the output stream is trying to read from datanode. default is 3600000. - - - - - output.write.timeout - 3600000 - - the timeout interval in millisecond when the output stream is trying to write to datanode. 
default is 3600000. - - - - - output.packetpool.size - 1024 - - the max number of packets in a file's packet pool. default is 1024. - - - - - output.close.timeout - 900000 - - the timeout interval in millisecond when close an output stream. default is 900000. - - - - - dfs.domain.socket.path - /var/lib/hadoop-hdfs/dn_socket - - Optional. This is a path to a UNIX domain socket that will be used for - communication between the DataNode and local HDFS clients. - If the string "_PORT" is present in this path, it will be replaced by the - TCP port of the DataNode. - - - - - dfs.client.use.legacy.blockreader.local - false - - Legacy short-circuit reader implementation based on HDFS-2246 is used - if this configuration parameter is true. - This is for the platforms other than Linux - where the new implementation based on HDFS-347 is not available. - - - - diff --git a/python/testing/hdfs/restart_docker_container.sh b/python/testing/hdfs/restart_docker_container.sh deleted file mode 100644 index 15076cc2873..00000000000 --- a/python/testing/hdfs/restart_docker_container.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -export ARROW_TEST_NN_HOST=arrow-hdfs -export ARROW_TEST_IMPALA_HOST=$ARROW_TEST_NN_HOST -export ARROW_TEST_IMPALA_PORT=21050 -export ARROW_TEST_WEBHDFS_PORT=50070 -export ARROW_TEST_WEBHDFS_USER=ubuntu - -docker stop $ARROW_TEST_NN_HOST -docker rm $ARROW_TEST_NN_HOST - -docker run -d -it --name $ARROW_TEST_NN_HOST \ - -v $PWD:/io \ - --hostname $ARROW_TEST_NN_HOST \ - --shm-size=2gb \ - -p $ARROW_TEST_WEBHDFS_PORT -p $ARROW_TEST_IMPALA_PORT \ - arrow-hdfs-test - -while ! docker exec $ARROW_TEST_NN_HOST impala-shell -q 'SELECT VERSION()'; do - sleep 1 -done diff --git a/python/testing/hdfs/run_tests.sh b/python/testing/hdfs/run_tests.sh deleted file mode 100755 index e0d36df58a3..00000000000 --- a/python/testing/hdfs/run_tests.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -ex - -HERE=$(cd `dirname "${BASH_SOURCE[0]:-$0}"` && pwd) - -source $HERE/../set_env_common.sh -source $HERE/../setup_toolchain.sh -source $HERE/../functions.sh - -git clone https://github.com/apache/arrow.git $ARROW_CHECKOUT - -use_clang - -bootstrap_python_env 3.6 - -build_arrow -build_parquet - -build_pyarrow - -$ARROW_CPP_BUILD_DIR/debug/io-hdfs-test - -python -m pytest -vv -r sxX -s $PYARROW_PATH --parquet --hdfs From 5d3e6ff666df338f20d6297a21dad01224b1515f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 23 Jul 2018 15:04:27 -0400 Subject: [PATCH 14/16] Give HDFS integration C++ build a distinct subdir name Change-Id: I44f57bc5b3ea28966e1562e404bdce65afe0cfab --- dev/hdfs_integration/hdfs_integration.sh | 10 +++++----- dev/spark_integration/spark_integration.sh | 3 +-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index c699263918c..795e9362915 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -42,8 +42,8 @@ export PYARROW_CXXFLAGS=$CXXFLAGS export PYARROW_CMAKE_GENERATOR=Ninja # Install arrow-cpp -mkdir -p arrow/cpp/build -pushd arrow/cpp/build +mkdir -p arrow/cpp/hdfs-integration-build +pushd arrow/cpp/hdfs-integration-build cmake -GNinja \ -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ @@ -60,8 +60,8 @@ ninja install popd # Install parquet-cpp -mkdir -p parquet-cpp/build -pushd parquet-cpp/build +mkdir -p parquet-cpp/hdfs-integration-build +pushd parquet-cpp/hdfs-integration-build cmake -GNinja \ -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ @@ -94,4 +94,4 @@ export LIBHDFS3_CONF=arrow/dev/hdfs_integration/libhdfs3-client-config.xml python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs # C++ -arrow/cpp/build/debug/io-hdfs-test +arrow/cpp/hdfs-integration-build/debug/io-hdfs-test diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh index 8ca4dc3ac97..6c0a3f0bb5d 100755 --- a/dev/spark_integration/spark_integration.sh +++ b/dev/spark_integration/spark_integration.sh @@ -87,6 +87,5 @@ build/mvn -Dtest=none -DwildcardSuites="$SPARK_SCALA_TESTS" test # Run pyarrow related Python tests only SPARK_PYTHON_TESTS="ArrowTests PandasUDFTests ScalarPandasUDFTests GroupedMapPandasUDFTests GroupedAggPandasUDFTests" echo "Testing PySpark: $SPARK_PYTHON_TESTS" -SPARK_TESTING=1 bin/pyspark pyspark.sql.tests $SPARK_PYTHON_TESTS +SPARK_TESTING=1 bin/pyspark pyspark.sql.tests $SPARK_PYTHON_TESTS popd - From 3f95b226985ad2e60107ccf1ab2a877a93b466ad Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 23 Jul 2018 17:31:43 -0400 Subject: [PATCH 15/16] More robust to state of local dev area Change-Id: I1f9879ab7eb150f38c66d29cfe8b41792a7b5cf8 --- dev/docker-compose.yml | 1 + dev/hdfs_integration/hdfs_integration.sh | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/dev/docker-compose.yml b/dev/docker-compose.yml index 74d45c63bbd..c4500986caf 100644 --- a/dev/docker-compose.yml +++ b/dev/docker-compose.yml @@ -19,6 +19,7 @@ services: hdfs-namenode: image: gelog/hadoop + shm_size: 2G ports: - "9000:9000" - "50070:50070" diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index 795e9362915..c67f18d28a0 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -79,6 +79,9 @@ popd # Install pyarrow pushd arrow/python +# Clear the build directory so we are guaranteed a fresh set of 
extensions +rm -rf build/ + python setup.py build_ext \ --build-type=$ARROW_BUILD_TYPE \ --with-parquet \ @@ -91,7 +94,8 @@ popd export LIBHDFS3_CONF=arrow/dev/hdfs_integration/libhdfs3-client-config.xml # Python -python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs +python -m pytest -vv -r sxX -s arrow/python/pyarrow \ + --only-parquet --only-hdfs # C++ arrow/cpp/hdfs-integration-build/debug/io-hdfs-test From 6dbbfb527a52da1db68058571b765bb6a3fe7667 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 23 Jul 2018 18:33:11 -0400 Subject: [PATCH 16/16] Apply Krisztian's errno 2 fix, nicer formatting for libhdfs errors Change-Id: I5290d60c15d51271c51df7565eb5fb1cadd4ff5e --- cpp/src/arrow/io/hdfs.cc | 42 +++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index cf793e64f98..789ffbd057a 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -37,28 +37,16 @@ using std::size_t; namespace arrow { namespace io { -#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ - do { \ - if (RETURN_VALUE == -1) { \ - std::stringstream ss; \ - ss << "HDFS: " << WHAT << " failed"; \ - return Status::IOError(ss.str()); \ - } \ +#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ + do { \ + if (RETURN_VALUE == -1) { \ + std::stringstream ss; \ + ss << "HDFS " << WHAT << " failed, errno: " << errno << " (" << strerror(errno) \ + << ")"; \ + return Status::IOError(ss.str()); \ + } \ } while (0) -static Status CheckReadResult(int ret) { - // Check for error on -1 (possibly errno set) - - // ret == 0 at end of file, which is OK - if (ret == -1) { - // EOF - std::stringstream ss; - ss << "HDFS read failed, errno: " << errno; - return Status::IOError(ss.str()); - } - return Status::OK(); -} - static constexpr int kDefaultHdfsBufferSize = 1 << 16; // ---------------------------------------------------------------------- @@ -129,7 +117,7 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { RETURN_NOT_OK(Seek(position)); return Read(nbytes, bytes_read, buffer); } - RETURN_NOT_OK(CheckReadResult(ret)); + CHECK_FAILURE(ret, "read"); *bytes_read = ret; return Status::OK(); } @@ -156,7 +144,7 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { tSize ret = driver_->Read( fs_, file_, reinterpret_cast(buffer) + total_bytes, static_cast(std::min(buffer_size_, nbytes - total_bytes))); - RETURN_NOT_OK(CheckReadResult(ret)); + CHECK_FAILURE(ret, "read"); total_bytes += ret; if (ret == 0) { break; @@ -428,6 +416,7 @@ class HadoopFileSystem::HadoopFileSystemImpl { Status ListDirectory(const std::string& path, std::vector* listing) { int num_entries = 0; + errno = 0; hdfsFileInfo* entries = driver_->ListDirectory(fs_, path.c_str(), &num_entries); if (entries == nullptr) { @@ -435,11 +424,16 @@ class HadoopFileSystem::HadoopFileSystemImpl { // errno indicates error // // Note: errno is thread-local - if (errno == 0) { + // + // XXX(wesm): ARROW-2300; we found with Hadoop 2.6 that libhdfs would set + // errno 2/ENOENT for empty directories. To be more robust to this we + // double check this case + if ((errno == 0) || (errno == ENOENT && Exists(path))) { num_entries = 0; } else { std::stringstream ss; - ss << "HDFS list directory failed, errno: " << errno; + ss << "HDFS list directory failed, errno: " << errno << " (" << strerror(errno) + << ")"; return Status::IOError(ss.str()); } }