diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc
index 73201325023..789ffbd057a 100644
--- a/cpp/src/arrow/io/hdfs.cc
+++ b/cpp/src/arrow/io/hdfs.cc
@@ -37,28 +37,16 @@ using std::size_t;
 namespace arrow {
 namespace io {
 
-#define CHECK_FAILURE(RETURN_VALUE, WHAT)   \
-  do {                                      \
-    if (RETURN_VALUE == -1) {               \
-      std::stringstream ss;                 \
-      ss << "HDFS: " << WHAT << " failed";  \
-      return Status::IOError(ss.str());     \
-    }                                       \
+#define CHECK_FAILURE(RETURN_VALUE, WHAT)                                              \
+  do {                                                                                 \
+    if (RETURN_VALUE == -1) {                                                          \
+      std::stringstream ss;                                                            \
+      ss << "HDFS " << WHAT << " failed, errno: " << errno << " (" << strerror(errno)  \
+         << ")";                                                                       \
+      return Status::IOError(ss.str());                                                \
+    }                                                                                  \
   } while (0)
 
-static Status CheckReadResult(int ret) {
-  // Check for error on -1 (possibly errno set)
-
-  // ret == 0 at end of file, which is OK
-  if (ret == -1) {
-    // EOF
-    std::stringstream ss;
-    ss << "HDFS read failed, errno: " << errno;
-    return Status::IOError(ss.str());
-  }
-  return Status::OK();
-}
-
 static constexpr int kDefaultHdfsBufferSize = 1 << 16;
 
 // ----------------------------------------------------------------------
@@ -129,7 +117,7 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl {
       RETURN_NOT_OK(Seek(position));
       return Read(nbytes, bytes_read, buffer);
     }
-    RETURN_NOT_OK(CheckReadResult(ret));
+    CHECK_FAILURE(ret, "read");
     *bytes_read = ret;
     return Status::OK();
   }
@@ -156,7 +144,7 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl {
       tSize ret = driver_->Read(
           fs_, file_, reinterpret_cast<uint8_t*>(buffer) + total_bytes,
          static_cast<tSize>(std::min(buffer_size_, nbytes - total_bytes)));
-      RETURN_NOT_OK(CheckReadResult(ret));
+      CHECK_FAILURE(ret, "read");
       total_bytes += ret;
       if (ret == 0) {
         break;
@@ -428,22 +416,29 @@ class HadoopFileSystem::HadoopFileSystemImpl {
 
   Status ListDirectory(const std::string& path, std::vector<HdfsPathInfo>* listing) {
     int num_entries = 0;
+    errno = 0;
     hdfsFileInfo* entries = driver_->ListDirectory(fs_, path.c_str(), &num_entries);
 
     if (entries == nullptr) {
       // If the directory is empty, entries is NULL but errno is 0. Non-zero
       // errno indicates error
       //
-      // Note: errno is thread-locala
-      if (errno == 0) {
+      // Note: errno is thread-local
+      //
+      // XXX(wesm): ARROW-2300; we found with Hadoop 2.6 that libhdfs would set
+      // errno 2/ENOENT for empty directories. To be more robust to this we
+      // double check this case
+      if ((errno == 0) || (errno == ENOENT && Exists(path))) {
         num_entries = 0;
       } else {
-        return Status::IOError("HDFS: list directory failed");
+        std::stringstream ss;
+        ss << "HDFS list directory failed, errno: " << errno << " (" << strerror(errno)
+           << ")";
+        return Status::IOError(ss.str());
      }
    }
 
     // Allocate additional space for elements
-
     int vec_offset = static_cast<int>(listing->size());
     listing->resize(vec_offset + num_entries);
 
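The empty-directory behavior the ListDirectory change above guards against can be reproduced from Python once the integration cluster defined below is running. This is a minimal sketch, not part of the patch; the host, port, and user values are assumptions that mirror the hdfs_integration environment further down.

```python
import pyarrow as pa

# Assumed connection details; they mirror ARROW_HDFS_TEST_* in docker-compose.yml
fs = pa.hdfs.connect('hdfs-namenode', 9000, user='root')

path = '/tmp/arrow-empty-dir-check'
fs.mkdir(path)
try:
    # Hadoop 2.6 libhdfs may report ENOENT for an existing-but-empty directory;
    # with the errno double check above this should come back as an empty
    # listing rather than an IOError
    assert fs.ls(path) == []
finally:
    fs.delete(path, recursive=True)
```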
diff --git a/dev/README.md b/dev/README.md
index 62ffb0a8f14..971fb5f1ddb 100644
--- a/dev/README.md
+++ b/dev/README.md
@@ -120,4 +120,11 @@ For JavaScript-specific releases, use a different verification script:
 
 ```shell
 bash dev/release/js-verify-release-candidate.sh 0.7.0 0
-```
\ No newline at end of file
+```
+
+# Integration testing
+
+## HDFS C++ / Python support
+
+```shell
+run_docker_compose.sh hdfs_integration
+```
diff --git a/dev/docker-compose.yml b/dev/docker-compose.yml
index b1e593cf480..c4500986caf 100644
--- a/dev/docker-compose.yml
+++ b/dev/docker-compose.yml
@@ -16,25 +16,61 @@
 version: '3'
 services:
-  gen_apidocs:
+
+  hdfs-namenode:
+    image: gelog/hadoop
+    shm_size: 2G
+    ports:
+      - "9000:9000"
+      - "50070:50070"
+    command: hdfs namenode
+    hostname: hdfs-namenode
+
+  hdfs-datanode:
+    image: gelog/hadoop
+    command: hdfs datanode
+    ports:
+      # The host port is randomly assigned by Docker, to allow scaling
+      # to multiple DataNodes on the same host
+      - "50075"
+    links:
+      - hdfs-namenode:hdfs-namenode
+
+  hdfs_integration:
+    links:
+      - hdfs-namenode:hdfs-namenode
+      - hdfs-datanode:hdfs-datanode
+    environment:
+      - ARROW_HDFS_TEST_HOST=hdfs-namenode
+      - ARROW_HDFS_TEST_PORT=9000
+      - ARROW_HDFS_TEST_USER=root
     build:
-      context: gen_apidocs
+      context: hdfs_integration
     volumes:
      - ../..:/apache-arrow
-  run_site:
+
+  spark_integration:
     build:
-      context: run_site
-    ports:
-      - "4000:4000"
+      context: spark_integration
     volumes:
-     - ../..:/apache-arrow
+      - ../..:/apache-arrow
+
   dask_integration:
     build:
       context: dask_integration
     volumes:
-     - ../..:/apache-arrow
-  spark_integration:
-    build:
-      context: spark_integration
+      - ../..:/apache-arrow
+
+  gen_apidocs:
+    build:
+      context: gen_apidocs
     volumes:
      - ../..:/apache-arrow
+
+  run_site:
+    build:
+      context: run_site
+    ports:
+      - "4000:4000"
+    volumes:
+      - ../..:/apache-arrow
 
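Before launching the hdfs_integration service it can be handy to confirm the NameNode container actually came up. A throwaway check from the host, not part of the patch, assuming the port mappings declared above:

```python
import socket

# 9000 is the HDFS RPC port, 50070 the NameNode web UI / WebHDFS port; both
# are published to the host by the hdfs-namenode service above
for port in (9000, 50070):
    with socket.create_connection(('localhost', port), timeout=5):
        print('namenode reachable on port %d' % port)
```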
diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile
new file mode 100644
index 00000000000..71dcbe3aa2c
--- /dev/null
+++ b/dev/hdfs_integration/Dockerfile
@@ -0,0 +1,75 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+FROM gelog/hadoop
+
+ENV CC=gcc \
+    CXX=g++ \
+    PATH=/opt/conda/bin:$PATH
+
+RUN apt-get update -y \
+    && apt-get install -y \
+    gcc \
+    g++ \
+    git \
+    wget \
+    pkg-config \
+    ninja-build
+
+# Miniconda - Python 3.6, 64-bit, x86, latest
+RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O conda.sh \
+    && /bin/bash conda.sh -b -p /opt/conda \
+    && rm conda.sh
+
+# create conda env with the required dependencies
+RUN conda create -y -q -c conda-forge -n pyarrow-dev \
+    python=3.6 \
+    nomkl \
+    numpy \
+    six \
+    setuptools \
+    cython \
+    pandas \
+    pytest \
+    cmake \
+    flatbuffers \
+    rapidjson \
+    boost-cpp \
+    thrift-cpp \
+    snappy \
+    zlib \
+    gflags \
+    brotli \
+    jemalloc \
+    lz4-c \
+    zstd \
+    && conda clean --all
+
+# installing in the previous step boost=1.60 and boost-cpp=1.67 gets installed,
+# cmake finds 1.60 and parquet fails to compile
+# installing it in a separate step, boost=1.60 and boost-cpp=1.64 gets
+# installed, cmake finds 1.64
+# libhdfs3 needs to be pinned, see ARROW-1465 and ARROW-1445
+RUN conda install -y -q -n pyarrow-dev -c conda-forge \
+    hdfs3 \
+    libhdfs3=2.2.31 \
+    && conda clean --all
+
+ADD . /apache-arrow
+WORKDIR /apache-arrow
+
+CMD arrow/dev/hdfs_integration/hdfs_integration.sh
diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh
new file mode 100755
index 00000000000..c67f18d28a0
--- /dev/null
+++ b/dev/hdfs_integration/hdfs_integration.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Exit on any error
+set -e
+
+# cwd is mounted from host machine to
+# and contains both arrow and parquet-cpp
+
+# Activate conda environment
+source activate pyarrow-dev
+
+# Arrow build variables
+export ARROW_BUILD_TYPE=debug
+export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX
+export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX
+export ARROW_HOME=$CONDA_PREFIX
+export PARQUET_HOME=$CONDA_PREFIX
+
+# Hadoop variables
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/
+export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
+
+# For newer GCC per https://arrow.apache.org/docs/python/development.html#known-issues
+export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0"
+export PYARROW_CXXFLAGS=$CXXFLAGS
+export PYARROW_CMAKE_GENERATOR=Ninja
+
+# Install arrow-cpp
+mkdir -p arrow/cpp/hdfs-integration-build
+pushd arrow/cpp/hdfs-integration-build
+
+cmake -GNinja \
+      -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \
+      -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
+      -DARROW_PYTHON=ON \
+      -DARROW_PLASMA=ON \
+      -DARROW_HDFS=ON \
+      -DARROW_BUILD_TESTS=ON \
+      -DCMAKE_CXX_FLAGS=$CXXFLAGS \
+      ..
+ninja
+ninja install
+
+popd
+
+# Install parquet-cpp
+mkdir -p parquet-cpp/hdfs-integration-build
+pushd parquet-cpp/hdfs-integration-build
+
+cmake -GNinja \
+      -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \
+      -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \
+      -DPARQUET_BUILD_BENCHMARKS=OFF \
+      -DPARQUET_BUILD_EXECUTABLES=OFF \
+      -DPARQUET_BUILD_TESTS=ON \
+      -DCMAKE_CXX_FLAGS=$CXXFLAGS \
+      ..
+ninja
+ninja install
+
+popd
+
+# Install pyarrow
+pushd arrow/python
+
+# Clear the build directory so we are guaranteed a fresh set of extensions
+rm -rf build/
+
+python setup.py build_ext \
+       --build-type=$ARROW_BUILD_TYPE \
+       --with-parquet \
+       --with-plasma \
+       --inplace
+
+popd
+
+# Run tests
+export LIBHDFS3_CONF=arrow/dev/hdfs_integration/libhdfs3-client-config.xml
+
+# Python
+python -m pytest -vv -r sxX -s arrow/python/pyarrow \
+       --only-parquet --only-hdfs
+
+# C++
+arrow/cpp/hdfs-integration-build/debug/io-hdfs-test
diff --git a/python/testing/hdfs/libhdfs3-hdfs-client.xml b/dev/hdfs_integration/libhdfs3-client-config.xml
similarity index 100%
rename from python/testing/hdfs/libhdfs3-hdfs-client.xml
rename to dev/hdfs_integration/libhdfs3-client-config.xml
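pyarrow's libhdfs driver needs the same Hadoop environment that hdfs_integration.sh exports (CLASSPATH from `hadoop classpath --glob`, the native libs on LD_LIBRARY_PATH). Below is a hypothetical preflight along those lines, useful when running the suite outside the container; the HADOOP_HOME fallback path is an assumption.

```python
import os
import subprocess

import pyarrow as pa

# Assumed fallback; inside the gelog/hadoop image HADOOP_HOME is already set
hadoop_home = os.environ.get('HADOOP_HOME', '/usr/local/hadoop')

# libhdfs is loaded over JNI, so the JVM needs the same CLASSPATH the
# integration script exports before running the tests
if not os.environ.get('CLASSPATH'):
    os.environ['CLASSPATH'] = subprocess.check_output(
        [os.path.join(hadoop_home, 'bin', 'hadoop'), 'classpath', '--glob']
    ).decode().strip()

fs = pa.hdfs.connect(
    os.environ.get('ARROW_HDFS_TEST_HOST', 'hdfs-namenode'),
    int(os.environ.get('ARROW_HDFS_TEST_PORT', '9000')),
    user=os.environ.get('ARROW_HDFS_TEST_USER', 'root'))
print(fs.ls('/'))
```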
diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh
index 8ca4dc3ac97..6c0a3f0bb5d 100755
--- a/dev/spark_integration/spark_integration.sh
+++ b/dev/spark_integration/spark_integration.sh
@@ -87,6 +87,5 @@ build/mvn -Dtest=none -DwildcardSuites="$SPARK_SCALA_TESTS" test
 # Run pyarrow related Python tests only
 SPARK_PYTHON_TESTS="ArrowTests PandasUDFTests ScalarPandasUDFTests GroupedMapPandasUDFTests GroupedAggPandasUDFTests"
 echo "Testing PySpark: $SPARK_PYTHON_TESTS"
-SPARK_TESTING=1 bin/pyspark pyspark.sql.tests $SPARK_PYTHON_TESTS
+SPARK_TESTING=1 bin/pyspark pyspark.sql.tests $SPARK_PYTHON_TESTS
 popd
-
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 85cec6712bb..70c70b62aee 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -944,7 +944,7 @@ def _make_manifest(path_or_paths, fs, pathsep='/'):
     common_metadata_path = None
     metadata_path = None
 
-    if len(path_or_paths) == 1:
+    if isinstance(path_or_paths, list) and len(path_or_paths) == 1:
         # Dask passes a directory as a list of length 1
         path_or_paths = path_or_paths[0]
 
@@ -1004,9 +1004,8 @@ def read_table(source, columns=None, nthreads=1, metadata=None,
                use_pandas_metadata=False):
     if is_path(source):
         fs = _get_fs_from_path(source)
-
-        if fs.isdir(source):
-            return fs.read_parquet(source, columns=columns, metadata=metadata)
+        return fs.read_parquet(source, columns=columns, metadata=metadata,
+                               use_pandas_metadata=use_pandas_metadata)
 
     pf = ParquetFile(source, metadata=metadata)
     return pf.read(columns=columns, nthreads=nthreads,
diff --git a/python/testing/README.md b/python/testing/README.md
index 0ebeec4a1c3..d7d0ff0bb7f 100644
--- a/python/testing/README.md
+++ b/python/testing/README.md
@@ -19,12 +19,6 @@
 
 # Testing tools for odds and ends
 
-## Testing HDFS file interface
-
-```shell
-./test_hdfs.sh
-```
-
 ## Testing Dask integration
 
 Initial integration testing with Dask has been Dockerized.
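The parquet.py change hands use_pandas_metadata through to the filesystem path when read_table gets a directory source, and lets _make_manifest accept the single-element list that Dask passes. A local round trip that exercises both paths, sketched here for illustration only; the file and directory names are made up.

```python
import os
import tempfile

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'x': [1, 2, 3]}, index=pd.Index(['a', 'b', 'c'], name='idx'))
table = pa.Table.from_pandas(df)

dataset_dir = tempfile.mkdtemp()
pq.write_table(table, os.path.join(dataset_dir, 'part-0.parquet'))

# Directory source: use_pandas_metadata is now forwarded, so the named index
# survives the round trip
result = pq.read_table(dataset_dir, use_pandas_metadata=True)
assert result.to_pandas().index.name == 'idx'

# Single-element list, the form Dask hands to ParquetDataset/_make_manifest
dataset = pq.ParquetDataset([dataset_dir])
assert dataset.read().num_rows == 3
```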
diff --git a/python/testing/hdfs/Dockerfile b/python/testing/hdfs/Dockerfile
deleted file mode 100644
index 97355137ff3..00000000000
--- a/python/testing/hdfs/Dockerfile
+++ /dev/null
@@ -1,50 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# TODO Replace this with a complete clean image build
-FROM cpcloud86/impala:metastore
-
-USER root
-
-RUN apt-add-repository -y ppa:ubuntu-toolchain-r/test && \
-    apt-get update && \
-    apt-get install -y \
-    gcc-4.9 \
-    g++-4.9 \
-    build-essential \
-    autotools-dev \
-    autoconf \
-    gtk-doc-tools \
-    autoconf-archive \
-    libgirepository1.0-dev \
-    libtool \
-    libjemalloc-dev \
-    ccache \
-    valgrind \
-    gdb
-
-RUN wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key|sudo apt-key add - && \
-    apt-add-repository -y \
-    "deb http://llvm.org/apt/trusty/ llvm-toolchain-trusty-4.0 main" && \
-    apt-get update && \
-    apt-get install -y clang-4.0 clang-format-4.0 clang-tidy-4.0
-
-USER ubuntu
-
-RUN wget -O /tmp/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
-    bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \
-    rm /tmp/miniconda.sh
diff --git a/python/testing/hdfs/restart_docker_container.sh b/python/testing/hdfs/restart_docker_container.sh
deleted file mode 100644
index 15076cc2873..00000000000
--- a/python/testing/hdfs/restart_docker_container.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/env bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-export ARROW_TEST_NN_HOST=arrow-hdfs
-export ARROW_TEST_IMPALA_HOST=$ARROW_TEST_NN_HOST
-export ARROW_TEST_IMPALA_PORT=21050
-export ARROW_TEST_WEBHDFS_PORT=50070
-export ARROW_TEST_WEBHDFS_USER=ubuntu
-
-docker stop $ARROW_TEST_NN_HOST
-docker rm $ARROW_TEST_NN_HOST
-
-docker run -d -it --name $ARROW_TEST_NN_HOST \
-    -v $PWD:/io \
-    --hostname $ARROW_TEST_NN_HOST \
-    --shm-size=2gb \
-    -p $ARROW_TEST_WEBHDFS_PORT -p $ARROW_TEST_IMPALA_PORT \
-    arrow-hdfs-test
-
-while ! docker exec $ARROW_TEST_NN_HOST impala-shell -q 'SELECT VERSION()'; do
-  sleep 1
-done
diff --git a/python/testing/hdfs/run_tests.sh b/python/testing/hdfs/run_tests.sh
deleted file mode 100755
index e0d36df58a3..00000000000
--- a/python/testing/hdfs/run_tests.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -ex
-
-HERE=$(cd `dirname "${BASH_SOURCE[0]:-$0}"` && pwd)
-
-source $HERE/../set_env_common.sh
-source $HERE/../setup_toolchain.sh
-source $HERE/../functions.sh
-
-git clone https://github.com/apache/arrow.git $ARROW_CHECKOUT
-
-use_clang
-
-bootstrap_python_env 3.6
-
-build_arrow
-build_parquet
-
-build_pyarrow
-
-$ARROW_CPP_BUILD_DIR/debug/io-hdfs-test
-
-python -m pytest -vv -r sxX -s $PYARROW_PATH --parquet --hdfs