diff --git a/python/testing/README.md b/python/testing/README.md
new file mode 100644
index 00000000000..07970a231b5
--- /dev/null
+++ b/python/testing/README.md
@@ -0,0 +1,26 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Testing tools for odds and ends
+
+## Testing HDFS file interface
+
+```shell
+./test_hdfs.sh
+```
\ No newline at end of file
diff --git a/python/testing/functions.sh b/python/testing/functions.sh
new file mode 100644
index 00000000000..6bc342bd794
--- /dev/null
+++ b/python/testing/functions.sh
@@ -0,0 +1,100 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+use_gcc() {
+  export CC=gcc-4.9
+  export CXX=g++-4.9
+}
+
+use_clang() {
+  export CC=clang-4.0
+  export CXX=clang++-4.0
+}
+
+bootstrap_python_env() {
+  PYTHON_VERSION=$1
+  CONDA_ENV_DIR=$BUILD_DIR/pyarrow-test-$PYTHON_VERSION
+
+  conda create -y -q -p $CONDA_ENV_DIR python=$PYTHON_VERSION cmake curl
+  source activate $CONDA_ENV_DIR
+
+  python --version
+  which python
+
+  # faster builds, please
+  conda install -y -q nomkl pip numpy pandas cython
+}
+
+build_pyarrow() {
+  # Install the remaining Python dependencies with pip, then build and
+  # install pyarrow with Parquet and Plasma support enabled
+  pushd $ARROW_PYTHON_DIR
+  pip install -r requirements.txt
+  python setup.py build_ext --with-parquet --with-plasma \
+      install --single-version-externally-managed --record=record.text
+  popd
+
+  python -c "import pyarrow.parquet"
+  python -c "import pyarrow.plasma"
+
+  export PYARROW_PATH=$CONDA_PREFIX/lib/python$PYTHON_VERSION/site-packages/pyarrow
+}
+
+build_arrow() {
+  mkdir -p $ARROW_CPP_BUILD_DIR
+  pushd $ARROW_CPP_BUILD_DIR
+
+  cmake -GNinja \
+        -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
+        -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
+        -DARROW_NO_DEPRECATED_API=ON \
+        -DARROW_PYTHON=ON \
+        -DARROW_PLASMA=ON \
+        -DARROW_BOOST_USE_SHARED=off \
+        $ARROW_CPP_DIR
+
+  ninja
+  ninja install
+  popd
+}
+
+build_parquet() {
+  PARQUET_DIR=$BUILD_DIR/parquet
+  mkdir -p $PARQUET_DIR
+
+  git clone https://github.com/apache/parquet-cpp.git $PARQUET_DIR
+
+  pushd $PARQUET_DIR
+  mkdir build-dir
+  cd build-dir
+
+  cmake \
+      -GNinja \
+      -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
+      -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \
+      -DPARQUET_BOOST_USE_SHARED=off \
+      -DPARQUET_BUILD_BENCHMARKS=off \
+      -DPARQUET_BUILD_EXECUTABLES=off \
+      -DPARQUET_BUILD_TESTS=off \
+      ..
+
+  ninja
+  ninja install
+
+  popd
+}
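`functions.sh` is meant to be sourced rather than executed. As a rough sketch of how the helpers compose when used interactively — `run_tests.sh` below drives the same sequence end to end; the compiler choice and Python version here are just examples:

```shell
# Minimal sketch, assuming a shell inside the test container with the
# repository mounted at /io (see restart_docker_container.sh below).
source /io/set_env_common.sh
source /io/functions.sh

use_clang                  # or use_gcc
bootstrap_python_env 3.6   # any version conda can resolve
build_arrow
build_parquet
build_pyarrow
```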
diff --git a/python/testing/hdfs/Dockerfile b/python/testing/hdfs/Dockerfile
new file mode 100644
index 00000000000..97355137ff3
--- /dev/null
+++ b/python/testing/hdfs/Dockerfile
@@ -0,0 +1,50 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# TODO Replace this with a complete clean image build
+FROM cpcloud86/impala:metastore
+
+USER root
+
+RUN apt-add-repository -y ppa:ubuntu-toolchain-r/test && \
+    apt-get update && \
+    apt-get install -y \
+            gcc-4.9 \
+            g++-4.9 \
+            build-essential \
+            autotools-dev \
+            autoconf \
+            gtk-doc-tools \
+            autoconf-archive \
+            libgirepository1.0-dev \
+            libtool \
+            libjemalloc-dev \
+            ccache \
+            valgrind \
+            gdb
+
+RUN wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key | sudo apt-key add - && \
+    apt-add-repository -y \
+        "deb http://llvm.org/apt/trusty/ llvm-toolchain-trusty-4.0 main" && \
+    apt-get update && \
+    apt-get install -y clang-4.0 clang-format-4.0 clang-tidy-4.0
+
+USER ubuntu
+
+RUN wget -O /tmp/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \
+    rm /tmp/miniconda.sh
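When debugging toolchain problems it can help to build and inspect the image by hand; a quick sanity check, assuming the build context is `python/testing` as in `test_hdfs.sh`:

```shell
docker build -t arrow-hdfs-test -f hdfs/Dockerfile .
# Confirm both compiler toolchains and the conda install landed
docker run --rm -it arrow-hdfs-test bash -c \
    'gcc-4.9 --version && clang-4.0 --version && /home/ubuntu/miniconda/bin/conda --version'
```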
diff --git a/python/testing/hdfs/libhdfs3-hdfs-client.xml b/python/testing/hdfs/libhdfs3-hdfs-client.xml
new file mode 100644
index 00000000000..f929929b386
--- /dev/null
+++ b/python/testing/hdfs/libhdfs3-hdfs-client.xml
@@ -0,0 +1,332 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<configuration>
+
+<!-- RPC client configuration -->
+<property>
+  <name>rpc.client.timeout</name>
+  <value>3600000</value>
+  <description>timeout interval of an RPC invocation in milliseconds. default is 3600000.</description>
+</property>
+<property>
+  <name>rpc.client.connect.tcpnodelay</name>
+  <value>true</value>
+  <description>whether to set socket TCP_NODELAY to true when connecting to the RPC server. default is true.</description>
+</property>
+<property>
+  <name>rpc.client.max.idle</name>
+  <value>10000</value>
+  <description>the max idle time of an RPC connection in milliseconds. default is 10000.</description>
+</property>
+<property>
+  <name>rpc.client.ping.interval</name>
+  <value>10000</value>
+  <description>the interval at which the RPC client sends a heartbeat to the server. 0 means disabled, default is 10000.</description>
+</property>
+<property>
+  <name>rpc.client.connect.timeout</name>
+  <value>600000</value>
+  <description>the timeout interval in milliseconds when the RPC client is trying to set up the connection. default is 600000.</description>
+</property>
+<property>
+  <name>rpc.client.connect.retry</name>
+  <value>10</value>
+  <description>the max retry times if the RPC client fails to set up the connection to the server. default is 10.</description>
+</property>
+<property>
+  <name>rpc.client.read.timeout</name>
+  <value>3600000</value>
+  <description>the timeout interval in milliseconds when the RPC client is trying to read from the server. default is 3600000.</description>
+</property>
+<property>
+  <name>rpc.client.write.timeout</name>
+  <value>3600000</value>
+  <description>the timeout interval in milliseconds when the RPC client is trying to write to the server. default is 3600000.</description>
+</property>
+<property>
+  <name>rpc.client.socket.linger.timeout</name>
+  <value>-1</value>
+  <description>the value set for socket SO_LINGER when connecting to the RPC server. -1 means the default OS value. default is -1.</description>
+</property>
+
+<!-- dfs client configuration -->
+<property>
+  <name>dfs.client.read.shortcircuit</name>
+  <value>false</value>
+  <description>whether reading a block file bypasses the datanode if the block and the client are on the same node. default is true.</description>
+</property>
+<property>
+  <name>dfs.default.replica</name>
+  <value>1</value>
+  <description>the default number of replicas. default is 3.</description>
+</property>
+<property>
+  <name>dfs.prefetchsize</name>
+  <value>10</value>
+  <description>the default number of blocks whose information will be prefetched. default is 10.</description>
+</property>
+<property>
+  <name>dfs.client.failover.max.attempts</name>
+  <value>15</value>
+  <description>if multiple namenodes are configured, this is the max number of retries when the dfs client tries to issue an RPC call. default is 15.</description>
+</property>
+<property>
+  <name>dfs.default.blocksize</name>
+  <value>134217728</value>
+  <description>default block size. default is 134217728.</description>
+</property>
+<property>
+  <name>dfs.client.log.severity</name>
+  <value>INFO</value>
+  <description>the minimal log severity level; valid values include FATAL, ERROR, INFO, DEBUG1, DEBUG2, DEBUG3. default is INFO.</description>
+</property>
+
+<!-- input client configuration -->
+<property>
+  <name>input.connect.timeout</name>
+  <value>600000</value>
+  <description>the timeout interval in milliseconds when the input stream is trying to set up the connection to a datanode. default is 600000.</description>
+</property>
+<property>
+  <name>input.read.timeout</name>
+  <value>3600000</value>
+  <description>the timeout interval in milliseconds when the input stream is trying to read from a datanode. default is 3600000.</description>
+</property>
+<property>
+  <name>input.write.timeout</name>
+  <value>3600000</value>
+  <description>the timeout interval in milliseconds when the input stream is trying to write to a datanode. default is 3600000.</description>
+</property>
+<property>
+  <name>input.localread.default.buffersize</name>
+  <value>2097152</value>
+  <description>
+  number of bytes of the buffer used to hold data from the block file and verify the checksum.
+  only used when "dfs.client.read.shortcircuit" is set to true. default is 1048576.
+  </description>
+</property>
+<property>
+  <name>input.localread.blockinfo.cachesize</name>
+  <value>1000</value>
+  <description>the size of the block file path information cache. default is 1000.</description>
+</property>
+<property>
+  <name>input.read.getblockinfo.retry</name>
+  <value>3</value>
+  <description>the max retry times when the client fails to get block information from the namenode. default is 3.</description>
+</property>
+
+<!-- output client configuration -->
+<property>
+  <name>output.replace-datanode-on-failure</name>
+  <value>false</value>
+  <description>whether the client adds a new datanode into the pipeline if the number of nodes in the pipeline is less than the specified number of replicas. default is false.</description>
+</property>
+<property>
+  <name>output.default.chunksize</name>
+  <value>512</value>
+  <description>the number of bytes of a chunk in the pipeline. default is 512.</description>
+</property>
+<property>
+  <name>output.default.packetsize</name>
+  <value>65536</value>
+  <description>the number of bytes of a packet in the pipeline. default is 65536.</description>
+</property>
+<property>
+  <name>output.default.write.retry</name>
+  <value>10</value>
+  <description>the max retry times when the client fails to set up the pipeline. default is 10.</description>
+</property>
+<property>
+  <name>output.connect.timeout</name>
+  <value>600000</value>
+  <description>the timeout interval in milliseconds when the output stream is trying to set up the connection to a datanode. default is 600000.</description>
+</property>
+<property>
+  <name>output.read.timeout</name>
+  <value>3600000</value>
+  <description>the timeout interval in milliseconds when the output stream is trying to read from a datanode. default is 3600000.</description>
+</property>
+<property>
+  <name>output.write.timeout</name>
+  <value>3600000</value>
+  <description>the timeout interval in milliseconds when the output stream is trying to write to a datanode. default is 3600000.</description>
+</property>
+<property>
+  <name>output.packetpool.size</name>
+  <value>1024</value>
+  <description>the max number of packets in a file's packet pool. default is 1024.</description>
+</property>
+<property>
+  <name>output.close.timeout</name>
+  <value>900000</value>
+  <description>the timeout interval in milliseconds when closing an output stream. default is 900000.</description>
+</property>
+<property>
+  <name>dfs.domain.socket.path</name>
+  <value>/var/lib/hadoop-hdfs/dn_socket</value>
+  <description>
+  Optional. This is a path to a UNIX domain socket that will be used for
+  communication between the DataNode and local HDFS clients.
+  If the string "_PORT" is present in this path, it will be replaced by the
+  TCP port of the DataNode.
+  </description>
+</property>
+<property>
+  <name>dfs.client.use.legacy.blockreader.local</name>
+  <value>false</value>
+  <description>
+  Legacy short-circuit reader implementation based on HDFS-2246 is used
+  if this configuration parameter is true.
+  This is for platforms other than Linux,
+  where the new implementation based on HDFS-347 is not available.
+  </description>
+</property>
+
+</configuration>
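libhdfs3 does not read this file from a fixed default location; it is located through the `LIBHDFS3_CONF` environment variable, which `set_env_common.sh` below points at the in-container mount `/io/hdfs/libhdfs3-hdfs-client.xml`. A quick well-formedness check before chasing connection errors (stdlib-only, nothing Arrow-specific):

```shell
test -r "$LIBHDFS3_CONF" || echo "libhdfs3 config missing: $LIBHDFS3_CONF"
# Parse the XML; an error here means the config, not HDFS, is broken
python -c "import os, xml.dom.minidom; xml.dom.minidom.parse(os.environ['LIBHDFS3_CONF'])"
```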
diff --git a/python/testing/hdfs/restart_docker_container.sh b/python/testing/hdfs/restart_docker_container.sh
new file mode 100644
index 00000000000..15076cc2873
--- /dev/null
+++ b/python/testing/hdfs/restart_docker_container.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+export ARROW_TEST_NN_HOST=arrow-hdfs
+export ARROW_TEST_IMPALA_HOST=$ARROW_TEST_NN_HOST
+export ARROW_TEST_IMPALA_PORT=21050
+export ARROW_TEST_WEBHDFS_PORT=50070
+export ARROW_TEST_WEBHDFS_USER=ubuntu
+
+docker stop $ARROW_TEST_NN_HOST
+docker rm $ARROW_TEST_NN_HOST
+
+docker run -d -it --name $ARROW_TEST_NN_HOST \
+       -v $PWD:/io \
+       --hostname $ARROW_TEST_NN_HOST \
+       --shm-size=2gb \
+       -p $ARROW_TEST_WEBHDFS_PORT -p $ARROW_TEST_IMPALA_PORT \
+       arrow-hdfs-test
+
+# Block until Impala (and with it, HDFS) is up and answering queries
+while ! docker exec $ARROW_TEST_NN_HOST impala-shell -q 'SELECT VERSION()'; do
+  sleep 1
+done
diff --git a/python/testing/hdfs/run_tests.sh b/python/testing/hdfs/run_tests.sh
new file mode 100755
index 00000000000..e0d36df58a3
--- /dev/null
+++ b/python/testing/hdfs/run_tests.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+HERE=$(cd `dirname "${BASH_SOURCE[0]:-$0}"` && pwd)
+
+source $HERE/../set_env_common.sh
+source $HERE/../setup_toolchain.sh
+source $HERE/../functions.sh
+
+git clone https://github.com/apache/arrow.git $ARROW_CHECKOUT
+
+use_clang
+
+bootstrap_python_env 3.6
+
+build_arrow
+build_parquet
+
+build_pyarrow
+
+# Run the C++ HDFS tests first, then the pytest suite with the Parquet
+# and HDFS test groups enabled
+$ARROW_CPP_BUILD_DIR/debug/io-hdfs-test
+
+python -m pytest -vv -r sxX -s $PYARROW_PATH --parquet --hdfs
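Note that `-p` with a single port publishes it on an ephemeral host port. To reach the NameNode's WebHDFS endpoint from the host while the container is up, something along these lines should work (the LISTSTATUS call is just an illustrative WebHDFS request):

```shell
# Find the host port Docker mapped to WebHDFS (container port 50070)
WEBHDFS_HOSTPORT=$(docker port arrow-hdfs 50070 | cut -d: -f2)
# List the HDFS root as the test user
curl "http://localhost:${WEBHDFS_HOSTPORT}/webhdfs/v1/?op=LISTSTATUS&user.name=ubuntu"
```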
diff --git a/python/testing/set_env_common.sh b/python/testing/set_env_common.sh
new file mode 100644
index 00000000000..00251f92be4
--- /dev/null
+++ b/python/testing/set_env_common.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+export MINICONDA=$HOME/miniconda
+export CPP_TOOLCHAIN=$HOME/cpp-toolchain
+
+export PATH="$MINICONDA/bin:$PATH"
+export CONDA_PKGS_DIRS=$HOME/.conda_packages
+
+export ARROW_CHECKOUT=$HOME/arrow
+export BUILD_DIR=$ARROW_CHECKOUT
+
+export BUILD_OS_NAME=linux
+export BUILD_TYPE=debug
+
+export ARROW_CPP_DIR=$BUILD_DIR/cpp
+export ARROW_PYTHON_DIR=$BUILD_DIR/python
+export ARROW_C_GLIB_DIR=$BUILD_DIR/c_glib
+export ARROW_JAVA_DIR=${BUILD_DIR}/java
+export ARROW_JS_DIR=${BUILD_DIR}/js
+export ARROW_INTEGRATION_DIR=$BUILD_DIR/integration
+
+export CPP_BUILD_DIR=$BUILD_DIR/cpp-build
+
+export ARROW_CPP_INSTALL=$BUILD_DIR/cpp-install
+export ARROW_CPP_BUILD_DIR=$BUILD_DIR/cpp-build
+export ARROW_C_GLIB_INSTALL=$BUILD_DIR/c-glib-install
+
+export ARROW_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN
+export PARQUET_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN
+
+export BOOST_ROOT=$CPP_TOOLCHAIN
+export PATH=$CPP_TOOLCHAIN/bin:$PATH
+export LD_LIBRARY_PATH=$CPP_TOOLCHAIN/lib:$LD_LIBRARY_PATH
+
+export VALGRIND="valgrind --tool=memcheck"
+
+export ARROW_HOME=$CPP_TOOLCHAIN
+export PARQUET_HOME=$CPP_TOOLCHAIN
+
+# Arrow test variables
+
+export JAVA_HOME=/usr/lib/jvm/java-7-oracle
+export HADOOP_HOME=/usr/lib/hadoop
+export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
+export HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$HADOOP_HOME/lib/native"
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/
+
+export ARROW_HDFS_TEST_HOST=arrow-hdfs
+export ARROW_HDFS_TEST_PORT=9000
+export ARROW_HDFS_TEST_USER=ubuntu
+export ARROW_LIBHDFS_DIR=/usr/lib
+
+export LIBHDFS3_CONF=/io/hdfs/libhdfs3-hdfs-client.xml
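The last block is what the HDFS tests actually consume: `ARROW_HDFS_TEST_*` selects the cluster and user, `ARROW_LIBHDFS_DIR` locates the JNI libhdfs driver (which needs the `JAVA_HOME` and Hadoop `CLASSPATH` settings above), and `LIBHDFS3_CONF` the libhdfs3 one. A purely illustrative preflight inside the container:

```shell
# Check that the JNI driver is present where ARROW_LIBHDFS_DIR claims
ls "$ARROW_LIBHDFS_DIR"/libhdfs.so*
# Check that HDFS itself answers on the advertised host/port
$HADOOP_HOME/bin/hadoop fs -ls hdfs://$ARROW_HDFS_TEST_HOST:$ARROW_HDFS_TEST_PORT/
```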
diff --git a/python/testing/setup_toolchain.sh b/python/testing/setup_toolchain.sh
new file mode 100644
index 00000000000..c3837b45cbc
--- /dev/null
+++ b/python/testing/setup_toolchain.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+
+export PATH="$MINICONDA/bin:$PATH"
+conda update -y -q conda
+conda config --set auto_update_conda false
+conda info -a
+
+conda config --set show_channel_urls True
+
+# Help with SSL timeouts to S3
+conda config --set remote_connect_timeout_secs 12
+
+conda config --add channels https://repo.continuum.io/pkgs/free
+conda config --add channels conda-forge
+conda info -a
+
+# faster builds, please
+conda install -y nomkl
+
+conda install -y conda-build jinja2 anaconda-client cmake curl
+
+# Set up C++ toolchain
+conda create -y -q -p $CPP_TOOLCHAIN python=3.6 \
+      jemalloc=4.4.0 \
+      nomkl \
+      boost-cpp \
+      rapidjson \
+      flatbuffers \
+      gflags \
+      lz4-c \
+      snappy \
+      zstd \
+      brotli \
+      zlib \
+      git \
+      cmake \
+      curl \
+      thrift-cpp \
+      libhdfs3 \
+      ninja
+
+if [ "$BUILD_OS_NAME" == "osx" ]; then
+  brew update > /dev/null
+  brew install jemalloc
+  brew install ccache
+fi
diff --git a/python/testing/test_hdfs.sh b/python/testing/test_hdfs.sh
new file mode 100755
index 00000000000..016e54a66a6
--- /dev/null
+++ b/python/testing/test_hdfs.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+docker build -t arrow-hdfs-test -f hdfs/Dockerfile .
+bash hdfs/restart_docker_container.sh
+docker exec -it arrow-hdfs /io/hdfs/run_tests.sh
+docker stop arrow-hdfs
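Putting it together, the entry point from `python/testing` is the README's one-liner. Because of `set -ex`, a failure in `run_tests.sh` aborts `test_hdfs.sh` before the final `docker stop`, so the container is left running for inspection; a couple of follow-ups that may be useful in that case:

```shell
cd python/testing
./test_hdfs.sh
# After a failed run, poke at the still-running container:
docker logs arrow-hdfs | tail -n 100
docker exec -it arrow-hdfs bash
```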