From 3a840e8d7c039e32e2458b4bc0bb9b7942c84757 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 13 Apr 2018 11:09:36 +0200 Subject: [PATCH 01/16] ARROW-2300: [C++/Python] Integration test for HDFS --- dev/docker-compose.yml | 57 +++++++++++++---- dev/hdfs_integration/Dockerfile | 47 ++++++++++++++ dev/hdfs_integration/hdfs_integration.sh | 78 ++++++++++++++++++++++++ dev/run_docker_compose.sh | 8 +++ 4 files changed, 179 insertions(+), 11 deletions(-) create mode 100644 dev/hdfs_integration/Dockerfile create mode 100755 dev/hdfs_integration/hdfs_integration.sh diff --git a/dev/docker-compose.yml b/dev/docker-compose.yml index b1e593cf480..74d45c63bbd 100644 --- a/dev/docker-compose.yml +++ b/dev/docker-compose.yml @@ -16,25 +16,60 @@ version: '3' services: - gen_apidocs: + + hdfs-namenode: + image: gelog/hadoop + ports: + - "9000:9000" + - "50070:50070" + command: hdfs namenode + hostname: hdfs-namenode + + hdfs-datanode: + image: gelog/hadoop + command: hdfs datanode + ports: + # The host port is randomly assigned by Docker, to allow scaling + # to multiple DataNodes on the same host + - "50075" + links: + - hdfs-namenode:hdfs-namenode + + hdfs_integration: + links: + - hdfs-namenode:hdfs-namenode + - hdfs-datanode:hdfs-datanode + environment: + - ARROW_HDFS_TEST_HOST=hdfs-namenode + - ARROW_HDFS_TEST_PORT=9000 + - ARROW_HDFS_TEST_USER=root build: - context: gen_apidocs + context: hdfs_integration volumes: - ../..:/apache-arrow - run_site: + + spark_integration: build: - context: run_site - ports: - - "4000:4000" + context: spark_integration volumes: - - ../..:/apache-arrow + - ../..:/apache-arrow + dask_integration: build: context: dask_integration volumes: - - ../..:/apache-arrow - spark_integration: - build: - context: spark_integration + - ../..:/apache-arrow + + gen_apidocs: + build: + context: gen_apidocs volumes: - ../..:/apache-arrow + + run_site: + build: + context: run_site + ports: + - "4000:4000" + volumes: + - ../..:/apache-arrow diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile new file mode 100644 index 00000000000..1d01b9c792e --- /dev/null +++ b/dev/hdfs_integration/Dockerfile @@ -0,0 +1,47 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
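+#
+# A rough usage sketch, assuming arrow and parquet-cpp are checked out side
+# by side on the host: this image is meant to be built and run through the
+# compose file patched above (dev/docker-compose.yml), as dev/run_docker_compose.sh
+# does, rather than invoked directly:
+#
+#   docker-compose -f arrow/dev/docker-compose.yml build hdfs_integration
+#   docker-compose -f arrow/dev/docker-compose.yml run --rm hdfs_integration
+#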
+# +FROM ubuntu:16.04 + +# Basic OS utilities +RUN apt-get update && \ + apt-get install -y \ + git \ + wget \ + gcc-4.9 \ + g++-4.9 \ + build-essential + +# install conda in /home/ubuntu/miniconda +RUN wget -O /tmp/miniconda.sh \ + https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash /tmp/miniconda.sh -b -p /opt/conda && \ + rm /tmp/miniconda.sh + +ENV PATH=/opt/conda/bin:$PATH CONDA_PREFIX=/opt/conda + +# Create Conda environment +RUN conda update conda -y && \ + conda install -c conda-forge \ + python=3.6 numpy six setuptools cython pandas pytest \ + cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib \ + gflags brotli jemalloc lz4-c zstd hdfs3 libhdfs3 && \ + conda clean --all + +ADD . /apache-arrow +WORKDIR /apache-arrow + +CMD arrow/dev/hdfs_integration/hdfs_integration.sh diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh new file mode 100755 index 00000000000..6b774d175b3 --- /dev/null +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Exit on any error +set -e + +# cwd is mounted from host machine to +# and contains both arrow and parquet-cpp + +export ARROW_BUILD_TYPE=debug +export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX +export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX +export ARROW_HOME=$CONDA_PREFIX +export PARQUET_HOME=$CONDA_PREFIX + +export CC=gcc-4.9 +export CXX=g++-4.9 + +# install arrow +mkdir -p arrow/cpp/build +pushd arrow/cpp/build + +rm -rf ./* + +cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ + -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ + -DARROW_PYTHON=on \ + -DARROW_PLASMA=on \ + -DARROW_HDFS=on \ + .. +make -j4 +make install +popd + +# install parquet-cpp +mkdir -p parquet-cpp/build +pushd parquet-cpp/build + +rm -rf ./* + +cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ + -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \ + -DPARQUET_BUILD_BENCHMARKS=off \ + -DPARQUET_BUILD_EXECUTABLES=off \ + -DPARQUET_BUILD_TESTS=on \ + .. + +make -j4 +make install +popd + +# install pyarrow +pushd arrow/python + +python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ + --with-parquet --with-plasma --inplace + +popd + + +arrow/cpp/build/debug/io-hdfs-test + +python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs diff --git a/dev/run_docker_compose.sh b/dev/run_docker_compose.sh index 503efd5e1aa..040369ef9f4 100755 --- a/dev/run_docker_compose.sh +++ b/dev/run_docker_compose.sh @@ -35,5 +35,13 @@ if [ ! 
-d parquet-cpp ]; then exit 1 fi +<<<<<<< HEAD docker-compose -f arrow/dev/docker-compose.yml build "${@}" docker-compose -f arrow/dev/docker-compose.yml run --rm "${@}" +======= +GID=$(id -g ${USERNAME}) +docker-compose -f arrow/dev/docker-compose.yml run \ + --rm "${1}" + +#-u "${UID}:${GID}" "${1}" +>>>>>>> ARROW-2300: [C++/Python] Integration test for HDFS From e8cb7dcaa0b9a9ccf71cac45fabee743abfe3bbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 13 Apr 2018 12:45:56 +0200 Subject: [PATCH 02/16] reproduced segfault --- dev/hdfs_integration/Dockerfile | 4 +--- dev/hdfs_integration/hdfs_integration.sh | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile index 1d01b9c792e..0194db7feb0 100644 --- a/dev/hdfs_integration/Dockerfile +++ b/dev/hdfs_integration/Dockerfile @@ -14,15 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. # -FROM ubuntu:16.04 +FROM ubuntu:14.04 # Basic OS utilities RUN apt-get update && \ apt-get install -y \ git \ wget \ - gcc-4.9 \ - g++-4.9 \ build-essential # install conda in /home/ubuntu/miniconda diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index 6b774d175b3..f4cebded587 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -28,8 +28,8 @@ export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX export ARROW_HOME=$CONDA_PREFIX export PARQUET_HOME=$CONDA_PREFIX -export CC=gcc-4.9 -export CXX=g++-4.9 +# export CC=gcc-4.9 +# export CXX=g++-4.9 # install arrow mkdir -p arrow/cpp/build From e08d98902c6e37976861fc5b32a592b9ccf2dd3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 13 Apr 2018 12:47:59 +0200 Subject: [PATCH 03/16] ccache --- dev/hdfs_integration/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile index 0194db7feb0..1546b64da2a 100644 --- a/dev/hdfs_integration/Dockerfile +++ b/dev/hdfs_integration/Dockerfile @@ -21,6 +21,7 @@ RUN apt-get update && \ apt-get install -y \ git \ wget \ + ccache \ build-essential # install conda in /home/ubuntu/miniconda From 6cbfdad6d818be553653dabe065d176881b60acb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 24 May 2018 11:43:13 +0200 Subject: [PATCH 04/16] remove conflict garbage --- dev/run_docker_compose.sh | 8 -------- 1 file changed, 8 deletions(-) diff --git a/dev/run_docker_compose.sh b/dev/run_docker_compose.sh index 040369ef9f4..503efd5e1aa 100755 --- a/dev/run_docker_compose.sh +++ b/dev/run_docker_compose.sh @@ -35,13 +35,5 @@ if [ ! 
-d parquet-cpp ]; then exit 1 fi -<<<<<<< HEAD docker-compose -f arrow/dev/docker-compose.yml build "${@}" docker-compose -f arrow/dev/docker-compose.yml run --rm "${@}" -======= -GID=$(id -g ${USERNAME}) -docker-compose -f arrow/dev/docker-compose.yml run \ - --rm "${1}" - -#-u "${UID}:${GID}" "${1}" ->>>>>>> ARROW-2300: [C++/Python] Integration test for HDFS From 25becee8e0a45a4669b56ae3c5ecf0065eb3936c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 5 Jul 2018 16:48:49 +0200 Subject: [PATCH 05/16] build with newer gcc and ninja --- dev/hdfs_integration/Dockerfile | 52 +++++++++++---------- dev/hdfs_integration/hdfs_integration.sh | 59 ++++++++++++++---------- 2 files changed, 62 insertions(+), 49 deletions(-) diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile index 1546b64da2a..a4253601a11 100644 --- a/dev/hdfs_integration/Dockerfile +++ b/dev/hdfs_integration/Dockerfile @@ -14,31 +14,33 @@ # See the License for the specific language governing permissions and # limitations under the License. # -FROM ubuntu:14.04 - -# Basic OS utilities -RUN apt-get update && \ - apt-get install -y \ - git \ - wget \ - ccache \ - build-essential - -# install conda in /home/ubuntu/miniconda -RUN wget -O /tmp/miniconda.sh \ - https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash /tmp/miniconda.sh -b -p /opt/conda && \ - rm /tmp/miniconda.sh - -ENV PATH=/opt/conda/bin:$PATH CONDA_PREFIX=/opt/conda - -# Create Conda environment -RUN conda update conda -y && \ - conda install -c conda-forge \ - python=3.6 numpy six setuptools cython pandas pytest \ - cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib \ - gflags brotli jemalloc lz4-c zstd hdfs3 libhdfs3 && \ - conda clean --all + +FROM ubuntu:18.04 + +RUN apt-get update -y \ + && apt-get install -y \ + gcc-8 \ + g++-8 \ + git \ + wget \ + ninja-build + +ENV CC=gcc-8 +ENV CXX=g++-8 + +# Miniconda - Python 3.6, 64-bit, x86, latest +RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O conda.sh \ + && /bin/bash conda.sh -b -p /opt/conda \ + && rm conda.sh + +ENV PATH="/opt/conda/bin:$PATH" + +# create conda env with deps +RUN conda create -y -q -c conda-forge -n pyarrow-dev \ + python=3.6 numpy six setuptools cython pandas pytest \ + cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib \ + gflags brotli jemalloc lz4-c zstd nomkl libhdfs3 hdfs3 \ + && conda clean --all ADD . 
/apache-arrow WORKDIR /apache-arrow diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index f4cebded587..488e846ae4c 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -22,57 +22,68 @@ set -e # cwd is mounted from host machine to # and contains both arrow and parquet-cpp +# Activate conda environment +source activate pyarrow-dev + +# Set environment variable export ARROW_BUILD_TYPE=debug export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX export ARROW_HOME=$CONDA_PREFIX export PARQUET_HOME=$CONDA_PREFIX -# export CC=gcc-4.9 -# export CXX=g++-4.9 +# For newer GCC per https://arrow.apache.org/docs/python/development.html#known-issues +export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" +export PYARROW_CXXFLAGS=$CXXFLAGS +export PYARROW_CMAKE_GENERATOR=Ninja -# install arrow +# Install arrow-cpp mkdir -p arrow/cpp/build pushd arrow/cpp/build -rm -rf ./* - -cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ +cmake -GNinja \ + -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ - -DARROW_PYTHON=on \ - -DARROW_PLASMA=on \ - -DARROW_HDFS=on \ + -DARROW_PYTHON=ON \ + -DARROW_PLASMA=ON \ + -DARROW_HDFS=ON \ + -DARROW_BUILD_TESTS=ON \ + -DCMAKE_CXX_FLAGS=$CXXFLAGS \ .. -make -j4 -make install +ninja +ninja install + popd -# install parquet-cpp +# Install parquet-cpp mkdir -p parquet-cpp/build pushd parquet-cpp/build -rm -rf ./* - -cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ +cmake -GNinja \ + -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \ - -DPARQUET_BUILD_BENCHMARKS=off \ - -DPARQUET_BUILD_EXECUTABLES=off \ - -DPARQUET_BUILD_TESTS=on \ + -DPARQUET_BUILD_BENCHMARKS=OFF \ + -DPARQUET_BUILD_EXECUTABLES=OFF \ + -DPARQUET_BUILD_TESTS=ON \ + -DCMAKE_CXX_FLAGS=$CXXFLAGS \ .. +ninja +ninja install -make -j4 -make install popd -# install pyarrow +# Install pyarrow pushd arrow/python -python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ - --with-parquet --with-plasma --inplace +python setup.py build_ext \ + --build-type=$ARROW_BUILD_TYPE \ + --with-parquet \ + --with-plasma \ + --inplace popd - +# Run tests arrow/cpp/build/debug/io-hdfs-test python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs From bcbd1f35e0252d7e50988b2052082c86faad2fb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 5 Jul 2018 16:54:03 +0200 Subject: [PATCH 06/16] dockerfile format --- dev/hdfs_integration/Dockerfile | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile index a4253601a11..ef2c005938d 100644 --- a/dev/hdfs_integration/Dockerfile +++ b/dev/hdfs_integration/Dockerfile @@ -21,7 +21,7 @@ RUN apt-get update -y \ && apt-get install -y \ gcc-8 \ g++-8 \ - git \ + git \ wget \ ninja-build @@ -37,9 +37,28 @@ ENV PATH="/opt/conda/bin:$PATH" # create conda env with deps RUN conda create -y -q -c conda-forge -n pyarrow-dev \ - python=3.6 numpy six setuptools cython pandas pytest \ - cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib \ - gflags brotli jemalloc lz4-c zstd nomkl libhdfs3 hdfs3 \ + python=3.6 \ + numpy \ + six \ + setuptools \ + cython \ + pandas \ + pytest \ + cmake \ + flatbuffers \ + rapidjson \ + boost-cpp \ + thrift-cpp \ + snappy \ + zlib \ + gflags \ + brotli \ + jemalloc \ + lz4-c \ + zstd \ + nomkl \ + libhdfs3 \ + hdfs3 \ && conda clean --all ADD . 
/apache-arrow From 2eb227f926968c513d72d39808ab713d901fed8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 9 Jul 2018 10:06:20 +0200 Subject: [PATCH 07/16] install pinned libhdfs3; compile successfully --- dev/hdfs_integration/Dockerfile | 14 +++++++++++--- dev/hdfs_integration/hdfs_integration.sh | 4 +++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile index ef2c005938d..a086966488b 100644 --- a/dev/hdfs_integration/Dockerfile +++ b/dev/hdfs_integration/Dockerfile @@ -35,9 +35,10 @@ RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - ENV PATH="/opt/conda/bin:$PATH" -# create conda env with deps +# create conda env with the required dependences RUN conda create -y -q -c conda-forge -n pyarrow-dev \ python=3.6 \ + nomkl \ numpy \ six \ setuptools \ @@ -56,9 +57,16 @@ RUN conda create -y -q -c conda-forge -n pyarrow-dev \ jemalloc \ lz4-c \ zstd \ - nomkl \ - libhdfs3 \ + && conda clean --all + +# installing in the previous step boost=1.60 and boost-cpp=1.67 gets installed, +# cmake finds 1.60 and parquet fails to compile +# installing it in a separate step, boost=1.60 and boost-cpp=1.64 gets +# installed, cmake finds 1.64 +# libhdfs3 needs to be pinned,see ARROW-1465 and ARROW-1445 +RUN conda install -y -q -n pyarrow-dev -c conda-forge \ hdfs3 \ + libhdfs3=2.2.31 \ && conda clean --all ADD . /apache-arrow diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index 488e846ae4c..29f322c264c 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -37,6 +37,8 @@ export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" export PYARROW_CXXFLAGS=$CXXFLAGS export PYARROW_CMAKE_GENERATOR=Ninja +export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} + # Install arrow-cpp mkdir -p arrow/cpp/build pushd arrow/cpp/build @@ -86,4 +88,4 @@ popd # Run tests arrow/cpp/build/debug/io-hdfs-test -python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs +#python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs From 891ea36c0189607d2a7641a8d0ebe679515499c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 9 Jul 2018 16:07:18 +0200 Subject: [PATCH 08/16] configure libhdfs3 --- dev/hdfs_integration/hdfs_integration.sh | 6 +- .../libhdfs3-client-config.xml | 332 ++++++++++++++++++ 2 files changed, 337 insertions(+), 1 deletion(-) create mode 100644 dev/hdfs_integration/libhdfs3-client-config.xml diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index 29f322c264c..6e6ef938095 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -86,6 +86,10 @@ python setup.py build_ext \ popd # Run tests +export LIBHDFS3_CONF=arrow/dev/hdfs_integration/libhdfs3-client-config.xml + +# C++ arrow/cpp/build/debug/io-hdfs-test -#python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs +# Python +# python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs diff --git a/dev/hdfs_integration/libhdfs3-client-config.xml b/dev/hdfs_integration/libhdfs3-client-config.xml new file mode 100644 index 00000000000..f929929b386 --- /dev/null +++ b/dev/hdfs_integration/libhdfs3-client-config.xml @@ -0,0 +1,332 @@ + + + + + + + + + + + + + + + rpc.client.timeout + 3600000 + + timeout interval of a RPC invocation in millisecond. default is 3600000. 
+ + + + rpc.client.connect.tcpnodelay + true + + whether set socket TCP_NODELAY to true when connect to RPC server. default is true. + + + + + rpc.client.max.idle + 10000 + + the max idle time of a RPC connection in millisecond. default is 10000. + + + + + rpc.client.ping.interval + 10000 + + the interval which the RPC client send a heart beat to server. 0 means disable, default is 10000. + + + + + rpc.client.connect.timeout + 600000 + + the timeout interval in millisecond when the RPC client is trying to setup the connection. default is 600000. + + + + + rpc.client.connect.retry + 10 + + the max retry times if the RPC client fail to setup the connection to server. default is 10. + + + + + rpc.client.read.timeout + 3600000 + + the timeout interval in millisecond when the RPC client is trying to read from server. default is 3600000. + + + + + rpc.client.write.timeout + 3600000 + + the timeout interval in millisecond when the RPC client is trying to write to server. default is 3600000. + + + + + rpc.client.socket.linger.timeout + -1 + + set value to socket SO_LINGER when connect to RPC server. -1 means default OS value. default is -1. + + + + + + dfs.client.read.shortcircuit + false + + whether reading block file bypass datanode if the block and the client are on the same node. default is true. + + + + + dfs.default.replica + 1 + + the default number of replica. default is 3. + + + + + dfs.prefetchsize + 10 + + the default number of blocks which information will be prefetched. default is 10. + + + + + dfs.client.failover.max.attempts + 15 + + if multiply namenodes are configured, it is the max retry times when the dfs client try to issue a RPC call. default is 15. + + + + + dfs.default.blocksize + 134217728 + + default block size. default is 134217728. + + + + + dfs.client.log.severity + INFO + + the minimal log severity level, valid values include FATAL, ERROR, INFO, DEBUG1, DEBUG2, DEBUG3. default is INFO. + + + + + + input.connect.timeout + 600000 + + the timeout interval in millisecond when the input stream is trying to setup the connection to datanode. default is 600000. + + + + + input.read.timeout + 3600000 + + the timeout interval in millisecond when the input stream is trying to read from datanode. default is 3600000. + + + + + input.write.timeout + 3600000 + + the timeout interval in millisecond when the input stream is trying to write to datanode. default is 3600000. + + + + + input.localread.default.buffersize + 2097152 + + number of bytes of the buffer which is used to hold the data from block file and verify checksum. + it is only used when "dfs.client.read.shortcircuit" is set to true. default is 1048576. + + + + + input.localread.blockinfo.cachesize + 1000 + + the size of block file path information cache. default is 1000. + + + + + input.read.getblockinfo.retry + 3 + + the max retry times when the client fail to get block information from namenode. default is 3. + + + + + + output.replace-datanode-on-failure + false + + whether the client add new datanode into pipeline if the number of nodes in pipeline is less the specified number of replicas. default is false. + + + + + output.default.chunksize + 512 + + the number of bytes of a chunk in pipeline. default is 512. + + + + + output.default.packetsize + 65536 + + the number of bytes of a packet in pipeline. default is 65536. + + + + + output.default.write.retry + 10 + + the max retry times when the client fail to setup the pipeline. default is 10. 
+ + + + + output.connect.timeout + 600000 + + the timeout interval in millisecond when the output stream is trying to setup the connection to datanode. default is 600000. + + + + + output.read.timeout + 3600000 + + the timeout interval in millisecond when the output stream is trying to read from datanode. default is 3600000. + + + + + output.write.timeout + 3600000 + + the timeout interval in millisecond when the output stream is trying to write to datanode. default is 3600000. + + + + + output.packetpool.size + 1024 + + the max number of packets in a file's packet pool. default is 1024. + + + + + output.close.timeout + 900000 + + the timeout interval in millisecond when close an output stream. default is 900000. + + + + + dfs.domain.socket.path + /var/lib/hadoop-hdfs/dn_socket + + Optional. This is a path to a UNIX domain socket that will be used for + communication between the DataNode and local HDFS clients. + If the string "_PORT" is present in this path, it will be replaced by the + TCP port of the DataNode. + + + + + dfs.client.use.legacy.blockreader.local + false + + Legacy short-circuit reader implementation based on HDFS-2246 is used + if this configuration parameter is true. + This is for the platforms other than Linux + where the new implementation based on HDFS-347 is not available. + + + + From 7e3c03328fd5395accbe73b1c7119efe9debf3b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 9 Jul 2018 20:55:28 +0200 Subject: [PATCH 09/16] deepend on hadoop image to have libhdfs preinstalled --- dev/hdfs_integration/Dockerfile | 16 ++++++++-------- dev/hdfs_integration/hdfs_integration.sh | 5 ++--- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile index a086966488b..df45338bd9b 100644 --- a/dev/hdfs_integration/Dockerfile +++ b/dev/hdfs_integration/Dockerfile @@ -15,26 +15,26 @@ # limitations under the License. 
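#
# Why gelog/hadoop: building on the same image that backs the hdfs-namenode
# and hdfs-datanode compose services gives this container a matching Hadoop
# install, so libhdfs (the JNI client) and the Hadoop jars are already present.
# A sketch of what the integration script is then expected to export (the
# exact paths depend on the base image's HADOOP_HOME):
#
#   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/
#   export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
#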
# -FROM ubuntu:18.04 +# FROM ubuntu:18.04 +FROM gelog/hadoop + +ENV CC=gcc \ + CXX=g++ \ + PATH=/opt/conda/bin:$PATH RUN apt-get update -y \ && apt-get install -y \ - gcc-8 \ - g++-8 \ + gcc \ + g++ \ git \ wget \ ninja-build -ENV CC=gcc-8 -ENV CXX=g++-8 - # Miniconda - Python 3.6, 64-bit, x86, latest RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O conda.sh \ && /bin/bash conda.sh -b -p /opt/conda \ && rm conda.sh -ENV PATH="/opt/conda/bin:$PATH" - # create conda env with the required dependences RUN conda create -y -q -c conda-forge -n pyarrow-dev \ python=3.6 \ diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index 6e6ef938095..7ac0afe377d 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -37,8 +37,6 @@ export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" export PYARROW_CXXFLAGS=$CXXFLAGS export PYARROW_CMAKE_GENERATOR=Ninja -export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} - # Install arrow-cpp mkdir -p arrow/cpp/build pushd arrow/cpp/build @@ -86,10 +84,11 @@ python setup.py build_ext \ popd # Run tests +export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` export LIBHDFS3_CONF=arrow/dev/hdfs_integration/libhdfs3-client-config.xml # C++ arrow/cpp/build/debug/io-hdfs-test # Python -# python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs +python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs From c910538f9ff7bb83c4dcbb559fd1ac0493491712 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 23 Jul 2018 11:51:22 +0200 Subject: [PATCH 10/16] use filesystem.read_parquet for both directory and single file path --- python/pyarrow/parquet.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 85cec6712bb..2831c41c1d3 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1004,9 +1004,8 @@ def read_table(source, columns=None, nthreads=1, metadata=None, use_pandas_metadata=False): if is_path(source): fs = _get_fs_from_path(source) - - if fs.isdir(source): - return fs.read_parquet(source, columns=columns, metadata=metadata) + return fs.read_parquet(source, columns=columns, metadata=metadata, + use_pandas_metadata=use_pandas_metadata) pf = ParquetFile(source, metadata=metadata) return pf.read(columns=columns, nthreads=nthreads, From cde939a9ae1e6f4850ffe932be5b5f4e310efb38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 23 Jul 2018 11:55:30 +0200 Subject: [PATCH 11/16] fix path_or_paths checking in _make_manifest --- python/pyarrow/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 2831c41c1d3..70c70b62aee 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -944,7 +944,7 @@ def _make_manifest(path_or_paths, fs, pathsep='/'): common_metadata_path = None metadata_path = None - if len(path_or_paths) == 1: + if isinstance(path_or_paths, list) and len(path_or_paths) == 1: # Dask passes a directory as a list of length 1 path_or_paths = path_or_paths[0] From 3f8318e1a40d6ef5ae4d1ca53e9db836549cd3ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 23 Jul 2018 14:24:06 +0200 Subject: [PATCH 12/16] display errno in error msg; add pkgconfig; export hadoop env vars --- cpp/src/arrow/io/hdfs.cc | 7 ++++--- dev/hdfs_integration/Dockerfile | 12 
++++++------ dev/hdfs_integration/hdfs_integration.sh | 13 ++++++++----- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 73201325023..cf793e64f98 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -434,16 +434,17 @@ class HadoopFileSystem::HadoopFileSystemImpl { // If the directory is empty, entries is NULL but errno is 0. Non-zero // errno indicates error // - // Note: errno is thread-locala + // Note: errno is thread-local if (errno == 0) { num_entries = 0; } else { - return Status::IOError("HDFS: list directory failed"); + std::stringstream ss; + ss << "HDFS list directory failed, errno: " << errno; + return Status::IOError(ss.str()); } } // Allocate additional space for elements - int vec_offset = static_cast(listing->size()); listing->resize(vec_offset + num_entries); diff --git a/dev/hdfs_integration/Dockerfile b/dev/hdfs_integration/Dockerfile index df45338bd9b..71dcbe3aa2c 100644 --- a/dev/hdfs_integration/Dockerfile +++ b/dev/hdfs_integration/Dockerfile @@ -15,7 +15,6 @@ # limitations under the License. # -# FROM ubuntu:18.04 FROM gelog/hadoop ENV CC=gcc \ @@ -24,11 +23,12 @@ ENV CC=gcc \ RUN apt-get update -y \ && apt-get install -y \ - gcc \ - g++ \ - git \ - wget \ - ninja-build + gcc \ + g++ \ + git \ + wget \ + pkg-config \ + ninja-build # Miniconda - Python 3.6, 64-bit, x86, latest RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O conda.sh \ diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index 7ac0afe377d..c699263918c 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -25,13 +25,17 @@ set -e # Activate conda environment source activate pyarrow-dev -# Set environment variable +# Arrow build variables export ARROW_BUILD_TYPE=debug export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX export ARROW_HOME=$CONDA_PREFIX export PARQUET_HOME=$CONDA_PREFIX +# Hadoop variables +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/ +export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` + # For newer GCC per https://arrow.apache.org/docs/python/development.html#known-issues export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" export PYARROW_CXXFLAGS=$CXXFLAGS @@ -84,11 +88,10 @@ python setup.py build_ext \ popd # Run tests -export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` export LIBHDFS3_CONF=arrow/dev/hdfs_integration/libhdfs3-client-config.xml -# C++ -arrow/cpp/build/debug/io-hdfs-test - # Python python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs + +# C++ +arrow/cpp/build/debug/io-hdfs-test From 78857353face7ec7c63443106f41a07a76106fd9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 23 Jul 2018 14:52:16 -0400 Subject: [PATCH 13/16] Remove python/testing/*hdfs*, add instructions to dev/README.md for running hdfs tests Change-Id: I8e2908c9ad2d81596f427858f2af7e2d151bfb1c --- dev/README.md | 9 +- python/testing/README.md | 6 - python/testing/hdfs/Dockerfile | 50 --- python/testing/hdfs/libhdfs3-hdfs-client.xml | 332 ------------------ .../testing/hdfs/restart_docker_container.sh | 38 -- python/testing/hdfs/run_tests.sh | 41 --- 6 files changed, 8 insertions(+), 468 deletions(-) delete mode 100644 python/testing/hdfs/Dockerfile delete mode 100644 python/testing/hdfs/libhdfs3-hdfs-client.xml delete mode 100644 python/testing/hdfs/restart_docker_container.sh delete mode 100755 
python/testing/hdfs/run_tests.sh diff --git a/dev/README.md b/dev/README.md index 62ffb0a8f14..971fb5f1ddb 100644 --- a/dev/README.md +++ b/dev/README.md @@ -120,4 +120,11 @@ For JavaScript-specific releases, use a different verification script: ```shell bash dev/release/js-verify-release-candidate.sh 0.7.0 0 -``` \ No newline at end of file +``` +# Integration testing + +## HDFS C++ / Python support + +```shell +run_docker_compose.sh hdfs_integration +``` diff --git a/python/testing/README.md b/python/testing/README.md index 0ebeec4a1c3..d7d0ff0bb7f 100644 --- a/python/testing/README.md +++ b/python/testing/README.md @@ -19,12 +19,6 @@ # Testing tools for odds and ends -## Testing HDFS file interface - -```shell -./test_hdfs.sh -``` - ## Testing Dask integration Initial integration testing with Dask has been Dockerized. diff --git a/python/testing/hdfs/Dockerfile b/python/testing/hdfs/Dockerfile deleted file mode 100644 index 97355137ff3..00000000000 --- a/python/testing/hdfs/Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# TODO Replace this with a complete clean image build -FROM cpcloud86/impala:metastore - -USER root - -RUN apt-add-repository -y ppa:ubuntu-toolchain-r/test && \ - apt-get update && \ - apt-get install -y \ - gcc-4.9 \ - g++-4.9 \ - build-essential \ - autotools-dev \ - autoconf \ - gtk-doc-tools \ - autoconf-archive \ - libgirepository1.0-dev \ - libtool \ - libjemalloc-dev \ - ccache \ - valgrind \ - gdb - -RUN wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key|sudo apt-key add - && \ - apt-add-repository -y \ - "deb http://llvm.org/apt/trusty/ llvm-toolchain-trusty-4.0 main" && \ - apt-get update && \ - apt-get install -y clang-4.0 clang-format-4.0 clang-tidy-4.0 - -USER ubuntu - -RUN wget -O /tmp/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ - rm /tmp/miniconda.sh diff --git a/python/testing/hdfs/libhdfs3-hdfs-client.xml b/python/testing/hdfs/libhdfs3-hdfs-client.xml deleted file mode 100644 index f929929b386..00000000000 --- a/python/testing/hdfs/libhdfs3-hdfs-client.xml +++ /dev/null @@ -1,332 +0,0 @@ - - - - - - - - - - - - - - - rpc.client.timeout - 3600000 - - timeout interval of a RPC invocation in millisecond. default is 3600000. - - - - rpc.client.connect.tcpnodelay - true - - whether set socket TCP_NODELAY to true when connect to RPC server. default is true. - - - - - rpc.client.max.idle - 10000 - - the max idle time of a RPC connection in millisecond. default is 10000. - - - - - rpc.client.ping.interval - 10000 - - the interval which the RPC client send a heart beat to server. 0 means disable, default is 10000. 
- - - - - rpc.client.connect.timeout - 600000 - - the timeout interval in millisecond when the RPC client is trying to setup the connection. default is 600000. - - - - - rpc.client.connect.retry - 10 - - the max retry times if the RPC client fail to setup the connection to server. default is 10. - - - - - rpc.client.read.timeout - 3600000 - - the timeout interval in millisecond when the RPC client is trying to read from server. default is 3600000. - - - - - rpc.client.write.timeout - 3600000 - - the timeout interval in millisecond when the RPC client is trying to write to server. default is 3600000. - - - - - rpc.client.socket.linger.timeout - -1 - - set value to socket SO_LINGER when connect to RPC server. -1 means default OS value. default is -1. - - - - - - dfs.client.read.shortcircuit - false - - whether reading block file bypass datanode if the block and the client are on the same node. default is true. - - - - - dfs.default.replica - 1 - - the default number of replica. default is 3. - - - - - dfs.prefetchsize - 10 - - the default number of blocks which information will be prefetched. default is 10. - - - - - dfs.client.failover.max.attempts - 15 - - if multiply namenodes are configured, it is the max retry times when the dfs client try to issue a RPC call. default is 15. - - - - - dfs.default.blocksize - 134217728 - - default block size. default is 134217728. - - - - - dfs.client.log.severity - INFO - - the minimal log severity level, valid values include FATAL, ERROR, INFO, DEBUG1, DEBUG2, DEBUG3. default is INFO. - - - - - - input.connect.timeout - 600000 - - the timeout interval in millisecond when the input stream is trying to setup the connection to datanode. default is 600000. - - - - - input.read.timeout - 3600000 - - the timeout interval in millisecond when the input stream is trying to read from datanode. default is 3600000. - - - - - input.write.timeout - 3600000 - - the timeout interval in millisecond when the input stream is trying to write to datanode. default is 3600000. - - - - - input.localread.default.buffersize - 2097152 - - number of bytes of the buffer which is used to hold the data from block file and verify checksum. - it is only used when "dfs.client.read.shortcircuit" is set to true. default is 1048576. - - - - - input.localread.blockinfo.cachesize - 1000 - - the size of block file path information cache. default is 1000. - - - - - input.read.getblockinfo.retry - 3 - - the max retry times when the client fail to get block information from namenode. default is 3. - - - - - - output.replace-datanode-on-failure - false - - whether the client add new datanode into pipeline if the number of nodes in pipeline is less the specified number of replicas. default is false. - - - - - output.default.chunksize - 512 - - the number of bytes of a chunk in pipeline. default is 512. - - - - - output.default.packetsize - 65536 - - the number of bytes of a packet in pipeline. default is 65536. - - - - - output.default.write.retry - 10 - - the max retry times when the client fail to setup the pipeline. default is 10. - - - - - output.connect.timeout - 600000 - - the timeout interval in millisecond when the output stream is trying to setup the connection to datanode. default is 600000. - - - - - output.read.timeout - 3600000 - - the timeout interval in millisecond when the output stream is trying to read from datanode. default is 3600000. - - - - - output.write.timeout - 3600000 - - the timeout interval in millisecond when the output stream is trying to write to datanode. 
default is 3600000. - - - - - output.packetpool.size - 1024 - - the max number of packets in a file's packet pool. default is 1024. - - - - - output.close.timeout - 900000 - - the timeout interval in millisecond when close an output stream. default is 900000. - - - - - dfs.domain.socket.path - /var/lib/hadoop-hdfs/dn_socket - - Optional. This is a path to a UNIX domain socket that will be used for - communication between the DataNode and local HDFS clients. - If the string "_PORT" is present in this path, it will be replaced by the - TCP port of the DataNode. - - - - - dfs.client.use.legacy.blockreader.local - false - - Legacy short-circuit reader implementation based on HDFS-2246 is used - if this configuration parameter is true. - This is for the platforms other than Linux - where the new implementation based on HDFS-347 is not available. - - - - diff --git a/python/testing/hdfs/restart_docker_container.sh b/python/testing/hdfs/restart_docker_container.sh deleted file mode 100644 index 15076cc2873..00000000000 --- a/python/testing/hdfs/restart_docker_container.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -export ARROW_TEST_NN_HOST=arrow-hdfs -export ARROW_TEST_IMPALA_HOST=$ARROW_TEST_NN_HOST -export ARROW_TEST_IMPALA_PORT=21050 -export ARROW_TEST_WEBHDFS_PORT=50070 -export ARROW_TEST_WEBHDFS_USER=ubuntu - -docker stop $ARROW_TEST_NN_HOST -docker rm $ARROW_TEST_NN_HOST - -docker run -d -it --name $ARROW_TEST_NN_HOST \ - -v $PWD:/io \ - --hostname $ARROW_TEST_NN_HOST \ - --shm-size=2gb \ - -p $ARROW_TEST_WEBHDFS_PORT -p $ARROW_TEST_IMPALA_PORT \ - arrow-hdfs-test - -while ! docker exec $ARROW_TEST_NN_HOST impala-shell -q 'SELECT VERSION()'; do - sleep 1 -done diff --git a/python/testing/hdfs/run_tests.sh b/python/testing/hdfs/run_tests.sh deleted file mode 100755 index e0d36df58a3..00000000000 --- a/python/testing/hdfs/run_tests.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -ex - -HERE=$(cd `dirname "${BASH_SOURCE[0]:-$0}"` && pwd) - -source $HERE/../set_env_common.sh -source $HERE/../setup_toolchain.sh -source $HERE/../functions.sh - -git clone https://github.com/apache/arrow.git $ARROW_CHECKOUT - -use_clang - -bootstrap_python_env 3.6 - -build_arrow -build_parquet - -build_pyarrow - -$ARROW_CPP_BUILD_DIR/debug/io-hdfs-test - -python -m pytest -vv -r sxX -s $PYARROW_PATH --parquet --hdfs From 5d3e6ff666df338f20d6297a21dad01224b1515f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 23 Jul 2018 15:04:27 -0400 Subject: [PATCH 14/16] Give HDFS integration C++ build a distinct subdir name Change-Id: I44f57bc5b3ea28966e1562e404bdce65afe0cfab --- dev/hdfs_integration/hdfs_integration.sh | 10 +++++----- dev/spark_integration/spark_integration.sh | 3 +-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index c699263918c..795e9362915 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -42,8 +42,8 @@ export PYARROW_CXXFLAGS=$CXXFLAGS export PYARROW_CMAKE_GENERATOR=Ninja # Install arrow-cpp -mkdir -p arrow/cpp/build -pushd arrow/cpp/build +mkdir -p arrow/cpp/hdfs-integration-build +pushd arrow/cpp/hdfs-integration-build cmake -GNinja \ -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ @@ -60,8 +60,8 @@ ninja install popd # Install parquet-cpp -mkdir -p parquet-cpp/build -pushd parquet-cpp/build +mkdir -p parquet-cpp/hdfs-integration-build +pushd parquet-cpp/hdfs-integration-build cmake -GNinja \ -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ @@ -94,4 +94,4 @@ export LIBHDFS3_CONF=arrow/dev/hdfs_integration/libhdfs3-client-config.xml python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs # C++ -arrow/cpp/build/debug/io-hdfs-test +arrow/cpp/hdfs-integration-build/debug/io-hdfs-test diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh index 8ca4dc3ac97..6c0a3f0bb5d 100755 --- a/dev/spark_integration/spark_integration.sh +++ b/dev/spark_integration/spark_integration.sh @@ -87,6 +87,5 @@ build/mvn -Dtest=none -DwildcardSuites="$SPARK_SCALA_TESTS" test # Run pyarrow related Python tests only SPARK_PYTHON_TESTS="ArrowTests PandasUDFTests ScalarPandasUDFTests GroupedMapPandasUDFTests GroupedAggPandasUDFTests" echo "Testing PySpark: $SPARK_PYTHON_TESTS" -SPARK_TESTING=1 bin/pyspark pyspark.sql.tests $SPARK_PYTHON_TESTS +SPARK_TESTING=1 bin/pyspark pyspark.sql.tests $SPARK_PYTHON_TESTS popd - From 3f95b226985ad2e60107ccf1ab2a877a93b466ad Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 23 Jul 2018 17:31:43 -0400 Subject: [PATCH 15/16] More robust to state of local dev area Change-Id: I1f9879ab7eb150f38c66d29cfe8b41792a7b5cf8 --- dev/docker-compose.yml | 1 + dev/hdfs_integration/hdfs_integration.sh | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/dev/docker-compose.yml b/dev/docker-compose.yml index 74d45c63bbd..c4500986caf 100644 --- a/dev/docker-compose.yml +++ b/dev/docker-compose.yml @@ -19,6 +19,7 @@ services: hdfs-namenode: image: gelog/hadoop + shm_size: 2G ports: - "9000:9000" - "50070:50070" diff --git a/dev/hdfs_integration/hdfs_integration.sh b/dev/hdfs_integration/hdfs_integration.sh index 795e9362915..c67f18d28a0 100755 --- a/dev/hdfs_integration/hdfs_integration.sh +++ b/dev/hdfs_integration/hdfs_integration.sh @@ -79,6 +79,9 @@ popd # Install pyarrow pushd arrow/python +# Clear the build directory so we are guaranteed a fresh set of 
extensions +rm -rf build/ + python setup.py build_ext \ --build-type=$ARROW_BUILD_TYPE \ --with-parquet \ @@ -91,7 +94,8 @@ popd export LIBHDFS3_CONF=arrow/dev/hdfs_integration/libhdfs3-client-config.xml # Python -python -m pytest -vv -r sxX -s arrow/python/pyarrow --parquet --hdfs +python -m pytest -vv -r sxX -s arrow/python/pyarrow \ + --only-parquet --only-hdfs # C++ arrow/cpp/hdfs-integration-build/debug/io-hdfs-test From 6dbbfb527a52da1db68058571b765bb6a3fe7667 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 23 Jul 2018 18:33:11 -0400 Subject: [PATCH 16/16] Apply Krisztian's errno 2 fix, nicer formatting for libhdfs errors Change-Id: I5290d60c15d51271c51df7565eb5fb1cadd4ff5e --- cpp/src/arrow/io/hdfs.cc | 42 +++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index cf793e64f98..789ffbd057a 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -37,28 +37,16 @@ using std::size_t; namespace arrow { namespace io { -#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ - do { \ - if (RETURN_VALUE == -1) { \ - std::stringstream ss; \ - ss << "HDFS: " << WHAT << " failed"; \ - return Status::IOError(ss.str()); \ - } \ +#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ + do { \ + if (RETURN_VALUE == -1) { \ + std::stringstream ss; \ + ss << "HDFS " << WHAT << " failed, errno: " << errno << " (" << strerror(errno) \ + << ")"; \ + return Status::IOError(ss.str()); \ + } \ } while (0) -static Status CheckReadResult(int ret) { - // Check for error on -1 (possibly errno set) - - // ret == 0 at end of file, which is OK - if (ret == -1) { - // EOF - std::stringstream ss; - ss << "HDFS read failed, errno: " << errno; - return Status::IOError(ss.str()); - } - return Status::OK(); -} - static constexpr int kDefaultHdfsBufferSize = 1 << 16; // ---------------------------------------------------------------------- @@ -129,7 +117,7 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { RETURN_NOT_OK(Seek(position)); return Read(nbytes, bytes_read, buffer); } - RETURN_NOT_OK(CheckReadResult(ret)); + CHECK_FAILURE(ret, "read"); *bytes_read = ret; return Status::OK(); } @@ -156,7 +144,7 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { tSize ret = driver_->Read( fs_, file_, reinterpret_cast(buffer) + total_bytes, static_cast(std::min(buffer_size_, nbytes - total_bytes))); - RETURN_NOT_OK(CheckReadResult(ret)); + CHECK_FAILURE(ret, "read"); total_bytes += ret; if (ret == 0) { break; @@ -428,6 +416,7 @@ class HadoopFileSystem::HadoopFileSystemImpl { Status ListDirectory(const std::string& path, std::vector* listing) { int num_entries = 0; + errno = 0; hdfsFileInfo* entries = driver_->ListDirectory(fs_, path.c_str(), &num_entries); if (entries == nullptr) { @@ -435,11 +424,16 @@ class HadoopFileSystem::HadoopFileSystemImpl { // errno indicates error // // Note: errno is thread-local - if (errno == 0) { + // + // XXX(wesm): ARROW-2300; we found with Hadoop 2.6 that libhdfs would set + // errno 2/ENOENT for empty directories. To be more robust to this we + // double check this case + if ((errno == 0) || (errno == ENOENT && Exists(path))) { num_entries = 0; } else { std::stringstream ss; - ss << "HDFS list directory failed, errno: " << errno; + ss << "HDFS list directory failed, errno: " << errno << " (" << strerror(errno) + << ")"; return Status::IOError(ss.str()); } }