From 09c961eed62554fc23269c729a3672e14f208938 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 9 Nov 2017 14:45:30 -0800 Subject: [PATCH 1/8] added initial docker files --- dev/docker-compose.yml | 5 ++ dev/spark_integration/Dockerfile | 48 ++++++++++++++++++++ dev/spark_integration/spark_integration.sh | 53 ++++++++++++++++++++++ 3 files changed, 106 insertions(+) create mode 100644 dev/spark_integration/Dockerfile create mode 100755 dev/spark_integration/spark_integration.sh diff --git a/dev/docker-compose.yml b/dev/docker-compose.yml index a73fd1bfbba..b1e593cf480 100644 --- a/dev/docker-compose.yml +++ b/dev/docker-compose.yml @@ -33,3 +33,8 @@ services: context: dask_integration volumes: - ../..:/apache-arrow + spark_integration: + build: + context: spark_integration + volumes: + - ../..:/apache-arrow diff --git a/dev/spark_integration/Dockerfile b/dev/spark_integration/Dockerfile new file mode 100644 index 00000000000..76d95930eb3 --- /dev/null +++ b/dev/spark_integration/Dockerfile @@ -0,0 +1,48 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +FROM ubuntu:14.04 +ADD . /apache-arrow +WORKDIR /apache-arrow +# Basic OS utilities +RUN apt-get update && apt-get install -y \ + wget \ + git \ + maven\ + software-properties-common +# Setup Java +RUN add-apt-repository ppa:openjdk-r/ppa +RUN apt-get update && apt-get install -y openjdk-8-jdk +update-java-alternatives -s java-1.8.0-openjdk-amd64 +ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64 +ENV PATH $PATH:$JAVA_HOME/bin +# This will install conda in /home/ubuntu/miniconda +RUN wget -O /tmp/miniconda.sh \ + https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ + rm /tmp/miniconda.sh +# Create Conda environment +ENV PATH="/home/ubuntu/miniconda/bin:${PATH}" +RUN conda create -y -q -n test-environment \ + python=3.6 +# Install dependencies +RUN conda install -c conda-forge \ + numpy \ + pandas \ + "pytest<=3.1.1" + +CMD ["arrow/dev/spark_integration/spark_integration.sh"] + diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh new file mode 100755 index 00000000000..ea9467a811d --- /dev/null +++ b/dev/spark_integration/spark_integration.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set up environment and working directory +cd /apache-arrow + +export ARROW_BUILD_TYPE=release +export ARROW_HOME=$(pwd)/dist +export PARQUET_HOME=$(pwd)/dist +CONDA_BASE=/home/ubuntu/miniconda +export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} + +# Allow for --user Python installation inside Docker +export HOME=$(pwd) + +# Clean up and get the Spark master branch from github +#rm -rf spark .local +#rm -rf spark +export GIT_COMMITTER_NAME="Nobody" +export GIT_COMMITTER_EMAIL="nobody@nowhere.com" +git clone https://github.com/apache/spark.git + +# Install Arrow to local maven repo (in container?) and get the version +pushd arrow/java +mvn clean install -DskipTests -Drat.skip=true -Dmaven.repo.local=/apache-arrow/.m2/repository +ARROW_VERSION=`mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | sed -n -e '/^\[.*\]/ !{ /^[0-9]/ { p; q } }'` +popd + +# Update Spark pom with the Arrow version just installed and build Spark +pushd spark +sed -i -e "s/\(.*\).*\(<\/arrow.version>\)/\1$ARROW_VERSION\2/g" ./pom.xml +build/mvn clean package -DskipTests -Dmaven.repo.local=/apache-arrow/.m2/repository + +# Run Arrow related Scala tests +build/mvn test -Dtest=ArrowConvertersSuite,ArrowUtilsSuite -Dmaven.repo.local=/apache-arrow/.m2/repository +popd + + From b04b0e57dfa26da74ced7ae63e5793356824df9e Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 14 Nov 2017 10:38:10 -0800 Subject: [PATCH 2/8] docker running Scala Arrow tests --- dev/spark_integration/Dockerfile | 36 +++++++++---------- dev/spark_integration/spark_integration.sh | 40 ++++++++++++++-------- 2 files changed, 42 insertions(+), 34 deletions(-) diff --git a/dev/spark_integration/Dockerfile b/dev/spark_integration/Dockerfile index 76d95930eb3..c2de9f1a3d4 100644 --- a/dev/spark_integration/Dockerfile +++ b/dev/spark_integration/Dockerfile @@ -14,35 +14,31 @@ # See the License for the specific language governing permissions and # limitations under the License. # -FROM ubuntu:14.04 +FROM maven:3.5.2-jdk-8-slim ADD . 
/apache-arrow WORKDIR /apache-arrow # Basic OS utilities RUN apt-get update && apt-get install -y \ wget \ git \ - maven\ software-properties-common -# Setup Java -RUN add-apt-repository ppa:openjdk-r/ppa -RUN apt-get update && apt-get install -y openjdk-8-jdk -update-java-alternatives -s java-1.8.0-openjdk-amd64 -ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64 -ENV PATH $PATH:$JAVA_HOME/bin + # This will install conda in /home/ubuntu/miniconda -RUN wget -O /tmp/miniconda.sh \ - https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ - rm /tmp/miniconda.sh +#RUN wget -O /tmp/miniconda.sh \ +# https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ +# bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ +# rm /tmp/miniconda.sh # Create Conda environment -ENV PATH="/home/ubuntu/miniconda/bin:${PATH}" -RUN conda create -y -q -n test-environment \ - python=3.6 +#ENV PATH="/home/ubuntu/miniconda/bin:${PATH}" +#RUN conda create -y -q -n test-environment \ +# python=3.6 # Install dependencies -RUN conda install -c conda-forge \ - numpy \ - pandas \ - "pytest<=3.1.1" +#RUN conda install -c conda-forge \ +# numpy \ +# pandas \ +# "pytest<=3.1.1" + +CMD arrow/dev/spark_integration/spark_integration.sh -CMD ["arrow/dev/spark_integration/spark_integration.sh"] +# BUILD WITH: $ docker build -f arrow/dev/spark_integration/Dockerfile -t spark-arrow . diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh index ea9467a811d..e385d2acf14 100755 --- a/dev/spark_integration/spark_integration.sh +++ b/dev/spark_integration/spark_integration.sh @@ -24,30 +24,42 @@ export ARROW_HOME=$(pwd)/dist export PARQUET_HOME=$(pwd)/dist CONDA_BASE=/home/ubuntu/miniconda export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} +export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m" # Allow for --user Python installation inside Docker -export HOME=$(pwd) +#export HOME=$(pwd) -# Clean up and get the Spark master branch from github -#rm -rf spark .local -#rm -rf spark -export GIT_COMMITTER_NAME="Nobody" -export GIT_COMMITTER_EMAIL="nobody@nowhere.com" -git clone https://github.com/apache/spark.git - -# Install Arrow to local maven repo (in container?) 
and get the version +# Install Arrow to local maven repo and get the version pushd arrow/java -mvn clean install -DskipTests -Drat.skip=true -Dmaven.repo.local=/apache-arrow/.m2/repository +echo "Building and installing Arrow Java" +mvn -DskipTests -Drat.skip=true clean install ARROW_VERSION=`mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | sed -n -e '/^\[.*\]/ !{ /^[0-9]/ { p; q } }'` +echo "Using Arrow version $ARROW_VERSION" popd -# Update Spark pom with the Arrow version just installed and build Spark +# Get the Spark master branch from github +export GIT_COMMITTER_NAME="Nobody" +export GIT_COMMITTER_EMAIL="nobody@nowhere.com" +rm -rf spark +git clone https://github.com/apache/spark.git + +# Update Spark pom with the Arrow version just installed and build Spark, need package phase for pyspark pushd spark sed -i -e "s/\(.*\).*\(<\/arrow.version>\)/\1$ARROW_VERSION\2/g" ./pom.xml -build/mvn clean package -DskipTests -Dmaven.repo.local=/apache-arrow/.m2/repository +echo "Building Spark with Arrow $ARROW_VERSION" +mvn -DskipTests clean package -# Run Arrow related Scala tests -build/mvn test -Dtest=ArrowConvertersSuite,ArrowUtilsSuite -Dmaven.repo.local=/apache-arrow/.m2/repository +# Run Arrow related Scala tests only, NOTE: -Dtest=_NonExist_ is to enable surefire test discovery without running any tests so that Scalatest can run +SPARK_SCALA_TESTS="org.apache.spark.sql.execution.arrow,org.apache.spark.sql.execution.vectorized.ColumnarBatchSuite,org.apache.spark.sql.execution.vectorized.ArrowColumnVectorSuite" +echo "Testing Spark $SPARK_SCALA_TESTS" +mvn -Dtest=_NonExist_ -DwildcardSuites="'$SPARK_SCALA_TESTS'" test -pl sql/core + +# Run pyarrow related Python tests only +#SPARK_TESTING=1 bin/pyspark pyspark.sql.tests ArrowTests GroupbyApplyTests VectorizedUDFTests popd +# Clean up +#rm -rf spark .local +rm -rf spark + From e38d43db3a5206c1562d0478f4c911aca7189d3b Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 14 Nov 2017 14:27:41 -0800 Subject: [PATCH 3/8] using build/mvn script for spark builds --- dev/spark_integration/Dockerfile | 4 +--- dev/spark_integration/spark_integration.sh | 6 ++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dev/spark_integration/Dockerfile b/dev/spark_integration/Dockerfile index c2de9f1a3d4..c8e7fcb0741 100644 --- a/dev/spark_integration/Dockerfile +++ b/dev/spark_integration/Dockerfile @@ -20,9 +20,7 @@ WORKDIR /apache-arrow # Basic OS utilities RUN apt-get update && apt-get install -y \ wget \ - git \ - software-properties-common - + git # This will install conda in /home/ubuntu/miniconda #RUN wget -O /tmp/miniconda.sh \ # https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh index e385d2acf14..f0b33951750 100755 --- a/dev/spark_integration/spark_integration.sh +++ b/dev/spark_integration/spark_integration.sh @@ -47,18 +47,20 @@ git clone https://github.com/apache/spark.git pushd spark sed -i -e "s/\(.*\).*\(<\/arrow.version>\)/\1$ARROW_VERSION\2/g" ./pom.xml echo "Building Spark with Arrow $ARROW_VERSION" -mvn -DskipTests clean package +build/mvn -DskipTests clean package # Run Arrow related Scala tests only, NOTE: -Dtest=_NonExist_ is to enable surefire test discovery without running any tests so that Scalatest can run 
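# A hedged aside on the mechanism described in the NOTE above: -DwildcardSuites is a
# scalatest-maven-plugin property that accepts a comma-separated list of suite classes
# or packages, while -Dtest drives surefire, so pointing -Dtest at a name that matches
# nothing skips the JUnit tests and leaves only the listed ScalaTest suites to run.
# A minimal sketch, reusing a suite name from the first revision of this script:
#
#   build/mvn -Dtest=none \
#     -DwildcardSuites=org.apache.spark.sql.execution.arrow.ArrowConvertersSuite test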
SPARK_SCALA_TESTS="org.apache.spark.sql.execution.arrow,org.apache.spark.sql.execution.vectorized.ColumnarBatchSuite,org.apache.spark.sql.execution.vectorized.ArrowColumnVectorSuite" echo "Testing Spark $SPARK_SCALA_TESTS" -mvn -Dtest=_NonExist_ -DwildcardSuites="'$SPARK_SCALA_TESTS'" test -pl sql/core +# TODO: should be able to only build spark-sql tests with adding "-pl sql/core" but not currently working +build/mvn -Dtest=none -DwildcardSuites="$SPARK_SCALA_TESTS" test # Run pyarrow related Python tests only #SPARK_TESTING=1 bin/pyspark pyspark.sql.tests ArrowTests GroupbyApplyTests VectorizedUDFTests popd # Clean up +echo "Cleaning up.." #rm -rf spark .local rm -rf spark From 95eb22a5fe52bf2e5c8d59ae8d4c72d58d4485ad Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 25 Jan 2018 17:36:30 -0800 Subject: [PATCH 4/8] running python tests now and building pyarrow, not yet passing pyspark tests --- dev/spark_integration/Dockerfile | 37 ++++++----- dev/spark_integration/spark_integration.sh | 73 +++++++++++++++++----- 2 files changed, 76 insertions(+), 34 deletions(-) diff --git a/dev/spark_integration/Dockerfile b/dev/spark_integration/Dockerfile index c8e7fcb0741..7e3c1e4bedd 100644 --- a/dev/spark_integration/Dockerfile +++ b/dev/spark_integration/Dockerfile @@ -15,28 +15,31 @@ # limitations under the License. # FROM maven:3.5.2-jdk-8-slim -ADD . /apache-arrow -WORKDIR /apache-arrow + # Basic OS utilities RUN apt-get update && apt-get install -y \ wget \ - git + git g++ cmake \ + libjemalloc-dev libboost-dev \ + libboost-filesystem-dev libboost-system-dev + # This will install conda in /home/ubuntu/miniconda -#RUN wget -O /tmp/miniconda.sh \ -# https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ -# bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ -# rm /tmp/miniconda.sh +RUN wget -O /tmp/miniconda.sh \ + https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ + rm /tmp/miniconda.sh + # Create Conda environment -#ENV PATH="/home/ubuntu/miniconda/bin:${PATH}" -#RUN conda create -y -q -n test-environment \ -# python=3.6 -# Install dependencies -#RUN conda install -c conda-forge \ -# numpy \ -# pandas \ -# "pytest<=3.1.1" +ENV PATH="/home/ubuntu/miniconda/bin:${PATH}" +RUN conda create -y -q -n pyarrow-dev \ + python=3.5 numpy six setuptools cython pandas pytest \ + cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib \ + gflags brotli jemalloc lz4-c zstd -c conda-forge -CMD arrow/dev/spark_integration/spark_integration.sh +ADD . /apache-arrow +WORKDIR /apache-arrow -# BUILD WITH: $ docker build -f arrow/dev/spark_integration/Dockerfile -t spark-arrow . +CMD arrow/dev/spark_integration/spark_integration.sh +# BUILD: $ docker build -f arrow/dev/spark_integration/Dockerfile -t spark-arrow . 
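# A hedged note on the BUILD command above: the trailing "." assumes it is run from the
# directory that contains the arrow checkout, since "ADD . /apache-arrow" copies the
# whole build context into the image and the CMD then resolves
# arrow/dev/spark_integration/spark_integration.sh inside it. The docker-compose service
# added earlier in this series should give roughly the same flow from the dev/ directory:
#
#   docker-compose build spark_integration
#   docker-compose run spark_integration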
+# RUN: $ docker run -v $HOME/.m2:/root/.m2 spark-arrow diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh index f0b33951750..74d200b2008 100755 --- a/dev/spark_integration/spark_integration.sh +++ b/dev/spark_integration/spark_integration.sh @@ -20,14 +20,36 @@ cd /apache-arrow export ARROW_BUILD_TYPE=release -export ARROW_HOME=$(pwd)/dist -export PARQUET_HOME=$(pwd)/dist +export ARROW_HOME=$(pwd)/arrow CONDA_BASE=/home/ubuntu/miniconda -export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} +export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} +export PYTHONPATH=${ARROW_HOME}/python:${PYTHONPATH} export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m" -# Allow for --user Python installation inside Docker -#export HOME=$(pwd) +# Activate our pyarrow-dev conda env +source activate pyarrow-dev + +# Build arrow-cpp and install +pushd arrow/cpp +rm -rf build/* +mkdir -p build +cd build/ +cmake -DARROW_PYTHON=on -DARROW_HDFS=on -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$ARROW_HOME .. +make -j4 +if [[ $? -ne 0 ]]; then + exit 1 +fi +make install +popd + +# Build pyarrow and install inplace +pushd arrow/python +python setup.py clean +python setup.py build_ext --build-type=release --inplace +if [[ $? -ne 0 ]]; then + exit 1 +fi +popd # Install Arrow to local maven repo and get the version pushd arrow/java @@ -37,31 +59,48 @@ ARROW_VERSION=`mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -De echo "Using Arrow version $ARROW_VERSION" popd -# Get the Spark master branch from github -export GIT_COMMITTER_NAME="Nobody" -export GIT_COMMITTER_EMAIL="nobody@nowhere.com" -rm -rf spark -git clone https://github.com/apache/spark.git +# Build Spark with Arrow +SPARK_REPO=https://github.com/apache/spark.git +SPARK_BRANCH=master + +# Get the Spark repo if not in image already +if [ ! -d "$(pwd)/spark" ]; then + export GIT_COMMITTER_NAME="Nobody" + export GIT_COMMITTER_EMAIL="nobody@nowhere.com" + git clone "$SPARK_REPO" +fi -# Update Spark pom with the Arrow version just installed and build Spark, need package phase for pyspark pushd spark + +# Make sure branch has no modifications +git checkout "$SPARK_BRANCH" +git reset --hard HEAD + +# Update Spark pom with the Arrow version just installed and build Spark, need package phase for pyspark sed -i -e "s/\(.*\).*\(<\/arrow.version>\)/\1$ARROW_VERSION\2/g" ./pom.xml echo "Building Spark with Arrow $ARROW_VERSION" -build/mvn -DskipTests clean package +#build/mvn -DskipTests clean package +build/mvn -DskipTests package # Run Arrow related Scala tests only, NOTE: -Dtest=_NonExist_ is to enable surefire test discovery without running any tests so that Scalatest can run SPARK_SCALA_TESTS="org.apache.spark.sql.execution.arrow,org.apache.spark.sql.execution.vectorized.ColumnarBatchSuite,org.apache.spark.sql.execution.vectorized.ArrowColumnVectorSuite" -echo "Testing Spark $SPARK_SCALA_TESTS" +echo "Testing Spark: $SPARK_SCALA_TESTS" # TODO: should be able to only build spark-sql tests with adding "-pl sql/core" but not currently working build/mvn -Dtest=none -DwildcardSuites="$SPARK_SCALA_TESTS" test +if [[ $? 
-ne 0 ]]; then + exit 1 +fi # Run pyarrow related Python tests only -#SPARK_TESTING=1 bin/pyspark pyspark.sql.tests ArrowTests GroupbyApplyTests VectorizedUDFTests +SPARK_PYTHON_TESTS="ArrowTests PandasUDFTests ScalarPandasUDF GroupbyApplyPandasUDFTests GroupbyAggPandasUDFTests" +echo "Testing PySpark: $SPARK_PYTHON_TESTS" +SPARK_TESTING=1 bin/pyspark pyspark.sql.tests $SPARK_PYTHON_TESTS +if [[ $? -ne 0 ]]; then + exit 1 +fi popd # Clean up echo "Cleaning up.." -#rm -rf spark .local -rm -rf spark - +source deactivate From 3305a3521c67ca614c60c83f524a66bb8077d8a4 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 8 Feb 2018 17:21:36 -0800 Subject: [PATCH 5/8] added fix for using setuptools_scm to get version outside of arrow python dir --- python/pyarrow/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index a245fe67960..6a613a3b295 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -24,7 +24,7 @@ # package is not installed try: import setuptools_scm - __version__ = setuptools_scm.get_version('../') + __version__ = setuptools_scm.get_version(root='../../', relative_to=__file__) except (ImportError, LookupError): __version__ = None From b2182d3125a864a1d492bcc1c54fd34a3643fabe Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 8 Feb 2018 17:25:40 -0800 Subject: [PATCH 6/8] docker container now running and passing all tests --- dev/spark_integration/Dockerfile | 40 ++++++++++++++++++---- dev/spark_integration/spark_integration.sh | 21 ++++++------ 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/dev/spark_integration/Dockerfile b/dev/spark_integration/Dockerfile index 7e3c1e4bedd..433593c7230 100644 --- a/dev/spark_integration/Dockerfile +++ b/dev/spark_integration/Dockerfile @@ -19,9 +19,11 @@ FROM maven:3.5.2-jdk-8-slim # Basic OS utilities RUN apt-get update && apt-get install -y \ wget \ - git g++ cmake \ - libjemalloc-dev libboost-dev \ - libboost-filesystem-dev libboost-system-dev + git build-essential \ + software-properties-common + +#RUN apt-add-repository -y ppa:ubuntu-toolchain-r/test \ +# && apt-get update && apt-get install -y gcc-4.9 g++-4.9 # This will install conda in /home/ubuntu/miniconda RUN wget -O /tmp/miniconda.sh \ @@ -29,12 +31,38 @@ RUN wget -O /tmp/miniconda.sh \ bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ rm /tmp/miniconda.sh +# Python dependencies +RUN apt-get install -y \ + pkg-config + # Create Conda environment ENV PATH="/home/ubuntu/miniconda/bin:${PATH}" RUN conda create -y -q -n pyarrow-dev \ - python=3.5 numpy six setuptools cython pandas pytest \ - cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib \ - gflags brotli jemalloc lz4-c zstd -c conda-forge + # Python + python=2.7 \ + numpy \ + pandas \ + pytest \ + cython \ + ipython \ + matplotlib \ + six \ + setuptools \ + setuptools_scm \ + # C++ + boost-cpp \ + cmake \ + flatbuffers \ + rapidjson \ + thrift-cpp \ + snappy \ + zlib \ + gflags \ + brotli \ + jemalloc \ + lz4-c \ + zstd \ + -c conda-forge ADD . 
/apache-arrow WORKDIR /apache-arrow diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh index 74d200b2008..3b7e878ece5 100755 --- a/dev/spark_integration/spark_integration.sh +++ b/dev/spark_integration/spark_integration.sh @@ -19,22 +19,24 @@ # Set up environment and working directory cd /apache-arrow -export ARROW_BUILD_TYPE=release +# Activate our pyarrow-dev conda env +source activate pyarrow-dev + +export ARROW_BUILD_TYPE=Release export ARROW_HOME=$(pwd)/arrow +#export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX +export BOOST_ROOT=$CONDA_PREFIX CONDA_BASE=/home/ubuntu/miniconda export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} export PYTHONPATH=${ARROW_HOME}/python:${PYTHONPATH} export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m" -# Activate our pyarrow-dev conda env -source activate pyarrow-dev - -# Build arrow-cpp and install +# Build Arrow C++ pushd arrow/cpp rm -rf build/* mkdir -p build cd build/ -cmake -DARROW_PYTHON=on -DARROW_HDFS=on -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$ARROW_HOME .. +cmake -DARROW_PYTHON=on -DARROW_HDFS=on -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=$ARROW_HOME .. make -j4 if [[ $? -ne 0 ]]; then exit 1 @@ -60,7 +62,7 @@ echo "Using Arrow version $ARROW_VERSION" popd # Build Spark with Arrow -SPARK_REPO=https://github.com/apache/spark.git +SPARK_REPO=git://git.apache.org/spark.git SPARK_BRANCH=master # Get the Spark repo if not in image already @@ -79,8 +81,7 @@ git reset --hard HEAD # Update Spark pom with the Arrow version just installed and build Spark, need package phase for pyspark sed -i -e "s/\(.*\).*\(<\/arrow.version>\)/\1$ARROW_VERSION\2/g" ./pom.xml echo "Building Spark with Arrow $ARROW_VERSION" -#build/mvn -DskipTests clean package -build/mvn -DskipTests package +build/mvn -DskipTests clean package # Run Arrow related Scala tests only, NOTE: -Dtest=_NonExist_ is to enable surefire test discovery without running any tests so that Scalatest can run SPARK_SCALA_TESTS="org.apache.spark.sql.execution.arrow,org.apache.spark.sql.execution.vectorized.ColumnarBatchSuite,org.apache.spark.sql.execution.vectorized.ArrowColumnVectorSuite" @@ -92,7 +93,7 @@ if [[ $? -ne 0 ]]; then fi # Run pyarrow related Python tests only -SPARK_PYTHON_TESTS="ArrowTests PandasUDFTests ScalarPandasUDF GroupbyApplyPandasUDFTests GroupbyAggPandasUDFTests" +SPARK_PYTHON_TESTS="ArrowTests PandasUDFTests ScalarPandasUDFTests GroupedMapPandasUDFTests GroupedAggPandasUDFTests" echo "Testing PySpark: $SPARK_PYTHON_TESTS" SPARK_TESTING=1 bin/pyspark pyspark.sql.tests $SPARK_PYTHON_TESTS if [[ $? -ne 0 ]]; then From f962e387cd2a643e33d3a7b5c9aa9b65fbb4e619 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 14 Feb 2018 10:32:00 -0800 Subject: [PATCH 7/8] Revert "added fix for using setuptools_scm to get version outside of arrow python dir" This reverts commit 3305a3521c67ca614c60c83f524a66bb8077d8a4. 
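A hedged note on what is being reverted: setuptools_scm.get_version('../') resolves its
root against the current working directory, so it behaves as expected only when invoked
from the arrow python/ directory, e.g. (illustrative, assuming an arrow git checkout
with setuptools_scm installed):

    cd python && python -c "import setuptools_scm; print(setuptools_scm.get_version('../'))"

The reverted form, get_version(root='../../', relative_to=__file__), anchored the root
to pyarrow/__init__.py itself so the version could also be resolved from other working
directories, which is what the earlier commit in this series was after.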
--- python/pyarrow/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 6a613a3b295..a245fe67960 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -24,7 +24,7 @@ # package is not installed try: import setuptools_scm - __version__ = setuptools_scm.get_version(root='../../', relative_to=__file__) + __version__ = setuptools_scm.get_version('../') except (ImportError, LookupError): __version__ = None From 3f9f483d45f0c1b5a9052eb2950b48e8fb328318 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 14 Feb 2018 10:40:34 -0800 Subject: [PATCH 8/8] now building with ARROW_BUILD_TOOLCHAIN set to conda env --- dev/spark_integration/Dockerfile | 3 -- dev/spark_integration/spark_integration.sh | 33 ++++++---------------- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/dev/spark_integration/Dockerfile b/dev/spark_integration/Dockerfile index 433593c7230..d1b3cf89f0b 100644 --- a/dev/spark_integration/Dockerfile +++ b/dev/spark_integration/Dockerfile @@ -22,9 +22,6 @@ RUN apt-get update && apt-get install -y \ git build-essential \ software-properties-common -#RUN apt-add-repository -y ppa:ubuntu-toolchain-r/test \ -# && apt-get update && apt-get install -y gcc-4.9 g++-4.9 - # This will install conda in /home/ubuntu/miniconda RUN wget -O /tmp/miniconda.sh \ https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh index 3b7e878ece5..8ca4dc3ac97 100755 --- a/dev/spark_integration/spark_integration.sh +++ b/dev/spark_integration/spark_integration.sh @@ -16,19 +16,19 @@ # limitations under the License. # +# Exit on any error +set -e + # Set up environment and working directory cd /apache-arrow # Activate our pyarrow-dev conda env source activate pyarrow-dev -export ARROW_BUILD_TYPE=Release export ARROW_HOME=$(pwd)/arrow -#export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX -export BOOST_ROOT=$CONDA_PREFIX -CONDA_BASE=/home/ubuntu/miniconda -export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} -export PYTHONPATH=${ARROW_HOME}/python:${PYTHONPATH} +export ARROW_BUILD_TYPE=release +export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX +export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m" # Build Arrow C++ @@ -36,21 +36,16 @@ pushd arrow/cpp rm -rf build/* mkdir -p build cd build/ -cmake -DARROW_PYTHON=on -DARROW_HDFS=on -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=$ARROW_HOME .. +cmake -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -DARROW_PYTHON=on -DARROW_HDFS=on -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE -DCMAKE_INSTALL_PREFIX=$ARROW_HOME .. make -j4 -if [[ $? -ne 0 ]]; then - exit 1 -fi make install popd # Build pyarrow and install inplace +export PYARROW_CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" pushd arrow/python python setup.py clean -python setup.py build_ext --build-type=release --inplace -if [[ $? -ne 0 ]]; then - exit 1 -fi +python setup.py build_ext --build-type=$ARROW_BUILD_TYPE install popd # Install Arrow to local maven repo and get the version @@ -88,20 +83,10 @@ SPARK_SCALA_TESTS="org.apache.spark.sql.execution.arrow,org.apache.spark.sql.exe echo "Testing Spark: $SPARK_SCALA_TESTS" # TODO: should be able to only build spark-sql tests with adding "-pl sql/core" but not currently working build/mvn -Dtest=none -DwildcardSuites="$SPARK_SCALA_TESTS" test -if [[ $? 
-ne 0 ]]; then - exit 1 -fi # Run pyarrow related Python tests only SPARK_PYTHON_TESTS="ArrowTests PandasUDFTests ScalarPandasUDFTests GroupedMapPandasUDFTests GroupedAggPandasUDFTests" echo "Testing PySpark: $SPARK_PYTHON_TESTS" SPARK_TESTING=1 bin/pyspark pyspark.sql.tests $SPARK_PYTHON_TESTS -if [[ $? -ne 0 ]]; then - exit 1 -fi popd -# Clean up -echo "Cleaning up.." -source deactivate -
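Taken together, the series leaves dev/spark_integration as a self-contained integration
check: the container builds Arrow C++, pyarrow and Arrow Java from the /apache-arrow tree,
patches arrow.version in Spark's pom.xml to the freshly installed version, and then runs
the Arrow-related ScalaTest suites plus the pyspark Arrow/pandas UDF test classes. A
minimal usage sketch based on the comments kept at the bottom of the Dockerfile, assuming
it is run from the directory that contains the arrow checkout (the ~/.m2 mount only
caches Maven artifacts between runs):

    docker build -f arrow/dev/spark_integration/Dockerfile -t spark-arrow .
    docker run -v $HOME/.m2:/root/.m2 spark-arrow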