From 09c961eed62554fc23269c729a3672e14f208938 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 9 Nov 2017 14:45:30 -0800 Subject: [PATCH 1/8] added initial docker files --- dev/docker-compose.yml | 5 ++ dev/spark_integration/Dockerfile | 48 ++++++++++++++++++++ dev/spark_integration/spark_integration.sh | 53 ++++++++++++++++++++++ 3 files changed, 106 insertions(+) create mode 100644 dev/spark_integration/Dockerfile create mode 100755 dev/spark_integration/spark_integration.sh diff --git a/dev/docker-compose.yml b/dev/docker-compose.yml index a73fd1bfbba..b1e593cf480 100644 --- a/dev/docker-compose.yml +++ b/dev/docker-compose.yml @@ -33,3 +33,8 @@ services: context: dask_integration volumes: - ../..:/apache-arrow + spark_integration: + build: + context: spark_integration + volumes: + - ../..:/apache-arrow diff --git a/dev/spark_integration/Dockerfile b/dev/spark_integration/Dockerfile new file mode 100644 index 00000000000..76d95930eb3 --- /dev/null +++ b/dev/spark_integration/Dockerfile @@ -0,0 +1,48 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +FROM ubuntu:14.04 +ADD . /apache-arrow +WORKDIR /apache-arrow +# Basic OS utilities +RUN apt-get update && apt-get install -y \ + wget \ + git \ + maven\ + software-properties-common +# Setup Java +RUN add-apt-repository ppa:openjdk-r/ppa +RUN apt-get update && apt-get install -y openjdk-8-jdk +update-java-alternatives -s java-1.8.0-openjdk-amd64 +ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64 +ENV PATH $PATH:$JAVA_HOME/bin +# This will install conda in /home/ubuntu/miniconda +RUN wget -O /tmp/miniconda.sh \ + https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ + rm /tmp/miniconda.sh +# Create Conda environment +ENV PATH="/home/ubuntu/miniconda/bin:${PATH}" +RUN conda create -y -q -n test-environment \ + python=3.6 +# Install dependencies +RUN conda install -c conda-forge \ + numpy \ + pandas \ + "pytest<=3.1.1" + +CMD ["arrow/dev/spark_integration/spark_integration.sh"] + diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh new file mode 100755 index 00000000000..ea9467a811d --- /dev/null +++ b/dev/spark_integration/spark_integration.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set up environment and working directory +cd /apache-arrow + +export ARROW_BUILD_TYPE=release +export ARROW_HOME=$(pwd)/dist +export PARQUET_HOME=$(pwd)/dist +CONDA_BASE=/home/ubuntu/miniconda +export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} + +# Allow for --user Python installation inside Docker +export HOME=$(pwd) + +# Clean up and get the Spark master branch from github +#rm -rf spark .local +#rm -rf spark +export GIT_COMMITTER_NAME="Nobody" +export GIT_COMMITTER_EMAIL="nobody@nowhere.com" +git clone https://github.com/apache/spark.git + +# Install Arrow to local maven repo (in container?) and get the version +pushd arrow/java +mvn clean install -DskipTests -Drat.skip=true -Dmaven.repo.local=/apache-arrow/.m2/repository +ARROW_VERSION=`mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | sed -n -e '/^\[.*\]/ !{ /^[0-9]/ { p; q } }'` +popd + +# Update Spark pom with the Arrow version just installed and build Spark +pushd spark +sed -i -e "s/\(.*\).*\(<\/arrow.version>\)/\1$ARROW_VERSION\2/g" ./pom.xml +build/mvn clean package -DskipTests -Dmaven.repo.local=/apache-arrow/.m2/repository + +# Run Arrow related Scala tests +build/mvn test -Dtest=ArrowConvertersSuite,ArrowUtilsSuite -Dmaven.repo.local=/apache-arrow/.m2/repository +popd + + From b04b0e57dfa26da74ced7ae63e5793356824df9e Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 14 Nov 2017 10:38:10 -0800 Subject: [PATCH 2/8] docker running Scala Arrow tests --- dev/spark_integration/Dockerfile | 36 +++++++++---------- dev/spark_integration/spark_integration.sh | 40 ++++++++++++++-------- 2 files changed, 42 insertions(+), 34 deletions(-) diff --git a/dev/spark_integration/Dockerfile b/dev/spark_integration/Dockerfile index 76d95930eb3..c2de9f1a3d4 100644 --- a/dev/spark_integration/Dockerfile +++ b/dev/spark_integration/Dockerfile @@ -14,35 +14,31 @@ # See the License for the specific language governing permissions and # limitations under the License. # -FROM ubuntu:14.04 +FROM maven:3.5.2-jdk-8-slim ADD . 
/apache-arrow WORKDIR /apache-arrow # Basic OS utilities RUN apt-get update && apt-get install -y \ wget \ git \ - maven\ software-properties-common -# Setup Java -RUN add-apt-repository ppa:openjdk-r/ppa -RUN apt-get update && apt-get install -y openjdk-8-jdk -update-java-alternatives -s java-1.8.0-openjdk-amd64 -ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64 -ENV PATH $PATH:$JAVA_HOME/bin + # This will install conda in /home/ubuntu/miniconda -RUN wget -O /tmp/miniconda.sh \ - https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ - rm /tmp/miniconda.sh +#RUN wget -O /tmp/miniconda.sh \ +# https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ +# bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ +# rm /tmp/miniconda.sh # Create Conda environment -ENV PATH="/home/ubuntu/miniconda/bin:${PATH}" -RUN conda create -y -q -n test-environment \ - python=3.6 +#ENV PATH="/home/ubuntu/miniconda/bin:${PATH}" +#RUN conda create -y -q -n test-environment \ +# python=3.6 # Install dependencies -RUN conda install -c conda-forge \ - numpy \ - pandas \ - "pytest<=3.1.1" +#RUN conda install -c conda-forge \ +# numpy \ +# pandas \ +# "pytest<=3.1.1" + +CMD arrow/dev/spark_integration/spark_integration.sh -CMD ["arrow/dev/spark_integration/spark_integration.sh"] +# BUILD WITH: $ docker build -f arrow/dev/spark_integration/Dockerfile -t spark-arrow . diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh index ea9467a811d..e385d2acf14 100755 --- a/dev/spark_integration/spark_integration.sh +++ b/dev/spark_integration/spark_integration.sh @@ -24,30 +24,42 @@ export ARROW_HOME=$(pwd)/dist export PARQUET_HOME=$(pwd)/dist CONDA_BASE=/home/ubuntu/miniconda export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} +export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m" # Allow for --user Python installation inside Docker -export HOME=$(pwd) +#export HOME=$(pwd) -# Clean up and get the Spark master branch from github -#rm -rf spark .local -#rm -rf spark -export GIT_COMMITTER_NAME="Nobody" -export GIT_COMMITTER_EMAIL="nobody@nowhere.com" -git clone https://github.com/apache/spark.git - -# Install Arrow to local maven repo (in container?) 
and get the version +# Install Arrow to local maven repo and get the version pushd arrow/java -mvn clean install -DskipTests -Drat.skip=true -Dmaven.repo.local=/apache-arrow/.m2/repository +echo "Building and installing Arrow Java" +mvn -DskipTests -Drat.skip=true clean install ARROW_VERSION=`mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | sed -n -e '/^\[.*\]/ !{ /^[0-9]/ { p; q } }'` +echo "Using Arrow version $ARROW_VERSION" popd -# Update Spark pom with the Arrow version just installed and build Spark +# Get the Spark master branch from github +export GIT_COMMITTER_NAME="Nobody" +export GIT_COMMITTER_EMAIL="nobody@nowhere.com" +rm -rf spark +git clone https://github.com/apache/spark.git + +# Update Spark pom with the Arrow version just installed and build Spark, need package phase for pyspark pushd spark sed -i -e "s/\(.*\).*\(<\/arrow.version>\)/\1$ARROW_VERSION\2/g" ./pom.xml -build/mvn clean package -DskipTests -Dmaven.repo.local=/apache-arrow/.m2/repository +echo "Building Spark with Arrow $ARROW_VERSION" +mvn -DskipTests clean package -# Run Arrow related Scala tests -build/mvn test -Dtest=ArrowConvertersSuite,ArrowUtilsSuite -Dmaven.repo.local=/apache-arrow/.m2/repository +# Run Arrow related Scala tests only, NOTE: -Dtest=_NonExist_ is to enable surefire test discovery without running any tests so that Scalatest can run +SPARK_SCALA_TESTS="org.apache.spark.sql.execution.arrow,org.apache.spark.sql.execution.vectorized.ColumnarBatchSuite,org.apache.spark.sql.execution.vectorized.ArrowColumnVectorSuite" +echo "Testing Spark $SPARK_SCALA_TESTS" +mvn -Dtest=_NonExist_ -DwildcardSuites="'$SPARK_SCALA_TESTS'" test -pl sql/core + +# Run pyarrow related Python tests only +#SPARK_TESTING=1 bin/pyspark pyspark.sql.tests ArrowTests GroupbyApplyTests VectorizedUDFTests popd +# Clean up +#rm -rf spark .local +rm -rf spark + From e38d43db3a5206c1562d0478f4c911aca7189d3b Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 14 Nov 2017 14:27:41 -0800 Subject: [PATCH 3/8] using build/mvn script for spark builds --- dev/spark_integration/Dockerfile | 4 +--- dev/spark_integration/spark_integration.sh | 6 ++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dev/spark_integration/Dockerfile b/dev/spark_integration/Dockerfile index c2de9f1a3d4..c8e7fcb0741 100644 --- a/dev/spark_integration/Dockerfile +++ b/dev/spark_integration/Dockerfile @@ -20,9 +20,7 @@ WORKDIR /apache-arrow # Basic OS utilities RUN apt-get update && apt-get install -y \ wget \ - git \ - software-properties-common - + git # This will install conda in /home/ubuntu/miniconda #RUN wget -O /tmp/miniconda.sh \ # https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh index e385d2acf14..f0b33951750 100755 --- a/dev/spark_integration/spark_integration.sh +++ b/dev/spark_integration/spark_integration.sh @@ -47,18 +47,20 @@ git clone https://github.com/apache/spark.git pushd spark sed -i -e "s/\(.*\).*\(<\/arrow.version>\)/\1$ARROW_VERSION\2/g" ./pom.xml echo "Building Spark with Arrow $ARROW_VERSION" -mvn -DskipTests clean package +build/mvn -DskipTests clean package # Run Arrow related Scala tests only, NOTE: -Dtest=_NonExist_ is to enable surefire test discovery without running any tests so that Scalatest can run 
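# A hedged aside on the mechanism described in the NOTE above: -DwildcardSuites is a
# scalatest-maven-plugin property that accepts a comma-separated list of suite classes
# or packages, while -Dtest drives surefire, so pointing -Dtest at a name that matches
# nothing skips the JUnit tests and leaves only the listed ScalaTest suites to run.
# A minimal sketch, reusing a suite name from the first revision of this script:
#
#   build/mvn -Dtest=none \
#     -DwildcardSuites=org.apache.spark.sql.execution.arrow.ArrowConvertersSuite test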
SPARK_SCALA_TESTS="org.apache.spark.sql.execution.arrow,org.apache.spark.sql.execution.vectorized.ColumnarBatchSuite,org.apache.spark.sql.execution.vectorized.ArrowColumnVectorSuite" echo "Testing Spark $SPARK_SCALA_TESTS" -mvn -Dtest=_NonExist_ -DwildcardSuites="'$SPARK_SCALA_TESTS'" test -pl sql/core +# TODO: should be able to only build spark-sql tests with adding "-pl sql/core" but not currently working +build/mvn -Dtest=none -DwildcardSuites="$SPARK_SCALA_TESTS" test # Run pyarrow related Python tests only #SPARK_TESTING=1 bin/pyspark pyspark.sql.tests ArrowTests GroupbyApplyTests VectorizedUDFTests popd # Clean up +echo "Cleaning up.." #rm -rf spark .local rm -rf spark From 95eb22a5fe52bf2e5c8d59ae8d4c72d58d4485ad Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 25 Jan 2018 17:36:30 -0800 Subject: [PATCH 4/8] running python tests now and building pyarrow, not yet passing pyspark tests --- dev/spark_integration/Dockerfile | 37 ++++++----- dev/spark_integration/spark_integration.sh | 73 +++++++++++++++++----- 2 files changed, 76 insertions(+), 34 deletions(-) diff --git a/dev/spark_integration/Dockerfile b/dev/spark_integration/Dockerfile index c8e7fcb0741..7e3c1e4bedd 100644 --- a/dev/spark_integration/Dockerfile +++ b/dev/spark_integration/Dockerfile @@ -15,28 +15,31 @@ # limitations under the License. # FROM maven:3.5.2-jdk-8-slim -ADD . /apache-arrow -WORKDIR /apache-arrow + # Basic OS utilities RUN apt-get update && apt-get install -y \ wget \ - git + git g++ cmake \ + libjemalloc-dev libboost-dev \ + libboost-filesystem-dev libboost-system-dev + # This will install conda in /home/ubuntu/miniconda -#RUN wget -O /tmp/miniconda.sh \ -# https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ -# bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ -# rm /tmp/miniconda.sh +RUN wget -O /tmp/miniconda.sh \ + https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ + rm /tmp/miniconda.sh + # Create Conda environment -#ENV PATH="/home/ubuntu/miniconda/bin:${PATH}" -#RUN conda create -y -q -n test-environment \ -# python=3.6 -# Install dependencies -#RUN conda install -c conda-forge \ -# numpy \ -# pandas \ -# "pytest<=3.1.1" +ENV PATH="/home/ubuntu/miniconda/bin:${PATH}" +RUN conda create -y -q -n pyarrow-dev \ + python=3.5 numpy six setuptools cython pandas pytest \ + cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib \ + gflags brotli jemalloc lz4-c zstd -c conda-forge -CMD arrow/dev/spark_integration/spark_integration.sh +ADD . /apache-arrow +WORKDIR /apache-arrow -# BUILD WITH: $ docker build -f arrow/dev/spark_integration/Dockerfile -t spark-arrow . +CMD arrow/dev/spark_integration/spark_integration.sh +# BUILD: $ docker build -f arrow/dev/spark_integration/Dockerfile -t spark-arrow . 
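# A hedged note on the BUILD command above: the trailing "." assumes it is run from the
# directory that contains the arrow checkout, since "ADD . /apache-arrow" copies the
# whole build context into the image and the CMD then resolves
# arrow/dev/spark_integration/spark_integration.sh inside it. The docker-compose service
# added earlier in this series should give roughly the same flow from the dev/ directory:
#
#   docker-compose build spark_integration
#   docker-compose run spark_integration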
+# RUN: $ docker run -v $HOME/.m2:/root/.m2 spark-arrow diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh index f0b33951750..74d200b2008 100755 --- a/dev/spark_integration/spark_integration.sh +++ b/dev/spark_integration/spark_integration.sh @@ -20,14 +20,36 @@ cd /apache-arrow export ARROW_BUILD_TYPE=release -export ARROW_HOME=$(pwd)/dist -export PARQUET_HOME=$(pwd)/dist +export ARROW_HOME=$(pwd)/arrow CONDA_BASE=/home/ubuntu/miniconda -export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} +export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} +export PYTHONPATH=${ARROW_HOME}/python:${PYTHONPATH} export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m" -# Allow for --user Python installation inside Docker -#export HOME=$(pwd) +# Activate our pyarrow-dev conda env +source activate pyarrow-dev + +# Build arrow-cpp and install +pushd arrow/cpp +rm -rf build/* +mkdir -p build +cd build/ +cmake -DARROW_PYTHON=on -DARROW_HDFS=on -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$ARROW_HOME .. +make -j4 +if [[ $? -ne 0 ]]; then + exit 1 +fi +make install +popd + +# Build pyarrow and install inplace +pushd arrow/python +python setup.py clean +python setup.py build_ext --build-type=release --inplace +if [[ $? -ne 0 ]]; then + exit 1 +fi +popd # Install Arrow to local maven repo and get the version pushd arrow/java @@ -37,31 +59,48 @@ ARROW_VERSION=`mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -De echo "Using Arrow version $ARROW_VERSION" popd -# Get the Spark master branch from github -export GIT_COMMITTER_NAME="Nobody" -export GIT_COMMITTER_EMAIL="nobody@nowhere.com" -rm -rf spark -git clone https://github.com/apache/spark.git +# Build Spark with Arrow +SPARK_REPO=https://github.com/apache/spark.git +SPARK_BRANCH=master + +# Get the Spark repo if not in image already +if [ ! -d "$(pwd)/spark" ]; then + export GIT_COMMITTER_NAME="Nobody" + export GIT_COMMITTER_EMAIL="nobody@nowhere.com" + git clone "$SPARK_REPO" +fi -# Update Spark pom with the Arrow version just installed and build Spark, need package phase for pyspark pushd spark + +# Make sure branch has no modifications +git checkout "$SPARK_BRANCH" +git reset --hard HEAD + +# Update Spark pom with the Arrow version just installed and build Spark, need package phase for pyspark sed -i -e "s/\(.*\).*\(<\/arrow.version>\)/\1$ARROW_VERSION\2/g" ./pom.xml echo "Building Spark with Arrow $ARROW_VERSION" -build/mvn -DskipTests clean package +#build/mvn -DskipTests clean package +build/mvn -DskipTests package # Run Arrow related Scala tests only, NOTE: -Dtest=_NonExist_ is to enable surefire test discovery without running any tests so that Scalatest can run SPARK_SCALA_TESTS="org.apache.spark.sql.execution.arrow,org.apache.spark.sql.execution.vectorized.ColumnarBatchSuite,org.apache.spark.sql.execution.vectorized.ArrowColumnVectorSuite" -echo "Testing Spark $SPARK_SCALA_TESTS" +echo "Testing Spark: $SPARK_SCALA_TESTS" # TODO: should be able to only build spark-sql tests with adding "-pl sql/core" but not currently working build/mvn -Dtest=none -DwildcardSuites="$SPARK_SCALA_TESTS" test +if [[ $? 
-ne 0 ]]; then + exit 1 +fi # Run pyarrow related Python tests only -#SPARK_TESTING=1 bin/pyspark pyspark.sql.tests ArrowTests GroupbyApplyTests VectorizedUDFTests +SPARK_PYTHON_TESTS="ArrowTests PandasUDFTests ScalarPandasUDF GroupbyApplyPandasUDFTests GroupbyAggPandasUDFTests" +echo "Testing PySpark: $SPARK_PYTHON_TESTS" +SPARK_TESTING=1 bin/pyspark pyspark.sql.tests $SPARK_PYTHON_TESTS +if [[ $? -ne 0 ]]; then + exit 1 +fi popd # Clean up echo "Cleaning up.." -#rm -rf spark .local -rm -rf spark - +source deactivate From 3305a3521c67ca614c60c83f524a66bb8077d8a4 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 8 Feb 2018 17:21:36 -0800 Subject: [PATCH 5/8] added fix for using setuptools_scm to get version outside of arrow python dir --- python/pyarrow/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index a245fe67960..6a613a3b295 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -24,7 +24,7 @@ # package is not installed try: import setuptools_scm - __version__ = setuptools_scm.get_version('../') + __version__ = setuptools_scm.get_version(root='../../', relative_to=__file__) except (ImportError, LookupError): __version__ = None From b2182d3125a864a1d492bcc1c54fd34a3643fabe Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 8 Feb 2018 17:25:40 -0800 Subject: [PATCH 6/8] docker container now running and passing all tests --- dev/spark_integration/Dockerfile | 40 ++++++++++++++++++---- dev/spark_integration/spark_integration.sh | 21 ++++++------ 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/dev/spark_integration/Dockerfile b/dev/spark_integration/Dockerfile index 7e3c1e4bedd..433593c7230 100644 --- a/dev/spark_integration/Dockerfile +++ b/dev/spark_integration/Dockerfile @@ -19,9 +19,11 @@ FROM maven:3.5.2-jdk-8-slim # Basic OS utilities RUN apt-get update && apt-get install -y \ wget \ - git g++ cmake \ - libjemalloc-dev libboost-dev \ - libboost-filesystem-dev libboost-system-dev + git build-essential \ + software-properties-common + +#RUN apt-add-repository -y ppa:ubuntu-toolchain-r/test \ +# && apt-get update && apt-get install -y gcc-4.9 g++-4.9 # This will install conda in /home/ubuntu/miniconda RUN wget -O /tmp/miniconda.sh \ @@ -29,12 +31,38 @@ RUN wget -O /tmp/miniconda.sh \ bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ rm /tmp/miniconda.sh +# Python dependencies +RUN apt-get install -y \ + pkg-config + # Create Conda environment ENV PATH="/home/ubuntu/miniconda/bin:${PATH}" RUN conda create -y -q -n pyarrow-dev \ - python=3.5 numpy six setuptools cython pandas pytest \ - cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib \ - gflags brotli jemalloc lz4-c zstd -c conda-forge + # Python + python=2.7 \ + numpy \ + pandas \ + pytest \ + cython \ + ipython \ + matplotlib \ + six \ + setuptools \ + setuptools_scm \ + # C++ + boost-cpp \ + cmake \ + flatbuffers \ + rapidjson \ + thrift-cpp \ + snappy \ + zlib \ + gflags \ + brotli \ + jemalloc \ + lz4-c \ + zstd \ + -c conda-forge ADD . 
/apache-arrow WORKDIR /apache-arrow diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh index 74d200b2008..3b7e878ece5 100755 --- a/dev/spark_integration/spark_integration.sh +++ b/dev/spark_integration/spark_integration.sh @@ -19,22 +19,24 @@ # Set up environment and working directory cd /apache-arrow -export ARROW_BUILD_TYPE=release +# Activate our pyarrow-dev conda env +source activate pyarrow-dev + +export ARROW_BUILD_TYPE=Release export ARROW_HOME=$(pwd)/arrow +#export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX +export BOOST_ROOT=$CONDA_PREFIX CONDA_BASE=/home/ubuntu/miniconda export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} export PYTHONPATH=${ARROW_HOME}/python:${PYTHONPATH} export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m" -# Activate our pyarrow-dev conda env -source activate pyarrow-dev - -# Build arrow-cpp and install +# Build Arrow C++ pushd arrow/cpp rm -rf build/* mkdir -p build cd build/ -cmake -DARROW_PYTHON=on -DARROW_HDFS=on -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$ARROW_HOME .. +cmake -DARROW_PYTHON=on -DARROW_HDFS=on -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=$ARROW_HOME .. make -j4 if [[ $? -ne 0 ]]; then exit 1 @@ -60,7 +62,7 @@ echo "Using Arrow version $ARROW_VERSION" popd # Build Spark with Arrow -SPARK_REPO=https://github.com/apache/spark.git +SPARK_REPO=git://git.apache.org/spark.git SPARK_BRANCH=master # Get the Spark repo if not in image already @@ -79,8 +81,7 @@ git reset --hard HEAD # Update Spark pom with the Arrow version just installed and build Spark, need package phase for pyspark sed -i -e "s/\(.*\).*\(<\/arrow.version>\)/\1$ARROW_VERSION\2/g" ./pom.xml echo "Building Spark with Arrow $ARROW_VERSION" -#build/mvn -DskipTests clean package -build/mvn -DskipTests package +build/mvn -DskipTests clean package # Run Arrow related Scala tests only, NOTE: -Dtest=_NonExist_ is to enable surefire test discovery without running any tests so that Scalatest can run SPARK_SCALA_TESTS="org.apache.spark.sql.execution.arrow,org.apache.spark.sql.execution.vectorized.ColumnarBatchSuite,org.apache.spark.sql.execution.vectorized.ArrowColumnVectorSuite" @@ -92,7 +93,7 @@ if [[ $? -ne 0 ]]; then fi # Run pyarrow related Python tests only -SPARK_PYTHON_TESTS="ArrowTests PandasUDFTests ScalarPandasUDF GroupbyApplyPandasUDFTests GroupbyAggPandasUDFTests" +SPARK_PYTHON_TESTS="ArrowTests PandasUDFTests ScalarPandasUDFTests GroupedMapPandasUDFTests GroupedAggPandasUDFTests" echo "Testing PySpark: $SPARK_PYTHON_TESTS" SPARK_TESTING=1 bin/pyspark pyspark.sql.tests $SPARK_PYTHON_TESTS if [[ $? -ne 0 ]]; then From f962e387cd2a643e33d3a7b5c9aa9b65fbb4e619 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 14 Feb 2018 10:32:00 -0800 Subject: [PATCH 7/8] Revert "added fix for using setuptools_scm to get version outside of arrow python dir" This reverts commit 3305a3521c67ca614c60c83f524a66bb8077d8a4. 
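A hedged note on what is being reverted: setuptools_scm.get_version('../') resolves its
root against the current working directory, so it behaves as expected only when invoked
from the arrow python/ directory, e.g. (illustrative, assuming an arrow git checkout
with setuptools_scm installed):

    cd python && python -c "import setuptools_scm; print(setuptools_scm.get_version('../'))"

The reverted form, get_version(root='../../', relative_to=__file__), anchored the root
to pyarrow/__init__.py itself so the version could also be resolved from other working
directories, which is what the earlier commit in this series was after.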
--- python/pyarrow/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 6a613a3b295..a245fe67960 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -24,7 +24,7 @@ # package is not installed try: import setuptools_scm - __version__ = setuptools_scm.get_version(root='../../', relative_to=__file__) + __version__ = setuptools_scm.get_version('../') except (ImportError, LookupError): __version__ = None From 3f9f483d45f0c1b5a9052eb2950b48e8fb328318 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 14 Feb 2018 10:40:34 -0800 Subject: [PATCH 8/8] now building with ARROW_BUILD_TOOLCHAIN set to conda env --- dev/spark_integration/Dockerfile | 3 -- dev/spark_integration/spark_integration.sh | 33 ++++++---------------- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/dev/spark_integration/Dockerfile b/dev/spark_integration/Dockerfile index 433593c7230..d1b3cf89f0b 100644 --- a/dev/spark_integration/Dockerfile +++ b/dev/spark_integration/Dockerfile @@ -22,9 +22,6 @@ RUN apt-get update && apt-get install -y \ git build-essential \ software-properties-common -#RUN apt-add-repository -y ppa:ubuntu-toolchain-r/test \ -# && apt-get update && apt-get install -y gcc-4.9 g++-4.9 - # This will install conda in /home/ubuntu/miniconda RUN wget -O /tmp/miniconda.sh \ https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh index 3b7e878ece5..8ca4dc3ac97 100755 --- a/dev/spark_integration/spark_integration.sh +++ b/dev/spark_integration/spark_integration.sh @@ -16,19 +16,19 @@ # limitations under the License. # +# Exit on any error +set -e + # Set up environment and working directory cd /apache-arrow # Activate our pyarrow-dev conda env source activate pyarrow-dev -export ARROW_BUILD_TYPE=Release export ARROW_HOME=$(pwd)/arrow -#export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX -export BOOST_ROOT=$CONDA_PREFIX -CONDA_BASE=/home/ubuntu/miniconda -export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} -export PYTHONPATH=${ARROW_HOME}/python:${PYTHONPATH} +export ARROW_BUILD_TYPE=release +export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX +export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m" # Build Arrow C++ @@ -36,21 +36,16 @@ pushd arrow/cpp rm -rf build/* mkdir -p build cd build/ -cmake -DARROW_PYTHON=on -DARROW_HDFS=on -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=$ARROW_HOME .. +cmake -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -DARROW_PYTHON=on -DARROW_HDFS=on -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE -DCMAKE_INSTALL_PREFIX=$ARROW_HOME .. make -j4 -if [[ $? -ne 0 ]]; then - exit 1 -fi make install popd # Build pyarrow and install inplace +export PYARROW_CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" pushd arrow/python python setup.py clean -python setup.py build_ext --build-type=release --inplace -if [[ $? -ne 0 ]]; then - exit 1 -fi +python setup.py build_ext --build-type=$ARROW_BUILD_TYPE install popd # Install Arrow to local maven repo and get the version @@ -88,20 +83,10 @@ SPARK_SCALA_TESTS="org.apache.spark.sql.execution.arrow,org.apache.spark.sql.exe echo "Testing Spark: $SPARK_SCALA_TESTS" # TODO: should be able to only build spark-sql tests with adding "-pl sql/core" but not currently working build/mvn -Dtest=none -DwildcardSuites="$SPARK_SCALA_TESTS" test -if [[ $? 
-ne 0 ]]; then - exit 1 -fi # Run pyarrow related Python tests only SPARK_PYTHON_TESTS="ArrowTests PandasUDFTests ScalarPandasUDFTests GroupedMapPandasUDFTests GroupedAggPandasUDFTests" echo "Testing PySpark: $SPARK_PYTHON_TESTS" SPARK_TESTING=1 bin/pyspark pyspark.sql.tests $SPARK_PYTHON_TESTS -if [[ $? -ne 0 ]]; then - exit 1 -fi popd -# Clean up -echo "Cleaning up.." -source deactivate -
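Taken together, the series leaves dev/spark_integration as a self-contained integration
check: the container builds Arrow C++, pyarrow and Arrow Java from the /apache-arrow tree,
patches arrow.version in Spark's pom.xml to the freshly installed version, and then runs
the Arrow-related ScalaTest suites plus the pyspark Arrow/pandas UDF test classes. A
minimal usage sketch based on the comments kept at the bottom of the Dockerfile, assuming
it is run from the directory that contains the arrow checkout (the ~/.m2 mount only
caches Maven artifacts between runs):

    docker build -f arrow/dev/spark_integration/Dockerfile -t spark-arrow .
    docker run -v $HOME/.m2:/root/.m2 spark-arrow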