5 changes: 5 additions & 0 deletions dev/docker-compose.yml
@@ -33,3 +33,8 @@ services:
      context: dask_integration
    volumes:
      - ../..:/apache-arrow
  spark_integration:
    build:
      context: spark_integration
    volumes:
      - ../..:/apache-arrow
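A minimal usage sketch for the new service (assuming it is run from the dev/ directory of an Arrow checkout, in the same way as the existing dask_integration service; these commands are not part of the diff):

$ docker-compose build spark_integration
$ docker-compose run spark_integration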
70 changes: 70 additions & 0 deletions dev/spark_integration/Dockerfile
@@ -0,0 +1,70 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
FROM maven:3.5.2-jdk-8-slim

# Basic OS utilities
RUN apt-get update && apt-get install -y \
    wget \
    git build-essential \
    software-properties-common

# This will install conda in /home/ubuntu/miniconda
RUN wget -O /tmp/miniconda.sh \
    https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \
    rm /tmp/miniconda.sh

# Python dependencies
RUN apt-get install -y \
    pkg-config

# Create Conda environment
ENV PATH="/home/ubuntu/miniconda/bin:${PATH}"
RUN conda create -y -q -n pyarrow-dev \
    # Python
    python=2.7 \
    numpy \
    pandas \
    pytest \
    cython \
    ipython \
    matplotlib \
    six \
    setuptools \
    setuptools_scm \
    # C++
    boost-cpp \
    cmake \
    flatbuffers \
    rapidjson \
    thrift-cpp \
    snappy \
    zlib \
    gflags \
    brotli \
    jemalloc \
    lz4-c \
    zstd \
    -c conda-forge

ADD . /apache-arrow
WORKDIR /apache-arrow

CMD arrow/dev/spark_integration/spark_integration.sh

# BUILD: $ docker build -f arrow/dev/spark_integration/Dockerfile -t spark-arrow .
# RUN: $ docker run -v $HOME/.m2:/root/.m2 spark-arrow
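# Mounting $HOME/.m2 into the container reuses the host's local Maven repository, so the
# Spark and Arrow Java builds do not have to re-download their dependencies on every run.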
92 changes: 92 additions & 0 deletions dev/spark_integration/spark_integration.sh
@@ -0,0 +1,92 @@
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Exit on any error
set -e

# Set up environment and working directory
cd /apache-arrow

# Activate our pyarrow-dev conda env
source activate pyarrow-dev

export ARROW_HOME=$(pwd)/arrow
export ARROW_BUILD_TYPE=release
export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX
export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH}
export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m"
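# ARROW_BUILD_TOOLCHAIN points the Arrow C++ build at the conda environment so that
# third-party dependencies (boost, flatbuffers, thrift, etc.) are picked up from the
# conda-forge packages installed in the image.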

# Build Arrow C++
pushd arrow/cpp
rm -rf build/*
mkdir -p build
cd build/
cmake -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" \
      -DARROW_PYTHON=on \
      -DARROW_HDFS=on \
      -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \
      -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
      ..
make -j4
make install
popd

# Build pyarrow and install it into the conda environment
export PYARROW_CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0"
pushd arrow/python
python setup.py clean
python setup.py build_ext --build-type=$ARROW_BUILD_TYPE install
popd

# Install Arrow to local maven repo and get the version
pushd arrow/java
echo "Building and installing Arrow Java"
mvn -DskipTests -Drat.skip=true clean install
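# The maven-help-plugin prints the project version among its [INFO] log lines; the sed below
# keeps the first output line that starts with a digit and drops the bracketed log lines.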
ARROW_VERSION=`mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | sed -n -e '/^\[.*\]/ !{ /^[0-9]/ { p; q } }'`
echo "Using Arrow version $ARROW_VERSION"
popd

# Build Spark with Arrow
SPARK_REPO=git://git.apache.org/spark.git
SPARK_BRANCH=master

# Get the Spark repo if not in image already
if [ ! -d "$(pwd)/spark" ]; then
    export GIT_COMMITTER_NAME="Nobody"
    export GIT_COMMITTER_EMAIL="nobody@nowhere.com"
    git clone "$SPARK_REPO"
fi

pushd spark

# Make sure branch has no modifications
git checkout "$SPARK_BRANCH"
git reset --hard HEAD

# Update the Spark pom with the Arrow version just installed, then build Spark; the package phase is needed for PySpark
sed -i -e "s/\(.*<arrow.version>\).*\(<\/arrow.version>\)/\1$ARROW_VERSION\2/g" ./pom.xml
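# The sed above only swaps the value inside the existing <arrow.version> property, e.g.
#   <arrow.version>0.8.0</arrow.version>  ->  <arrow.version>$ARROW_VERSION</arrow.version>
# (0.8.0 is an illustrative placeholder, not a value taken from this change).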
echo "Building Spark with Arrow $ARROW_VERSION"
build/mvn -DskipTests clean package

# Run only the Arrow-related Scala tests. NOTE: -Dtest=none enables surefire test discovery without running any tests so that ScalaTest can run
SPARK_SCALA_TESTS="org.apache.spark.sql.execution.arrow,org.apache.spark.sql.execution.vectorized.ColumnarBatchSuite,org.apache.spark.sql.execution.vectorized.ArrowColumnVectorSuite"
echo "Testing Spark: $SPARK_SCALA_TESTS"
# TODO: it should be possible to build and test only the spark-sql module by adding "-pl sql/core", but that does not currently work
build/mvn -Dtest=none -DwildcardSuites="$SPARK_SCALA_TESTS" test

# Run pyarrow related Python tests only
SPARK_PYTHON_TESTS="ArrowTests PandasUDFTests ScalarPandasUDFTests GroupedMapPandasUDFTests GroupedAggPandasUDFTests"
echo "Testing PySpark: $SPARK_PYTHON_TESTS"
SPARK_TESTING=1 bin/pyspark pyspark.sql.tests $SPARK_PYTHON_TESTS
popd