From 6126c4f4d97db16b0ed6a95c60fae1fff44e2afe Mon Sep 17 00:00:00 2001 From: Brennon York Date: Fri, 24 Apr 2015 14:27:54 -0700 Subject: [PATCH 01/52] refactored run-tests into python --- dev/run-tests | 623 +++++++++++++++++++++++++++++++------------------- 1 file changed, 389 insertions(+), 234 deletions(-) diff --git a/dev/run-tests b/dev/run-tests index 861d1671182c2..22a33d317a344 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env python # # Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,239 +17,394 @@ # limitations under the License. # -# Go to the Spark project root directory -FWDIR="$(cd "`dirname $0`"/..; pwd)" -cd "$FWDIR" +import os +import re +import shutil +import subprocess as sp + +# Set the Spark project root directory +spark_proj_root = os.path.abspath("..") +# Set the user 'HOME' directory +user_home_dir = os.environ.get("HOME") +# Set the sbt maven profile arguments environment variable name +sbt_maven_profile_args_env = "SBT_MAVEN_PROFILES_ARGS" +# Set the amplab jenkins build tool environment variable name +amplab_jenkins_build_tool_env = "AMPLAB_JENKINS_BUILD_TOOL" +# Set the amplab jenkins build tool environment value +amplab_jenkins_build_tool = os.environ.get(amplab_jenkins_build_tool_env) +# Set whether we're on an Amplab Jenkins box by checking for a specific +# environment variable +amplab_jenkins = os.environ.get("AMPLAB_JENKINS") +# Set the pattern for sbt output e.g. "[info] Resolving ..." +resolving_re = "^.*[info].*Resolving" +# Set the pattern for sbt output e.g. "[warn] Merging ..." +merging_re = "^.*[warn].*Merging" +# Set the pattern for sbt output e.g. "[info] Including ..." +including_re = "^.*[info].*Including" +# Compile the various regex patterns into a filter +sbt_output_filter = re.compile(resolving_re + "|" + + merging_re + "|" + + including_re) + +def get_error_codes(err_code_file): + """Function to retrieve all block numbers from the `run-tests-codes.sh` + file to maintain backwards compatibility with the `run-tests-jenkins` + script""" + + with open(err_code_file, 'r') as f: + err_codes = [e.split()[1].strip().split('=') + for e in f if e.startswith("readonly")] + return dict(err_codes) + +def rm_r(path): + """Given an arbitrary path properly remove it with the correct python + construct if it exists + - from: http://stackoverflow.com/a/9559881""" + + if os.path.isdir(path): + shutil.rmtree(path) + elif os.path.exists(path): + os.remove(path) + +def lineno(): + """Returns the current line number in our program + - from: http://stackoverflow.com/a/3056059""" + + return inspect.currentframe().f_back.f_lineno + +def set_sbt_maven_profile_args(): + """Properly sets the SBT environment variable arguments with additional + checks to determine if this is running on an Amplab Jenkins machine""" + + # base environment values for sbt_maven_profile_args_env which will be appended on + sbt_maven_profile_args_base = ["-Pkinesis-asl"] + + sbt_maven_profile_arg_dict = { + "hadoop1.0" : ["-Dhadoop.version=1.0.4"], + "hadoop2.0" : ["-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], + "hadoop2.2" : ["-Pyarn", "-Phadoop-2.2", "-Dhadoop.version=2.2.0"], + "hadoop2.3" : ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], + } + + # set the SBT maven build profile argument environment variable and ensure + # we build against the right version of Hadoop + if os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE"): + os.environ[sbt_maven_profile_args_env] = \ + " ".join(sbt_maven_profile_arg_dict.get(ajbp, []) 
+ + sbt_maven_profile_args_base) + else: + os.environ[sbt_maven_profile_args_env] = \ + " ".join(sbt_maven_profile_arg_dict.get("hadoop2.3", []) + + sbt_maven_profile_args_base) + +def is_exe(path): + """Check if a given path is an executable file + - from: http://stackoverflow.com/a/377028""" + + return os.path.isfile(path) and os.access(path, os.X_OK) + +def which(program): + """Find and return the given program by its absolute path or 'None' + - from: http://stackoverflow.com/a/377028""" + + fpath, fname = os.path.split(program) + + if fpath: + if is_exe(program): + return program + else: + for path in os.environ.get("PATH").split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + return None + +def determine_java_executable(): + """Will return the *best* path possible for a 'java' executable or `None`""" + + java_home = os.environ.get("JAVA_HOME") + + # check if there is an executable at $JAVA_HOME/bin/java + java_exe = which(os.path.join(java_home, "bin/java")) + # if the java_exe wasn't set, check for a `java` version on the $PATH + return java_exe if java_exe else which("java") + +def determine_java_version(java_exe): + """Given a valid java executable will return its version in tuple format as: + [, , , ]""" + + raw_output = sp.check_output([java_exe, "-version"], stderr=sp.STDOUT) + raw_version_str = raw_output.split('\n')[0] # eg 'java version "1.8.0_25"' + version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25' + version, update = version_str.split('_') # eg ['1.8.0', '25'] + + # map over the values and convert them to integers + return map(lambda x: int(x), version.split('.') + [update]) + +def multi_starts_with(orig_str, *prefixes): + """Takes a string and an abritrary number of prefixes then checks the + original string for any of the possible prefixes passed in""" + + for s in prefixes: + if orig_str.startswith(s): + return True + return False + +# This function current acts to determine if SQL tests need to be run in +# addition to the core test suite *or* if _only_ SQL tests need to be run +# as the git logs show that to be the only thing touched. In the future +# this function will act more generically to help further segregate the +# test suite runner (hence the function name). +# @return a set of unique test names +def determine_test_suite(): + test_suite = list() + + if amplab_jenkins: + sp.Popen(['git', 'fetch', 'origin', 'master:master']).wait() + + raw_output = sp.check_output(['git', 'diff', '--name-only', 'master']) + # remove any empty strings + changed_files = [f for f in raw_output.split('\n') if f] + + # find any sql files + sql_files = [f for f in changed_files + if multi_starts_with(f, + "sql/", + "bin/spark-sql", + "sbin/start-thriftserver.sh")] + + non_sql_files = set(changed_files).difference(set(sql_files)) + + if non_sql_files: + test_suite.append("CORE") + if sql_files: + print "[info] Detected changes in SQL. Will run Hive test suite." + test_suite.append("SQL") + if not non_sql_files: + print "[info] Detected no changes except in SQL. Will only run SQL tests." 
+ return set(test_suite) + else: + # we aren't in the Amplab environment so merely run all tests + test_suite.append("CORE") + test_suite.append("SQL") + return set(test_suite) + +def set_title_and_block(title, err_block): + os.environ["CURRENT_BLOCK"] = error_codes[err_block] + line_str = "".join(['='] * 72) + + print + print line_str + print title + print line_str + +def run_cmd(cmd): + """Given a command as a list of arguments will attempt to execute the + command and, on failure, print an error message""" + + if not isinstance(cmd, list): + cmd = cmd.split() + try: + sp.check_output(cmd) + except sp.CalledProcessError as e: + print "[error] running", e.cmd, "; received return code", e.returncode + exit(e.returncode) + +def run_apache_rat_checks(): + set_title_and_block("Running Apache RAT checks", "BLOCK_RAT") + run_cmd(["./dev/check-license"]) + +def run_scala_style_checks(): + set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE") + run_cmd(["./dev/lint-scala"]) + +def run_python_style_checks(): + set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE") + run_cmd(["./dev/lint-python"]) + +def exec_maven(mvn_args = []): + """Will call Maven in the current directory with the list of mvn_args passed + in and returns the subprocess for any further processing""" + + return sp.Popen(["./build/mvn"] + mvn_args) + +def exec_sbt(sbt_args = []): + """Will call SBT in the current directory with the list of mvn_args passed + in and returns the subprocess for any further processing""" + + # NOTE: echo "q" is needed because sbt on encountering a build file + # with failure (either resolution or compilation) prompts the user for + # input either q, r, etc to quit or retry. This echo is there to make it + # not block. + echo_proc = sp.Popen(["echo", "\"q\n\""]) + sbt_proc = sp.Popen(["./build/sbt"] + sbt_args, + stdin=echo_proc.stdout, + stdout=sp.PIPE) + echo_proc.wait() + for line in iter(sbt_proc.stdout.readline, ''): + if not sbt_output_filter.match(line): + print line, + return sbt_proc + +def build_apache_spark(): + """Will first build Spark with Hive v0.12.0 to ensure the build is + successful and, after, will build Spark again against Hive v0.13.1 as the + tests are based off that""" + + set_title_and_block("Building Spark", "BLOCK_BUILD") + + sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() + hive_profile_args = sbt_maven_profile_args + ["-Phive", + "-Phive-thriftserver"] + hive_12_profile_args = hive_profile_args + ["-Phive-0.12.0"] + # set the default maven args + base_mvn_args = ["clean", "package", "-DskipTests"] + # set the necessary sbt goals + sbt_hive_12_goals = ["clean", "hive/compile", "hive-thriftserver/compile"] + sbt_hive_goals = ["package", + "assembly/assembly", + "streaming-kafka-assembly/assembly"] + + # First build with Hive 0.12.0 to ensure patches do not break the Hive + # 0.12.0 build + print "[info] Compile with Hive 0.12.0" + rm_r("lib_managed") + print "[info] Building Spark with these arguments:", + print " ".join(hive_12_profile_args) + + if amplab_jenkins_build_tool == "maven": + exec_maven(hive_12_profile_args + base_mvn_args).wait() + else: + exec_sbt(hive_12_profile_args + sbt_hive_12_goals).wait() + + # Then build with default Hive version (0.13.1) because tests are based on + # this version + print "[info] Compile with Hive 0.13.1" + rm_r("lib_managed") + print "[info] Building Spark with these arguments:", + print " ".join(hive_profile_args) + + if amplab_jenkins_build_tool == "maven": + 
exec_maven(hive_profile_args + base_mvn_args).wait() + else: + exec_sbt(hive_profile_args + sbt_hive_goals).wait() + +def detect_binary_inop_with_mima(): + set_title_and_block("Detecting binary incompatibilities with MiMa", + "BLOCK_MIMA") + run_cmd(["./dev/mima"]) + +def run_scala_tests(test_suite = []): + """Function to properly execute all tests pass in, as a list, from the + `determine_test_suite` function""" + set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") + + # ensure the test_suite is a set + if not isinstance(test_suite, set): + test_suite = set(test_suite) + + # if the Spark SQL tests are enabled, run the tests with the Hive profiles + # enabled. + if "SQL" in test_suite: + sbt_maven_profile_args = \ + os.environ.get(sbt_maven_profile_args_env).split() + os.environ[sbt_maven_profile_args_env] = \ + " ".join(sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"]) + + # if we only have changes in SQL build a custom test string + if "SQL" in test_suite and "CORE" not in test_suite: + sbt_maven_test_args = ["catalyst/test", + "sql/test", + "hive/test", + "hive-thriftserver/test", + "mllib/test"] + else: + sbt_maven_test_args = ["test"] + + # get the latest sbt maven profile arguments + sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() + + print "[info] Running Spark tests with these arguments:", + print " ".join(sbt_maven_profile_args), + print " ".join(sbt_maven_test_args) + + if amplab_jenkins_build_tool == "maven": + exec_maven(["test"] + sbt_maven_profile_args + ["--fail-at-end"]).wait() + else: + exec_sbt(sbt_maven_profile_args + sbt_maven_test_args).wait() + +def run_python_tests(test_suite = []): + set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") + + # Add path for Python3 in Jenkins if we're calling from a Jenkins machine + if amplab_jenkins: + os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" + + run_cmd(["./python/run-tests"]) + +def run_sparkr_tests(test_suite = []): + set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") + + if which("R"): + run_cmd(["./R/install-dev.sh"]) + run_cmd(["./R/run-tests.sh"]) + else: + print "Ignoring SparkR tests as R was not found in PATH" + +# Ensure the user home directory (HOME) is valid and is an absolute directory +if not user_home_dir or not os.path.isabs(user_home_dir): + print "[error] Cannot determine your home directory as an absolute path;", + print "ensure the $HOME environment variable is set properly." + exit(1) + +# Change directory to the Spark project root +os.chdir(spark_proj_root) # Clean up work directory and caches -rm -rf ./work -rm -rf ~/.ivy2/local/org.apache.spark -rm -rf ~/.ivy2/cache/org.apache.spark - -source "$FWDIR/dev/run-tests-codes.sh" - -CURRENT_BLOCK=$BLOCK_GENERAL - -function handle_error () { - echo "[error] Got a return code of $? on line $1 of the run-tests script." - exit $CURRENT_BLOCK -} - - -# Build against the right version of Hadoop. 
-{ - if [ -n "$AMPLAB_JENKINS_BUILD_PROFILE" ]; then - if [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop1.0" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=1.0.4" - elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.0" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=2.0.0-mr1-cdh4.1.1" - elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.2" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.2 -Dhadoop.version=2.2.0" - elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.3" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0" - fi - fi - - if [ -z "$SBT_MAVEN_PROFILES_ARGS" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0" - fi -} - -export SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Pkinesis-asl" - -# Determine Java path and version. -{ - if test -x "$JAVA_HOME/bin/java"; then - declare java_cmd="$JAVA_HOME/bin/java" - else - declare java_cmd=java - fi - - # We can't use sed -r -e due to OS X / BSD compatibility; hence, all the parentheses. - JAVA_VERSION=$( - $java_cmd -version 2>&1 \ - | grep -e "^java version" --max-count=1 \ - | sed "s/java version \"\(.*\)\.\(.*\)\.\(.*\)\"/\1\2/" - ) - - if [ "$JAVA_VERSION" -lt 18 ]; then - echo "[warn] Java 8 tests will not run because JDK version is < 1.8." - fi -} - -# Only run Hive tests if there are SQL changes. -# Partial solution for SPARK-1455. -if [ -n "$AMPLAB_JENKINS" ]; then - git fetch origin master:master - - sql_diffs=$( - git diff --name-only master \ - | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh" - ) - - non_sql_diffs=$( - git diff --name-only master \ - | grep -v -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh" - ) - - if [ -n "$sql_diffs" ]; then - echo "[info] Detected changes in SQL. Will run Hive test suite." - _RUN_SQL_TESTS=true - - if [ -z "$non_sql_diffs" ]; then - echo "[info] Detected no changes except in SQL. Will only run SQL tests." 
- _SQL_TESTS_ONLY=true - fi - fi -fi - -set -o pipefail -trap 'handle_error $LINENO' ERR - -echo "" -echo "=========================================================================" -echo "Running Apache RAT checks" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_RAT - -./dev/check-license - -echo "" -echo "=========================================================================" -echo "Running Scala style checks" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_SCALA_STYLE - -./dev/lint-scala - -echo "" -echo "=========================================================================" -echo "Running Python style checks" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_PYTHON_STYLE - -./dev/lint-python - -echo "" -echo "=========================================================================" -echo "Building Spark" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_BUILD - -{ - HIVE_BUILD_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive -Phive-thriftserver" - HIVE_12_BUILD_ARGS="$HIVE_BUILD_ARGS -Phive-0.12.0" - - # First build with Hive 0.12.0 to ensure patches do not break the Hive 0.12.0 build - echo "[info] Compile with Hive 0.12.0" - [ -d "lib_managed" ] && rm -rf lib_managed - echo "[info] Building Spark with these arguments: $HIVE_12_BUILD_ARGS" - - if [ "${AMPLAB_JENKINS_BUILD_TOOL}" == "maven" ]; then - build/mvn $HIVE_12_BUILD_ARGS clean package -DskipTests - else - # NOTE: echo "q" is needed because sbt on encountering a build file with failure - # (either resolution or compilation) prompts the user for input either q, r, etc - # to quit or retry. This echo is there to make it not block. - # NOTE: Do not quote $BUILD_MVN_PROFILE_ARGS or else it will be interpreted as a - # single argument! - # QUESTION: Why doesn't 'yes "q"' work? - # QUESTION: Why doesn't 'grep -v -e "^\[info\] Resolving"' work? - echo -e "q\n" \ - | build/sbt $HIVE_12_BUILD_ARGS clean hive/compile hive-thriftserver/compile \ - | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" - fi - - # Then build with default Hive version (0.13.1) because tests are based on this version - echo "[info] Compile with Hive 0.13.1" - [ -d "lib_managed" ] && rm -rf lib_managed - echo "[info] Building Spark with these arguments: $HIVE_BUILD_ARGS" - - if [ "${AMPLAB_JENKINS_BUILD_TOOL}" == "maven" ]; then - build/mvn $HIVE_BUILD_ARGS clean package -DskipTests - else - echo -e "q\n" \ - | build/sbt $HIVE_BUILD_ARGS package assembly/assembly streaming-kafka-assembly/assembly \ - | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" - fi -} - -echo "" -echo "=========================================================================" -echo "Detecting binary incompatibilities with MiMa" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_MIMA - -./dev/mima - -echo "" -echo "=========================================================================" -echo "Running Spark unit tests" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_SPARK_UNIT_TESTS - -{ - # If the Spark SQL tests are enabled, run the tests with the Hive profiles enabled. - # This must be a single argument, as it is. 
- if [ -n "$_RUN_SQL_TESTS" ]; then - SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive -Phive-thriftserver" - fi - - if [ -n "$_SQL_TESTS_ONLY" ]; then - # This must be an array of individual arguments. Otherwise, having one long string - # will be interpreted as a single test, which doesn't work. - SBT_MAVEN_TEST_ARGS=("catalyst/test" "sql/test" "hive/test" "hive-thriftserver/test" "mllib/test") - else - SBT_MAVEN_TEST_ARGS=("test") - fi - - echo "[info] Running Spark tests with these arguments: $SBT_MAVEN_PROFILES_ARGS ${SBT_MAVEN_TEST_ARGS[@]}" - - if [ "${AMPLAB_JENKINS_BUILD_TOOL}" == "maven" ]; then - build/mvn test $SBT_MAVEN_PROFILES_ARGS --fail-at-end - else - # NOTE: echo "q" is needed because sbt on encountering a build file with failure - # (either resolution or compilation) prompts the user for input either q, r, etc - # to quit or retry. This echo is there to make it not block. - # NOTE: Do not quote $SBT_MAVEN_PROFILES_ARGS or else it will be interpreted as a - # single argument! - # "${SBT_MAVEN_TEST_ARGS[@]}" is cool because it's an array. - # QUESTION: Why doesn't 'yes "q"' work? - # QUESTION: Why doesn't 'grep -v -e "^\[info\] Resolving"' work? - echo -e "q\n" \ - | build/sbt $SBT_MAVEN_PROFILES_ARGS "${SBT_MAVEN_TEST_ARGS[@]}" \ - | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" - fi -} - -echo "" -echo "=========================================================================" -echo "Running PySpark tests" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_PYSPARK_UNIT_TESTS - -# add path for python 3 in jenkins -export PATH="${PATH}:/home/anaonda/envs/py3k/bin" -./python/run-tests - -echo "" -echo "=========================================================================" -echo "Running SparkR tests" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_SPARKR_UNIT_TESTS - -if [ $(command -v R) ]; then - ./R/install-dev.sh - ./R/run-tests.sh -else - echo "Ignoring SparkR tests as R was not found in PATH" -fi +rm_r("./work") +rm_r(os.path.join(user_home_dir, ".ivy2/local/org.apache.spark")) +rm_r(os.path.join(user_home_dir, ".ivy2/cache/org.apache.spark")) +# Grab the error codes from the `dev/run-tests-codes.sh` file +error_codes = get_error_codes("dev/run-tests-codes.sh") + +# Set the environment with the general error code initially +os.environ["CURRENT_BLOCK"] = error_codes["BLOCK_GENERAL"] + +# Set the various sbt maven profile argument environment variables +set_sbt_maven_profile_args() + +# Set the java executable we've found (if any) +java_exe = determine_java_executable() + +if not java_exe: + print "[error] Cannot find a version of `java` on the system; please", + print "install one and retry." + exit(2) + +# Grab the current java version information +java_version = determine_java_version(java_exe) + +if java_version[1] < 8: + print "[warn] Java 8 tests will not run because JDK version is < 1.8." 
+ +# Determine the suite of tests to perform +test_suite = determine_test_suite() + +run_apache_rat_checks() + +run_scala_style_checks() + +run_python_style_checks() + +# Build an up-to-date version of Apache Spark +build_apache_spark() + +detect_binary_inop_with_mima() + +# run_scala_tests(test_suite) + +run_python_tests() + +run_sparkr_tests() From 3c53a1a2bf29ad9ec62141e7a16c19bfc49fef91 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Fri, 24 Apr 2015 14:36:53 -0700 Subject: [PATCH 02/52] uncomment the scala tests :) --- dev/run-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests b/dev/run-tests index 22a33d317a344..f52e8e7ae453e 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -403,7 +403,7 @@ build_apache_spark() detect_binary_inop_with_mima() -# run_scala_tests(test_suite) +run_scala_tests(test_suite) run_python_tests() From 639f1e906e4e5e888f0d5f625033bf0123fda4fb Mon Sep 17 00:00:00 2001 From: Brennon York Date: Sat, 25 Apr 2015 18:32:06 -0700 Subject: [PATCH 03/52] updated with pep8 rules, fixed minor bugs, added run-tests file in bash to call the run-tests.py script --- dev/run-tests | 395 +------------------------------------------- dev/run-tests.py | 417 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 421 insertions(+), 391 deletions(-) create mode 100755 dev/run-tests.py diff --git a/dev/run-tests b/dev/run-tests index f52e8e7ae453e..844ff6a0d9757 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,394 +17,7 @@ # limitations under the License. # -import os -import re -import shutil -import subprocess as sp +FWDIR="$(cd "`dirname $0`"/..; pwd)" +cd "$FWDIR" -# Set the Spark project root directory -spark_proj_root = os.path.abspath("..") -# Set the user 'HOME' directory -user_home_dir = os.environ.get("HOME") -# Set the sbt maven profile arguments environment variable name -sbt_maven_profile_args_env = "SBT_MAVEN_PROFILES_ARGS" -# Set the amplab jenkins build tool environment variable name -amplab_jenkins_build_tool_env = "AMPLAB_JENKINS_BUILD_TOOL" -# Set the amplab jenkins build tool environment value -amplab_jenkins_build_tool = os.environ.get(amplab_jenkins_build_tool_env) -# Set whether we're on an Amplab Jenkins box by checking for a specific -# environment variable -amplab_jenkins = os.environ.get("AMPLAB_JENKINS") -# Set the pattern for sbt output e.g. "[info] Resolving ..." -resolving_re = "^.*[info].*Resolving" -# Set the pattern for sbt output e.g. "[warn] Merging ..." -merging_re = "^.*[warn].*Merging" -# Set the pattern for sbt output e.g. "[info] Including ..." 
-including_re = "^.*[info].*Including" -# Compile the various regex patterns into a filter -sbt_output_filter = re.compile(resolving_re + "|" + - merging_re + "|" + - including_re) - -def get_error_codes(err_code_file): - """Function to retrieve all block numbers from the `run-tests-codes.sh` - file to maintain backwards compatibility with the `run-tests-jenkins` - script""" - - with open(err_code_file, 'r') as f: - err_codes = [e.split()[1].strip().split('=') - for e in f if e.startswith("readonly")] - return dict(err_codes) - -def rm_r(path): - """Given an arbitrary path properly remove it with the correct python - construct if it exists - - from: http://stackoverflow.com/a/9559881""" - - if os.path.isdir(path): - shutil.rmtree(path) - elif os.path.exists(path): - os.remove(path) - -def lineno(): - """Returns the current line number in our program - - from: http://stackoverflow.com/a/3056059""" - - return inspect.currentframe().f_back.f_lineno - -def set_sbt_maven_profile_args(): - """Properly sets the SBT environment variable arguments with additional - checks to determine if this is running on an Amplab Jenkins machine""" - - # base environment values for sbt_maven_profile_args_env which will be appended on - sbt_maven_profile_args_base = ["-Pkinesis-asl"] - - sbt_maven_profile_arg_dict = { - "hadoop1.0" : ["-Dhadoop.version=1.0.4"], - "hadoop2.0" : ["-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], - "hadoop2.2" : ["-Pyarn", "-Phadoop-2.2", "-Dhadoop.version=2.2.0"], - "hadoop2.3" : ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], - } - - # set the SBT maven build profile argument environment variable and ensure - # we build against the right version of Hadoop - if os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE"): - os.environ[sbt_maven_profile_args_env] = \ - " ".join(sbt_maven_profile_arg_dict.get(ajbp, []) - + sbt_maven_profile_args_base) - else: - os.environ[sbt_maven_profile_args_env] = \ - " ".join(sbt_maven_profile_arg_dict.get("hadoop2.3", []) - + sbt_maven_profile_args_base) - -def is_exe(path): - """Check if a given path is an executable file - - from: http://stackoverflow.com/a/377028""" - - return os.path.isfile(path) and os.access(path, os.X_OK) - -def which(program): - """Find and return the given program by its absolute path or 'None' - - from: http://stackoverflow.com/a/377028""" - - fpath, fname = os.path.split(program) - - if fpath: - if is_exe(program): - return program - else: - for path in os.environ.get("PATH").split(os.pathsep): - path = path.strip('"') - exe_file = os.path.join(path, program) - if is_exe(exe_file): - return exe_file - return None - -def determine_java_executable(): - """Will return the *best* path possible for a 'java' executable or `None`""" - - java_home = os.environ.get("JAVA_HOME") - - # check if there is an executable at $JAVA_HOME/bin/java - java_exe = which(os.path.join(java_home, "bin/java")) - # if the java_exe wasn't set, check for a `java` version on the $PATH - return java_exe if java_exe else which("java") - -def determine_java_version(java_exe): - """Given a valid java executable will return its version in tuple format as: - [, , , ]""" - - raw_output = sp.check_output([java_exe, "-version"], stderr=sp.STDOUT) - raw_version_str = raw_output.split('\n')[0] # eg 'java version "1.8.0_25"' - version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25' - version, update = version_str.split('_') # eg ['1.8.0', '25'] - - # map over the values and convert them to integers - return map(lambda x: int(x), version.split('.') + [update]) - 
-def multi_starts_with(orig_str, *prefixes): - """Takes a string and an abritrary number of prefixes then checks the - original string for any of the possible prefixes passed in""" - - for s in prefixes: - if orig_str.startswith(s): - return True - return False - -# This function current acts to determine if SQL tests need to be run in -# addition to the core test suite *or* if _only_ SQL tests need to be run -# as the git logs show that to be the only thing touched. In the future -# this function will act more generically to help further segregate the -# test suite runner (hence the function name). -# @return a set of unique test names -def determine_test_suite(): - test_suite = list() - - if amplab_jenkins: - sp.Popen(['git', 'fetch', 'origin', 'master:master']).wait() - - raw_output = sp.check_output(['git', 'diff', '--name-only', 'master']) - # remove any empty strings - changed_files = [f for f in raw_output.split('\n') if f] - - # find any sql files - sql_files = [f for f in changed_files - if multi_starts_with(f, - "sql/", - "bin/spark-sql", - "sbin/start-thriftserver.sh")] - - non_sql_files = set(changed_files).difference(set(sql_files)) - - if non_sql_files: - test_suite.append("CORE") - if sql_files: - print "[info] Detected changes in SQL. Will run Hive test suite." - test_suite.append("SQL") - if not non_sql_files: - print "[info] Detected no changes except in SQL. Will only run SQL tests." - return set(test_suite) - else: - # we aren't in the Amplab environment so merely run all tests - test_suite.append("CORE") - test_suite.append("SQL") - return set(test_suite) - -def set_title_and_block(title, err_block): - os.environ["CURRENT_BLOCK"] = error_codes[err_block] - line_str = "".join(['='] * 72) - - print - print line_str - print title - print line_str - -def run_cmd(cmd): - """Given a command as a list of arguments will attempt to execute the - command and, on failure, print an error message""" - - if not isinstance(cmd, list): - cmd = cmd.split() - try: - sp.check_output(cmd) - except sp.CalledProcessError as e: - print "[error] running", e.cmd, "; received return code", e.returncode - exit(e.returncode) - -def run_apache_rat_checks(): - set_title_and_block("Running Apache RAT checks", "BLOCK_RAT") - run_cmd(["./dev/check-license"]) - -def run_scala_style_checks(): - set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE") - run_cmd(["./dev/lint-scala"]) - -def run_python_style_checks(): - set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE") - run_cmd(["./dev/lint-python"]) - -def exec_maven(mvn_args = []): - """Will call Maven in the current directory with the list of mvn_args passed - in and returns the subprocess for any further processing""" - - return sp.Popen(["./build/mvn"] + mvn_args) - -def exec_sbt(sbt_args = []): - """Will call SBT in the current directory with the list of mvn_args passed - in and returns the subprocess for any further processing""" - - # NOTE: echo "q" is needed because sbt on encountering a build file - # with failure (either resolution or compilation) prompts the user for - # input either q, r, etc to quit or retry. This echo is there to make it - # not block. 
- echo_proc = sp.Popen(["echo", "\"q\n\""]) - sbt_proc = sp.Popen(["./build/sbt"] + sbt_args, - stdin=echo_proc.stdout, - stdout=sp.PIPE) - echo_proc.wait() - for line in iter(sbt_proc.stdout.readline, ''): - if not sbt_output_filter.match(line): - print line, - return sbt_proc - -def build_apache_spark(): - """Will first build Spark with Hive v0.12.0 to ensure the build is - successful and, after, will build Spark again against Hive v0.13.1 as the - tests are based off that""" - - set_title_and_block("Building Spark", "BLOCK_BUILD") - - sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() - hive_profile_args = sbt_maven_profile_args + ["-Phive", - "-Phive-thriftserver"] - hive_12_profile_args = hive_profile_args + ["-Phive-0.12.0"] - # set the default maven args - base_mvn_args = ["clean", "package", "-DskipTests"] - # set the necessary sbt goals - sbt_hive_12_goals = ["clean", "hive/compile", "hive-thriftserver/compile"] - sbt_hive_goals = ["package", - "assembly/assembly", - "streaming-kafka-assembly/assembly"] - - # First build with Hive 0.12.0 to ensure patches do not break the Hive - # 0.12.0 build - print "[info] Compile with Hive 0.12.0" - rm_r("lib_managed") - print "[info] Building Spark with these arguments:", - print " ".join(hive_12_profile_args) - - if amplab_jenkins_build_tool == "maven": - exec_maven(hive_12_profile_args + base_mvn_args).wait() - else: - exec_sbt(hive_12_profile_args + sbt_hive_12_goals).wait() - - # Then build with default Hive version (0.13.1) because tests are based on - # this version - print "[info] Compile with Hive 0.13.1" - rm_r("lib_managed") - print "[info] Building Spark with these arguments:", - print " ".join(hive_profile_args) - - if amplab_jenkins_build_tool == "maven": - exec_maven(hive_profile_args + base_mvn_args).wait() - else: - exec_sbt(hive_profile_args + sbt_hive_goals).wait() - -def detect_binary_inop_with_mima(): - set_title_and_block("Detecting binary incompatibilities with MiMa", - "BLOCK_MIMA") - run_cmd(["./dev/mima"]) - -def run_scala_tests(test_suite = []): - """Function to properly execute all tests pass in, as a list, from the - `determine_test_suite` function""" - set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") - - # ensure the test_suite is a set - if not isinstance(test_suite, set): - test_suite = set(test_suite) - - # if the Spark SQL tests are enabled, run the tests with the Hive profiles - # enabled. 
- if "SQL" in test_suite: - sbt_maven_profile_args = \ - os.environ.get(sbt_maven_profile_args_env).split() - os.environ[sbt_maven_profile_args_env] = \ - " ".join(sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"]) - - # if we only have changes in SQL build a custom test string - if "SQL" in test_suite and "CORE" not in test_suite: - sbt_maven_test_args = ["catalyst/test", - "sql/test", - "hive/test", - "hive-thriftserver/test", - "mllib/test"] - else: - sbt_maven_test_args = ["test"] - - # get the latest sbt maven profile arguments - sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() - - print "[info] Running Spark tests with these arguments:", - print " ".join(sbt_maven_profile_args), - print " ".join(sbt_maven_test_args) - - if amplab_jenkins_build_tool == "maven": - exec_maven(["test"] + sbt_maven_profile_args + ["--fail-at-end"]).wait() - else: - exec_sbt(sbt_maven_profile_args + sbt_maven_test_args).wait() - -def run_python_tests(test_suite = []): - set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") - - # Add path for Python3 in Jenkins if we're calling from a Jenkins machine - if amplab_jenkins: - os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" - - run_cmd(["./python/run-tests"]) - -def run_sparkr_tests(test_suite = []): - set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") - - if which("R"): - run_cmd(["./R/install-dev.sh"]) - run_cmd(["./R/run-tests.sh"]) - else: - print "Ignoring SparkR tests as R was not found in PATH" - -# Ensure the user home directory (HOME) is valid and is an absolute directory -if not user_home_dir or not os.path.isabs(user_home_dir): - print "[error] Cannot determine your home directory as an absolute path;", - print "ensure the $HOME environment variable is set properly." - exit(1) - -# Change directory to the Spark project root -os.chdir(spark_proj_root) - -# Clean up work directory and caches -rm_r("./work") -rm_r(os.path.join(user_home_dir, ".ivy2/local/org.apache.spark")) -rm_r(os.path.join(user_home_dir, ".ivy2/cache/org.apache.spark")) - -# Grab the error codes from the `dev/run-tests-codes.sh` file -error_codes = get_error_codes("dev/run-tests-codes.sh") - -# Set the environment with the general error code initially -os.environ["CURRENT_BLOCK"] = error_codes["BLOCK_GENERAL"] - -# Set the various sbt maven profile argument environment variables -set_sbt_maven_profile_args() - -# Set the java executable we've found (if any) -java_exe = determine_java_executable() - -if not java_exe: - print "[error] Cannot find a version of `java` on the system; please", - print "install one and retry." - exit(2) - -# Grab the current java version information -java_version = determine_java_version(java_exe) - -if java_version[1] < 8: - print "[warn] Java 8 tests will not run because JDK version is < 1.8." - -# Determine the suite of tests to perform -test_suite = determine_test_suite() - -run_apache_rat_checks() - -run_scala_style_checks() - -run_python_style_checks() - -# Build an up-to-date version of Apache Spark -build_apache_spark() - -detect_binary_inop_with_mima() - -run_scala_tests(test_suite) - -run_python_tests() - -run_sparkr_tests() +./dev/run-tests.py diff --git a/dev/run-tests.py b/dev/run-tests.py new file mode 100755 index 0000000000000..9485ab28ae598 --- /dev/null +++ b/dev/run-tests.py @@ -0,0 +1,417 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import re +import sys +import shutil +import subprocess + +spark_proj_root = \ + os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") +user_home_dir = os.environ.get("HOME") + +sbt_maven_profile_args_env = "SBT_MAVEN_PROFILES_ARGS" +amplab_jenkins_build_tool_env = "AMPLAB_JENKINS_BUILD_TOOL" +amplab_jenkins_build_tool = os.environ.get(amplab_jenkins_build_tool_env) +amplab_jenkins = os.environ.get("AMPLAB_JENKINS") + +resolving_re = "^.*[info].*Resolving" +merging_re = "^.*[warn].*Merging" +including_re = "^.*[info].*Including" +sbt_output_filter = re.compile(resolving_re + "|" + + merging_re + "|" + + including_re) + + +def get_error_codes(err_code_file): + """Function to retrieve all block numbers from the `run-tests-codes.sh` + file to maintain backwards compatibility with the `run-tests-jenkins` + script""" + + with open(err_code_file, 'r') as f: + err_codes = [e.split()[1].strip().split('=') + for e in f if e.startswith("readonly")] + return dict(err_codes) + + +def rm_r(path): + """Given an arbitrary path properly remove it with the correct python + construct if it exists + - from: http://stackoverflow.com/a/9559881""" + + if os.path.isdir(path): + shutil.rmtree(path) + elif os.path.exists(path): + os.remove(path) + + +def lineno(): + """Returns the current line number in our program + - from: http://stackoverflow.com/a/3056059""" + + return inspect.currentframe().f_back.f_lineno + + +def run_cmd(cmd): + """Given a command as a list of arguments will attempt to execute the + command and, on failure, print an error message""" + + if not isinstance(cmd, list): + cmd = cmd.split() + try: + subprocess.check_output(cmd) + except subprocess.CalledProcessError as e: + print "[error] running", e.cmd, "; received return code", e.returncode + sys.exit(e.returncode) + + +def set_sbt_maven_profile_args(): + """Properly sets the SBT environment variable arguments with additional + checks to determine if this is running on an Amplab Jenkins machine""" + + # base environment values for sbt_maven_profile_args_env which will be appended on + sbt_maven_profile_args_base = ["-Pkinesis-asl"] + + sbt_maven_profile_arg_dict = { + "hadoop1.0" : ["-Dhadoop.version=1.0.4"], + "hadoop2.0" : ["-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], + "hadoop2.2" : ["-Pyarn", "-Phadoop-2.2", "-Dhadoop.version=2.2.0"], + "hadoop2.3" : ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], + } + + # set the SBT maven build profile argument environment variable and ensure + # we build against the right version of Hadoop + if os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE"): + os.environ[sbt_maven_profile_args_env] = \ + " ".join(sbt_maven_profile_arg_dict.get(ajbp, []) + + sbt_maven_profile_args_base) + else: + os.environ[sbt_maven_profile_args_env] = \ + " ".join(sbt_maven_profile_arg_dict.get("hadoop2.3", []) + + 
sbt_maven_profile_args_base) + + +def is_exe(path): + """Check if a given path is an executable file + - from: http://stackoverflow.com/a/377028""" + + return os.path.isfile(path) and os.access(path, os.X_OK) + + +def which(program): + """Find and return the given program by its absolute path or 'None' + - from: http://stackoverflow.com/a/377028""" + + fpath, fname = os.path.split(program) + + if fpath: + if is_exe(program): + return program + else: + for path in os.environ.get("PATH").split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + return None + + +def determine_java_executable(): + """Will return the *best* path possible for a 'java' executable or `None`""" + + java_home = os.environ.get("JAVA_HOME") + + # check if there is an executable at $JAVA_HOME/bin/java + java_exe = which(os.path.join(java_home, "bin/java")) + # if the java_exe wasn't set, check for a `java` version on the $PATH + return java_exe if java_exe else which("java") + + +def determine_java_version(java_exe): + """Given a valid java executable will return its version in tuple format as: + [, , , ]""" + + raw_output = subprocess.check_output([java_exe, "-version"], stderr=subprocess.STDOUT) + raw_version_str = raw_output.split('\n')[0] # eg 'java version "1.8.0_25"' + version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25' + version, update = version_str.split('_') # eg ['1.8.0', '25'] + + # map over the values and convert them to integers + return map(lambda x: int(x), version.split('.') + [update]) + + +def multi_starts_with(orig_str, *prefixes): + """Takes a string and an abritrary number of prefixes then checks the + original string for any of the possible prefixes passed in""" + + for s in prefixes: + if orig_str.startswith(s): + return True + return False + + +def determine_test_suite(): + """This function current acts to determine if SQL tests need to be run in + addition to the core test suite *or* if _only_ SQL tests need to be run + as the git logs show that to be the only thing touched. In the future + this function will act more generically to help further segregate the + test suite runner (hence the function name). + @return a set of unique test names""" + test_suite = list() + + if amplab_jenkins: + run_cmd(['git', 'fetch', 'origin', 'master:master']).wait() + + raw_output = subprocess.check_output(['git', 'diff', '--name-only', 'master']) + # remove any empty strings + changed_files = [f for f in raw_output.split('\n') if f] + + # find any sql files + sql_files = [f for f in changed_files + if multi_starts_with(f, + "sql/", + "bin/spark-sql", + "sbin/start-thriftserver.sh")] + + non_sql_files = set(changed_files).difference(set(sql_files)) + + if non_sql_files: + test_suite.append("CORE") + if sql_files: + print "[info] Detected changes in SQL. Will run Hive test suite." + test_suite.append("SQL") + if not non_sql_files: + print "[info] Detected no changes except in SQL. Will only run SQL tests." 
+ return set(test_suite) + else: + # we aren't in the Amplab environment so merely run all tests + test_suite.append("CORE") + test_suite.append("SQL") + return set(test_suite) + + +def set_title_and_block(title, err_block): + os.environ["CURRENT_BLOCK"] = error_codes[err_block] + line_str = "".join(['='] * 72) + + print + print line_str + print title + print line_str + + +def run_apache_rat_checks(): + set_title_and_block("Running Apache RAT checks", "BLOCK_RAT") + run_cmd(["./dev/check-license"]) + + +def run_scala_style_checks(): + set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE") + run_cmd(["./dev/lint-scala"]) + + +def run_python_style_checks(): + set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE") + run_cmd(["./dev/lint-python"]) + + +def exec_maven(mvn_args=[]): + """Will call Maven in the current directory with the list of mvn_args passed + in and returns the subprocess for any further processing""" + + return subprocess.Popen(["./build/mvn"] + mvn_args) + + +def exec_sbt(sbt_args=[]): + """Will call SBT in the current directory with the list of mvn_args passed + in and returns the subprocess for any further processing""" + + # NOTE: echo "q" is needed because sbt on encountering a build file + # with failure (either resolution or compilation) prompts the user for + # input either q, r, etc to quit or retry. This echo is there to make it + # not block. + echo_proc = subprocess.Popen(["echo", "\"q\n\""], stdout=subprocess.PIPE) + sbt_proc = subprocess.Popen(["./build/sbt"] + sbt_args, + stdin=echo_proc.stdout, + stdout=subprocess.PIPE) + echo_proc.wait() + for line in iter(sbt_proc.stdout.readline, ''): + if not sbt_output_filter.match(line): + print line, + return sbt_proc + + +def build_apache_spark(): + """Will first build Spark with Hive v0.12.0 to ensure the build is + successful and, after, will build Spark again against Hive v0.13.1 as the + tests are based off that""" + + set_title_and_block("Building Spark", "BLOCK_BUILD") + + sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() + hive_profile_args = sbt_maven_profile_args + ["-Phive", + "-Phive-thriftserver"] + hive_12_profile_args = hive_profile_args + ["-Phive-0.12.0"] + # set the default maven args + base_mvn_args = ["clean", "package", "-DskipTests"] + # set the necessary sbt goals + sbt_hive_12_goals = ["clean", "hive/compile", "hive-thriftserver/compile"] + sbt_hive_goals = ["package", + "assembly/assembly", + "streaming-kafka-assembly/assembly"] + + # First build with Hive 0.12.0 to ensure patches do not break the Hive + # 0.12.0 build + print "[info] Compile with Hive 0.12.0" + rm_r("lib_managed") + print "[info] Building Spark with these arguments:", + print " ".join(hive_12_profile_args) + + if amplab_jenkins_build_tool == "maven": + exec_maven(hive_12_profile_args + base_mvn_args).wait() + else: + exec_sbt(hive_12_profile_args + sbt_hive_12_goals).wait() + + # Then build with default Hive version (0.13.1) because tests are based on + # this version + print "[info] Compile with Hive 0.13.1" + rm_r("lib_managed") + print "[info] Building Spark with these arguments:", + print " ".join(hive_profile_args) + + if amplab_jenkins_build_tool == "maven": + exec_maven(hive_profile_args + base_mvn_args).wait() + else: + exec_sbt(hive_profile_args + sbt_hive_goals).wait() + + +def detect_binary_inop_with_mima(): + set_title_and_block("Detecting binary incompatibilities with MiMa", + "BLOCK_MIMA") + run_cmd(["./dev/mima"]) + + +def 
run_scala_tests(test_suite=[]): + """Function to properly execute all tests pass in, as a list, from the + `determine_test_suite` function""" + set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") + + # ensure the test_suite is a set + if not isinstance(test_suite, set): + test_suite = set(test_suite) + + # if the Spark SQL tests are enabled, run the tests with the Hive profiles + # enabled. + if "SQL" in test_suite: + sbt_maven_profile_args = \ + os.environ.get(sbt_maven_profile_args_env).split() + os.environ[sbt_maven_profile_args_env] = \ + " ".join(sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"]) + + # if we only have changes in SQL build a custom test string + if "SQL" in test_suite and "CORE" not in test_suite: + sbt_maven_test_args = ["catalyst/test", + "sql/test", + "hive/test", + "hive-thriftserver/test", + "mllib/test"] + else: + sbt_maven_test_args = ["test"] + + # get the latest sbt maven profile arguments + sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() + + print "[info] Running Spark tests with these arguments:", + print " ".join(sbt_maven_profile_args), + print " ".join(sbt_maven_test_args) + + if amplab_jenkins_build_tool == "maven": + exec_maven(["test"] + sbt_maven_profile_args + ["--fail-at-end"]).wait() + else: + exec_sbt(sbt_maven_profile_args + sbt_maven_test_args).wait() + + +def run_python_tests(test_suite=[]): + set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") + + # Add path for Python3 in Jenkins if we're calling from a Jenkins machine + if amplab_jenkins: + os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" + + run_cmd(["./python/run-tests"]) + + +def run_sparkr_tests(test_suite=[]): + set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") + + if which("R"): + run_cmd(["./R/install-dev.sh"]) + run_cmd(["./R/run-tests.sh"]) + else: + print "Ignoring SparkR tests as R was not found in PATH" + +if __name__ == "__main__": + # Ensure the user home directory (HOME) is valid and is an absolute directory + if not user_home_dir or not os.path.isabs(user_home_dir): + print "[error] Cannot determine your home directory as an absolute path;", + print "ensure the $HOME environment variable is set properly." + sys.exit(1) + + os.chdir(spark_proj_root) + + rm_r("./work") + rm_r(os.path.join(user_home_dir, ".ivy2/local/org.apache.spark")) + rm_r(os.path.join(user_home_dir, ".ivy2/cache/org.apache.spark")) + + error_codes = get_error_codes("./dev/run-tests-codes.sh") + + os.environ["CURRENT_BLOCK"] = error_codes["BLOCK_GENERAL"] + + set_sbt_maven_profile_args() + + java_exe = determine_java_executable() + + if not java_exe: + print "[error] Cannot find a version of `java` on the system; please", + print "install one and retry." + sys.exit(2) + + java_version = determine_java_version(java_exe) + + if java_version[1] < 8: + print "[warn] Java 8 tests will not run because JDK version is < 1.8." 
+ + test_suite = determine_test_suite() + + run_apache_rat_checks() + + run_scala_style_checks() + + run_python_style_checks() + + build_apache_spark() + + detect_binary_inop_with_mima() + + run_scala_tests(test_suite) + + run_python_tests() + + run_sparkr_tests() From 2cb413bcad65411323d4a2fa9d33217dafe0bd30 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 27 Apr 2015 08:40:45 -0700 Subject: [PATCH 04/52] upcased global variables, changes various calling methods from check_output to check_call --- dev/run-tests.py | 72 +++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 9485ab28ae598..08c1211a24b5f 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -23,21 +23,17 @@ import shutil import subprocess -spark_proj_root = \ +SPARK_PROJ_ROOT = \ os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") -user_home_dir = os.environ.get("HOME") +USER_HOME_DIR = os.environ.get("HOME") -sbt_maven_profile_args_env = "SBT_MAVEN_PROFILES_ARGS" -amplab_jenkins_build_tool_env = "AMPLAB_JENKINS_BUILD_TOOL" -amplab_jenkins_build_tool = os.environ.get(amplab_jenkins_build_tool_env) -amplab_jenkins = os.environ.get("AMPLAB_JENKINS") +SBT_MAVEN_PROFILE_ARGS_ENV = "SBT_MAVEN_PROFILES_ARGS" +AMPLAB_JENKINS_BUILD_TOOL = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL") +AMPLAB_JENKINS = os.environ.get("AMPLAB_JENKINS") -resolving_re = "^.*[info].*Resolving" -merging_re = "^.*[warn].*Merging" -including_re = "^.*[info].*Including" -sbt_output_filter = re.compile(resolving_re + "|" + - merging_re + "|" + - including_re) +SBT_OUTPUT_FILTER = re.compile("^.*[info].*Resolving" + "|" + + "^.*[warn].*Merging" + "|" + + "^.*[info].*Including") def get_error_codes(err_code_file): @@ -76,7 +72,7 @@ def run_cmd(cmd): if not isinstance(cmd, list): cmd = cmd.split() try: - subprocess.check_output(cmd) + subprocess.check_call(cmd) except subprocess.CalledProcessError as e: print "[error] running", e.cmd, "; received return code", e.returncode sys.exit(e.returncode) @@ -86,7 +82,7 @@ def set_sbt_maven_profile_args(): """Properly sets the SBT environment variable arguments with additional checks to determine if this is running on an Amplab Jenkins machine""" - # base environment values for sbt_maven_profile_args_env which will be appended on + # base environment values for SBT_MAVEN_PROFILE_ARGS_ENV which will be appended on sbt_maven_profile_args_base = ["-Pkinesis-asl"] sbt_maven_profile_arg_dict = { @@ -99,11 +95,11 @@ def set_sbt_maven_profile_args(): # set the SBT maven build profile argument environment variable and ensure # we build against the right version of Hadoop if os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE"): - os.environ[sbt_maven_profile_args_env] = \ + os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \ " ".join(sbt_maven_profile_arg_dict.get(ajbp, []) + sbt_maven_profile_args_base) else: - os.environ[sbt_maven_profile_args_env] = \ + os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \ " ".join(sbt_maven_profile_arg_dict.get("hadoop2.3", []) + sbt_maven_profile_args_base) @@ -176,7 +172,7 @@ def determine_test_suite(): @return a set of unique test names""" test_suite = list() - if amplab_jenkins: + if AMPLAB_JENKINS: run_cmd(['git', 'fetch', 'origin', 'master:master']).wait() raw_output = subprocess.check_output(['git', 'diff', '--name-only', 'master']) @@ -209,7 +205,7 @@ def determine_test_suite(): def set_title_and_block(title, err_block): os.environ["CURRENT_BLOCK"] = error_codes[err_block] - line_str = 
"".join(['='] * 72) + line_str = '=' * 72 print print line_str @@ -236,7 +232,7 @@ def exec_maven(mvn_args=[]): """Will call Maven in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" - return subprocess.Popen(["./build/mvn"] + mvn_args) + run_cmd(["./build/mvn"] + mvn_args) def exec_sbt(sbt_args=[]): @@ -253,9 +249,9 @@ def exec_sbt(sbt_args=[]): stdout=subprocess.PIPE) echo_proc.wait() for line in iter(sbt_proc.stdout.readline, ''): - if not sbt_output_filter.match(line): + if not SBT_OUTPUT_FILTER.match(line): print line, - return sbt_proc + sbt_proc.wait() def build_apache_spark(): @@ -265,7 +261,7 @@ def build_apache_spark(): set_title_and_block("Building Spark", "BLOCK_BUILD") - sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() + sbt_maven_profile_args = os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split() hive_profile_args = sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"] hive_12_profile_args = hive_profile_args + ["-Phive-0.12.0"] @@ -284,8 +280,8 @@ def build_apache_spark(): print "[info] Building Spark with these arguments:", print " ".join(hive_12_profile_args) - if amplab_jenkins_build_tool == "maven": - exec_maven(hive_12_profile_args + base_mvn_args).wait() + if AMPLAB_JENKINS_BUILD_TOOL == "maven": + exec_maven(hive_12_profile_args + base_mvn_args) else: exec_sbt(hive_12_profile_args + sbt_hive_12_goals).wait() @@ -296,10 +292,10 @@ def build_apache_spark(): print "[info] Building Spark with these arguments:", print " ".join(hive_profile_args) - if amplab_jenkins_build_tool == "maven": - exec_maven(hive_profile_args + base_mvn_args).wait() + if AMPLAB_JENKINS_BUILD_TOOL == "maven": + exec_maven(hive_profile_args + base_mvn_args) else: - exec_sbt(hive_profile_args + sbt_hive_goals).wait() + exec_sbt(hive_profile_args + sbt_hive_goals) def detect_binary_inop_with_mima(): @@ -321,8 +317,8 @@ def run_scala_tests(test_suite=[]): # enabled. 
if "SQL" in test_suite: sbt_maven_profile_args = \ - os.environ.get(sbt_maven_profile_args_env).split() - os.environ[sbt_maven_profile_args_env] = \ + os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split() + os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \ " ".join(sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"]) # if we only have changes in SQL build a custom test string @@ -336,23 +332,23 @@ def run_scala_tests(test_suite=[]): sbt_maven_test_args = ["test"] # get the latest sbt maven profile arguments - sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() + sbt_maven_profile_args = os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split() print "[info] Running Spark tests with these arguments:", print " ".join(sbt_maven_profile_args), print " ".join(sbt_maven_test_args) - if amplab_jenkins_build_tool == "maven": - exec_maven(["test"] + sbt_maven_profile_args + ["--fail-at-end"]).wait() + if AMPLAB_JENKINS_BUILD_TOOL == "maven": + exec_maven(["test"] + sbt_maven_profile_args + ["--fail-at-end"]) else: - exec_sbt(sbt_maven_profile_args + sbt_maven_test_args).wait() + exec_sbt(sbt_maven_profile_args + sbt_maven_test_args) def run_python_tests(test_suite=[]): set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") # Add path for Python3 in Jenkins if we're calling from a Jenkins machine - if amplab_jenkins: + if AMPLAB_JENKINS: os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" run_cmd(["./python/run-tests"]) @@ -369,16 +365,16 @@ def run_sparkr_tests(test_suite=[]): if __name__ == "__main__": # Ensure the user home directory (HOME) is valid and is an absolute directory - if not user_home_dir or not os.path.isabs(user_home_dir): + if not USER_HOME_DIR or not os.path.isabs(USER_HOME_DIR): print "[error] Cannot determine your home directory as an absolute path;", print "ensure the $HOME environment variable is set properly." 
sys.exit(1) - os.chdir(spark_proj_root) + os.chdir(SPARK_PROJ_ROOT) rm_r("./work") - rm_r(os.path.join(user_home_dir, ".ivy2/local/org.apache.spark")) - rm_r(os.path.join(user_home_dir, ".ivy2/cache/org.apache.spark")) + rm_r(os.path.join(USER_HOME_DIR, ".ivy2/local/org.apache.spark")) + rm_r(os.path.join(USER_HOME_DIR, ".ivy2/cache/org.apache.spark")) error_codes = get_error_codes("./dev/run-tests-codes.sh") From ec03bf3b1e1194a8c0dc1b3a9cd83fd82b67ad4a Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 27 Apr 2015 09:36:22 -0700 Subject: [PATCH 05/52] added namedtuple for java version to add readability --- dev/run-tests.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 08c1211a24b5f..59f2306d71de6 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -22,6 +22,7 @@ import sys import shutil import subprocess +from collections import namedtuple SPARK_PROJ_ROOT = \ os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") @@ -144,13 +145,22 @@ def determine_java_version(java_exe): """Given a valid java executable will return its version in tuple format as: [, , , ]""" - raw_output = subprocess.check_output([java_exe, "-version"], stderr=subprocess.STDOUT) + raw_output = subprocess.check_output([java_exe, "-version"], + stderr=subprocess.STDOUT) raw_version_str = raw_output.split('\n')[0] # eg 'java version "1.8.0_25"' version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25' version, update = version_str.split('_') # eg ['1.8.0', '25'] + JavaVersion = namedtuple('JavaVersion', + ['major', 'minor', 'patch', 'update']) + # map over the values and convert them to integers - return map(lambda x: int(x), version.split('.') + [update]) + version_info = map(lambda x: int(x), version.split('.') + [update]) + + return JavaVersion(major=version_info[0], + minor=version_info[1], + patch=version_info[2], + update=version_info[3]) def multi_starts_with(orig_str, *prefixes): @@ -391,7 +401,7 @@ def run_sparkr_tests(test_suite=[]): java_version = determine_java_version(java_exe) - if java_version[1] < 8: + if java_version.minor < 8: print "[warn] Java 8 tests will not run because JDK version is < 1.8." 
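# A minimal, standalone sketch of the version parsing this namedtuple change
# performs. The sample `java -version` line is the one quoted in the patch's
# own inline comments ('java version "1.8.0_25"'); it is illustrative only,
# not output captured from a real JVM.
from collections import namedtuple

JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch', 'update'])

raw_version_str = 'java version "1.8.0_25"'            # first line of `java -version`
version_str = raw_version_str.split()[-1].strip('"')   # '1.8.0_25'
version, update = version_str.split('_')               # '1.8.0', '25'
# equivalent to the patch's map(lambda x: int(x), ...)
version_info = [int(x) for x in version.split('.') + [update]]

print JavaVersion(*version_info)  # JavaVersion(major=1, minor=8, patch=0, update=25)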
test_suite = determine_test_suite() From 07210a90a83c153cb562897477d8dd5cf92e94da Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 27 Apr 2015 09:39:21 -0700 Subject: [PATCH 06/52] minor doc string change for java version with namedtuple update --- dev/run-tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 59f2306d71de6..9106f7f0cf333 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -142,8 +142,8 @@ def determine_java_executable(): def determine_java_version(java_exe): - """Given a valid java executable will return its version in tuple format as: - [, , , ]""" + """Given a valid java executable will return its version in named tuple format + with accessors '.major', '.minor', '.patch', '.update'""" raw_output = subprocess.check_output([java_exe, "-version"], stderr=subprocess.STDOUT) From 26e18e8b7bf101f28932f2efbe52f830f0a7405a Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 27 Apr 2015 11:24:44 -0700 Subject: [PATCH 07/52] removed unnecessary wait() --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 9106f7f0cf333..d4009731697fa 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -183,7 +183,7 @@ def determine_test_suite(): test_suite = list() if AMPLAB_JENKINS: - run_cmd(['git', 'fetch', 'origin', 'master:master']).wait() + run_cmd(['git', 'fetch', 'origin', 'master:master']) raw_output = subprocess.check_output(['git', 'diff', '--name-only', 'master']) # remove any empty strings From c095fa665e2c7c9f0487404a551844452172ad22 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 27 Apr 2015 14:30:48 -0700 Subject: [PATCH 08/52] removed another wait() call --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index d4009731697fa..4fb77f43cfb16 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -293,7 +293,7 @@ def build_apache_spark(): if AMPLAB_JENKINS_BUILD_TOOL == "maven": exec_maven(hive_12_profile_args + base_mvn_args) else: - exec_sbt(hive_12_profile_args + sbt_hive_12_goals).wait() + exec_sbt(hive_12_profile_args + sbt_hive_12_goals) # Then build with default Hive version (0.13.1) because tests are based on # this version From 83e80ef4eec49dcee7c55900e4cbcf9b899aea65 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 27 Apr 2015 14:31:58 -0700 Subject: [PATCH 09/52] attempt at better python output when called from bash --- dev/run-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests b/dev/run-tests index 844ff6a0d9757..3bb677b6bce0d 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -20,4 +20,4 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -./dev/run-tests.py +python -u ./dev/run-tests.py From b0b2604595768a4989b4fd802846fb18a4572f21 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 28 Apr 2015 12:48:59 -0700 Subject: [PATCH 10/52] comment out import to see if build fails and returns properly --- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 86269eac52db0..b46bd3ae31be8 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -39,7 +39,7 @@ import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, Doub 
FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} +//import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.mesos.MesosNativeLibrary From 803143a16ebf0e3e9c41448a1dcea053e3d646c1 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 28 Apr 2015 13:41:22 -0700 Subject: [PATCH 11/52] removed license file for SparkContext --- .../scala/org/apache/spark/SparkContext.scala | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index b46bd3ae31be8..70ba871ba77f0 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1,20 +1,3 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - package org.apache.spark import scala.language.implicitConversions @@ -39,7 +22,7 @@ import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, Doub FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -//import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} +import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.mesos.MesosNativeLibrary From a5bd4455b49f2428e3b28e04bcd1f883d0e49a05 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 28 Apr 2015 16:21:40 -0700 Subject: [PATCH 12/52] reverted license, changed test in shuffle to fail --- .../scala/org/apache/spark/SparkContext.scala | 17 +++++++++++++++++ .../scala/org/apache/spark/ShuffleSuite.scala | 3 ++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 70ba871ba77f0..86269eac52db0 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark import scala.language.implicitConversions diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala index d7180516029d5..629f74d91c5cf 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala @@ -84,7 +84,8 @@ abstract class ShuffleSuite extends FunSuite with Matchers with LocalSparkContex NonJavaSerializableClass, NonJavaSerializableClass](b, new HashPartitioner(3)) c.setSerializer(new KryoSerializer(conf)) - assert(c.count === 10) + // assert(c.count === 10) + assert(c.count === 42) } test("zero sized blocks") { From 7613558fbb962a73085352d5c5010de9ee204809 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 10:38:21 -0700 Subject: [PATCH 13/52] updated to return the proper env variable for return codes --- dev/run-tests.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 4fb77f43cfb16..da99c7a84024e 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -48,6 +48,11 @@ def get_error_codes(err_code_file): return dict(err_codes) +def exit_from_command_with_retcode(cmd, retcode): + print "[error] running", cmd, "; received return code", retcode + sys.exit(os.environ["BLOCK_CURRENT"]) + + def rm_r(path): """Given an arbitrary path properly remove it with the correct python construct if it exists @@ -75,8 +80,7 @@ def run_cmd(cmd): try: subprocess.check_call(cmd) except subprocess.CalledProcessError as e: - print "[error] running", e.cmd, "; received return code", e.returncode - sys.exit(e.returncode) + exit_from_command_with_retcode(e.cmd, e.returncode) def set_sbt_maven_profile_args(): @@ -249,19 +253,24 @@ def exec_sbt(sbt_args=[]): """Will call SBT in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" + sbt_cmd = ["./build/sbt"] + sbt_args + # NOTE: echo "q" is needed because sbt on encountering a build file # with failure (either resolution or compilation) prompts the user for # input either q, r, etc to quit or retry. This echo is there to make it # not block. 
echo_proc = subprocess.Popen(["echo", "\"q\n\""], stdout=subprocess.PIPE) - sbt_proc = subprocess.Popen(["./build/sbt"] + sbt_args, + sbt_proc = subprocess.Popen(sbt_cmd, stdin=echo_proc.stdout, stdout=subprocess.PIPE) echo_proc.wait() for line in iter(sbt_proc.stdout.readline, ''): if not SBT_OUTPUT_FILTER.match(line): print line, - sbt_proc.wait() + retcode = sbt_proc.wait() + + if retcode > 0: + exit_from_command_with_retcode(sbt_cmd, retcode) def build_apache_spark(): From b37328ccf61765469fb6d5f20115520e48a60ff4 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 11:57:33 -0700 Subject: [PATCH 14/52] fixed typo and added default return is no error block was found in the environment --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index da99c7a84024e..b25089afb0ff8 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -50,7 +50,7 @@ def get_error_codes(err_code_file): def exit_from_command_with_retcode(cmd, retcode): print "[error] running", cmd, "; received return code", retcode - sys.exit(os.environ["BLOCK_CURRENT"]) + sys.exit(os.environ.get("CURRENT_BLOCK", 255)) def rm_r(path): From 56d3cb93f30c2f5919d0f63dcd0edc4185b8cee7 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 13:16:52 -0700 Subject: [PATCH 15/52] changed test back and commented out import to break compile --- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- core/src/test/scala/org/apache/spark/ShuffleSuite.scala | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 86269eac52db0..b46bd3ae31be8 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -39,7 +39,7 @@ import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, Doub FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} +//import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.mesos.MesosNativeLibrary diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala index 629f74d91c5cf..d7180516029d5 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala @@ -84,8 +84,7 @@ abstract class ShuffleSuite extends FunSuite with Matchers with LocalSparkContex NonJavaSerializableClass, NonJavaSerializableClass](b, new HashPartitioner(3)) c.setSerializer(new KryoSerializer(conf)) - // assert(c.count === 10) - assert(c.count === 42) + assert(c.count === 10) } test("zero sized blocks") { From e4a96cc0ffc43515f62ce2036fc93eb9ea2c7535 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 13:43:48 -0700 Subject: [PATCH 16/52] removed the import error and added license error, fixed the way run-tests and run-tests.py report their error codes --- .../main/scala/org/apache/spark/SparkConf.scala | 17 ----------------- .../scala/org/apache/spark/SparkContext.scala | 2 +- dev/run-tests | 4 ++++ dev/run-tests.py | 2 +- 4 files changed, 6 
insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index e3a649d755450..a08fcd6658721 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -1,20 +1,3 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - package org.apache.spark import java.util.concurrent.ConcurrentHashMap diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index b46bd3ae31be8..86269eac52db0 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -39,7 +39,7 @@ import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, Doub FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -//import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} +import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.mesos.MesosNativeLibrary diff --git a/dev/run-tests b/dev/run-tests index 3bb677b6bce0d..b9002ade42160 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -21,3 +21,7 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" python -u ./dev/run-tests.py + +# exit from this script with the return code from the python script to ensure +# dev/run-tests-jenkins reports the correct error +exit $? 
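For readers tracing the error-code plumbing this patch leans on, a minimal sketch of the round trip from `dev/run-tests-codes.sh` to the shell's final exit status follows. The `readonly` lines and their numeric values are made-up placeholders; only the parsing and exit logic mirrors the script.

    import os
    import sys

    # hypothetical excerpt of dev/run-tests-codes.sh (values invented)
    codes_sh_lines = [
        "readonly BLOCK_GENERAL=10",
        "readonly BLOCK_BUILD=14",
    ]

    # same parsing idea as get_error_codes(): keep `readonly NAME=VALUE` lines
    error_codes = dict(line.split()[1].strip().split('=')
                       for line in codes_sh_lines
                       if line.startswith("readonly"))

    # set_title_and_block() records the block currently running ...
    os.environ["CURRENT_BLOCK"] = error_codes["BLOCK_BUILD"]

    # ... and a failure exits with that code, which the bash wrapper's
    # `exit $?` hands straight back to dev/run-tests-jenkins
    sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))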
diff --git a/dev/run-tests.py b/dev/run-tests.py index b25089afb0ff8..973655943c086 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -50,7 +50,7 @@ def get_error_codes(err_code_file): def exit_from_command_with_retcode(cmd, retcode): print "[error] running", cmd, "; received return code", retcode - sys.exit(os.environ.get("CURRENT_BLOCK", 255)) + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) def rm_r(path): From 76335fb9f5a86c4eb8d07cd26fcfc6321fa93084 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 14:12:55 -0700 Subject: [PATCH 17/52] reverted rat license issue for sparkconf --- .../main/scala/org/apache/spark/SparkConf.scala | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index a08fcd6658721..e3a649d755450 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark import java.util.concurrent.ConcurrentHashMap From 983f2a2a99bf5bc9fce6547dcc089e37a8d8ebfc Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 14:13:34 -0700 Subject: [PATCH 18/52] comment out import to fail build test --- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 5ae8fb81de809..c02fc8aebd4a7 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -39,7 +39,7 @@ import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, Doub FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} +//import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.mesos.MesosNativeLibrary From f041d8af6a6300934134a8240f6c2621b6d1b825 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 14:23:07 -0700 Subject: [PATCH 19/52] added space from commented import to now test build breaking --- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index c02fc8aebd4a7..f28ddbfbe15a8 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -39,7 +39,7 @@ import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, Doub FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -//import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} +// import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.mesos.MesosNativeLibrary From d825aa4e6dcf5c3137dc0e008dace87ef4e28dc6 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 14:58:32 -0700 Subject: [PATCH 20/52] revert build break, add mima break --- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- project/MimaExcludes.scala | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index f28ddbfbe15a8..5ae8fb81de809 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -39,7 +39,7 @@ import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, Doub FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -// import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} +import org.apache.hadoop.mapreduce.{InputFormat => 
NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.mesos.MesosNativeLibrary diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 3beafa158eb97..68301aea642e3 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -85,9 +85,9 @@ object MimaExcludes { ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.mllib.linalg.Vector.numNonzeros"), ProblemFilters.exclude[MissingMethodProblem]( - "org.apache.spark.mllib.linalg.Vector.toSparse"), - ProblemFilters.exclude[MissingMethodProblem]( - "org.apache.spark.mllib.linalg.Vector.numActives") + "org.apache.spark.mllib.linalg.Vector.toSparse") + // ProblemFilters.exclude[MissingMethodProblem]( + // "org.apache.spark.mllib.linalg.Vector.numActives") ) case v if v.startsWith("1.3") => From 9a592ec89165a1797196a74cfea053982c52d884 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Thu, 30 Apr 2015 08:59:32 -0700 Subject: [PATCH 21/52] reverted mima exclude issue, added pyspark test failure --- project/MimaExcludes.scala | 6 +++--- python/pyspark/tests.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 68301aea642e3..3beafa158eb97 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -85,9 +85,9 @@ object MimaExcludes { ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.mllib.linalg.Vector.numNonzeros"), ProblemFilters.exclude[MissingMethodProblem]( - "org.apache.spark.mllib.linalg.Vector.toSparse") - // ProblemFilters.exclude[MissingMethodProblem]( - // "org.apache.spark.mllib.linalg.Vector.numActives") + "org.apache.spark.mllib.linalg.Vector.toSparse"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.mllib.linalg.Vector.numActives") ) case v if v.startsWith("1.3") => diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index ea63a396da5b8..f617b555e174f 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -150,7 +150,8 @@ def gen_data(N, step): def gen_gs(N, step=1): return shuffle.GroupByKey(gen_data(N, step)) - self.assertEqual(1, len(list(gen_gs(1)))) + self.assertEqual(42, len(list(gen_gs(1)))) + # self.assertEqual(1, len(list(gen_gs(1)))) self.assertEqual(2, len(list(gen_gs(2)))) self.assertEqual(100, len(list(gen_gs(100)))) self.assertEqual(list(range(1, 101)), [k for k, _ in gen_gs(100)]) From 1dada6b5683be3dccb7173d3ed152f5c31614cb0 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Thu, 30 Apr 2015 10:03:18 -0700 Subject: [PATCH 22/52] reverted pyspark test failure --- python/pyspark/tests.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index f617b555e174f..ea63a396da5b8 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -150,8 +150,7 @@ def gen_data(N, step): def gen_gs(N, step=1): return shuffle.GroupByKey(gen_data(N, step)) - self.assertEqual(42, len(list(gen_gs(1)))) - # self.assertEqual(1, len(list(gen_gs(1)))) + self.assertEqual(1, len(list(gen_gs(1)))) self.assertEqual(2, len(list(gen_gs(2)))) self.assertEqual(100, len(list(gen_gs(100)))) self.assertEqual(list(range(1, 101)), [k for k, _ in gen_gs(100)]) From afeb09388840d747e8954539578f9be5d5b219d1 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Thu, 30 Apr 2015 10:05:40 -0700 Subject: [PATCH 23/52] updated to make sparkR test fail --- R/pkg/inst/tests/test_rdd.R | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/test_rdd.R b/R/pkg/inst/tests/test_rdd.R index 03207353c31c6..5363009ba1b97 100644 --- a/R/pkg/inst/tests/test_rdd.R +++ b/R/pkg/inst/tests/test_rdd.R @@ -28,7 +28,8 @@ intPairs <- list(list(1L, -1), list(2L, 100), list(2L, 1), list(1L, 200)) intRdd <- parallelize(sc, intPairs, 2L) test_that("get number of partitions in RDD", { - expect_equal(numPartitions(rdd), 2) + expect_equal(numPartitions(rdd), 42) + # expect_equal(numPartitions(rdd), 2) expect_equal(numPartitions(intRdd), 2) }) From b1ca59375446555d9bfdbb594d2e6364820bcd26 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Fri, 1 May 2015 09:46:06 -0700 Subject: [PATCH 24/52] reverted the sparkR test --- R/pkg/inst/tests/test_rdd.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/pkg/inst/tests/test_rdd.R b/R/pkg/inst/tests/test_rdd.R index 5363009ba1b97..03207353c31c6 100644 --- a/R/pkg/inst/tests/test_rdd.R +++ b/R/pkg/inst/tests/test_rdd.R @@ -28,8 +28,7 @@ intPairs <- list(list(1L, -1), list(2L, 100), list(2L, 1), list(1L, 200)) intRdd <- parallelize(sc, intPairs, 2L) test_that("get number of partitions in RDD", { - expect_equal(numPartitions(rdd), 42) - # expect_equal(numPartitions(rdd), 2) + expect_equal(numPartitions(rdd), 2) expect_equal(numPartitions(intRdd), 2) }) From f950010d585ce9744c79d76dfa8dfbe61079e0a2 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 11 May 2015 16:31:28 -0700 Subject: [PATCH 25/52] removed building hive-0.12.0 per SPARK-6908 --- dev/run-tests.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 973655943c086..f76e3ffff55d2 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -283,27 +283,13 @@ def build_apache_spark(): sbt_maven_profile_args = os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split() hive_profile_args = sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"] - hive_12_profile_args = hive_profile_args + ["-Phive-0.12.0"] # set the default maven args base_mvn_args = ["clean", "package", "-DskipTests"] # set the necessary sbt goals - sbt_hive_12_goals = ["clean", "hive/compile", "hive-thriftserver/compile"] sbt_hive_goals = ["package", "assembly/assembly", "streaming-kafka-assembly/assembly"] - # First build with Hive 0.12.0 to ensure patches do not break the Hive - # 0.12.0 build - print "[info] Compile with Hive 0.12.0" - rm_r("lib_managed") - print "[info] Building Spark with these arguments:", - print " ".join(hive_12_profile_args) - - if AMPLAB_JENKINS_BUILD_TOOL == "maven": - exec_maven(hive_12_profile_args + base_mvn_args) - else: - exec_sbt(hive_12_profile_args + sbt_hive_12_goals) - # Then build with default Hive version (0.13.1) because tests are based on # this version print "[info] Compile with Hive 0.13.1" From f9deba1cb25ae10fc9ea29cf9fe319bae521244c Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 19 May 2015 12:16:07 -0700 Subject: [PATCH 26/52] python to python2 and removed newline --- dev/run-tests.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 3dd09b8714171..0a178e655924d 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 # # Licensed to the Apache Software Foundation (ASF) under one or more @@ -24,8 +24,7 @@ import subprocess from collections import namedtuple -SPARK_PROJ_ROOT = \ - os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") +SPARK_PROJ_ROOT = 
os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") USER_HOME_DIR = os.environ.get("HOME") SBT_MAVEN_PROFILE_ARGS_ENV = "SBT_MAVEN_PROFILES_ARGS" From b1248dc582124114bda5cf4dad9bc76ad229c311 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Thu, 21 May 2015 11:28:27 -0700 Subject: [PATCH 27/52] exec python rather than running python and exiting with return code --- dev/run-tests | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dev/run-tests b/dev/run-tests index b9002ade42160..a00d9f0c27639 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -20,8 +20,4 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -python -u ./dev/run-tests.py - -# exit from this script with the return code from the python script to ensure -# dev/run-tests-jenkins reports the correct error -exit $? +exec python -u ./dev/run-tests.py From 0629de8a81a07bcf78c50360af5354e7f5062fd0 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Fri, 5 Jun 2015 11:20:23 -0700 Subject: [PATCH 28/52] updated to refactor and remove various small bugs, removed pep8 complaints --- dev/run-tests.py | 388 ++++++++++++++++++++++++++--------------------- 1 file changed, 211 insertions(+), 177 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 0a178e655924d..38859da5486f5 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -24,25 +24,21 @@ import subprocess from collections import namedtuple -SPARK_PROJ_ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") -USER_HOME_DIR = os.environ.get("HOME") +SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") +USER_HOME = os.environ.get("HOME") -SBT_MAVEN_PROFILE_ARGS_ENV = "SBT_MAVEN_PROFILES_ARGS" -AMPLAB_JENKINS_BUILD_TOOL = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL") -AMPLAB_JENKINS = os.environ.get("AMPLAB_JENKINS") - -SBT_OUTPUT_FILTER = re.compile("^.*[info].*Resolving" + "|" + - "^.*[warn].*Merging" + "|" + - "^.*[info].*Including") +#SBT_MAVEN_PROFILE_ARGS_ENV = "SBT_MAVEN_PROFILES_ARGS" +#AMPLAB_JENKINS_BUILD_TOOL = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") +#AMPLAB_JENKINS = os.environ.get("AMPLAB_JENKINS") def get_error_codes(err_code_file): """Function to retrieve all block numbers from the `run-tests-codes.sh` - file to maintain backwards compatibility with the `run-tests-jenkins` + file to maintain backwards compatibility with the `run-tests-jenkins` script""" - + with open(err_code_file, 'r') as f: - err_codes = [e.split()[1].strip().split('=') + err_codes = [e.split()[1].strip().split('=') for e in f if e.startswith("readonly")] return dict(err_codes) @@ -63,13 +59,6 @@ def rm_r(path): os.remove(path) -def lineno(): - """Returns the current line number in our program - - from: http://stackoverflow.com/a/3056059""" - - return inspect.currentframe().f_back.f_lineno - - def run_cmd(cmd): """Given a command as a list of arguments will attempt to execute the command and, on failure, print an error message""" @@ -82,32 +71,6 @@ def run_cmd(cmd): exit_from_command_with_retcode(e.cmd, e.returncode) -def set_sbt_maven_profile_args(): - """Properly sets the SBT environment variable arguments with additional - checks to determine if this is running on an Amplab Jenkins machine""" - - # base environment values for SBT_MAVEN_PROFILE_ARGS_ENV which will be appended on - sbt_maven_profile_args_base = ["-Pkinesis-asl"] - - sbt_maven_profile_arg_dict = { - "hadoop1.0" : ["-Phadoop-1", "-Dhadoop.version=1.0.4"], - "hadoop2.0" : ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], - "hadoop2.2" : 
["-Pyarn", "-Phadoop-2.2"], - "hadoop2.3" : ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], - } - - # set the SBT maven build profile argument environment variable and ensure - # we build against the right version of Hadoop - if os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE"): - os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \ - " ".join(sbt_maven_profile_arg_dict.get(ajbp, []) - + sbt_maven_profile_args_base) - else: - os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \ - " ".join(sbt_maven_profile_arg_dict.get("hadoop2.3", []) - + sbt_maven_profile_args_base) - - def is_exe(path): """Check if a given path is an executable file - from: http://stackoverflow.com/a/377028""" @@ -134,7 +97,12 @@ def which(program): def determine_java_executable(): - """Will return the *best* path possible for a 'java' executable or `None`""" + """Will return the path of the java executable that will be used by Spark's + tests or `None`""" + + # Any changes in the way that Spark's build detects java must be reflected + # here. Currently the build looks for $JAVA_HOME/bin/java then falls back to + # the `java` executable on the path java_home = os.environ.get("JAVA_HOME") @@ -144,21 +112,21 @@ def determine_java_executable(): return java_exe if java_exe else which("java") +JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch', 'update']) + + def determine_java_version(java_exe): """Given a valid java executable will return its version in named tuple format with accessors '.major', '.minor', '.patch', '.update'""" - raw_output = subprocess.check_output([java_exe, "-version"], + raw_output = subprocess.check_output([java_exe, "-version"], stderr=subprocess.STDOUT) - raw_version_str = raw_output.split('\n')[0] # eg 'java version "1.8.0_25"' - version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25' - version, update = version_str.split('_') # eg ['1.8.0', '25'] - - JavaVersion = namedtuple('JavaVersion', - ['major', 'minor', 'patch', 'update']) + raw_version_str = raw_output.split('\n')[0] # eg 'java version "1.8.0_25"' + version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25' + version, update = version_str.split('_') # eg ['1.8.0', '25'] # map over the values and convert them to integers - version_info = map(lambda x: int(x), version.split('.') + [update]) + version_info = [int(x) for x in version.split('.') + [update]] return JavaVersion(major=version_info[0], minor=version_info[1], @@ -166,56 +134,6 @@ def determine_java_version(java_exe): update=version_info[3]) -def multi_starts_with(orig_str, *prefixes): - """Takes a string and an abritrary number of prefixes then checks the - original string for any of the possible prefixes passed in""" - - for s in prefixes: - if orig_str.startswith(s): - return True - return False - - -def determine_test_suite(): - """This function current acts to determine if SQL tests need to be run in - addition to the core test suite *or* if _only_ SQL tests need to be run - as the git logs show that to be the only thing touched. In the future - this function will act more generically to help further segregate the - test suite runner (hence the function name). 
- @return a set of unique test names""" - test_suite = list() - - if AMPLAB_JENKINS: - run_cmd(['git', 'fetch', 'origin', 'master:master']) - - raw_output = subprocess.check_output(['git', 'diff', '--name-only', 'master']) - # remove any empty strings - changed_files = [f for f in raw_output.split('\n') if f] - - # find any sql files - sql_files = [f for f in changed_files - if multi_starts_with(f, - "sql/", - "bin/spark-sql", - "sbin/start-thriftserver.sh")] - - non_sql_files = set(changed_files).difference(set(sql_files)) - - if non_sql_files: - test_suite.append("CORE") - if sql_files: - print "[info] Detected changes in SQL. Will run Hive test suite." - test_suite.append("SQL") - if not non_sql_files: - print "[info] Detected no changes except in SQL. Will only run SQL tests." - return set(test_suite) - else: - # we aren't in the Amplab environment so merely run all tests - test_suite.append("CORE") - test_suite.append("SQL") - return set(test_suite) - - def set_title_and_block(title, err_block): os.environ["CURRENT_BLOCK"] = error_codes[err_block] line_str = '=' * 72 @@ -254,6 +172,10 @@ def exec_sbt(sbt_args=[]): sbt_cmd = ["./build/sbt"] + sbt_args + sbt_output_filter = re.compile("^.*[info].*Resolving" + "|" + + "^.*[warn].*Merging" + "|" + + "^.*[info].*Including") + # NOTE: echo "q" is needed because sbt on encountering a build file # with failure (either resolution or compilation) prompts the user for # input either q, r, etc to quit or retry. This echo is there to make it @@ -264,42 +186,90 @@ def exec_sbt(sbt_args=[]): stdout=subprocess.PIPE) echo_proc.wait() for line in iter(sbt_proc.stdout.readline, ''): - if not SBT_OUTPUT_FILTER.match(line): - print line, + if not sbt_output_filter.match(line): + print line, retcode = sbt_proc.wait() if retcode > 0: exit_from_command_with_retcode(sbt_cmd, retcode) -def build_apache_spark(): - """Will first build Spark with Hive v0.12.0 to ensure the build is - successful and, after, will build Spark again against Hive v0.13.1 as the - tests are based off that""" +def get_hadoop_profiles(hadoop_version): + """Return a list of profiles indicating which Hadoop version to use from a Hadoop version tag.""" + + #amplab_jenkins_build_profile = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE") + + sbt_maven_hadoop_profiles = { + "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.0.4"], + "hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], + "hadoop2.2": ["-Pyarn", "-Phadoop-2.2"], + "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], + } + + try: + hadoop_profiles = sbt_maven_hadoop_profiles[hadoop_version] + except KeyError: + print "[error] Could not find", hadoop_version, "in the list. Valid options", + print "are 'hadoop1.0', 'hadoop2.0', 'hadoop2.2', and 'hadoop2.3'." 
+ sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) + + return hadoop_profiles + + +def get_build_profiles(hadoop_version="hadoop2.3", + base_profiles=True, + hive_profiles=False): + """Returns a list of hadoop profiles to be used as looked up from the passed in hadoop profile + key with the option of adding on the base and hive profiles.""" + + base_profiles = ["-Pkinesis-asl"] + hive_profiles = ["-Phive", "-Phive-thriftserver"] + hadoop_profiles = get_hadoop_profiles(hadoop_version) + + # first, check and add the base profiles + if base_profiles: build_profiles = build_profile + base_profiles + # second, check and add the hive profiles + if hive_profiles: build_profiles = build_profile + hive_profiles + + return build_profiles + + +def build_spark_maven(hadoop_version): + build_profiles = get_build_profiles(hadoop_version, hive_profiles=True) + mvn_goals = ["clean", "package", "-DskipTests"] + profiles_and_goals = build_profiles + mvn_goals + + print "[info] Building Spark (w/Hive 0.13.1) with these arguments:", + print " ".join(profiles_and_goals) + + exec_maven(profiles_and_goals) + + +def build_spark_sbt(hadoop_version): + build_profiles = get_build_profiles(hadoop_version, hive_profiles=True) + sbt_goals = ["package", + "assembly/assembly", + "streaming-kafka-assembly/assembly"] + profiles_and_goals = build_profiles + sbt_goals + + print "[info] Building Spark (w/Hive 0.13.1) with these arguments:", + print " ".join(profiles_and_goals) + + exec_sbt(profiles_and_goals) + + +def build_apache_spark(build_tool, hadoop_version): + """Will build Spark against Hive v0.13.1 given the passed in build tool (either `sbt` or + `maven`). Defaults to using `sbt`.""" set_title_and_block("Building Spark", "BLOCK_BUILD") - sbt_maven_profile_args = os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split() - hive_profile_args = sbt_maven_profile_args + ["-Phive", - "-Phive-thriftserver"] - # set the default maven args - base_mvn_args = ["clean", "package", "-DskipTests"] - # set the necessary sbt goals - sbt_hive_goals = ["package", - "assembly/assembly", - "streaming-kafka-assembly/assembly"] - - # Then build with default Hive version (0.13.1) because tests are based on - # this version - print "[info] Compile with Hive 0.13.1" rm_r("lib_managed") - print "[info] Building Spark with these arguments:", - print " ".join(hive_profile_args) - if AMPLAB_JENKINS_BUILD_TOOL == "maven": - exec_maven(hive_profile_args + base_mvn_args) + if build_tool == "maven": + build_spark_maven(hadoop_version) else: - exec_sbt(hive_profile_args + sbt_hive_goals) + build_spark_sbt(hadoop_version) def detect_binary_inop_with_mima(): @@ -308,49 +278,98 @@ def detect_binary_inop_with_mima(): run_cmd(["./dev/mima"]) -def run_scala_tests(test_suite=[]): - """Function to properly execute all tests pass in, as a list, from the - `determine_test_suite` function""" - set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") +def determine_test_modules(test_env): + """This function current acts to determine if SQL tests need to be run in + addition to the core test suite *or* if _only_ SQL tests need to be run + as the git logs show that to be the only thing touched. In the future + this function will act more generically to help further segregate the + test suite runner (hence the function name). 
+ @return a set of unique test names""" + test_suite = list() - # ensure the test_suite is a set - if not isinstance(test_suite, set): - test_suite = set(test_suite) + if test_env == "amplab_jenkins": + target_branch = os.environ.get("ghprbTargetBranch") + run_cmd(['git', 'fetch', 'origin', target_branch+":"+target_branch]) - # if the Spark SQL tests are enabled, run the tests with the Hive profiles - # enabled. - if "SQL" in test_suite: - sbt_maven_profile_args = \ - os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split() - os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \ - " ".join(sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"]) - - # if we only have changes in SQL build a custom test string - if "SQL" in test_suite and "CORE" not in test_suite: - sbt_maven_test_args = ["catalyst/test", - "sql/test", - "hive/test", - "hive-thriftserver/test", - "mllib/test"] + raw_output = subprocess.check_output(['git', 'diff', '--name-only', target_branch]) + # remove any empty strings + changed_files = [f for f in raw_output.split('\n') if f] + + # find any sql files + sql_files = [f for f in changed_files + if any(f.startswith(p) for p in ["sql/", + "bin/spark-sql", + "sbin/start-thriftserver.sh"])] + + non_sql_files = set(changed_files).difference(set(sql_files)) + + if non_sql_files: + test_suite.append("CORE") + if sql_files: + print "[info] Detected changes in SQL. Will run Hive test suite." + test_suite.append("SQL") + if not non_sql_files: + print "[info] Detected no changes except in SQL. Will only run SQL tests." + return set(test_suite) else: - sbt_maven_test_args = ["test"] + # we aren't in the Amplab environment so simply run all tests + test_suite.append("CORE") + test_suite.append("SQL") + return set(test_suite) + + +def run_scala_tests_maven(test_profiles): + mvn_test_goals = ["test", "--fail-at-end"] + profiles_and_goals = test_profiles + mvn_test_goals + + print "[info] Running Spark tests with these arguments:", + print " ".join(profiles_and_goals) + + exec_maven(profiles_and_goals) - # get the latest sbt maven profile arguments - sbt_maven_profile_args = os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split() + +def run_scala_tests_sbt(test_modules, test_profiles): + # if we only have changes in SQL build a custom test list + if "SQL" in test_modules and "CORE" not in test_modules: + sbt_test_goals = ["catalyst/test", + "sql/test", + "hive/test", + "hive-thriftserver/test", + "mllib/test"] + else: + sbt_test_goals = ["test"] + + profiles_and_goals = test_profiles + sbt_test_goals print "[info] Running Spark tests with these arguments:", - print " ".join(sbt_maven_profile_args), - print " ".join(sbt_maven_test_args) + print " ".join(profiles_and_goals) + + exec_sbt(profiles_and_goals) + + +def run_scala_tests(build_tool, hadoop_version, test_modules): + """Function to properly execute all tests passed in as a set from the + `determine_test_suites` function""" + set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") + + test_modules = set(test_modules) - if AMPLAB_JENKINS_BUILD_TOOL == "maven": - exec_maven(["test"] + sbt_maven_profile_args + ["--fail-at-end"]) + # if the Spark SQL tests are enabled, run the tests with the Hive profiles + # enabled. 
+ if "SQL" in test_modules: + test_profiles = get_build_profiles(hadoop_version, hive_profiles=True) + else: + test_profiles = get_build_profiles(hadoop_version) + + if build_tool == "maven": + run_scala_tests_maven(test_profiles) else: - exec_sbt(sbt_maven_profile_args + sbt_maven_test_args) + run_scala_tests_sbt(test_modules, test_profiles) -def run_python_tests(test_suite=[]): +def run_python_tests(): set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") - + # Add path for Python3 in Jenkins if we're calling from a Jenkins machine if AMPLAB_JENKINS: os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" @@ -358,7 +377,7 @@ def run_python_tests(test_suite=[]): run_cmd(["./python/run-tests"]) -def run_sparkr_tests(test_suite=[]): +def run_sparkr_tests(): set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") if which("R"): @@ -367,29 +386,28 @@ def run_sparkr_tests(test_suite=[]): else: print "Ignoring SparkR tests as R was not found in PATH" -if __name__ == "__main__": + +def main(): # Ensure the user home directory (HOME) is valid and is an absolute directory - if not USER_HOME_DIR or not os.path.isabs(USER_HOME_DIR): + if not USER_HOME or not os.path.isabs(USER_HOME): print "[error] Cannot determine your home directory as an absolute path;", print "ensure the $HOME environment variable is set properly." sys.exit(1) - os.chdir(SPARK_PROJ_ROOT) + os.chdir(SPARK_HOME) - rm_r("./work") - rm_r(os.path.join(USER_HOME_DIR, ".ivy2/local/org.apache.spark")) - rm_r(os.path.join(USER_HOME_DIR, ".ivy2/cache/org.apache.spark")) + rm_r(os.path.join(SPARK_HOME, "work")) + rm_r(os.path.join(USER_HOME, ".ivy2/local/org.apache.spark")) + rm_r(os.path.join(USER_HOME, ".ivy2/cache/org.apache.spark")) error_codes = get_error_codes("./dev/run-tests-codes.sh") os.environ["CURRENT_BLOCK"] = error_codes["BLOCK_GENERAL"] - set_sbt_maven_profile_args() - java_exe = determine_java_executable() if not java_exe: - print "[error] Cannot find a version of `java` on the system; please", + print "[error] Cannot find a version of `java` on the system; please", print "install one and retry." sys.exit(2) @@ -398,20 +416,36 @@ def run_sparkr_tests(test_suite=[]): if java_version.minor < 8: print "[warn] Java 8 tests will not run because JDK version is < 1.8." 
- test_suite = determine_test_suite() + if os.environ.get("AMPLAB_JENKINS"): + # if we're on the Amplab Jenkins build servers setup variables + # to reflect the environment settings + build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") + hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") + test_env="amplab_jenkins" + else: + # else we're running locally and can use local settings + build_tool = "sbt" + hadoop_version = "hadoop2.3" + test_env="local" + # license checks run_apache_rat_checks() - - run_scala_style_checks() + # style checks + run_scala_style_checks() run_python_style_checks() - build_apache_spark() + # spark build + build_apache_spark(build_tool, hadoop_version) + # backwards compatibility checks detect_binary_inop_with_mima() - run_scala_tests(test_suite) - + # test suites + test_modules = determine_test_modules(test_env) + run_scala_tests(build_tool, hadoop_version, test_modules) run_python_tests() - run_sparkr_tests() + +if __name__ == "__main__": + main() From 8afbe9319837ede3f41dd250f11aeaaa1d8dacf8 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Fri, 5 Jun 2015 11:25:35 -0700 Subject: [PATCH 29/52] made error codes a global --- dev/run-tests.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 38859da5486f5..acc8db866f8a8 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -27,10 +27,6 @@ SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") USER_HOME = os.environ.get("HOME") -#SBT_MAVEN_PROFILE_ARGS_ENV = "SBT_MAVEN_PROFILES_ARGS" -#AMPLAB_JENKINS_BUILD_TOOL = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") -#AMPLAB_JENKINS = os.environ.get("AMPLAB_JENKINS") - def get_error_codes(err_code_file): """Function to retrieve all block numbers from the `run-tests-codes.sh` @@ -43,6 +39,9 @@ def get_error_codes(err_code_file): return dict(err_codes) +ERROR_CODES = get_error_codes(os.path.join(SPARK_HOME, "dev/run-tests-codes.sh")) + + def exit_from_command_with_retcode(cmd, retcode): print "[error] running", cmd, "; received return code", retcode sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) @@ -135,7 +134,7 @@ def determine_java_version(java_exe): def set_title_and_block(title, err_block): - os.environ["CURRENT_BLOCK"] = error_codes[err_block] + os.environ["CURRENT_BLOCK"] = ERROR_CODES[err_block] line_str = '=' * 72 print @@ -400,9 +399,7 @@ def main(): rm_r(os.path.join(USER_HOME, ".ivy2/local/org.apache.spark")) rm_r(os.path.join(USER_HOME, ".ivy2/cache/org.apache.spark")) - error_codes = get_error_codes("./dev/run-tests-codes.sh") - - os.environ["CURRENT_BLOCK"] = error_codes["BLOCK_GENERAL"] + os.environ["CURRENT_BLOCK"] = ERROR_CODES["BLOCK_GENERAL"] java_exe = determine_java_executable() From 1f607b1a4024721d0775e6010f9589771aa593af Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 9 Jun 2015 16:09:02 -0700 Subject: [PATCH 30/52] finalizing revisions to modular tests --- dev/run-tests.py | 112 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 85 insertions(+), 27 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index acc8db866f8a8..94d2ad1e563bf 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -60,11 +60,14 @@ def rm_r(path): def run_cmd(cmd): """Given a command as a list of arguments will attempt to execute the - command and, on failure, print an error message""" + command from the determined SPARK_HOME directory and, on failure, print + an error message""" if not isinstance(cmd, 
list): cmd = cmd.split() try: + # prepend SPARK_HOME onto the first element of the command + cmd[0] = os.path.join(SPARK_HOME, *filter(lambda x: x, cmd[0].split(os.path.sep))) subprocess.check_call(cmd) except subprocess.CalledProcessError as e: exit_from_command_with_retcode(e.cmd, e.returncode) @@ -194,9 +197,8 @@ def exec_sbt(sbt_args=[]): def get_hadoop_profiles(hadoop_version): - """Return a list of profiles indicating which Hadoop version to use from a Hadoop version tag.""" - - #amplab_jenkins_build_profile = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE") + """Return a list of profiles indicating which Hadoop version to use from + a Hadoop version tag.""" sbt_maven_hadoop_profiles = { "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.0.4"], @@ -224,11 +226,14 @@ def get_build_profiles(hadoop_version="hadoop2.3", base_profiles = ["-Pkinesis-asl"] hive_profiles = ["-Phive", "-Phive-thriftserver"] hadoop_profiles = get_hadoop_profiles(hadoop_version) - + + build_profiles = hadoop_profiles # first, check and add the base profiles - if base_profiles: build_profiles = build_profile + base_profiles + if base_profiles: + build_profiles = build_profiles + base_profiles # second, check and add the hive profiles - if hive_profiles: build_profiles = build_profile + hive_profiles + if hive_profiles: + build_profiles = build_profiles + hive_profiles return build_profiles @@ -238,7 +243,7 @@ def build_spark_maven(hadoop_version): mvn_goals = ["clean", "package", "-DskipTests"] profiles_and_goals = build_profiles + mvn_goals - print "[info] Building Spark (w/Hive 0.13.1) with these arguments:", + print "[info] Building Spark (w/Hive 0.13.1) using Maven with these arguments:", print " ".join(profiles_and_goals) exec_maven(profiles_and_goals) @@ -251,7 +256,7 @@ def build_spark_sbt(hadoop_version): "streaming-kafka-assembly/assembly"] profiles_and_goals = build_profiles + sbt_goals - print "[info] Building Spark (w/Hive 0.13.1) with these arguments:", + print "[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments:", print " ".join(profiles_and_goals) exec_sbt(profiles_and_goals) @@ -296,9 +301,31 @@ def determine_test_modules(test_env): # find any sql files sql_files = [f for f in changed_files - if any(f.startswith(p) for p in ["sql/", - "bin/spark-sql", - "sbin/start-thriftserver.sh"])] + if any(f.startswith(p) for p in + ["sql/", + "bin/spark-sql", + "sbin/start-thriftserver.sh", + "examples/src/main/java/org/apache/spark/examples/sql/", + "examples/src/main/scala/org/apache/spark/examples/sql/"])] + mllib_files = [f for f in changed_files + if any(f.startswith(p) for p in + ["examples/src/main/java/org/apache/spark/examples/mllib/", + "examples/src/main/scala/org/apache/spark/examples/mllib", + "data/mllib/", + "mllib/"])] + streaming_files = [f for f in changed_files + if any(f.startswith(p) for p in + ["examples/scala-2.10/", + "examples/src/main/java/org/apache/spark/examples/streaming/", + "examples/src/main/scala/org/apache/spark/examples/streaming/", + "external/", + "extras/java8-tests/", + "extras/kinesis-asl/", + "streaming/"])] + graphx_files = [f for f in changed_files + if any(f.startswith(p) for p in + ["examples/src/main/scala/org/apache/spark/examples/graphx/", + "graphx/"])] non_sql_files = set(changed_files).difference(set(sql_files)) @@ -309,11 +336,20 @@ def determine_test_modules(test_env): test_suite.append("SQL") if not non_sql_files: print "[info] Detected no changes except in SQL. Will only run SQL tests." 
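# A condensed sketch of the prefix-matching idea this hunk applies: changed
# file paths are bucketed into test modules by their leading directory. The
# file names below are invented examples; the prefixes are a subset of the
# ones listed in the patch, and the CORE / "run everything" fallback handled
# elsewhere in this hunk is omitted.
module_prefixes = {
    "SQL": ["sql/", "bin/spark-sql", "sbin/start-thriftserver.sh"],
    "MLLIB": ["mllib/", "data/mllib/"],
    "STREAMING": ["streaming/", "external/"],
    "GRAPHX": ["graphx/"],
}

changed_files = ["mllib/src/main/scala/Foo.scala", "docs/index.md"]  # invented

detected = set(module
               for f in changed_files
               for module, prefixes in module_prefixes.items()
               if any(f.startswith(p) for p in prefixes))

print detected  # set(['MLLIB'])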
+ if mllib_files: + print "[info] Detected changes in MLlib. Will run MLlib test suite." + test_suite.append("MLLIB") + if streaming_files: + print "[info] Detected changes in Streaming. Will run Streaming test suite." + test_suite.append("STREAMING") + if graphx_files: + print "[info] Detected changes in GraphX. Will run GraphX test suite." + test_suite.append("GRAPHX") + return set(test_suite) else: # we aren't in the Amplab environment so simply run all tests - test_suite.append("CORE") - test_suite.append("SQL") + test_suite.append("ALL") return set(test_suite) @@ -321,26 +357,45 @@ def run_scala_tests_maven(test_profiles): mvn_test_goals = ["test", "--fail-at-end"] profiles_and_goals = test_profiles + mvn_test_goals - print "[info] Running Spark tests with these arguments:", + print "[info] Running Spark tests using Maven with these arguments:", print " ".join(profiles_and_goals) exec_maven(profiles_and_goals) def run_scala_tests_sbt(test_modules, test_profiles): - # if we only have changes in SQL build a custom test list - if "SQL" in test_modules and "CORE" not in test_modules: - sbt_test_goals = ["catalyst/test", - "sql/test", - "hive/test", - "hive-thriftserver/test", - "mllib/test"] - else: + if "ALL" in test_modules: sbt_test_goals = ["test"] + else: + # if we only have changes in SQL build a custom test list + if "SQL" in test_modules and "CORE" not in test_modules: + sbt_test_goals = ["catalyst/test", + "sql/test", + "hive/test", + "hive-thriftserver/test", + "mllib/test", + "examples/test"] + if "MLLIB" in test_modules and "CORE" not in test_modules: + sbt_test_goals = sbt_test_goals + ["mllib/test", + "examples/test"] + if "STREAMING" in test_modules and "CORE" not in test_modules: + sbt_test_goals = sbt_test_goals + ["streaming/test", + "streaming-flume/test", + "streaming-flume-sink/test", + "streaming-kafka/test", + "streaming-mqtt/test", + "streaming-twitter/test", + "streaming-zeromq/test", + "examples/test"] + if "GRAPHX" in test_modules and "CORE" not in test_modules: + sbt_test_goals = sbt_test_goals + ["graphx/test", + "examples/test"] + if not sbt_test_goals: + sbt_test_goals = ["test"] profiles_and_goals = test_profiles + sbt_test_goals - print "[info] Running Spark tests with these arguments:", + print "[info] Running Spark tests using SBT with these arguments:", print " ".join(profiles_and_goals) exec_sbt(profiles_and_goals) @@ -393,7 +448,7 @@ def main(): print "ensure the $HOME environment variable is set properly." 
sys.exit(1) - os.chdir(SPARK_HOME) + #os.chdir(SPARK_HOME) rm_r(os.path.join(SPARK_HOME, "work")) rm_r(os.path.join(USER_HOME, ".ivy2/local/org.apache.spark")) @@ -418,12 +473,15 @@ def main(): # to reflect the environment settings build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") - test_env="amplab_jenkins" + test_env = "amplab_jenkins" else: # else we're running locally and can use local settings build_tool = "sbt" hadoop_version = "hadoop2.3" - test_env="local" + test_env = "local" + + print "[info] Using build tool", build_tool, "with profile", hadoop_version, + print "under environment", test_env # license checks run_apache_rat_checks() From 2fcdfc0a9d3a12a20402321939de37e8efe4145d Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 9 Jun 2015 17:34:42 -0700 Subject: [PATCH 31/52] testing targte branch dump on jenkins --- dev/run-tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 94d2ad1e563bf..c7ab692ca53ad 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -293,7 +293,8 @@ def determine_test_modules(test_env): if test_env == "amplab_jenkins": target_branch = os.environ.get("ghprbTargetBranch") - run_cmd(['git', 'fetch', 'origin', target_branch+":"+target_branch]) + print "target_branch at", target_branch + run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) raw_output = subprocess.check_output(['git', 'diff', '--name-only', target_branch]) # remove any empty strings From db7ae6f74a46b71488ba81249dd5198bf4bd6606 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 9 Jun 2015 18:06:14 -0700 Subject: [PATCH 32/52] reverted SPARK_HOME from start of command --- dev/run-tests.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index c7ab692ca53ad..08e20bddb083e 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -66,8 +66,6 @@ def run_cmd(cmd): if not isinstance(cmd, list): cmd = cmd.split() try: - # prepend SPARK_HOME onto the first element of the command - cmd[0] = os.path.join(SPARK_HOME, *filter(lambda x: x, cmd[0].split(os.path.sep))) subprocess.check_call(cmd) except subprocess.CalledProcessError as e: exit_from_command_with_retcode(e.cmd, e.returncode) @@ -293,7 +291,7 @@ def determine_test_modules(test_env): if test_env == "amplab_jenkins": target_branch = os.environ.get("ghprbTargetBranch") - print "target_branch at", target_branch + run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) raw_output = subprocess.check_output(['git', 'diff', '--name-only', target_branch]) @@ -449,7 +447,7 @@ def main(): print "ensure the $HOME environment variable is set properly." 
sys.exit(1) - #os.chdir(SPARK_HOME) + os.chdir(SPARK_HOME) rm_r(os.path.join(SPARK_HOME, "work")) rm_r(os.path.join(USER_HOME, ".ivy2/local/org.apache.spark")) From eb684b633d0a06f6a290d6c7d3b383b3f9636799 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 10 Jun 2015 06:49:27 -0700 Subject: [PATCH 33/52] fixed sbt_test_goals reference error --- dev/run-tests.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev/run-tests.py b/dev/run-tests.py index 08e20bddb083e..c510620e68705 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -363,6 +363,9 @@ def run_scala_tests_maven(test_profiles): def run_scala_tests_sbt(test_modules, test_profiles): + # declare the variable for reference + sbt_test_goals = None + if "ALL" in test_modules: sbt_test_goals = ["test"] else: From 289871704c29c2e7b712748c19cf1112e9333d88 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 10 Jun 2015 07:15:00 -0700 Subject: [PATCH 34/52] added a change to streaming test to check if it only runs streaming tests --- .../scala/org/apache/spark/streaming/StreamingContext.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 9cd9684d36404..7c895633b3aa3 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - +// TODO: ADDING CHANGE TO TEST run-tests SCRIPT package org.apache.spark.streaming import java.io.{InputStream, NotSerializableException} From 7d2f5e28beb3cc20fe39d1d61443fcdd69fe632b Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 10 Jun 2015 07:17:27 -0700 Subject: [PATCH 35/52] updated python tests to remove unused variable --- dev/run-tests.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index c510620e68705..7c63a7870b437 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -426,10 +426,6 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): def run_python_tests(): set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") - # Add path for Python3 in Jenkins if we're calling from a Jenkins machine - if AMPLAB_JENKINS: - os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" - run_cmd(["./python/run-tests"]) @@ -476,6 +472,8 @@ def main(): build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") test_env = "amplab_jenkins" + # add path for Python3 in Jenkins if we're calling from a Jenkins machine + os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" else: # else we're running locally and can use local settings build_tool = "sbt" From 60b3d51cbf179517a7f78c8e1af97e2f75f853e0 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 10 Jun 2015 07:18:26 -0700 Subject: [PATCH 36/52] prepend rather than append onto PATH --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 7c63a7870b437..cb72eb09f8951 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -473,7 +473,7 @@ def main(): hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") test_env = "amplab_jenkins" # add path for 
Python3 in Jenkins if we're calling from a Jenkins machine - os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" + os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:"+os.environ.get("PATH") else: # else we're running locally and can use local settings build_tool = "sbt" From 705d12e5c8ec4c541421341329f682fc29214c46 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 10 Jun 2015 09:20:30 -0700 Subject: [PATCH 37/52] changed example to comply with pep3113 supporting python3 --- examples/src/main/python/kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/python/kmeans.py b/examples/src/main/python/kmeans.py index 1456c87312841..f204458d33f72 100755 --- a/examples/src/main/python/kmeans.py +++ b/examples/src/main/python/kmeans.py @@ -68,7 +68,7 @@ def closestPoint(p, centers): closest = data.map( lambda p: (closestPoint(p, kPoints), (p, 1))) pointStats = closest.reduceByKey( - lambda (p1, c1), (p2, c2): (p1 + p2, c1 + c2)) + lambda (p1_c1, p2_c2): (p1_c1[0] + p2_c2[0], p1_c1[1] + p2_c2[1])) newPoints = pointStats.map( lambda st: (st[0], st[1][0] / st[1][1])).collect() From 03fdd7b1d2f5d0f04c85514f841fd5aa183c795d Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 10 Jun 2015 09:26:00 -0700 Subject: [PATCH 38/52] fixed the tuple () wraps around example lambda --- examples/src/main/python/kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/python/kmeans.py b/examples/src/main/python/kmeans.py index f204458d33f72..0ea7cfb7025a0 100755 --- a/examples/src/main/python/kmeans.py +++ b/examples/src/main/python/kmeans.py @@ -68,7 +68,7 @@ def closestPoint(p, centers): closest = data.map( lambda p: (closestPoint(p, kPoints), (p, 1))) pointStats = closest.reduceByKey( - lambda (p1_c1, p2_c2): (p1_c1[0] + p2_c2[0], p1_c1[1] + p2_c2[1])) + lambda p1_c1, p2_c2: (p1_c1[0] + p2_c2[0], p1_c1[1] + p2_c2[1])) newPoints = pointStats.map( lambda st: (st[0], st[1][0] / st[1][1])).collect() From b7c72b9cbae34c71478dca06f02184f6b317f58b Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 10 Jun 2015 11:47:45 -0700 Subject: [PATCH 39/52] reverting streaming context --- .../scala/org/apache/spark/streaming/StreamingContext.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 7c895633b3aa3..9cd9684d36404 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -// TODO: ADDING CHANGE TO TEST run-tests SCRIPT + package org.apache.spark.streaming import java.io.{InputStream, NotSerializableException} From ec1ae789bef6a308db3cfd1b5d609c8cf02f19d9 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Sat, 13 Jun 2015 22:47:33 -0700 Subject: [PATCH 40/52] minor name changes, bug fixes --- dev/run-tests.py | 100 ++++++++++++++++++++++------------------------- 1 file changed, 46 insertions(+), 54 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index cb72eb09f8951..15c35886fdf5f 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -205,19 +205,17 @@ def get_hadoop_profiles(hadoop_version): "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], } - try: - hadoop_profiles = sbt_maven_hadoop_profiles[hadoop_version] - except KeyError: + if hadoop_version in sbt_maven_hadoop_profiles: + return sbt_maven_hadoop_profiles[hadoop_version] + else: print "[error] Could not find", hadoop_version, "in the list. Valid options", - print "are 'hadoop1.0', 'hadoop2.0', 'hadoop2.2', and 'hadoop2.3'." + print "are", sbt_maven_hadoop_profiles.keys() sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - return hadoop_profiles - def get_build_profiles(hadoop_version="hadoop2.3", - base_profiles=True, - hive_profiles=False): + enable_base_profiles=True, + enable_hive_profiles=False): """Returns a list of hadoop profiles to be used as looked up from the passed in hadoop profile key with the option of adding on the base and hive profiles.""" @@ -226,18 +224,19 @@ def get_build_profiles(hadoop_version="hadoop2.3", hadoop_profiles = get_hadoop_profiles(hadoop_version) build_profiles = hadoop_profiles - # first, check and add the base profiles - if base_profiles: + + if enable_base_profiles: build_profiles = build_profiles + base_profiles - # second, check and add the hive profiles - if hive_profiles: + + if enable_hive_profiles: build_profiles = build_profiles + hive_profiles return build_profiles def build_spark_maven(hadoop_version): - build_profiles = get_build_profiles(hadoop_version, hive_profiles=True) + # we always build with Hive support even if we skip Hive tests in most builds + build_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=True) mvn_goals = ["clean", "package", "-DskipTests"] profiles_and_goals = build_profiles + mvn_goals @@ -248,7 +247,7 @@ def build_spark_maven(hadoop_version): def build_spark_sbt(hadoop_version): - build_profiles = get_build_profiles(hadoop_version, hive_profiles=True) + build_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=True) sbt_goals = ["package", "assembly/assembly", "streaming-kafka-assembly/assembly"] @@ -275,22 +274,20 @@ def build_apache_spark(build_tool, hadoop_version): def detect_binary_inop_with_mima(): - set_title_and_block("Detecting binary incompatibilities with MiMa", - "BLOCK_MIMA") + set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA") run_cmd(["./dev/mima"]) -def determine_test_modules(test_env): - """This function current acts to determine if SQL tests need to be run in - addition to the core test suite *or* if _only_ SQL tests need to be run - as the git logs show that to be the only thing touched. In the future - this function will act more generically to help further segregate the - test suite runner (hence the function name). 
- @return a set of unique test names""" - test_suite = list() +def identify_changed_modules(test_env): + """Given the passed in environment will determine the changed modules and + return them as a set. If the environment is local, will simply run all tests. + If run under the `amplab_jenkins` environment will determine the changed files + as compared to the `ghprbTargetBranch` and execute the necessary set of tests + to provide coverage for the changed code.""" + test_suite = set() if test_env == "amplab_jenkins": - target_branch = os.environ.get("ghprbTargetBranch") + target_branch = os.environ["ghprbTargetBranch"] run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) @@ -329,27 +326,27 @@ def determine_test_modules(test_env): non_sql_files = set(changed_files).difference(set(sql_files)) if non_sql_files: - test_suite.append("CORE") + test_suite.add("CORE") if sql_files: print "[info] Detected changes in SQL. Will run Hive test suite." - test_suite.append("SQL") + test_suite.add("SQL") if not non_sql_files: print "[info] Detected no changes except in SQL. Will only run SQL tests." if mllib_files: print "[info] Detected changes in MLlib. Will run MLlib test suite." - test_suite.append("MLLIB") + test_suite.add("MLLIB") if streaming_files: print "[info] Detected changes in Streaming. Will run Streaming test suite." - test_suite.append("STREAMING") + test_suite.add("STREAMING") if graphx_files: print "[info] Detected changes in GraphX. Will run GraphX test suite." - test_suite.append("GRAPHX") + test_suite.add("GRAPHX") - return set(test_suite) + return test_suite else: # we aren't in the Amplab environment so simply run all tests - test_suite.append("ALL") - return set(test_suite) + test_suite.add("ALL") + return test_suite def run_scala_tests_maven(test_profiles): @@ -369,7 +366,8 @@ def run_scala_tests_sbt(test_modules, test_profiles): if "ALL" in test_modules: sbt_test_goals = ["test"] else: - # if we only have changes in SQL build a custom test list + # if we only have changes in SQL, MLlib, Streaming, or GraphX then build + # a custom test list if "SQL" in test_modules and "CORE" not in test_modules: sbt_test_goals = ["catalyst/test", "sql/test", @@ -378,20 +376,18 @@ def run_scala_tests_sbt(test_modules, test_profiles): "mllib/test", "examples/test"] if "MLLIB" in test_modules and "CORE" not in test_modules: - sbt_test_goals = sbt_test_goals + ["mllib/test", - "examples/test"] + sbt_test_goals += ["mllib/test", "examples/test"] if "STREAMING" in test_modules and "CORE" not in test_modules: - sbt_test_goals = sbt_test_goals + ["streaming/test", - "streaming-flume/test", - "streaming-flume-sink/test", - "streaming-kafka/test", - "streaming-mqtt/test", - "streaming-twitter/test", - "streaming-zeromq/test", - "examples/test"] + sbt_test_goals += ["streaming/test", + "streaming-flume/test", + "streaming-flume-sink/test", + "streaming-kafka/test", + "streaming-mqtt/test", + "streaming-twitter/test", + "streaming-zeromq/test", + "examples/test"] if "GRAPHX" in test_modules and "CORE" not in test_modules: - sbt_test_goals = sbt_test_goals + ["graphx/test", - "examples/test"] + sbt_test_goals += ["graphx/test", "examples/test"] if not sbt_test_goals: sbt_test_goals = ["test"] @@ -410,12 +406,8 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): test_modules = set(test_modules) - # if the Spark SQL tests are enabled, run the tests with the Hive profiles - # enabled. 
- if "SQL" in test_modules: - test_profiles = get_build_profiles(hadoop_version, hive_profiles=True) - else: - test_profiles = get_build_profiles(hadoop_version) + hive_profiles = ("SQL" in test_modules) + test_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=hive_profiles) if build_tool == "maven": run_scala_tests_maven(test_profiles) @@ -473,7 +465,7 @@ def main(): hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") test_env = "amplab_jenkins" # add path for Python3 in Jenkins if we're calling from a Jenkins machine - os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:"+os.environ.get("PATH") + os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH") else: # else we're running locally and can use local settings build_tool = "sbt" @@ -497,7 +489,7 @@ def main(): detect_binary_inop_with_mima() # test suites - test_modules = determine_test_modules(test_env) + test_modules = identify_changed_modules(test_env) run_scala_tests(build_tool, hadoop_version, test_modules) run_python_tests() run_sparkr_tests() From aa03d9e1adb43d927cb3d00520936b90340427d9 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 15 Jun 2015 11:28:14 -0700 Subject: [PATCH 41/52] added documentation builds as a top level test component, altered high level project changes to properly execute core tests only when necessary, changed variable names for simplicity --- dev/run-tests-codes.sh | 11 ++++--- dev/run-tests-jenkins | 2 ++ dev/run-tests.py | 71 ++++++++++++++++++++++++++++-------------- docs/configuration.md | 1 + 4 files changed, 56 insertions(+), 29 deletions(-) diff --git a/dev/run-tests-codes.sh b/dev/run-tests-codes.sh index 154e01255b2ef..f4b238e1b78a7 100644 --- a/dev/run-tests-codes.sh +++ b/dev/run-tests-codes.sh @@ -21,8 +21,9 @@ readonly BLOCK_GENERAL=10 readonly BLOCK_RAT=11 readonly BLOCK_SCALA_STYLE=12 readonly BLOCK_PYTHON_STYLE=13 -readonly BLOCK_BUILD=14 -readonly BLOCK_MIMA=15 -readonly BLOCK_SPARK_UNIT_TESTS=16 -readonly BLOCK_PYSPARK_UNIT_TESTS=17 -readonly BLOCK_SPARKR_UNIT_TESTS=18 +readonly BLOCK_DOCUMENTATION=14 +readonly BLOCK_BUILD=15 +readonly BLOCK_MIMA=16 +readonly BLOCK_SPARK_UNIT_TESTS=17 +readonly BLOCK_PYSPARK_UNIT_TESTS=18 +readonly BLOCK_SPARKR_UNIT_TESTS=19 diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins index 641b0ff3c4be4..c4d39d95d5890 100755 --- a/dev/run-tests-jenkins +++ b/dev/run-tests-jenkins @@ -210,6 +210,8 @@ done failing_test="Scala style tests" elif [ "$test_result" -eq "$BLOCK_PYTHON_STYLE" ]; then failing_test="Python style tests" + elif [ "$test_result" -eq "$BLOCK_DOCUMENTATION" ]; then + failing_test="to generate documentation" elif [ "$test_result" -eq "$BLOCK_BUILD" ]; then failing_test="to build" elif [ "$test_result" -eq "$BLOCK_MIMA" ]; then diff --git a/dev/run-tests.py b/dev/run-tests.py index 15c35886fdf5f..f01c24404b2a7 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -159,6 +159,11 @@ def run_python_style_checks(): run_cmd(["./dev/lint-python"]) +def build_spark_documentation(): + set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION") + os.environ["PRODUCTION"] = "1 jekyll build" + + def exec_maven(mvn_args=[]): """Will call Maven in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" @@ -215,21 +220,26 @@ def get_hadoop_profiles(hadoop_version): def get_build_profiles(hadoop_version="hadoop2.3", enable_base_profiles=True, - enable_hive_profiles=False): + enable_hive_profiles=False, + 
enable_doc_profiles=False): """Returns a list of hadoop profiles to be used as looked up from the passed in hadoop profile key with the option of adding on the base and hive profiles.""" base_profiles = ["-Pkinesis-asl"] hive_profiles = ["-Phive", "-Phive-thriftserver"] + doc_profiles = [] hadoop_profiles = get_hadoop_profiles(hadoop_version) build_profiles = hadoop_profiles if enable_base_profiles: - build_profiles = build_profiles + base_profiles + build_profiles += base_profiles if enable_hive_profiles: - build_profiles = build_profiles + hive_profiles + build_profiles += hive_profiles + + if enable_doc_profiles: + build_profiles += doc_profiles return build_profiles @@ -259,7 +269,7 @@ def build_spark_sbt(hadoop_version): exec_sbt(profiles_and_goals) -def build_apache_spark(build_tool, hadoop_version): +def build_apache_spark(build_tool, hadoop_version, changed_modules): """Will build Spark against Hive v0.13.1 given the passed in build tool (either `sbt` or `maven`). Defaults to using `sbt`.""" @@ -284,7 +294,7 @@ def identify_changed_modules(test_env): If run under the `amplab_jenkins` environment will determine the changed files as compared to the `ghprbTargetBranch` and execute the necessary set of tests to provide coverage for the changed code.""" - test_suite = set() + changed_modules = set() if test_env == "amplab_jenkins": target_branch = os.environ["ghprbTargetBranch"] @@ -295,7 +305,6 @@ def identify_changed_modules(test_env): # remove any empty strings changed_files = [f for f in raw_output.split('\n') if f] - # find any sql files sql_files = [f for f in changed_files if any(f.startswith(p) for p in ["sql/", @@ -322,31 +331,39 @@ def identify_changed_modules(test_env): if any(f.startswith(p) for p in ["examples/src/main/scala/org/apache/spark/examples/graphx/", "graphx/"])] - - non_sql_files = set(changed_files).difference(set(sql_files)) - - if non_sql_files: - test_suite.add("CORE") + doc_files = [f for f in changed_files if f.startswith("docs/")] + + # union together all changed top level project files + top_level_project_files = set().union([set(f) for f in [sql_files, + mllib_files, + streaming_files, + graphx_files, + doc_files]]) + changed_core_files = set(changed_files).difference(top_level_project_files) + + if changed_core_files: + changed_modules.add("CORE") if sql_files: print "[info] Detected changes in SQL. Will run Hive test suite." - test_suite.add("SQL") - if not non_sql_files: - print "[info] Detected no changes except in SQL. Will only run SQL tests." + changed_modules.add("SQL") if mllib_files: print "[info] Detected changes in MLlib. Will run MLlib test suite." - test_suite.add("MLLIB") + changed_modules.add("MLLIB") if streaming_files: print "[info] Detected changes in Streaming. Will run Streaming test suite." - test_suite.add("STREAMING") + changed_modules.add("STREAMING") if graphx_files: print "[info] Detected changes in GraphX. Will run GraphX test suite." - test_suite.add("GRAPHX") + changed_modules.add("GRAPHX") + if doc_files: + print "[info] Detected changes in documentation. Will build spark with documentation." 
+ changed_modules.add("DOCS") - return test_suite + return changed_modules else: # we aren't in the Amplab environment so simply run all tests - test_suite.add("ALL") - return test_suite + changed_modules.add("ALL") + return changed_modules def run_scala_tests_maven(test_profiles): @@ -482,15 +499,21 @@ def main(): run_scala_style_checks() run_python_style_checks() + # determine high level changes + changed_modules = identify_changed_modules(test_env) + + # determine if docs were changed and if we're inside the amplab environment + if "DOCS" in changed_modules and test_env == "amplab_jenkins": + build_spark_documentation() + # spark build - build_apache_spark(build_tool, hadoop_version) + build_apache_spark(build_tool, hadoop_version, changed_modules) # backwards compatibility checks detect_binary_inop_with_mima() - # test suites - test_modules = identify_changed_modules(test_env) - run_scala_tests(build_tool, hadoop_version, test_modules) + # run the test suites + run_scala_tests(build_tool, hadoop_version, changed_modules) run_python_tests() run_sparkr_tests() diff --git a/docs/configuration.md b/docs/configuration.md index 3960e7e78bde1..492c10f45fd89 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -8,6 +8,7 @@ title: Configuration Spark provides three locations to configure the system: +* ADDING DOC CHANGE FOR TESTING * [Spark properties](#spark-properties) control most application parameters and can be set by using a [SparkConf](api/scala/index.html#org.apache.spark.SparkConf) object, or through Java system properties. From 03798339522832042e1d282c9c0aead0abca0d65 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 15 Jun 2015 11:36:24 -0700 Subject: [PATCH 42/52] minor doc addition to print the changed modules --- dev/run-tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/run-tests.py b/dev/run-tests.py index f01c24404b2a7..84fa3ae83d185 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -501,6 +501,7 @@ def main(): # determine high level changes changed_modules = identify_changed_modules(test_env) + print "[info] Found the following changed modules:", ", ".join(changed_modules) # determine if docs were changed and if we're inside the amplab environment if "DOCS" in changed_modules and test_env == "amplab_jenkins": From fb85a41acf66c69d5fd12052560bc9e0d135e03a Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 15 Jun 2015 11:42:30 -0700 Subject: [PATCH 43/52] fixed minor set bug --- dev/run-tests.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 84fa3ae83d185..35289e5f6091f 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -334,11 +334,11 @@ def identify_changed_modules(test_env): doc_files = [f for f in changed_files if f.startswith("docs/")] # union together all changed top level project files - top_level_project_files = set().union([set(f) for f in [sql_files, - mllib_files, - streaming_files, - graphx_files, - doc_files]]) + top_level_project_files = set().union(set(f) for f in [sql_files, + mllib_files, + streaming_files, + graphx_files, + doc_files]) changed_core_files = set(changed_files).difference(top_level_project_files) if changed_core_files: From c42cf9a8d75af7e3277ca3ddca0cb9dce857fa91 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 15 Jun 2015 13:27:08 -0700 Subject: [PATCH 44/52] unpack set operations with splat (*) --- dev/run-tests.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dev/run-tests.py 
b/dev/run-tests.py index 35289e5f6091f..7edf2b0913cd7 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -334,11 +334,11 @@ def identify_changed_modules(test_env): doc_files = [f for f in changed_files if f.startswith("docs/")] # union together all changed top level project files - top_level_project_files = set().union(set(f) for f in [sql_files, - mllib_files, - streaming_files, - graphx_files, - doc_files]) + top_level_project_files = set().union(*[set(f) for f in [sql_files, + mllib_files, + streaming_files, + graphx_files, + doc_files]]) changed_core_files = set(changed_files).difference(top_level_project_files) if changed_core_files: From 767a668c4b066af98b1f2ec6b2871c79f06289d8 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 09:11:21 -0700 Subject: [PATCH 45/52] fixed path joining issues, ensured docs actually build on doc changes --- dev/run-tests.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 7edf2b0913cd7..1a091dc72efb3 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -107,7 +107,7 @@ def determine_java_executable(): java_home = os.environ.get("JAVA_HOME") # check if there is an executable at $JAVA_HOME/bin/java - java_exe = which(os.path.join(java_home, "bin/java")) + java_exe = which(os.path.join(java_home, "bin", "java")) # if the java_exe wasn't set, check for a `java` version on the $PATH return java_exe if java_exe else which("java") @@ -146,36 +146,42 @@ def set_title_and_block(title, err_block): def run_apache_rat_checks(): set_title_and_block("Running Apache RAT checks", "BLOCK_RAT") - run_cmd(["./dev/check-license"]) + run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")]) def run_scala_style_checks(): set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE") - run_cmd(["./dev/lint-scala"]) + run_cmd([os.path.join(SPARK_HOME, "dev", "lint-scala")]) def run_python_style_checks(): set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE") - run_cmd(["./dev/lint-python"]) + run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")]) def build_spark_documentation(): set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION") os.environ["PRODUCTION"] = "1 jekyll build" + + os.chdir(os.path.join(SPARK_HOME, "docs")) + + run_cmd(["jekyll", "build"]) + + os.chdir(SPARK_HOME) def exec_maven(mvn_args=[]): """Will call Maven in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" - run_cmd(["./build/mvn"] + mvn_args) + run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + mvn_args) def exec_sbt(sbt_args=[]): """Will call SBT in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" - sbt_cmd = ["./build/sbt"] + sbt_args + sbt_cmd = [os.path.join(SPARK_HOME, "build", "sbt")] + sbt_args sbt_output_filter = re.compile("^.*[info].*Resolving" + "|" + "^.*[warn].*Merging" + "|" + @@ -285,7 +291,7 @@ def build_apache_spark(build_tool, hadoop_version, changed_modules): def detect_binary_inop_with_mima(): set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA") - run_cmd(["./dev/mima"]) + run_cmd([os.path.join(SPARK_HOME, "dev", "mima")]) def identify_changed_modules(test_env): @@ -435,15 +441,15 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): def run_python_tests(): set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") - 
run_cmd(["./python/run-tests"]) + run_cmd([os.path.join(SPARK_HOME, "python", "run-tests")]) def run_sparkr_tests(): set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") if which("R"): - run_cmd(["./R/install-dev.sh"]) - run_cmd(["./R/run-tests.sh"]) + run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) + run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")]) else: print "Ignoring SparkR tests as R was not found in PATH" @@ -458,8 +464,8 @@ def main(): os.chdir(SPARK_HOME) rm_r(os.path.join(SPARK_HOME, "work")) - rm_r(os.path.join(USER_HOME, ".ivy2/local/org.apache.spark")) - rm_r(os.path.join(USER_HOME, ".ivy2/cache/org.apache.spark")) + rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark")) + rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark")) os.environ["CURRENT_BLOCK"] = ERROR_CODES["BLOCK_GENERAL"] From 2dff136a6ed4c960e02af117add70e39da783037 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 09:15:02 -0700 Subject: [PATCH 46/52] fixed pep8 whitespace errors --- dev/run-tests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 1a091dc72efb3..97539fd78e5e8 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -162,9 +162,9 @@ def run_python_style_checks(): def build_spark_documentation(): set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION") os.environ["PRODUCTION"] = "1 jekyll build" - - os.chdir(os.path.join(SPARK_HOME, "docs")) - + + os.chdir(os.path.join(SPARK_HOME, "docs")) + run_cmd(["jekyll", "build"]) os.chdir(SPARK_HOME) From 22edb7807d77222df2b6017cc4fb9cd7621757e9 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 09:29:09 -0700 Subject: [PATCH 47/52] add check if jekyll isn't installed on the path --- dev/run-tests.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 97539fd78e5e8..554486a105d90 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -165,7 +165,13 @@ def build_spark_documentation(): os.chdir(os.path.join(SPARK_HOME, "docs")) - run_cmd(["jekyll", "build"]) + jekyll_bin = which("jekyll") + + if not jekyll_bin: + print "[warn] Cannot find a version of `jekyll` on the system; please", + print "install one and retry to build documentation." 
+ else: + run_cmd([jekyll_bin, "build"]) os.chdir(SPARK_HOME) @@ -498,6 +504,10 @@ def main(): print "[info] Using build tool", build_tool, "with profile", hadoop_version, print "under environment", test_env + # determine high level changes + changed_modules = identify_changed_modules(test_env) + print "[info] Found the following changed modules:", ", ".join(changed_modules) + # license checks run_apache_rat_checks() @@ -505,10 +515,6 @@ def main(): run_scala_style_checks() run_python_style_checks() - # determine high level changes - changed_modules = identify_changed_modules(test_env) - print "[info] Found the following changed modules:", ", ".join(changed_modules) - # determine if docs were changed and if we're inside the amplab environment if "DOCS" in changed_modules and test_env == "amplab_jenkins": build_spark_documentation() From 05d435b6386e8ed4ac6f61c5a0713c157b5451d3 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 09:40:56 -0700 Subject: [PATCH 48/52] added check for jekyll install --- dev/run-tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 554486a105d90..2a86812012eb9 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -168,8 +168,9 @@ def build_spark_documentation(): jekyll_bin = which("jekyll") if not jekyll_bin: - print "[warn] Cannot find a version of `jekyll` on the system; please", + print "[error] Cannot find a version of `jekyll` on the system; please", print "install one and retry to build documentation." + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) else: run_cmd([jekyll_bin, "build"]) From 8135518d12d5ade154eb39b88b39ea7c63ac8252 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 10:50:58 -0700 Subject: [PATCH 49/52] removed the test check for documentation changes until jenkins can get updated --- dev/run-tests.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 2a86812012eb9..f8a48bddd5263 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -517,8 +517,9 @@ def main(): run_python_style_checks() # determine if docs were changed and if we're inside the amplab environment - if "DOCS" in changed_modules and test_env == "amplab_jenkins": - build_spark_documentation() + # note - the below commented out until *all* Jenkins workers can get `jekyll` installed + # if "DOCS" in changed_modules and test_env == "amplab_jenkins": + # build_spark_documentation() # spark build build_apache_spark(build_tool, hadoop_version, changed_modules) From f9fbe549165cca5c605dbb7d7b361891a407b7b1 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 10:52:25 -0700 Subject: [PATCH 50/52] reverted doc test change --- docs/configuration.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/configuration.md b/docs/configuration.md index 492c10f45fd89..3960e7e78bde1 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -8,7 +8,6 @@ title: Configuration Spark provides three locations to configure the system: -* ADDING DOC CHANGE FOR TESTING * [Spark properties](#spark-properties) control most application parameters and can be set by using a [SparkConf](api/scala/index.html#org.apache.spark.SparkConf) object, or through Java system properties. 
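[editor's note] A side note on the set-union fix in the "fixed minor set bug" and "unpack set operations with splat (*)" patches above: `set().union(...)` treats each positional argument as an iterable of elements to add, so handing it a single list (or generator) whose elements are sets makes it try to insert whole sets as elements and fail with `TypeError: unhashable type: 'set'`. The splat unpacks the list so each per-module set arrives as its own argument. Below is a minimal, standalone sketch; the file lists are hypothetical stand-ins for the ones derived from `git diff --name-only` in identify_changed_modules().

# Hypothetical per-module file lists, standing in for the ones computed
# from `git diff --name-only` in identify_changed_modules().
sql_files = ["sql/core/pom.xml", "bin/spark-sql"]
mllib_files = ["mllib/pom.xml"]
doc_files = ["docs/configuration.md"]

# Broken form (patches 41 and 43): union() receives a single argument whose
# elements are sets, and adding a set into a set raises
# TypeError: unhashable type: 'set'.
#   set().union([set(f) for f in [sql_files, mllib_files, doc_files]])

# Fixed form (patch 44): the * unpacks the list, so union() receives one
# iterable of filenames per module and merges their elements.
top_level_project_files = set().union(*[set(f) for f in [sql_files,
                                                         mllib_files,
                                                         doc_files]])

assert top_level_project_files == set(["sql/core/pom.xml", "bin/spark-sql",
                                       "mllib/pom.xml", "docs/configuration.md"])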
From 3922a85d68a4936a7d2b125b1850f87553a7e537 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 14:36:10 -0700 Subject: [PATCH 51/52] removed necessary passed in variable --- dev/run-tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index f8a48bddd5263..b09cfa3867be6 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -282,7 +282,7 @@ def build_spark_sbt(hadoop_version): exec_sbt(profiles_and_goals) -def build_apache_spark(build_tool, hadoop_version, changed_modules): +def build_apache_spark(build_tool, hadoop_version): """Will build Spark against Hive v0.13.1 given the passed in build tool (either `sbt` or `maven`). Defaults to using `sbt`.""" @@ -522,7 +522,7 @@ def main(): # build_spark_documentation() # spark build - build_apache_spark(build_tool, hadoop_version, changed_modules) + build_apache_spark(build_tool, hadoop_version) # backwards compatibility checks detect_binary_inop_with_mima() From 154ed739026af964ab38e564abdf91124a9acf96 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 14:53:25 -0700 Subject: [PATCH 52/52] updated finding java binary if JAVA_HOME not set --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index b09cfa3867be6..04a7b45741963 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -107,7 +107,7 @@ def determine_java_executable(): java_home = os.environ.get("JAVA_HOME") # check if there is an executable at $JAVA_HOME/bin/java - java_exe = which(os.path.join(java_home, "bin", "java")) + java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None # if the java_exe wasn't set, check for a `java` version on the $PATH return java_exe if java_exe else which("java")
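[editor's note] For reference, below is a minimal sketch of the java-detection logic as it stands after the final patch above. The body of determine_java_executable() matches the patched hunk; the which() helper is a hypothetical stand-in for the one defined earlier in dev/run-tests.py, whose definition is not part of these hunks.

import os

def which(program):
    """Hypothetical stand-in for the helper in dev/run-tests.py: return the
    path to `program` if it is an executable file or can be found on $PATH,
    otherwise return None."""
    if os.path.isfile(program) and os.access(program, os.X_OK):
        return program
    for path in os.environ.get("PATH", "").split(os.pathsep):
        candidate = os.path.join(path, program)
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None

def determine_java_executable():
    """Mirrors the post-patch logic: prefer $JAVA_HOME/bin/java when JAVA_HOME
    is set, otherwise fall back to whatever `java` is on the $PATH."""
    java_home = os.environ.get("JAVA_HOME")
    # The `if java_home else None` guard avoids os.path.join(None, ...)
    # raising a TypeError when JAVA_HOME is unset, which is the bug the
    # final patch fixes.
    java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None
    return java_exe if java_exe else which("java")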