From 6126c4f4d97db16b0ed6a95c60fae1fff44e2afe Mon Sep 17 00:00:00 2001 From: Brennon York Date: Fri, 24 Apr 2015 14:27:54 -0700 Subject: [PATCH 01/52] refactored run-tests into python --- dev/run-tests | 623 +++++++++++++++++++++++++++++++------------------- 1 file changed, 389 insertions(+), 234 deletions(-) diff --git a/dev/run-tests b/dev/run-tests index 861d1671182c2..22a33d317a344 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env python # # Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,239 +17,394 @@ # limitations under the License. # -# Go to the Spark project root directory -FWDIR="$(cd "`dirname $0`"/..; pwd)" -cd "$FWDIR" +import os +import re +import shutil +import subprocess as sp + +# Set the Spark project root directory +spark_proj_root = os.path.abspath("..") +# Set the user 'HOME' directory +user_home_dir = os.environ.get("HOME") +# Set the sbt maven profile arguments environment variable name +sbt_maven_profile_args_env = "SBT_MAVEN_PROFILES_ARGS" +# Set the amplab jenkins build tool environment variable name +amplab_jenkins_build_tool_env = "AMPLAB_JENKINS_BUILD_TOOL" +# Set the amplab jenkins build tool environment value +amplab_jenkins_build_tool = os.environ.get(amplab_jenkins_build_tool_env) +# Set whether we're on an Amplab Jenkins box by checking for a specific +# environment variable +amplab_jenkins = os.environ.get("AMPLAB_JENKINS") +# Set the pattern for sbt output e.g. "[info] Resolving ..." +resolving_re = "^.*[info].*Resolving" +# Set the pattern for sbt output e.g. "[warn] Merging ..." +merging_re = "^.*[warn].*Merging" +# Set the pattern for sbt output e.g. "[info] Including ..." +including_re = "^.*[info].*Including" +# Compile the various regex patterns into a filter +sbt_output_filter = re.compile(resolving_re + "|" + + merging_re + "|" + + including_re) + +def get_error_codes(err_code_file): + """Function to retrieve all block numbers from the `run-tests-codes.sh` + file to maintain backwards compatibility with the `run-tests-jenkins` + script""" + + with open(err_code_file, 'r') as f: + err_codes = [e.split()[1].strip().split('=') + for e in f if e.startswith("readonly")] + return dict(err_codes) + +def rm_r(path): + """Given an arbitrary path properly remove it with the correct python + construct if it exists + - from: http://stackoverflow.com/a/9559881""" + + if os.path.isdir(path): + shutil.rmtree(path) + elif os.path.exists(path): + os.remove(path) + +def lineno(): + """Returns the current line number in our program + - from: http://stackoverflow.com/a/3056059""" + + return inspect.currentframe().f_back.f_lineno + +def set_sbt_maven_profile_args(): + """Properly sets the SBT environment variable arguments with additional + checks to determine if this is running on an Amplab Jenkins machine""" + + # base environment values for sbt_maven_profile_args_env which will be appended on + sbt_maven_profile_args_base = ["-Pkinesis-asl"] + + sbt_maven_profile_arg_dict = { + "hadoop1.0" : ["-Dhadoop.version=1.0.4"], + "hadoop2.0" : ["-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], + "hadoop2.2" : ["-Pyarn", "-Phadoop-2.2", "-Dhadoop.version=2.2.0"], + "hadoop2.3" : ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], + } + + # set the SBT maven build profile argument environment variable and ensure + # we build against the right version of Hadoop + if os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE"): + os.environ[sbt_maven_profile_args_env] = \ + " ".join(sbt_maven_profile_arg_dict.get(ajbp, []) 
+ + sbt_maven_profile_args_base) + else: + os.environ[sbt_maven_profile_args_env] = \ + " ".join(sbt_maven_profile_arg_dict.get("hadoop2.3", []) + + sbt_maven_profile_args_base) + +def is_exe(path): + """Check if a given path is an executable file + - from: http://stackoverflow.com/a/377028""" + + return os.path.isfile(path) and os.access(path, os.X_OK) + +def which(program): + """Find and return the given program by its absolute path or 'None' + - from: http://stackoverflow.com/a/377028""" + + fpath, fname = os.path.split(program) + + if fpath: + if is_exe(program): + return program + else: + for path in os.environ.get("PATH").split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + return None + +def determine_java_executable(): + """Will return the *best* path possible for a 'java' executable or `None`""" + + java_home = os.environ.get("JAVA_HOME") + + # check if there is an executable at $JAVA_HOME/bin/java + java_exe = which(os.path.join(java_home, "bin/java")) + # if the java_exe wasn't set, check for a `java` version on the $PATH + return java_exe if java_exe else which("java") + +def determine_java_version(java_exe): + """Given a valid java executable will return its version in tuple format as: + [, , , ]""" + + raw_output = sp.check_output([java_exe, "-version"], stderr=sp.STDOUT) + raw_version_str = raw_output.split('\n')[0] # eg 'java version "1.8.0_25"' + version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25' + version, update = version_str.split('_') # eg ['1.8.0', '25'] + + # map over the values and convert them to integers + return map(lambda x: int(x), version.split('.') + [update]) + +def multi_starts_with(orig_str, *prefixes): + """Takes a string and an abritrary number of prefixes then checks the + original string for any of the possible prefixes passed in""" + + for s in prefixes: + if orig_str.startswith(s): + return True + return False + +# This function current acts to determine if SQL tests need to be run in +# addition to the core test suite *or* if _only_ SQL tests need to be run +# as the git logs show that to be the only thing touched. In the future +# this function will act more generically to help further segregate the +# test suite runner (hence the function name). +# @return a set of unique test names +def determine_test_suite(): + test_suite = list() + + if amplab_jenkins: + sp.Popen(['git', 'fetch', 'origin', 'master:master']).wait() + + raw_output = sp.check_output(['git', 'diff', '--name-only', 'master']) + # remove any empty strings + changed_files = [f for f in raw_output.split('\n') if f] + + # find any sql files + sql_files = [f for f in changed_files + if multi_starts_with(f, + "sql/", + "bin/spark-sql", + "sbin/start-thriftserver.sh")] + + non_sql_files = set(changed_files).difference(set(sql_files)) + + if non_sql_files: + test_suite.append("CORE") + if sql_files: + print "[info] Detected changes in SQL. Will run Hive test suite." + test_suite.append("SQL") + if not non_sql_files: + print "[info] Detected no changes except in SQL. Will only run SQL tests." 
+ return set(test_suite) + else: + # we aren't in the Amplab environment so merely run all tests + test_suite.append("CORE") + test_suite.append("SQL") + return set(test_suite) + +def set_title_and_block(title, err_block): + os.environ["CURRENT_BLOCK"] = error_codes[err_block] + line_str = "".join(['='] * 72) + + print + print line_str + print title + print line_str + +def run_cmd(cmd): + """Given a command as a list of arguments will attempt to execute the + command and, on failure, print an error message""" + + if not isinstance(cmd, list): + cmd = cmd.split() + try: + sp.check_output(cmd) + except sp.CalledProcessError as e: + print "[error] running", e.cmd, "; received return code", e.returncode + exit(e.returncode) + +def run_apache_rat_checks(): + set_title_and_block("Running Apache RAT checks", "BLOCK_RAT") + run_cmd(["./dev/check-license"]) + +def run_scala_style_checks(): + set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE") + run_cmd(["./dev/lint-scala"]) + +def run_python_style_checks(): + set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE") + run_cmd(["./dev/lint-python"]) + +def exec_maven(mvn_args = []): + """Will call Maven in the current directory with the list of mvn_args passed + in and returns the subprocess for any further processing""" + + return sp.Popen(["./build/mvn"] + mvn_args) + +def exec_sbt(sbt_args = []): + """Will call SBT in the current directory with the list of mvn_args passed + in and returns the subprocess for any further processing""" + + # NOTE: echo "q" is needed because sbt on encountering a build file + # with failure (either resolution or compilation) prompts the user for + # input either q, r, etc to quit or retry. This echo is there to make it + # not block. + echo_proc = sp.Popen(["echo", "\"q\n\""]) + sbt_proc = sp.Popen(["./build/sbt"] + sbt_args, + stdin=echo_proc.stdout, + stdout=sp.PIPE) + echo_proc.wait() + for line in iter(sbt_proc.stdout.readline, ''): + if not sbt_output_filter.match(line): + print line, + return sbt_proc + +def build_apache_spark(): + """Will first build Spark with Hive v0.12.0 to ensure the build is + successful and, after, will build Spark again against Hive v0.13.1 as the + tests are based off that""" + + set_title_and_block("Building Spark", "BLOCK_BUILD") + + sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() + hive_profile_args = sbt_maven_profile_args + ["-Phive", + "-Phive-thriftserver"] + hive_12_profile_args = hive_profile_args + ["-Phive-0.12.0"] + # set the default maven args + base_mvn_args = ["clean", "package", "-DskipTests"] + # set the necessary sbt goals + sbt_hive_12_goals = ["clean", "hive/compile", "hive-thriftserver/compile"] + sbt_hive_goals = ["package", + "assembly/assembly", + "streaming-kafka-assembly/assembly"] + + # First build with Hive 0.12.0 to ensure patches do not break the Hive + # 0.12.0 build + print "[info] Compile with Hive 0.12.0" + rm_r("lib_managed") + print "[info] Building Spark with these arguments:", + print " ".join(hive_12_profile_args) + + if amplab_jenkins_build_tool == "maven": + exec_maven(hive_12_profile_args + base_mvn_args).wait() + else: + exec_sbt(hive_12_profile_args + sbt_hive_12_goals).wait() + + # Then build with default Hive version (0.13.1) because tests are based on + # this version + print "[info] Compile with Hive 0.13.1" + rm_r("lib_managed") + print "[info] Building Spark with these arguments:", + print " ".join(hive_profile_args) + + if amplab_jenkins_build_tool == "maven": + 
exec_maven(hive_profile_args + base_mvn_args).wait() + else: + exec_sbt(hive_profile_args + sbt_hive_goals).wait() + +def detect_binary_inop_with_mima(): + set_title_and_block("Detecting binary incompatibilities with MiMa", + "BLOCK_MIMA") + run_cmd(["./dev/mima"]) + +def run_scala_tests(test_suite = []): + """Function to properly execute all tests pass in, as a list, from the + `determine_test_suite` function""" + set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") + + # ensure the test_suite is a set + if not isinstance(test_suite, set): + test_suite = set(test_suite) + + # if the Spark SQL tests are enabled, run the tests with the Hive profiles + # enabled. + if "SQL" in test_suite: + sbt_maven_profile_args = \ + os.environ.get(sbt_maven_profile_args_env).split() + os.environ[sbt_maven_profile_args_env] = \ + " ".join(sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"]) + + # if we only have changes in SQL build a custom test string + if "SQL" in test_suite and "CORE" not in test_suite: + sbt_maven_test_args = ["catalyst/test", + "sql/test", + "hive/test", + "hive-thriftserver/test", + "mllib/test"] + else: + sbt_maven_test_args = ["test"] + + # get the latest sbt maven profile arguments + sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() + + print "[info] Running Spark tests with these arguments:", + print " ".join(sbt_maven_profile_args), + print " ".join(sbt_maven_test_args) + + if amplab_jenkins_build_tool == "maven": + exec_maven(["test"] + sbt_maven_profile_args + ["--fail-at-end"]).wait() + else: + exec_sbt(sbt_maven_profile_args + sbt_maven_test_args).wait() + +def run_python_tests(test_suite = []): + set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") + + # Add path for Python3 in Jenkins if we're calling from a Jenkins machine + if amplab_jenkins: + os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" + + run_cmd(["./python/run-tests"]) + +def run_sparkr_tests(test_suite = []): + set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") + + if which("R"): + run_cmd(["./R/install-dev.sh"]) + run_cmd(["./R/run-tests.sh"]) + else: + print "Ignoring SparkR tests as R was not found in PATH" + +# Ensure the user home directory (HOME) is valid and is an absolute directory +if not user_home_dir or not os.path.isabs(user_home_dir): + print "[error] Cannot determine your home directory as an absolute path;", + print "ensure the $HOME environment variable is set properly." + exit(1) + +# Change directory to the Spark project root +os.chdir(spark_proj_root) # Clean up work directory and caches -rm -rf ./work -rm -rf ~/.ivy2/local/org.apache.spark -rm -rf ~/.ivy2/cache/org.apache.spark - -source "$FWDIR/dev/run-tests-codes.sh" - -CURRENT_BLOCK=$BLOCK_GENERAL - -function handle_error () { - echo "[error] Got a return code of $? on line $1 of the run-tests script." - exit $CURRENT_BLOCK -} - - -# Build against the right version of Hadoop. 
-{ - if [ -n "$AMPLAB_JENKINS_BUILD_PROFILE" ]; then - if [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop1.0" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=1.0.4" - elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.0" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=2.0.0-mr1-cdh4.1.1" - elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.2" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.2 -Dhadoop.version=2.2.0" - elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.3" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0" - fi - fi - - if [ -z "$SBT_MAVEN_PROFILES_ARGS" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0" - fi -} - -export SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Pkinesis-asl" - -# Determine Java path and version. -{ - if test -x "$JAVA_HOME/bin/java"; then - declare java_cmd="$JAVA_HOME/bin/java" - else - declare java_cmd=java - fi - - # We can't use sed -r -e due to OS X / BSD compatibility; hence, all the parentheses. - JAVA_VERSION=$( - $java_cmd -version 2>&1 \ - | grep -e "^java version" --max-count=1 \ - | sed "s/java version \"\(.*\)\.\(.*\)\.\(.*\)\"/\1\2/" - ) - - if [ "$JAVA_VERSION" -lt 18 ]; then - echo "[warn] Java 8 tests will not run because JDK version is < 1.8." - fi -} - -# Only run Hive tests if there are SQL changes. -# Partial solution for SPARK-1455. -if [ -n "$AMPLAB_JENKINS" ]; then - git fetch origin master:master - - sql_diffs=$( - git diff --name-only master \ - | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh" - ) - - non_sql_diffs=$( - git diff --name-only master \ - | grep -v -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh" - ) - - if [ -n "$sql_diffs" ]; then - echo "[info] Detected changes in SQL. Will run Hive test suite." - _RUN_SQL_TESTS=true - - if [ -z "$non_sql_diffs" ]; then - echo "[info] Detected no changes except in SQL. Will only run SQL tests." 
- _SQL_TESTS_ONLY=true - fi - fi -fi - -set -o pipefail -trap 'handle_error $LINENO' ERR - -echo "" -echo "=========================================================================" -echo "Running Apache RAT checks" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_RAT - -./dev/check-license - -echo "" -echo "=========================================================================" -echo "Running Scala style checks" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_SCALA_STYLE - -./dev/lint-scala - -echo "" -echo "=========================================================================" -echo "Running Python style checks" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_PYTHON_STYLE - -./dev/lint-python - -echo "" -echo "=========================================================================" -echo "Building Spark" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_BUILD - -{ - HIVE_BUILD_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive -Phive-thriftserver" - HIVE_12_BUILD_ARGS="$HIVE_BUILD_ARGS -Phive-0.12.0" - - # First build with Hive 0.12.0 to ensure patches do not break the Hive 0.12.0 build - echo "[info] Compile with Hive 0.12.0" - [ -d "lib_managed" ] && rm -rf lib_managed - echo "[info] Building Spark with these arguments: $HIVE_12_BUILD_ARGS" - - if [ "${AMPLAB_JENKINS_BUILD_TOOL}" == "maven" ]; then - build/mvn $HIVE_12_BUILD_ARGS clean package -DskipTests - else - # NOTE: echo "q" is needed because sbt on encountering a build file with failure - # (either resolution or compilation) prompts the user for input either q, r, etc - # to quit or retry. This echo is there to make it not block. - # NOTE: Do not quote $BUILD_MVN_PROFILE_ARGS or else it will be interpreted as a - # single argument! - # QUESTION: Why doesn't 'yes "q"' work? - # QUESTION: Why doesn't 'grep -v -e "^\[info\] Resolving"' work? - echo -e "q\n" \ - | build/sbt $HIVE_12_BUILD_ARGS clean hive/compile hive-thriftserver/compile \ - | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" - fi - - # Then build with default Hive version (0.13.1) because tests are based on this version - echo "[info] Compile with Hive 0.13.1" - [ -d "lib_managed" ] && rm -rf lib_managed - echo "[info] Building Spark with these arguments: $HIVE_BUILD_ARGS" - - if [ "${AMPLAB_JENKINS_BUILD_TOOL}" == "maven" ]; then - build/mvn $HIVE_BUILD_ARGS clean package -DskipTests - else - echo -e "q\n" \ - | build/sbt $HIVE_BUILD_ARGS package assembly/assembly streaming-kafka-assembly/assembly \ - | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" - fi -} - -echo "" -echo "=========================================================================" -echo "Detecting binary incompatibilities with MiMa" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_MIMA - -./dev/mima - -echo "" -echo "=========================================================================" -echo "Running Spark unit tests" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_SPARK_UNIT_TESTS - -{ - # If the Spark SQL tests are enabled, run the tests with the Hive profiles enabled. - # This must be a single argument, as it is. 
- if [ -n "$_RUN_SQL_TESTS" ]; then - SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive -Phive-thriftserver" - fi - - if [ -n "$_SQL_TESTS_ONLY" ]; then - # This must be an array of individual arguments. Otherwise, having one long string - # will be interpreted as a single test, which doesn't work. - SBT_MAVEN_TEST_ARGS=("catalyst/test" "sql/test" "hive/test" "hive-thriftserver/test" "mllib/test") - else - SBT_MAVEN_TEST_ARGS=("test") - fi - - echo "[info] Running Spark tests with these arguments: $SBT_MAVEN_PROFILES_ARGS ${SBT_MAVEN_TEST_ARGS[@]}" - - if [ "${AMPLAB_JENKINS_BUILD_TOOL}" == "maven" ]; then - build/mvn test $SBT_MAVEN_PROFILES_ARGS --fail-at-end - else - # NOTE: echo "q" is needed because sbt on encountering a build file with failure - # (either resolution or compilation) prompts the user for input either q, r, etc - # to quit or retry. This echo is there to make it not block. - # NOTE: Do not quote $SBT_MAVEN_PROFILES_ARGS or else it will be interpreted as a - # single argument! - # "${SBT_MAVEN_TEST_ARGS[@]}" is cool because it's an array. - # QUESTION: Why doesn't 'yes "q"' work? - # QUESTION: Why doesn't 'grep -v -e "^\[info\] Resolving"' work? - echo -e "q\n" \ - | build/sbt $SBT_MAVEN_PROFILES_ARGS "${SBT_MAVEN_TEST_ARGS[@]}" \ - | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" - fi -} - -echo "" -echo "=========================================================================" -echo "Running PySpark tests" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_PYSPARK_UNIT_TESTS - -# add path for python 3 in jenkins -export PATH="${PATH}:/home/anaonda/envs/py3k/bin" -./python/run-tests - -echo "" -echo "=========================================================================" -echo "Running SparkR tests" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_SPARKR_UNIT_TESTS - -if [ $(command -v R) ]; then - ./R/install-dev.sh - ./R/run-tests.sh -else - echo "Ignoring SparkR tests as R was not found in PATH" -fi +rm_r("./work") +rm_r(os.path.join(user_home_dir, ".ivy2/local/org.apache.spark")) +rm_r(os.path.join(user_home_dir, ".ivy2/cache/org.apache.spark")) +# Grab the error codes from the `dev/run-tests-codes.sh` file +error_codes = get_error_codes("dev/run-tests-codes.sh") + +# Set the environment with the general error code initially +os.environ["CURRENT_BLOCK"] = error_codes["BLOCK_GENERAL"] + +# Set the various sbt maven profile argument environment variables +set_sbt_maven_profile_args() + +# Set the java executable we've found (if any) +java_exe = determine_java_executable() + +if not java_exe: + print "[error] Cannot find a version of `java` on the system; please", + print "install one and retry." + exit(2) + +# Grab the current java version information +java_version = determine_java_version(java_exe) + +if java_version[1] < 8: + print "[warn] Java 8 tests will not run because JDK version is < 1.8." 
+ +# Determine the suite of tests to perform +test_suite = determine_test_suite() + +run_apache_rat_checks() + +run_scala_style_checks() + +run_python_style_checks() + +# Build an up-to-date version of Apache Spark +build_apache_spark() + +detect_binary_inop_with_mima() + +# run_scala_tests(test_suite) + +run_python_tests() + +run_sparkr_tests() From 3c53a1a2bf29ad9ec62141e7a16c19bfc49fef91 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Fri, 24 Apr 2015 14:36:53 -0700 Subject: [PATCH 02/52] uncomment the scala tests :) --- dev/run-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests b/dev/run-tests index 22a33d317a344..f52e8e7ae453e 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -403,7 +403,7 @@ build_apache_spark() detect_binary_inop_with_mima() -# run_scala_tests(test_suite) +run_scala_tests(test_suite) run_python_tests() From 639f1e906e4e5e888f0d5f625033bf0123fda4fb Mon Sep 17 00:00:00 2001 From: Brennon York Date: Sat, 25 Apr 2015 18:32:06 -0700 Subject: [PATCH 03/52] updated with pep8 rules, fixed minor bugs, added run-tests file in bash to call the run-tests.py script --- dev/run-tests | 395 +------------------------------------------- dev/run-tests.py | 417 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 421 insertions(+), 391 deletions(-) create mode 100755 dev/run-tests.py diff --git a/dev/run-tests b/dev/run-tests index f52e8e7ae453e..844ff6a0d9757 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,394 +17,7 @@ # limitations under the License. # -import os -import re -import shutil -import subprocess as sp +FWDIR="$(cd "`dirname $0`"/..; pwd)" +cd "$FWDIR" -# Set the Spark project root directory -spark_proj_root = os.path.abspath("..") -# Set the user 'HOME' directory -user_home_dir = os.environ.get("HOME") -# Set the sbt maven profile arguments environment variable name -sbt_maven_profile_args_env = "SBT_MAVEN_PROFILES_ARGS" -# Set the amplab jenkins build tool environment variable name -amplab_jenkins_build_tool_env = "AMPLAB_JENKINS_BUILD_TOOL" -# Set the amplab jenkins build tool environment value -amplab_jenkins_build_tool = os.environ.get(amplab_jenkins_build_tool_env) -# Set whether we're on an Amplab Jenkins box by checking for a specific -# environment variable -amplab_jenkins = os.environ.get("AMPLAB_JENKINS") -# Set the pattern for sbt output e.g. "[info] Resolving ..." -resolving_re = "^.*[info].*Resolving" -# Set the pattern for sbt output e.g. "[warn] Merging ..." -merging_re = "^.*[warn].*Merging" -# Set the pattern for sbt output e.g. "[info] Including ..." 
-including_re = "^.*[info].*Including" -# Compile the various regex patterns into a filter -sbt_output_filter = re.compile(resolving_re + "|" + - merging_re + "|" + - including_re) - -def get_error_codes(err_code_file): - """Function to retrieve all block numbers from the `run-tests-codes.sh` - file to maintain backwards compatibility with the `run-tests-jenkins` - script""" - - with open(err_code_file, 'r') as f: - err_codes = [e.split()[1].strip().split('=') - for e in f if e.startswith("readonly")] - return dict(err_codes) - -def rm_r(path): - """Given an arbitrary path properly remove it with the correct python - construct if it exists - - from: http://stackoverflow.com/a/9559881""" - - if os.path.isdir(path): - shutil.rmtree(path) - elif os.path.exists(path): - os.remove(path) - -def lineno(): - """Returns the current line number in our program - - from: http://stackoverflow.com/a/3056059""" - - return inspect.currentframe().f_back.f_lineno - -def set_sbt_maven_profile_args(): - """Properly sets the SBT environment variable arguments with additional - checks to determine if this is running on an Amplab Jenkins machine""" - - # base environment values for sbt_maven_profile_args_env which will be appended on - sbt_maven_profile_args_base = ["-Pkinesis-asl"] - - sbt_maven_profile_arg_dict = { - "hadoop1.0" : ["-Dhadoop.version=1.0.4"], - "hadoop2.0" : ["-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], - "hadoop2.2" : ["-Pyarn", "-Phadoop-2.2", "-Dhadoop.version=2.2.0"], - "hadoop2.3" : ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], - } - - # set the SBT maven build profile argument environment variable and ensure - # we build against the right version of Hadoop - if os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE"): - os.environ[sbt_maven_profile_args_env] = \ - " ".join(sbt_maven_profile_arg_dict.get(ajbp, []) - + sbt_maven_profile_args_base) - else: - os.environ[sbt_maven_profile_args_env] = \ - " ".join(sbt_maven_profile_arg_dict.get("hadoop2.3", []) - + sbt_maven_profile_args_base) - -def is_exe(path): - """Check if a given path is an executable file - - from: http://stackoverflow.com/a/377028""" - - return os.path.isfile(path) and os.access(path, os.X_OK) - -def which(program): - """Find and return the given program by its absolute path or 'None' - - from: http://stackoverflow.com/a/377028""" - - fpath, fname = os.path.split(program) - - if fpath: - if is_exe(program): - return program - else: - for path in os.environ.get("PATH").split(os.pathsep): - path = path.strip('"') - exe_file = os.path.join(path, program) - if is_exe(exe_file): - return exe_file - return None - -def determine_java_executable(): - """Will return the *best* path possible for a 'java' executable or `None`""" - - java_home = os.environ.get("JAVA_HOME") - - # check if there is an executable at $JAVA_HOME/bin/java - java_exe = which(os.path.join(java_home, "bin/java")) - # if the java_exe wasn't set, check for a `java` version on the $PATH - return java_exe if java_exe else which("java") - -def determine_java_version(java_exe): - """Given a valid java executable will return its version in tuple format as: - [, , , ]""" - - raw_output = sp.check_output([java_exe, "-version"], stderr=sp.STDOUT) - raw_version_str = raw_output.split('\n')[0] # eg 'java version "1.8.0_25"' - version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25' - version, update = version_str.split('_') # eg ['1.8.0', '25'] - - # map over the values and convert them to integers - return map(lambda x: int(x), version.split('.') + [update]) - 
-def multi_starts_with(orig_str, *prefixes): - """Takes a string and an abritrary number of prefixes then checks the - original string for any of the possible prefixes passed in""" - - for s in prefixes: - if orig_str.startswith(s): - return True - return False - -# This function current acts to determine if SQL tests need to be run in -# addition to the core test suite *or* if _only_ SQL tests need to be run -# as the git logs show that to be the only thing touched. In the future -# this function will act more generically to help further segregate the -# test suite runner (hence the function name). -# @return a set of unique test names -def determine_test_suite(): - test_suite = list() - - if amplab_jenkins: - sp.Popen(['git', 'fetch', 'origin', 'master:master']).wait() - - raw_output = sp.check_output(['git', 'diff', '--name-only', 'master']) - # remove any empty strings - changed_files = [f for f in raw_output.split('\n') if f] - - # find any sql files - sql_files = [f for f in changed_files - if multi_starts_with(f, - "sql/", - "bin/spark-sql", - "sbin/start-thriftserver.sh")] - - non_sql_files = set(changed_files).difference(set(sql_files)) - - if non_sql_files: - test_suite.append("CORE") - if sql_files: - print "[info] Detected changes in SQL. Will run Hive test suite." - test_suite.append("SQL") - if not non_sql_files: - print "[info] Detected no changes except in SQL. Will only run SQL tests." - return set(test_suite) - else: - # we aren't in the Amplab environment so merely run all tests - test_suite.append("CORE") - test_suite.append("SQL") - return set(test_suite) - -def set_title_and_block(title, err_block): - os.environ["CURRENT_BLOCK"] = error_codes[err_block] - line_str = "".join(['='] * 72) - - print - print line_str - print title - print line_str - -def run_cmd(cmd): - """Given a command as a list of arguments will attempt to execute the - command and, on failure, print an error message""" - - if not isinstance(cmd, list): - cmd = cmd.split() - try: - sp.check_output(cmd) - except sp.CalledProcessError as e: - print "[error] running", e.cmd, "; received return code", e.returncode - exit(e.returncode) - -def run_apache_rat_checks(): - set_title_and_block("Running Apache RAT checks", "BLOCK_RAT") - run_cmd(["./dev/check-license"]) - -def run_scala_style_checks(): - set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE") - run_cmd(["./dev/lint-scala"]) - -def run_python_style_checks(): - set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE") - run_cmd(["./dev/lint-python"]) - -def exec_maven(mvn_args = []): - """Will call Maven in the current directory with the list of mvn_args passed - in and returns the subprocess for any further processing""" - - return sp.Popen(["./build/mvn"] + mvn_args) - -def exec_sbt(sbt_args = []): - """Will call SBT in the current directory with the list of mvn_args passed - in and returns the subprocess for any further processing""" - - # NOTE: echo "q" is needed because sbt on encountering a build file - # with failure (either resolution or compilation) prompts the user for - # input either q, r, etc to quit or retry. This echo is there to make it - # not block. 
- echo_proc = sp.Popen(["echo", "\"q\n\""]) - sbt_proc = sp.Popen(["./build/sbt"] + sbt_args, - stdin=echo_proc.stdout, - stdout=sp.PIPE) - echo_proc.wait() - for line in iter(sbt_proc.stdout.readline, ''): - if not sbt_output_filter.match(line): - print line, - return sbt_proc - -def build_apache_spark(): - """Will first build Spark with Hive v0.12.0 to ensure the build is - successful and, after, will build Spark again against Hive v0.13.1 as the - tests are based off that""" - - set_title_and_block("Building Spark", "BLOCK_BUILD") - - sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() - hive_profile_args = sbt_maven_profile_args + ["-Phive", - "-Phive-thriftserver"] - hive_12_profile_args = hive_profile_args + ["-Phive-0.12.0"] - # set the default maven args - base_mvn_args = ["clean", "package", "-DskipTests"] - # set the necessary sbt goals - sbt_hive_12_goals = ["clean", "hive/compile", "hive-thriftserver/compile"] - sbt_hive_goals = ["package", - "assembly/assembly", - "streaming-kafka-assembly/assembly"] - - # First build with Hive 0.12.0 to ensure patches do not break the Hive - # 0.12.0 build - print "[info] Compile with Hive 0.12.0" - rm_r("lib_managed") - print "[info] Building Spark with these arguments:", - print " ".join(hive_12_profile_args) - - if amplab_jenkins_build_tool == "maven": - exec_maven(hive_12_profile_args + base_mvn_args).wait() - else: - exec_sbt(hive_12_profile_args + sbt_hive_12_goals).wait() - - # Then build with default Hive version (0.13.1) because tests are based on - # this version - print "[info] Compile with Hive 0.13.1" - rm_r("lib_managed") - print "[info] Building Spark with these arguments:", - print " ".join(hive_profile_args) - - if amplab_jenkins_build_tool == "maven": - exec_maven(hive_profile_args + base_mvn_args).wait() - else: - exec_sbt(hive_profile_args + sbt_hive_goals).wait() - -def detect_binary_inop_with_mima(): - set_title_and_block("Detecting binary incompatibilities with MiMa", - "BLOCK_MIMA") - run_cmd(["./dev/mima"]) - -def run_scala_tests(test_suite = []): - """Function to properly execute all tests pass in, as a list, from the - `determine_test_suite` function""" - set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") - - # ensure the test_suite is a set - if not isinstance(test_suite, set): - test_suite = set(test_suite) - - # if the Spark SQL tests are enabled, run the tests with the Hive profiles - # enabled. 
- if "SQL" in test_suite: - sbt_maven_profile_args = \ - os.environ.get(sbt_maven_profile_args_env).split() - os.environ[sbt_maven_profile_args_env] = \ - " ".join(sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"]) - - # if we only have changes in SQL build a custom test string - if "SQL" in test_suite and "CORE" not in test_suite: - sbt_maven_test_args = ["catalyst/test", - "sql/test", - "hive/test", - "hive-thriftserver/test", - "mllib/test"] - else: - sbt_maven_test_args = ["test"] - - # get the latest sbt maven profile arguments - sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() - - print "[info] Running Spark tests with these arguments:", - print " ".join(sbt_maven_profile_args), - print " ".join(sbt_maven_test_args) - - if amplab_jenkins_build_tool == "maven": - exec_maven(["test"] + sbt_maven_profile_args + ["--fail-at-end"]).wait() - else: - exec_sbt(sbt_maven_profile_args + sbt_maven_test_args).wait() - -def run_python_tests(test_suite = []): - set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") - - # Add path for Python3 in Jenkins if we're calling from a Jenkins machine - if amplab_jenkins: - os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" - - run_cmd(["./python/run-tests"]) - -def run_sparkr_tests(test_suite = []): - set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") - - if which("R"): - run_cmd(["./R/install-dev.sh"]) - run_cmd(["./R/run-tests.sh"]) - else: - print "Ignoring SparkR tests as R was not found in PATH" - -# Ensure the user home directory (HOME) is valid and is an absolute directory -if not user_home_dir or not os.path.isabs(user_home_dir): - print "[error] Cannot determine your home directory as an absolute path;", - print "ensure the $HOME environment variable is set properly." - exit(1) - -# Change directory to the Spark project root -os.chdir(spark_proj_root) - -# Clean up work directory and caches -rm_r("./work") -rm_r(os.path.join(user_home_dir, ".ivy2/local/org.apache.spark")) -rm_r(os.path.join(user_home_dir, ".ivy2/cache/org.apache.spark")) - -# Grab the error codes from the `dev/run-tests-codes.sh` file -error_codes = get_error_codes("dev/run-tests-codes.sh") - -# Set the environment with the general error code initially -os.environ["CURRENT_BLOCK"] = error_codes["BLOCK_GENERAL"] - -# Set the various sbt maven profile argument environment variables -set_sbt_maven_profile_args() - -# Set the java executable we've found (if any) -java_exe = determine_java_executable() - -if not java_exe: - print "[error] Cannot find a version of `java` on the system; please", - print "install one and retry." - exit(2) - -# Grab the current java version information -java_version = determine_java_version(java_exe) - -if java_version[1] < 8: - print "[warn] Java 8 tests will not run because JDK version is < 1.8." - -# Determine the suite of tests to perform -test_suite = determine_test_suite() - -run_apache_rat_checks() - -run_scala_style_checks() - -run_python_style_checks() - -# Build an up-to-date version of Apache Spark -build_apache_spark() - -detect_binary_inop_with_mima() - -run_scala_tests(test_suite) - -run_python_tests() - -run_sparkr_tests() +./dev/run-tests.py diff --git a/dev/run-tests.py b/dev/run-tests.py new file mode 100755 index 0000000000000..9485ab28ae598 --- /dev/null +++ b/dev/run-tests.py @@ -0,0 +1,417 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import re +import sys +import shutil +import subprocess + +spark_proj_root = \ + os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") +user_home_dir = os.environ.get("HOME") + +sbt_maven_profile_args_env = "SBT_MAVEN_PROFILES_ARGS" +amplab_jenkins_build_tool_env = "AMPLAB_JENKINS_BUILD_TOOL" +amplab_jenkins_build_tool = os.environ.get(amplab_jenkins_build_tool_env) +amplab_jenkins = os.environ.get("AMPLAB_JENKINS") + +resolving_re = "^.*[info].*Resolving" +merging_re = "^.*[warn].*Merging" +including_re = "^.*[info].*Including" +sbt_output_filter = re.compile(resolving_re + "|" + + merging_re + "|" + + including_re) + + +def get_error_codes(err_code_file): + """Function to retrieve all block numbers from the `run-tests-codes.sh` + file to maintain backwards compatibility with the `run-tests-jenkins` + script""" + + with open(err_code_file, 'r') as f: + err_codes = [e.split()[1].strip().split('=') + for e in f if e.startswith("readonly")] + return dict(err_codes) + + +def rm_r(path): + """Given an arbitrary path properly remove it with the correct python + construct if it exists + - from: http://stackoverflow.com/a/9559881""" + + if os.path.isdir(path): + shutil.rmtree(path) + elif os.path.exists(path): + os.remove(path) + + +def lineno(): + """Returns the current line number in our program + - from: http://stackoverflow.com/a/3056059""" + + return inspect.currentframe().f_back.f_lineno + + +def run_cmd(cmd): + """Given a command as a list of arguments will attempt to execute the + command and, on failure, print an error message""" + + if not isinstance(cmd, list): + cmd = cmd.split() + try: + subprocess.check_output(cmd) + except subprocess.CalledProcessError as e: + print "[error] running", e.cmd, "; received return code", e.returncode + sys.exit(e.returncode) + + +def set_sbt_maven_profile_args(): + """Properly sets the SBT environment variable arguments with additional + checks to determine if this is running on an Amplab Jenkins machine""" + + # base environment values for sbt_maven_profile_args_env which will be appended on + sbt_maven_profile_args_base = ["-Pkinesis-asl"] + + sbt_maven_profile_arg_dict = { + "hadoop1.0" : ["-Dhadoop.version=1.0.4"], + "hadoop2.0" : ["-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], + "hadoop2.2" : ["-Pyarn", "-Phadoop-2.2", "-Dhadoop.version=2.2.0"], + "hadoop2.3" : ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], + } + + # set the SBT maven build profile argument environment variable and ensure + # we build against the right version of Hadoop + if os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE"): + os.environ[sbt_maven_profile_args_env] = \ + " ".join(sbt_maven_profile_arg_dict.get(ajbp, []) + + sbt_maven_profile_args_base) + else: + os.environ[sbt_maven_profile_args_env] = \ + " ".join(sbt_maven_profile_arg_dict.get("hadoop2.3", []) + + 
sbt_maven_profile_args_base) + + +def is_exe(path): + """Check if a given path is an executable file + - from: http://stackoverflow.com/a/377028""" + + return os.path.isfile(path) and os.access(path, os.X_OK) + + +def which(program): + """Find and return the given program by its absolute path or 'None' + - from: http://stackoverflow.com/a/377028""" + + fpath, fname = os.path.split(program) + + if fpath: + if is_exe(program): + return program + else: + for path in os.environ.get("PATH").split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + return None + + +def determine_java_executable(): + """Will return the *best* path possible for a 'java' executable or `None`""" + + java_home = os.environ.get("JAVA_HOME") + + # check if there is an executable at $JAVA_HOME/bin/java + java_exe = which(os.path.join(java_home, "bin/java")) + # if the java_exe wasn't set, check for a `java` version on the $PATH + return java_exe if java_exe else which("java") + + +def determine_java_version(java_exe): + """Given a valid java executable will return its version in tuple format as: + [, , , ]""" + + raw_output = subprocess.check_output([java_exe, "-version"], stderr=subprocess.STDOUT) + raw_version_str = raw_output.split('\n')[0] # eg 'java version "1.8.0_25"' + version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25' + version, update = version_str.split('_') # eg ['1.8.0', '25'] + + # map over the values and convert them to integers + return map(lambda x: int(x), version.split('.') + [update]) + + +def multi_starts_with(orig_str, *prefixes): + """Takes a string and an abritrary number of prefixes then checks the + original string for any of the possible prefixes passed in""" + + for s in prefixes: + if orig_str.startswith(s): + return True + return False + + +def determine_test_suite(): + """This function current acts to determine if SQL tests need to be run in + addition to the core test suite *or* if _only_ SQL tests need to be run + as the git logs show that to be the only thing touched. In the future + this function will act more generically to help further segregate the + test suite runner (hence the function name). + @return a set of unique test names""" + test_suite = list() + + if amplab_jenkins: + run_cmd(['git', 'fetch', 'origin', 'master:master']).wait() + + raw_output = subprocess.check_output(['git', 'diff', '--name-only', 'master']) + # remove any empty strings + changed_files = [f for f in raw_output.split('\n') if f] + + # find any sql files + sql_files = [f for f in changed_files + if multi_starts_with(f, + "sql/", + "bin/spark-sql", + "sbin/start-thriftserver.sh")] + + non_sql_files = set(changed_files).difference(set(sql_files)) + + if non_sql_files: + test_suite.append("CORE") + if sql_files: + print "[info] Detected changes in SQL. Will run Hive test suite." + test_suite.append("SQL") + if not non_sql_files: + print "[info] Detected no changes except in SQL. Will only run SQL tests." 
+ return set(test_suite) + else: + # we aren't in the Amplab environment so merely run all tests + test_suite.append("CORE") + test_suite.append("SQL") + return set(test_suite) + + +def set_title_and_block(title, err_block): + os.environ["CURRENT_BLOCK"] = error_codes[err_block] + line_str = "".join(['='] * 72) + + print + print line_str + print title + print line_str + + +def run_apache_rat_checks(): + set_title_and_block("Running Apache RAT checks", "BLOCK_RAT") + run_cmd(["./dev/check-license"]) + + +def run_scala_style_checks(): + set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE") + run_cmd(["./dev/lint-scala"]) + + +def run_python_style_checks(): + set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE") + run_cmd(["./dev/lint-python"]) + + +def exec_maven(mvn_args=[]): + """Will call Maven in the current directory with the list of mvn_args passed + in and returns the subprocess for any further processing""" + + return subprocess.Popen(["./build/mvn"] + mvn_args) + + +def exec_sbt(sbt_args=[]): + """Will call SBT in the current directory with the list of mvn_args passed + in and returns the subprocess for any further processing""" + + # NOTE: echo "q" is needed because sbt on encountering a build file + # with failure (either resolution or compilation) prompts the user for + # input either q, r, etc to quit or retry. This echo is there to make it + # not block. + echo_proc = subprocess.Popen(["echo", "\"q\n\""], stdout=subprocess.PIPE) + sbt_proc = subprocess.Popen(["./build/sbt"] + sbt_args, + stdin=echo_proc.stdout, + stdout=subprocess.PIPE) + echo_proc.wait() + for line in iter(sbt_proc.stdout.readline, ''): + if not sbt_output_filter.match(line): + print line, + return sbt_proc + + +def build_apache_spark(): + """Will first build Spark with Hive v0.12.0 to ensure the build is + successful and, after, will build Spark again against Hive v0.13.1 as the + tests are based off that""" + + set_title_and_block("Building Spark", "BLOCK_BUILD") + + sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() + hive_profile_args = sbt_maven_profile_args + ["-Phive", + "-Phive-thriftserver"] + hive_12_profile_args = hive_profile_args + ["-Phive-0.12.0"] + # set the default maven args + base_mvn_args = ["clean", "package", "-DskipTests"] + # set the necessary sbt goals + sbt_hive_12_goals = ["clean", "hive/compile", "hive-thriftserver/compile"] + sbt_hive_goals = ["package", + "assembly/assembly", + "streaming-kafka-assembly/assembly"] + + # First build with Hive 0.12.0 to ensure patches do not break the Hive + # 0.12.0 build + print "[info] Compile with Hive 0.12.0" + rm_r("lib_managed") + print "[info] Building Spark with these arguments:", + print " ".join(hive_12_profile_args) + + if amplab_jenkins_build_tool == "maven": + exec_maven(hive_12_profile_args + base_mvn_args).wait() + else: + exec_sbt(hive_12_profile_args + sbt_hive_12_goals).wait() + + # Then build with default Hive version (0.13.1) because tests are based on + # this version + print "[info] Compile with Hive 0.13.1" + rm_r("lib_managed") + print "[info] Building Spark with these arguments:", + print " ".join(hive_profile_args) + + if amplab_jenkins_build_tool == "maven": + exec_maven(hive_profile_args + base_mvn_args).wait() + else: + exec_sbt(hive_profile_args + sbt_hive_goals).wait() + + +def detect_binary_inop_with_mima(): + set_title_and_block("Detecting binary incompatibilities with MiMa", + "BLOCK_MIMA") + run_cmd(["./dev/mima"]) + + +def 
run_scala_tests(test_suite=[]): + """Function to properly execute all tests pass in, as a list, from the + `determine_test_suite` function""" + set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") + + # ensure the test_suite is a set + if not isinstance(test_suite, set): + test_suite = set(test_suite) + + # if the Spark SQL tests are enabled, run the tests with the Hive profiles + # enabled. + if "SQL" in test_suite: + sbt_maven_profile_args = \ + os.environ.get(sbt_maven_profile_args_env).split() + os.environ[sbt_maven_profile_args_env] = \ + " ".join(sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"]) + + # if we only have changes in SQL build a custom test string + if "SQL" in test_suite and "CORE" not in test_suite: + sbt_maven_test_args = ["catalyst/test", + "sql/test", + "hive/test", + "hive-thriftserver/test", + "mllib/test"] + else: + sbt_maven_test_args = ["test"] + + # get the latest sbt maven profile arguments + sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() + + print "[info] Running Spark tests with these arguments:", + print " ".join(sbt_maven_profile_args), + print " ".join(sbt_maven_test_args) + + if amplab_jenkins_build_tool == "maven": + exec_maven(["test"] + sbt_maven_profile_args + ["--fail-at-end"]).wait() + else: + exec_sbt(sbt_maven_profile_args + sbt_maven_test_args).wait() + + +def run_python_tests(test_suite=[]): + set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") + + # Add path for Python3 in Jenkins if we're calling from a Jenkins machine + if amplab_jenkins: + os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" + + run_cmd(["./python/run-tests"]) + + +def run_sparkr_tests(test_suite=[]): + set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") + + if which("R"): + run_cmd(["./R/install-dev.sh"]) + run_cmd(["./R/run-tests.sh"]) + else: + print "Ignoring SparkR tests as R was not found in PATH" + +if __name__ == "__main__": + # Ensure the user home directory (HOME) is valid and is an absolute directory + if not user_home_dir or not os.path.isabs(user_home_dir): + print "[error] Cannot determine your home directory as an absolute path;", + print "ensure the $HOME environment variable is set properly." + sys.exit(1) + + os.chdir(spark_proj_root) + + rm_r("./work") + rm_r(os.path.join(user_home_dir, ".ivy2/local/org.apache.spark")) + rm_r(os.path.join(user_home_dir, ".ivy2/cache/org.apache.spark")) + + error_codes = get_error_codes("./dev/run-tests-codes.sh") + + os.environ["CURRENT_BLOCK"] = error_codes["BLOCK_GENERAL"] + + set_sbt_maven_profile_args() + + java_exe = determine_java_executable() + + if not java_exe: + print "[error] Cannot find a version of `java` on the system; please", + print "install one and retry." + sys.exit(2) + + java_version = determine_java_version(java_exe) + + if java_version[1] < 8: + print "[warn] Java 8 tests will not run because JDK version is < 1.8." 
+ + test_suite = determine_test_suite() + + run_apache_rat_checks() + + run_scala_style_checks() + + run_python_style_checks() + + build_apache_spark() + + detect_binary_inop_with_mima() + + run_scala_tests(test_suite) + + run_python_tests() + + run_sparkr_tests() From 2cb413bcad65411323d4a2fa9d33217dafe0bd30 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 27 Apr 2015 08:40:45 -0700 Subject: [PATCH 04/52] upcased global variables, changes various calling methods from check_output to check_call --- dev/run-tests.py | 72 +++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 9485ab28ae598..08c1211a24b5f 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -23,21 +23,17 @@ import shutil import subprocess -spark_proj_root = \ +SPARK_PROJ_ROOT = \ os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") -user_home_dir = os.environ.get("HOME") +USER_HOME_DIR = os.environ.get("HOME") -sbt_maven_profile_args_env = "SBT_MAVEN_PROFILES_ARGS" -amplab_jenkins_build_tool_env = "AMPLAB_JENKINS_BUILD_TOOL" -amplab_jenkins_build_tool = os.environ.get(amplab_jenkins_build_tool_env) -amplab_jenkins = os.environ.get("AMPLAB_JENKINS") +SBT_MAVEN_PROFILE_ARGS_ENV = "SBT_MAVEN_PROFILES_ARGS" +AMPLAB_JENKINS_BUILD_TOOL = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL") +AMPLAB_JENKINS = os.environ.get("AMPLAB_JENKINS") -resolving_re = "^.*[info].*Resolving" -merging_re = "^.*[warn].*Merging" -including_re = "^.*[info].*Including" -sbt_output_filter = re.compile(resolving_re + "|" + - merging_re + "|" + - including_re) +SBT_OUTPUT_FILTER = re.compile("^.*[info].*Resolving" + "|" + + "^.*[warn].*Merging" + "|" + + "^.*[info].*Including") def get_error_codes(err_code_file): @@ -76,7 +72,7 @@ def run_cmd(cmd): if not isinstance(cmd, list): cmd = cmd.split() try: - subprocess.check_output(cmd) + subprocess.check_call(cmd) except subprocess.CalledProcessError as e: print "[error] running", e.cmd, "; received return code", e.returncode sys.exit(e.returncode) @@ -86,7 +82,7 @@ def set_sbt_maven_profile_args(): """Properly sets the SBT environment variable arguments with additional checks to determine if this is running on an Amplab Jenkins machine""" - # base environment values for sbt_maven_profile_args_env which will be appended on + # base environment values for SBT_MAVEN_PROFILE_ARGS_ENV which will be appended on sbt_maven_profile_args_base = ["-Pkinesis-asl"] sbt_maven_profile_arg_dict = { @@ -99,11 +95,11 @@ def set_sbt_maven_profile_args(): # set the SBT maven build profile argument environment variable and ensure # we build against the right version of Hadoop if os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE"): - os.environ[sbt_maven_profile_args_env] = \ + os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \ " ".join(sbt_maven_profile_arg_dict.get(ajbp, []) + sbt_maven_profile_args_base) else: - os.environ[sbt_maven_profile_args_env] = \ + os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \ " ".join(sbt_maven_profile_arg_dict.get("hadoop2.3", []) + sbt_maven_profile_args_base) @@ -176,7 +172,7 @@ def determine_test_suite(): @return a set of unique test names""" test_suite = list() - if amplab_jenkins: + if AMPLAB_JENKINS: run_cmd(['git', 'fetch', 'origin', 'master:master']).wait() raw_output = subprocess.check_output(['git', 'diff', '--name-only', 'master']) @@ -209,7 +205,7 @@ def determine_test_suite(): def set_title_and_block(title, err_block): os.environ["CURRENT_BLOCK"] = error_codes[err_block] - line_str = 
"".join(['='] * 72) + line_str = '=' * 72 print print line_str @@ -236,7 +232,7 @@ def exec_maven(mvn_args=[]): """Will call Maven in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" - return subprocess.Popen(["./build/mvn"] + mvn_args) + run_cmd(["./build/mvn"] + mvn_args) def exec_sbt(sbt_args=[]): @@ -253,9 +249,9 @@ def exec_sbt(sbt_args=[]): stdout=subprocess.PIPE) echo_proc.wait() for line in iter(sbt_proc.stdout.readline, ''): - if not sbt_output_filter.match(line): + if not SBT_OUTPUT_FILTER.match(line): print line, - return sbt_proc + sbt_proc.wait() def build_apache_spark(): @@ -265,7 +261,7 @@ def build_apache_spark(): set_title_and_block("Building Spark", "BLOCK_BUILD") - sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() + sbt_maven_profile_args = os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split() hive_profile_args = sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"] hive_12_profile_args = hive_profile_args + ["-Phive-0.12.0"] @@ -284,8 +280,8 @@ def build_apache_spark(): print "[info] Building Spark with these arguments:", print " ".join(hive_12_profile_args) - if amplab_jenkins_build_tool == "maven": - exec_maven(hive_12_profile_args + base_mvn_args).wait() + if AMPLAB_JENKINS_BUILD_TOOL == "maven": + exec_maven(hive_12_profile_args + base_mvn_args) else: exec_sbt(hive_12_profile_args + sbt_hive_12_goals).wait() @@ -296,10 +292,10 @@ def build_apache_spark(): print "[info] Building Spark with these arguments:", print " ".join(hive_profile_args) - if amplab_jenkins_build_tool == "maven": - exec_maven(hive_profile_args + base_mvn_args).wait() + if AMPLAB_JENKINS_BUILD_TOOL == "maven": + exec_maven(hive_profile_args + base_mvn_args) else: - exec_sbt(hive_profile_args + sbt_hive_goals).wait() + exec_sbt(hive_profile_args + sbt_hive_goals) def detect_binary_inop_with_mima(): @@ -321,8 +317,8 @@ def run_scala_tests(test_suite=[]): # enabled. 
if "SQL" in test_suite: sbt_maven_profile_args = \ - os.environ.get(sbt_maven_profile_args_env).split() - os.environ[sbt_maven_profile_args_env] = \ + os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split() + os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \ " ".join(sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"]) # if we only have changes in SQL build a custom test string @@ -336,23 +332,23 @@ def run_scala_tests(test_suite=[]): sbt_maven_test_args = ["test"] # get the latest sbt maven profile arguments - sbt_maven_profile_args = os.environ.get(sbt_maven_profile_args_env).split() + sbt_maven_profile_args = os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split() print "[info] Running Spark tests with these arguments:", print " ".join(sbt_maven_profile_args), print " ".join(sbt_maven_test_args) - if amplab_jenkins_build_tool == "maven": - exec_maven(["test"] + sbt_maven_profile_args + ["--fail-at-end"]).wait() + if AMPLAB_JENKINS_BUILD_TOOL == "maven": + exec_maven(["test"] + sbt_maven_profile_args + ["--fail-at-end"]) else: - exec_sbt(sbt_maven_profile_args + sbt_maven_test_args).wait() + exec_sbt(sbt_maven_profile_args + sbt_maven_test_args) def run_python_tests(test_suite=[]): set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") # Add path for Python3 in Jenkins if we're calling from a Jenkins machine - if amplab_jenkins: + if AMPLAB_JENKINS: os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" run_cmd(["./python/run-tests"]) @@ -369,16 +365,16 @@ def run_sparkr_tests(test_suite=[]): if __name__ == "__main__": # Ensure the user home directory (HOME) is valid and is an absolute directory - if not user_home_dir or not os.path.isabs(user_home_dir): + if not USER_HOME_DIR or not os.path.isabs(USER_HOME_DIR): print "[error] Cannot determine your home directory as an absolute path;", print "ensure the $HOME environment variable is set properly." 
sys.exit(1) - os.chdir(spark_proj_root) + os.chdir(SPARK_PROJ_ROOT) rm_r("./work") - rm_r(os.path.join(user_home_dir, ".ivy2/local/org.apache.spark")) - rm_r(os.path.join(user_home_dir, ".ivy2/cache/org.apache.spark")) + rm_r(os.path.join(USER_HOME_DIR, ".ivy2/local/org.apache.spark")) + rm_r(os.path.join(USER_HOME_DIR, ".ivy2/cache/org.apache.spark")) error_codes = get_error_codes("./dev/run-tests-codes.sh") From ec03bf3b1e1194a8c0dc1b3a9cd83fd82b67ad4a Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 27 Apr 2015 09:36:22 -0700 Subject: [PATCH 05/52] added namedtuple for java version to add readability --- dev/run-tests.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 08c1211a24b5f..59f2306d71de6 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -22,6 +22,7 @@ import sys import shutil import subprocess +from collections import namedtuple SPARK_PROJ_ROOT = \ os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") @@ -144,13 +145,22 @@ def determine_java_version(java_exe): """Given a valid java executable will return its version in tuple format as: [, , , ]""" - raw_output = subprocess.check_output([java_exe, "-version"], stderr=subprocess.STDOUT) + raw_output = subprocess.check_output([java_exe, "-version"], + stderr=subprocess.STDOUT) raw_version_str = raw_output.split('\n')[0] # eg 'java version "1.8.0_25"' version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25' version, update = version_str.split('_') # eg ['1.8.0', '25'] + JavaVersion = namedtuple('JavaVersion', + ['major', 'minor', 'patch', 'update']) + # map over the values and convert them to integers - return map(lambda x: int(x), version.split('.') + [update]) + version_info = map(lambda x: int(x), version.split('.') + [update]) + + return JavaVersion(major=version_info[0], + minor=version_info[1], + patch=version_info[2], + update=version_info[3]) def multi_starts_with(orig_str, *prefixes): @@ -391,7 +401,7 @@ def run_sparkr_tests(test_suite=[]): java_version = determine_java_version(java_exe) - if java_version[1] < 8: + if java_version.minor < 8: print "[warn] Java 8 tests will not run because JDK version is < 1.8." 
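# A minimal, standalone sketch of the version parsing this namedtuple change
# performs. The sample `java -version` line is the one quoted in the patch's
# own inline comments ('java version "1.8.0_25"'); it is illustrative only,
# not output captured from a real JVM.
from collections import namedtuple

JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch', 'update'])

raw_version_str = 'java version "1.8.0_25"'            # first line of `java -version`
version_str = raw_version_str.split()[-1].strip('"')   # '1.8.0_25'
version, update = version_str.split('_')               # '1.8.0', '25'
# equivalent to the patch's map(lambda x: int(x), ...)
version_info = [int(x) for x in version.split('.') + [update]]

print JavaVersion(*version_info)  # JavaVersion(major=1, minor=8, patch=0, update=25)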
test_suite = determine_test_suite() From 07210a90a83c153cb562897477d8dd5cf92e94da Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 27 Apr 2015 09:39:21 -0700 Subject: [PATCH 06/52] minor doc string change for java version with namedtuple update --- dev/run-tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 59f2306d71de6..9106f7f0cf333 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -142,8 +142,8 @@ def determine_java_executable(): def determine_java_version(java_exe): - """Given a valid java executable will return its version in tuple format as: - [, , , ]""" + """Given a valid java executable will return its version in named tuple format + with accessors '.major', '.minor', '.patch', '.update'""" raw_output = subprocess.check_output([java_exe, "-version"], stderr=subprocess.STDOUT) From 26e18e8b7bf101f28932f2efbe52f830f0a7405a Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 27 Apr 2015 11:24:44 -0700 Subject: [PATCH 07/52] removed unnecessary wait() --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 9106f7f0cf333..d4009731697fa 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -183,7 +183,7 @@ def determine_test_suite(): test_suite = list() if AMPLAB_JENKINS: - run_cmd(['git', 'fetch', 'origin', 'master:master']).wait() + run_cmd(['git', 'fetch', 'origin', 'master:master']) raw_output = subprocess.check_output(['git', 'diff', '--name-only', 'master']) # remove any empty strings From c095fa665e2c7c9f0487404a551844452172ad22 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 27 Apr 2015 14:30:48 -0700 Subject: [PATCH 08/52] removed another wait() call --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index d4009731697fa..4fb77f43cfb16 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -293,7 +293,7 @@ def build_apache_spark(): if AMPLAB_JENKINS_BUILD_TOOL == "maven": exec_maven(hive_12_profile_args + base_mvn_args) else: - exec_sbt(hive_12_profile_args + sbt_hive_12_goals).wait() + exec_sbt(hive_12_profile_args + sbt_hive_12_goals) # Then build with default Hive version (0.13.1) because tests are based on # this version From 83e80ef4eec49dcee7c55900e4cbcf9b899aea65 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 27 Apr 2015 14:31:58 -0700 Subject: [PATCH 09/52] attempt at better python output when called from bash --- dev/run-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests b/dev/run-tests index 844ff6a0d9757..3bb677b6bce0d 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -20,4 +20,4 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -./dev/run-tests.py +python -u ./dev/run-tests.py From b0b2604595768a4989b4fd802846fb18a4572f21 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 28 Apr 2015 12:48:59 -0700 Subject: [PATCH 10/52] comment out import to see if build fails and returns properly --- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 86269eac52db0..b46bd3ae31be8 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -39,7 +39,7 @@ import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, Doub 
FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} +//import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.mesos.MesosNativeLibrary From 803143a16ebf0e3e9c41448a1dcea053e3d646c1 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 28 Apr 2015 13:41:22 -0700 Subject: [PATCH 11/52] removed license file for SparkContext --- .../scala/org/apache/spark/SparkContext.scala | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index b46bd3ae31be8..70ba871ba77f0 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1,20 +1,3 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - package org.apache.spark import scala.language.implicitConversions @@ -39,7 +22,7 @@ import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, Doub FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -//import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} +import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.mesos.MesosNativeLibrary From a5bd4455b49f2428e3b28e04bcd1f883d0e49a05 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 28 Apr 2015 16:21:40 -0700 Subject: [PATCH 12/52] reverted license, changed test in shuffle to fail --- .../scala/org/apache/spark/SparkContext.scala | 17 +++++++++++++++++ .../scala/org/apache/spark/ShuffleSuite.scala | 3 ++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 70ba871ba77f0..86269eac52db0 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark import scala.language.implicitConversions diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala index d7180516029d5..629f74d91c5cf 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala @@ -84,7 +84,8 @@ abstract class ShuffleSuite extends FunSuite with Matchers with LocalSparkContex NonJavaSerializableClass, NonJavaSerializableClass](b, new HashPartitioner(3)) c.setSerializer(new KryoSerializer(conf)) - assert(c.count === 10) + // assert(c.count === 10) + assert(c.count === 42) } test("zero sized blocks") { From 7613558fbb962a73085352d5c5010de9ee204809 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 10:38:21 -0700 Subject: [PATCH 13/52] updated to return the proper env variable for return codes --- dev/run-tests.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 4fb77f43cfb16..da99c7a84024e 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -48,6 +48,11 @@ def get_error_codes(err_code_file): return dict(err_codes) +def exit_from_command_with_retcode(cmd, retcode): + print "[error] running", cmd, "; received return code", retcode + sys.exit(os.environ["BLOCK_CURRENT"]) + + def rm_r(path): """Given an arbitrary path properly remove it with the correct python construct if it exists @@ -75,8 +80,7 @@ def run_cmd(cmd): try: subprocess.check_call(cmd) except subprocess.CalledProcessError as e: - print "[error] running", e.cmd, "; received return code", e.returncode - sys.exit(e.returncode) + exit_from_command_with_retcode(e.cmd, e.returncode) def set_sbt_maven_profile_args(): @@ -249,19 +253,24 @@ def exec_sbt(sbt_args=[]): """Will call SBT in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" + sbt_cmd = ["./build/sbt"] + sbt_args + # NOTE: echo "q" is needed because sbt on encountering a build file # with failure (either resolution or compilation) prompts the user for # input either q, r, etc to quit or retry. This echo is there to make it # not block. 
echo_proc = subprocess.Popen(["echo", "\"q\n\""], stdout=subprocess.PIPE) - sbt_proc = subprocess.Popen(["./build/sbt"] + sbt_args, + sbt_proc = subprocess.Popen(sbt_cmd, stdin=echo_proc.stdout, stdout=subprocess.PIPE) echo_proc.wait() for line in iter(sbt_proc.stdout.readline, ''): if not SBT_OUTPUT_FILTER.match(line): print line, - sbt_proc.wait() + retcode = sbt_proc.wait() + + if retcode > 0: + exit_from_command_with_retcode(sbt_cmd, retcode) def build_apache_spark(): From b37328ccf61765469fb6d5f20115520e48a60ff4 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 11:57:33 -0700 Subject: [PATCH 14/52] fixed typo and added default return is no error block was found in the environment --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index da99c7a84024e..b25089afb0ff8 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -50,7 +50,7 @@ def get_error_codes(err_code_file): def exit_from_command_with_retcode(cmd, retcode): print "[error] running", cmd, "; received return code", retcode - sys.exit(os.environ["BLOCK_CURRENT"]) + sys.exit(os.environ.get("CURRENT_BLOCK", 255)) def rm_r(path): From 56d3cb93f30c2f5919d0f63dcd0edc4185b8cee7 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 13:16:52 -0700 Subject: [PATCH 15/52] changed test back and commented out import to break compile --- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- core/src/test/scala/org/apache/spark/ShuffleSuite.scala | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 86269eac52db0..b46bd3ae31be8 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -39,7 +39,7 @@ import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, Doub FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} +//import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.mesos.MesosNativeLibrary diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala index 629f74d91c5cf..d7180516029d5 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala @@ -84,8 +84,7 @@ abstract class ShuffleSuite extends FunSuite with Matchers with LocalSparkContex NonJavaSerializableClass, NonJavaSerializableClass](b, new HashPartitioner(3)) c.setSerializer(new KryoSerializer(conf)) - // assert(c.count === 10) - assert(c.count === 42) + assert(c.count === 10) } test("zero sized blocks") { From e4a96cc0ffc43515f62ce2036fc93eb9ea2c7535 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 13:43:48 -0700 Subject: [PATCH 16/52] removed the import error and added license error, fixed the way run-tests and run-tests.py report their error codes --- .../main/scala/org/apache/spark/SparkConf.scala | 17 ----------------- .../scala/org/apache/spark/SparkContext.scala | 2 +- dev/run-tests | 4 ++++ dev/run-tests.py | 2 +- 4 files changed, 6 
insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index e3a649d755450..a08fcd6658721 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -1,20 +1,3 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - package org.apache.spark import java.util.concurrent.ConcurrentHashMap diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index b46bd3ae31be8..86269eac52db0 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -39,7 +39,7 @@ import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, Doub FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -//import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} +import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.mesos.MesosNativeLibrary diff --git a/dev/run-tests b/dev/run-tests index 3bb677b6bce0d..b9002ade42160 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -21,3 +21,7 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" python -u ./dev/run-tests.py + +# exit from this script with the return code from the python script to ensure +# dev/run-tests-jenkins reports the correct error +exit $? 
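For readers tracing the error-code plumbing this patch leans on, a minimal sketch of the round trip from `dev/run-tests-codes.sh` to the shell's final exit status follows. The `readonly` lines and their numeric values are made-up placeholders; only the parsing and exit logic mirrors the script.

    import os
    import sys

    # hypothetical excerpt of dev/run-tests-codes.sh (values invented)
    codes_sh_lines = [
        "readonly BLOCK_GENERAL=10",
        "readonly BLOCK_BUILD=14",
    ]

    # same parsing idea as get_error_codes(): keep `readonly NAME=VALUE` lines
    error_codes = dict(line.split()[1].strip().split('=')
                       for line in codes_sh_lines
                       if line.startswith("readonly"))

    # set_title_and_block() records the block currently running ...
    os.environ["CURRENT_BLOCK"] = error_codes["BLOCK_BUILD"]

    # ... and a failure exits with that code, which the bash wrapper's
    # `exit $?` hands straight back to dev/run-tests-jenkins
    sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))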
diff --git a/dev/run-tests.py b/dev/run-tests.py index b25089afb0ff8..973655943c086 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -50,7 +50,7 @@ def get_error_codes(err_code_file): def exit_from_command_with_retcode(cmd, retcode): print "[error] running", cmd, "; received return code", retcode - sys.exit(os.environ.get("CURRENT_BLOCK", 255)) + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) def rm_r(path): From 76335fb9f5a86c4eb8d07cd26fcfc6321fa93084 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 14:12:55 -0700 Subject: [PATCH 17/52] reverted rat license issue for sparkconf --- .../main/scala/org/apache/spark/SparkConf.scala | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index a08fcd6658721..e3a649d755450 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark import java.util.concurrent.ConcurrentHashMap From 983f2a2a99bf5bc9fce6547dcc089e37a8d8ebfc Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 14:13:34 -0700 Subject: [PATCH 18/52] comment out import to fail build test --- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 5ae8fb81de809..c02fc8aebd4a7 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -39,7 +39,7 @@ import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, Doub FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} +//import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.mesos.MesosNativeLibrary From f041d8af6a6300934134a8240f6c2621b6d1b825 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 14:23:07 -0700 Subject: [PATCH 19/52] added space from commented import to now test build breaking --- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index c02fc8aebd4a7..f28ddbfbe15a8 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -39,7 +39,7 @@ import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, Doub FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -//import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} +// import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.mesos.MesosNativeLibrary From d825aa4e6dcf5c3137dc0e008dace87ef4e28dc6 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 29 Apr 2015 14:58:32 -0700 Subject: [PATCH 20/52] revert build break, add mima break --- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- project/MimaExcludes.scala | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index f28ddbfbe15a8..5ae8fb81de809 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -39,7 +39,7 @@ import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, Doub FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -// import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} +import org.apache.hadoop.mapreduce.{InputFormat => 
NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.mesos.MesosNativeLibrary diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 3beafa158eb97..68301aea642e3 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -85,9 +85,9 @@ object MimaExcludes { ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.mllib.linalg.Vector.numNonzeros"), ProblemFilters.exclude[MissingMethodProblem]( - "org.apache.spark.mllib.linalg.Vector.toSparse"), - ProblemFilters.exclude[MissingMethodProblem]( - "org.apache.spark.mllib.linalg.Vector.numActives") + "org.apache.spark.mllib.linalg.Vector.toSparse") + // ProblemFilters.exclude[MissingMethodProblem]( + // "org.apache.spark.mllib.linalg.Vector.numActives") ) case v if v.startsWith("1.3") => From 9a592ec89165a1797196a74cfea053982c52d884 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Thu, 30 Apr 2015 08:59:32 -0700 Subject: [PATCH 21/52] reverted mima exclude issue, added pyspark test failure --- project/MimaExcludes.scala | 6 +++--- python/pyspark/tests.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 68301aea642e3..3beafa158eb97 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -85,9 +85,9 @@ object MimaExcludes { ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.mllib.linalg.Vector.numNonzeros"), ProblemFilters.exclude[MissingMethodProblem]( - "org.apache.spark.mllib.linalg.Vector.toSparse") - // ProblemFilters.exclude[MissingMethodProblem]( - // "org.apache.spark.mllib.linalg.Vector.numActives") + "org.apache.spark.mllib.linalg.Vector.toSparse"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.mllib.linalg.Vector.numActives") ) case v if v.startsWith("1.3") => diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index ea63a396da5b8..f617b555e174f 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -150,7 +150,8 @@ def gen_data(N, step): def gen_gs(N, step=1): return shuffle.GroupByKey(gen_data(N, step)) - self.assertEqual(1, len(list(gen_gs(1)))) + self.assertEqual(42, len(list(gen_gs(1)))) + # self.assertEqual(1, len(list(gen_gs(1)))) self.assertEqual(2, len(list(gen_gs(2)))) self.assertEqual(100, len(list(gen_gs(100)))) self.assertEqual(list(range(1, 101)), [k for k, _ in gen_gs(100)]) From 1dada6b5683be3dccb7173d3ed152f5c31614cb0 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Thu, 30 Apr 2015 10:03:18 -0700 Subject: [PATCH 22/52] reverted pyspark test failure --- python/pyspark/tests.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index f617b555e174f..ea63a396da5b8 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -150,8 +150,7 @@ def gen_data(N, step): def gen_gs(N, step=1): return shuffle.GroupByKey(gen_data(N, step)) - self.assertEqual(42, len(list(gen_gs(1)))) - # self.assertEqual(1, len(list(gen_gs(1)))) + self.assertEqual(1, len(list(gen_gs(1)))) self.assertEqual(2, len(list(gen_gs(2)))) self.assertEqual(100, len(list(gen_gs(100)))) self.assertEqual(list(range(1, 101)), [k for k, _ in gen_gs(100)]) From afeb09388840d747e8954539578f9be5d5b219d1 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Thu, 30 Apr 2015 10:05:40 -0700 Subject: [PATCH 23/52] updated to make sparkR test fail --- R/pkg/inst/tests/test_rdd.R | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/test_rdd.R b/R/pkg/inst/tests/test_rdd.R index 03207353c31c6..5363009ba1b97 100644 --- a/R/pkg/inst/tests/test_rdd.R +++ b/R/pkg/inst/tests/test_rdd.R @@ -28,7 +28,8 @@ intPairs <- list(list(1L, -1), list(2L, 100), list(2L, 1), list(1L, 200)) intRdd <- parallelize(sc, intPairs, 2L) test_that("get number of partitions in RDD", { - expect_equal(numPartitions(rdd), 2) + expect_equal(numPartitions(rdd), 42) + # expect_equal(numPartitions(rdd), 2) expect_equal(numPartitions(intRdd), 2) }) From b1ca59375446555d9bfdbb594d2e6364820bcd26 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Fri, 1 May 2015 09:46:06 -0700 Subject: [PATCH 24/52] reverted the sparkR test --- R/pkg/inst/tests/test_rdd.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/pkg/inst/tests/test_rdd.R b/R/pkg/inst/tests/test_rdd.R index 5363009ba1b97..03207353c31c6 100644 --- a/R/pkg/inst/tests/test_rdd.R +++ b/R/pkg/inst/tests/test_rdd.R @@ -28,8 +28,7 @@ intPairs <- list(list(1L, -1), list(2L, 100), list(2L, 1), list(1L, 200)) intRdd <- parallelize(sc, intPairs, 2L) test_that("get number of partitions in RDD", { - expect_equal(numPartitions(rdd), 42) - # expect_equal(numPartitions(rdd), 2) + expect_equal(numPartitions(rdd), 2) expect_equal(numPartitions(intRdd), 2) }) From f950010d585ce9744c79d76dfa8dfbe61079e0a2 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 11 May 2015 16:31:28 -0700 Subject: [PATCH 25/52] removed building hive-0.12.0 per SPARK-6908 --- dev/run-tests.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 973655943c086..f76e3ffff55d2 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -283,27 +283,13 @@ def build_apache_spark(): sbt_maven_profile_args = os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split() hive_profile_args = sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"] - hive_12_profile_args = hive_profile_args + ["-Phive-0.12.0"] # set the default maven args base_mvn_args = ["clean", "package", "-DskipTests"] # set the necessary sbt goals - sbt_hive_12_goals = ["clean", "hive/compile", "hive-thriftserver/compile"] sbt_hive_goals = ["package", "assembly/assembly", "streaming-kafka-assembly/assembly"] - # First build with Hive 0.12.0 to ensure patches do not break the Hive - # 0.12.0 build - print "[info] Compile with Hive 0.12.0" - rm_r("lib_managed") - print "[info] Building Spark with these arguments:", - print " ".join(hive_12_profile_args) - - if AMPLAB_JENKINS_BUILD_TOOL == "maven": - exec_maven(hive_12_profile_args + base_mvn_args) - else: - exec_sbt(hive_12_profile_args + sbt_hive_12_goals) - # Then build with default Hive version (0.13.1) because tests are based on # this version print "[info] Compile with Hive 0.13.1" From f9deba1cb25ae10fc9ea29cf9fe319bae521244c Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 19 May 2015 12:16:07 -0700 Subject: [PATCH 26/52] python to python2 and removed newline --- dev/run-tests.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 3dd09b8714171..0a178e655924d 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 # # Licensed to the Apache Software Foundation (ASF) under one or more @@ -24,8 +24,7 @@ import subprocess from collections import namedtuple -SPARK_PROJ_ROOT = \ - os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") +SPARK_PROJ_ROOT = 
os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") USER_HOME_DIR = os.environ.get("HOME") SBT_MAVEN_PROFILE_ARGS_ENV = "SBT_MAVEN_PROFILES_ARGS" From b1248dc582124114bda5cf4dad9bc76ad229c311 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Thu, 21 May 2015 11:28:27 -0700 Subject: [PATCH 27/52] exec python rather than running python and exiting with return code --- dev/run-tests | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dev/run-tests b/dev/run-tests index b9002ade42160..a00d9f0c27639 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -20,8 +20,4 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -python -u ./dev/run-tests.py - -# exit from this script with the return code from the python script to ensure -# dev/run-tests-jenkins reports the correct error -exit $? +exec python -u ./dev/run-tests.py From 0629de8a81a07bcf78c50360af5354e7f5062fd0 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Fri, 5 Jun 2015 11:20:23 -0700 Subject: [PATCH 28/52] updated to refactor and remove various small bugs, removed pep8 complaints --- dev/run-tests.py | 388 ++++++++++++++++++++++++++--------------------- 1 file changed, 211 insertions(+), 177 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 0a178e655924d..38859da5486f5 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -24,25 +24,21 @@ import subprocess from collections import namedtuple -SPARK_PROJ_ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") -USER_HOME_DIR = os.environ.get("HOME") +SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") +USER_HOME = os.environ.get("HOME") -SBT_MAVEN_PROFILE_ARGS_ENV = "SBT_MAVEN_PROFILES_ARGS" -AMPLAB_JENKINS_BUILD_TOOL = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL") -AMPLAB_JENKINS = os.environ.get("AMPLAB_JENKINS") - -SBT_OUTPUT_FILTER = re.compile("^.*[info].*Resolving" + "|" + - "^.*[warn].*Merging" + "|" + - "^.*[info].*Including") +#SBT_MAVEN_PROFILE_ARGS_ENV = "SBT_MAVEN_PROFILES_ARGS" +#AMPLAB_JENKINS_BUILD_TOOL = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") +#AMPLAB_JENKINS = os.environ.get("AMPLAB_JENKINS") def get_error_codes(err_code_file): """Function to retrieve all block numbers from the `run-tests-codes.sh` - file to maintain backwards compatibility with the `run-tests-jenkins` + file to maintain backwards compatibility with the `run-tests-jenkins` script""" - + with open(err_code_file, 'r') as f: - err_codes = [e.split()[1].strip().split('=') + err_codes = [e.split()[1].strip().split('=') for e in f if e.startswith("readonly")] return dict(err_codes) @@ -63,13 +59,6 @@ def rm_r(path): os.remove(path) -def lineno(): - """Returns the current line number in our program - - from: http://stackoverflow.com/a/3056059""" - - return inspect.currentframe().f_back.f_lineno - - def run_cmd(cmd): """Given a command as a list of arguments will attempt to execute the command and, on failure, print an error message""" @@ -82,32 +71,6 @@ def run_cmd(cmd): exit_from_command_with_retcode(e.cmd, e.returncode) -def set_sbt_maven_profile_args(): - """Properly sets the SBT environment variable arguments with additional - checks to determine if this is running on an Amplab Jenkins machine""" - - # base environment values for SBT_MAVEN_PROFILE_ARGS_ENV which will be appended on - sbt_maven_profile_args_base = ["-Pkinesis-asl"] - - sbt_maven_profile_arg_dict = { - "hadoop1.0" : ["-Phadoop-1", "-Dhadoop.version=1.0.4"], - "hadoop2.0" : ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], - "hadoop2.2" : 
["-Pyarn", "-Phadoop-2.2"], - "hadoop2.3" : ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], - } - - # set the SBT maven build profile argument environment variable and ensure - # we build against the right version of Hadoop - if os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE"): - os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \ - " ".join(sbt_maven_profile_arg_dict.get(ajbp, []) - + sbt_maven_profile_args_base) - else: - os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \ - " ".join(sbt_maven_profile_arg_dict.get("hadoop2.3", []) - + sbt_maven_profile_args_base) - - def is_exe(path): """Check if a given path is an executable file - from: http://stackoverflow.com/a/377028""" @@ -134,7 +97,12 @@ def which(program): def determine_java_executable(): - """Will return the *best* path possible for a 'java' executable or `None`""" + """Will return the path of the java executable that will be used by Spark's + tests or `None`""" + + # Any changes in the way that Spark's build detects java must be reflected + # here. Currently the build looks for $JAVA_HOME/bin/java then falls back to + # the `java` executable on the path java_home = os.environ.get("JAVA_HOME") @@ -144,21 +112,21 @@ def determine_java_executable(): return java_exe if java_exe else which("java") +JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch', 'update']) + + def determine_java_version(java_exe): """Given a valid java executable will return its version in named tuple format with accessors '.major', '.minor', '.patch', '.update'""" - raw_output = subprocess.check_output([java_exe, "-version"], + raw_output = subprocess.check_output([java_exe, "-version"], stderr=subprocess.STDOUT) - raw_version_str = raw_output.split('\n')[0] # eg 'java version "1.8.0_25"' - version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25' - version, update = version_str.split('_') # eg ['1.8.0', '25'] - - JavaVersion = namedtuple('JavaVersion', - ['major', 'minor', 'patch', 'update']) + raw_version_str = raw_output.split('\n')[0] # eg 'java version "1.8.0_25"' + version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25' + version, update = version_str.split('_') # eg ['1.8.0', '25'] # map over the values and convert them to integers - version_info = map(lambda x: int(x), version.split('.') + [update]) + version_info = [int(x) for x in version.split('.') + [update]] return JavaVersion(major=version_info[0], minor=version_info[1], @@ -166,56 +134,6 @@ def determine_java_version(java_exe): update=version_info[3]) -def multi_starts_with(orig_str, *prefixes): - """Takes a string and an abritrary number of prefixes then checks the - original string for any of the possible prefixes passed in""" - - for s in prefixes: - if orig_str.startswith(s): - return True - return False - - -def determine_test_suite(): - """This function current acts to determine if SQL tests need to be run in - addition to the core test suite *or* if _only_ SQL tests need to be run - as the git logs show that to be the only thing touched. In the future - this function will act more generically to help further segregate the - test suite runner (hence the function name). 
- @return a set of unique test names""" - test_suite = list() - - if AMPLAB_JENKINS: - run_cmd(['git', 'fetch', 'origin', 'master:master']) - - raw_output = subprocess.check_output(['git', 'diff', '--name-only', 'master']) - # remove any empty strings - changed_files = [f for f in raw_output.split('\n') if f] - - # find any sql files - sql_files = [f for f in changed_files - if multi_starts_with(f, - "sql/", - "bin/spark-sql", - "sbin/start-thriftserver.sh")] - - non_sql_files = set(changed_files).difference(set(sql_files)) - - if non_sql_files: - test_suite.append("CORE") - if sql_files: - print "[info] Detected changes in SQL. Will run Hive test suite." - test_suite.append("SQL") - if not non_sql_files: - print "[info] Detected no changes except in SQL. Will only run SQL tests." - return set(test_suite) - else: - # we aren't in the Amplab environment so merely run all tests - test_suite.append("CORE") - test_suite.append("SQL") - return set(test_suite) - - def set_title_and_block(title, err_block): os.environ["CURRENT_BLOCK"] = error_codes[err_block] line_str = '=' * 72 @@ -254,6 +172,10 @@ def exec_sbt(sbt_args=[]): sbt_cmd = ["./build/sbt"] + sbt_args + sbt_output_filter = re.compile("^.*[info].*Resolving" + "|" + + "^.*[warn].*Merging" + "|" + + "^.*[info].*Including") + # NOTE: echo "q" is needed because sbt on encountering a build file # with failure (either resolution or compilation) prompts the user for # input either q, r, etc to quit or retry. This echo is there to make it @@ -264,42 +186,90 @@ def exec_sbt(sbt_args=[]): stdout=subprocess.PIPE) echo_proc.wait() for line in iter(sbt_proc.stdout.readline, ''): - if not SBT_OUTPUT_FILTER.match(line): - print line, + if not sbt_output_filter.match(line): + print line, retcode = sbt_proc.wait() if retcode > 0: exit_from_command_with_retcode(sbt_cmd, retcode) -def build_apache_spark(): - """Will first build Spark with Hive v0.12.0 to ensure the build is - successful and, after, will build Spark again against Hive v0.13.1 as the - tests are based off that""" +def get_hadoop_profiles(hadoop_version): + """Return a list of profiles indicating which Hadoop version to use from a Hadoop version tag.""" + + #amplab_jenkins_build_profile = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE") + + sbt_maven_hadoop_profiles = { + "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.0.4"], + "hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], + "hadoop2.2": ["-Pyarn", "-Phadoop-2.2"], + "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], + } + + try: + hadoop_profiles = sbt_maven_hadoop_profiles[hadoop_version] + except KeyError: + print "[error] Could not find", hadoop_version, "in the list. Valid options", + print "are 'hadoop1.0', 'hadoop2.0', 'hadoop2.2', and 'hadoop2.3'." 
+ sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) + + return hadoop_profiles + + +def get_build_profiles(hadoop_version="hadoop2.3", + base_profiles=True, + hive_profiles=False): + """Returns a list of hadoop profiles to be used as looked up from the passed in hadoop profile + key with the option of adding on the base and hive profiles.""" + + base_profiles = ["-Pkinesis-asl"] + hive_profiles = ["-Phive", "-Phive-thriftserver"] + hadoop_profiles = get_hadoop_profiles(hadoop_version) + + # first, check and add the base profiles + if base_profiles: build_profiles = build_profile + base_profiles + # second, check and add the hive profiles + if hive_profiles: build_profiles = build_profile + hive_profiles + + return build_profiles + + +def build_spark_maven(hadoop_version): + build_profiles = get_build_profiles(hadoop_version, hive_profiles=True) + mvn_goals = ["clean", "package", "-DskipTests"] + profiles_and_goals = build_profiles + mvn_goals + + print "[info] Building Spark (w/Hive 0.13.1) with these arguments:", + print " ".join(profiles_and_goals) + + exec_maven(profiles_and_goals) + + +def build_spark_sbt(hadoop_version): + build_profiles = get_build_profiles(hadoop_version, hive_profiles=True) + sbt_goals = ["package", + "assembly/assembly", + "streaming-kafka-assembly/assembly"] + profiles_and_goals = build_profiles + sbt_goals + + print "[info] Building Spark (w/Hive 0.13.1) with these arguments:", + print " ".join(profiles_and_goals) + + exec_sbt(profiles_and_goals) + + +def build_apache_spark(build_tool, hadoop_version): + """Will build Spark against Hive v0.13.1 given the passed in build tool (either `sbt` or + `maven`). Defaults to using `sbt`.""" set_title_and_block("Building Spark", "BLOCK_BUILD") - sbt_maven_profile_args = os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split() - hive_profile_args = sbt_maven_profile_args + ["-Phive", - "-Phive-thriftserver"] - # set the default maven args - base_mvn_args = ["clean", "package", "-DskipTests"] - # set the necessary sbt goals - sbt_hive_goals = ["package", - "assembly/assembly", - "streaming-kafka-assembly/assembly"] - - # Then build with default Hive version (0.13.1) because tests are based on - # this version - print "[info] Compile with Hive 0.13.1" rm_r("lib_managed") - print "[info] Building Spark with these arguments:", - print " ".join(hive_profile_args) - if AMPLAB_JENKINS_BUILD_TOOL == "maven": - exec_maven(hive_profile_args + base_mvn_args) + if build_tool == "maven": + build_spark_maven(hadoop_version) else: - exec_sbt(hive_profile_args + sbt_hive_goals) + build_spark_sbt(hadoop_version) def detect_binary_inop_with_mima(): @@ -308,49 +278,98 @@ def detect_binary_inop_with_mima(): run_cmd(["./dev/mima"]) -def run_scala_tests(test_suite=[]): - """Function to properly execute all tests pass in, as a list, from the - `determine_test_suite` function""" - set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") +def determine_test_modules(test_env): + """This function current acts to determine if SQL tests need to be run in + addition to the core test suite *or* if _only_ SQL tests need to be run + as the git logs show that to be the only thing touched. In the future + this function will act more generically to help further segregate the + test suite runner (hence the function name). 
+ @return a set of unique test names""" + test_suite = list() - # ensure the test_suite is a set - if not isinstance(test_suite, set): - test_suite = set(test_suite) + if test_env == "amplab_jenkins": + target_branch = os.environ.get("ghprbTargetBranch") + run_cmd(['git', 'fetch', 'origin', target_branch+":"+target_branch]) - # if the Spark SQL tests are enabled, run the tests with the Hive profiles - # enabled. - if "SQL" in test_suite: - sbt_maven_profile_args = \ - os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split() - os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \ - " ".join(sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"]) - - # if we only have changes in SQL build a custom test string - if "SQL" in test_suite and "CORE" not in test_suite: - sbt_maven_test_args = ["catalyst/test", - "sql/test", - "hive/test", - "hive-thriftserver/test", - "mllib/test"] + raw_output = subprocess.check_output(['git', 'diff', '--name-only', target_branch]) + # remove any empty strings + changed_files = [f for f in raw_output.split('\n') if f] + + # find any sql files + sql_files = [f for f in changed_files + if any(f.startswith(p) for p in ["sql/", + "bin/spark-sql", + "sbin/start-thriftserver.sh"])] + + non_sql_files = set(changed_files).difference(set(sql_files)) + + if non_sql_files: + test_suite.append("CORE") + if sql_files: + print "[info] Detected changes in SQL. Will run Hive test suite." + test_suite.append("SQL") + if not non_sql_files: + print "[info] Detected no changes except in SQL. Will only run SQL tests." + return set(test_suite) else: - sbt_maven_test_args = ["test"] + # we aren't in the Amplab environment so simply run all tests + test_suite.append("CORE") + test_suite.append("SQL") + return set(test_suite) + + +def run_scala_tests_maven(test_profiles): + mvn_test_goals = ["test", "--fail-at-end"] + profiles_and_goals = test_profiles + mvn_test_goals + + print "[info] Running Spark tests with these arguments:", + print " ".join(profiles_and_goals) + + exec_maven(profiles_and_goals) - # get the latest sbt maven profile arguments - sbt_maven_profile_args = os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split() + +def run_scala_tests_sbt(test_modules, test_profiles): + # if we only have changes in SQL build a custom test list + if "SQL" in test_modules and "CORE" not in test_modules: + sbt_test_goals = ["catalyst/test", + "sql/test", + "hive/test", + "hive-thriftserver/test", + "mllib/test"] + else: + sbt_test_goals = ["test"] + + profiles_and_goals = test_profiles + sbt_test_goals print "[info] Running Spark tests with these arguments:", - print " ".join(sbt_maven_profile_args), - print " ".join(sbt_maven_test_args) + print " ".join(profiles_and_goals) + + exec_sbt(profiles_and_goals) + + +def run_scala_tests(build_tool, hadoop_version, test_modules): + """Function to properly execute all tests passed in as a set from the + `determine_test_suites` function""" + set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") + + test_modules = set(test_modules) - if AMPLAB_JENKINS_BUILD_TOOL == "maven": - exec_maven(["test"] + sbt_maven_profile_args + ["--fail-at-end"]) + # if the Spark SQL tests are enabled, run the tests with the Hive profiles + # enabled. 
+ if "SQL" in test_modules: + test_profiles = get_build_profiles(hadoop_version, hive_profiles=True) + else: + test_profiles = get_build_profiles(hadoop_version) + + if build_tool == "maven": + run_scala_tests_maven(test_profiles) else: - exec_sbt(sbt_maven_profile_args + sbt_maven_test_args) + run_scala_tests_sbt(test_modules, test_profiles) -def run_python_tests(test_suite=[]): +def run_python_tests(): set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") - + # Add path for Python3 in Jenkins if we're calling from a Jenkins machine if AMPLAB_JENKINS: os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" @@ -358,7 +377,7 @@ def run_python_tests(test_suite=[]): run_cmd(["./python/run-tests"]) -def run_sparkr_tests(test_suite=[]): +def run_sparkr_tests(): set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") if which("R"): @@ -367,29 +386,28 @@ def run_sparkr_tests(test_suite=[]): else: print "Ignoring SparkR tests as R was not found in PATH" -if __name__ == "__main__": + +def main(): # Ensure the user home directory (HOME) is valid and is an absolute directory - if not USER_HOME_DIR or not os.path.isabs(USER_HOME_DIR): + if not USER_HOME or not os.path.isabs(USER_HOME): print "[error] Cannot determine your home directory as an absolute path;", print "ensure the $HOME environment variable is set properly." sys.exit(1) - os.chdir(SPARK_PROJ_ROOT) + os.chdir(SPARK_HOME) - rm_r("./work") - rm_r(os.path.join(USER_HOME_DIR, ".ivy2/local/org.apache.spark")) - rm_r(os.path.join(USER_HOME_DIR, ".ivy2/cache/org.apache.spark")) + rm_r(os.path.join(SPARK_HOME, "work")) + rm_r(os.path.join(USER_HOME, ".ivy2/local/org.apache.spark")) + rm_r(os.path.join(USER_HOME, ".ivy2/cache/org.apache.spark")) error_codes = get_error_codes("./dev/run-tests-codes.sh") os.environ["CURRENT_BLOCK"] = error_codes["BLOCK_GENERAL"] - set_sbt_maven_profile_args() - java_exe = determine_java_executable() if not java_exe: - print "[error] Cannot find a version of `java` on the system; please", + print "[error] Cannot find a version of `java` on the system; please", print "install one and retry." sys.exit(2) @@ -398,20 +416,36 @@ def run_sparkr_tests(test_suite=[]): if java_version.minor < 8: print "[warn] Java 8 tests will not run because JDK version is < 1.8." 
- test_suite = determine_test_suite() + if os.environ.get("AMPLAB_JENKINS"): + # if we're on the Amplab Jenkins build servers setup variables + # to reflect the environment settings + build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") + hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") + test_env="amplab_jenkins" + else: + # else we're running locally and can use local settings + build_tool = "sbt" + hadoop_version = "hadoop2.3" + test_env="local" + # license checks run_apache_rat_checks() - - run_scala_style_checks() + # style checks + run_scala_style_checks() run_python_style_checks() - build_apache_spark() + # spark build + build_apache_spark(build_tool, hadoop_version) + # backwards compatibility checks detect_binary_inop_with_mima() - run_scala_tests(test_suite) - + # test suites + test_modules = determine_test_modules(test_env) + run_scala_tests(build_tool, hadoop_version, test_modules) run_python_tests() - run_sparkr_tests() + +if __name__ == "__main__": + main() From 8afbe9319837ede3f41dd250f11aeaaa1d8dacf8 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Fri, 5 Jun 2015 11:25:35 -0700 Subject: [PATCH 29/52] made error codes a global --- dev/run-tests.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 38859da5486f5..acc8db866f8a8 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -27,10 +27,6 @@ SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") USER_HOME = os.environ.get("HOME") -#SBT_MAVEN_PROFILE_ARGS_ENV = "SBT_MAVEN_PROFILES_ARGS" -#AMPLAB_JENKINS_BUILD_TOOL = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") -#AMPLAB_JENKINS = os.environ.get("AMPLAB_JENKINS") - def get_error_codes(err_code_file): """Function to retrieve all block numbers from the `run-tests-codes.sh` @@ -43,6 +39,9 @@ def get_error_codes(err_code_file): return dict(err_codes) +ERROR_CODES = get_error_codes(os.path.join(SPARK_HOME, "dev/run-tests-codes.sh")) + + def exit_from_command_with_retcode(cmd, retcode): print "[error] running", cmd, "; received return code", retcode sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) @@ -135,7 +134,7 @@ def determine_java_version(java_exe): def set_title_and_block(title, err_block): - os.environ["CURRENT_BLOCK"] = error_codes[err_block] + os.environ["CURRENT_BLOCK"] = ERROR_CODES[err_block] line_str = '=' * 72 print @@ -400,9 +399,7 @@ def main(): rm_r(os.path.join(USER_HOME, ".ivy2/local/org.apache.spark")) rm_r(os.path.join(USER_HOME, ".ivy2/cache/org.apache.spark")) - error_codes = get_error_codes("./dev/run-tests-codes.sh") - - os.environ["CURRENT_BLOCK"] = error_codes["BLOCK_GENERAL"] + os.environ["CURRENT_BLOCK"] = ERROR_CODES["BLOCK_GENERAL"] java_exe = determine_java_executable() From 1f607b1a4024721d0775e6010f9589771aa593af Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 9 Jun 2015 16:09:02 -0700 Subject: [PATCH 30/52] finalizing revisions to modular tests --- dev/run-tests.py | 112 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 85 insertions(+), 27 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index acc8db866f8a8..94d2ad1e563bf 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -60,11 +60,14 @@ def rm_r(path): def run_cmd(cmd): """Given a command as a list of arguments will attempt to execute the - command and, on failure, print an error message""" + command from the determined SPARK_HOME directory and, on failure, print + an error message""" if not isinstance(cmd, 
list): cmd = cmd.split() try: + # prepend SPARK_HOME onto the first element of the command + cmd[0] = os.path.join(SPARK_HOME, *filter(lambda x: x, cmd[0].split(os.path.sep))) subprocess.check_call(cmd) except subprocess.CalledProcessError as e: exit_from_command_with_retcode(e.cmd, e.returncode) @@ -194,9 +197,8 @@ def exec_sbt(sbt_args=[]): def get_hadoop_profiles(hadoop_version): - """Return a list of profiles indicating which Hadoop version to use from a Hadoop version tag.""" - - #amplab_jenkins_build_profile = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE") + """Return a list of profiles indicating which Hadoop version to use from + a Hadoop version tag.""" sbt_maven_hadoop_profiles = { "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.0.4"], @@ -224,11 +226,14 @@ def get_build_profiles(hadoop_version="hadoop2.3", base_profiles = ["-Pkinesis-asl"] hive_profiles = ["-Phive", "-Phive-thriftserver"] hadoop_profiles = get_hadoop_profiles(hadoop_version) - + + build_profiles = hadoop_profiles # first, check and add the base profiles - if base_profiles: build_profiles = build_profile + base_profiles + if base_profiles: + build_profiles = build_profiles + base_profiles # second, check and add the hive profiles - if hive_profiles: build_profiles = build_profile + hive_profiles + if hive_profiles: + build_profiles = build_profiles + hive_profiles return build_profiles @@ -238,7 +243,7 @@ def build_spark_maven(hadoop_version): mvn_goals = ["clean", "package", "-DskipTests"] profiles_and_goals = build_profiles + mvn_goals - print "[info] Building Spark (w/Hive 0.13.1) with these arguments:", + print "[info] Building Spark (w/Hive 0.13.1) using Maven with these arguments:", print " ".join(profiles_and_goals) exec_maven(profiles_and_goals) @@ -251,7 +256,7 @@ def build_spark_sbt(hadoop_version): "streaming-kafka-assembly/assembly"] profiles_and_goals = build_profiles + sbt_goals - print "[info] Building Spark (w/Hive 0.13.1) with these arguments:", + print "[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments:", print " ".join(profiles_and_goals) exec_sbt(profiles_and_goals) @@ -296,9 +301,31 @@ def determine_test_modules(test_env): # find any sql files sql_files = [f for f in changed_files - if any(f.startswith(p) for p in ["sql/", - "bin/spark-sql", - "sbin/start-thriftserver.sh"])] + if any(f.startswith(p) for p in + ["sql/", + "bin/spark-sql", + "sbin/start-thriftserver.sh", + "examples/src/main/java/org/apache/spark/examples/sql/", + "examples/src/main/scala/org/apache/spark/examples/sql/"])] + mllib_files = [f for f in changed_files + if any(f.startswith(p) for p in + ["examples/src/main/java/org/apache/spark/examples/mllib/", + "examples/src/main/scala/org/apache/spark/examples/mllib", + "data/mllib/", + "mllib/"])] + streaming_files = [f for f in changed_files + if any(f.startswith(p) for p in + ["examples/scala-2.10/", + "examples/src/main/java/org/apache/spark/examples/streaming/", + "examples/src/main/scala/org/apache/spark/examples/streaming/", + "external/", + "extras/java8-tests/", + "extras/kinesis-asl/", + "streaming/"])] + graphx_files = [f for f in changed_files + if any(f.startswith(p) for p in + ["examples/src/main/scala/org/apache/spark/examples/graphx/", + "graphx/"])] non_sql_files = set(changed_files).difference(set(sql_files)) @@ -309,11 +336,20 @@ def determine_test_modules(test_env): test_suite.append("SQL") if not non_sql_files: print "[info] Detected no changes except in SQL. Will only run SQL tests." 
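# A condensed sketch of the prefix-matching idea this hunk applies: changed
# file paths are bucketed into test modules by their leading directory. The
# file names below are invented examples; the prefixes are a subset of the
# ones listed in the patch, and the CORE / "run everything" fallback handled
# elsewhere in this hunk is omitted.
module_prefixes = {
    "SQL": ["sql/", "bin/spark-sql", "sbin/start-thriftserver.sh"],
    "MLLIB": ["mllib/", "data/mllib/"],
    "STREAMING": ["streaming/", "external/"],
    "GRAPHX": ["graphx/"],
}

changed_files = ["mllib/src/main/scala/Foo.scala", "docs/index.md"]  # invented

detected = set(module
               for f in changed_files
               for module, prefixes in module_prefixes.items()
               if any(f.startswith(p) for p in prefixes))

print detected  # set(['MLLIB'])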
+ if mllib_files: + print "[info] Detected changes in MLlib. Will run MLlib test suite." + test_suite.append("MLLIB") + if streaming_files: + print "[info] Detected changes in Streaming. Will run Streaming test suite." + test_suite.append("STREAMING") + if graphx_files: + print "[info] Detected changes in GraphX. Will run GraphX test suite." + test_suite.append("GRAPHX") + return set(test_suite) else: # we aren't in the Amplab environment so simply run all tests - test_suite.append("CORE") - test_suite.append("SQL") + test_suite.append("ALL") return set(test_suite) @@ -321,26 +357,45 @@ def run_scala_tests_maven(test_profiles): mvn_test_goals = ["test", "--fail-at-end"] profiles_and_goals = test_profiles + mvn_test_goals - print "[info] Running Spark tests with these arguments:", + print "[info] Running Spark tests using Maven with these arguments:", print " ".join(profiles_and_goals) exec_maven(profiles_and_goals) def run_scala_tests_sbt(test_modules, test_profiles): - # if we only have changes in SQL build a custom test list - if "SQL" in test_modules and "CORE" not in test_modules: - sbt_test_goals = ["catalyst/test", - "sql/test", - "hive/test", - "hive-thriftserver/test", - "mllib/test"] - else: + if "ALL" in test_modules: sbt_test_goals = ["test"] + else: + # if we only have changes in SQL build a custom test list + if "SQL" in test_modules and "CORE" not in test_modules: + sbt_test_goals = ["catalyst/test", + "sql/test", + "hive/test", + "hive-thriftserver/test", + "mllib/test", + "examples/test"] + if "MLLIB" in test_modules and "CORE" not in test_modules: + sbt_test_goals = sbt_test_goals + ["mllib/test", + "examples/test"] + if "STREAMING" in test_modules and "CORE" not in test_modules: + sbt_test_goals = sbt_test_goals + ["streaming/test", + "streaming-flume/test", + "streaming-flume-sink/test", + "streaming-kafka/test", + "streaming-mqtt/test", + "streaming-twitter/test", + "streaming-zeromq/test", + "examples/test"] + if "GRAPHX" in test_modules and "CORE" not in test_modules: + sbt_test_goals = sbt_test_goals + ["graphx/test", + "examples/test"] + if not sbt_test_goals: + sbt_test_goals = ["test"] profiles_and_goals = test_profiles + sbt_test_goals - print "[info] Running Spark tests with these arguments:", + print "[info] Running Spark tests using SBT with these arguments:", print " ".join(profiles_and_goals) exec_sbt(profiles_and_goals) @@ -393,7 +448,7 @@ def main(): print "ensure the $HOME environment variable is set properly." 
sys.exit(1) - os.chdir(SPARK_HOME) + #os.chdir(SPARK_HOME) rm_r(os.path.join(SPARK_HOME, "work")) rm_r(os.path.join(USER_HOME, ".ivy2/local/org.apache.spark")) @@ -418,12 +473,15 @@ def main(): # to reflect the environment settings build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") - test_env="amplab_jenkins" + test_env = "amplab_jenkins" else: # else we're running locally and can use local settings build_tool = "sbt" hadoop_version = "hadoop2.3" - test_env="local" + test_env = "local" + + print "[info] Using build tool", build_tool, "with profile", hadoop_version, + print "under environment", test_env # license checks run_apache_rat_checks() From 2fcdfc0a9d3a12a20402321939de37e8efe4145d Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 9 Jun 2015 17:34:42 -0700 Subject: [PATCH 31/52] testing targte branch dump on jenkins --- dev/run-tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 94d2ad1e563bf..c7ab692ca53ad 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -293,7 +293,8 @@ def determine_test_modules(test_env): if test_env == "amplab_jenkins": target_branch = os.environ.get("ghprbTargetBranch") - run_cmd(['git', 'fetch', 'origin', target_branch+":"+target_branch]) + print "target_branch at", target_branch + run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) raw_output = subprocess.check_output(['git', 'diff', '--name-only', target_branch]) # remove any empty strings From db7ae6f74a46b71488ba81249dd5198bf4bd6606 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 9 Jun 2015 18:06:14 -0700 Subject: [PATCH 32/52] reverted SPARK_HOME from start of command --- dev/run-tests.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index c7ab692ca53ad..08e20bddb083e 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -66,8 +66,6 @@ def run_cmd(cmd): if not isinstance(cmd, list): cmd = cmd.split() try: - # prepend SPARK_HOME onto the first element of the command - cmd[0] = os.path.join(SPARK_HOME, *filter(lambda x: x, cmd[0].split(os.path.sep))) subprocess.check_call(cmd) except subprocess.CalledProcessError as e: exit_from_command_with_retcode(e.cmd, e.returncode) @@ -293,7 +291,7 @@ def determine_test_modules(test_env): if test_env == "amplab_jenkins": target_branch = os.environ.get("ghprbTargetBranch") - print "target_branch at", target_branch + run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) raw_output = subprocess.check_output(['git', 'diff', '--name-only', target_branch]) @@ -449,7 +447,7 @@ def main(): print "ensure the $HOME environment variable is set properly." 
sys.exit(1) - #os.chdir(SPARK_HOME) + os.chdir(SPARK_HOME) rm_r(os.path.join(SPARK_HOME, "work")) rm_r(os.path.join(USER_HOME, ".ivy2/local/org.apache.spark")) From eb684b633d0a06f6a290d6c7d3b383b3f9636799 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 10 Jun 2015 06:49:27 -0700 Subject: [PATCH 33/52] fixed sbt_test_goals reference error --- dev/run-tests.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev/run-tests.py b/dev/run-tests.py index 08e20bddb083e..c510620e68705 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -363,6 +363,9 @@ def run_scala_tests_maven(test_profiles): def run_scala_tests_sbt(test_modules, test_profiles): + # declare the variable for reference + sbt_test_goals = None + if "ALL" in test_modules: sbt_test_goals = ["test"] else: From 289871704c29c2e7b712748c19cf1112e9333d88 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 10 Jun 2015 07:15:00 -0700 Subject: [PATCH 34/52] added a change to streaming test to check if it only runs streaming tests --- .../scala/org/apache/spark/streaming/StreamingContext.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 9cd9684d36404..7c895633b3aa3 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - +// TODO: ADDING CHANGE TO TEST run-tests SCRIPT package org.apache.spark.streaming import java.io.{InputStream, NotSerializableException} From 7d2f5e28beb3cc20fe39d1d61443fcdd69fe632b Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 10 Jun 2015 07:17:27 -0700 Subject: [PATCH 35/52] updated python tests to remove unused variable --- dev/run-tests.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index c510620e68705..7c63a7870b437 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -426,10 +426,6 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): def run_python_tests(): set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") - # Add path for Python3 in Jenkins if we're calling from a Jenkins machine - if AMPLAB_JENKINS: - os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" - run_cmd(["./python/run-tests"]) @@ -476,6 +472,8 @@ def main(): build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") test_env = "amplab_jenkins" + # add path for Python3 in Jenkins if we're calling from a Jenkins machine + os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" else: # else we're running locally and can use local settings build_tool = "sbt" From 60b3d51cbf179517a7f78c8e1af97e2f75f853e0 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 10 Jun 2015 07:18:26 -0700 Subject: [PATCH 36/52] prepend rather than append onto PATH --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 7c63a7870b437..cb72eb09f8951 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -473,7 +473,7 @@ def main(): hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") test_env = "amplab_jenkins" # add path for 
Python3 in Jenkins if we're calling from a Jenkins machine - os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin" + os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:"+os.environ.get("PATH") else: # else we're running locally and can use local settings build_tool = "sbt" From 705d12e5c8ec4c541421341329f682fc29214c46 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 10 Jun 2015 09:20:30 -0700 Subject: [PATCH 37/52] changed example to comply with pep3113 supporting python3 --- examples/src/main/python/kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/python/kmeans.py b/examples/src/main/python/kmeans.py index 1456c87312841..f204458d33f72 100755 --- a/examples/src/main/python/kmeans.py +++ b/examples/src/main/python/kmeans.py @@ -68,7 +68,7 @@ def closestPoint(p, centers): closest = data.map( lambda p: (closestPoint(p, kPoints), (p, 1))) pointStats = closest.reduceByKey( - lambda (p1, c1), (p2, c2): (p1 + p2, c1 + c2)) + lambda (p1_c1, p2_c2): (p1_c1[0] + p2_c2[0], p1_c1[1] + p2_c2[1])) newPoints = pointStats.map( lambda st: (st[0], st[1][0] / st[1][1])).collect() From 03fdd7b1d2f5d0f04c85514f841fd5aa183c795d Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 10 Jun 2015 09:26:00 -0700 Subject: [PATCH 38/52] fixed the tuple () wraps around example lambda --- examples/src/main/python/kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/python/kmeans.py b/examples/src/main/python/kmeans.py index f204458d33f72..0ea7cfb7025a0 100755 --- a/examples/src/main/python/kmeans.py +++ b/examples/src/main/python/kmeans.py @@ -68,7 +68,7 @@ def closestPoint(p, centers): closest = data.map( lambda p: (closestPoint(p, kPoints), (p, 1))) pointStats = closest.reduceByKey( - lambda (p1_c1, p2_c2): (p1_c1[0] + p2_c2[0], p1_c1[1] + p2_c2[1])) + lambda p1_c1, p2_c2: (p1_c1[0] + p2_c2[0], p1_c1[1] + p2_c2[1])) newPoints = pointStats.map( lambda st: (st[0], st[1][0] / st[1][1])).collect() From b7c72b9cbae34c71478dca06f02184f6b317f58b Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 10 Jun 2015 11:47:45 -0700 Subject: [PATCH 39/52] reverting streaming context --- .../scala/org/apache/spark/streaming/StreamingContext.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 7c895633b3aa3..9cd9684d36404 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -// TODO: ADDING CHANGE TO TEST run-tests SCRIPT + package org.apache.spark.streaming import java.io.{InputStream, NotSerializableException} From ec1ae789bef6a308db3cfd1b5d609c8cf02f19d9 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Sat, 13 Jun 2015 22:47:33 -0700 Subject: [PATCH 40/52] minor name changes, bug fixes --- dev/run-tests.py | 100 ++++++++++++++++++++++------------------------- 1 file changed, 46 insertions(+), 54 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index cb72eb09f8951..15c35886fdf5f 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -205,19 +205,17 @@ def get_hadoop_profiles(hadoop_version): "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], } - try: - hadoop_profiles = sbt_maven_hadoop_profiles[hadoop_version] - except KeyError: + if hadoop_version in sbt_maven_hadoop_profiles: + return sbt_maven_hadoop_profiles[hadoop_version] + else: print "[error] Could not find", hadoop_version, "in the list. Valid options", - print "are 'hadoop1.0', 'hadoop2.0', 'hadoop2.2', and 'hadoop2.3'." + print "are", sbt_maven_hadoop_profiles.keys() sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - return hadoop_profiles - def get_build_profiles(hadoop_version="hadoop2.3", - base_profiles=True, - hive_profiles=False): + enable_base_profiles=True, + enable_hive_profiles=False): """Returns a list of hadoop profiles to be used as looked up from the passed in hadoop profile key with the option of adding on the base and hive profiles.""" @@ -226,18 +224,19 @@ def get_build_profiles(hadoop_version="hadoop2.3", hadoop_profiles = get_hadoop_profiles(hadoop_version) build_profiles = hadoop_profiles - # first, check and add the base profiles - if base_profiles: + + if enable_base_profiles: build_profiles = build_profiles + base_profiles - # second, check and add the hive profiles - if hive_profiles: + + if enable_hive_profiles: build_profiles = build_profiles + hive_profiles return build_profiles def build_spark_maven(hadoop_version): - build_profiles = get_build_profiles(hadoop_version, hive_profiles=True) + # we always build with Hive support even if we skip Hive tests in most builds + build_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=True) mvn_goals = ["clean", "package", "-DskipTests"] profiles_and_goals = build_profiles + mvn_goals @@ -248,7 +247,7 @@ def build_spark_maven(hadoop_version): def build_spark_sbt(hadoop_version): - build_profiles = get_build_profiles(hadoop_version, hive_profiles=True) + build_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=True) sbt_goals = ["package", "assembly/assembly", "streaming-kafka-assembly/assembly"] @@ -275,22 +274,20 @@ def build_apache_spark(build_tool, hadoop_version): def detect_binary_inop_with_mima(): - set_title_and_block("Detecting binary incompatibilities with MiMa", - "BLOCK_MIMA") + set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA") run_cmd(["./dev/mima"]) -def determine_test_modules(test_env): - """This function current acts to determine if SQL tests need to be run in - addition to the core test suite *or* if _only_ SQL tests need to be run - as the git logs show that to be the only thing touched. In the future - this function will act more generically to help further segregate the - test suite runner (hence the function name). 
- @return a set of unique test names""" - test_suite = list() +def identify_changed_modules(test_env): + """Given the passed in environment will determine the changed modules and + return them as a set. If the environment is local, will simply run all tests. + If run under the `amplab_jenkins` environment will determine the changed files + as compared to the `ghprbTargetBranch` and execute the necessary set of tests + to provide coverage for the changed code.""" + test_suite = set() if test_env == "amplab_jenkins": - target_branch = os.environ.get("ghprbTargetBranch") + target_branch = os.environ["ghprbTargetBranch"] run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) @@ -329,27 +326,27 @@ def determine_test_modules(test_env): non_sql_files = set(changed_files).difference(set(sql_files)) if non_sql_files: - test_suite.append("CORE") + test_suite.add("CORE") if sql_files: print "[info] Detected changes in SQL. Will run Hive test suite." - test_suite.append("SQL") + test_suite.add("SQL") if not non_sql_files: print "[info] Detected no changes except in SQL. Will only run SQL tests." if mllib_files: print "[info] Detected changes in MLlib. Will run MLlib test suite." - test_suite.append("MLLIB") + test_suite.add("MLLIB") if streaming_files: print "[info] Detected changes in Streaming. Will run Streaming test suite." - test_suite.append("STREAMING") + test_suite.add("STREAMING") if graphx_files: print "[info] Detected changes in GraphX. Will run GraphX test suite." - test_suite.append("GRAPHX") + test_suite.add("GRAPHX") - return set(test_suite) + return test_suite else: # we aren't in the Amplab environment so simply run all tests - test_suite.append("ALL") - return set(test_suite) + test_suite.add("ALL") + return test_suite def run_scala_tests_maven(test_profiles): @@ -369,7 +366,8 @@ def run_scala_tests_sbt(test_modules, test_profiles): if "ALL" in test_modules: sbt_test_goals = ["test"] else: - # if we only have changes in SQL build a custom test list + # if we only have changes in SQL, MLlib, Streaming, or GraphX then build + # a custom test list if "SQL" in test_modules and "CORE" not in test_modules: sbt_test_goals = ["catalyst/test", "sql/test", @@ -378,20 +376,18 @@ def run_scala_tests_sbt(test_modules, test_profiles): "mllib/test", "examples/test"] if "MLLIB" in test_modules and "CORE" not in test_modules: - sbt_test_goals = sbt_test_goals + ["mllib/test", - "examples/test"] + sbt_test_goals += ["mllib/test", "examples/test"] if "STREAMING" in test_modules and "CORE" not in test_modules: - sbt_test_goals = sbt_test_goals + ["streaming/test", - "streaming-flume/test", - "streaming-flume-sink/test", - "streaming-kafka/test", - "streaming-mqtt/test", - "streaming-twitter/test", - "streaming-zeromq/test", - "examples/test"] + sbt_test_goals += ["streaming/test", + "streaming-flume/test", + "streaming-flume-sink/test", + "streaming-kafka/test", + "streaming-mqtt/test", + "streaming-twitter/test", + "streaming-zeromq/test", + "examples/test"] if "GRAPHX" in test_modules and "CORE" not in test_modules: - sbt_test_goals = sbt_test_goals + ["graphx/test", - "examples/test"] + sbt_test_goals += ["graphx/test", "examples/test"] if not sbt_test_goals: sbt_test_goals = ["test"] @@ -410,12 +406,8 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): test_modules = set(test_modules) - # if the Spark SQL tests are enabled, run the tests with the Hive profiles - # enabled. 
- if "SQL" in test_modules: - test_profiles = get_build_profiles(hadoop_version, hive_profiles=True) - else: - test_profiles = get_build_profiles(hadoop_version) + hive_profiles = ("SQL" in test_modules) + test_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=hive_profiles) if build_tool == "maven": run_scala_tests_maven(test_profiles) @@ -473,7 +465,7 @@ def main(): hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") test_env = "amplab_jenkins" # add path for Python3 in Jenkins if we're calling from a Jenkins machine - os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:"+os.environ.get("PATH") + os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH") else: # else we're running locally and can use local settings build_tool = "sbt" @@ -497,7 +489,7 @@ def main(): detect_binary_inop_with_mima() # test suites - test_modules = determine_test_modules(test_env) + test_modules = identify_changed_modules(test_env) run_scala_tests(build_tool, hadoop_version, test_modules) run_python_tests() run_sparkr_tests() From aa03d9e1adb43d927cb3d00520936b90340427d9 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 15 Jun 2015 11:28:14 -0700 Subject: [PATCH 41/52] added documentation builds as a top level test component, altered high level project changes to properly execute core tests only when necessary, changed variable names for simplicity --- dev/run-tests-codes.sh | 11 ++++--- dev/run-tests-jenkins | 2 ++ dev/run-tests.py | 71 ++++++++++++++++++++++++++++-------------- docs/configuration.md | 1 + 4 files changed, 56 insertions(+), 29 deletions(-) diff --git a/dev/run-tests-codes.sh b/dev/run-tests-codes.sh index 154e01255b2ef..f4b238e1b78a7 100644 --- a/dev/run-tests-codes.sh +++ b/dev/run-tests-codes.sh @@ -21,8 +21,9 @@ readonly BLOCK_GENERAL=10 readonly BLOCK_RAT=11 readonly BLOCK_SCALA_STYLE=12 readonly BLOCK_PYTHON_STYLE=13 -readonly BLOCK_BUILD=14 -readonly BLOCK_MIMA=15 -readonly BLOCK_SPARK_UNIT_TESTS=16 -readonly BLOCK_PYSPARK_UNIT_TESTS=17 -readonly BLOCK_SPARKR_UNIT_TESTS=18 +readonly BLOCK_DOCUMENTATION=14 +readonly BLOCK_BUILD=15 +readonly BLOCK_MIMA=16 +readonly BLOCK_SPARK_UNIT_TESTS=17 +readonly BLOCK_PYSPARK_UNIT_TESTS=18 +readonly BLOCK_SPARKR_UNIT_TESTS=19 diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins index 641b0ff3c4be4..c4d39d95d5890 100755 --- a/dev/run-tests-jenkins +++ b/dev/run-tests-jenkins @@ -210,6 +210,8 @@ done failing_test="Scala style tests" elif [ "$test_result" -eq "$BLOCK_PYTHON_STYLE" ]; then failing_test="Python style tests" + elif [ "$test_result" -eq "$BLOCK_DOCUMENTATION" ]; then + failing_test="to generate documentation" elif [ "$test_result" -eq "$BLOCK_BUILD" ]; then failing_test="to build" elif [ "$test_result" -eq "$BLOCK_MIMA" ]; then diff --git a/dev/run-tests.py b/dev/run-tests.py index 15c35886fdf5f..f01c24404b2a7 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -159,6 +159,11 @@ def run_python_style_checks(): run_cmd(["./dev/lint-python"]) +def build_spark_documentation(): + set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION") + os.environ["PRODUCTION"] = "1 jekyll build" + + def exec_maven(mvn_args=[]): """Will call Maven in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" @@ -215,21 +220,26 @@ def get_hadoop_profiles(hadoop_version): def get_build_profiles(hadoop_version="hadoop2.3", enable_base_profiles=True, - enable_hive_profiles=False): + enable_hive_profiles=False, + 
enable_doc_profiles=False): """Returns a list of hadoop profiles to be used as looked up from the passed in hadoop profile key with the option of adding on the base and hive profiles.""" base_profiles = ["-Pkinesis-asl"] hive_profiles = ["-Phive", "-Phive-thriftserver"] + doc_profiles = [] hadoop_profiles = get_hadoop_profiles(hadoop_version) build_profiles = hadoop_profiles if enable_base_profiles: - build_profiles = build_profiles + base_profiles + build_profiles += base_profiles if enable_hive_profiles: - build_profiles = build_profiles + hive_profiles + build_profiles += hive_profiles + + if enable_doc_profiles: + build_profiles += doc_profiles return build_profiles @@ -259,7 +269,7 @@ def build_spark_sbt(hadoop_version): exec_sbt(profiles_and_goals) -def build_apache_spark(build_tool, hadoop_version): +def build_apache_spark(build_tool, hadoop_version, changed_modules): """Will build Spark against Hive v0.13.1 given the passed in build tool (either `sbt` or `maven`). Defaults to using `sbt`.""" @@ -284,7 +294,7 @@ def identify_changed_modules(test_env): If run under the `amplab_jenkins` environment will determine the changed files as compared to the `ghprbTargetBranch` and execute the necessary set of tests to provide coverage for the changed code.""" - test_suite = set() + changed_modules = set() if test_env == "amplab_jenkins": target_branch = os.environ["ghprbTargetBranch"] @@ -295,7 +305,6 @@ def identify_changed_modules(test_env): # remove any empty strings changed_files = [f for f in raw_output.split('\n') if f] - # find any sql files sql_files = [f for f in changed_files if any(f.startswith(p) for p in ["sql/", @@ -322,31 +331,39 @@ def identify_changed_modules(test_env): if any(f.startswith(p) for p in ["examples/src/main/scala/org/apache/spark/examples/graphx/", "graphx/"])] - - non_sql_files = set(changed_files).difference(set(sql_files)) - - if non_sql_files: - test_suite.add("CORE") + doc_files = [f for f in changed_files if f.startswith("docs/")] + + # union together all changed top level project files + top_level_project_files = set().union([set(f) for f in [sql_files, + mllib_files, + streaming_files, + graphx_files, + doc_files]]) + changed_core_files = set(changed_files).difference(top_level_project_files) + + if changed_core_files: + changed_modules.add("CORE") if sql_files: print "[info] Detected changes in SQL. Will run Hive test suite." - test_suite.add("SQL") - if not non_sql_files: - print "[info] Detected no changes except in SQL. Will only run SQL tests." + changed_modules.add("SQL") if mllib_files: print "[info] Detected changes in MLlib. Will run MLlib test suite." - test_suite.add("MLLIB") + changed_modules.add("MLLIB") if streaming_files: print "[info] Detected changes in Streaming. Will run Streaming test suite." - test_suite.add("STREAMING") + changed_modules.add("STREAMING") if graphx_files: print "[info] Detected changes in GraphX. Will run GraphX test suite." - test_suite.add("GRAPHX") + changed_modules.add("GRAPHX") + if doc_files: + print "[info] Detected changes in documentation. Will build spark with documentation." 
+ changed_modules.add("DOCS") - return test_suite + return changed_modules else: # we aren't in the Amplab environment so simply run all tests - test_suite.add("ALL") - return test_suite + changed_modules.add("ALL") + return changed_modules def run_scala_tests_maven(test_profiles): @@ -482,15 +499,21 @@ def main(): run_scala_style_checks() run_python_style_checks() + # determine high level changes + changed_modules = identify_changed_modules(test_env) + + # determine if docs were changed and if we're inside the amplab environment + if "DOCS" in changed_modules and test_env == "amplab_jenkins": + build_spark_documentation() + # spark build - build_apache_spark(build_tool, hadoop_version) + build_apache_spark(build_tool, hadoop_version, changed_modules) # backwards compatibility checks detect_binary_inop_with_mima() - # test suites - test_modules = identify_changed_modules(test_env) - run_scala_tests(build_tool, hadoop_version, test_modules) + # run the test suites + run_scala_tests(build_tool, hadoop_version, changed_modules) run_python_tests() run_sparkr_tests() diff --git a/docs/configuration.md b/docs/configuration.md index 3960e7e78bde1..492c10f45fd89 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -8,6 +8,7 @@ title: Configuration Spark provides three locations to configure the system: +* ADDING DOC CHANGE FOR TESTING * [Spark properties](#spark-properties) control most application parameters and can be set by using a [SparkConf](api/scala/index.html#org.apache.spark.SparkConf) object, or through Java system properties. From 03798339522832042e1d282c9c0aead0abca0d65 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 15 Jun 2015 11:36:24 -0700 Subject: [PATCH 42/52] minor doc addition to print the changed modules --- dev/run-tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/run-tests.py b/dev/run-tests.py index f01c24404b2a7..84fa3ae83d185 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -501,6 +501,7 @@ def main(): # determine high level changes changed_modules = identify_changed_modules(test_env) + print "[info] Found the following changed modules:", ", ".join(changed_modules) # determine if docs were changed and if we're inside the amplab environment if "DOCS" in changed_modules and test_env == "amplab_jenkins": From fb85a41acf66c69d5fd12052560bc9e0d135e03a Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 15 Jun 2015 11:42:30 -0700 Subject: [PATCH 43/52] fixed minor set bug --- dev/run-tests.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 84fa3ae83d185..35289e5f6091f 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -334,11 +334,11 @@ def identify_changed_modules(test_env): doc_files = [f for f in changed_files if f.startswith("docs/")] # union together all changed top level project files - top_level_project_files = set().union([set(f) for f in [sql_files, - mllib_files, - streaming_files, - graphx_files, - doc_files]]) + top_level_project_files = set().union(set(f) for f in [sql_files, + mllib_files, + streaming_files, + graphx_files, + doc_files]) changed_core_files = set(changed_files).difference(top_level_project_files) if changed_core_files: From c42cf9a8d75af7e3277ca3ddca0cb9dce857fa91 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Mon, 15 Jun 2015 13:27:08 -0700 Subject: [PATCH 44/52] unpack set operations with splat (*) --- dev/run-tests.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dev/run-tests.py 
b/dev/run-tests.py index 35289e5f6091f..7edf2b0913cd7 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -334,11 +334,11 @@ def identify_changed_modules(test_env): doc_files = [f for f in changed_files if f.startswith("docs/")] # union together all changed top level project files - top_level_project_files = set().union(set(f) for f in [sql_files, - mllib_files, - streaming_files, - graphx_files, - doc_files]) + top_level_project_files = set().union(*[set(f) for f in [sql_files, + mllib_files, + streaming_files, + graphx_files, + doc_files]]) changed_core_files = set(changed_files).difference(top_level_project_files) if changed_core_files: From 767a668c4b066af98b1f2ec6b2871c79f06289d8 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 09:11:21 -0700 Subject: [PATCH 45/52] fixed path joining issues, ensured docs actually build on doc changes --- dev/run-tests.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 7edf2b0913cd7..1a091dc72efb3 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -107,7 +107,7 @@ def determine_java_executable(): java_home = os.environ.get("JAVA_HOME") # check if there is an executable at $JAVA_HOME/bin/java - java_exe = which(os.path.join(java_home, "bin/java")) + java_exe = which(os.path.join(java_home, "bin", "java")) # if the java_exe wasn't set, check for a `java` version on the $PATH return java_exe if java_exe else which("java") @@ -146,36 +146,42 @@ def set_title_and_block(title, err_block): def run_apache_rat_checks(): set_title_and_block("Running Apache RAT checks", "BLOCK_RAT") - run_cmd(["./dev/check-license"]) + run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")]) def run_scala_style_checks(): set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE") - run_cmd(["./dev/lint-scala"]) + run_cmd([os.path.join(SPARK_HOME, "dev", "lint-scala")]) def run_python_style_checks(): set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE") - run_cmd(["./dev/lint-python"]) + run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")]) def build_spark_documentation(): set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION") os.environ["PRODUCTION"] = "1 jekyll build" + + os.chdir(os.path.join(SPARK_HOME, "docs")) + + run_cmd(["jekyll", "build"]) + + os.chdir(SPARK_HOME) def exec_maven(mvn_args=[]): """Will call Maven in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" - run_cmd(["./build/mvn"] + mvn_args) + run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + mvn_args) def exec_sbt(sbt_args=[]): """Will call SBT in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" - sbt_cmd = ["./build/sbt"] + sbt_args + sbt_cmd = [os.path.join(SPARK_HOME, "build", "sbt")] + sbt_args sbt_output_filter = re.compile("^.*[info].*Resolving" + "|" + "^.*[warn].*Merging" + "|" + @@ -285,7 +291,7 @@ def build_apache_spark(build_tool, hadoop_version, changed_modules): def detect_binary_inop_with_mima(): set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA") - run_cmd(["./dev/mima"]) + run_cmd([os.path.join(SPARK_HOME, "dev", "mima")]) def identify_changed_modules(test_env): @@ -435,15 +441,15 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): def run_python_tests(): set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") - 
run_cmd(["./python/run-tests"]) + run_cmd([os.path.join(SPARK_HOME, "python", "run-tests")]) def run_sparkr_tests(): set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") if which("R"): - run_cmd(["./R/install-dev.sh"]) - run_cmd(["./R/run-tests.sh"]) + run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) + run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")]) else: print "Ignoring SparkR tests as R was not found in PATH" @@ -458,8 +464,8 @@ def main(): os.chdir(SPARK_HOME) rm_r(os.path.join(SPARK_HOME, "work")) - rm_r(os.path.join(USER_HOME, ".ivy2/local/org.apache.spark")) - rm_r(os.path.join(USER_HOME, ".ivy2/cache/org.apache.spark")) + rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark")) + rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark")) os.environ["CURRENT_BLOCK"] = ERROR_CODES["BLOCK_GENERAL"] From 2dff136a6ed4c960e02af117add70e39da783037 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 09:15:02 -0700 Subject: [PATCH 46/52] fixed pep8 whitespace errors --- dev/run-tests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 1a091dc72efb3..97539fd78e5e8 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -162,9 +162,9 @@ def run_python_style_checks(): def build_spark_documentation(): set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION") os.environ["PRODUCTION"] = "1 jekyll build" - - os.chdir(os.path.join(SPARK_HOME, "docs")) - + + os.chdir(os.path.join(SPARK_HOME, "docs")) + run_cmd(["jekyll", "build"]) os.chdir(SPARK_HOME) From 22edb7807d77222df2b6017cc4fb9cd7621757e9 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 09:29:09 -0700 Subject: [PATCH 47/52] add check if jekyll isn't installed on the path --- dev/run-tests.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 97539fd78e5e8..554486a105d90 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -165,7 +165,13 @@ def build_spark_documentation(): os.chdir(os.path.join(SPARK_HOME, "docs")) - run_cmd(["jekyll", "build"]) + jekyll_bin = which("jekyll") + + if not jekyll_bin: + print "[warn] Cannot find a version of `jekyll` on the system; please", + print "install one and retry to build documentation." 
+ else: + run_cmd([jekyll_bin, "build"]) os.chdir(SPARK_HOME) @@ -498,6 +504,10 @@ def main(): print "[info] Using build tool", build_tool, "with profile", hadoop_version, print "under environment", test_env + # determine high level changes + changed_modules = identify_changed_modules(test_env) + print "[info] Found the following changed modules:", ", ".join(changed_modules) + # license checks run_apache_rat_checks() @@ -505,10 +515,6 @@ def main(): run_scala_style_checks() run_python_style_checks() - # determine high level changes - changed_modules = identify_changed_modules(test_env) - print "[info] Found the following changed modules:", ", ".join(changed_modules) - # determine if docs were changed and if we're inside the amplab environment if "DOCS" in changed_modules and test_env == "amplab_jenkins": build_spark_documentation() From 05d435b6386e8ed4ac6f61c5a0713c157b5451d3 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 09:40:56 -0700 Subject: [PATCH 48/52] added check for jekyll install --- dev/run-tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 554486a105d90..2a86812012eb9 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -168,8 +168,9 @@ def build_spark_documentation(): jekyll_bin = which("jekyll") if not jekyll_bin: - print "[warn] Cannot find a version of `jekyll` on the system; please", + print "[error] Cannot find a version of `jekyll` on the system; please", print "install one and retry to build documentation." + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) else: run_cmd([jekyll_bin, "build"]) From 8135518d12d5ade154eb39b88b39ea7c63ac8252 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 10:50:58 -0700 Subject: [PATCH 49/52] removed the test check for documentation changes until jenkins can get updated --- dev/run-tests.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 2a86812012eb9..f8a48bddd5263 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -517,8 +517,9 @@ def main(): run_python_style_checks() # determine if docs were changed and if we're inside the amplab environment - if "DOCS" in changed_modules and test_env == "amplab_jenkins": - build_spark_documentation() + # note - the below commented out until *all* Jenkins workers can get `jekyll` installed + # if "DOCS" in changed_modules and test_env == "amplab_jenkins": + # build_spark_documentation() # spark build build_apache_spark(build_tool, hadoop_version, changed_modules) From f9fbe549165cca5c605dbb7d7b361891a407b7b1 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 10:52:25 -0700 Subject: [PATCH 50/52] reverted doc test change --- docs/configuration.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/configuration.md b/docs/configuration.md index 492c10f45fd89..3960e7e78bde1 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -8,7 +8,6 @@ title: Configuration Spark provides three locations to configure the system: -* ADDING DOC CHANGE FOR TESTING * [Spark properties](#spark-properties) control most application parameters and can be set by using a [SparkConf](api/scala/index.html#org.apache.spark.SparkConf) object, or through Java system properties. 
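[editor's note] A side note on the set-union fix in the "fixed minor set bug" and "unpack set operations with splat (*)" patches above: `set().union(...)` treats each positional argument as an iterable of elements to add, so handing it a single list (or generator) whose elements are sets makes it try to insert whole sets as elements and fail with `TypeError: unhashable type: 'set'`. The splat unpacks the list so each per-module set arrives as its own argument. Below is a minimal, standalone sketch; the file lists are hypothetical stand-ins for the ones derived from `git diff --name-only` in identify_changed_modules().

# Hypothetical per-module file lists, standing in for the ones computed
# from `git diff --name-only` in identify_changed_modules().
sql_files = ["sql/core/pom.xml", "bin/spark-sql"]
mllib_files = ["mllib/pom.xml"]
doc_files = ["docs/configuration.md"]

# Broken form (patches 41 and 43): union() receives a single argument whose
# elements are sets, and adding a set into a set raises
# TypeError: unhashable type: 'set'.
#   set().union([set(f) for f in [sql_files, mllib_files, doc_files]])

# Fixed form (patch 44): the * unpacks the list, so union() receives one
# iterable of filenames per module and merges their elements.
top_level_project_files = set().union(*[set(f) for f in [sql_files,
                                                         mllib_files,
                                                         doc_files]])

assert top_level_project_files == set(["sql/core/pom.xml", "bin/spark-sql",
                                       "mllib/pom.xml", "docs/configuration.md"])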
From 3922a85d68a4936a7d2b125b1850f87553a7e537 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 14:36:10 -0700 Subject: [PATCH 51/52] removed necessary passed in variable --- dev/run-tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index f8a48bddd5263..b09cfa3867be6 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -282,7 +282,7 @@ def build_spark_sbt(hadoop_version): exec_sbt(profiles_and_goals) -def build_apache_spark(build_tool, hadoop_version, changed_modules): +def build_apache_spark(build_tool, hadoop_version): """Will build Spark against Hive v0.13.1 given the passed in build tool (either `sbt` or `maven`). Defaults to using `sbt`.""" @@ -522,7 +522,7 @@ def main(): # build_spark_documentation() # spark build - build_apache_spark(build_tool, hadoop_version, changed_modules) + build_apache_spark(build_tool, hadoop_version) # backwards compatibility checks detect_binary_inop_with_mima() From 154ed739026af964ab38e564abdf91124a9acf96 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 16 Jun 2015 14:53:25 -0700 Subject: [PATCH 52/52] updated finding java binary if JAVA_HOME not set --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index b09cfa3867be6..04a7b45741963 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -107,7 +107,7 @@ def determine_java_executable(): java_home = os.environ.get("JAVA_HOME") # check if there is an executable at $JAVA_HOME/bin/java - java_exe = which(os.path.join(java_home, "bin", "java")) + java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None # if the java_exe wasn't set, check for a `java` version on the $PATH return java_exe if java_exe else which("java")
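[editor's note] For reference, below is a minimal sketch of the java-detection logic as it stands after the final patch above. The body of determine_java_executable() matches the patched hunk; the which() helper is a hypothetical stand-in for the one defined earlier in dev/run-tests.py, whose definition is not part of these hunks.

import os

def which(program):
    """Hypothetical stand-in for the helper in dev/run-tests.py: return the
    path to `program` if it is an executable file or can be found on $PATH,
    otherwise return None."""
    if os.path.isfile(program) and os.access(program, os.X_OK):
        return program
    for path in os.environ.get("PATH", "").split(os.pathsep):
        candidate = os.path.join(path, program)
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None

def determine_java_executable():
    """Mirrors the post-patch logic: prefer $JAVA_HOME/bin/java when JAVA_HOME
    is set, otherwise fall back to whatever `java` is on the $PATH."""
    java_home = os.environ.get("JAVA_HOME")
    # The `if java_home else None` guard avoids os.path.join(None, ...)
    # raising a TypeError when JAVA_HOME is unset, which is the bug the
    # final patch fixes.
    java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None
    return java_exe if java_exe else which("java")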