From 284dda6c410b297c03c4b72e901cacea649b4264 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 20 Oct 2014 11:30:47 -0700 Subject: [PATCH 01/22] Rework the "hadoop-provided" profile, add new ones. The "hadoop-provided" profile should only apply during packaging, since, for example, "spark-core" should still have a compile-time dependency on hadoop since it exposes hadoop types in its API. So reorganize the dependencies a bit so that the scopes are overridden in the packaging targets. Also, a lot of the dependencies packaged in the examples/ assembly are already provided by the main assembly, so clean those up. Also, add similar profiles for hive, parquet, flume and hbase (the last two just used by the examples/ code, although the flume one could also potentially be used by user's poms when packaging the flume backend). This change also includes a fix to parameterize the hbase artifact, since the structure of the dependencies have changed along the 0.9x line. It also cleans some unneeded dependencies in a few poms. --- assembly/pom.xml | 24 +++ bagel/pom.xml | 4 - examples/pom.xml | 309 +++++++++++++++++++--------------- external/flume-sink/pom.xml | 22 --- external/flume/pom.xml | 15 +- external/zeromq/pom.xml | 1 - graphx/pom.xml | 4 - mllib/pom.xml | 6 +- pom.xml | 242 ++++++++++++++++++-------- repl/pom.xml | 4 - sql/core/pom.xml | 2 - sql/hive-thriftserver/pom.xml | 6 +- sql/hive/pom.xml | 31 +--- streaming/pom.xml | 14 +- 14 files changed, 386 insertions(+), 298 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index c65192bde64c6..488340c7def7a 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -364,5 +364,29 @@ + + + + hadoop-provided + + provided + + + + + + hive-provided + + provided + + + + + + parquet-provided + + provided + + diff --git a/bagel/pom.xml b/bagel/pom.xml index 93db0d5efda5f..12138f9c3c27b 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -40,10 +40,6 @@ spark-core_${scala.binary.version} ${project.version} - - org.eclipse.jetty - jetty-server - org.scalatest scalatest_${scala.binary.version} diff --git a/examples/pom.xml b/examples/pom.xml index 85e133779e465..39285719c4b33 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -98,121 +98,123 @@ ${project.version} - org.eclipse.jetty - jetty-server + org.apache.hbase + hbase-testing-util + ${hbase.version} + ${hbase.deps.scope} + + + org.jruby + jruby-complete + + + + + org.apache.hbase + hbase-protocol + ${hbase.version} + ${hbase.deps.scope} + + + org.apache.hbase + hbase-common + ${hbase.version} + ${hbase.deps.scope} + + + org.apache.hbase + hbase-client + ${hbase.version} + ${hbase.deps.scope} + + + io.netty + netty + + + + + org.apache.hbase + hbase-server + ${hbase.version} + ${hbase.deps.scope} + + + org.apache.hadoop + hadoop-core + + + org.apache.hadoop + hadoop-client + + + org.apache.hadoop + hadoop-mapreduce-client-jobclient + + + org.apache.hadoop + hadoop-mapreduce-client-core + + + org.apache.hadoop + hadoop-auth + + + org.apache.hadoop + hadoop-annotations + + + org.apache.hadoop + hadoop-hdfs + + + org.apache.hbase + hbase-hadoop1-compat + + + org.apache.commons + commons-math + + + com.sun.jersey + jersey-core + + + org.slf4j + slf4j-api + + + com.sun.jersey + jersey-server + + + com.sun.jersey + jersey-core + + + com.sun.jersey + jersey-json + + + + commons-io + commons-io + + + + + org.apache.hbase + hbase-hadoop-compat + ${hbase.version} + ${hbase.deps.scope} + + + org.apache.hbase + hbase-hadoop-compat + ${hbase.version} + test-jar + test - - org.apache.hbase - 
hbase-testing-util - ${hbase.version} - - - org.jruby - jruby-complete - - - - - org.apache.hbase - hbase-protocol - ${hbase.version} - - - org.apache.hbase - hbase-common - ${hbase.version} - - - org.apache.hbase - hbase-client - ${hbase.version} - - - io.netty - netty - - - - - org.apache.hbase - hbase-server - ${hbase.version} - - - org.apache.hadoop - hadoop-core - - - org.apache.hadoop - hadoop-client - - - org.apache.hadoop - hadoop-mapreduce-client-jobclient - - - org.apache.hadoop - hadoop-mapreduce-client-core - - - org.apache.hadoop - hadoop-auth - - - org.apache.hadoop - hadoop-annotations - - - org.apache.hadoop - hadoop-hdfs - - - org.apache.hbase - hbase-hadoop1-compat - - - org.apache.commons - commons-math - - - com.sun.jersey - jersey-core - - - org.slf4j - slf4j-api - - - com.sun.jersey - jersey-server - - - com.sun.jersey - jersey-core - - - com.sun.jersey - jersey-json - - - - commons-io - commons-io - - - - - org.apache.hbase - hbase-hadoop-compat - ${hbase.version} - - - org.apache.hbase - hbase-hadoop-compat - ${hbase.version} - test-jar - test - org.apache.commons commons-math3 @@ -291,31 +293,6 @@ org.apache.maven.plugins maven-shade-plugin - - false - ${project.build.directory}/scala-${scala.binary.version}/spark-examples-${project.version}-hadoop${hadoop.version}.jar - - - *:* - - - - - com.google.guava:guava - - com/google/common/base/Optional* - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - package @@ -323,6 +300,34 @@ shade + false + ${project.build.directory}/scala-${scala.binary.version}/spark-examples-${project.version}-hadoop${hadoop.version}.jar + + + *:* + + + + + com.google.guava:guava + + + ** + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + com.google @@ -432,5 +437,31 @@ + + + + hadoop-provided + + provided + + + + hive-provided + + provided + + + + parquet-provided + + provided + + + + hbase-provided + + provided + + diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index ac291bd4fde20..3ccfe58546fec 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -38,32 +38,10 @@ org.apache.flume flume-ng-sdk - ${flume.version} - - - io.netty - netty - - - org.apache.thrift - libthrift - - org.apache.flume flume-ng-core - ${flume.version} - - - io.netty - netty - - - org.apache.thrift - libthrift - - org.scalatest diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 7d31e32283d88..13364829132a5 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -52,20 +52,13 @@ test-jar test + + org.apache.flume + flume-ng-core + org.apache.flume flume-ng-sdk - ${flume.version} - - - io.netty - netty - - - org.apache.thrift - libthrift - - org.scalatest diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index 7e48968feb3bc..48d8eeaab141b 100644 --- a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -50,7 +50,6 @@ ${akka.group} akka-zeromq_${scala.binary.version} - ${akka.version} org.scalatest diff --git a/graphx/pom.xml b/graphx/pom.xml index 3f49b1d63b6e1..78b46c03812b0 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -45,10 +45,6 @@ jblas ${jblas.version} - - org.eclipse.jetty - jetty-server - org.scalatest scalatest_${scala.binary.version} diff --git a/mllib/pom.xml b/mllib/pom.xml index dd68b27a78bdc..0057687b1f4b5 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -29,7 +29,7 @@ spark-mllib_2.10 mllib - + jar Spark Project ML Library http://spark.apache.org/ @@ -50,10 +50,6 @@ spark-sql_${scala.binary.version} ${project.version} 
- - org.eclipse.jetty - jetty-server - org.jblas jblas diff --git a/pom.xml b/pom.xml index cc7bce175778f..01e45635e6af9 100644 --- a/pom.xml +++ b/pom.xml @@ -123,8 +123,10 @@ 2.4.1 ${hadoop.version} 0.94.6 + hbase 1.4.0 3.4.5 + org.spark-project.hive 0.13.1a @@ -143,12 +145,29 @@ 4.2.6 3.1.1 ${project.build.directory}/spark-test-classpath.txt - 64m - 512m 2.10.4 2.10 ${scala.version} org.scala-lang + 1.8.8 + + + compile + compile + compile + compile + compile + + 64m + 512m + 512m @@ -267,21 +286,20 @@ - - + org.spark-project.spark unused 1.0.0 - org.codehaus.groovy @@ -383,11 +401,13 @@ org.slf4j slf4j-api ${slf4j.version} + ${hadoop.deps.scope} org.slf4j slf4j-log4j12 ${slf4j.version} + ${hadoop.deps.scope} org.slf4j @@ -404,6 +424,7 @@ log4j log4j ${log4j.version} + ${hadoop.deps.scope} com.ning @@ -441,6 +462,7 @@ com.google.protobuf protobuf-java ${protobuf.version} + ${hadoop.deps.scope} ${akka.group} @@ -462,6 +484,17 @@ akka-testkit_${scala.binary.version} ${akka.version} + + ${akka.group} + akka-zeromq_${scala.binary.version} + ${akka.version} + + + ${akka.group} + akka-actor_${scala.binary.version} + + + org.apache.mesos mesos @@ -591,6 +624,7 @@ org.apache.curator curator-recipes 2.4.0 + ${hadoop.deps.scope} org.jboss.netty @@ -602,6 +636,7 @@ org.apache.hadoop hadoop-client ${hadoop.version} + ${hadoop.deps.scope} asm @@ -637,11 +672,13 @@ org.apache.avro avro ${avro.version} + ${hadoop.deps.scope} org.apache.avro avro-ipc ${avro.version} + ${hadoop.deps.scope} io.netty @@ -670,6 +707,7 @@ avro-mapred ${avro.version} ${avro.mapred.classifier} + ${hive.deps.scope} io.netty @@ -698,6 +736,7 @@ net.java.dev.jets3t jets3t ${jets3t.version} + ${hadoop.deps.scope} commons-logging @@ -709,6 +748,7 @@ org.apache.hadoop hadoop-yarn-api ${yarn.version} + ${hadoop.deps.scope} javax.servlet @@ -736,6 +776,7 @@ org.apache.hadoop hadoop-yarn-common ${yarn.version} + ${hadoop.deps.scope} asm @@ -792,6 +833,7 @@ org.apache.hadoop hadoop-yarn-server-web-proxy ${yarn.version} + ${hadoop.deps.scope} asm @@ -819,6 +861,7 @@ org.apache.hadoop hadoop-yarn-client ${yarn.version} + ${hadoop.deps.scope} asm @@ -842,11 +885,104 @@ + + org.apache.zookeeper + zookeeper + ${zookeeper.version} + ${hadoop.deps.scope} + org.codehaus.jackson jackson-mapper-asl - 1.8.8 + ${codehaus.jackson.version} + ${hadoop.deps.scope} + + + ${hive.group} + hive-exec + ${hive.version} + ${hive.deps.scope} + + + commons-logging + commons-logging + + + com.esotericsoftware.kryo + kryo + + + + + ${hive.group} + hive-metastore + ${hive.version} + ${hive.deps.scope} + + + ${hive.group} + hive-serde + ${hive.version} + ${hive.deps.scope} + + + commons-logging + commons-logging + + + commons-logging + commons-logging-api + + + + + com.twitter + parquet-column + ${parquet.version} + ${parquet.deps.scope} + + + com.twitter + parquet-hadoop + ${parquet.version} + ${parquet.deps.scope} + + + org.apache.flume + flume-ng-core + ${flume.version} + ${flume.deps.scope} + + + io.netty + netty + + + org.apache.thrift + libthrift + + + org.mortbay.jetty + servlet-api + + + + + org.apache.flume + flume-ng-sdk + ${flume.version} + ${flume.deps.scope} + + + io.netty + netty + + + org.apache.thrift + libthrift + + @@ -923,6 +1059,7 @@ -Xmx1024m -XX:PermSize=${PermGen} -XX:MaxPermSize=${MaxPermGen} + -XX:ReservedCodeCacheSize=${CodeCacheSize} -source @@ -970,7 +1107,7 @@ ${project.build.directory}/surefire-reports . 
SparkTestSuite.txt - -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m + -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=${CodeCacheSize} true @@ -1000,11 +1137,6 @@ maven-antrun-plugin 1.7 - - org.apache.maven.plugins - maven-shade-plugin - 2.2 - org.apache.maven.plugins maven-source-plugin @@ -1085,6 +1217,7 @@ org.apache.maven.plugins maven-shade-plugin + 2.2 false @@ -1333,55 +1466,9 @@ - - hadoop-provided - - - org.apache.hadoop - hadoop-client - provided - - - org.apache.hadoop - hadoop-yarn-api - provided - - - org.apache.hadoop - hadoop-yarn-common - provided - - - org.apache.hadoop - hadoop-yarn-server-web-proxy - provided - - - org.apache.hadoop - hadoop-yarn-client - provided - - - org.apache.avro - avro - provided - - - org.apache.avro - avro-ipc - provided - - - org.apache.zookeeper - zookeeper - ${zookeeper.version} - provided - - - - - hive-thriftserver + hive-0.12.0 + sql/hive-thriftserver @@ -1432,5 +1519,28 @@ + + + flume-provided + + + hadoop-provided + + + hbase-provided + + + hive + + + hive-provided + + + parquet-provided + diff --git a/repl/pom.xml b/repl/pom.xml index c2bf9fdfbcce7..4cba605bdb954 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -68,10 +68,6 @@ ${project.version} test - - org.eclipse.jetty - jetty-server - org.scala-lang scala-compiler diff --git a/sql/core/pom.xml b/sql/core/pom.xml index bd110218d34f7..a3bcc18fe540a 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -56,12 +56,10 @@ com.twitter parquet-column - ${parquet.version} com.twitter parquet-hadoop - ${parquet.version} com.fasterxml.jackson.core diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 8db3010624100..5ba82487d9c99 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -42,17 +42,17 @@ ${project.version} - org.spark-project.hive + ${hive.group} hive-cli ${hive.version} - org.spark-project.hive + ${hive.group} hive-jdbc ${hive.version} - org.spark-project.hive + ${hive.group} hive-beeline ${hive.version} diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index fa9a1e64b0f80..54c6ca95b012d 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -47,9 +47,8 @@ ${project.version} - org.spark-project.hive + ${hive.group} hive-metastore - ${hive.version} commons-httpclient @@ -57,51 +56,27 @@ 3.1 - org.spark-project.hive + ${hive.group} hive-exec - ${hive.version} - - - commons-logging - commons-logging - - - com.esotericsoftware.kryo - kryo - - org.codehaus.jackson jackson-mapper-asl - org.spark-project.hive + ${hive.group} hive-serde - ${hive.version} - - - commons-logging - commons-logging - - - commons-logging - commons-logging-api - - org.apache.avro avro - ${avro.version} org.apache.avro avro-mapred - ${avro.version} ${avro.mapred.classifier} diff --git a/streaming/pom.xml b/streaming/pom.xml index 12f900c91eb98..c5cf5f0fedee5 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -40,10 +40,6 @@ spark-core_${scala.binary.version} ${project.version} - - org.eclipse.jetty - jetty-server - org.scala-lang scala-library @@ -77,14 +73,14 @@ org.scalatest scalatest-maven-plugin - - From 1adf91c401890d6a93d3950d98f951db11304cb3 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 21 Oct 2014 09:59:44 -0700 Subject: [PATCH 02/22] Re-enable maven-install-plugin for a few projects. Without this, running specific targets directly (e.g. mvn -f assembly/pom.xml) doesn't work. 
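For illustration, the intended flow is something like the following (hypothetical commands, not part of the change; flags like -DskipTests are just for brevity):

    # Install all modules into the local Maven repository first...
    mvn -DskipTests install
    # ...then build a single module directly against the installed artifacts.
    mvn -f assembly/pom.xml -DskipTests package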
--- repl/pom.xml | 7 ------- yarn/pom.xml | 7 ------- 2 files changed, 14 deletions(-)
diff --git a/repl/pom.xml b/repl/pom.xml index 4cba605bdb954..9a95ff50dc25f 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -104,13 +104,6 @@ true - - org.apache.maven.plugins - maven-install-plugin - - true - - org.scalatest scalatest-maven-plugin
diff --git a/yarn/pom.xml b/yarn/pom.xml index 2885e6607ec24..4541906d3622a 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -117,13 +117,6 @@ true - - org.apache.maven.plugins - maven-install-plugin - - true - - org.codehaus.mojo build-helper-maven-plugin
From 2f95f0dcbbae415ff043fe9bbf3245e2a4787d3b Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 23 Oct 2014 13:19:41 -0700 Subject: [PATCH 03/22] Propagate classpath to child processes during testing.
When spawning child processes that use the Spark assembly jar in unit tests, the child processes need all of the classes required to run Spark. If the assembly is built using the "*-provided" profiles, some classes will not be part of the assembly, although they'll be part of the unit test's class path since maven/sbt will make the dependencies available. So this change extends the unit test's class path to the child processes so that all classes are available.
I also parameterized the "spark.test.home" setting so that you can do things like "mvn -f core/pom.xml test" and have it work (as long as you set it to a proper value; unfortunately maven makes this super painful to do automatically, because of things like MNG-5522).
--- bin/compute-classpath.cmd | 13 +++++++++++- bin/compute-classpath.sh | 4 ++++ .../cluster/SparkDeploySchedulerBackend.scala | 21 ++++++++++++------- .../scala/org/apache/spark/util/Utils.scala | 8 +++++++ .../scala/org/apache/spark/DriverSuite.scala | 2 +- pom.xml | 8 ++++++- 6 files changed, 46 insertions(+), 10 deletions(-)
diff --git a/bin/compute-classpath.cmd b/bin/compute-classpath.cmd index a4c099fb45b14..86a7028cd7a33 100644 --- a/bin/compute-classpath.cmd +++ b/bin/compute-classpath.cmd @@ -95,7 +95,7 @@ set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%sql\hive\target\scala-%SCALA_ if "x%SPARK_TESTING%"=="x1" ( rem Add test classes to path - note, add SPARK_CLASSES and SPARK_TEST_CLASSES before CLASSPATH rem so that local compilation takes precedence over assembled jar - set CLASSPATH=%SPARK_CLASSES%;%SPARK_TEST_CLASSES%;%CLASSPATH% + set CLASSPATH=%SPARK_CLASSES%;%SPARK_TEST_CLASSES%;%CLASSPATH%;%SPARK_TEST_PARENT_CLASS_PATH% ) rem Add hadoop conf dir - else FileSystem.*, etc fail rem Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts rem the configurtion files. if "x%HADOOP_CONF_DIR%"=="x" goto no_hadoop_conf_dir set CLASSPATH=%CLASSPATH%;%HADOOP_CONF_DIR% :no_hadoop_conf_dir if "x%YARN_CONF_DIR%"=="x" goto no_yarn_conf_dir set CLASSPATH=%CLASSPATH%;%YARN_CONF_DIR% :no_yarn_conf_dir rem Add hadoop conf dir - else FileSystem.*, etc fail rem Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts rem the configurtion files.
+if "x%HADOOP_CONF_DIR%"=="x" goto no_hadoop_conf_dir + set CLASSPATH=%CLASSPATH%;%HADOOP_CONF_DIR% +:no_hadoop_conf_dir + +if "x%YARN_CONF_DIR%"=="x" goto no_yarn_conf_dir + set CLASSPATH=%CLASSPATH%;%YARN_CONF_DIR% +:no_yarn_conf_dir + rem A bit of a hack to allow calling this script within run2.cmd without seeing output if "%DONT_PRINT_CLASSPATH%"=="1" goto exit diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh index 298641f2684de..b537fca803fee 100755 --- a/bin/compute-classpath.sh +++ b/bin/compute-classpath.sh @@ -130,6 +130,10 @@ if [[ $SPARK_TESTING == 1 ]]; then CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SPARK_SCALA_VERSION/test-classes" CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SPARK_SCALA_VERSION/test-classes" CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/test-classes" + # Append the parent class path if requested by the test code. + if [ -n "$SPARK_TEST_PARENT_CLASS_PATH" ]; then + CLASSPATH="$CLASSPATH:$SPARK_TEST_PARENT_CLASS_PATH" + fi fi # Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail ! diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala index 8c7de75600b5f..7eb87a564d6f5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala @@ -55,19 +55,26 @@ private[spark] class SparkDeploySchedulerBackend( "{{WORKER_URL}}") val extraJavaOpts = sc.conf.getOption("spark.executor.extraJavaOptions") .map(Utils.splitCommandString).getOrElse(Seq.empty) - val classPathEntries = sc.conf.getOption("spark.executor.extraClassPath").toSeq.flatMap { cp => - cp.split(java.io.File.pathSeparator) - } - val libraryPathEntries = - sc.conf.getOption("spark.executor.extraLibraryPath").toSeq.flatMap { cp => - cp.split(java.io.File.pathSeparator) + val classPathEntries = sc.conf.getOption("spark.executor.extraClassPath") + .map(_.split(java.io.File.pathSeparator).toSeq).getOrElse(Nil) + val libraryPathEntries = sc.conf.getOption("spark.executor.extraLibraryPath") + .map(_.split(java.io.File.pathSeparator).toSeq).getOrElse(Nil) + + // When testing, expose the parent class path to the child. This is processed by + // compute-classpath.{cmd,sh} and makes all needed jars available to child processes + // when the assembly is built with the "*-provided" profiles enabled. 
+ val testingClassPath = + if (sys.props.contains("spark.testing")) { + sys.props("java.class.path").split(java.io.File.pathSeparator).toSeq + } else { + Nil } // Start executors with a few necessary configs for registering with the scheduler val sparkJavaOpts = Utils.sparkJavaOpts(conf, SparkConf.isExecutorStartupConf) val javaOpts = sparkJavaOpts ++ extraJavaOpts val command = Command("org.apache.spark.executor.CoarseGrainedExecutorBackend", - args, sc.executorEnvs, classPathEntries, libraryPathEntries, javaOpts) + args, sc.executorEnvs, classPathEntries ++ testingClassPath, libraryPathEntries, javaOpts) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") val appDesc = new ApplicationDescription(sc.appName, maxCores, sc.executorMemory, command, appUIAddress, sc.eventLogDir) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index eb4a598dbf857..95c3e76a7d620 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -935,6 +935,14 @@ private[spark] object Utils extends Logging { for ((key, value) <- extraEnvironment) { environment.put(key, value) } + + // When testing, expose the parent class path to the child. This is processed by + // compute-classpath.{cmd,sh} and makes all needed jars available to child processes + // when the assembly is built with the "*-provided" profiles enabled. + if (sys.props.contains("spark.testing")) { + environment.put("SPARK_TEST_PARENT_CLASS_PATH", sys.props("java.class.path")) + } + val process = builder.start() new Thread("read stderr for " + command(0)) { override def run() { diff --git a/core/src/test/scala/org/apache/spark/DriverSuite.scala b/core/src/test/scala/org/apache/spark/DriverSuite.scala index 5265ba904032f..89ce7a18ecef5 100644 --- a/core/src/test/scala/org/apache/spark/DriverSuite.scala +++ b/core/src/test/scala/org/apache/spark/DriverSuite.scala @@ -35,7 +35,7 @@ class DriverSuite extends FunSuite with Timeouts { forAll(masters) { (master: String) => failAfter(60 seconds) { Utils.executeAndGetOutput( - Seq("./bin/spark-class", "org.apache.spark.DriverWithoutCleanup", master), + Seq(s"$sparkHome/bin/spark-class", "org.apache.spark.DriverWithoutCleanup", master), new File(sparkHome), Map("SPARK_TESTING" -> "1", "SPARK_HOME" -> sparkHome)) } diff --git a/pom.xml b/pom.xml index 01e45635e6af9..206411affeda5 100644 --- a/pom.xml +++ b/pom.xml @@ -165,6 +165,12 @@ compile compile + + ${session.executionRootDirectory} + 64m 512m 512m @@ -1111,7 +1117,7 @@ true - ${session.executionRootDirectory} + ${spark.test.home} 1 false ${test_classpath} From 417d90e887dcfd21b36edf8cc864f5451340604c Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 23 Oct 2014 15:04:11 -0700 Subject: [PATCH 04/22] Introduce "SPARK_DIST_CLASSPATH". This env variable is processed by compute-classpath.sh and appended to the generated classpath; it allows distributions that ship with reduced assemblies (e.g. those built with the "hadoop-provided" profile) to set it to add any needed libraries to the classpath when running Spark. 
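As a sketch of the intended usage (assuming a distribution where "hadoop classpath" prints the location of the Hadoop jars; the exact command is distribution-specific):

    # Let a "hadoop-provided" Spark build pick up the distribution's Hadoop jars.
    export SPARK_DIST_CLASSPATH=$(hadoop classpath)
    ./bin/spark-shell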
--- bin/compute-classpath.cmd | 8 ++++++++ bin/compute-classpath.sh | 7 +++++++ 2 files changed, 15 insertions(+)
diff --git a/bin/compute-classpath.cmd b/bin/compute-classpath.cmd index 86a7028cd7a33..0360d7e3fc0ec 100644 --- a/bin/compute-classpath.cmd +++ b/bin/compute-classpath.cmd @@ -1,3 +1,4 @@ +<<<<<<< HEAD @echo off rem @@ -120,6 +121,13 @@ if "x%YARN_CONF_DIR%"=="x" goto no_yarn_conf_dir set CLASSPATH=%CLASSPATH%;%YARN_CONF_DIR% :no_yarn_conf_dir +rem To allow for distributions to append needed libraries to the classpath (e.g. when +rem using the "hadoop-provided" profile to build Spark), check SPARK_DIST_CLASSPATH and +rem append it to the final classpath. +if not "x%SPARK_DIST_CLASSPATH%"=="x" ( + set CLASSPATH=%CLASSPATH%;%SPARK_DIST_CLASSPATH% ) rem A bit of a hack to allow calling this script within run2.cmd without seeing output if "%DONT_PRINT_CLASSPATH%"=="1" goto exit
diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh index b537fca803fee..8e7c675978c12 100755 --- a/bin/compute-classpath.sh +++ b/bin/compute-classpath.sh @@ -146,4 +146,11 @@ if [ -n "$YARN_CONF_DIR" ]; then CLASSPATH="$CLASSPATH:$YARN_CONF_DIR" fi +# To allow for distributions to append needed libraries to the classpath (e.g. when +# using the "hadoop-provided" profile to build Spark), check SPARK_DIST_CLASSPATH and +# append it to the final classpath. +if [ -n "$SPARK_DIST_CLASSPATH" ]; then + CLASSPATH="$CLASSPATH:$SPARK_DIST_CLASSPATH" +fi + echo "$CLASSPATH"
From 4d674696000be874d2593f98bcf9d32367b93536 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 27 Oct 2014 13:55:45 -0700 Subject: [PATCH 05/22] Propagate SPARK_DIST_CLASSPATH on Yarn.
Yarn builds the classpath based on the Hadoop configuration, which may miss things in cases where non-Hadoop classes are needed (for example, when Spark is built with "-Phive-provided" and the user is running code that uses HiveContext). So propagate the distribution's classpath variable so that the extra classpath is automatically added to all containers.
--- .../apache/spark/deploy/yarn/ClientBase.scala | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala index f95d72379171c..503548e5f8cd4 100644 --- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala +++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala @@ -17,6 +17,7 @@ package org.apache.spark.deploy.yarn +import java.io.File import java.net.{InetAddress, UnknownHostException, URI, URISyntaxException} import scala.collection.JavaConversions._ @@ -292,6 +293,10 @@ private[spark] trait ClientBase extends Logging { } } + sys.env.get(ENV_DIST_CLASSPATH).foreach { dcp => + env(ENV_DIST_CLASSPATH) = dcp + } + env } @@ -555,6 +560,9 @@ private[spark] object ClientBase extends Logging { // of the executors val CONF_SPARK_YARN_SECONDARY_JARS = "spark.yarn.secondary.jars" + // Distribution-defined classpath to add to processes + val ENV_DIST_CLASSPATH = "SPARK_DIST_CLASSPATH" + // Staging directory is private! -> rwx-------- val STAGING_DIR_PERMISSION: FsPermission = FsPermission.createImmutable(Integer.parseInt("700", 8).toShort) @@ -595,7 +603,8 @@ private[spark] object ClientBase extends Logging { * classpath specified through the Hadoop and Yarn configurations.
*/ def populateHadoopClasspath(conf: Configuration, env: HashMap[String, String]): Unit = { - val classPathElementsToAdd = getYarnAppClasspath(conf) ++ getMRAppClasspath(conf) + val classPathElementsToAdd = getYarnAppClasspath(conf) ++ getMRAppClasspath(conf) ++ + getDistributionClasspath() for (c <- classPathElementsToAdd.flatten) { YarnSparkHadoopUtil.addPathToEnvironment(env, Environment.CLASSPATH.name, c.trim) } @@ -613,6 +622,13 @@ private[spark] object ClientBase extends Logging { case None => getDefaultMRApplicationClasspath } + /** + * Propagate the distribution's classpath to containers too, since they may contain libraries + * that are not part of the Yarn/MR application classpaths handled above. + */ + private def getDistributionClasspath(): Option[Seq[String]] = + sys.env.get(ENV_DIST_CLASSPATH).map(_.split(File.pathSeparator).toSeq) + def getDefaultYarnApplicationClasspath: Option[Seq[String]] = { val triedDefault = Try[Seq[String]] { val field = classOf[YarnConfiguration].getField("DEFAULT_YARN_APPLICATION_CLASSPATH") From d928d62f8dc30ab0ddcba824bcd1e5c5705150a3 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 28 Oct 2014 11:57:44 -0700 Subject: [PATCH 06/22] Redirect child stderr to parent's log. Instead of writing to System.err directly. That way the console is not polluted when running child processes. Also remove an unused env variable that caused a warning when running Spark jobs in child processes. --- core/src/main/scala/org/apache/spark/util/Utils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 95c3e76a7d620..18c326c3316d3 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -947,7 +947,7 @@ private[spark] object Utils extends Logging { new Thread("read stderr for " + command(0)) { override def run() { for (line <- Source.fromInputStream(process.getErrorStream).getLines()) { - System.err.println(line) + logInfo(s"CHILD STDERR: $line") } } }.start() From 9e4e001a9b3fc00b9ca3cd7459f977b882b98cf3 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 3 Nov 2014 16:50:06 -0800 Subject: [PATCH 07/22] Remove duplicate hive profile. --- pom.xml | 3 --- 1 file changed, 3 deletions(-) diff --git a/pom.xml b/pom.xml index 206411affeda5..e6e05f513eace 100644 --- a/pom.xml +++ b/pom.xml @@ -1539,9 +1539,6 @@ hbase-provided - - hive - hive-provided From f7b3bbe8b5eaa30027d37840e805e3a88debbb59 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 4 Nov 2014 12:16:53 -0800 Subject: [PATCH 08/22] Add snappy to hadoop-provided list. It's a dependency of avro. --- pom.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index e6e05f513eace..5e64b83a6d36a 100644 --- a/pom.xml +++ b/pom.xml @@ -150,6 +150,7 @@ ${scala.version} org.scala-lang 1.8.8 + 1.1.1.6 - - flume-provided - hadoop-provided From d1399eda1dd7912a0d4e6e743793f16d636a0e99 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 12 Nov 2014 12:03:04 -0800 Subject: [PATCH 12/22] Restore jetty dependency. Streaming only needs the servlet API, but jetty pulls in its own repackaged servlet api jar for whatever reason, so instead of adding more jetty cruft to the build, just use the coarse dependency. 
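To double-check what the coarse dependency pulls in, the standard Maven dependency plugin can be used (illustrative invocation):

    # List the jetty artifacts that end up on the streaming module's class path.
    mvn -pl streaming dependency:tree -Dincludes='org.eclipse.jetty*'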
--- streaming/pom.xml | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/streaming/pom.xml b/streaming/pom.xml index c5cf5f0fedee5..a133aef2bd297 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -40,6 +40,10 @@ spark-core_${scala.binary.version} ${project.version} + + org.eclipse.jetty + jetty-server + org.scala-lang scala-library
From 1be73d4574d58da9f2c31d5e9338bcaada2f29d2 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 14 Nov 2014 16:41:54 -0800 Subject: [PATCH 13/22] Restore flume-provided profile.
It's actually useful for the examples jar.
--- examples/pom.xml | 18 ++++++++++++------ pom.xml | 5 ++++- 2 files changed, 16 insertions(+), 7 deletions(-)
diff --git a/examples/pom.xml b/examples/pom.xml index 39285719c4b33..982c17eeb9a5b 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -439,6 +439,12 @@ + + flume-provided + + provided + + hadoop-provided @@ -446,21 +452,21 @@ - hive-provided + hbase-provided - provided + provided - parquet-provided + hive-provided - provided + provided - hbase-provided + parquet-provided - provided + provided
diff --git a/pom.xml b/pom.xml index f48710332dd56..fe9b627ec1c24 100644 --- a/pom.xml +++ b/pom.xml @@ -160,11 +160,11 @@ during compilation if the dependency is transitive (e.g. "bagel/" depending on "core/" and needing Hadoop classes in the classpath to compile). --> - compile compile compile compile compile + compile + + flume-provided + hadoop-provided
From 7820d5843cc13d03942855533c6025a425422ea7 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 17 Nov 2014 12:53:26 -0800 Subject: [PATCH 14/22] Fix CliSuite with provided profiles.
--- .../org/apache/spark/sql/hive/thriftserver/CliSuite.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala index e8ffbc5b954d4..0d441c9626c24 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala @@ -69,8 +69,11 @@ class CliSuite extends FunSuite with BeforeAndAfterAll with Logging { } } + // Propagate the current class path to the child to support *-provided profiles. + val extraEnv = Seq("SPARK_TEST_PARENT_CLASS_PATH" -> sys.props("java.class.path")) + // Searching expected output line from both stdout and stderr of the CLI process - val process = (Process(command) #< queryStream).run( + val process = (Process(command, None, extraEnv:_*) #< queryStream).run( ProcessLogger(captureOutput("stdout"), captureOutput("stderr"))) try {
From e3ab2dab25552e368b9be23fac3cb38206f98488 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 17 Nov 2014 13:53:03 -0800 Subject: [PATCH 15/22] Fix hive-thriftserver profile.
--- pom.xml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml index fe9b627ec1c24..6ae5527f8fa46 100644 --- a/pom.xml +++ b/pom.xml @@ -1493,8 +1493,7 @@ - hive-0.12.0 - + hive-thriftserver sql/hive-thriftserver
From 115fde526c6e51fef92d63020962264e8ba056d3 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 17 Nov 2014 14:12:21 -0800 Subject: [PATCH 16/22] Simplify a comment (and make it consistent with another pom).
--- assembly/pom.xml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index 488340c7def7a..bb8910b25d4fe 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -365,23 +365,19 @@ - + hadoop-provided provided - - hive-provided provided - - parquet-provided From 96405036fd5f570973ad6dd29ccaa45b93e44f4c Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 17 Nov 2014 14:30:29 -0800 Subject: [PATCH 17/22] Cleanup child process log message. --- core/src/main/scala/org/apache/spark/util/Utils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 18c326c3316d3..6e55ae31aa3ce 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -947,7 +947,7 @@ private[spark] object Utils extends Logging { new Thread("read stderr for " + command(0)) { override def run() { for (line <- Source.fromInputStream(process.getErrorStream).getLines()) { - logInfo(s"CHILD STDERR: $line") + logInfo(line) } } }.start() From 322f882ce3de83f0a47a357f8209d08874c4d1d1 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 20 Nov 2014 15:51:21 -0800 Subject: [PATCH 18/22] Fix merge fail. --- bin/compute-classpath.cmd | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/compute-classpath.cmd b/bin/compute-classpath.cmd index d1f88376a4502..be989da6b0508 100644 --- a/bin/compute-classpath.cmd +++ b/bin/compute-classpath.cmd @@ -1,4 +1,3 @@ -<<<<<<< HEAD @echo off rem From 371ebee9fc9f4b54481455ec954288c7d97fcffc Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 6 Jan 2015 15:13:32 -0800 Subject: [PATCH 19/22] Review feedback. - use spark-submit options where it's easy - reuse SPARK_DIST_CLASSPATH for tests --- bin/compute-classpath.cmd | 17 ----------------- bin/compute-classpath.sh | 6 ------ .../scala/org/apache/spark/util/Utils.scala | 4 ++-- pom.xml | 2 +- .../spark/sql/hive/thriftserver/CliSuite.scala | 6 ++---- .../thriftserver/HiveThriftServer2Suite.scala | 7 +++---- 6 files changed, 8 insertions(+), 34 deletions(-) diff --git a/bin/compute-classpath.cmd b/bin/compute-classpath.cmd index be989da6b0508..088f993954d9e 100644 --- a/bin/compute-classpath.cmd +++ b/bin/compute-classpath.cmd @@ -98,23 +98,6 @@ if "x%SPARK_TESTING%"=="x1" ( set CLASSPATH=%SPARK_CLASSES%;%SPARK_TEST_CLASSES%;%CLASSPATH% ) -rem Append the parent class path if requested by the test code. Note this is outside of -rem the check for SPARK_TESTING because some tests reset that variable. -if not "x%SPARK_TEST_PARENT_CLASS_PATH%"=="x" ( - set CLASSPATH=%CLASSPATH%;%SPARK_TEST_PARENT_CLASS_PATH% -) - -rem Add hadoop conf dir - else FileSystem.*, etc fail -rem Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts -rem the configurtion files. -if "x%HADOOP_CONF_DIR%"=="x" goto no_hadoop_conf_dir - set CLASSPATH=%CLASSPATH%;%HADOOP_CONF_DIR% -:no_hadoop_conf_dir - -if "x%YARN_CONF_DIR%"=="x" goto no_yarn_conf_dir - set CLASSPATH=%CLASSPATH%;%YARN_CONF_DIR% -:no_yarn_conf_dir - rem Add hadoop conf dir - else FileSystem.*, etc fail rem Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts rem the configurtion files. 
diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh index 85ca3a520e473..8f3b396ffd086 100755 --- a/bin/compute-classpath.sh +++ b/bin/compute-classpath.sh @@ -136,12 +136,6 @@ if [[ $SPARK_TESTING == 1 ]]; then CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/test-classes" fi -# Append the parent class path if requested by the test code. Note this is outside of -# the check for SPARK_TESTING because some tests reset that variable. -if [ -n "$SPARK_TEST_PARENT_CLASS_PATH" ]; then - CLASSPATH="$CLASSPATH:$SPARK_TEST_PARENT_CLASS_PATH" -fi - # Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail ! # Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts # the configurtion files. diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 48ea784ccc6c8..a9776a2e00056 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -995,7 +995,7 @@ private[spark] object Utils extends Logging { // compute-classpath.{cmd,sh} and makes all needed jars available to child processes // when the assembly is built with the "*-provided" profiles enabled. if (sys.props.contains("spark.testing")) { - environment.put("SPARK_TEST_PARENT_CLASS_PATH", sys.props("java.class.path")) + environment.put("SPARK_DIST_CLASSPATH", sys.props("java.class.path")) } val process = builder.start() @@ -1097,7 +1097,7 @@ private[spark] object Utils extends Logging { var firstUserLine = 0 var insideSpark = true var callStack = new ArrayBuffer[String]() :+ "" - + Thread.currentThread.getStackTrace().foreach { ste: StackTraceElement => // When running under some profilers, the current stack trace might contain some bogus // frames. This is intended to ensure that we don't crash in these situations by diff --git a/pom.xml b/pom.xml index 47d8ad2d6ec42..96e8bd896dba2 100644 --- a/pom.xml +++ b/pom.xml @@ -887,7 +887,7 @@ org.codehaus.jackson jackson-core-asl - ${jackson.version} + ${codehaus.jackson.version} ${hadoop.deps.scope} diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala index 0d441c9626c24..60953576d0e37 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala @@ -48,6 +48,7 @@ class CliSuite extends FunSuite with BeforeAndAfterAll with Logging { | --master local | --hiveconf ${ConfVars.METASTORECONNECTURLKEY}=$jdbcUrl | --hiveconf ${ConfVars.METASTOREWAREHOUSE}=$warehousePath + | --driver-class-path ${sys.props("java.class.path")} """.stripMargin.split("\\s+").toSeq ++ extraArgs } @@ -69,11 +70,8 @@ class CliSuite extends FunSuite with BeforeAndAfterAll with Logging { } } - // Propagate the current class path to the child to support *-provided profiles. 
- val extraEnv = Seq("SPARK_TEST_PARENT_CLASS_PATH" -> sys.props("java.class.path")) - // Searching expected output line from both stdout and stderr of the CLI process - val process = (Process(command, None, extraEnv:_*) #< queryStream).run( + val process = (Process(command, None) #< queryStream).run( ProcessLogger(captureOutput("stdout"), captureOutput("stderr"))) try { diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala index 83889ab597ad0..7814aa38f4146 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala @@ -142,6 +142,7 @@ class HiveThriftServer2Suite extends FunSuite with Logging { | --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_BIND_HOST}=localhost | --hiveconf ${ConfVars.HIVE_SERVER2_TRANSPORT_MODE}=http | --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_HTTP_PORT}=$port + | --driver-class-path ${sys.props("java.class.path")} """.stripMargin.split("\\s+").toSeq } else { s"""$startScript @@ -151,6 +152,7 @@ class HiveThriftServer2Suite extends FunSuite with Logging { | --hiveconf ${ConfVars.METASTOREWAREHOUSE}=$warehousePath | --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_BIND_HOST}=localhost | --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_PORT}=$port + | --driver-class-path ${sys.props("java.class.path")} """.stripMargin.split("\\s+").toSeq } @@ -181,10 +183,7 @@ class HiveThriftServer2Suite extends FunSuite with Logging { val env = Seq( // Resets SPARK_TESTING to avoid loading Log4J configurations in testing class paths - "SPARK_TESTING" -> "0", - // Allows the child process to inherit the parent's class path so the server works when - // *-provided profiles are used. - "SPARK_TEST_PARENT_CLASS_PATH" -> sys.props("java.class.path")) + "SPARK_TESTING" -> "0") Process(command, None, env: _*).run(ProcessLogger( captureThriftServerOutput("stdout"), From 9ef79a3257ef54848491bf90b68a0ad887a78199 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 8 Jan 2015 12:29:10 -0800 Subject: [PATCH 20/22] Alternative way to propagate test classpath to child processes. --- core/src/main/scala/org/apache/spark/util/Utils.scala | 7 ------- pom.xml | 4 +++- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index a9776a2e00056..7740a3635f20f 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -991,13 +991,6 @@ private[spark] object Utils extends Logging { environment.put(key, value) } - // When testing, expose the parent class path to the child. This is processed by - // compute-classpath.{cmd,sh} and makes all needed jars available to child processes - // when the assembly is built with the "*-provided" profiles enabled. 
- if (sys.props.contains("spark.testing")) { - environment.put("SPARK_DIST_CLASSPATH", sys.props("java.class.path")) - } - val process = builder.start() new Thread("read stderr for " + command(0)) { override def run() { diff --git a/pom.xml b/pom.xml index 96e8bd896dba2..505c312af7c1b 100644 --- a/pom.xml +++ b/pom.xml @@ -1144,13 +1144,15 @@ SparkTestSuite.txt -ea -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=${CodeCacheSize} + + ${test_classpath} + true ${spark.test.home} 1 false false - ${test_classpath} true From eb228c0d6f6b311dfb0655f9e701f1b83cb2aaa1 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 8 Jan 2015 12:47:48 -0800 Subject: [PATCH 21/22] Fix borked merge. --- yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index c31dc54ec0c4e..c363d755c1752 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -656,6 +656,9 @@ object Client extends Logging { val APP_FILE_PERMISSION: FsPermission = FsPermission.createImmutable(Integer.parseInt("644", 8).toShort) + // Distribution-defined classpath to add to processes + val ENV_DIST_CLASSPATH = "SPARK_DIST_CLASSPATH" + /** * Find the user-defined Spark jar if configured, or return the jar containing this * class if not. From 82eb688f44d2df63a7b7ff311e5d40970f67fc43 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 8 Jan 2015 15:22:48 -0800 Subject: [PATCH 22/22] Add a comment. --- pom.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pom.xml b/pom.xml index 505c312af7c1b..703e5c47bf59b 100644 --- a/pom.xml +++ b/pom.xml @@ -1145,6 +1145,10 @@ -ea -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=${CodeCacheSize} + ${test_classpath}