Merged
Changes from all commits · 137 commits
ad44447
Initial commit
Jul 11, 2014
845a817
Dumbest proof of concept possible
Jul 11, 2014
cb7c866
First bit of work to get this running against the new Dataflow API
Nov 4, 2014
dce03e4
Update version of dataflow to get new API method access
Dec 2, 2014
08e94b2
Add support for getters and a Flatten impl
Dec 2, 2014
9fdac6c
Such code. Much features.
Dec 2, 2014
deca2c0
Adding some more operators: toiterable, seqdo
Dec 2, 2014
6ee38b2
Support for ParDo.BoundMulti
Dec 3, 2014
64c6d8d
Fix bug in deserializing side inputs
Dec 3, 2014
40adbec
Add SparkRuntimeContext for handling shared runtime objects
Dec 3, 2014
565509d
First cut at aggregators
Dec 3, 2014
bb219d4
First minimally working aggregators
Dec 3, 2014
9152769
Updates for 141206 SDK release
Dec 9, 2014
3bd04ae
Dummy impls of windowing-related ProcContext functions
Dec 9, 2014
ba74f19
Simplify pom.xml
Dec 9, 2014
6aa08e0
Add proper coder handling to RDD retrieval
Dec 9, 2014
45be508
Refactor aggregation related classes.
Dec 10, 2014
67cf364
Add README.md and update project version in pom.xml.
Dec 11, 2014
137d54a
Adds Javadoc and Tests to project.
Dec 10, 2014
b954589
Add apache2 license and cloudera copyright.
Dec 11, 2014
1523ffd
Adds custom checkstyle.
Dec 12, 2014
2992838
Factor out tranform translation logic in to its own class.
Dec 12, 2014
2e3fe1a
Specify and rationalize generic types in State, CoderHelpers to start
srowen Dec 13, 2014
7489263
Add simple word count test.
Dec 16, 2014
f9e8fab
Factor out spark pipeline options.
Dec 17, 2014
ec172ba
Miscellaneous inspection changes from IntelliJ
srowen Dec 19, 2014
ed1e2f7
Issue #13 : attempt to remove all generics warnings, or handle them e…
srowen Dec 20, 2014
225f6c0
Update and specify POM plugin config; Update Spark to 1.1.1, JUnit to…
srowen Dec 21, 2014
1f9cd04
Improve readme to explain current state of the repo, and to encourage…
Dec 24, 2014
ba4b326
Update version of dataflow we depend on.
Dec 23, 2014
d6523b7
Fix side input loading and re-enable approxuniq test
jwills Jan 6, 2015
e798262
Add a close() method to EvaluationResult/EvaluationContext which stop…
dennishuo Jan 16, 2015
4cd1a1c
Add tests for CombinePerKey transform and update other tests to use E…
Jan 17, 2015
8a4daa2
Rename artifact to spark-dataflow; add release plugin config; add RAT…
srowen Jan 26, 2015
54a547a
Release preparation: needs to begin with a -SNAPSHOT version; Javadoc…
srowen Jan 26, 2015
15a622f
[maven-release-plugin] prepare release spark-dataflow-0.0.1
srowen Jan 26, 2015
6ea0b52
[maven-release-plugin] prepare for next development iteration
srowen Jan 26, 2015
3cc19b5
Add Cloudera repo info
srowen Jan 26, 2015
58228c2
Fix formatting of cloudera repo
srowen Jan 26, 2015
eee09b2
Update README w/some notes on motivation. Fixes #20.
Jan 27, 2015
68b72e7
Added new test for TransformTranslator and tested TextIO.Read and Tex…
Feb 5, 2015
b77179a
Update to latest version of Dataflow APIs and mimic
Apr 10, 2015
d5aefef
Add a test for Avro.
tomwhite Apr 28, 2015
60a4c25
Add test that uses DeDupExample from the SDK.
tomwhite May 11, 2015
ba882f2
Add test that uses TfIdf from the SDK and switch to use KryoSerializer.
tomwhite May 11, 2015
e1e9ba7
Make it possible to run the SDK word count example with the SparkPipe…
tomwhite May 12, 2015
e5deb38
Change to use SLF4J rather than java.util.logging.
tomwhite May 13, 2015
ab1503b
Exclude jul-to-slf4j to avoid loops.
tomwhite May 13, 2015
4fa925c
Exclude old Jetty version of servlet API.
tomwhite May 13, 2015
24bad5e
Update to dataflow 0.4.150414.
tomwhite May 13, 2015
ddaa532
Use DataflowAssert in tests.
tomwhite May 13, 2015
aed5e27
Add Travis, Jacoco / Codecov integration. Fix Spark errors due to reu…
srowen May 14, 2015
3962b82
Enable checkstyle, remove some rules, comply with checkstyle rules, f…
srowen May 14, 2015
06e611a
Update to Spark 1.3. Closes issue #38
srowen May 14, 2015
fe5a2bc
Add CONTRIBUTING.md to clarify contribution license; add build flair …
srowen May 14, 2015
2afc03b
Enable RAT during verify phase and add some missing copyright headers
srowen May 14, 2015
64ab065
Remove unnecessary build properties for mvn exec.
tomwhite May 15, 2015
cdb9665
Document how to run on a cluster with spark-submit.
tomwhite May 15, 2015
b8190ea
Support withNumShards() and withoutSharding() for TextIO output
tomwhite May 12, 2015
d82355c
Implement dataflow sharding for text output. With this change the dir…
tomwhite May 15, 2015
1df47a1
Fix README to reflect changes from b6e4787. Also fixes a corner case
tomwhite May 19, 2015
ea2c4ad
Shard number replacement should only be for 'S' pattern in the templa…
tomwhite May 19, 2015
5fb3712
Use internal coder for side inputs, rather than trying to infer our own
tomwhite May 19, 2015
94a8f29
Address review feedback.
tomwhite May 20, 2015
916e039
Use Coders to convert from object-based RDDs to byte-array RDDS for all
tomwhite May 21, 2015
ec2f9e7
Switch to use Kryo serializer.
tomwhite May 21, 2015
a3259c1
Add a comment explaining use of Coders for serialization.
tomwhite May 22, 2015
7d8f818
Add HadoopIO for reading from Hadoop InputFormat classes.
tomwhite May 28, 2015
6d37b90
Address review feedback.
tomwhite May 28, 2015
70b41e3
Remove user-dependent configuration from maven-gpg-plugin.
tomwhite Jun 5, 2015
c04a5a2
[maven-release-plugin] prepare release spark-dataflow-0.1.0
tomwhite Jun 5, 2015
0a599c4
[maven-release-plugin] prepare for next development iteration
tomwhite Jun 5, 2015
5f8fc59
Update README with latest release. Add brief release instructions.
tomwhite Jun 5, 2015
a35ea6a
Compute PCollections that are created only for the side effects in th…
tomwhite Jun 17, 2015
2bb6c11
Wrap failures in pipeline.run() in a RuntimeException.
tomwhite Jun 17, 2015
45d3e61
Implement DoFn.Context.getPipelineOptions().
tomwhite Jun 17, 2015
48347ca
[maven-release-plugin] prepare release spark-dataflow-0.1.1
tomwhite Jun 18, 2015
ced9b30
[maven-release-plugin] prepare for next development iteration
tomwhite Jun 18, 2015
f85beb3
Update README with 0.1.1 minor release.
tomwhite Jun 18, 2015
b0312a1
Allow tests to share a single Spark context.
tomwhite Jun 19, 2015
148979f
Update to dataflow 0.4.150602.
tomwhite Jun 11, 2015
62c0337
[maven-release-plugin] prepare release spark-dataflow-0.2.0
tomwhite Jun 20, 2015
b6c71c2
[maven-release-plugin] prepare for next development iteration
tomwhite Jun 20, 2015
32be82e
Update README with 0.2.0 release.
tomwhite Jun 20, 2015
9e6daf2
Unwrap cause from SparkException if set.
tomwhite Jun 25, 2015
d08675c
Specialize CombineGlobally
tomwhite Jun 26, 2015
80be89e
Fix bug in combinePerKey where accumulator in return value is ignored.
tomwhite Jun 26, 2015
b404b9c
[maven-release-plugin] prepare release spark-dataflow-0.2.1
tomwhite Jun 26, 2015
290193e
[maven-release-plugin] prepare for next development iteration
tomwhite Jun 26, 2015
bacaf8c
Factor out common code from DoFnFunction and MultiDoFnFunction.
tomwhite Jun 29, 2015
13edbec
Implement Aggregator#getCombineFn.
tomwhite Jun 29, 2015
89e2bb5
Implement getAggregatorValues.
tomwhite Jun 29, 2015
78d6614
More cleanup. View.AsSingleton is already exercised by the TfIdf test.
tomwhite Jun 30, 2015
5069eed
Set the RDD's name from the PValue's name, to help diagnosis.
tomwhite Jun 30, 2015
2820534
Resolve some generics warnings with some fancier footwork
srowen Jul 3, 2015
b2f495e
Remove some HadoopIO.Read.Bound factory methods and fluent setters; a…
srowen Jul 6, 2015
b47a8d0
Remove some HadoopIO.Read.Bound factory methods and fluent setters; a…
srowen Jul 6, 2015
c51bc32
Fix checkstyle error
srowen Jul 7, 2015
7cff304
Delay converting PCollection values to bytes in case they are only us…
tomwhite Jul 8, 2015
2d00b3b
Add a system property, dataflow.spark.directBroadcast, to allow pipel…
tomwhite Jul 9, 2015
79b08ad
Make access of boolean system property clearer. (From Sean Owen.)
tomwhite Jul 9, 2015
3cae69b
Only accumulate outputs from one call to processContext, rather than
tomwhite Jul 10, 2015
c01421c
Update to dataflow 0.4.150710.
tomwhite Jul 13, 2015
4415862
Prevent possible NPE.
tomwhite Jul 14, 2015
ebf7053
[maven-release-plugin] prepare release spark-dataflow-0.2.2
tomwhite Jul 14, 2015
72167a2
[maven-release-plugin] prepare for next development iteration
tomwhite Jul 14, 2015
27349ad
Update README to latest version (0.2.2).
tomwhite Jul 14, 2015
7681558
Fix bug where values written to the output in DoFn#startBundle and Do…
tomwhite Jul 14, 2015
fe0b8e9
[maven-release-plugin] prepare release spark-dataflow-0.2.3
tomwhite Jul 16, 2015
4ec8c60
[maven-release-plugin] prepare for next development iteration
tomwhite Jul 16, 2015
3b1441f
Avoid warning email by not running codecov unless it was configured; …
srowen Jul 21, 2015
d7a35bd
[maven-release-plugin] prepare release spark-dataflow-0.3.0
tomwhite Jul 28, 2015
383eeeb
[maven-release-plugin] prepare for next development iteration
tomwhite Jul 28, 2015
b83d666
Update README to latest version (0.3.0).
tomwhite Jul 28, 2015
89945bf
Update to dataflow 0.4.150727.
tomwhite Aug 5, 2015
5ec8d59
[maven-release-plugin] prepare release spark-dataflow-0.4.0
tomwhite Aug 6, 2015
1fdf602
[maven-release-plugin] prepare for next development iteration
tomwhite Aug 6, 2015
922508c
Update README to latest version (0.4.0).
tomwhite Aug 6, 2015
27fd290
Dataflow goes GA! Update to version 1.0.0.
tomwhite Aug 13, 2015
3e767f5
[maven-release-plugin] prepare release spark-dataflow-0.4.1
tomwhite Aug 13, 2015
4536853
[maven-release-plugin] prepare for next development iteration
tomwhite Aug 13, 2015
4b98c16
Update README to latest version (0.4.1).
tomwhite Aug 13, 2015
0c84c9d
Correct input parameter is --inputFile
amitsela Aug 17, 2015
7838865
Add support for writes with HadoopIO. This allows Hadoop
tomwhite Jun 5, 2015
8762b26
Add NullWritableCoder and test.
tomwhite Aug 20, 2015
b8949b8
[maven-release-plugin] prepare release spark-dataflow-0.4.2
tomwhite Aug 20, 2015
ecc33d8
[maven-release-plugin] prepare for next development iteration
tomwhite Aug 20, 2015
90c49b4
Update README to latest version (0.4.2).
tomwhite Aug 20, 2015
8779701
Add tests for Spark 1.4 / 1.5 in Travis
srowen Oct 8, 2015
1c603d1
Fix a few Coverity inspection results plus more IntelliJ results
srowen Oct 8, 2015
22331d1
Propagate user exceptions thrown in DoFns.
tomwhite Aug 13, 2015
f930380
The example needs --inputFile, not --input, to designate the input file
Nov 23, 2015
7a2e9a7
Add spark-streaming support to spark-dataflow
amitsela Oct 22, 2015
3478730
Add support for Flattenning (union) PCollections and test
Jan 16, 2016
a9168bf
Upgrade to latest SDK version 1.3.0
tomwhite Jan 21, 2016
89a21ca
Try to clean up some build warnings, related to generics, and try to …
srowen Jan 21, 2016
1229b00
First wave of changes from feedback
srowen Jan 22, 2016
10 changes: 10 additions & 0 deletions runners/spark/.gitignore
@@ -0,0 +1,10 @@
.classpath
.project
.settings
.cache
target
*.iml
.idea
gen
.DS_Store
dependency-reduced-pom.xml
22 changes: 22 additions & 0 deletions runners/spark/.travis.yml
@@ -0,0 +1,22 @@
language: java
sudo: false
install: mvn ${JAVA} ${SPARK} -DskipTests=true -Dmaven.javadoc.skip=true -B -V install
script: mvn ${JAVA} ${SPARK} ${JACOCO} -Dmaven.javadoc.skip=true -B verify
matrix:
include:
# Covers Java 7, Open JDK, Spark 1.3.x, and code coverage
- jdk: openjdk7
env: JACOCO=-Pjacoco
# Covers Spark 1.4.x
- jdk: openjdk7
env: SPARK=-Dspark.version=1.4.1
# Covers Spark 1.5.x
- jdk: openjdk7
env: SPARK=-Dspark.version=1.5.1
# Covers Java 8, Oracle JDK
- jdk: oraclejdk8
env: JAVA=-Djava.version=1.8
cache:
directories:
- $HOME/.m2
after_success: if [ -n "$JACOCO" ]; then bash <(curl -s https://codecov.io/bash); fi
8 changes: 8 additions & 0 deletions runners/spark/CONTRIBUTING.md
@@ -0,0 +1,8 @@
## Licensing

Contributions via GitHub pull requests are gladly accepted from their original author.
Along with any pull requests, please state that the contribution is your original work and
that you license the work to the project under the project's open source license.
Whether or not you state this explicitly, by submitting any copyrighted material via
pull request, email, or other means you agree to license the material under the project's
open source license and warrant that you have the legal authority to do so.
161 changes: 161 additions & 0 deletions runners/spark/LICENSE

Large diffs are not rendered by default.

113 changes: 113 additions & 0 deletions runners/spark/README.md
@@ -0,0 +1,113 @@
spark-dataflow
==============

## Intro

Spark-dataflow allows users to execute data pipelines written against the Google Cloud Dataflow API
with Apache Spark. Spark-dataflow is an early prototype, and we'll be working on it continuously.
If this project interests you, we welcome issues, comments, and (especially!) pull requests.
To get an idea of the areas we have already identified as needing improvement,
check out the issues listed in the GitHub repo.

## Motivation

We had two primary goals when we started working on Spark-dataflow:

1. *Provide portability for data pipelines written for Google Cloud Dataflow.* Google makes
it really easy to get started writing pipelines against the Dataflow API, but they wanted
to be sure that creating a pipeline using their tools would not lock developers into their
platform. A Spark-based implementation of Dataflow means that you can take your pipeline
logic with you wherever you go. This also means that any new machine learning and anomaly
detection algorithms that are developed against the Dataflow API are available to everyone,
regardless of their underlying execution platform.

2. *Experiment with new data pipeline design patterns.* The Dataflow API has a number of
interesting ideas, especially with respect to the unification of batch and stream data
processing into a single API that maps into two separate engines. The Dataflow streaming
engine, based on Google's [MillWheel](http://research.google.com/pubs/pub41378.html), does
not have a direct open source analogue, and we wanted to understand how to replicate its
functionality using frameworks like Spark Streaming.

## Getting Started

The Maven coordinates of the current version of this project are:

<groupId>com.cloudera.dataflow.spark</groupId>
<artifactId>spark-dataflow</artifactId>
<version>0.4.2</version>

and are hosted in Cloudera's repository at:

<repository>
<id>cloudera.repo</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
</repository>
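
Taken together, the declarations in a consuming project's `pom.xml` would look something like this (a sketch; only the `spark-dataflow` coordinates and repository above are from this README — surrounding project configuration is omitted):

```xml
<repositories>
  <repository>
    <id>cloudera.repo</id>
    <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
  </repository>
</repositories>

<dependencies>
  <dependency>
    <groupId>com.cloudera.dataflow.spark</groupId>
    <artifactId>spark-dataflow</artifactId>
    <version>0.4.2</version>
  </dependency>
</dependencies>
```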

To run a Dataflow pipeline with the default options (a single-threaded Spark
instance in local mode), we would do the following:

Pipeline p = <logic for pipeline creation>
EvaluationResult result = SparkPipelineRunner.create().run(p);

To create a pipeline runner that runs against a different Spark cluster, with a custom master URL,
we would do the following:

Pipeline p = <logic for pipeline creation>
SparkPipelineOptions options = SparkPipelineOptionsFactory.create();
options.setSparkMaster("spark://host:port");
EvaluationResult result = SparkPipelineRunner.create(options).run(p);

## Word Count Example

First download a text document to use as input:

curl http://www.gutenberg.org/cache/epub/1128/pg1128.txt > /tmp/kinglear.txt

Then run the [word count example][wc] from the SDK using a single-threaded Spark instance
in local mode:

mvn exec:exec -DmainClass=com.google.cloud.dataflow.examples.WordCount \
-Dinput=/tmp/kinglear.txt -Doutput=/tmp/out -Drunner=SparkPipelineRunner \
-DsparkMaster=local

Check the output by running:

head /tmp/out-00000-of-00001

__Note: running examples using `mvn exec:exec` only works for Spark local mode at the
moment. See the next section for how to run on a cluster.__
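
The output file name `out-00000-of-00001` follows a Dataflow-style shard name template, in which a run of `S` characters is replaced by the zero-padded shard number and a run of `N` characters by the shard count (the commits above on sharded text output implement this replacement). A rough, hypothetical sketch of the naming scheme — not the actual implementation, and the default template string is an assumption:

```java
// Sketch of Dataflow-style sharded output file naming. In a template
// such as "-SSSSS-of-NNNNN", each run of 'S' is replaced by the shard
// number and each run of 'N' by the shard count, zero-padded to the
// run's width; all other characters are copied through literally.
public class ShardNames {
  static String shardName(String prefix, String template, int shard, int numShards) {
    StringBuilder sb = new StringBuilder(prefix);
    int i = 0;
    while (i < template.length()) {
      char c = template.charAt(i);
      if (c == 'S' || c == 'N') {
        int start = i;
        while (i < template.length() && template.charAt(i) == c) {
          i++;
        }
        int width = i - start;
        int value = (c == 'S') ? shard : numShards;
        sb.append(String.format("%0" + width + "d", value));
      } else {
        sb.append(c);
        i++;
      }
    }
    return sb.toString();
  }

  public static void main(String[] args) {
    // Single shard with the assumed default template
    System.out.println(shardName("/tmp/out", "-SSSSS-of-NNNNN", 0, 1));
    // -> /tmp/out-00000-of-00001
  }
}
```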

[wc]: https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/WordCount.java

## Running on a Cluster

Spark Dataflow pipelines can be run on a cluster using the `spark-submit` command.

First copy a text document to HDFS:

curl http://www.gutenberg.org/cache/epub/1128/pg1128.txt | hadoop fs -put - kinglear.txt

Then run the word count example using `spark-submit` with the `yarn-client` master
(`yarn-cluster` works just as well):

spark-submit \
--class com.google.cloud.dataflow.examples.WordCount \
--master yarn-client \
target/spark-dataflow-*-spark-app.jar \
--inputFile=kinglear.txt --output=out --runner=SparkPipelineRunner --sparkMaster=yarn-client

Check the output by running:

hadoop fs -tail out-00000-of-00002

## How to Release

Committers can release the project using the standard [Maven Release Plugin](http://maven.apache.org/maven-release/maven-release-plugin/) commands:

mvn release:prepare
mvn release:perform -Darguments="-Dgpg.passphrase=XXX"

Note that you will need a [public GPG key](http://www.apache.org/dev/openpgp.html).

[![Build Status](https://travis-ci.org/cloudera/spark-dataflow.png?branch=master)](https://travis-ci.org/cloudera/spark-dataflow)
[![codecov.io](https://codecov.io/github/cloudera/spark-dataflow/coverage.svg?branch=master)](https://codecov.io/github/cloudera/spark-dataflow?branch=master)
222 changes: 222 additions & 0 deletions runners/spark/build-resources/checkstyle.xml
@@ -0,0 +1,222 @@
<?xml version="1.0"?>
<!DOCTYPE module PUBLIC
"-//Puppy Crawl//DTD Check Configuration 1.2//EN"
"http://www.puppycrawl.com/dtds/configuration_1_2.dtd">
<!--
Copyright (c) 2014, Cloudera, Inc. All Rights Reserved.

Cloudera, Inc. licenses this file to you under the Apache License,
Version 2.0 (the "License"). You may not use this file except in
compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied. See the License for
the specific language governing permissions and limitations under the
License.
-->
<!--

Checkstyle configuration for spark-dataflow that is based on the
sun_checks.xml file that is bundled with Checkstyle and includes
checks for:

- the Java Language Specification at
http://java.sun.com/docs/books/jls/second_edition/html/index.html

- the Sun Code Conventions at http://java.sun.com/docs/codeconv/

- the Javadoc guidelines at
http://java.sun.com/j2se/javadoc/writingdoccomments/index.html

- the JDK Api documentation http://java.sun.com/j2se/docs/api/index.html

- some best practices

Checkstyle is very configurable. Be sure to read the documentation at
http://checkstyle.sf.net (or in your downloaded distribution).

Most checks are configurable; be sure to consult the documentation.

To completely disable a check, comment it out or delete it from the file.

-->

<module name="Checker">
<!-- Checks for the file header. -->
<!-- See http://checkstyle.sf.net/config_header.html -->
<module name="Header">
<property name="headerFile" value="${checkstyle.header.file}"/>
<property name="ignoreLines" value="2"/>
<property name="fileExtensions" value="java, scala"/>
</module>

<!-- Checks whether files end with a new line. -->
<!-- See http://checkstyle.sf.net/config_misc.html#NewlineAtEndOfFile -->
<module name="NewlineAtEndOfFile"/>

<module name="FileLength"/>
<module name="FileTabCharacter"/>

<!-- <module name="JavadocPackage"/> -->

<module name="TreeWalker">
<!-- Checks for Javadoc comments. -->
<!-- See http://checkstyle.sf.net/config_javadoc.html -->
<!-- <module name="JavadocType"/> -->
<module name="JavadocMethod">
<property name="scope" value="package"/>
<property name="allowUndeclaredRTE" value="true"/>
<property name="allowThrowsTagsForSubclasses" value="true"/>
<property name="validateThrows" value="true"/>
<property name="allowMissingJavadoc" value="true"/>
</module>
<module name="JavadocStyle"/>

<module name="SuperClone"/>
<module name="SuperFinalize"/>

<!-- Checks for Naming Conventions. -->
<!-- See http://checkstyle.sf.net/config_naming.html -->
<module name="ConstantName"/>
<module name="ClassTypeParameterName">
<property name="format" value="^[A-Z]+$"/>
</module>
<module name="LocalFinalVariableName"/>
<module name="LocalVariableName"/>
<module name="MethodName"/>
<module name="MethodTypeParameterName">
<property name="format" value="^[A-Z]+$"/>
</module>
<module name="PackageName"/>
<module name="ParameterName"/>
<!-- <module name="StaticVariableName"/> -->
<module name="TypeName"/>

<!-- Checks for imports -->
<!-- See http://checkstyle.sf.net/config_import.html -->
<module name="IllegalImport"/>
<!-- defaults to sun.* packages -->
<module name="RedundantImport"/>
<module name="UnusedImports"/>
<module name="ImportOrder">
<property name="groups" value="/^(java)|(javax)/,*,/^(com\.cloudera)/"/>
<property name="ordered" value="true"/>
<property name="separated" value="true"/>
<property name="option" value="top"/>
</module>

<!-- Checks for Size Violations. -->
<!-- See http://checkstyle.sf.net/config_sizes.html -->
<module name="LineLength">
<property name="max" value="100"/>
</module>
<module name="MethodLength"/>
<module name="ParameterNumber">
<property name="max" value="8"/>
</module>
<module name="OuterTypeNumber"/>

<!-- Checks for whitespace -->
<!-- See http://checkstyle.sf.net/config_whitespace.html -->
<module name="GenericWhitespace"/>
<module name="EmptyForIteratorPad"/>
<module name="MethodParamPad"/>
<module name="NoWhitespaceAfter">
<property name="tokens"
value="BNOT, DEC, DOT, INC, LNOT, UNARY_MINUS, UNARY_PLUS"/>
</module>
<module name="NoWhitespaceBefore"/>
<!-- <module name="OperatorWrap"/> -->
<module name="ParenPad"/>
<module name="TypecastParenPad"/>
<module name="WhitespaceAfter">
<property name="tokens" value="COMMA, SEMI"/>
</module>
<module name="WhitespaceAround">
<property name="allowEmptyConstructors" value="true"/>
<property name="allowEmptyMethods" value="true"/>
<property name="tokens"
value="BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN, BXOR, BXOR_ASSIGN, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LE, LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_RETURN, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE, LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL, PLUS, PLUS_ASSIGN, QUESTION, SL, SLIST, SL_ASSIGN, SR, SR_ASSIGN, STAR, STAR_ASSIGN, TYPE_EXTENSION_AND"/>
</module>

<!-- Modifier Checks -->
<!-- See http://checkstyle.sf.net/config_modifiers.html -->
<module name="ModifierOrder"/>
<module name="RedundantModifier"/>


<!-- Checks for blocks. You know, those {}'s -->
<!-- See http://checkstyle.sf.net/config_blocks.html -->
<module name="AvoidNestedBlocks">
<property name="allowInSwitchCase" value="true"/>
</module>
<module name="EmptyBlock">
<!-- catch blocks need a statement or a comment. -->
<property name="option" value="text"/>
<property name="tokens" value="LITERAL_CATCH"/>
</module>
<module name="EmptyBlock">
<!-- all other blocks need a real statement. -->
<property name="option" value="stmt"/>
<property name="tokens" value="LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY,
LITERAL_IF, LITERAL_FOR, LITERAL_TRY, LITERAL_WHILE, INSTANCE_INIT,
STATIC_INIT"/>
</module>
<module name="LeftCurly"/>
<module name="NeedBraces"/>
<module name="RightCurly"/>

<!-- Checks for common coding problems -->
<!-- See http://checkstyle.sf.net/config_coding.html -->
<!-- module name="AvoidInlineConditionals"/-->
<module name="EmptyStatement"/>
<module name="EqualsHashCode"/>
<module name="StringLiteralEquality"/>
<module name="HiddenField">
<property name="ignoreConstructorParameter" value="true"/>
</module>
<module name="IllegalInstantiation"/>
<module name="InnerAssignment"/>
<module name="MissingSwitchDefault"/>
<!--<module name="RedundantThrows"/>-->
<module name="SimplifyBooleanExpression"/>
<module name="SimplifyBooleanReturn"/>
<module name="DefaultComesLast"/>

<!-- Checks for class design -->
<!-- See http://checkstyle.sf.net/config_design.html -->
<module name="FinalClass"/>
<module name="HideUtilityClassConstructor"/>
<module name="InterfaceIsType"/>
<module name="VisibilityModifier">
<property name="protectedAllowed" value="true"/>
</module>
<module name="MissingOverride"/>

<!-- Miscellaneous other checks. -->
<!-- See http://checkstyle.sf.net/config_misc.html -->
<module name="ArrayTypeStyle"/>
<module name="ArrayTrailingComma"/>
<module name="UpperEll"/>
<module name="Regexp">
<property name="format" value="[ \t]+$"/>
<property name="illegalPattern" value="true"/>
<property name="message" value="Trailing whitespace"/>
</module>

<module name="FileContentsHolder"/>
</module>

<!-- allow warnings to be suppressed -->
<module name="SuppressionCommentFilter">
<property name="offCommentFormat" value="CSOFF\: ([\w\|]+)"/>
<property name="onCommentFormat" value="CSON\: ([\w\|]+)"/>
<property name="checkFormat" value="$1"/>
</module>

<module name="SuppressionFilter"/>
</module>
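
The `SuppressionCommentFilter` above lets developers switch checks off and on inline with `CSOFF:`/`CSON:` comments naming the checks to suppress. A hypothetical Java usage (the check name and URL here are illustrative, not taken from this repository):

```java
// Hypothetical example of the CSOFF/CSON comment format recognized by
// the SuppressionCommentFilter configured above: the named check is
// disabled between the two comments.
public class SuppressionExample {
  // CSOFF: LineLength
  static final String LONG_URL =
      "https://example.invalid/a/deliberately/long/path/segment/that/would/otherwise/trip/the/100-character/line-length/check";
  // CSON: LineLength

  public static void main(String[] args) {
    // The literal above is long enough to violate the 100-column limit
    System.out.println(LONG_URL.length());
  }
}
```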
14 changes: 14 additions & 0 deletions runners/spark/build-resources/header-file.txt
@@ -0,0 +1,14 @@
/*
* Copyright (c) 2015, Cloudera, Inc. All Rights Reserved.
*
* Cloudera, Inc. licenses this file to you under the Apache License,
* Version 2.0 (the "License"). You may not use this file except in
* compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for
* the specific language governing permissions and limitations under the
* License.
*/