+ */
+public class TopHashtagsExample {
+
+ private static final Logger LOG = LoggerFactory.getLogger(TopHashtagsExample.class);
+
+ /**
+ * Options for the app.
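+ *
+ * <p>For illustration, these options can be passed on the command line (the values below are
+ * hypothetical); {@code PipelineOptionsFactory} maps each {@code --name=value} argument to the
+ * corresponding setter:
+ * <pre>{@code
+ * --bootstrapServers=localhost:9092 --topics=tweets --outputTopic=top_hashtags
+ * }</pre>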
+ */
+ public static interface Options extends PipelineOptions {
+ @Description("Sliding window length in minutes")
+ @Default.Integer(10)
+ Integer getSlidingWindowLengthMinutes();
+ void setSlidingWindowLengthMinutes(Integer value);
+
+ @Description("Trigger window interval in minutes")
+ @Default.Integer(1)
+ Integer getSlidingWindowIntervalMinutes();
+ void setSlidingWindowIntervalMinutes(Integer value);
+
+ @Description("Bootstrap Server(s) for Kafka")
+ @Required
+ String getBootstrapServers();
+ void setBootstrapServers(String servers);
+
+ @Description("One or more comma separated topics to read from")
+ @Required
+ List<String> getTopics();
+ void setTopics(List<String> topics);
+
+ @Description("Number of Top Hashtags to track")
+ @Default.Integer(10)
+ Integer getNumTopHashtags();
+ void setNumTopHashtags(Integer count);
+
+ @Description("Kafka topic name for writing results")
+ @Required
+ String getOutputTopic();
+ void setOutputTopic(String topic);
+ }
+
+ public static void main(String[] args) {
+
+ Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+
+ Pipeline pipeline = Pipeline.create(options);
+
+ pipeline
+ .apply(KafkaIO.read()
+ .withBootstrapServers(options.getBootstrapServers())
+ .withTopics(options.getTopics())
+ .withValueCoder(StringUtf8Coder.of())
+ .withTimestampFn(TWEET_TIMESTAMP_OR_NOW)
+ .withoutMetadata())
+ .apply(Values.create())
+ .apply(ParDo.of(new ExtractHashtagsFn()))
+ .apply(Window.into(SlidingWindows
+ .of(Duration.standardMinutes(options.getSlidingWindowLengthMinutes()))
+ .every(Duration.standardMinutes(options.getSlidingWindowIntervalMinutes()))))
+ .apply(Count.perElement())
+ .apply(Top.of(options.getNumTopHashtags(), new KV.OrderByValue<String, Long>())
+ .withoutDefaults())
+ .apply(ParDo.of(new OutputFormatter()))
+ .apply(ParDo.of(new KafkaWriter(options)));
+
+ pipeline.run();
+ }
+
+ // The rest of the file implements DoFns to do the following:
+ // - extract hashtags
+ // - format results in json
+ // - write the results back to Kafka (useful for monitoring the end result).
+
+ private static final ObjectMapper JSON_MAPPER = new ObjectMapper();
+
+ /**
+ * Emit hashtags in the tweet (if any).
+ */
+ private static class ExtractHashtagsFn extends DoFn<String, String> {
+
+ @Override
+ public void processElement(ProcessContext ctx) throws Exception {
+ for (JsonNode hashtag : JSON_MAPPER.readTree(ctx.element())
+ .with("entities")
+ .withArray("hashtags")) {
+ ctx.output(hashtag.get("text").asText());
+ }
+ }
+ }
+
+ // extract timestamp from "timestamp_ms" field.
+ private static final SerializableFunction<KV<byte[], String>, Instant> TWEET_TIMESTAMP_OR_NOW =
+ new SerializableFunction<KV<byte[], String>, Instant>() {
+ @Override
+ public Instant apply(KV<byte[], String> kv) {
+ try {
+ long tsMillis = JSON_MAPPER.readTree(kv.getValue()).path("timestamp_ms").asLong();
+ return tsMillis == 0 ? Instant.now() : new Instant(tsMillis);
+ } catch (Exception e) {
+ throw Throwables.propagate(e);
+ }
+ }
+ };
+
+ // Returns a JSON string containing the top hashtags and window information.
+ private static class OutputFormatter extends DoFn<List<KV<String, Long>>, String>
+ implements DoFn.RequiresWindowAccess {
+
+ private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormat
+ .forPattern("yyyy-MM-dd HH:mm:ss")
+ .withZoneUTC();
+ private static final ObjectWriter JSON_WRITER = new ObjectMapper()
+ .writerWithType(OutputJson.class);
+
+ static class OutputJson {
+ @JsonProperty String windowStart;
+ @JsonProperty String windowEnd;
+ @JsonProperty String generatedAt;
+ @JsonProperty List<HashtagInfo> topHashtags;
+
+ OutputJson(String windowStart, String windowEnd,
+ String generatedAt, List<HashtagInfo> topHashtags) {
+ this.windowStart = windowStart;
+ this.windowEnd = windowEnd;
+ this.generatedAt = generatedAt;
+ this.topHashtags = topHashtags;
+ }
+ }
+
+ static class HashtagInfo {
+ @JsonProperty final String hashtag;
+ @JsonProperty final long count;
+ HashtagInfo(String hashtag, long count) {
+ this.hashtag = hashtag;
+ this.count = count;
+ }
+ }
+
+ @Override
+ public void processElement(ProcessContext ctx) throws Exception {
+
+ List<HashtagInfo> topHashtags = new ArrayList<>(ctx.element().size());
+
+ for (KV<String, Long> tag : ctx.element()) {
+ topHashtags.add(new HashtagInfo(tag.getKey(), tag.getValue()));
+ }
+
+ IntervalWindow window = (IntervalWindow) ctx.window();
+
+ String json = JSON_WRITER.writeValueAsString(new OutputJson(
+ DATE_FORMATTER.print(window.start()),
+ DATE_FORMATTER.print(window.end()),
+ DATE_FORMATTER.print(Instant.now()),
+ topHashtags));
+
+ ctx.output(json);
+ }
+ }
+
+ private static class KafkaWriter extends DoFn<String, Void> {
+
+ private final String topic;
+ private final Map<String, Object> config;
+ private static transient KafkaProducer<String, String> producer = null;
+
+ public KafkaWriter(Options options) {
+ this.topic = options.getOutputTopic();
+ this.config = ImmutableMap.of(
+ "bootstrap.servers", options.getBootstrapServers(),
+ "key.serializer", StringSerializer.class.getName(),
+ "value.serializer", StringSerializer.class.getName());
+ }
+
+ @Override
+ public void startBundle(Context c) throws Exception {
+ if (producer == null) { // in Beam, startBundle might be called multiple times.
+ producer = new KafkaProducer<String, String>(config);
+ }
+ }
+
+ @Override
+ public void finishBundle(Context c) throws Exception {
+ producer.flush();
+ }
+
+ @Override
+ public void processElement(ProcessContext ctx) throws Exception {
+ LOG.trace("Top Hashtags : {}", ctx.element());
+ producer.send(new ProducerRecord<String, String>(topic, ctx.element()));
+ }
+ }
+}
diff --git a/contrib/kafka/pom.xml b/contrib/kafka/pom.xml
new file mode 100644
index 0000000000..7fe8165bc8
--- /dev/null
+++ b/contrib/kafka/pom.xml
@@ -0,0 +1,176 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>com.google.cloud.dataflow</groupId>
+  <artifactId>google-cloud-dataflow-java-contrib-kafka</artifactId>
+  <name>Google Cloud Dataflow Kafka Connectors</name>
+  <description>Dataflow Library to read Kafka topics</description>
+  <version>0.0.1-SNAPSHOT</version>
+  <packaging>jar</packaging>
+
+  <licenses>
+    <license>
+      <name>Apache License, Version 2.0</name>
+      <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+      <distribution>repo</distribution>
+    </license>
+  </licenses>
+
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <google-cloud-dataflow-version>[1.2.0,2.0.0)</google-cloud-dataflow-version>
+    <hamcrest.version>1.3</hamcrest.version>
+    <junit.version>4.11</junit.version>
+    <slf4j.version>1.7.7</slf4j.version>
+  </properties>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>3.2</version>
+        <configuration>
+          <source>1.7</source>
+          <target>1.7</target>
+        </configuration>
+      </plugin>
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-checkstyle-plugin</artifactId>
+        <version>2.12</version>
+        <dependencies>
+          <dependency>
+            <groupId>com.puppycrawl.tools</groupId>
+            <artifactId>checkstyle</artifactId>
+            <version>6.6</version>
+          </dependency>
+        </dependencies>
+        <configuration>
+          <configLocation>../../checkstyle.xml</configLocation>
+          <consoleOutput>true</consoleOutput>
+          <failOnViolation>true</failOnViolation>
+          <includeTestSourceDirectory>true</includeTestSourceDirectory>
+        </configuration>
+        <executions>
+          <execution>
+            <goals>
+              <goal>check</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-source-plugin</artifactId>
+        <version>2.4</version>
+        <executions>
+          <execution>
+            <id>attach-sources</id>
+            <phase>compile</phase>
+            <goals>
+              <goal>jar</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>attach-test-sources</id>
+            <phase>test-compile</phase>
+            <goals>
+              <goal>test-jar</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-javadoc-plugin</artifactId>
+        <version>2.10.3</version>
+        <configuration>
+          <windowtitle>Google Cloud Dataflow Kafka Contrib</windowtitle>
+          <doctitle>Google Cloud Dataflow Kafka Contrib</doctitle>
+          <subpackages>com.google.cloud.dataflow.contrib.kafka</subpackages>
+          <offlineLinks>
+            <offlineLink>
+              <url>https://cloud.google.com/dataflow/java-sdk/JavaDoc/</url>
+              <location>${basedir}/../../javadoc/dataflow-sdk-docs</location>
+            </offlineLink>
+            <offlineLink>
+              <url>http://docs.guava-libraries.googlecode.com/git-history/release18/javadoc/</url>
+              <location>${basedir}/../../javadoc/guava-docs</location>
+            </offlineLink>
+          </offlineLinks>
+        </configuration>
+        <executions>
+          <execution>
+            <goals>
+              <goal>jar</goal>
+            </goals>
+            <phase>package</phase>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+
+  <dependencies>
+    <dependency>
+      <groupId>com.google.cloud.dataflow</groupId>
+      <artifactId>google-cloud-dataflow-java-sdk-all</artifactId>
+      <version>${google-cloud-dataflow-version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.kafka</groupId>
+      <artifactId>kafka-clients</artifactId>
+      <version>[0.9,)</version>
+    </dependency>
+
+    <!-- test dependencies -->
+    <dependency>
+      <groupId>org.hamcrest</groupId>
+      <artifactId>hamcrest-all</artifactId>
+      <version>${hamcrest.version}</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>${junit.version}</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-jdk14</artifactId>
+      <version>${slf4j.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+</project>
diff --git a/contrib/kafka/src/main/java/com/google/cloud/dataflow/contrib/kafka/KafkaCheckpointMark.java b/contrib/kafka/src/main/java/com/google/cloud/dataflow/contrib/kafka/KafkaCheckpointMark.java
new file mode 100644
index 0000000000..9b33ee809c
--- /dev/null
+++ b/contrib/kafka/src/main/java/com/google/cloud/dataflow/contrib/kafka/KafkaCheckpointMark.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.contrib.kafka;
+
+import com.google.cloud.dataflow.sdk.coders.DefaultCoder;
+import com.google.cloud.dataflow.sdk.coders.SerializableCoder;
+import com.google.cloud.dataflow.sdk.io.UnboundedSource;
+
+import org.apache.kafka.common.TopicPartition;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * Checkpoint for an unbounded KafkaIO.Read. Consists of Kafka topic name, partition id,
+ * and the latest offset consumed so far.
+ */
+@DefaultCoder(SerializableCoder.class)
+public class KafkaCheckpointMark implements UnboundedSource.CheckpointMark, Serializable {
+
+ private final List<PartitionMark> partitions;
+
+ public KafkaCheckpointMark(List<PartitionMark> partitions) {
+ this.partitions = partitions;
+ }
+
+ public List<PartitionMark> getPartitions() {
+ return partitions;
+ }
+
+ @Override
+ public void finalizeCheckpoint() throws IOException {
+ /* nothing to do */
+
+ // We might want to support committing offsets to Kafka for a better resume point when the job
+ // is restarted (the checkpoint is not available across job restarts).
+ }
+
+ /**
+ * A tuple to hold topic, partition, and offset that comprise the checkpoint
+ * for a single partition.
+ */
+ public static class PartitionMark implements Serializable {
+ private final TopicPartition topicPartition;
+ private final long offset;
+
+ public PartitionMark(TopicPartition topicPartition, long offset) {
+ this.topicPartition = topicPartition;
+ this.offset = offset;
+ }
+
+ public TopicPartition getTopicPartition() {
+ return topicPartition;
+ }
+
+ public long getOffset() {
+ return offset;
+ }
+ }
+}
+
diff --git a/contrib/kafka/src/main/java/com/google/cloud/dataflow/contrib/kafka/KafkaIO.java b/contrib/kafka/src/main/java/com/google/cloud/dataflow/contrib/kafka/KafkaIO.java
new file mode 100644
index 0000000000..ad254ee735
--- /dev/null
+++ b/contrib/kafka/src/main/java/com/google/cloud/dataflow/contrib/kafka/KafkaIO.java
@@ -0,0 +1,1053 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.contrib.kafka;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+
+import com.google.cloud.dataflow.contrib.kafka.KafkaCheckpointMark.PartitionMark;
+import com.google.cloud.dataflow.sdk.coders.ByteArrayCoder;
+import com.google.cloud.dataflow.sdk.coders.Coder;
+import com.google.cloud.dataflow.sdk.coders.SerializableCoder;
+import com.google.cloud.dataflow.sdk.io.Read.Unbounded;
+import com.google.cloud.dataflow.sdk.io.UnboundedSource;
+import com.google.cloud.dataflow.sdk.io.UnboundedSource.CheckpointMark;
+import com.google.cloud.dataflow.sdk.io.UnboundedSource.UnboundedReader;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
+import com.google.cloud.dataflow.sdk.util.ExposedByteArrayInputStream;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PBegin;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.PInput;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Function;
+import com.google.common.base.Joiner;
+import com.google.common.base.Optional;
+import com.google.common.base.Throwables;
+import com.google.common.collect.ComparisonChain;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
+
+import org.apache.kafka.clients.consumer.Consumer;
+import org.apache.kafka.clients.consumer.ConsumerConfig;
+import org.apache.kafka.clients.consumer.ConsumerRecord;
+import org.apache.kafka.clients.consumer.ConsumerRecords;
+import org.apache.kafka.clients.consumer.KafkaConsumer;
+import org.apache.kafka.common.PartitionInfo;
+import org.apache.kafka.common.TopicPartition;
+import org.apache.kafka.common.errors.WakeupException;
+import org.apache.kafka.common.serialization.ByteArrayDeserializer;
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.Random;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.SynchronousQueue;
+import java.util.concurrent.TimeUnit;
+
+import javax.annotation.Nullable;
+
+/**
+ * An unbounded source for Kafka topics. Kafka versions 0.9
+ * and above are supported.
+ *
+ * <h3>Reading from Kafka topics</h3>
+ *
+ * <p>KafkaIO source returns an unbounded collection of Kafka records as
+ * {@code PCollection<KafkaRecord<K, V>>}. A {@link KafkaRecord} includes basic
+ * metadata like topic-partition and offset, along with key and value associated with a Kafka
+ * record.
+ *
+ * <p>Although most applications consume a single topic, the source can be configured to consume
+ * multiple topics or even a specific set of {@link TopicPartition}s.
+ *
+ * <p>To configure a Kafka source, you must specify at the minimum Kafka bootstrapServers
+ * and one or more topics to consume. The following example illustrates various options for
+ * configuring the source:
+ *
+ * <pre>{@code
+ *
+ * pipeline
+ * .apply(KafkaIO.read()
+ * .withBootstrapServers("broker_1:9092,broker_2:9092")
+ * .withTopics(ImmutableList.of("topic_a", "topic_b"))
+ * // above two are required configuration. returns PCollection<KafkaRecord<byte[], byte[]>>
+ *
+ * // rest of the settings are optional :
+ *
+ * // set a Coder for Key and Value (note the change to return type)
+ * .withKeyCoder(BigEndianLongCoder.of()) // PCollection<KafkaRecord<Long, byte[]>>
+ * .withValueCoder(StringUtf8Coder.of()) // PCollection<KafkaRecord<Long, String>>
+ *
+ * // you can further customize KafkaConsumer used to read the records by adding more
+ * // settings for ConsumerConfig. e.g :
+ * .updateConsumerProperties(ImmutableMap.of("receive.buffer.bytes", 1024 * 1024))
+ *
+ * // custom function for calculating record timestamp (default is processing time)
+ * .withTimestampFn(new MyTimestampFunction())
+ *
+ * // custom function for watermark (default is record timestamp)
+ * .withWatermarkFn(new MyWatermarkFunction())
+ *
+ * // finally, if you don't need Kafka metadata, you can drop it
+ * .withoutMetadata() // PCollection<KV<Long, String>>
+ * )
+ * .apply(Values.create()) // PCollection<String>
+ * ...
+ * }</pre>
+ *
+ * <h3>Partition Assignment and Checkpointing</h3>
+ * The Kafka partitions are evenly distributed among splits (workers).
+ * Dataflow checkpointing is fully supported and
+ * each split can resume from a previous checkpoint. See
+ * {@link UnboundedKafkaSource#generateInitialSplits(int, PipelineOptions)} for more details on
+ * splits and checkpoint support.
+ *
+ * <p>When the pipeline starts for the first time without any checkpoint, the source starts
+ * consuming from the latest offsets. You can override this behavior to consume from the
+ * beginning by setting appropriate properties in {@link ConsumerConfig}, through
+ * {@link Read#updateConsumerProperties(Map)}.
+ *
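+ * <p>For example, a sketch that starts reading from the earliest available offsets instead
+ * (broker and topic names are placeholders; "auto.offset.reset" is a standard Kafka consumer
+ * property):
+ *
+ * <pre>{@code
+ * KafkaIO.read()
+ * .withBootstrapServers("broker_1:9092")
+ * .withTopics(ImmutableList.of("topic_a"))
+ * .updateConsumerProperties(
+ * ImmutableMap.<String, Object>of(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"))
+ * }</pre>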
+ *
+ * <h3>Advanced Kafka Configuration</h3>
+ * KafkaIO allows setting most of the properties in {@link ConsumerConfig}. E.g. if you would like
+ * to enable offset auto commit (for external monitoring or other purposes), you can set
+ * "group.id", "enable.auto.commit", etc.
+ */
+public class KafkaIO {
+
+ private static final Logger LOG = LoggerFactory.getLogger(KafkaIO.class);
+
+ private static class NowTimestampFn<T> implements SerializableFunction<T, Instant> {
+ @Override
+ public Instant apply(T input) {
+ return Instant.now();
+ }
+ }
+
+
+ /**
+ * Creates an uninitialized {@link Read} {@link PTransform}. Before use, basic Kafka
+ * configuration should be set with {@link Read#withBootstrapServers(String)} and
+ * {@link Read#withTopics(List)}. Other optional settings include key and value coders,
+ * custom timestamp and watermark functions. Additionally, {@link Read#withMetadata()} provides
+ * access to Kafka metadata for each record (topic name, partition, offset).
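+ *
+ * <p>A minimal sketch (the broker address and topic name are placeholders):
+ *
+ * <pre>{@code
+ * KafkaIO.read()
+ * .withBootstrapServers("broker_1:9092")
+ * .withTopics(ImmutableList.of("my_topic"))
+ * }</pre>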
+ */
+ public static Read<byte[], byte[]> read() {
+ return new Read<byte[], byte[]>(
+ new ArrayList<String>(),
+ new ArrayList<TopicPartition>(),
+ ByteArrayCoder.of(),
+ ByteArrayCoder.of(),
+ Read.KAFKA_9_CONSUMER_FACTORY_FN,
+ Read.DEFAULT_CONSUMER_PROPERTIES,
+ Long.MAX_VALUE,
+ null);
+ }
+
+ /**
+ * A {@link PTransform} to read from Kafka topics. See {@link KafkaIO} for more
+ * information on usage and configuration.
+ */
+ public static class Read<K, V> extends TypedRead<K, V> {
+
+ /**
+ * Returns a new {@link Read} with Kafka consumer pointing to {@code bootstrapServers}.
+ */
+ public Read<K, V> withBootstrapServers(String bootstrapServers) {
+ return updateConsumerProperties(
+ ImmutableMap.of(
+ ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers));
+ }
+
+ /**
+ * Returns a new {@link Read} that reads from the topics. All the partitions from each
+ * of the topics are read.
+ * See {@link UnboundedKafkaSource#generateInitialSplits(int, PipelineOptions)} for description
+ * of how the partitions are distributed among the splits.
+ */
+ public Read<K, V> withTopics(List<String> topics) {
+ checkState(topicPartitions.isEmpty(), "Only topics or topicPartitions can be set, not both");
+
+ return new Read<K, V>(ImmutableList.copyOf(topics), topicPartitions, keyCoder, valueCoder,
+ consumerFactoryFn, consumerConfig, maxNumRecords, maxReadTime);
+ }
+
+ /**
+ * Returns a new {@link Read} that reads from the partitions. This allows reading only a subset
+ * of partitions for one or more topics when (if ever) needed.
+ * See {@link UnboundedKafkaSource#generateInitialSplits(int, PipelineOptions)} for description
+ * of how the partitions are distributed among the splits.
+ */
+ public Read<K, V> withTopicPartitions(List<TopicPartition> topicPartitions) {
+ checkState(topics.isEmpty(), "Only topics or topicPartitions can be set, not both");
+
+ return new Read<K, V>(topics, ImmutableList.copyOf(topicPartitions), keyCoder, valueCoder,
+ consumerFactoryFn, consumerConfig, maxNumRecords, maxReadTime);
+ }
+
+ /**
+ * Returns a new {@link Read} with {@link Coder} for key bytes.
+ */
+ public <KeyT> Read<KeyT, V> withKeyCoder(Coder<KeyT> keyCoder) {
+ return new Read<KeyT, V>(topics, topicPartitions, keyCoder, valueCoder,
+ consumerFactoryFn, consumerConfig, maxNumRecords, maxReadTime);
+ }
+
+ /**
+ * Returns a new {@link Read} with {@link Coder} for value bytes.
+ */
+ public <ValueT> Read<K, ValueT> withValueCoder(Coder<ValueT> valueCoder) {
+ return new Read<K, ValueT>(topics, topicPartitions, keyCoder, valueCoder,
+ consumerFactoryFn, consumerConfig, maxNumRecords, maxReadTime);
+ }
+
+ /**
+ * A factory to create Kafka {@link Consumer} from consumer configuration.
+ * This is useful for supporting another version of Kafka consumer.
+ * Default is {@link KafkaConsumer}.
+ */
+ public Read<K, V> withConsumerFactoryFn(
+ SerializableFunction