This repository was archived by the owner on Nov 11, 2022. It is now read-only.
6 changes: 6 additions & 0 deletions contrib/kafka/AUTHORS.md
@@ -0,0 +1,6 @@
# Authors of 'kafka' module

The following is the official list of authors for copyright purposes of this community-contributed module.

Workiva
Kevin Sookocheff, kevin [dot] sookocheff [at] workiva [dot] com
7 changes: 7 additions & 0 deletions contrib/kafka/README.md
@@ -0,0 +1,7 @@
# Kafka module

This library provides a streaming Dataflow source for Kafka. It is a work in progress and
is not considered ready for production use.

Dependencies:

- Kafka >= 0.9.0.0
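
A minimal usage sketch based on the sample job included in this change (the class name `KafkaReadExample` and the topic, server, and group id values are placeholders; the import paths assume `KafkaSource` lives alongside `KafkaCheckpoint` in the `contrib.kafka.io` package):

```java
import com.google.cloud.dataflow.contrib.kafka.coders.ConsumerRecordCoder;
import com.google.cloud.dataflow.contrib.kafka.io.KafkaSource;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;

import org.apache.kafka.clients.consumer.ConsumerRecord;

public class KafkaReadExample {
  public static void main(String[] args) throws Exception {
    DataflowPipelineOptions options = PipelineOptionsFactory.fromArgs(args)
        .withValidation()
        .as(DataflowPipelineOptions.class);
    options.setStreaming(true); // KafkaSource is an unbounded (streaming) source.

    Pipeline pipeline = Pipeline.create(options);
    // Tell the pipeline how to encode Kafka ConsumerRecord elements.
    pipeline.getCoderRegistry().registerCoder(ConsumerRecord.class, ConsumerRecordCoder.class);

    // Produces an unbounded PCollection<ConsumerRecord<String, String>>.
    pipeline.apply(KafkaSource.readFrom("my-topic", "localhost:9092", "my-group"));

    pipeline.run();
  }
}
```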
@@ -0,0 +1,137 @@
/**
* Sample job that reads from a Kafka topic and submits to BigQuery.
* This job will form the basis of a streaming router that pulls messages
* from Kafka and writes them to downstream systems.
*/

package com.google.cloud.dataflow.contrib.kafka;

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.BigQueryIO;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.contrib.kafka.coders.ConsumerRecordCoder;
import com.google.cloud.dataflow.contrib.kafka.io.KafkaSource;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.joda.time.Duration;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


/**
* Echo messages from Kafka to BigQuery.
*/
public class KafkaStreamingRouter {

static final int WINDOW_SIZE = 1; // Default window duration in minutes

/**
* Prepares the data for writing to BigQuery by building a TableRow object.
*/
static class FormatMessageFn extends DoFn<ConsumerRecord<String, String>, TableRow> {
private static final long serialVersionUID = 1L;

@Override
public void processElement(ProcessContext c) {
TableRow row = new TableRow()
.set("topic", c.element().topic())
.set("offset", c.element().offset())
.set("partition", c.element().partition())
.set("key", c.element().key())
.set("value", c.element().value());

c.output(row);
}
}

/** A {@link PTransform} that converts Kafka {@code ConsumerRecord}s into BigQuery {@link TableRow}s. */
static class ConsumerRecordToRowConverter
extends PTransform<PCollection<ConsumerRecord<String, String>>, PCollection<TableRow>> {
private static final long serialVersionUID = 1L;

@Override
public PCollection<TableRow> apply(PCollection<ConsumerRecord<String, String>> messages) {
PCollection<TableRow> results = messages.apply(
ParDo.of(new FormatMessageFn()));

return results;
}

}

/**
* Options supported by {@link KafkaStreamingRouter}.
*
* <p> Inherits standard configuration options.
*/
public interface KafkaStreamingRouterOptions extends DataflowPipelineOptions {
@Description("Kafka servers in the cluster.")
@Default.String("127.0.0.0:9092")
String getBootstrapServers();
void setBootstrapServers(String value);

@Description("The group id of these consumers.")
@Default.String("test")
String getGroupId();
void setGroupId(String value);

@Description("The topic to read.")
@Default.String("test")
String getTopic();
void setTopic(String value);
}

public static void main(String[] args) throws IOException {
KafkaStreamingRouterOptions options = PipelineOptionsFactory.fromArgs(args)
.withValidation()
.as(KafkaStreamingRouterOptions.class);
options.setProject("cloud-project-name");
options.setStreaming(true);
options.setNumWorkers(3);
options.setMaxNumWorkers(3);

String tableSpec = new StringBuilder()
.append(options.getProject()).append(":")
.append("kafkastreamingrouter").append(".")
.append("output")
.toString();

List<TableFieldSchema> fields = new ArrayList<>();
fields.add(new TableFieldSchema().setName("topic").setType("STRING"));
fields.add(new TableFieldSchema().setName("offset").setType("INTEGER"));
fields.add(new TableFieldSchema().setName("partition").setType("INTEGER"));
fields.add(new TableFieldSchema().setName("key").setType("STRING"));
fields.add(new TableFieldSchema().setName("value").setType("STRING"));
TableSchema schema = new TableSchema().setFields(fields);

Pipeline pipeline = Pipeline.create(options);

CoderRegistry cr = pipeline.getCoderRegistry();
cr.registerCoder(ConsumerRecord.class, ConsumerRecordCoder.class);

pipeline
.apply(KafkaSource.readFrom(options.getTopic(), options.getBootstrapServers(), options.getGroupId()))
.apply(Window.<ConsumerRecord<String,String>>into(FixedWindows.of(
Duration.standardMinutes(WINDOW_SIZE))))
.apply(new ConsumerRecordToRowConverter())
.apply(BigQueryIO.Write.to(tableSpec)
.withSchema(schema)
.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));

pipeline.run();
}
}
@@ -0,0 +1,52 @@
package com.google.cloud.dataflow.contrib.kafka.coders;

import com.google.cloud.dataflow.sdk.coders.CustomCoder;
import com.google.common.base.Charsets;
import com.google.common.io.CharStreams;
import com.google.gson.Gson;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.codehaus.jackson.annotate.JsonCreator;

/**
* A {@code ConsumerRecordCoder} encodes Kafka ConsumerRecord objects.
*/
public class ConsumerRecordCoder extends CustomCoder<ConsumerRecord<String, String>> {
private static final long serialVersionUID = 1L;


@JsonCreator
public static ConsumerRecordCoder of() {
return INSTANCE;
}

private static final ConsumerRecordCoder INSTANCE = new ConsumerRecordCoder();

private ConsumerRecordCoder() {}

@Override
public void encode(ConsumerRecord<String, String> value, OutputStream outStream, Context context)
throws IOException {
Gson gson = new Gson();
String json = gson.toJson(value);
outStream.write(json.getBytes(Charsets.UTF_8));
Contributor: Might consider using versions that write to / read from streams directly, e.g., https://sites.google.com/site/gson/streaming

(We actually use Jackson in the Dataflow core SDK; not sure about the pros/cons of Gson.)

}

@Override
public ConsumerRecord<String, String> decode(InputStream inStream, Context context)
throws IOException {
String json;
try (InputStreamReader reader = new InputStreamReader(inStream, Charsets.UTF_8)){
json = CharStreams.toString(reader);
}
Gson gson = new Gson();
@SuppressWarnings("unchecked")
ConsumerRecord<String, String> record = gson.fromJson(json, ConsumerRecord.class);
return record;
}
}
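
Following up on the streaming suggestion above, here is a minimal sketch of encode/decode logic that uses Gson's `JsonWriter`/`JsonReader` against the streams directly instead of buffering the whole JSON string. The `StreamingGsonHelper` name is hypothetical and this is illustrative, not part of the submitted change:

```java
import com.google.common.base.Charsets;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import com.google.gson.stream.JsonReader;
import com.google.gson.stream.JsonWriter;

import org.apache.kafka.clients.consumer.ConsumerRecord;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.lang.reflect.Type;

/** Sketch: stream ConsumerRecord JSON to/from the coder's streams without an intermediate String. */
class StreamingGsonHelper {
  private static final Gson GSON = new Gson();
  private static final Type RECORD_TYPE =
      new TypeToken<ConsumerRecord<String, String>>() {}.getType();

  static void encode(ConsumerRecord<String, String> value, OutputStream outStream)
      throws IOException {
    // Write JSON directly to the output stream; flush but do not close it,
    // because the Coder contract leaves ownership of the stream with the caller.
    JsonWriter writer = new JsonWriter(new OutputStreamWriter(outStream, Charsets.UTF_8));
    GSON.toJson(value, RECORD_TYPE, writer);
    writer.flush();
  }

  static ConsumerRecord<String, String> decode(InputStream inStream) throws IOException {
    // Read JSON directly from the input stream.
    JsonReader reader = new JsonReader(new InputStreamReader(inStream, Charsets.UTF_8));
    return GSON.fromJson(reader, RECORD_TYPE);
  }
}
```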
@@ -0,0 +1,104 @@
package com.google.cloud.dataflow.contrib.kafka.io;

import java.util.Map;
import java.util.NoSuchElementException;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.LinkedBlockingQueue;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.TopicPartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.cloud.dataflow.sdk.io.UnboundedSource;
import com.google.common.collect.Maps;

/**
* A marker representing the progress and state of the {@link UnboundedReader}.
*
* Starts a separate thread to read messages from Kafka to populate a {@link
* LinkedBlockingQueue}. As messages are read, the furthest processed offset is
* tracked. When messages have been durably committed by Dataflow, the furthest
* offset read is written to Kafka.
*
* May return duplicate messages if a crash occurs before the read offsets are committed.
*/
public class KafkaCheckpoint implements UnboundedSource.CheckpointMark {

private static final Logger LOG = LoggerFactory.getLogger(KafkaCheckpoint.class);

private final LinkedBlockingQueue<ConsumerRecord<String, String>> recordQueue;
private final ConcurrentMap<TopicPartition, OffsetAndMetadata> furthestCommitted;
private Map<TopicPartition, OffsetAndMetadata> furthestRead;
private ConsumerRecord<String, String> current;
private KafkaProducer producer;

KafkaCheckpoint(String bootstrapServers, String groupId, String topic) {
recordQueue = new LinkedBlockingQueue<ConsumerRecord<String, String>>();
furthestCommitted = Maps.newConcurrentMap();
furthestRead = Maps.newHashMap();

KafkaProducer.Builder builder = new KafkaProducer.Builder();
producer = builder
.withBootstrapServers(bootstrapServers)
.withGroupId(groupId)
.withTopic(topic)
.withQueue(recordQueue)
.withCommitted(furthestCommitted)
.build();

new Thread(producer).start();
}

/**
* Commit the furthest read offsets to the Kafka cluster.
*/
@Override
public void finalizeCheckpoint() {
furthestCommitted.clear();
furthestCommitted.putAll(furthestRead);

producer.commit();

furthestRead = Maps.newHashMap();
}

/**
* Advance to the next available record. Returns true if a record was available,
* false otherwise.
*/
public boolean advance() {
current = this.recordQueue.poll();
if (current == null) {
return false;
}

TopicPartition p = new TopicPartition(current.topic(), current.partition());
OffsetAndMetadata offset = new OffsetAndMetadata(current.offset() + 1);
furthestRead.put(p, offset);

return true;
}

/**
* Return the current record.
*/
public ConsumerRecord<String, String> current() throws NoSuchElementException {
if (current == null) {
throw new NoSuchElementException();
}

LOG.info("Current Record: topic={}, partition={}, offset={}, value={}",
current.topic(), current.partition(), current.offset(), current.value());

Contributor: Drop to debug or lower? Wrap in if (LOG.isInfoEnabled()) {}? Logging every element will typically destroy pipeline performance.
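
A minimal sketch of the guarded, lower-level logging suggested in the comment above (a fragment reusing `LOG` and `current` from this method; not part of the submitted change):

```java
if (LOG.isDebugEnabled()) {
  LOG.debug("Current Record: topic={}, partition={}, offset={}, value={}",
      current.topic(), current.partition(), current.offset(), current.value());
}
```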

return current;
}

/**
* Shut down the producer thread.
*/
public void shutdown() {
producer.shutdown();
}
}
Contributor: This is a really interesting way to design the checkpoint. Are there any guarantees that when the checkpoint is being serialized it will be a "consistent" checkpoint, or will Kafka be able to insert records into the different fields willy-nilly?

Contributor: (In other words, is the finalizeCheckpoint() function atomic?)

Contributor Author: To be honest, I'm not 100% happy with this design. There is probably a better way to manage the concurrent access to the Kafka thread; I just haven't found it yet.

The limitation of this design is that you may get duplicate records if a crash happens before committing. Otherwise it seems to be safe.
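
For reference, a minimal sketch of one way to give `finalizeCheckpoint()` a consistent view of the offsets: have the reader thread and the checkpoint path share a lock and hand out an immutable snapshot. The `OffsetTracker` class is hypothetical and not part of this change:

```java
import com.google.common.collect.ImmutableMap;

import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.TopicPartition;

import java.util.HashMap;
import java.util.Map;

/** Tracks furthest-read offsets so a checkpoint can take a consistent snapshot. */
class OffsetTracker {
  private final Object lock = new Object();
  private Map<TopicPartition, OffsetAndMetadata> furthestRead = new HashMap<>();

  /** Called from advance() for every record handed to the pipeline. */
  void markRead(TopicPartition partition, OffsetAndMetadata offset) {
    synchronized (lock) {
      furthestRead.put(partition, offset);
    }
  }

  /**
   * Called when a checkpoint is created: atomically freezes the offsets read so far
   * and starts a fresh map, so finalizeCheckpoint() commits exactly this snapshot.
   */
  Map<TopicPartition, OffsetAndMetadata> snapshotAndReset() {
    synchronized (lock) {
      Map<TopicPartition, OffsetAndMetadata> snapshot = ImmutableMap.copyOf(furthestRead);
      furthestRead = new HashMap<>();
      return snapshot;
    }
  }
}
```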
