- * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
- * <pre>{@code
- *   --project=YOUR_PROJECT_ID
- *   --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- *   --runner=BlockingDataflowPipelineRunner
- * and an output prefix on GCS:
- *   --output=gs://YOUR_OUTPUT_PREFIX
- * }</pre>
- *
- * <p>The default input is {@code gs://dataflow-samples/shakespeare/} and can be overridden with
- * {@code --input}.
- */
-public class TFIDF {
- /**
- * Options supported by {@link TFIDF}.
- *
- * Inherits standard configuration options.
- */
- private interface Options extends PipelineOptions, FlinkPipelineOptions {
- @Description("Path to the directory or GCS prefix containing files to read from")
- @Default.String("gs://dataflow-samples/shakespeare/")
- String getInput();
- void setInput(String value);
-
- @Description("Prefix of output URI to write to")
- @Validation.Required
- String getOutput();
- void setOutput(String value);
- }
-
- /**
- * Lists documents contained beneath the {@code options.input} prefix/directory.
- */
- public static Set<URI> listInputDocuments(Options options)
- throws URISyntaxException, IOException {
- URI baseUri = new URI(options.getInput());
-
- // List all documents in the directory or GCS prefix.
- URI absoluteUri;
- if (baseUri.getScheme() != null) {
- absoluteUri = baseUri;
- } else {
- absoluteUri = new URI(
- "file",
- baseUri.getAuthority(),
- baseUri.getPath(),
- baseUri.getQuery(),
- baseUri.getFragment());
- }
-
- Set<URI> uris = new HashSet<>();
- if (absoluteUri.getScheme().equals("file")) {
- File directory = new File(absoluteUri);
- for (String entry : directory.list()) {
- File path = new File(directory, entry);
- uris.add(path.toURI());
- }
- } else if (absoluteUri.getScheme().equals("gs")) {
- GcsUtil gcsUtil = options.as(GcsOptions.class).getGcsUtil();
- URI gcsUriGlob = new URI(
- absoluteUri.getScheme(),
- absoluteUri.getAuthority(),
- absoluteUri.getPath() + "*",
- absoluteUri.getQuery(),
- absoluteUri.getFragment());
- for (GcsPath entry : gcsUtil.expand(GcsPath.fromUri(gcsUriGlob))) {
- uris.add(entry.toUri());
- }
- }
-
- return uris;
- }
-
- /**
- * Reads the documents at the provided uris and returns all lines
- * from the documents tagged with which document they are from.
- */
- public static class ReadDocuments
- extends PTransform<PInput, PCollection<KV<URI, String>>> {
- private static final long serialVersionUID = 0;
-
- private Iterable<URI> uris;
-
- public ReadDocuments(Iterable<URI> uris) {
- this.uris = uris;
- }
-
- @Override
- public Coder<KV<URI, String>> getDefaultOutputCoder() {
- return KvCoder.of(StringDelegateCoder.of(URI.class), StringUtf8Coder.of());
- }
-
- @Override
- public PCollection<KV<URI, String>> apply(PInput input) {
- Pipeline pipeline = input.getPipeline();
-
- // Create one TextIO.Read transform for each document
- // and add its output to a PCollectionList
- PCollectionList<KV<URI, String>> urisToLines =
- PCollectionList.empty(pipeline);
-
- // TextIO.Read supports:
- // - file: URIs and paths locally
- // - gs: URIs on the service
- for (final URI uri : uris) {
- String uriString;
- if (uri.getScheme().equals("file")) {
- uriString = new File(uri).getPath();
- } else {
- uriString = uri.toString();
- }
-
- PCollection<KV<URI, String>> oneUriToLines = pipeline
- .apply(TextIO.Read.from(uriString)
- .named("TextIO.Read(" + uriString + ")"))
- .apply("WithKeys(" + uriString + ")", WithKeys.of(uri));
-
- urisToLines = urisToLines.and(oneUriToLines);
- }
-
- return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
- }
- }
-
- /**
- * A transform containing a basic TF-IDF pipeline. The input consists of KV objects
- * where the key is the document's URI and the value is a piece
- * of the document's content. The output is a mapping from terms to
- * scores for each document URI.
- */
- public static class ComputeTfIdf
- extends PTransform<PCollection<KV<URI, String>>, PCollection<KV<String, KV<URI, Double>>>> {
- private static final long serialVersionUID = 0;
-
- public ComputeTfIdf() { }
-
- @Override
- public PCollection<KV<String, KV<URI, Double>>> apply(
- PCollection<KV<URI, String>> uriToContent) {
-
- // Compute the total number of documents, and
- // prepare this singleton PCollectionView for
- // use as a side input.
- final PCollectionView<Long> totalDocuments =
- uriToContent
- .apply("GetURIs", Keys.<URI>create())
- .apply("RemoveDuplicateDocs", RemoveDuplicates.<URI>create())
- .apply(Count.<URI>globally())
- .apply(View.<Long>asSingleton());
-
- // Create a collection of pairs mapping a URI to each
- // of the words in the document associated with that URI.
- PCollection<KV<URI, String>> uriToWords = uriToContent
- .apply(ParDo.named("SplitWords").of(
- new DoFn<KV<URI, String>, KV<URI, String>>() {
- private static final long serialVersionUID = 0;
-
- @Override
- public void processElement(ProcessContext c) {
- URI uri = c.element().getKey();
- String line = c.element().getValue();
- for (String word : line.split("\\W+")) {
- // Log INFO messages when the word “love” is found.
- if (word.toLowerCase().equals("love")) {
- LOG.info("Found {}", word.toLowerCase());
- }
-
- if (!word.isEmpty()) {
- c.output(KV.of(uri, word.toLowerCase()));
- }
- }
- }
- }));
-
- // Compute a mapping from each word to the total
- // number of documents in which it appears.
- PCollection<KV<String, Long>> wordToDocCount = uriToWords
- .apply("RemoveDuplicateWords", RemoveDuplicates.<KV<URI, String>>create())
- .apply(Values.<String>create())
- .apply("CountDocs", Count.<String>perElement());
-
- // Compute a mapping from each URI to the total
- // number of words in the document associated with that URI.
- PCollection<KV<URI, Long>> uriToWordTotal = uriToWords
- .apply("GetURIs2", Keys.<URI>create())
- .apply("CountWords", Count.<URI>perElement());
-
- // Count, for each (URI, word) pair, the number of
- // occurrences of that word in the document associated
- // with the URI.
- PCollection<KV<KV<URI, String>, Long>> uriAndWordToCount = uriToWords
- .apply("CountWordDocPairs", Count.<KV<URI, String>>perElement());
-
- // Adjust the above collection to a mapping from
- // (URI, word) pairs to counts into an isomorphic mapping
- // from URI to (word, count) pairs, to prepare for a join
- // by the URI key.
- PCollection<KV<URI, KV<String, Long>>> uriToWordAndCount = uriAndWordToCount
- .apply(ParDo.named("ShiftKeys").of(
- new DoFn<KV<KV<URI, String>, Long>, KV<URI, KV<String, Long>>>() {
- private static final long serialVersionUID = 0;
-
- @Override
- public void processElement(ProcessContext c) {
- URI uri = c.element().getKey().getKey();
- String word = c.element().getKey().getValue();
- Long occurrences = c.element().getValue();
- c.output(KV.of(uri, KV.of(word, occurrences)));
- }
- }));
-
- // Prepare to join the mapping of URI to (word, count) pairs with
- // the mapping of URI to total word counts, by associating
- // each of the input PCollection<KV<URI, ...>> with
- // a tuple tag. Each input must have the same key type, URI
- // in this case. The type parameter of the tuple tag matches
- // the types of the values for each collection.
- final TupleTag<Long> wordTotalsTag = new TupleTag<>();
- final TupleTag<KV<String, Long>> wordCountsTag = new TupleTag<>();
- KeyedPCollectionTuple<URI> coGbkInput = KeyedPCollectionTuple
- .of(wordTotalsTag, uriToWordTotal)
- .and(wordCountsTag, uriToWordAndCount);
-
- // Perform a CoGroupByKey (a sort of pre-join) on the prepared
- // inputs. This yields a mapping from URI to a CoGbkResult
- // (CoGroupByKey Result). The CoGbkResult is a mapping
- // from the above tuple tags to the values in each input
- // associated with a particular URI. In this case, each
- // KV<URI, CoGbkResult> groups a URI with the total number of
- // words in that document as well as all the (word, count)
- // pairs for particular words.
- PCollection<KV<URI, CoGbkResult>> uriToWordAndCountAndTotal = coGbkInput
- .apply("CoGroupByUri", CoGroupByKey.<URI>create());
-
- // Compute a mapping from each word to a (URI, term frequency)
- // pair for each URI. A word's term frequency for a document
- // is simply the number of times that word occurs in the document
- // divided by the total number of words in the document.
- PCollection<KV<String, KV<URI, Double>>> wordToUriAndTf = uriToWordAndCountAndTotal
- .apply(ParDo.named("ComputeTermFrequencies").of(
- new DoFn<KV<URI, CoGbkResult>, KV<String, KV<URI, Double>>>() {
- private static final long serialVersionUID = 0;
-
- @Override
- public void processElement(ProcessContext c) {
- URI uri = c.element().getKey();
- Long wordTotal = c.element().getValue().getOnly(wordTotalsTag);
-
- for (KV<String, Long> wordAndCount
- : c.element().getValue().getAll(wordCountsTag)) {
- String word = wordAndCount.getKey();
- Long wordCount = wordAndCount.getValue();
- Double termFrequency = wordCount.doubleValue() / wordTotal.doubleValue();
- c.output(KV.of(word, KV.of(uri, termFrequency)));
- }
- }
- }));
-
- // Compute a mapping from each word to its document frequency.
- // A word's document frequency in a corpus is the number of
- // documents in which the word appears divided by the total
- // number of documents in the corpus. Note how the total number of
- // documents is passed as a side input; the same value is
- // presented to each invocation of the DoFn.
- PCollection<KV<String, Double>> wordToDf = wordToDocCount
- .apply(ParDo
- .named("ComputeDocFrequencies")
- .withSideInputs(totalDocuments)
- .of(new DoFn<KV<String, Long>, KV<String, Double>>() {
- private static final long serialVersionUID = 0;
-
- @Override
- public void processElement(ProcessContext c) {
- String word = c.element().getKey();
- Long documentCount = c.element().getValue();
- Long documentTotal = c.sideInput(totalDocuments);
- Double documentFrequency = documentCount.doubleValue()
- / documentTotal.doubleValue();
-
- c.output(KV.of(word, documentFrequency));
- }
- }));
-
- // Join the term frequency and document frequency
- // collections, each keyed on the word.
- final TupleTag<KV<URI, Double>> tfTag = new TupleTag<>();
- final TupleTag<Double> dfTag = new TupleTag<>();
- PCollection<KV<String, CoGbkResult>> wordToUriAndTfAndDf = KeyedPCollectionTuple
- .of(tfTag, wordToUriAndTf)
- .and(dfTag, wordToDf)
- .apply(CoGroupByKey.<String>create());
-
- // Compute a mapping from each word to a (URI, TF-IDF) score
- // for each URI. There are a variety of definitions of TF-IDF
- // ("term frequency - inverse document frequency") score;
- // here we use a basic version that is the term frequency
- // divided by the log of the document frequency.
-
- return wordToUriAndTfAndDf
- .apply(ParDo.named("ComputeTfIdf").of(
- new DoFn<KV<String, CoGbkResult>, KV<String, KV<URI, Double>>>() {
- private static final long serialVersionUID = 0;
-
- @Override
- public void processElement(ProcessContext c) {
- String word = c.element().getKey();
- Double df = c.element().getValue().getOnly(dfTag);
-
- for (KV<URI, Double> uriAndTf : c.element().getValue().getAll(tfTag)) {
- URI uri = uriAndTf.getKey();
- Double tf = uriAndTf.getValue();
- Double tfIdf = tf * Math.log(1 / df);
- c.output(KV.of(word, KV.of(uri, tfIdf)));
- }
- }
- }));
- }
-
- // Instantiate Logger.
- // It is suggested that the user specify the class name of the containing class
- // (in this case ComputeTfIdf).
- private static final Logger LOG = LoggerFactory.getLogger(ComputeTfIdf.class);
- }
-
- /**
- * A {@link PTransform} to write, in CSV format, a mapping from term and URI
- * to score.
- */
- public static class WriteTfIdf
- extends PTransform<PCollection<KV<String, KV<URI, Double>>>, PDone> {
- private static final long serialVersionUID = 0;
-
- private String output;
-
- public WriteTfIdf(String output) {
- this.output = output;
- }
-
- @Override
- public PDone apply(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
- return wordToUriAndTfIdf
- .apply(ParDo.named("Format").of(new DoFn>, String>() {
- private static final long serialVersionUID = 0;
-
- @Override
- public void processElement(ProcessContext c) {
- c.output(String.format("%s,\t%s,\t%f",
- c.element().getKey(),
- c.element().getValue().getKey(),
- c.element().getValue().getValue()));
- }
- }))
- .apply(TextIO.Write
- .to(output)
- .withSuffix(".csv"));
- }
- }
-
- public static void main(String[] args) throws Exception {
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
-
- options.setRunner(FlinkPipelineRunner.class);
-
- Pipeline pipeline = Pipeline.create(options);
- pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));
-
- pipeline
- .apply(new ReadDocuments(listInputDocuments(options)))
- .apply(new ComputeTfIdf())
- .apply(new WriteTfIdf(options.getOutput()));
-
- pipeline.run();
- }
-}
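
The scoring above reduces to simple per-(word, document) arithmetic. A standalone sketch of that math (plain Java with hypothetical counts, not part of the deleted pipeline) mirroring the term frequency, document frequency, and tf * Math.log(1 / df) steps performed by ComputeTermFrequencies, ComputeDocFrequencies, and the final ComputeTfIdf DoFn:

    // Hypothetical counts; only the formulas are taken from the pipeline above.
    public class TfIdfArithmeticSketch {
      public static void main(String[] args) {
        long occurrencesInDoc = 3;     // times the word occurs in one document
        long wordsInDoc = 120;         // total words in that document
        long docsContainingWord = 2;   // documents in which the word appears
        long totalDocuments = 10;      // documents in the corpus

        double tf = (double) occurrencesInDoc / wordsInDoc;        // term frequency
        double df = (double) docsContainingWord / totalDocuments;  // document frequency
        double tfIdf = tf * Math.log(1 / df);                      // score emitted per (word, URI)

        System.out.printf("tf=%.4f df=%.2f tfIdf=%.4f%n", tf, df, tfIdf);
      }
    }
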
diff --git a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/WordCount.java b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/WordCount.java
deleted file mode 100644
index 702fb63ec68d..000000000000
--- a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/WordCount.java
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.examples;
-
-import org.apache.beam.runners.flink.FlinkPipelineOptions;
-import org.apache.beam.runners.flink.FlinkPipelineRunner;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.io.TextIO;
-import org.apache.beam.sdk.options.Default;
-import org.apache.beam.sdk.options.Description;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptionsFactory;
-import org.apache.beam.sdk.transforms.Aggregator;
-import org.apache.beam.sdk.transforms.Count;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.MapElements;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.SimpleFunction;
-import org.apache.beam.sdk.transforms.Sum;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollection;
-
-public class WordCount {
-
- public static class ExtractWordsFn extends DoFn<String, String> {
- private final Aggregator<Long, Long> emptyLines =
- createAggregator("emptyLines", new Sum.SumLongFn());
-
- @Override
- public void processElement(ProcessContext c) {
- if (c.element().trim().isEmpty()) {
- emptyLines.addValue(1L);
- }
-
- // Split the line into words.
- String[] words = c.element().split("[^a-zA-Z']+");
-
- // Output each word encountered into the output PCollection.
- for (String word : words) {
- if (!word.isEmpty()) {
- c.output(word);
- }
- }
- }
- }
-
- public static class CountWords extends PTransform<PCollection<String>,
- PCollection<KV<String, Long>>> {
- @Override
- public PCollection<KV<String, Long>> apply(PCollection<String> lines) {
-
- // Convert lines of text into individual words.
- PCollection<String> words = lines.apply(
- ParDo.of(new ExtractWordsFn()));
-
- // Count the number of times each word occurs.
- PCollection<KV<String, Long>> wordCounts =
- words.apply(Count.<String>perElement());
-
- return wordCounts;
- }
- }
-
- /** A SimpleFunction that converts a Word and Count into a printable string. */
- public static class FormatAsTextFn extends SimpleFunction<KV<String, Long>, String> {
- @Override
- public String apply(KV<String, Long> input) {
- return input.getKey() + ": " + input.getValue();
- }
- }
-
- /**
- * Options supported by {@link WordCount}.
- *
- * Inherits standard configuration options.
- */
- public interface Options extends PipelineOptions, FlinkPipelineOptions {
- @Description("Path of the file to read from")
- @Default.String("gs://dataflow-samples/shakespeare/kinglear.txt")
- String getInput();
- void setInput(String value);
-
- @Description("Path of the file to write to")
- String getOutput();
- void setOutput(String value);
- }
-
- public static void main(String[] args) {
-
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
- .as(Options.class);
- options.setRunner(FlinkPipelineRunner.class);
-
- Pipeline p = Pipeline.create(options);
-
- p.apply(TextIO.Read.named("ReadLines").from(options.getInput()))
- .apply(new CountWords())
- .apply(MapElements.via(new FormatAsTextFn()))
- .apply(TextIO.Write.named("WriteCounts").to(options.getOutput()));
-
- p.run();
- }
-
-}
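
ExtractWordsFn tokenizes each line with the regex [^a-zA-Z']+ and drops empty tokens. A standalone sketch of that splitting behavior (plain Java, hypothetical input line, independent of the Beam classes above):

    public class SplitWordsSketch {
      public static void main(String[] args) {
        String line = "to be, or not to be -- that's the question";
        // Same split as ExtractWordsFn: any run of characters that are not letters or apostrophes.
        String[] words = line.split("[^a-zA-Z']+");
        for (String word : words) {
          if (!word.isEmpty()) {  // guards against an empty leading token when a line starts with punctuation
            System.out.println(word);
          }
        }
      }
    }
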
diff --git a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/AutoComplete.java b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/AutoComplete.java
deleted file mode 100644
index 9d1168ba40ee..000000000000
--- a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/AutoComplete.java
+++ /dev/null
@@ -1,403 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.examples.streaming;
-
-import org.apache.beam.runners.flink.FlinkPipelineRunner;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSocketSource;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.coders.AvroCoder;
-import org.apache.beam.sdk.coders.DefaultCoder;
-import org.apache.beam.sdk.io.Read;
-import org.apache.beam.sdk.io.TextIO;
-import org.apache.beam.sdk.options.Default;
-import org.apache.beam.sdk.options.Description;
-import org.apache.beam.sdk.options.PipelineOptionsFactory;
-import org.apache.beam.sdk.transforms.Aggregator;
-import org.apache.beam.sdk.transforms.Count;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.Filter;
-import org.apache.beam.sdk.transforms.Flatten;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.Partition;
-import org.apache.beam.sdk.transforms.Partition.PartitionFn;
-import org.apache.beam.sdk.transforms.SerializableFunction;
-import org.apache.beam.sdk.transforms.Sum;
-import org.apache.beam.sdk.transforms.Top;
-import org.apache.beam.sdk.transforms.windowing.AfterWatermark;
-import org.apache.beam.sdk.transforms.windowing.FixedWindows;
-import org.apache.beam.sdk.transforms.windowing.Window;
-import org.apache.beam.sdk.transforms.windowing.WindowFn;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PBegin;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.beam.sdk.values.PCollectionList;
-
-import org.joda.time.Duration;
-
-import java.io.IOException;
-import java.util.List;
-
-/**
- * To run the example, first open a socket on a terminal by executing the command:
- *
- *
- * nc -lk 9999
- *
- *
- * and then launch the example. Now whatever you type in the terminal is going to be
- * the input to the program.
- * */
-public class AutoComplete {
-
- /**
- * A PTransform that takes as input a list of tokens and returns
- * the most common tokens per prefix.
- */
- public static class ComputeTopCompletions
- extends PTransform<PCollection<String>, PCollection<KV<String, List<CompletionCandidate>>>> {
- private static final long serialVersionUID = 0;
-
- private final int candidatesPerPrefix;
- private final boolean recursive;
-
- protected ComputeTopCompletions(int candidatesPerPrefix, boolean recursive) {
- this.candidatesPerPrefix = candidatesPerPrefix;
- this.recursive = recursive;
- }
-
- public static ComputeTopCompletions top(int candidatesPerPrefix, boolean recursive) {
- return new ComputeTopCompletions(candidatesPerPrefix, recursive);
- }
-
- @Override
- public PCollection<KV<String, List<CompletionCandidate>>> apply(PCollection<String> input) {
- PCollection<CompletionCandidate> candidates = input
- // First count how often each token appears.
- .apply(new Count.PerElement<String>())
-
- // Map the KV outputs of Count into our own CompletionCandidate class.
- .apply(ParDo.named("CreateCompletionCandidates").of(
- new DoFn<KV<String, Long>, CompletionCandidate>() {
- private static final long serialVersionUID = 0;
-
- @Override
- public void processElement(ProcessContext c) {
- CompletionCandidate cand = new CompletionCandidate(c.element().getKey(), c.element().getValue());
- c.output(cand);
- }
- }));
-
- // Compute the top via either a flat or recursive algorithm.
- if (recursive) {
- return candidates
- .apply(new ComputeTopRecursive(candidatesPerPrefix, 1))
- .apply(Flatten.<KV<String, List<CompletionCandidate>>>pCollections());
- } else {
- return candidates
- .apply(new ComputeTopFlat(candidatesPerPrefix, 1));
- }
- }
- }
-
- /**
- * Lower latency, but more expensive.
- */
- private static class ComputeTopFlat
- extends PTransform<PCollection<CompletionCandidate>,
- PCollection<KV<String, List<CompletionCandidate>>>> {
- private static final long serialVersionUID = 0;
-
- private final int candidatesPerPrefix;
- private final int minPrefix;
-
- public ComputeTopFlat(int candidatesPerPrefix, int minPrefix) {
- this.candidatesPerPrefix = candidatesPerPrefix;
- this.minPrefix = minPrefix;
- }
-
- @Override
- public PCollection<KV<String, List<CompletionCandidate>>> apply(
- PCollection<CompletionCandidate> input) {
- return input
- // For each completion candidate, map it to all prefixes.
- .apply(ParDo.of(new AllPrefixes(minPrefix)))
-
- // Find and return the top candidates for each prefix.
- .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix)
- .withHotKeyFanout(new HotKeyFanout()));
- }
-
- private static class HotKeyFanout implements SerializableFunction<String, Integer> {
- private static final long serialVersionUID = 0;
-
- @Override
- public Integer apply(String input) {
- return (int) Math.pow(4, 5 - input.length());
- }
- }
- }
-
- /**
- * Cheaper but higher latency.
- *
- *
- * Returns two PCollections, the first is top prefixes of size greater
- * than minPrefix, and the second is top prefixes of size exactly
- * minPrefix.
- */
- private static class ComputeTopRecursive
- extends PTransform<PCollection<CompletionCandidate>,
- PCollectionList<KV<String, List<CompletionCandidate>>>> {
- private static final long serialVersionUID = 0;
-
- private final int candidatesPerPrefix;
- private final int minPrefix;
-
- public ComputeTopRecursive(int candidatesPerPrefix, int minPrefix) {
- this.candidatesPerPrefix = candidatesPerPrefix;
- this.minPrefix = minPrefix;
- }
-
- private class KeySizePartitionFn implements PartitionFn<KV<String, List<CompletionCandidate>>> {
- private static final long serialVersionUID = 0;
-
- @Override
- public int partitionFor(KV<String, List<CompletionCandidate>> elem, int numPartitions) {
- return elem.getKey().length() > minPrefix ? 0 : 1;
- }
- }
-
- private static class FlattenTops
- extends DoFn<KV<String, List<CompletionCandidate>>, CompletionCandidate> {
- private static final long serialVersionUID = 0;
-
- @Override
- public void processElement(ProcessContext c) {
- for (CompletionCandidate cc : c.element().getValue()) {
- c.output(cc);
- }
- }
- }
-
- @Override
- public PCollectionList<KV<String, List<CompletionCandidate>>> apply(
- PCollection<CompletionCandidate> input) {
- if (minPrefix > 10) {
- // Base case, partitioning to return the output in the expected format.
- return input
- .apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix))
- .apply(Partition.of(2, new KeySizePartitionFn()));
- } else {
- // If a candidate is in the top N for prefix a...b, it must also be in the top
- // N for a...bX for every X, which is typically a much smaller set to consider.
- // First, compute the top candidate for prefixes of size at least minPrefix + 1.
- PCollectionList<KV<String, List<CompletionCandidate>>> larger = input
- .apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1));
- // Consider the top candidates for each prefix of length minPrefix + 1...
- PCollection<KV<String, List<CompletionCandidate>>> small =
- PCollectionList
- .of(larger.get(1).apply(ParDo.of(new FlattenTops())))
- // ...together with those (previously excluded) candidates of length
- // exactly minPrefix...
- .and(input.apply(Filter.by(new SerializableFunction<CompletionCandidate, Boolean>() {
- private static final long serialVersionUID = 0;
-
- @Override
- public Boolean apply(CompletionCandidate c) {
- return c.getValue().length() == minPrefix;
- }
- })))
- .apply("FlattenSmall", Flatten.pCollections())
- // ...set the key to be the minPrefix-length prefix...
- .apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix)))
- // ...and (re)apply the Top operator to all of them together.
- .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix));
-
- PCollection<KV<String, List<CompletionCandidate>>> flattenLarger = larger
- .apply("FlattenLarge", Flatten.<KV<String, List<CompletionCandidate>>>pCollections());
-
- return PCollectionList.of(flattenLarger).and(small);
- }
- }
- }
-
- /**
- * A DoFn that keys each candidate by all its prefixes.
- */
- private static class AllPrefixes
- extends DoFn<CompletionCandidate, KV<String, CompletionCandidate>> {
- private static final long serialVersionUID = 0;
-
- private final int minPrefix;
- private final int maxPrefix;
- public AllPrefixes(int minPrefix) {
- this(minPrefix, Integer.MAX_VALUE);
- }
- public AllPrefixes(int minPrefix, int maxPrefix) {
- this.minPrefix = minPrefix;
- this.maxPrefix = maxPrefix;
- }
- @Override
- public void processElement(ProcessContext c) {
- String word = c.element().value;
- for (int i = minPrefix; i <= Math.min(word.length(), maxPrefix); i++) {
- KV<String, CompletionCandidate> kv = KV.of(word.substring(0, i), c.element());
- c.output(kv);
- }
- }
- }
-
- /**
- * Class used to store tag-count pairs.
- */
- @DefaultCoder(AvroCoder.class)
- static class CompletionCandidate implements Comparable<CompletionCandidate> {
- private long count;
- private String value;
-
- public CompletionCandidate(String value, long count) {
- this.value = value;
- this.count = count;
- }
-
- public String getValue() {
- return value;
- }
-
- // Empty constructor required for Avro decoding.
- @SuppressWarnings("unused")
- public CompletionCandidate() {}
-
- @Override
- public int compareTo(CompletionCandidate o) {
- if (this.count < o.count) {
- return -1;
- } else if (this.count == o.count) {
- return this.value.compareTo(o.value);
- } else {
- return 1;
- }
- }
-
- @Override
- public boolean equals(Object other) {
- if (other instanceof CompletionCandidate) {
- CompletionCandidate that = (CompletionCandidate) other;
- return this.count == that.count && this.value.equals(that.value);
- } else {
- return false;
- }
- }
-
- @Override
- public int hashCode() {
- return Long.valueOf(count).hashCode() ^ value.hashCode();
- }
-
- @Override
- public String toString() {
- return "CompletionCandidate[" + value + ", " + count + "]";
- }
- }
-
- static class ExtractWordsFn extends DoFn<String, String> {
- private final Aggregator<Long, Long> emptyLines =
- createAggregator("emptyLines", new Sum.SumLongFn());
-
- @Override
- public void processElement(ProcessContext c) {
- if (c.element().trim().isEmpty()) {
- emptyLines.addValue(1L);
- }
-
- // Split the line into words.
- String[] words = c.element().split("[^a-zA-Z']+");
-
- // Output each word encountered into the output PCollection.
- for (String word : words) {
- if (!word.isEmpty()) {
- c.output(word);
- }
- }
- }
- }
-
- /**
- * Takes as input the top candidates per prefix, and emits an entity
- * suitable for writing to Datastore.
- */
- static class FormatForPerTaskLocalFile extends DoFn<KV<String, List<CompletionCandidate>>, String>
- implements DoFn.RequiresWindowAccess {
-
- private static final long serialVersionUID = 0;
-
- @Override
- public void processElement(ProcessContext c) {
- StringBuilder str = new StringBuilder();
- KV<String, List<CompletionCandidate>> elem = c.element();
-
- str.append(elem.getKey() + " @ " + c.window() + " -> ");
- for (CompletionCandidate cand : elem.getValue()) {
- str.append(cand.toString() + " ");
- }
- System.out.println(str.toString());
- c.output(str.toString());
- }
- }
-
- /**
- * Options supported by this class.
- *
- *