Merged

68 commits
fb5a787
Native parallel indexing without shuffle
jihoonson Mar 15, 2018
fcaf1bf
fix build
jihoonson Mar 15, 2018
6ce2db8
fix ci
jihoonson Mar 15, 2018
0ace07a
fix ingestion without intervals
jihoonson Mar 19, 2018
2e8dbe0
fix retry
jihoonson Mar 20, 2018
a2b44f0
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Mar 20, 2018
67ca45c
fix retry
jihoonson Mar 20, 2018
cc8bd38
add it test
jihoonson Mar 22, 2018
2d2347b
use chat handler
jihoonson Mar 23, 2018
932f69c
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Mar 27, 2018
a83c76a
fix build
jihoonson Mar 27, 2018
8d68dba
add docs
jihoonson Mar 27, 2018
00be56c
fix ITUnionQueryTest
jihoonson Mar 28, 2018
b9724a5
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Mar 29, 2018
7f21927
working
jihoonson Apr 6, 2018
3a94432
fix failures
jihoonson Apr 9, 2018
c68b21a
disable metrics reporting
jihoonson Apr 10, 2018
fd7dda3
working
jihoonson Apr 13, 2018
b142a30
Fix split of static-s3 firehose
jihoonson Apr 13, 2018
1fe0a05
Add endpoints to supervisor task and a unit test for endpoints
jihoonson Apr 23, 2018
bd5d5d5
increase timeout in test
jihoonson Apr 24, 2018
9d5d7c0
Added doc
jihoonson Apr 24, 2018
702b2a5
Address comments
jihoonson Apr 24, 2018
5dc8fb1
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Apr 24, 2018
6d32f0f
Fix overlapping locks
jihoonson Apr 27, 2018
185549f
address comments
jihoonson Apr 28, 2018
9a8ccc4
Fix static s3 firehose
jihoonson Apr 28, 2018
2447bbf
Fix test
jihoonson Apr 28, 2018
bda4fec
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Apr 30, 2018
ac817c6
fix build
jihoonson Apr 30, 2018
4924fa5
fix test
jihoonson May 1, 2018
dabb8bf
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson May 4, 2018
0998e71
fix typo in docs
jihoonson May 4, 2018
f306338
add missing maxBytesInMemory to doc
jihoonson May 4, 2018
e43adf2
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson May 9, 2018
7c7af69
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson May 17, 2018
b077c92
address comments
jihoonson May 21, 2018
3631e20
fix race in test
jihoonson May 24, 2018
b2b1de0
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson May 29, 2018
acb5305
fix test
jihoonson May 31, 2018
28e310d
Rename to ParallelIndexSupervisorTask
jihoonson Jun 1, 2018
a7fc8e8
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Jun 6, 2018
00e2663
fix teamcity
jihoonson Jun 7, 2018
0a6eb30
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Jun 21, 2018
c1c6f3f
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Jun 22, 2018
86b4582
address comments
jihoonson Jun 28, 2018
6c85f62
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Jun 28, 2018
892a0ff
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Jun 28, 2018
77f6937
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Jul 5, 2018
bf73f7e
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Jul 10, 2018
a5254a6
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Jul 12, 2018
6cd0dd6
Fix license
jihoonson Jul 12, 2018
509c9b3
addressing comments
jihoonson Jul 12, 2018
6a1bcfd
addressing comments
jihoonson Jul 12, 2018
d700395
indexTaskClient-based segmentAllocator instead of CountingActionBased…
jihoonson Jul 13, 2018
550fd38
Fix race in TaskMonitor and move HTTP endpoints to supervisorTask fro…
jihoonson Jul 17, 2018
77935bd
Add more javadocs
jihoonson Jul 17, 2018
7e6b110
use StringUtils.nonStrictFormat for logging
jihoonson Jul 17, 2018
b7266ce
fix typo and remove unused class
jihoonson Jul 17, 2018
e75baa4
fix tests
jihoonson Jul 17, 2018
b8ce241
change package
jihoonson Jul 18, 2018
3b29f6e
fix strict build
jihoonson Jul 18, 2018
8da2971
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Jul 30, 2018
cc111ae
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Jul 31, 2018
9853cfa
tmp
jihoonson Aug 1, 2018
20f7c7b
Fix overlord api according to the recent change in master
jihoonson Aug 1, 2018
826ae54
Fix it test
jihoonson Aug 1, 2018
97a7819
Merge branch 'master' of github.com:druid-io/druid into superbatch
jihoonson Aug 7, 2018
68 changes: 68 additions & 0 deletions api/src/main/java/io/druid/data/input/FiniteFirehoseFactory.java
@@ -0,0 +1,68 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package io.druid.data.input;

import com.fasterxml.jackson.annotation.JsonIgnore;
import io.druid.data.input.impl.InputRowParser;

import java.io.IOException;
import java.util.stream.Stream;

/**
* {@link FiniteFirehoseFactory} designed for batch processing. Its implementations assume that the number of inputs is
* limited.
*
* @param <T> parser type
* @param <S> input split type
*/
public interface FiniteFirehoseFactory<T extends InputRowParser, S> extends FirehoseFactory<T>
Contributor:

(minor) suggest using descriptive type names. I always have a hard time keeping track of what T and S represent

Contributor Author:

I would rather stick to the generic parameter naming conventions for Java. See 'Type Parameter Naming Conventions' section in https://docs.oracle.com/javase/tutorial/java/generics/types.html.

{
/**
* Returns true if this {@link FiniteFirehoseFactory} supports parallel batch indexing.
*/
@JsonIgnore
@Override
default boolean isSplittable()
{
return true;
}

/**
* Returns a {@link Stream} for {@link InputSplit}s. In parallel batch indexing, each {@link InputSplit} is processed
* by a sub task.
*
* Listing splits may cause high overhead in some implementations. In this case, {@link InputSplit}s should be listed
* lazily so that the listing overhead can be amortized.
*/
@JsonIgnore
Stream<InputSplit<S>> getSplits() throws IOException;

/**
* Returns the number of splits returned by {@link #getSplits()}.
*/
@JsonIgnore
int getNumSplits() throws IOException;

/**
* Returns the same {@link FiniteFirehoseFactory} but with the given {@link InputSplit}. The returned
* {@link FiniteFirehoseFactory} is used by sub tasks in parallel batch indexing.
*/
FiniteFirehoseFactory<T, S> withSplit(InputSplit<S> split);
Contributor:

(minor) would InputSplit<? extends S> split work here?

Contributor Author:

S is the actual type of a split, and the split is designed to be associated with a certain firehose type. For example, it's File for LocalFirehose. URI is the split type for HttpFirehose.

I'm not sure we need to have several implementations for the same split type. Do you have any concrete example?

Contributor:

the most common scenario where something like this comes into play is if you have

withSplit(ImmutableList.of(testObject1,testObject2).toSplit()))

kind of thing where your test objects inherit the split type. In practice it can sometimes come up, but the method signature can be changed later if needed

Contributor Author:

In this specific case, InputSplit<? extends S> split is needed only when testObject1 and testObject2 have different types (which are inherited from the same class). This means they might need special handling according to their types when subTasks process them.

However, this is not what I intended. InputSplit is designed to abstract the same type of input and all subTasks are expected to process the same split type.

}
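To illustrate how the three interface methods compose, here is a minimal, self-contained sketch. It uses simplified stand-ins for Druid's types: `UriSplitFactory` and everything in it are hypothetical names for illustration, not part of this PR.

```java
import java.util.List;
import java.util.stream.Stream;

// Hypothetical, simplified mirror of the FiniteFirehoseFactory pattern:
// a factory over a fixed list of inputs that the supervisor task can
// enumerate and hand out one split at a time to sub tasks.
class UriSplitFactory
{
  private final List<String> uris; // stand-in for the factory's input list

  UriSplitFactory(List<String> uris)
  {
    this.uris = uris;
  }

  // getSplits(): one element per input; a Stream so listing can be lazy
  Stream<String> getSplits()
  {
    return uris.stream();
  }

  // getNumSplits(): how many splits (and hence sub tasks) there are in total
  int getNumSplits()
  {
    return uris.size();
  }

  // withSplit(): the same factory narrowed to a single split, which is
  // what each sub task receives in parallel batch indexing
  UriSplitFactory withSplit(String split)
  {
    return new UriSplitFactory(List.of(split));
  }
}
```

The key property is that `withSplit` returns the same factory type, so a sub task can run the ordinary single-task ingestion path against its one split without knowing it is part of a parallel job.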
5 changes: 5 additions & 0 deletions api/src/main/java/io/druid/data/input/FirehoseFactory.java
@@ -72,4 +72,9 @@ default Firehose connect(T parser, @Nullable File temporaryDirectory) throws IOE
{
return connect(parser);
}

default boolean isSplittable()
{
return false;
}
}
52 changes: 52 additions & 0 deletions api/src/main/java/io/druid/data/input/InputSplit.java
@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package io.druid.data.input;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;

/**
* Input unit for distributed batch ingestion. Used in {@link FiniteFirehoseFactory}.
* An {@link InputSplit} represents the input data processed by a {@code io.druid.indexing.common.task.Task}.
*/
public class InputSplit<T>
{
private final T split;

@JsonCreator
public InputSplit(@JsonProperty("split") T split)
{
this.split = split;
}

@JsonProperty("split")
public T get()
Contributor:

getSplit is more common syntax

Contributor Author:

Yeah, but I think most use cases would be like this:

final InputSplit split = someMethodToGetSplit();
...
final T actualSplit = split.get();

If we rename this method, the code would be like split.getSplit(). Do you think this is better? I have no strong opinion here.

{
return split;
}

@Override
public String toString()
{
return "InputSplit{" +
"split=" + split +
"}";
}
}
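A short usage sketch of the wrapper class above. This is a standalone copy with the Jackson annotations dropped so it runs on its own; `Split` and `SplitDemo` are hypothetical names chosen to avoid clashing with the real class.

```java
import java.util.List;
import java.util.stream.Collectors;

// Standalone copy of the InputSplit shape above, minus the Jackson annotations.
class Split<T>
{
  private final T split;

  Split(T split)
  {
    this.split = split;
  }

  T get()
  {
    return split;
  }

  @Override
  public String toString()
  {
    return "InputSplit{" + "split=" + split + "}";
  }
}

class SplitDemo
{
  // Wrap each raw input; a sub task later unwraps its split with get().
  static List<Split<String>> wrap(List<String> files)
  {
    return files.stream().map(Split::new).collect(Collectors.toList());
  }
}
```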
@@ -22,20 +22,22 @@
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import io.druid.data.input.FiniteFirehoseFactory;
import io.druid.data.input.Firehose;
import io.druid.data.input.FirehoseFactory;
import io.druid.data.input.InputSplit;
import io.druid.java.util.common.logger.Logger;
import org.apache.commons.io.Charsets;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.stream.Stream;

/**
* This is an abstract class for firehose factories that create firehoses for reading text files.
@@ -44,7 +46,7 @@
* @param <T> object type representing input data
*/
public abstract class AbstractTextFilesFirehoseFactory<T>
implements FirehoseFactory<StringInputRowParser>
implements FiniteFirehoseFactory<StringInputRowParser, T>
{
private static final Logger LOG = new Logger(AbstractTextFilesFirehoseFactory.class);

@@ -53,9 +55,7 @@ public abstract class AbstractTextFilesFirehoseFactory<T>
@Override
public Firehose connect(StringInputRowParser firehoseParser, File temporaryDirectory) throws IOException
{
if (objects == null) {
objects = ImmutableList.copyOf(Preconditions.checkNotNull(initObjects(), "initObjects"));
}
initializeObjectsIfNeeded();
final Iterator<T> iterator = objects.iterator();
return new FileIteratingFirehose(
new Iterator<LineIterator>()
@@ -74,7 +74,7 @@ public LineIterator next()
}
final T object = iterator.next();
try {
return IOUtils.lineIterator(wrapObjectStream(object, openObjectStream(object)), Charsets.UTF_8);
return IOUtils.lineIterator(wrapObjectStream(object, openObjectStream(object)), StandardCharsets.UTF_8);
}
catch (Exception e) {
LOG.error(
@@ -90,6 +90,32 @@ public LineIterator next()
);
}

protected void initializeObjectsIfNeeded() throws IOException
{
if (objects == null) {
objects = ImmutableList.copyOf(Preconditions.checkNotNull(initObjects(), "initObjects"));
}
}

public List<T> getObjects()
{
return objects;
}

@Override
public Stream<InputSplit<T>> getSplits() throws IOException
{
initializeObjectsIfNeeded();
return getObjects().stream().map(InputSplit::new);
}

@Override
public int getNumSplits() throws IOException
{
initializeObjectsIfNeeded();
return getObjects().size();
}

/**
* Initialize objects to be read by this firehose. Since firehose factories are constructed whenever
* io.druid.indexing.common.task.Task objects are deserialized, actual initialization of objects is deferred
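The `getSplits` implementation above materializes `objects` eagerly before streaming them, while the interface javadoc recommends lazy listing when enumeration is expensive. A sketch of that idea follows; `LazyListing`, `expensiveList`, and the call counter are hypothetical stand-ins for, e.g., a remote object-listing request.

```java
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;

class LazyListing
{
  static final AtomicInteger listCalls = new AtomicInteger();

  // Stand-in for an expensive enumeration, e.g. listing objects in a bucket.
  static List<String> expensiveList()
  {
    listCalls.incrementAndGet();
    return List.of("obj1", "obj2");
  }

  // flatMap is a lazy intermediate operation, so expensiveList() does not run
  // until a terminal operation actually consumes the returned Stream.
  static Stream<String> getSplits()
  {
    return Stream.of("ignored").flatMap(x -> expensiveList().stream());
  }
}
```

Because the stream defers the listing call, a supervisor task that only inspects a few splits never pays the full enumeration cost up front.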
2 changes: 1 addition & 1 deletion api/src/main/java/io/druid/indexer/RunnerTaskState.java
@@ -25,5 +25,5 @@ public enum RunnerTaskState
WAITING,
PENDING,
RUNNING,
NONE; // is used for a completed task
NONE // is used for a completed task
Contributor:

(minor) is **also** used for a completed task ?

Contributor Author:

It looks like it, but I'm not sure. I don't want to change it in this PR.

}
16 changes: 16 additions & 0 deletions api/src/main/java/io/druid/indexer/TaskStatusPlus.java
@@ -172,4 +172,20 @@ public int hashCode()
getErrorMsg()
);
}

@Override
public String toString()
Contributor:

Should this toString have errorMsg as well?

Contributor Author:

Thanks. Added.

{
return "TaskStatusPlus{" +
"id='" + id + '\'' +
", type='" + type + '\'' +
", createdTime=" + createdTime +
", queueInsertionTime=" + queueInsertionTime +
", state=" + state +
", duration=" + duration +
", location=" + location +
", dataSource='" + dataSource + '\'' +
", errorMsg='" + errorMsg + '\'' +
'}';
}
}
@@ -23,7 +23,9 @@
import com.google.common.base.Predicate;
import com.google.common.collect.Lists;
import com.google.common.io.CountingOutputStream;
import io.druid.data.input.FiniteFirehoseFactory;
import io.druid.data.input.Firehose;
import io.druid.data.input.InputSplit;
import io.druid.data.input.Row;
import io.druid.data.input.impl.CSVParseSpec;
import io.druid.data.input.impl.DimensionsSpec;
@@ -605,6 +607,12 @@ protected InputStream openObjectStream(File object, long start) throws IOExcepti
private int readCount;
private int numConnectionResets;

@Override
public FiniteFirehoseFactory<StringInputRowParser, File> withSplit(InputSplit<File> split)
{
throw new UnsupportedOperationException();
}

private class TestInputStream extends InputStream
{
private static final int NUM_READ_COUNTS_BEFORE_ERROR = 10;
@@ -78,6 +78,9 @@ void insert(
*/
Optional<StatusType> getStatus(String entryId);

@Nullable
TaskInfo<EntryType, StatusType> getTaskInfo(String entryId);

/**
* Return up to {@code maxNumStatuses} {@link TaskInfo} objects for all inactive entries
* created on or later than the given timestamp
3 changes: 3 additions & 0 deletions docs/content/development/extensions-contrib/azure.md
@@ -33,6 +33,9 @@ The storage account is shared with the one used for Azure deep storage functiona

As with the S3 blobstore, it is assumed to be gzipped if the extension ends in .gz

This firehose is _splittable_ and can be used by [native parallel index tasks](../../ingestion/native_tasks.html#parallel-index-task).
Since each split represents an object in this firehose, each worker task of `index_parallel` will read an object.

Sample spec:

```json
3 changes: 3 additions & 0 deletions docs/content/development/extensions-contrib/cloudfiles.md
@@ -32,6 +32,9 @@ The storage account is shared with the one used for Rackspace's Cloud Files deep

As with the Azure blobstore, it is assumed to be gzipped if the extension ends in .gz

This firehose is _splittable_ and can be used by [native parallel index tasks](../../ingestion/native_tasks.html#parallel-index-task).
Since each split represents an object in this firehose, each worker task of `index_parallel` will read an object.

Sample spec:

```json
3 changes: 3 additions & 0 deletions docs/content/development/extensions-contrib/google.md
@@ -27,6 +27,9 @@ This firehose ingests events, similar to the StaticS3Firehose, but from a Googl

As with the S3 blobstore, it is assumed to be gzipped if the extension ends in .gz

This firehose is _splittable_ and can be used by [native parallel index tasks](../../ingestion/native_tasks.html#parallel-index-task).
Since each split represents an object in this firehose, each worker task of `index_parallel` will read an object.

Sample spec:

```json
2 changes: 2 additions & 0 deletions docs/content/development/extensions-core/s3.md
@@ -43,6 +43,8 @@ You can enable [server-side encryption](https://docs.aws.amazon.com/AmazonS3/lat
## StaticS3Firehose

This firehose ingests events from a predefined list of S3 objects.
This firehose is _splittable_ and can be used by [native parallel index tasks](../../ingestion/native_tasks.html#parallel-index-task).
Since each split represents an object in this firehose, each worker task of `index_parallel` will read an object.

Sample spec:

4 changes: 4 additions & 0 deletions docs/content/ingestion/firehose.md
@@ -20,6 +20,8 @@ For additional firehoses, please see our [extensions list](../development/extens

This Firehose can be used to read the data from files on local disk.
It can be used for POCs to ingest data on disk.
This firehose is _splittable_ and can be used by [native parallel index tasks](./native_tasks.html#parallel-index-task).
Since each split represents a file in this firehose, each worker task of `index_parallel` will read a file.
A sample local firehose spec is shown below:

```json
@@ -39,6 +41,8 @@ A sample local firehose spec is shown below:
#### HttpFirehose

This Firehose can be used to read the data from remote sites via HTTP.
This firehose is _splittable_ and can be used by [native parallel index tasks](./native_tasks.html#parallel-index-task).
Since each split represents a file in this firehose, each worker task of `index_parallel` will read a file.
A sample http firehose spec is shown below:

```json