From 7bd60777f47d282f7c8adb368e1cc06ad92b7fbc Mon Sep 17 00:00:00 2001 From: fjy Date: Thu, 20 Apr 2017 16:25:51 -0700 Subject: [PATCH 01/20] initial commit --- .../druid/data/input/impl/CSVParseSpec.java | 42 +++-- .../data/input/impl/DelimitedParseSpec.java | 58 +++++-- .../input/impl/FileIteratingFirehose.java | 1 + .../data/input/impl/StringInputRowParser.java | 17 +- .../data/input/impl/TimeAndDimsParseSpec.java | 6 + .../data/input/impl/CSVParseSpecTest.java | 6 +- .../input/impl/DelimitedParseSpecTest.java | 15 +- .../input/impl/FileIteratingFirehoseTest.java | 3 +- .../druid/data/input/impl/ParseSpecTest.java | 9 +- docs/content/ingestion/data-formats.md | 7 +- .../druid/segment/MapVirtualColumnTest.java | 3 +- .../namespace/URIExtractionNamespace.java | 12 ++ .../indexer/BatchDeltaIngestionTest.java | 3 +- .../DetermineHashedPartitionsJobTest.java | 29 ++-- .../indexer/DeterminePartitionsJobTest.java | 3 +- .../druid/indexer/IndexGeneratorJobTest.java | 12 +- .../java/io/druid/indexer/JobHelperTest.java | 3 +- .../indexer/path/DatasourcePathSpecTest.java | 3 +- .../updater/HadoopConverterJobTest.java | 3 +- .../druid/indexing/common/task/IndexTask.java | 4 + .../indexing/common/task/IndexTaskTest.java | 149 +++++++++++++++--- .../java/util/common/parsers/CSVParser.java | 34 +++- .../util/common/parsers/DelimitedParser.java | 29 +++- .../java/util/common/parsers/JSONParser.java | 6 + .../util/common/parsers/JSONPathParser.java | 5 + .../util/common/parsers/JavaScriptParser.java | 6 + .../java/util/common/parsers/Parser.java | 5 + .../java/util/common/parsers/RegexParser.java | 7 +- .../common/parsers/ToLowerCaseParser.java | 6 + .../druid/query/MultiValuedDimensionTest.java | 3 +- .../GroupByQueryRunnerFactoryTest.java | 3 +- .../test/java/io/druid/segment/TestIndex.java | 3 +- .../firehose/ReplayableFirehoseFactory.java | 5 + .../firehose/IngestSegmentFirehoseTest.java | 3 +- 34 files changed, 412 insertions(+), 91 deletions(-) diff --git 
a/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java b/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java index bbe1fc4d2288..e8b2788db53a 100644 --- a/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java @@ -23,7 +23,6 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Optional; import com.google.common.base.Preconditions; - import io.druid.java.util.common.parsers.CSVParser; import io.druid.java.util.common.parsers.Parser; @@ -35,26 +34,35 @@ public class CSVParseSpec extends ParseSpec { private final String listDelimiter; private final List columns; + private final boolean firstRowIsHeader; @JsonCreator public CSVParseSpec( @JsonProperty("timestampSpec") TimestampSpec timestampSpec, @JsonProperty("dimensionsSpec") DimensionsSpec dimensionsSpec, @JsonProperty("listDelimiter") String listDelimiter, - @JsonProperty("columns") List columns + @JsonProperty("columns") List columns, + @JsonProperty("firstRowIsHeader") boolean firstRowIsHeader ) { super(timestampSpec, dimensionsSpec); this.listDelimiter = listDelimiter; - Preconditions.checkNotNull(columns, "columns"); - for (String column : columns) { - Preconditions.checkArgument(!column.contains(","), "Column[%s] has a comma, it cannot", column); - } - this.columns = columns; - verify(dimensionsSpec.getDimensionNames()); + if (columns != null) { + for (String column : columns) { + Preconditions.checkArgument(!column.contains(","), "Column[%s] has a comma, it cannot", column); + } + verify(dimensionsSpec.getDimensionNames()); + } else { + Preconditions.checkArgument( + firstRowIsHeader, + "If columns field is not set, the first row of your data must have your header and firstRowIsHeader must be set to true." 
+ ); + } + + this.firstRowIsHeader = firstRowIsHeader; } @JsonProperty @@ -69,6 +77,12 @@ public List getColumns() return columns; } + @JsonProperty + public boolean isFirstRowIsHeader() + { + return firstRowIsHeader; + } + @Override public void verify(List usedCols) { @@ -80,23 +94,27 @@ public void verify(List usedCols) @Override public Parser makeParser() { - return new CSVParser(Optional.fromNullable(listDelimiter), columns); + if (firstRowIsHeader) { + return new CSVParser(Optional.fromNullable(listDelimiter), columns, firstRowIsHeader); + } else { + return new CSVParser(Optional.fromNullable(listDelimiter), columns); + } } @Override public ParseSpec withTimestampSpec(TimestampSpec spec) { - return new CSVParseSpec(spec, getDimensionsSpec(), listDelimiter, columns); + return new CSVParseSpec(spec, getDimensionsSpec(), listDelimiter, columns, firstRowIsHeader); } @Override public ParseSpec withDimensionsSpec(DimensionsSpec spec) { - return new CSVParseSpec(getTimestampSpec(), spec, listDelimiter, columns); + return new CSVParseSpec(getTimestampSpec(), spec, listDelimiter, columns, firstRowIsHeader); } public ParseSpec withColumns(List cols) { - return new CSVParseSpec(getTimestampSpec(), getDimensionsSpec(), listDelimiter, cols); + return new CSVParseSpec(getTimestampSpec(), getDimensionsSpec(), listDelimiter, cols, firstRowIsHeader); } } diff --git a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java index 6d7096d7921f..c0c01520634d 100644 --- a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java @@ -23,7 +23,6 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Optional; import com.google.common.base.Preconditions; - import io.druid.java.util.common.parsers.DelimitedParser; import io.druid.java.util.common.parsers.Parser; @@ -36,6 +35,7 @@ public class 
DelimitedParseSpec extends ParseSpec private final String delimiter; private final String listDelimiter; private final List columns; + private final boolean firstRowIsHeader; @JsonCreator public DelimitedParseSpec( @@ -43,20 +43,28 @@ public DelimitedParseSpec( @JsonProperty("dimensionsSpec") DimensionsSpec dimensionsSpec, @JsonProperty("delimiter") String delimiter, @JsonProperty("listDelimiter") String listDelimiter, - @JsonProperty("columns") List columns + @JsonProperty("columns") List columns, + @JsonProperty("firstRowIsHeader") boolean firstRowIsHeader ) { super(timestampSpec, dimensionsSpec); this.delimiter = delimiter; this.listDelimiter = listDelimiter; - Preconditions.checkNotNull(columns, "columns"); this.columns = columns; - for (String column : this.columns) { - Preconditions.checkArgument(!column.contains(","), "Column[%s] has a comma, it cannot", column); + if (columns != null) { + for (String column : this.columns) { + Preconditions.checkArgument(!column.contains(","), "Column[%s] has a comma, it cannot", column); + } + verify(dimensionsSpec.getDimensionNames()); + } else { + Preconditions.checkArgument( + firstRowIsHeader, + "If columns field is not set, the first row of your data must have your header and firstRowIsHeader must be set to true." 
+ ); } - verify(dimensionsSpec.getDimensionNames()); + this.firstRowIsHeader = firstRowIsHeader; } @JsonProperty("delimiter") @@ -77,6 +85,12 @@ public List getColumns() return columns; } + @JsonProperty + public boolean isFirstRowIsHeader() + { + return firstRowIsHeader; + } + @Override public void verify(List usedCols) { @@ -88,38 +102,48 @@ public void verify(List usedCols) @Override public Parser makeParser() { - Parser retVal = new DelimitedParser( - Optional.fromNullable(delimiter), - Optional.fromNullable(listDelimiter) - ); - retVal.setFieldNames(columns); - return retVal; + if (firstRowIsHeader) { + return new DelimitedParser( + Optional.fromNullable(delimiter), + Optional.fromNullable(listDelimiter), + firstRowIsHeader + ); + } else { + return new DelimitedParser( + Optional.fromNullable(delimiter), + Optional.fromNullable(listDelimiter) + ); + } } @Override public ParseSpec withTimestampSpec(TimestampSpec spec) { - return new DelimitedParseSpec(spec, getDimensionsSpec(), delimiter, listDelimiter, columns); + return new DelimitedParseSpec(spec, getDimensionsSpec(), delimiter, listDelimiter, columns, firstRowIsHeader); } @Override public ParseSpec withDimensionsSpec(DimensionsSpec spec) { - return new DelimitedParseSpec(getTimestampSpec(), spec, delimiter, listDelimiter, columns); + return new DelimitedParseSpec(getTimestampSpec(), spec, delimiter, listDelimiter, columns, firstRowIsHeader); } public ParseSpec withDelimiter(String delim) { - return new DelimitedParseSpec(getTimestampSpec(), getDimensionsSpec(), delim, listDelimiter, columns); + return new DelimitedParseSpec(getTimestampSpec(), getDimensionsSpec(), delim, listDelimiter, columns, + firstRowIsHeader + ); } public ParseSpec withListDelimiter(String delim) { - return new DelimitedParseSpec(getTimestampSpec(), getDimensionsSpec(), delimiter, delim, columns); + return new DelimitedParseSpec(getTimestampSpec(), getDimensionsSpec(), delimiter, delim, columns, firstRowIsHeader); } public ParseSpec 
withColumns(List cols) { - return new DelimitedParseSpec(getTimestampSpec(), getDimensionsSpec(), delimiter, listDelimiter, cols); + return new DelimitedParseSpec(getTimestampSpec(), getDimensionsSpec(), delimiter, listDelimiter, cols, + firstRowIsHeader + ); } } diff --git a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java index 97e33f04a894..cebf0e018e99 100644 --- a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java +++ b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java @@ -51,6 +51,7 @@ public boolean hasMore() { while ((lineIterator == null || !lineIterator.hasNext()) && lineIterators.hasNext()) { lineIterator = lineIterators.next(); + parser.reset(); } return lineIterator != null && lineIterator.hasNext(); diff --git a/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java b/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java index 6a13fcb7bd19..58baeed5b7ca 100644 --- a/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java +++ b/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java @@ -22,7 +22,6 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Charsets; - import io.druid.data.input.ByteBufferInputRowParser; import io.druid.data.input.InputRow; import io.druid.java.util.common.parsers.ParseException; @@ -33,6 +32,7 @@ import java.nio.charset.Charset; import java.nio.charset.CoderResult; import java.nio.charset.CodingErrorAction; +import java.util.LinkedHashMap; import java.util.Map; /** @@ -124,18 +124,27 @@ private Map buildStringKeyMap(ByteBuffer input) return theMap; } - private Map parseString(String inputString) + public void reset() { - return parser.parse(inputString); + parser.reset(); } public InputRow parse(String input) + { + return parseMap(parseString(input)); + } + 
+ private Map parseString(String inputString) { - return parseMap(parseString(input)); + return parser.parse(inputString); } private InputRow parseMap(Map theMap) { + // if the file has a header, null is returned + if (theMap == null) { + return null; + } return mapParser.parse(theMap); } } diff --git a/api/src/main/java/io/druid/data/input/impl/TimeAndDimsParseSpec.java b/api/src/main/java/io/druid/data/input/impl/TimeAndDimsParseSpec.java index e6740cb63f4b..2789537d1574 100644 --- a/api/src/main/java/io/druid/data/input/impl/TimeAndDimsParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/TimeAndDimsParseSpec.java @@ -64,6 +64,12 @@ public List getFieldNames() { throw new UnsupportedOperationException("not supported"); } + + @Override + public void reset() + { + // do nothing + } }; } diff --git a/api/src/test/java/io/druid/data/input/impl/CSVParseSpecTest.java b/api/src/test/java/io/druid/data/input/impl/CSVParseSpecTest.java index 7d99a2b804de..520f98bc03fb 100644 --- a/api/src/test/java/io/druid/data/input/impl/CSVParseSpecTest.java +++ b/api/src/test/java/io/druid/data/input/impl/CSVParseSpecTest.java @@ -41,7 +41,8 @@ public void testColumnMissing() throws Exception Lists.newArrayList() ), ",", - Arrays.asList("a") + Arrays.asList("a"), + false ); } @@ -60,7 +61,8 @@ public void testComma() throws Exception Lists.newArrayList() ), ",", - Arrays.asList("a") + Arrays.asList("a"), + false ); } } diff --git a/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java b/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java index 1ebad8c64141..3cadfee4ff25 100644 --- a/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java +++ b/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java @@ -40,7 +40,8 @@ public void testSerde() throws IOException new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList("abc")), null, null), "\u0001", "\u0002", - Arrays.asList("abc") + 
Arrays.asList("abc"), + false ); final DelimitedParseSpec serde = jsonMapper.readValue( jsonMapper.writeValueAsString(spec), @@ -71,7 +72,8 @@ public void testColumnMissing() throws Exception ), ",", " ", - Arrays.asList("a") + Arrays.asList("a"), + false ); } @@ -91,12 +93,14 @@ public void testComma() throws Exception ), ",", null, - Arrays.asList("a") + Arrays.asList("a"), + false ); } @Test(expected = NullPointerException.class) - public void testDefaultColumnList(){ + public void testDefaultColumnList() + { final DelimitedParseSpec spec = new DelimitedParseSpec( new TimestampSpec( "timestamp", @@ -111,7 +115,8 @@ public void testDefaultColumnList(){ ",", null, // pass null columns not allowed - null + null, + false ); } } diff --git a/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java b/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java index 87f5d20fcd5c..dc5df609c28a 100644 --- a/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java +++ b/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java @@ -67,7 +67,8 @@ public LineIterator apply(String s) new TimestampSpec("ts", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("x")), null, null), ",", - ImmutableList.of("ts", "x") + ImmutableList.of("ts", "x"), + false ), null ); diff --git a/api/src/test/java/io/druid/data/input/impl/ParseSpecTest.java b/api/src/test/java/io/druid/data/input/impl/ParseSpecTest.java index 4b38453fe35f..593934080cca 100644 --- a/api/src/test/java/io/druid/data/input/impl/ParseSpecTest.java +++ b/api/src/test/java/io/druid/data/input/impl/ParseSpecTest.java @@ -45,7 +45,8 @@ public void testDuplicateNames() throws Exception ), ",", " ", - Arrays.asList("a", "b") + Arrays.asList("a", "b"), + false ); } @@ -65,7 +66,8 @@ public void testDimAndDimExcluOverlap() throws Exception ), ",", null, - Arrays.asList("a", "B") + Arrays.asList("a", "B"), + false ); } @@ -85,7 +87,8 
@@ public void testDimExclusionDuplicate() throws Exception ), ",", null, - Arrays.asList("a", "B") + Arrays.asList("a", "B"), + false ); } } diff --git a/docs/content/ingestion/data-formats.md b/docs/content/ingestion/data-formats.md index 95807cc60a6a..eef8d32c2d42 100644 --- a/docs/content/ingestion/data-formats.md +++ b/docs/content/ingestion/data-formats.md @@ -81,13 +81,15 @@ Since the CSV data cannot contain the column names (no header is allowed), these "column" : "timestamp" }, "columns" : ["timestamp","page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city","added","deleted","delta"], + "firstRowIsHeader" : false, "dimensionsSpec" : { "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] } } ``` -The `columns` field must match the columns of your input data in the same order. +If your file does not have a header as the first line of the file, you must set the `columns` field and ensure that the order of the fields matches the columns of your input data. +If your file does have a header, set `firstRowIsHeader` to `true`, and do not include the `columns` key. ### TSV @@ -105,7 +107,8 @@ The `columns` field must match the columns of your input data in the same order. } ``` -The `columns` field must match the columns of your input data in the same order. +If your file does not have a header as the first line of the file, you must set the `columns` field and ensure that the order of the fields matches the columns of your input data. +If your file does have a header, set `firstRowIsHeader` to `true`, and do not include the `columns` key. Be sure to change the `delimiter` to the appropriate delimiter for your data. Like CSV, you must specify the columns and which subset of the columns you want indexed. 
diff --git a/extensions-contrib/virtual-columns/src/test/java/io/druid/segment/MapVirtualColumnTest.java b/extensions-contrib/virtual-columns/src/test/java/io/druid/segment/MapVirtualColumnTest.java index 46d99cbbcc96..3450f11bfada 100644 --- a/extensions-contrib/virtual-columns/src/test/java/io/druid/segment/MapVirtualColumnTest.java +++ b/extensions-contrib/virtual-columns/src/test/java/io/druid/segment/MapVirtualColumnTest.java @@ -96,7 +96,8 @@ public static Iterable constructorFeeder() throws IOException new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList("dim", "keys", "values")), null, null), "\t", ",", - Arrays.asList("ts", "dim", "keys", "values") + Arrays.asList("ts", "dim", "keys", "values"), + false ) , "utf8" ); diff --git a/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java b/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java index a7c27c66638e..9df181bdf8d3 100644 --- a/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java +++ b/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java @@ -238,6 +238,12 @@ public List getFieldNames() { return delegate.getFieldNames(); } + + @Override + public void reset() + { + // do nothing + } } @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "format") @@ -597,6 +603,12 @@ public List getFieldNames() { throw new UOE("No field names available"); } + + @Override + public void reset() + { + // do nothing + } }; } diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/BatchDeltaIngestionTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/BatchDeltaIngestionTest.java index 7201bd119133..dd472d6f3bcf 100644 --- a/indexing-hadoop/src/test/java/io/druid/indexer/BatchDeltaIngestionTest.java +++ 
b/indexing-hadoop/src/test/java/io/druid/indexer/BatchDeltaIngestionTest.java @@ -346,7 +346,8 @@ private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig(Map data(){ + public static Collection data() + { int[] first = new int[1]; Arrays.fill(first, 13); int[] second = new int[6]; @@ -67,7 +68,8 @@ public static Collection data(){ return Arrays.asList( new Object[][]{ { - DetermineHashedPartitionsJobTest.class.getClass().getResource("/druid.test.data.with.duplicate.rows.tsv").getPath(), + DetermineHashedPartitionsJobTest.class.getClass() + .getResource("/druid.test.data.with.duplicate.rows.tsv").getPath(), 1L, "2011-04-10T00:00:00.000Z/2011-04-11T00:00:00.000Z", 0, @@ -75,7 +77,8 @@ public static Collection data(){ first }, { - DetermineHashedPartitionsJobTest.class.getClass().getResource("/druid.test.data.with.duplicate.rows.tsv").getPath(), + DetermineHashedPartitionsJobTest.class.getClass() + .getResource("/druid.test.data.with.duplicate.rows.tsv").getPath(), 100L, "2011-04-10T00:00:00.000Z/2011-04-16T00:00:00.000Z", 0, @@ -83,7 +86,8 @@ public static Collection data(){ second }, { - DetermineHashedPartitionsJobTest.class.getClass().getResource("/druid.test.data.with.duplicate.rows.tsv").getPath(), + DetermineHashedPartitionsJobTest.class.getClass() + .getResource("/druid.test.data.with.duplicate.rows.tsv").getPath(), 1L, "2011-04-10T00:00:00.000Z/2011-04-16T00:00:00.000Z", 0, @@ -116,7 +120,12 @@ public DetermineHashedPartitionsJobTest( new DelimitedParseSpec( new TimestampSpec("ts", null, null), new DimensionsSpec( - DimensionsSpec.getDefaultSchemas(ImmutableList.of("market", "quality", "placement", "placementish")), + DimensionsSpec.getDefaultSchemas(ImmutableList.of( + "market", + "quality", + "placement", + "placementish" + )), null, null ), @@ -129,7 +138,8 @@ public DetermineHashedPartitionsJobTest( "placement", "placementish", "index" - ) + ), + false ), null ), @@ -176,7 +186,8 @@ public DetermineHashedPartitionsJobTest( } @Test - public void 
testDetermineHashedPartitions(){ + public void testDetermineHashedPartitions() + { DetermineHashedPartitionsJob determineHashedPartitionsJob = new DetermineHashedPartitionsJob(indexerConfig); determineHashedPartitionsJob.run(); Map> shardSpecs = indexerConfig.getSchema().getTuningConfig().getShardSpecs(); @@ -184,8 +195,8 @@ public void testDetermineHashedPartitions(){ expectedNumTimeBuckets, shardSpecs.entrySet().size() ); - int i=0; - for(Map.Entry> entry : shardSpecs.entrySet()) { + int i = 0; + for (Map.Entry> entry : shardSpecs.entrySet()) { Assert.assertEquals( expectedNumOfShards[i++], entry.getValue().size(), diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/DeterminePartitionsJobTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/DeterminePartitionsJobTest.java index 9fc2e8eb0400..81766d3fd56a 100644 --- a/indexing-hadoop/src/test/java/io/druid/indexer/DeterminePartitionsJobTest.java +++ b/indexing-hadoop/src/test/java/io/druid/indexer/DeterminePartitionsJobTest.java @@ -227,7 +227,8 @@ public DeterminePartitionsJobTest( new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host", "country")), null, null), null, - ImmutableList.of("timestamp", "host", "country", "visited_num") + ImmutableList.of("timestamp", "host", "country", "visited_num"), + false ), null ), diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/IndexGeneratorJobTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/IndexGeneratorJobTest.java index 250b9dd8b045..8ba7bfb9487f 100644 --- a/indexing-hadoop/src/test/java/io/druid/indexer/IndexGeneratorJobTest.java +++ b/indexing-hadoop/src/test/java/io/druid/indexer/IndexGeneratorJobTest.java @@ -142,7 +142,8 @@ public static Collection constructFeed() new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, - ImmutableList.of("timestamp", "host", 
"visited_num") + ImmutableList.of("timestamp", "host", "visited_num"), + false ), null ), @@ -188,7 +189,8 @@ public static Collection constructFeed() new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, - ImmutableList.of("timestamp", "host", "visited_num") + ImmutableList.of("timestamp", "host", "visited_num"), + false ) ), null, @@ -233,7 +235,8 @@ public static Collection constructFeed() new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, - ImmutableList.of("timestamp", "host", "visited_num") + ImmutableList.of("timestamp", "host", "visited_num"), + false ), null ), @@ -289,7 +292,8 @@ public static Collection constructFeed() new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, - ImmutableList.of("timestamp", "host", "visited_num") + ImmutableList.of("timestamp", "host", "visited_num"), + false ) ), null, diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java index 8af6c470ff69..74d6ffeb23eb 100644 --- a/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java +++ b/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java @@ -77,7 +77,8 @@ public void setup() throws Exception new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, - ImmutableList.of("timestamp", "host", "visited_num") + ImmutableList.of("timestamp", "host", "visited_num"), + false ), null ), diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/path/DatasourcePathSpecTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/path/DatasourcePathSpecTest.java index f1c7d558c909..b29d3bb9c4a3 100644 --- 
a/indexing-hadoop/src/test/java/io/druid/indexer/path/DatasourcePathSpecTest.java +++ b/indexing-hadoop/src/test/java/io/druid/indexer/path/DatasourcePathSpecTest.java @@ -265,7 +265,8 @@ private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig() new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(null, null, null), null, - ImmutableList.of("timestamp", "host", "visited") + ImmutableList.of("timestamp", "host", "visited"), + false ), null ), diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java index 4b9adba54c4f..ffa6d4780173 100644 --- a/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java +++ b/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java @@ -164,7 +164,8 @@ public InputStream openStream() throws IOException new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList(TestIndex.DIMENSIONS)), null, null), "\t", "\u0001", - Arrays.asList(TestIndex.COLUMNS) + Arrays.asList(TestIndex.COLUMNS), + false ), null ), diff --git a/indexing-service/src/main/java/io/druid/indexing/common/task/IndexTask.java b/indexing-service/src/main/java/io/druid/indexing/common/task/IndexTask.java index 75a86d8b84bb..c13671d23204 100644 --- a/indexing-service/src/main/java/io/druid/indexing/common/task/IndexTask.java +++ b/indexing-service/src/main/java/io/druid/indexing/common/task/IndexTask.java @@ -272,6 +272,10 @@ private Map> determineShardSpecs( while (firehose.hasMore()) { final InputRow inputRow = firehose.nextRow(); + if (inputRow == null) { + continue; + } + final Interval interval; if (determineIntervals) { interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp()); diff --git a/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java 
b/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java index 81e85c70dba1..864ee8ab777e 100644 --- a/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java +++ b/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java @@ -23,6 +23,7 @@ import com.google.common.collect.Lists; import io.druid.data.input.impl.CSVParseSpec; import io.druid.data.input.impl.DimensionsSpec; +import io.druid.data.input.impl.ParseSpec; import io.druid.data.input.impl.SpatialDimensionSchema; import io.druid.data.input.impl.StringInputRowParser; import io.druid.data.input.impl.TimestampSpec; @@ -74,6 +75,23 @@ public class IndexTaskTest { @Rule public TemporaryFolder temporaryFolder = new TemporaryFolder(); + + private static final ParseSpec DEFAULT_PARSE_SPEC = new CSVParseSpec( + new TimestampSpec( + "ts", + "auto", + null + ), + new DimensionsSpec( + DimensionsSpec.getDefaultSchemas(Arrays.asList("ts", "dim")), + Lists.newArrayList(), + Lists.newArrayList() + ), + null, + Arrays.asList("ts", "dim", "val"), + false + ); + private final IndexSpec indexSpec; private final ObjectMapper jsonMapper; private IndexMerger indexMerger; @@ -107,7 +125,7 @@ public void testDeterminePartitions() throws Exception IndexTask indexTask = new IndexTask( null, null, - createIngestionSpec(tmpDir, null, 2, null, false, false), + createIngestionSpec(tmpDir, null, null, 2, null, false, false), null, jsonMapper ); @@ -145,7 +163,7 @@ public void testForceExtendableShardSpecs() throws Exception IndexTask indexTask = new IndexTask( null, null, - createIngestionSpec(tmpDir, null, 2, null, true, false), + createIngestionSpec(tmpDir, null, null, 2, null, true, false), null, jsonMapper ); @@ -185,6 +203,7 @@ public void testWithArbitraryGranularity() throws Exception null, createIngestionSpec( tmpDir, + null, new ArbitraryGranularitySpec( Granularities.MINUTE, Arrays.asList(new Interval("2014/2015")) @@ -220,6 +239,7 @@ public void 
testIntervalBucketing() throws Exception null, createIngestionSpec( tmpDir, + null, new UniformGranularitySpec( Granularities.HOUR, Granularities.HOUR, @@ -254,7 +274,7 @@ public void testNumShardsProvided() throws Exception IndexTask indexTask = new IndexTask( null, null, - createIngestionSpec(tmpDir, null, null, 1, false, false), + createIngestionSpec(tmpDir, null, null, null, 1, false, false), null, jsonMapper ); @@ -285,7 +305,7 @@ public void testAppendToExisting() throws Exception IndexTask indexTask = new IndexTask( null, null, - createIngestionSpec(tmpDir, null, 2, null, false, true), + createIngestionSpec(tmpDir, null, null, 2, null, false, true), null, jsonMapper ); @@ -323,6 +343,7 @@ public void testIntervalNotSpecified() throws Exception null, createIngestionSpec( tmpDir, + null, new UniformGranularitySpec( Granularities.HOUR, Granularities.MINUTE, @@ -357,6 +378,110 @@ public void testIntervalNotSpecified() throws Exception Assert.assertEquals(0, segments.get(2).getShardSpec().getPartitionNum()); } + @Test + public void testCSVFileWithHeader() throws Exception + { + File tmpDir = temporaryFolder.newFolder(); + + File tmpFile = File.createTempFile("druid", "index", tmpDir); + + PrintWriter writer = new PrintWriter(tmpFile); + writer.println("time,d,val"); + writer.println("2014-01-01T00:00:10Z,a,1"); + + writer.close(); + + IndexTask indexTask = new IndexTask( + null, + null, + createIngestionSpec( + tmpDir, + new CSVParseSpec( + new TimestampSpec( + "time", + "auto", + null + ), + new DimensionsSpec( + null, + Lists.newArrayList(), + Lists.newArrayList() + ), + null, + null, + true + ), + null, + 2, + null, + false, + false + ), + null, + jsonMapper + ); + + final List segments = runTask(indexTask); + + Assert.assertEquals(1, segments.size()); + + Assert.assertEquals(Arrays.asList("d"), segments.get(0).getDimensions()); + Assert.assertEquals(Arrays.asList("val"), segments.get(0).getMetrics()); + Assert.assertEquals(new Interval("2014/P1D"), 
segments.get(0).getInterval()); + } + + @Test + public void testCSVFileWithHeaderColumnOverride() throws Exception + { + File tmpDir = temporaryFolder.newFolder(); + + File tmpFile = File.createTempFile("druid", "index", tmpDir); + + PrintWriter writer = new PrintWriter(tmpFile); + writer.println("time,d,val"); + writer.println("2014-01-01T00:00:10Z,a,1"); + + writer.close(); + + IndexTask indexTask = new IndexTask( + null, + null, + createIngestionSpec( + tmpDir, + new CSVParseSpec( + new TimestampSpec( + "time", + "auto", + null + ), + new DimensionsSpec( + null, + Lists.newArrayList(), + Lists.newArrayList() + ), + null, + Arrays.asList("time", "dim", "val"), + true + ), + null, + 2, + null, + false, + false + ), + null, + jsonMapper + ); + + final List segments = runTask(indexTask); + + Assert.assertEquals(1, segments.size()); + + Assert.assertEquals(Arrays.asList("dim"), segments.get(0).getDimensions()); + Assert.assertEquals(Arrays.asList("val"), segments.get(0).getMetrics()); + Assert.assertEquals(new Interval("2014/P1D"), segments.get(0).getInterval()); + } + private final List runTask(final IndexTask indexTask) throws Exception { final List segments = Lists.newArrayList(); @@ -434,6 +559,7 @@ public DataSegment push(File file, DataSegment segment) throws IOException private IndexTask.IndexIngestionSpec createIngestionSpec( File baseDir, + ParseSpec parseSpec, GranularitySpec granularitySpec, Integer targetPartitionSize, Integer numShards, @@ -446,20 +572,7 @@ private IndexTask.IndexIngestionSpec createIngestionSpec( "test", jsonMapper.convertValue( new StringInputRowParser( - new CSVParseSpec( - new TimestampSpec( - "ts", - "auto", - null - ), - new DimensionsSpec( - DimensionsSpec.getDefaultSchemas(Arrays.asList("ts", "dim")), - Lists.newArrayList(), - Lists.newArrayList() - ), - null, - Arrays.asList("ts", "dim", "val") - ), + parseSpec != null ? 
parseSpec : DEFAULT_PARSE_SPEC, null ), Map.class diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java index e45ed7e6acc7..b9c8a06f383c 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java @@ -40,6 +40,8 @@ public class CSVParser implements Parser private final au.com.bytecode.opencsv.CSVParser parser = new au.com.bytecode.opencsv.CSVParser(); private ArrayList fieldNames = null; + private boolean firstRowIsHeader = false; + private boolean hasParsedHeader = true; public CSVParser(final Optional listDelimiter) { @@ -78,6 +80,18 @@ public CSVParser(final Optional listDelimiter, final String header) setFieldNames(header); } + public CSVParser( + final Optional listDelimiter, + final Iterable fieldNames, + final boolean firstRowIsHeader + ) + { + this(listDelimiter, fieldNames); + + this.firstRowIsHeader = firstRowIsHeader; + this.hasParsedHeader = !firstRowIsHeader; + } + public String getListDelimiter() { return listDelimiter; @@ -92,8 +106,10 @@ public List getFieldNames() @Override public void setFieldNames(final Iterable fieldNames) { - ParserUtils.validateFields(fieldNames); - this.fieldNames = Lists.newArrayList(fieldNames); + if (fieldNames != null) { + ParserUtils.validateFields(fieldNames); + this.fieldNames = Lists.newArrayList(fieldNames); + } } public void setFieldNames(final String header) @@ -112,6 +128,14 @@ public Map parse(final String input) try { String[] values = parser.parseLine(input); + if (firstRowIsHeader && !hasParsedHeader) { + if (fieldNames == null) { + setFieldNames(Arrays.asList(values)); + } + hasParsedHeader = true; + return null; + } + if (fieldNames == null) { setFieldNames(ParserUtils.generateFieldNames(values.length)); } @@ -122,4 +146,10 @@ public Map parse(final String input) throw new ParseException(e, "Unable 
to parse row [%s]", input); } } + + @Override + public void reset() + { + hasParsedHeader = !firstRowIsHeader; + } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java index c37c4bda989c..948500bb5d3f 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java @@ -44,6 +44,8 @@ public class DelimitedParser implements Parser private final Function valueFunction; private ArrayList fieldNames = null; + private boolean firstRowIsHeader = false; + private boolean hasParsedHeader = true; public DelimitedParser(final Optional delimiter, Optional listDelimiter) { @@ -89,13 +91,24 @@ public DelimitedParser( } public DelimitedParser(final Optional delimiter, final Optional listDelimiter, final String header) - { this(delimiter, listDelimiter); setFieldNames(header); } + public DelimitedParser( + final Optional delimiter, + final Optional listDelimiter, + final boolean firstRowIsHeader + ) + { + this(delimiter, listDelimiter); + + this.firstRowIsHeader = firstRowIsHeader; + this.hasParsedHeader = !firstRowIsHeader; + } + public String getDelimiter() { return delimiter; @@ -135,6 +148,14 @@ public Map parse(final String input) try { Iterable values = splitter.split(input); + if (firstRowIsHeader && !hasParsedHeader) { + if (fieldNames == null) { + setFieldNames(values); + } + hasParsedHeader = true; + return null; + } + if (fieldNames == null) { setFieldNames(ParserUtils.generateFieldNames(Iterators.size(values.iterator()))); } @@ -145,4 +166,10 @@ public Map parse(final String input) throw new ParseException(e, "Unable to parse row [%s]", input); } } + + @Override + public void reset() + { + hasParsedHeader = !firstRowIsHeader; + } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/JSONParser.java 
b/java-util/src/main/java/io/druid/java/util/common/parsers/JSONParser.java index 15d2dc8b3b8c..2a6885c35cbd 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/JSONParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/JSONParser.java @@ -150,4 +150,10 @@ public Map parse(String input) throw new ParseException(e, "Unable to parse row [%s]", input); } } + + @Override + public void reset() + { + // do nothing + } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/JSONPathParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/JSONPathParser.java index 2717d2db915a..404f5cc75917 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/JSONPathParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/JSONPathParser.java @@ -282,4 +282,9 @@ public String getExpr() } } + @Override + public void reset() + { + // do nothing + } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/JavaScriptParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/JavaScriptParser.java index 2357be2245c4..36dbbaf467b0 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/JavaScriptParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/JavaScriptParser.java @@ -99,4 +99,10 @@ public List getFieldNames() { throw new UnsupportedOperationException(); } + + @Override + public void reset() + { + // do nothing + } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java index 78b29de72244..2c21779c678a 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java @@ -34,6 +34,11 @@ public interface Parser */ public Map parse(String input); + /** + * Resets state within a parser. 
+ */ + public void reset(); + /** * Set the fieldNames that you expect to see in parsed Maps. Deprecated; Parsers should not, in general, be * expected to know what fields they will return. Some individual types of parsers do need to know (like a TSV diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/RegexParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/RegexParser.java index 87b6321aa20f..7cd0061b12d0 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/RegexParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/RegexParser.java @@ -117,9 +117,14 @@ public void setFieldNames(Iterable fieldNames) } @Override - public List getFieldNames() { return fieldNames; } + + @Override + public void reset() + { + // do nothing + } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/ToLowerCaseParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/ToLowerCaseParser.java index 7e45bf23d941..7eef985e53eb 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/ToLowerCaseParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/ToLowerCaseParser.java @@ -64,4 +64,10 @@ public List getFieldNames() { return baseParser.getFieldNames(); } + + @Override + public void reset() + { + baseParser.reset(); + } } diff --git a/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java b/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java index d4fb1223ec82..a20afb72f5c5 100644 --- a/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java +++ b/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java @@ -130,7 +130,8 @@ public static void setupClass() throws Exception new TimestampSpec("timestamp", "iso", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("product", "tags")), null, null), "\t", - ImmutableList.of("timestamp", "product", "tags") + 
ImmutableList.of("timestamp", "product", "tags"), + false ), "UTF-8" ); diff --git a/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerFactoryTest.java b/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerFactoryTest.java index dc7648154da2..c46c8eb8d22d 100644 --- a/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerFactoryTest.java +++ b/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerFactoryTest.java @@ -144,7 +144,8 @@ private Segment createSegment() throws Exception new TimestampSpec("timestamp", "iso", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("product", "tags")), null, null), "\t", - ImmutableList.of("timestamp", "product", "tags") + ImmutableList.of("timestamp", "product", "tags"), + false ), "UTF-8" ); diff --git a/processing/src/test/java/io/druid/segment/TestIndex.java b/processing/src/test/java/io/druid/segment/TestIndex.java index 6f5297684919..3289836d9553 100644 --- a/processing/src/test/java/io/druid/segment/TestIndex.java +++ b/processing/src/test/java/io/druid/segment/TestIndex.java @@ -291,7 +291,8 @@ public static IncrementalIndex loadIncrementalIndex( new DimensionsSpec(DIMENSION_SCHEMAS, null, null), "\t", "\u0001", - Arrays.asList(COLUMNS) + Arrays.asList(COLUMNS), + false ) , "utf8" ); diff --git a/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java b/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java index 53a586ce4f28..2bca6eae939a 100644 --- a/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java +++ b/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java @@ -162,6 +162,11 @@ public ReplayableFirehose(InputRowParser parser) throws IOException while (delegateFirehose.hasMore() && cos.getCount() < getMaxTempFileSize()) { try { InputRow row = delegateFirehose.nextRow(); + + if (row == null) { + continue; + } + 
generator.writeObject(row); dimensionScratch.addAll(row.getDimensions()); counter++; diff --git a/server/src/test/java/io/druid/segment/realtime/firehose/IngestSegmentFirehoseTest.java b/server/src/test/java/io/druid/segment/realtime/firehose/IngestSegmentFirehoseTest.java index ea8b5590701a..1e295721a820 100644 --- a/server/src/test/java/io/druid/segment/realtime/firehose/IngestSegmentFirehoseTest.java +++ b/server/src/test/java/io/druid/segment/realtime/firehose/IngestSegmentFirehoseTest.java @@ -108,7 +108,8 @@ private void createTestIndex(File segmentDir) throws Exception new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, - ImmutableList.of("timestamp", "host", "visited") + ImmutableList.of("timestamp", "host", "visited"), + false ), Charsets.UTF_8.toString() ); From c3965ed0e04f6ab12773e20accd3e79dafd63df6 Mon Sep 17 00:00:00 2001 From: fjy Date: Thu, 20 Apr 2017 16:32:33 -0700 Subject: [PATCH 02/20] small fixes --- .../java/io/druid/data/input/impl/FileIteratingFirehose.java | 2 +- docs/content/ingestion/data-formats.md | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java index cebf0e018e99..185f40b1309a 100644 --- a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java +++ b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java @@ -51,7 +51,6 @@ public boolean hasMore() { while ((lineIterator == null || !lineIterator.hasNext()) && lineIterators.hasNext()) { lineIterator = lineIterators.next(); - parser.reset(); } return lineIterator != null && lineIterator.hasNext(); @@ -68,6 +67,7 @@ public InputRow nextRow() } lineIterator = lineIterators.next(); + parser.reset(); } return parser.parse(lineIterator.next()); diff --git a/docs/content/ingestion/data-formats.md 
b/docs/content/ingestion/data-formats.md index eef8d32c2d42..96e95b2a5983 100644 --- a/docs/content/ingestion/data-formats.md +++ b/docs/content/ingestion/data-formats.md @@ -72,8 +72,6 @@ If you have nested JSON, [Druid can automatically flatten it for you](flatten-js ### CSV -Since the CSV data cannot contain the column names (no header is allowed), these must be added before that data can be processed: - ```json "parseSpec":{ "format" : "csv", From f68be6b68494cafbd19ca561523bca0c008ceebb Mon Sep 17 00:00:00 2001 From: fjy Date: Thu, 20 Apr 2017 16:40:06 -0700 Subject: [PATCH 03/20] fix bug --- .../main/java/io/druid/data/input/impl/StringInputRowParser.java | 1 - 1 file changed, 1 deletion(-) diff --git a/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java b/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java index 58baeed5b7ca..0b14a5a4d827 100644 --- a/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java +++ b/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java @@ -32,7 +32,6 @@ import java.nio.charset.Charset; import java.nio.charset.CoderResult; import java.nio.charset.CodingErrorAction; -import java.util.LinkedHashMap; import java.util.Map; /** From e188eea46cf7ea7e7964a9a76fa63a7a09189f4c Mon Sep 17 00:00:00 2001 From: fjy Date: Tue, 25 Apr 2017 15:23:15 -0700 Subject: [PATCH 04/20] fix bug --- .../java/io/druid/data/input/impl/DelimitedParseSpec.java | 4 +++- .../java/io/druid/data/input/impl/DelimitedParseSpecTest.java | 3 +-- .../io/druid/java/util/common/parsers/DelimitedParser.java | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java index c0c01520634d..026ad6d2ef73 100644 --- a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java @@ -106,12 +106,14 
@@ public Parser makeParser() return new DelimitedParser( Optional.fromNullable(delimiter), Optional.fromNullable(listDelimiter), + columns, firstRowIsHeader ); } else { return new DelimitedParser( Optional.fromNullable(delimiter), - Optional.fromNullable(listDelimiter) + Optional.fromNullable(listDelimiter), + columns ); } } diff --git a/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java b/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java index 3cadfee4ff25..af80391bb7b3 100644 --- a/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java +++ b/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java @@ -98,7 +98,7 @@ public void testComma() throws Exception ); } - @Test(expected = NullPointerException.class) + @Test(expected = IllegalArgumentException.class) public void testDefaultColumnList() { final DelimitedParseSpec spec = new DelimitedParseSpec( @@ -114,7 +114,6 @@ public void testDefaultColumnList() ), ",", null, - // pass null columns not allowed null, false ); diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java index 948500bb5d3f..a3abfdbc4a5c 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java @@ -100,6 +100,7 @@ public DelimitedParser(final Optional delimiter, final Optional public DelimitedParser( final Optional delimiter, final Optional listDelimiter, + final Iterable fieldNames, final boolean firstRowIsHeader ) { From 96374cb81ec0316395b5eb835239e3da4789af8f Mon Sep 17 00:00:00 2001 From: fjy Date: Tue, 25 Apr 2017 17:43:20 -0700 Subject: [PATCH 05/20] address code review --- .../druid/data/input/impl/CSVParseSpec.java | 26 +++++------ .../data/input/impl/DelimitedParseSpec.java | 44 ++++++++----------- 
.../input/impl/FileIteratingFirehose.java | 6 +++ .../data/input/impl/StringInputRowParser.java | 6 +-- .../data/input/impl/TimeAndDimsParseSpec.java | 7 --- docs/content/ingestion/data-formats.md | 21 ++++++--- .../namespace/URIExtractionNamespace.java | 8 ---- .../java/util/common/parsers/CSVParser.java | 12 ++--- .../java/util/common/parsers/JSONParser.java | 6 --- .../util/common/parsers/JSONPathParser.java | 8 +--- .../util/common/parsers/JavaScriptParser.java | 6 --- .../java/util/common/parsers/Parser.java | 5 ++- .../java/util/common/parsers/RegexParser.java | 6 --- 13 files changed, 65 insertions(+), 96 deletions(-) diff --git a/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java b/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java index e8b2788db53a..601664657884 100644 --- a/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java @@ -34,7 +34,7 @@ public class CSVParseSpec extends ParseSpec { private final String listDelimiter; private final List columns; - private final boolean firstRowIsHeader; + private final boolean hasHeaderRow; @JsonCreator public CSVParseSpec( @@ -42,7 +42,7 @@ public CSVParseSpec( @JsonProperty("dimensionsSpec") DimensionsSpec dimensionsSpec, @JsonProperty("listDelimiter") String listDelimiter, @JsonProperty("columns") List columns, - @JsonProperty("firstRowIsHeader") boolean firstRowIsHeader + @JsonProperty("hasHeaderRow") boolean hasHeaderRow ) { super(timestampSpec, dimensionsSpec); @@ -57,12 +57,12 @@ public CSVParseSpec( verify(dimensionsSpec.getDimensionNames()); } else { Preconditions.checkArgument( - firstRowIsHeader, - "If columns field is not set, the first row of your data must have your header and firstRowIsHeader must be set to true." + hasHeaderRow, + "If columns field is not set, the first row of your data must have your header and hasHeaderRow must be set to true." 
); } - this.firstRowIsHeader = firstRowIsHeader; + this.hasHeaderRow = hasHeaderRow; } @JsonProperty @@ -78,9 +78,9 @@ public List getColumns() } @JsonProperty - public boolean isFirstRowIsHeader() + public boolean isHasHeaderRow() { - return firstRowIsHeader; + return hasHeaderRow; } @Override @@ -94,27 +94,23 @@ public void verify(List usedCols) @Override public Parser makeParser() { - if (firstRowIsHeader) { - return new CSVParser(Optional.fromNullable(listDelimiter), columns, firstRowIsHeader); - } else { - return new CSVParser(Optional.fromNullable(listDelimiter), columns); - } + return new CSVParser(Optional.fromNullable(listDelimiter), columns, hasHeaderRow); } @Override public ParseSpec withTimestampSpec(TimestampSpec spec) { - return new CSVParseSpec(spec, getDimensionsSpec(), listDelimiter, columns, firstRowIsHeader); + return new CSVParseSpec(spec, getDimensionsSpec(), listDelimiter, columns, hasHeaderRow); } @Override public ParseSpec withDimensionsSpec(DimensionsSpec spec) { - return new CSVParseSpec(getTimestampSpec(), spec, listDelimiter, columns, firstRowIsHeader); + return new CSVParseSpec(getTimestampSpec(), spec, listDelimiter, columns, hasHeaderRow); } public ParseSpec withColumns(List cols) { - return new CSVParseSpec(getTimestampSpec(), getDimensionsSpec(), listDelimiter, cols, firstRowIsHeader); + return new CSVParseSpec(getTimestampSpec(), getDimensionsSpec(), listDelimiter, cols, hasHeaderRow); } } diff --git a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java index 026ad6d2ef73..4b0b218b20f1 100644 --- a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java @@ -35,7 +35,7 @@ public class DelimitedParseSpec extends ParseSpec private final String delimiter; private final String listDelimiter; private final List columns; - private final boolean firstRowIsHeader; + private 
final boolean hasHeaderRow; @JsonCreator public DelimitedParseSpec( @@ -44,7 +44,7 @@ public DelimitedParseSpec( @JsonProperty("delimiter") String delimiter, @JsonProperty("listDelimiter") String listDelimiter, @JsonProperty("columns") List columns, - @JsonProperty("firstRowIsHeader") boolean firstRowIsHeader + @JsonProperty("hasHeaderRow") boolean hasHeaderRow ) { super(timestampSpec, dimensionsSpec); @@ -59,12 +59,12 @@ public DelimitedParseSpec( verify(dimensionsSpec.getDimensionNames()); } else { Preconditions.checkArgument( - firstRowIsHeader, - "If columns field is not set, the first row of your data must have your header and firstRowIsHeader must be set to true." + hasHeaderRow, + "If columns field is not set, the first row of your data must have your header and hasHeaderRow must be set to true." ); } - this.firstRowIsHeader = firstRowIsHeader; + this.hasHeaderRow = hasHeaderRow; } @JsonProperty("delimiter") @@ -86,9 +86,9 @@ public List getColumns() } @JsonProperty - public boolean isFirstRowIsHeader() + public boolean isHasHeaderRow() { - return firstRowIsHeader; + return hasHeaderRow; } @Override @@ -102,50 +102,42 @@ public void verify(List usedCols) @Override public Parser makeParser() { - if (firstRowIsHeader) { - return new DelimitedParser( - Optional.fromNullable(delimiter), - Optional.fromNullable(listDelimiter), - columns, - firstRowIsHeader - ); - } else { - return new DelimitedParser( - Optional.fromNullable(delimiter), - Optional.fromNullable(listDelimiter), - columns - ); - } + return new DelimitedParser( + Optional.fromNullable(delimiter), + Optional.fromNullable(listDelimiter), + columns, + hasHeaderRow + ); } @Override public ParseSpec withTimestampSpec(TimestampSpec spec) { - return new DelimitedParseSpec(spec, getDimensionsSpec(), delimiter, listDelimiter, columns, firstRowIsHeader); + return new DelimitedParseSpec(spec, getDimensionsSpec(), delimiter, listDelimiter, columns, hasHeaderRow); } @Override public ParseSpec 
withDimensionsSpec(DimensionsSpec spec) { - return new DelimitedParseSpec(getTimestampSpec(), spec, delimiter, listDelimiter, columns, firstRowIsHeader); + return new DelimitedParseSpec(getTimestampSpec(), spec, delimiter, listDelimiter, columns, hasHeaderRow); } public ParseSpec withDelimiter(String delim) { return new DelimitedParseSpec(getTimestampSpec(), getDimensionsSpec(), delim, listDelimiter, columns, - firstRowIsHeader + hasHeaderRow ); } public ParseSpec withListDelimiter(String delim) { - return new DelimitedParseSpec(getTimestampSpec(), getDimensionsSpec(), delimiter, delim, columns, firstRowIsHeader); + return new DelimitedParseSpec(getTimestampSpec(), getDimensionsSpec(), delimiter, delim, columns, hasHeaderRow); } public ParseSpec withColumns(List cols) { return new DelimitedParseSpec(getTimestampSpec(), getDimensionsSpec(), delimiter, listDelimiter, cols, - firstRowIsHeader + hasHeaderRow ); } } diff --git a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java index 185f40b1309a..03edd8eb462e 100644 --- a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java +++ b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java @@ -50,7 +50,13 @@ public FileIteratingFirehose( public boolean hasMore() { while ((lineIterator == null || !lineIterator.hasNext()) && lineIterators.hasNext()) { + // Close old streams, maybe. 
+ if (lineIterator != null) { + lineIterator.close(); + } + lineIterator = lineIterators.next(); + parser.reset(); } return lineIterator != null && lineIterator.hasNext(); diff --git a/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java b/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java index 0b14a5a4d827..15e9e1810127 100644 --- a/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java +++ b/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java @@ -129,9 +129,9 @@ public void reset() } public InputRow parse(String input) - { - return parseMap(parseString(input)); - } + { + return parseMap(parseString(input)); + } private Map parseString(String inputString) { diff --git a/api/src/main/java/io/druid/data/input/impl/TimeAndDimsParseSpec.java b/api/src/main/java/io/druid/data/input/impl/TimeAndDimsParseSpec.java index 2789537d1574..ec1fdb2a0402 100644 --- a/api/src/main/java/io/druid/data/input/impl/TimeAndDimsParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/TimeAndDimsParseSpec.java @@ -21,7 +21,6 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; - import io.druid.java.util.common.parsers.Parser; import java.util.List; @@ -64,12 +63,6 @@ public List getFieldNames() { throw new UnsupportedOperationException("not supported"); } - - @Override - public void reset() - { - // do nothing - } }; } diff --git a/docs/content/ingestion/data-formats.md b/docs/content/ingestion/data-formats.md index 96e95b2a5983..f539fa0b0a08 100644 --- a/docs/content/ingestion/data-formats.md +++ b/docs/content/ingestion/data-formats.md @@ -79,20 +79,25 @@ If you have nested JSON, [Druid can automatically flatten it for you](flatten-js "column" : "timestamp" }, "columns" : ["timestamp","page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city","added","deleted","delta"], - "firstRowIsHeader" : 
"false", "dimensionsSpec" : { "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] } } ``` +#### CSV Index Tasks + If your file does not have a header as the first line of the file, you must set the `columns` field and ensure that the order of the fields matches the columns of your input data in the same order. -If your file does have a header, set `firstRowIsHeader` to true, and do not include the `columns` key. +If your file does have a header, you can set a field called `hasHeaderRow` to true, and do not include the `columns` key. + +#### Other CSV Ingestion Tasks -### TSV +The `columns` field must be included and and ensure that the order of the fields matches the columns of your input data in the same order. + +### TSV (Delimited) ```json - "parseSpec":{ + "parseSpec": { "format" : "tsv", "timestampSpec" : { "column" : "timestamp" @@ -105,11 +110,17 @@ If your file does have a header, set `firstRowIsHeader` to true, and do not incl } ``` +#### TSV (Delimited) Index Tasks + If your file does not have a header as the first line of the file, you must set the `columns` field and ensure that the order of the fields matches the columns of your input data in the same order. -If your file does have a header, set `firstRowIsHeader` to true, and do not include the `columns` key. +If your file does have a header, you can set a field called `hasHeaderRow` to true, and do not include the `columns` key. Be sure to change the `delimiter` to the appropriate delimiter for your data. Like CSV, you must specify the columns and which subset of the columns you want indexed. +#### Other TSV (Delimited) Ingestion Tasks + +The `columns` field must be included and and ensure that the order of the fields matches the columns of your input data in the same order. 
+ ### Regex ```json diff --git a/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java b/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java index 9df181bdf8d3..971dfe5aa387 100644 --- a/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java +++ b/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java @@ -33,7 +33,6 @@ import com.google.common.base.Throwables; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; - import io.druid.guice.annotations.Json; import io.druid.java.util.common.IAE; import io.druid.java.util.common.UOE; @@ -41,7 +40,6 @@ import io.druid.java.util.common.parsers.DelimitedParser; import io.druid.java.util.common.parsers.JSONParser; import io.druid.java.util.common.parsers.Parser; - import org.joda.time.Period; import javax.annotation.Nullable; @@ -603,12 +601,6 @@ public List getFieldNames() { throw new UOE("No field names available"); } - - @Override - public void reset() - { - // do nothing - } }; } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java index b9c8a06f383c..4306ae4a65e6 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java @@ -40,7 +40,7 @@ public class CSVParser implements Parser private final au.com.bytecode.opencsv.CSVParser parser = new au.com.bytecode.opencsv.CSVParser(); private ArrayList fieldNames = null; - private boolean firstRowIsHeader = false; + private boolean hasHeaderRow = false; private boolean hasParsedHeader = true; public CSVParser(final Optional listDelimiter) @@ -83,13 +83,13 @@ public CSVParser(final Optional 
listDelimiter, final String header) public CSVParser( final Optional listDelimiter, final Iterable fieldNames, - final boolean firstRowIsHeader + final boolean hasHeaderRow ) { this(listDelimiter, fieldNames); - this.firstRowIsHeader = firstRowIsHeader; - this.hasParsedHeader = !firstRowIsHeader; + this.hasHeaderRow = hasHeaderRow; + this.hasParsedHeader = !hasHeaderRow; } public String getListDelimiter() @@ -128,7 +128,7 @@ public Map parse(final String input) try { String[] values = parser.parseLine(input); - if (firstRowIsHeader && !hasParsedHeader) { + if (hasHeaderRow && !hasParsedHeader) { if (fieldNames == null) { setFieldNames(Arrays.asList(values)); } @@ -150,6 +150,6 @@ public Map parse(final String input) @Override public void reset() { - hasParsedHeader = !firstRowIsHeader; + hasParsedHeader = !hasHeaderRow; } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/JSONParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/JSONParser.java index 2a6885c35cbd..15d2dc8b3b8c 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/JSONParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/JSONParser.java @@ -150,10 +150,4 @@ public Map parse(String input) throw new ParseException(e, "Unable to parse row [%s]", input); } } - - @Override - public void reset() - { - // do nothing - } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/JSONPathParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/JSONPathParser.java index 404f5cc75917..d23d599d3bca 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/JSONPathParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/JSONPathParser.java @@ -238,7 +238,7 @@ public enum FieldType /** * Specifies a field to be added to the parsed object Map, using JsonPath notation. - * + *

* See https://github.com/jayway/JsonPath for more information. */ public static class FieldSpec @@ -281,10 +281,4 @@ public String getExpr() return expr; } } - - @Override - public void reset() - { - // do nothing - } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/JavaScriptParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/JavaScriptParser.java index 36dbbaf467b0..2357be2245c4 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/JavaScriptParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/JavaScriptParser.java @@ -99,10 +99,4 @@ public List getFieldNames() { throw new UnsupportedOperationException(); } - - @Override - public void reset() - { - // do nothing - } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java index 2c21779c678a..d5c0ca395de8 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java @@ -37,7 +37,10 @@ public interface Parser /** * Resets state within a parser. */ - public void reset(); + public default void reset() + { + // do nothing + } /** * Set the fieldNames that you expect to see in parsed Maps. 
Deprecated; Parsers should not, in general, be diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/RegexParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/RegexParser.java index 7cd0061b12d0..329b02aa944a 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/RegexParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/RegexParser.java @@ -121,10 +121,4 @@ public List getFieldNames() { return fieldNames; } - - @Override - public void reset() - { - // do nothing - } } From 8bbb5d300029080ef1f59202817ed4c38b50ddd0 Mon Sep 17 00:00:00 2001 From: fjy Date: Tue, 25 Apr 2017 18:01:47 -0700 Subject: [PATCH 06/20] more cr --- .../data/input/impl/StringInputRowParser.java | 2 +- .../java/util/common/parsers/CSVParser.java | 57 +++++++++------ .../util/common/parsers/DelimitedParser.java | 70 ++++++++++++------- .../java/util/common/parsers/Parser.java | 2 +- .../firehose/ReplayableFirehoseFactory.java | 1 + 5 files changed, 82 insertions(+), 50 deletions(-) diff --git a/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java b/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java index 15e9e1810127..3041c8ec3191 100644 --- a/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java +++ b/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java @@ -140,7 +140,7 @@ private Map parseString(String inputString) private InputRow parseMap(Map theMap) { - // if the file has a header, null is returned + // If a header is present in the data (and with proper configurations), a null is returned if (theMap == null) { return null; } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java index 4306ae4a65e6..1ea546811865 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java +++ 
b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java @@ -33,26 +33,17 @@ public class CSVParser implements Parser { - private final String listDelimiter; - private final Splitter listSplitter; - private final Function valueFunction; - - private final au.com.bytecode.opencsv.CSVParser parser = new au.com.bytecode.opencsv.CSVParser(); - - private ArrayList fieldNames = null; - private boolean hasHeaderRow = false; - private boolean hasParsedHeader = true; - - public CSVParser(final Optional listDelimiter) + private static final Function getValueFunction( + final String listDelimiter, + final Splitter listSplitter + ) { - this.listDelimiter = listDelimiter.isPresent() ? listDelimiter.get() : Parsers.DEFAULT_LIST_DELIMITER; - this.listSplitter = Splitter.on(this.listDelimiter); - this.valueFunction = new Function() + return new Function() { @Override public Object apply(String input) { - if (input.contains(CSVParser.this.listDelimiter)) { + if (input.contains(listDelimiter)) { return Lists.newArrayList( Iterables.transform( listSplitter.split(input), @@ -66,6 +57,26 @@ public Object apply(String input) }; } + private final String listDelimiter; + private final Splitter listSplitter; + private final Function valueFunction; + + private final boolean hasHeaderRow; + + private final au.com.bytecode.opencsv.CSVParser parser = new au.com.bytecode.opencsv.CSVParser(); + + private ArrayList fieldNames = null; + private boolean hasParsedHeader = false; + + public CSVParser(final Optional listDelimiter) + { + this.listDelimiter = listDelimiter.isPresent() ? 
listDelimiter.get() : Parsers.DEFAULT_LIST_DELIMITER; + this.listSplitter = Splitter.on(this.listDelimiter); + this.valueFunction = getValueFunction(this.listDelimiter, this.listSplitter); + + this.hasHeaderRow = false; + } + public CSVParser(final Optional listDelimiter, final Iterable fieldNames) { this(listDelimiter); @@ -86,10 +97,12 @@ public CSVParser( final boolean hasHeaderRow ) { - this(listDelimiter, fieldNames); - + this.listDelimiter = listDelimiter.isPresent() ? listDelimiter.get() : Parsers.DEFAULT_LIST_DELIMITER; + this.listSplitter = Splitter.on(this.listDelimiter); + this.valueFunction = getValueFunction(this.listDelimiter, this.listSplitter); this.hasHeaderRow = hasHeaderRow; - this.hasParsedHeader = !hasHeaderRow; + + setFieldNames(fieldNames); } public String getListDelimiter() @@ -106,10 +119,8 @@ public List getFieldNames() @Override public void setFieldNames(final Iterable fieldNames) { - if (fieldNames != null) { - ParserUtils.validateFields(fieldNames); - this.fieldNames = Lists.newArrayList(fieldNames); - } + ParserUtils.validateFields(fieldNames); + this.fieldNames = Lists.newArrayList(fieldNames); } public void setFieldNames(final String header) @@ -150,6 +161,6 @@ public Map parse(final String input) @Override public void reset() { - hasParsedHeader = !hasHeaderRow; + hasParsedHeader = false; } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java index a3abfdbc4a5c..b6325ab19d6e 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java @@ -36,16 +36,40 @@ public class DelimitedParser implements Parser { private static final String DEFAULT_DELIMITER = "\t"; + private static final Function getValueFunction( + final String listDelimiter, + final Splitter listSplitter + ) + { + return new Function() + { 
+ @Override + public Object apply(String input) + { + if (input.contains(listDelimiter)) { + return Lists.newArrayList( + Iterables.transform( + listSplitter.split(input), + ParserUtils.nullEmptyStringFunction + ) + ); + } else { + return ParserUtils.nullEmptyStringFunction.apply(input); + } + } + }; + } + private final String delimiter; private final String listDelimiter; private final Splitter splitter; private final Splitter listSplitter; private final Function valueFunction; + private final boolean hasHeaderRow; private ArrayList fieldNames = null; - private boolean firstRowIsHeader = false; - private boolean hasParsedHeader = true; + private boolean hasParsedHeader = false; public DelimitedParser(final Optional delimiter, Optional listDelimiter) { @@ -60,23 +84,8 @@ public DelimitedParser(final Optional delimiter, Optional listDe this.splitter = Splitter.on(this.delimiter); this.listSplitter = Splitter.on(this.listDelimiter); - this.valueFunction = new Function() - { - @Override - public Object apply(String input) - { - if (input.contains(DelimitedParser.this.listDelimiter)) { - return Lists.newArrayList( - Iterables.transform( - listSplitter.split(input), - ParserUtils.nullEmptyStringFunction - ) - ); - } else { - return ParserUtils.nullEmptyStringFunction.apply(input); - } - } - }; + this.valueFunction = getValueFunction(this.listDelimiter, this.listSplitter); + this.hasHeaderRow = false; } public DelimitedParser( @@ -101,13 +110,24 @@ public DelimitedParser( final Optional delimiter, final Optional listDelimiter, final Iterable fieldNames, - final boolean firstRowIsHeader + final boolean hasHeaderRow ) { - this(delimiter, listDelimiter); + this.delimiter = delimiter.isPresent() ? delimiter.get() : DEFAULT_DELIMITER; + this.listDelimiter = listDelimiter.isPresent() ? 
listDelimiter.get() : Parsers.DEFAULT_LIST_DELIMITER; - this.firstRowIsHeader = firstRowIsHeader; - this.hasParsedHeader = !firstRowIsHeader; + Preconditions.checkState( + !this.delimiter.equals(this.listDelimiter), + "Cannot have same delimiter and list delimiter of [%s]", + this.delimiter + ); + + this.splitter = Splitter.on(this.delimiter); + this.listSplitter = Splitter.on(this.listDelimiter); + this.valueFunction = getValueFunction(this.listDelimiter, this.listSplitter); + this.hasHeaderRow = hasHeaderRow; + + setFieldNames(fieldNames); } public String getDelimiter() @@ -149,7 +169,7 @@ public Map parse(final String input) try { Iterable values = splitter.split(input); - if (firstRowIsHeader && !hasParsedHeader) { + if (hasHeaderRow && !hasParsedHeader) { if (fieldNames == null) { setFieldNames(values); } @@ -171,6 +191,6 @@ public Map parse(final String input) @Override public void reset() { - hasParsedHeader = !firstRowIsHeader; + hasParsedHeader = false; } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java index d5c0ca395de8..d284317a620f 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java @@ -35,7 +35,7 @@ public interface Parser public Map parse(String input); /** - * Resets state within a parser. + * Resets state within a parser. This may or may not get called at the start of reading of every file. 
*/ public default void reset() { diff --git a/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java b/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java index 2bca6eae939a..c50367a796ba 100644 --- a/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java +++ b/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java @@ -163,6 +163,7 @@ public ReplayableFirehose(InputRowParser parser) throws IOException try { InputRow row = delegateFirehose.nextRow(); + // If a header is present in the data (and with proper configurations), a null InputRow will get created if (row == null) { continue; } From aeb0fc68e927e6a774d91c26ea58f696ab70fb31 Mon Sep 17 00:00:00 2001 From: fjy Date: Tue, 25 Apr 2017 18:03:57 -0700 Subject: [PATCH 07/20] more cr --- .../query/lookup/namespace/URIExtractionNamespace.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java b/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java index 971dfe5aa387..a5103a4e2485 100644 --- a/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java +++ b/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java @@ -236,12 +236,6 @@ public List getFieldNames() { return delegate.getFieldNames(); } - - @Override - public void reset() - { - // do nothing - } } @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "format") From 7e7d6f04740e040ea5616d6fa22b0be93f536df1 Mon Sep 17 00:00:00 2001 From: fjy Date: Tue, 25 Apr 2017 18:12:44 -0700 Subject: [PATCH 08/20] more cr --- .../src/main/java/io/druid/indexing/common/task/IndexTask.java | 1 + 1 file changed, 1 insertion(+) diff --git 
a/indexing-service/src/main/java/io/druid/indexing/common/task/IndexTask.java b/indexing-service/src/main/java/io/druid/indexing/common/task/IndexTask.java index c13671d23204..70f6140da070 100644 --- a/indexing-service/src/main/java/io/druid/indexing/common/task/IndexTask.java +++ b/indexing-service/src/main/java/io/druid/indexing/common/task/IndexTask.java @@ -272,6 +272,7 @@ private Map> determineShardSpecs( while (firehose.hasMore()) { final InputRow inputRow = firehose.nextRow(); + // If a header is present in the data (and with proper configurations), a null InputRow will get created if (inputRow == null) { continue; } From 6ad82f59aa31b74cf29350db2dbc07ca7eade84e Mon Sep 17 00:00:00 2001 From: fjy Date: Wed, 26 Apr 2017 14:45:43 -0700 Subject: [PATCH 09/20] fix --- .../java/io/druid/java/util/common/parsers/CSVParser.java | 6 ++++-- .../io/druid/java/util/common/parsers/DelimitedParser.java | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java index 1ea546811865..377e8138ada4 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java @@ -119,8 +119,10 @@ public List getFieldNames() @Override public void setFieldNames(final Iterable fieldNames) { - ParserUtils.validateFields(fieldNames); - this.fieldNames = Lists.newArrayList(fieldNames); + if (fieldNames != null) { + ParserUtils.validateFields(fieldNames); + this.fieldNames = Lists.newArrayList(fieldNames); + } } public void setFieldNames(final String header) diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java index b6325ab19d6e..ddf43300fdc9 100644 --- 
a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java @@ -149,8 +149,10 @@ public List getFieldNames() @Override public void setFieldNames(final Iterable fieldNames) { - ParserUtils.validateFields(fieldNames); - this.fieldNames = Lists.newArrayList(fieldNames); + if (fieldNames != null) { + ParserUtils.validateFields(fieldNames); + this.fieldNames = Lists.newArrayList(fieldNames); + } } public void setFieldNames(String header) From d0ca1723146ca265af89b9662d6ca61700e0c236 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Sat, 6 May 2017 15:42:32 +0900 Subject: [PATCH 10/20] Skip head rows for CSV and TSV --- .../druid/data/input/impl/CSVParseSpec.java | 25 ++++++-- .../data/input/impl/DelimitedParseSpec.java | 61 ++++++++++++++++--- .../data/input/impl/CSVParseSpecTest.java | 6 +- .../input/impl/DelimitedParseSpecTest.java | 10 ++- .../input/impl/FileIteratingFirehoseTest.java | 3 +- .../druid/data/input/impl/ParseSpecTest.java | 9 ++- .../extensions-core/lookups-cached-global.md | 8 ++- docs/content/ingestion/data-formats.md | 10 ++- .../druid/segment/MapVirtualColumnTest.java | 3 +- .../namespace/URIExtractionNamespace.java | 13 ++-- .../namespace/URIExtractionNamespaceTest.java | 31 ++++++---- .../indexer/HadoopDruidIndexerConfig.java | 13 +++- .../indexer/BatchDeltaIngestionTest.java | 3 +- .../DetermineHashedPartitionsJobTest.java | 3 +- .../indexer/DeterminePartitionsJobTest.java | 3 +- .../druid/indexer/IndexGeneratorJobTest.java | 12 ++-- .../java/io/druid/indexer/JobHelperTest.java | 3 +- .../indexer/path/DatasourcePathSpecTest.java | 3 +- .../updater/HadoopConverterJobTest.java | 3 +- .../indexing/common/task/IndexTaskTest.java | 3 +- .../java/util/common/parsers/CSVParser.java | 20 ++++-- .../util/common/parsers/DelimitedParser.java | 25 ++++++-- .../util/common/parsers/CSVParserTest.java | 32 ++++++++-- 
.../common/parsers/DelimitedParserTest.java | 37 +++++++++-- .../druid/query/MultiValuedDimensionTest.java | 3 +- .../GroupByQueryRunnerFactoryTest.java | 3 +- .../test/java/io/druid/segment/TestIndex.java | 3 +- .../firehose/IngestSegmentFirehoseTest.java | 3 +- 28 files changed, 269 insertions(+), 82 deletions(-) diff --git a/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java b/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java index bbe1fc4d2288..fa27158814ad 100644 --- a/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java @@ -23,7 +23,6 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Optional; import com.google.common.base.Preconditions; - import io.druid.java.util.common.parsers.CSVParser; import io.druid.java.util.common.parsers.Parser; @@ -35,13 +34,15 @@ public class CSVParseSpec extends ParseSpec { private final String listDelimiter; private final List columns; + private final Integer maxNumSkipHeadRows; @JsonCreator public CSVParseSpec( @JsonProperty("timestampSpec") TimestampSpec timestampSpec, @JsonProperty("dimensionsSpec") DimensionsSpec dimensionsSpec, @JsonProperty("listDelimiter") String listDelimiter, - @JsonProperty("columns") List columns + @JsonProperty("columns") List columns, + @JsonProperty("maxNumSkipHeadRows") Integer maxNumSkipHeadRows ) { super(timestampSpec, dimensionsSpec); @@ -53,6 +54,7 @@ public CSVParseSpec( } this.columns = columns; + this.maxNumSkipHeadRows = maxNumSkipHeadRows; verify(dimensionsSpec.getDimensionNames()); } @@ -69,6 +71,12 @@ public List getColumns() return columns; } + @JsonProperty("maxNumSkipHeadRows") + public Integer getMaxNumSkipHeadRows() + { + return maxNumSkipHeadRows; + } + @Override public void verify(List usedCols) { @@ -80,23 +88,28 @@ public void verify(List usedCols) @Override public Parser makeParser() { - return new CSVParser(Optional.fromNullable(listDelimiter), 
columns); + return new CSVParser(Optional.fromNullable(listDelimiter), columns, maxNumSkipHeadRows); } @Override public ParseSpec withTimestampSpec(TimestampSpec spec) { - return new CSVParseSpec(spec, getDimensionsSpec(), listDelimiter, columns); + return new CSVParseSpec(spec, getDimensionsSpec(), listDelimiter, columns, maxNumSkipHeadRows); } @Override public ParseSpec withDimensionsSpec(DimensionsSpec spec) { - return new CSVParseSpec(getTimestampSpec(), spec, listDelimiter, columns); + return new CSVParseSpec(getTimestampSpec(), spec, listDelimiter, columns, maxNumSkipHeadRows); } public ParseSpec withColumns(List cols) { - return new CSVParseSpec(getTimestampSpec(), getDimensionsSpec(), listDelimiter, cols); + return new CSVParseSpec(getTimestampSpec(), getDimensionsSpec(), listDelimiter, cols, maxNumSkipHeadRows); + } + + public ParseSpec withMaxNumSkipHeadRows(Integer maxNumSkipHeadRows) + { + return new CSVParseSpec(getTimestampSpec(), getDimensionsSpec(), listDelimiter, columns, maxNumSkipHeadRows); } } diff --git a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java index 6d7096d7921f..410a65c3e9ac 100644 --- a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java @@ -36,6 +36,7 @@ public class DelimitedParseSpec extends ParseSpec private final String delimiter; private final String listDelimiter; private final List columns; + private final Integer maxNumSkipHeadRows; @JsonCreator public DelimitedParseSpec( @@ -43,15 +44,17 @@ public DelimitedParseSpec( @JsonProperty("dimensionsSpec") DimensionsSpec dimensionsSpec, @JsonProperty("delimiter") String delimiter, @JsonProperty("listDelimiter") String listDelimiter, - @JsonProperty("columns") List columns + @JsonProperty("columns") List columns, + @JsonProperty("maxNumSkipHeadRows") Integer maxNumSkipHeadRows ) { super(timestampSpec, 
dimensionsSpec); this.delimiter = delimiter; this.listDelimiter = listDelimiter; - Preconditions.checkNotNull(columns, "columns"); - this.columns = columns; + this.columns = Preconditions.checkNotNull(columns, "columns"); + this.maxNumSkipHeadRows = maxNumSkipHeadRows; + for (String column : this.columns) { Preconditions.checkArgument(!column.contains(","), "Column[%s] has a comma, it cannot", column); } @@ -77,6 +80,12 @@ public List getColumns() return columns; } + @JsonProperty("maxNumSkipHeadRows") + public Integer getMaxNumSkipHeadRows() + { + return maxNumSkipHeadRows; + } + @Override public void verify(List usedCols) { @@ -90,7 +99,8 @@ public Parser makeParser() { Parser retVal = new DelimitedParser( Optional.fromNullable(delimiter), - Optional.fromNullable(listDelimiter) + Optional.fromNullable(listDelimiter), + maxNumSkipHeadRows ); retVal.setFieldNames(columns); return retVal; @@ -99,27 +109,60 @@ public Parser makeParser() @Override public ParseSpec withTimestampSpec(TimestampSpec spec) { - return new DelimitedParseSpec(spec, getDimensionsSpec(), delimiter, listDelimiter, columns); + return new DelimitedParseSpec(spec, getDimensionsSpec(), delimiter, listDelimiter, columns, maxNumSkipHeadRows); } @Override public ParseSpec withDimensionsSpec(DimensionsSpec spec) { - return new DelimitedParseSpec(getTimestampSpec(), spec, delimiter, listDelimiter, columns); + return new DelimitedParseSpec(getTimestampSpec(), spec, delimiter, listDelimiter, columns, maxNumSkipHeadRows); } public ParseSpec withDelimiter(String delim) { - return new DelimitedParseSpec(getTimestampSpec(), getDimensionsSpec(), delim, listDelimiter, columns); + return new DelimitedParseSpec( + getTimestampSpec(), + getDimensionsSpec(), + delim, + listDelimiter, + columns, + maxNumSkipHeadRows + ); } public ParseSpec withListDelimiter(String delim) { - return new DelimitedParseSpec(getTimestampSpec(), getDimensionsSpec(), delimiter, delim, columns); + return new DelimitedParseSpec( + 
getTimestampSpec(), + getDimensionsSpec(), + delimiter, + delim, + columns, + maxNumSkipHeadRows + ); } public ParseSpec withColumns(List cols) { - return new DelimitedParseSpec(getTimestampSpec(), getDimensionsSpec(), delimiter, listDelimiter, cols); + return new DelimitedParseSpec( + getTimestampSpec(), + getDimensionsSpec(), + delimiter, + listDelimiter, + cols, + maxNumSkipHeadRows + ); + } + + public ParseSpec withMaxNumSkipHeadRows(Integer maxNumSkipHeadRows) + { + return new DelimitedParseSpec( + getTimestampSpec(), + getDimensionsSpec(), + delimiter, + listDelimiter, + columns, + maxNumSkipHeadRows + ); } } diff --git a/api/src/test/java/io/druid/data/input/impl/CSVParseSpecTest.java b/api/src/test/java/io/druid/data/input/impl/CSVParseSpecTest.java index 7d99a2b804de..ed13ca931123 100644 --- a/api/src/test/java/io/druid/data/input/impl/CSVParseSpecTest.java +++ b/api/src/test/java/io/druid/data/input/impl/CSVParseSpecTest.java @@ -41,7 +41,8 @@ public void testColumnMissing() throws Exception Lists.newArrayList() ), ",", - Arrays.asList("a") + Arrays.asList("a"), + null ); } @@ -60,7 +61,8 @@ public void testComma() throws Exception Lists.newArrayList() ), ",", - Arrays.asList("a") + Arrays.asList("a"), + null ); } } diff --git a/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java b/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java index 1ebad8c64141..237894ff12cb 100644 --- a/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java +++ b/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java @@ -40,7 +40,8 @@ public void testSerde() throws IOException new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList("abc")), null, null), "\u0001", "\u0002", - Arrays.asList("abc") + Arrays.asList("abc"), + null ); final DelimitedParseSpec serde = jsonMapper.readValue( jsonMapper.writeValueAsString(spec), @@ -71,7 +72,8 @@ public void testColumnMissing() throws Exception ), ",", " ", - 
Arrays.asList("a") + Arrays.asList("a"), + null ); } @@ -91,7 +93,8 @@ public void testComma() throws Exception ), ",", null, - Arrays.asList("a") + Arrays.asList("a"), + null ); } @@ -111,6 +114,7 @@ public void testDefaultColumnList(){ ",", null, // pass null columns not allowed + null, null ); } diff --git a/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java b/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java index 87f5d20fcd5c..fd779ad399f4 100644 --- a/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java +++ b/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java @@ -67,7 +67,8 @@ public LineIterator apply(String s) new TimestampSpec("ts", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("x")), null, null), ",", - ImmutableList.of("ts", "x") + ImmutableList.of("ts", "x"), + 0 ), null ); diff --git a/api/src/test/java/io/druid/data/input/impl/ParseSpecTest.java b/api/src/test/java/io/druid/data/input/impl/ParseSpecTest.java index 4b38453fe35f..589b804fa851 100644 --- a/api/src/test/java/io/druid/data/input/impl/ParseSpecTest.java +++ b/api/src/test/java/io/druid/data/input/impl/ParseSpecTest.java @@ -45,7 +45,8 @@ public void testDuplicateNames() throws Exception ), ",", " ", - Arrays.asList("a", "b") + Arrays.asList("a", "b"), + null ); } @@ -65,7 +66,8 @@ public void testDimAndDimExcluOverlap() throws Exception ), ",", null, - Arrays.asList("a", "B") + Arrays.asList("a", "B"), + null ); } @@ -85,7 +87,8 @@ public void testDimExclusionDuplicate() throws Exception ), ",", null, - Arrays.asList("a", "B") + Arrays.asList("a", "B"), + null ); } } diff --git a/docs/content/development/extensions-core/lookups-cached-global.md b/docs/content/development/extensions-core/lookups-cached-global.md index 72d346201c1d..70d223da559e 100644 --- a/docs/content/development/extensions-core/lookups-cached-global.md +++ 
b/docs/content/development/extensions-core/lookups-cached-global.md @@ -201,6 +201,7 @@ Only ONE file which matches the search will be used. For most implementations, t |`columns`|The list of columns in the csv file|yes|`null`| |`keyColumn`|The name of the column containing the key|no|The first column| |`valueColumn`|The name of the column containing the value|no|The second column| +|`maxNumSkipHeadRows`|The number of head rows to be skipped|no|0| *example input* @@ -217,7 +218,8 @@ truck,something3,buck "format": "csv", "columns": ["value","somethingElse","key"], "keyColumn": "key", - "valueColumn": "value" + "valueColumn": "value", + "maxNumSkipHeadRows": 5 } ``` @@ -230,6 +232,7 @@ truck,something3,buck |`valueColumn`|The name of the column containing the value|no|The second column| |`delimiter`|The delimiter in the file|no|tab (`\t`)| |`listDelimiter`|The list delimiter in the file|no| (`\u0001`)| +|`maxNumSkipHeadRows`|The number of head rows to be skipped|no|0| *example input* @@ -248,7 +251,8 @@ truck|something,3|buck "columns": ["value","somethingElse","key"], "keyColumn": "key", "valueColumn": "value", - "delimiter": "|" + "delimiter": "|", + "maxNumSkipHeadRows": 5 } ``` diff --git a/docs/content/ingestion/data-formats.md b/docs/content/ingestion/data-formats.md index 95807cc60a6a..5b13939152b6 100644 --- a/docs/content/ingestion/data-formats.md +++ b/docs/content/ingestion/data-formats.md @@ -83,11 +83,14 @@ Since the CSV data cannot contain the column names (no header is allowed), these "columns" : ["timestamp","page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city","added","deleted","delta"], "dimensionsSpec" : { "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] - } + }, + "maxNumSkipHeadRows" : 5 } ``` The `columns` field must match the columns of your input data in the same order. 
+Additionally, you can set the number of head rows to be skipped to `maxNumSkipHeadRows`. Note that this option is effective +only for non-Hadoop index tasks. ### TSV @@ -101,13 +104,16 @@ The `columns` field must match the columns of your input data in the same order. "delimiter":"|", "dimensionsSpec" : { "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] - } + }, + "maxNumSkipHeadRows" : 5 } ``` The `columns` field must match the columns of your input data in the same order. Be sure to change the `delimiter` to the appropriate delimiter for your data. Like CSV, you must specify the columns and which subset of the columns you want indexed. +Additionally, you can set the number of head rows to be skipped to `maxNumSkipHeadRows`. Note that this option is effective +only for non-Hadoop index tasks. ### Regex diff --git a/extensions-contrib/virtual-columns/src/test/java/io/druid/segment/MapVirtualColumnTest.java b/extensions-contrib/virtual-columns/src/test/java/io/druid/segment/MapVirtualColumnTest.java index 46d99cbbcc96..6a4886e38e77 100644 --- a/extensions-contrib/virtual-columns/src/test/java/io/druid/segment/MapVirtualColumnTest.java +++ b/extensions-contrib/virtual-columns/src/test/java/io/druid/segment/MapVirtualColumnTest.java @@ -96,7 +96,8 @@ public static Iterable constructorFeeder() throws IOException new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList("dim", "keys", "values")), null, null), "\t", ",", - Arrays.asList("ts", "dim", "keys", "values") + Arrays.asList("ts", "dim", "keys", "values"), + null ) , "utf8" ); diff --git a/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java b/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java index a7c27c66638e..c562cae8e588 100644 --- 
a/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java +++ b/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java @@ -33,7 +33,6 @@ import com.google.common.base.Throwables; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; - import io.druid.guice.annotations.Json; import io.druid.java.util.common.IAE; import io.druid.java.util.common.UOE; @@ -41,7 +40,6 @@ import io.druid.java.util.common.parsers.DelimitedParser; import io.druid.java.util.common.parsers.JSONParser; import io.druid.java.util.common.parsers.Parser; - import org.joda.time.Period; import javax.annotation.Nullable; @@ -264,7 +262,8 @@ public static class CSVFlatDataParser implements FlatDataParser public CSVFlatDataParser( @JsonProperty("columns") List columns, @JsonProperty("keyColumn") final String keyColumn, - @JsonProperty("valueColumn") final String valueColumn + @JsonProperty("valueColumn") final String valueColumn, + @JsonProperty("maxNumSkipHeadRows") final Integer maxNumSkipHeadRows ) { Preconditions.checkArgument( @@ -293,7 +292,7 @@ public CSVFlatDataParser( ); this.parser = new DelegateParser( - new CSVParser(Optional.absent(), columns), + new CSVParser(Optional.absent(), columns, maxNumSkipHeadRows), this.keyColumn, this.valueColumn ); @@ -373,7 +372,8 @@ public TSVFlatDataParser( @JsonProperty("delimiter") String delimiter, @JsonProperty("listDelimiter") String listDelimiter, @JsonProperty("keyColumn") final String keyColumn, - @JsonProperty("valueColumn") final String valueColumn + @JsonProperty("valueColumn") final String valueColumn, + @JsonProperty("maxNumSkipHeadRows") final Integer maxNumSkipHeadRows ) { Preconditions.checkArgument( @@ -382,7 +382,8 @@ public TSVFlatDataParser( ); final DelimitedParser delegate = new DelimitedParser( Optional.fromNullable(Strings.emptyToNull(delimiter)), - 
Optional.fromNullable(Strings.emptyToNull(listDelimiter)) + Optional.fromNullable(Strings.emptyToNull(listDelimiter)), + maxNumSkipHeadRows ); Preconditions.checkArgument( !(Strings.isNullOrEmpty(keyColumn) ^ Strings.isNullOrEmpty(valueColumn)), diff --git a/extensions-core/lookups-cached-global/src/test/java/io/druid/query/lookup/namespace/URIExtractionNamespaceTest.java b/extensions-core/lookups-cached-global/src/test/java/io/druid/query/lookup/namespace/URIExtractionNamespaceTest.java index 6d620b73e0b1..4227eab20d8b 100644 --- a/extensions-core/lookups-cached-global/src/test/java/io/druid/query/lookup/namespace/URIExtractionNamespaceTest.java +++ b/extensions-core/lookups-cached-global/src/test/java/io/druid/query/lookup/namespace/URIExtractionNamespaceTest.java @@ -84,7 +84,8 @@ public void testCSV() "col1", "col2", "col3" - ), "col2", "col3" + ), "col2", "col3", + null ); Assert.assertEquals(ImmutableMap.of("B", "C"), parser.getParser().parse("A,B,C")); } @@ -97,7 +98,8 @@ public void testBadCSV() "col1", "col2", "col3" - ), "col2", "col3ADFSDF" + ), "col2", "col3ADFSDF", + null ); Assert.assertEquals(ImmutableMap.of("B", "C"), parser.getParser().parse("A,B,C")); } @@ -110,7 +112,8 @@ public void testBadCSV2() "col1", "col2", "col3" - ), "col2", "col3" + ), "col2", "col3", + null ); Map map = parser.getParser().parse("A"); } @@ -122,7 +125,8 @@ public void testTSV() ImmutableList.of("col1", "col2", "col3"), "|", null, "col2", - "col3" + "col3", + null ); Assert.assertEquals(ImmutableMap.of("B", "C"), parser.getParser().parse("A|B|C")); } @@ -134,7 +138,8 @@ public void testWithListDelimiterTSV() ImmutableList.of("col1", "col2", "col3"), "\\u0001", "\\u0002", "col2", - "col3" + "col3", + null ); Assert.assertEquals(ImmutableMap.of("B", "C"), parser.getParser().parse("A\\u0001B\\u0001C")); } @@ -146,7 +151,8 @@ public void testBadTSV() ImmutableList.of("col1", "col2", "col3fdsfds"), ",", null, "col2", - "col3" + "col3", + null ); Map map = 
parser.getParser().parse("A,B,C"); Assert.assertEquals(ImmutableMap.of("B", "C"), parser.getParser().parse("A,B,C")); @@ -160,7 +166,8 @@ public void testBadTSV2() ImmutableList.of("col1", "col2", "col3"), ",", null, "col2", - "col3" + "col3", + null ); Map map = parser.getParser().parse("A"); Assert.assertEquals(ImmutableMap.of("B", "C"), parser.getParser().parse("A,B,C")); @@ -301,11 +308,12 @@ public void testSimpleJSONSerDe() throws IOException "col1", "col2", "col3" - ), "col2", "col3" + ), "col2", "col3", + null ), new URIExtractionNamespace.ObjectMapperFlatDataParser(mapper), new URIExtractionNamespace.JSONFlatDataParser(mapper, "keyField", "valueField"), - new URIExtractionNamespace.TSVFlatDataParser(ImmutableList.of("A", "B"), ",", null, "A", "B") + new URIExtractionNamespace.TSVFlatDataParser(ImmutableList.of("A", "B"), ",", null, "A", "B", null) )) { final String str = mapper.writeValueAsString(parser); final URIExtractionNamespace.FlatDataParser parser2 = mapper.readValue( @@ -326,11 +334,12 @@ public void testSimpleToString() throws IOException "col1", "col2", "col3" - ), "col2", "col3" + ), "col2", "col3", + null ), new URIExtractionNamespace.ObjectMapperFlatDataParser(mapper), new URIExtractionNamespace.JSONFlatDataParser(mapper, "keyField", "valueField"), - new URIExtractionNamespace.TSVFlatDataParser(ImmutableList.of("A", "B"), ",", null, "A", "B") + new URIExtractionNamespace.TSVFlatDataParser(ImmutableList.of("A", "B"), ",", null, "A", "B", null) )) { Assert.assertFalse(parser.toString().contains("@")); } diff --git a/indexing-hadoop/src/main/java/io/druid/indexer/HadoopDruidIndexerConfig.java b/indexing-hadoop/src/main/java/io/druid/indexer/HadoopDruidIndexerConfig.java index 77938c687021..368b23ea59ae 100644 --- a/indexing-hadoop/src/main/java/io/druid/indexer/HadoopDruidIndexerConfig.java +++ b/indexing-hadoop/src/main/java/io/druid/indexer/HadoopDruidIndexerConfig.java @@ -39,7 +39,10 @@ import com.google.inject.Module; import 
io.druid.common.utils.JodaUtils; import io.druid.data.input.InputRow; +import io.druid.data.input.impl.CSVParseSpec; +import io.druid.data.input.impl.DelimitedParseSpec; import io.druid.data.input.impl.InputRowParser; +import io.druid.data.input.impl.ParseSpec; import io.druid.guice.GuiceInjectors; import io.druid.guice.JsonConfigProvider; import io.druid.guice.annotations.Self; @@ -358,7 +361,15 @@ public boolean isCombineText() public InputRowParser getParser() { - return schema.getDataSchema().getParser(); + final InputRowParser parser = schema.getDataSchema().getParser(); + final ParseSpec parseSpec = parser.getParseSpec(); + if (parseSpec instanceof CSVParseSpec) { + return parser.withParseSpec(((CSVParseSpec) parseSpec).withMaxNumSkipHeadRows(0)); + } else if (parseSpec instanceof DelimitedParseSpec) { + return parser.withParseSpec(((DelimitedParseSpec) parseSpec).withMaxNumSkipHeadRows(0)); + } else { + return parser; + } } public HadoopyShardSpec getShardSpec(Bucket bucket) diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/BatchDeltaIngestionTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/BatchDeltaIngestionTest.java index 7201bd119133..6b292a19095d 100644 --- a/indexing-hadoop/src/test/java/io/druid/indexer/BatchDeltaIngestionTest.java +++ b/indexing-hadoop/src/test/java/io/druid/indexer/BatchDeltaIngestionTest.java @@ -346,7 +346,8 @@ private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig(Map constructFeed() new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, - ImmutableList.of("timestamp", "host", "visited_num") + ImmutableList.of("timestamp", "host", "visited_num"), + null ), null ), @@ -188,7 +189,8 @@ public static Collection constructFeed() new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, - ImmutableList.of("timestamp", "host", 
"visited_num") + ImmutableList.of("timestamp", "host", "visited_num"), + null ) ), null, @@ -233,7 +235,8 @@ public static Collection constructFeed() new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, - ImmutableList.of("timestamp", "host", "visited_num") + ImmutableList.of("timestamp", "host", "visited_num"), + null ), null ), @@ -289,7 +292,8 @@ public static Collection constructFeed() new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, - ImmutableList.of("timestamp", "host", "visited_num") + ImmutableList.of("timestamp", "host", "visited_num"), + null ) ), null, diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java index 8af6c470ff69..80ba3ba6b7ce 100644 --- a/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java +++ b/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java @@ -77,7 +77,8 @@ public void setup() throws Exception new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, - ImmutableList.of("timestamp", "host", "visited_num") + ImmutableList.of("timestamp", "host", "visited_num"), + null ), null ), diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/path/DatasourcePathSpecTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/path/DatasourcePathSpecTest.java index f1c7d558c909..1f5929fa938f 100644 --- a/indexing-hadoop/src/test/java/io/druid/indexer/path/DatasourcePathSpecTest.java +++ b/indexing-hadoop/src/test/java/io/druid/indexer/path/DatasourcePathSpecTest.java @@ -265,7 +265,8 @@ private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig() new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(null, null, null), 
null, - ImmutableList.of("timestamp", "host", "visited") + ImmutableList.of("timestamp", "host", "visited"), + null ), null ), diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java index 4b9adba54c4f..a1332296f6be 100644 --- a/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java +++ b/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java @@ -164,7 +164,8 @@ public InputStream openStream() throws IOException new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList(TestIndex.DIMENSIONS)), null, null), "\t", "\u0001", - Arrays.asList(TestIndex.COLUMNS) + Arrays.asList(TestIndex.COLUMNS), + null ), null ), diff --git a/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java b/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java index 81e85c70dba1..c2f00ea737bc 100644 --- a/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java +++ b/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java @@ -458,7 +458,8 @@ private IndexTask.IndexIngestionSpec createIngestionSpec( Lists.newArrayList() ), null, - Arrays.asList("ts", "dim", "val") + Arrays.asList("ts", "dim", "val"), + null ), null ), diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java index e45ed7e6acc7..80e639025aeb 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java @@ -33,18 +33,24 @@ public class CSVParser implements Parser { + private static final int DEFAULT_NUM_SKIP_HEAD_ROWS = 0; + private final String listDelimiter; private final Splitter listSplitter; private final Function valueFunction; + private 
final int maxNumSkipHeadRows; private final au.com.bytecode.opencsv.CSVParser parser = new au.com.bytecode.opencsv.CSVParser(); private ArrayList fieldNames = null; - public CSVParser(final Optional listDelimiter) + private int numSkippedRows; + + public CSVParser(final Optional listDelimiter, Integer maxNumSkipHeadRows) { this.listDelimiter = listDelimiter.isPresent() ? listDelimiter.get() : Parsers.DEFAULT_LIST_DELIMITER; this.listSplitter = Splitter.on(this.listDelimiter); + this.maxNumSkipHeadRows = maxNumSkipHeadRows == null ? DEFAULT_NUM_SKIP_HEAD_ROWS : maxNumSkipHeadRows; this.valueFunction = new Function() { @Override @@ -64,16 +70,16 @@ public Object apply(String input) }; } - public CSVParser(final Optional listDelimiter, final Iterable fieldNames) + public CSVParser(final Optional listDelimiter, final Iterable fieldNames, Integer maxNumSkipHeadRows) { - this(listDelimiter); + this(listDelimiter, maxNumSkipHeadRows); setFieldNames(fieldNames); } - public CSVParser(final Optional listDelimiter, final String header) + public CSVParser(final Optional listDelimiter, final String header, Integer maxNumSkipHeadRows) { - this(listDelimiter); + this(listDelimiter, maxNumSkipHeadRows); setFieldNames(header); } @@ -109,6 +115,10 @@ public void setFieldNames(final String header) @Override public Map parse(final String input) { + if (numSkippedRows++ < maxNumSkipHeadRows) { + return null; + } + try { String[] values = parser.parseLine(input); diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java index c37c4bda989c..92605e96b06c 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java @@ -34,10 +34,12 @@ public class DelimitedParser implements Parser { + private static final int DEFAULT_NUM_SKIP_HEAD_ROWS = 0; private static final 
String DEFAULT_DELIMITER = "\t"; private final String delimiter; private final String listDelimiter; + private final int maxNumSkipHeadRows; private final Splitter splitter; private final Splitter listSplitter; @@ -45,10 +47,13 @@ public class DelimitedParser implements Parser private ArrayList fieldNames = null; - public DelimitedParser(final Optional delimiter, Optional listDelimiter) + private int numSkippedRows; + + public DelimitedParser(final Optional delimiter, Optional listDelimiter, Integer maxNumSkipHeadRows) { this.delimiter = delimiter.isPresent() ? delimiter.get() : DEFAULT_DELIMITER; this.listDelimiter = listDelimiter.isPresent() ? listDelimiter.get() : Parsers.DEFAULT_LIST_DELIMITER; + this.maxNumSkipHeadRows = maxNumSkipHeadRows == null ? DEFAULT_NUM_SKIP_HEAD_ROWS : maxNumSkipHeadRows; Preconditions.checkState( !this.delimiter.equals(this.listDelimiter), @@ -80,18 +85,24 @@ public Object apply(String input) public DelimitedParser( final Optional delimiter, final Optional listDelimiter, - final Iterable fieldNames + final Iterable fieldNames, + final Integer maxNumSkipHeadRows ) { - this(delimiter, listDelimiter); + this(delimiter, listDelimiter, maxNumSkipHeadRows); setFieldNames(fieldNames); } - public DelimitedParser(final Optional delimiter, final Optional listDelimiter, final String header) + public DelimitedParser( + final Optional delimiter, + final Optional listDelimiter, + final String header, + final Integer maxNumSkipHeadRows + ) { - this(delimiter, listDelimiter); + this(delimiter, listDelimiter, maxNumSkipHeadRows); setFieldNames(header); } @@ -132,6 +143,10 @@ public void setFieldNames(String header) @Override public Map parse(final String input) { + if (numSkippedRows++ < maxNumSkipHeadRows) { + return null; + } + try { Iterable values = splitter.split(input); diff --git a/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java b/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java index 
8121fd9fd0c7..ee552dfffe8d 100644 --- a/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java +++ b/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java @@ -21,7 +21,7 @@ import com.google.common.base.Optional; import com.google.common.collect.ImmutableMap; -import junit.framework.Assert; +import org.junit.Assert; import org.junit.Test; import java.util.Map; @@ -36,7 +36,7 @@ public void testValidHeader() final Parser csvParser; boolean parseable = true; try { - csvParser = new CSVParser(Optional.fromNullable(null), csv); + csvParser = new CSVParser(Optional.fromNullable(null), csv, 0); } catch (Exception e) { parseable = false; @@ -53,7 +53,7 @@ public void testInvalidHeader() final Parser csvParser; boolean parseable = true; try { - csvParser = new CSVParser(Optional.fromNullable(null), csv); + csvParser = new CSVParser(Optional.fromNullable(null), csv, 0); } catch (Exception e) { parseable = false; @@ -67,7 +67,7 @@ public void testInvalidHeader() public void testCSVParserWithHeader() { String header = "time,value1,value2"; - final Parser csvParser = new CSVParser(Optional.fromNullable(null), header); + final Parser csvParser = new CSVParser(Optional.fromNullable(null), header, 0); String body = "hello,world,foo"; final Map jsonMap = csvParser.parse(body); Assert.assertEquals( @@ -80,7 +80,7 @@ public void testCSVParserWithHeader() @Test public void testCSVParserWithoutHeader() { - final Parser csvParser = new CSVParser(Optional.fromNullable(null)); + final Parser csvParser = new CSVParser(Optional.fromNullable(null), 0); String body = "hello,world,foo"; final Map jsonMap = csvParser.parse(body); Assert.assertEquals( @@ -89,4 +89,26 @@ public void testCSVParserWithoutHeader() jsonMap ); } + + @Test + public void testCSVParserSkipHeadRows() + { + final Parser csvParser = new CSVParser(Optional.fromNullable(null), 2); + String[] texts = new String[] { + "1st,header,line", + "2nd,header,line", + "hello,world,foo" + }; 
+ int i; + for (i = 0; i < 2; i++) { + Assert.assertNull(csvParser.parse(texts[i])); + } + + final Map jsonMap = csvParser.parse(texts[i]); + Assert.assertEquals( + "jsonMap", + ImmutableMap.of("column_1", "hello", "column_2", "world", "column_3", "foo"), + jsonMap + ); + } } diff --git a/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java b/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java index 3ca58c67f671..f79cd0a04464 100644 --- a/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java +++ b/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java @@ -21,7 +21,7 @@ import com.google.common.base.Optional; import com.google.common.collect.ImmutableMap; -import junit.framework.Assert; +import org.junit.Assert; import org.junit.Test; import java.util.Map; @@ -36,7 +36,7 @@ public void testValidHeader() final Parser delimitedParser; boolean parseable = true; try { - delimitedParser = new DelimitedParser(Optional.of("\t"), Optional.absent(), tsv); + delimitedParser = new DelimitedParser(Optional.of("\t"), Optional.absent(), tsv, 0); } catch (Exception e) { parseable = false; @@ -53,7 +53,7 @@ public void testInvalidHeader() final Parser delimitedParser; boolean parseable = true; try { - delimitedParser = new DelimitedParser(Optional.of("\t"), Optional.absent(), tsv); + delimitedParser = new DelimitedParser(Optional.of("\t"), Optional.absent(), tsv, 0); } catch (Exception e) { parseable = false; @@ -67,7 +67,12 @@ public void testInvalidHeader() public void testTSVParserWithHeader() { String header = "time\tvalue1\tvalue2"; - final Parser delimitedParser = new DelimitedParser(Optional.of("\t"), Optional.absent(), header); + final Parser delimitedParser = new DelimitedParser( + Optional.of("\t"), + Optional.absent(), + header, + 0 + ); String body = "hello\tworld\tfoo"; final Map jsonMap = delimitedParser.parse(body); Assert.assertEquals( @@ -80,7 +85,7 
@@ public void testTSVParserWithHeader() @Test public void testTSVParserWithoutHeader() { - final Parser delimitedParser = new DelimitedParser(Optional.of("\t"), Optional.absent()); + final Parser delimitedParser = new DelimitedParser(Optional.of("\t"), Optional.absent(), 0); String body = "hello\tworld\tfoo"; final Map jsonMap = delimitedParser.parse(body); Assert.assertEquals( @@ -89,4 +94,26 @@ public void testTSVParserWithoutHeader() jsonMap ); } + + @Test + public void testTSVParserSkipHeadRows() + { + final Parser parser = new DelimitedParser(Optional.of("\t"), Optional.absent(), 2); + String[] texts = new String[] { + "1st\theader\tline", + "2nd\theader\tline", + "hello\tworld\tfoo" + }; + int i; + for (i = 0; i < 2; i++) { + org.junit.Assert.assertNull(parser.parse(texts[i])); + } + + final Map jsonMap = parser.parse(texts[i]); + Assert.assertEquals( + "jsonMap", + ImmutableMap.of("column_1", "hello", "column_2", "world", "column_3", "foo"), + jsonMap + ); + } } diff --git a/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java b/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java index d4fb1223ec82..7ebf18d438ea 100644 --- a/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java +++ b/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java @@ -130,7 +130,8 @@ public static void setupClass() throws Exception new TimestampSpec("timestamp", "iso", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("product", "tags")), null, null), "\t", - ImmutableList.of("timestamp", "product", "tags") + ImmutableList.of("timestamp", "product", "tags"), + null ), "UTF-8" ); diff --git a/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerFactoryTest.java b/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerFactoryTest.java index dc7648154da2..ee3cd64c75db 100644 --- a/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerFactoryTest.java +++ 
b/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerFactoryTest.java @@ -144,7 +144,8 @@ private Segment createSegment() throws Exception new TimestampSpec("timestamp", "iso", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("product", "tags")), null, null), "\t", - ImmutableList.of("timestamp", "product", "tags") + ImmutableList.of("timestamp", "product", "tags"), + null ), "UTF-8" ); diff --git a/processing/src/test/java/io/druid/segment/TestIndex.java b/processing/src/test/java/io/druid/segment/TestIndex.java index 6f5297684919..cdd17ebcb903 100644 --- a/processing/src/test/java/io/druid/segment/TestIndex.java +++ b/processing/src/test/java/io/druid/segment/TestIndex.java @@ -291,7 +291,8 @@ public static IncrementalIndex loadIncrementalIndex( new DimensionsSpec(DIMENSION_SCHEMAS, null, null), "\t", "\u0001", - Arrays.asList(COLUMNS) + Arrays.asList(COLUMNS), + null ) , "utf8" ); diff --git a/server/src/test/java/io/druid/segment/realtime/firehose/IngestSegmentFirehoseTest.java b/server/src/test/java/io/druid/segment/realtime/firehose/IngestSegmentFirehoseTest.java index ea8b5590701a..83f6a32e5ddd 100644 --- a/server/src/test/java/io/druid/segment/realtime/firehose/IngestSegmentFirehoseTest.java +++ b/server/src/test/java/io/druid/segment/realtime/firehose/IngestSegmentFirehoseTest.java @@ -108,7 +108,8 @@ private void createTestIndex(File segmentDir) throws Exception new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, - ImmutableList.of("timestamp", "host", "visited") + ImmutableList.of("timestamp", "host", "visited"), + null ), Charsets.UTF_8.toString() ); From 1eaffbd9889ede3492c60faf8cfb15ac9f4e459a Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Sun, 7 May 2017 15:17:21 +0900 Subject: [PATCH 11/20] Move checking skipHeadRows to FileIteratingFirehose --- .../druid/data/input/impl/CSVParseSpec.java | 7 +- 
.../data/input/impl/DelimitedParseSpec.java | 16 +-- .../input/impl/FileIteratingFirehose.java | 47 ++++--- .../input/impl/FileIteratingFirehoseTest.java | 118 +++++++++++------- .../extensions-core/lookups-cached-global.md | 8 +- .../namespace/URIExtractionNamespace.java | 11 +- .../namespace/URIExtractionNamespaceTest.java | 31 ++--- .../indexer/HadoopDruidIndexerConfig.java | 13 +- .../java/util/common/parsers/CSVParser.java | 20 +-- .../util/common/parsers/DelimitedParser.java | 21 +--- .../util/common/parsers/CSVParserTest.java | 30 +---- .../common/parsers/DelimitedParserTest.java | 31 +---- 12 files changed, 144 insertions(+), 209 deletions(-) diff --git a/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java b/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java index fa27158814ad..d0fbea5e7276 100644 --- a/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java @@ -88,7 +88,7 @@ public void verify(List usedCols) @Override public Parser makeParser() { - return new CSVParser(Optional.fromNullable(listDelimiter), columns, maxNumSkipHeadRows); + return new CSVParser(Optional.fromNullable(listDelimiter), columns); } @Override @@ -107,9 +107,4 @@ public ParseSpec withColumns(List cols) { return new CSVParseSpec(getTimestampSpec(), getDimensionsSpec(), listDelimiter, cols, maxNumSkipHeadRows); } - - public ParseSpec withMaxNumSkipHeadRows(Integer maxNumSkipHeadRows) - { - return new CSVParseSpec(getTimestampSpec(), getDimensionsSpec(), listDelimiter, columns, maxNumSkipHeadRows); - } } diff --git a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java index 410a65c3e9ac..10de474b4839 100644 --- a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java @@ -23,7 +23,6 @@ import 
com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Optional; import com.google.common.base.Preconditions; - import io.druid.java.util.common.parsers.DelimitedParser; import io.druid.java.util.common.parsers.Parser; @@ -99,8 +98,7 @@ public Parser makeParser() { Parser retVal = new DelimitedParser( Optional.fromNullable(delimiter), - Optional.fromNullable(listDelimiter), - maxNumSkipHeadRows + Optional.fromNullable(listDelimiter) ); retVal.setFieldNames(columns); return retVal; @@ -153,16 +151,4 @@ public ParseSpec withColumns(List cols) maxNumSkipHeadRows ); } - - public ParseSpec withMaxNumSkipHeadRows(Integer maxNumSkipHeadRows) - { - return new DelimitedParseSpec( - getTimestampSpec(), - getDimensionsSpec(), - delimiter, - listDelimiter, - columns, - maxNumSkipHeadRows - ); - } } diff --git a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java index 97e33f04a894..30f3d4f1758c 100644 --- a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java +++ b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java @@ -19,7 +19,7 @@ package io.druid.data.input.impl; -import com.google.common.base.Throwables; +import com.google.common.collect.Iterators; import io.druid.data.input.Firehose; import io.druid.data.input.InputRow; import io.druid.utils.Runnables; @@ -27,13 +27,17 @@ import java.io.IOException; import java.util.Iterator; +import java.util.NoSuchElementException; /** */ public class FileIteratingFirehose implements Firehose { + private static final int DEFAULT_NUM_SKIP_HEAD_ROWS = 0; + private final Iterator lineIterators; private final StringInputRowParser parser; + private final int maxNumSkipHeadRows; private LineIterator lineIterator = null; @@ -42,15 +46,29 @@ public FileIteratingFirehose( StringInputRowParser parser ) { - this.lineIterators = lineIterators; + // Skip empty files + this.lineIterators = 
Iterators.filter(lineIterators, iterator -> iterator != null && iterator.hasNext()); this.parser = parser; + final ParseSpec parseSpec = parser.getParseSpec(); + if (parseSpec instanceof CSVParseSpec) { + final Integer maxNumSkipHeadRows = ((CSVParseSpec) parseSpec).getMaxNumSkipHeadRows(); + this.maxNumSkipHeadRows = maxNumSkipHeadRows == null ? DEFAULT_NUM_SKIP_HEAD_ROWS : maxNumSkipHeadRows; + } else if (parseSpec instanceof DelimitedParseSpec) { + final Integer maxNumSkipHeadRows = ((DelimitedParseSpec) parseSpec).getMaxNumSkipHeadRows(); + this.maxNumSkipHeadRows = maxNumSkipHeadRows == null ? DEFAULT_NUM_SKIP_HEAD_ROWS : maxNumSkipHeadRows; + } else { + maxNumSkipHeadRows = DEFAULT_NUM_SKIP_HEAD_ROWS; + } } @Override public boolean hasMore() { while ((lineIterator == null || !lineIterator.hasNext()) && lineIterators.hasNext()) { - lineIterator = lineIterators.next(); + lineIterator = getNextLineIterator(); + for (int i = 0; i < maxNumSkipHeadRows && lineIterator.hasNext(); i++) { + lineIterator.next(); + } } return lineIterator != null && lineIterator.hasNext(); @@ -59,21 +77,20 @@ public boolean hasMore() @Override public InputRow nextRow() { - try { - if (lineIterator == null || !lineIterator.hasNext()) { - // Close old streams, maybe. 
- if (lineIterator != null) { - lineIterator.close(); - } + if (!hasMore()) { + throw new NoSuchElementException(); + } - lineIterator = lineIterators.next(); - } + return parser.parse(lineIterator.next()); + } - return parser.parse(lineIterator.next()); - } - catch (Exception e) { - throw Throwables.propagate(e); + private LineIterator getNextLineIterator() + { + if (lineIterator != null) { + lineIterator.close(); } + + return lineIterators.next(); } @Override diff --git a/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java b/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java index fd779ad399f4..41a3bb16f536 100644 --- a/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java +++ b/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java @@ -19,68 +19,98 @@ package io.druid.data.input.impl; -import com.google.common.base.Function; import com.google.common.base.Joiner; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; - -import io.druid.java.util.common.Pair; -import junit.framework.Assert; - import org.apache.commons.io.LineIterator; +import org.junit.Assert; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; +import java.io.IOException; import java.io.StringReader; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.List; +import java.util.stream.Collectors; +@RunWith(Parameterized.class) public class FileIteratingFirehoseTest { - private static final List>> fixtures = ImmutableList.of( - Pair.of(new String[]{"2000,foo"}, ImmutableList.of("foo")), - Pair.of(new String[]{"2000,foo\n2000,bar\n"}, ImmutableList.of("foo", "bar")), - Pair.of(new String[]{"2000,foo\n2000,bar\n", "2000,baz"}, ImmutableList.of("foo", "bar", "baz")), - Pair.of(new String[]{"2000,foo\n2000,bar\n", "", "2000,baz"}, 
ImmutableList.of("foo", "bar", "baz")), - Pair.of(new String[]{"2000,foo\n2000,bar\n", "", "2000,baz", ""}, ImmutableList.of("foo", "bar", "baz")), - Pair.of(new String[]{""}, ImmutableList.of()), - Pair.of(new String[]{}, ImmutableList.of()) - ); + @Parameters(name = "{0}, {1}") + public static Collection constructorFeeder() throws IOException + { + final List> inputTexts = ImmutableList.of( + ImmutableList.of("2000,foo"), + ImmutableList.of("2000,foo\n2000,bar\n"), + ImmutableList.of("2000,foo\n2000,bar\n", "2000,baz"), + ImmutableList.of("2000,foo\n2000,bar\n", "", "2000,baz"), + ImmutableList.of("2000,foo\n2000,bar\n", "", "2000,baz", ""), + ImmutableList.of("2000,foo\n2000,bar\n2000,baz", "", "2000,baz", "2000,foo\n2000,bar\n3000,baz"), + ImmutableList.of(""), + ImmutableList.of() + ); - @Test - public void testFirehose() throws Exception + final List args = new ArrayList<>(); + for (int numSkipHeadRows = 0; numSkipHeadRows < 3; numSkipHeadRows++) { + for (List texts : inputTexts) { + args.add(new Object[] {texts, numSkipHeadRows}); + } + } + + return args; + } + + private final StringInputRowParser parser; + private final List inputs; + private final List expectedResults; + + public FileIteratingFirehoseTest(List texts, int numSkipHeadRows) { - for (Pair> fixture : fixtures) { - final List lineIterators = Lists.transform( - Arrays.asList(fixture.lhs), - new Function() - { - @Override - public LineIterator apply(String s) - { - return new LineIterator(new StringReader(s)); - } - } - ); + parser = new StringInputRowParser( + new CSVParseSpec( + new TimestampSpec("ts", "auto", null), + new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("x")), null, null), + ",", + ImmutableList.of("ts", "x"), + numSkipHeadRows + ), + null + ); - final StringInputRowParser parser = new StringInputRowParser( - new CSVParseSpec( - new TimestampSpec("ts", "auto", null), - new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("x")), null, null), - 
",", - ImmutableList.of("ts", "x"), - 0 - ), - null - ); + this.inputs = texts; + this.expectedResults = inputs.stream() + .map(input -> input.split("\n")) + .filter(lines -> numSkipHeadRows < lines.length) + .flatMap(lines -> { + final List skippedLines = Arrays.asList(lines) + .subList(numSkipHeadRows, lines.length); + return skippedLines.stream() + .filter(line -> line.length() > 0) + .map(line -> line.split(",")[1]) + .collect(Collectors.toList()).stream(); + }) + .collect(Collectors.toList()); + } - final FileIteratingFirehose firehose = new FileIteratingFirehose(lineIterators.iterator(), parser); - final List results = Lists.newArrayList(); + @Test + public void testFirehose() throws Exception + { + final List lineIterators = inputs.stream() + .map(s -> new LineIterator(new StringReader(s))) + .collect(Collectors.toList()); + lineIterators.add(null); // test skip null iterator - while (firehose.hasMore()) { - results.add(Joiner.on("|").join(firehose.nextRow().getDimension("x"))); - } + final FileIteratingFirehose firehose = new FileIteratingFirehose(lineIterators.iterator(), parser); + final List results = Lists.newArrayList(); - Assert.assertEquals(fixture.rhs, results); + while (firehose.hasMore()) { + results.add(Joiner.on("|").join(firehose.nextRow().getDimension("x"))); } + + Assert.assertEquals(expectedResults, results); } } diff --git a/docs/content/development/extensions-core/lookups-cached-global.md b/docs/content/development/extensions-core/lookups-cached-global.md index 70d223da559e..72d346201c1d 100644 --- a/docs/content/development/extensions-core/lookups-cached-global.md +++ b/docs/content/development/extensions-core/lookups-cached-global.md @@ -201,7 +201,6 @@ Only ONE file which matches the search will be used. 
For most implementations, t |`columns`|The list of columns in the csv file|yes|`null`| |`keyColumn`|The name of the column containing the key|no|The first column| |`valueColumn`|The name of the column containing the value|no|The second column| -|`maxNumSkipHeadRows`|The number of head rows to be skipped|no|0| *example input* @@ -218,8 +217,7 @@ truck,something3,buck "format": "csv", "columns": ["value","somethingElse","key"], "keyColumn": "key", - "valueColumn": "value", - "maxNumSkipHeadRows": 5 + "valueColumn": "value" } ``` @@ -232,7 +230,6 @@ truck,something3,buck |`valueColumn`|The name of the column containing the value|no|The second column| |`delimiter`|The delimiter in the file|no|tab (`\t`)| |`listDelimiter`|The list delimiter in the file|no| (`\u0001`)| -|`maxNumSkipHeadRows`|The number of head rows to be skipped|no|0| *example input* @@ -251,8 +248,7 @@ truck|something,3|buck "columns": ["value","somethingElse","key"], "keyColumn": "key", "valueColumn": "value", - "delimiter": "|", - "maxNumSkipHeadRows": 5 + "delimiter": "|" } ``` diff --git a/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java b/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java index c562cae8e588..a5103a4e2485 100644 --- a/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java +++ b/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java @@ -262,8 +262,7 @@ public static class CSVFlatDataParser implements FlatDataParser public CSVFlatDataParser( @JsonProperty("columns") List columns, @JsonProperty("keyColumn") final String keyColumn, - @JsonProperty("valueColumn") final String valueColumn, - @JsonProperty("maxNumSkipHeadRows") final Integer maxNumSkipHeadRows + @JsonProperty("valueColumn") final String valueColumn ) { Preconditions.checkArgument( @@ 
-292,7 +291,7 @@ public CSVFlatDataParser( ); this.parser = new DelegateParser( - new CSVParser(Optional.absent(), columns, maxNumSkipHeadRows), + new CSVParser(Optional.absent(), columns), this.keyColumn, this.valueColumn ); @@ -372,8 +371,7 @@ public TSVFlatDataParser( @JsonProperty("delimiter") String delimiter, @JsonProperty("listDelimiter") String listDelimiter, @JsonProperty("keyColumn") final String keyColumn, - @JsonProperty("valueColumn") final String valueColumn, - @JsonProperty("maxNumSkipHeadRows") final Integer maxNumSkipHeadRows + @JsonProperty("valueColumn") final String valueColumn ) { Preconditions.checkArgument( @@ -382,8 +380,7 @@ public TSVFlatDataParser( ); final DelimitedParser delegate = new DelimitedParser( Optional.fromNullable(Strings.emptyToNull(delimiter)), - Optional.fromNullable(Strings.emptyToNull(listDelimiter)), - maxNumSkipHeadRows + Optional.fromNullable(Strings.emptyToNull(listDelimiter)) ); Preconditions.checkArgument( !(Strings.isNullOrEmpty(keyColumn) ^ Strings.isNullOrEmpty(valueColumn)), diff --git a/extensions-core/lookups-cached-global/src/test/java/io/druid/query/lookup/namespace/URIExtractionNamespaceTest.java b/extensions-core/lookups-cached-global/src/test/java/io/druid/query/lookup/namespace/URIExtractionNamespaceTest.java index 4227eab20d8b..6d620b73e0b1 100644 --- a/extensions-core/lookups-cached-global/src/test/java/io/druid/query/lookup/namespace/URIExtractionNamespaceTest.java +++ b/extensions-core/lookups-cached-global/src/test/java/io/druid/query/lookup/namespace/URIExtractionNamespaceTest.java @@ -84,8 +84,7 @@ public void testCSV() "col1", "col2", "col3" - ), "col2", "col3", - null + ), "col2", "col3" ); Assert.assertEquals(ImmutableMap.of("B", "C"), parser.getParser().parse("A,B,C")); } @@ -98,8 +97,7 @@ public void testBadCSV() "col1", "col2", "col3" - ), "col2", "col3ADFSDF", - null + ), "col2", "col3ADFSDF" ); Assert.assertEquals(ImmutableMap.of("B", "C"), parser.getParser().parse("A,B,C")); } @@ -112,8 
+110,7 @@ public void testBadCSV2() "col1", "col2", "col3" - ), "col2", "col3", - null + ), "col2", "col3" ); Map map = parser.getParser().parse("A"); } @@ -125,8 +122,7 @@ public void testTSV() ImmutableList.of("col1", "col2", "col3"), "|", null, "col2", - "col3", - null + "col3" ); Assert.assertEquals(ImmutableMap.of("B", "C"), parser.getParser().parse("A|B|C")); } @@ -138,8 +134,7 @@ public void testWithListDelimiterTSV() ImmutableList.of("col1", "col2", "col3"), "\\u0001", "\\u0002", "col2", - "col3", - null + "col3" ); Assert.assertEquals(ImmutableMap.of("B", "C"), parser.getParser().parse("A\\u0001B\\u0001C")); } @@ -151,8 +146,7 @@ public void testBadTSV() ImmutableList.of("col1", "col2", "col3fdsfds"), ",", null, "col2", - "col3", - null + "col3" ); Map map = parser.getParser().parse("A,B,C"); Assert.assertEquals(ImmutableMap.of("B", "C"), parser.getParser().parse("A,B,C")); @@ -166,8 +160,7 @@ public void testBadTSV2() ImmutableList.of("col1", "col2", "col3"), ",", null, "col2", - "col3", - null + "col3" ); Map map = parser.getParser().parse("A"); Assert.assertEquals(ImmutableMap.of("B", "C"), parser.getParser().parse("A,B,C")); @@ -308,12 +301,11 @@ public void testSimpleJSONSerDe() throws IOException "col1", "col2", "col3" - ), "col2", "col3", - null + ), "col2", "col3" ), new URIExtractionNamespace.ObjectMapperFlatDataParser(mapper), new URIExtractionNamespace.JSONFlatDataParser(mapper, "keyField", "valueField"), - new URIExtractionNamespace.TSVFlatDataParser(ImmutableList.of("A", "B"), ",", null, "A", "B", null) + new URIExtractionNamespace.TSVFlatDataParser(ImmutableList.of("A", "B"), ",", null, "A", "B") )) { final String str = mapper.writeValueAsString(parser); final URIExtractionNamespace.FlatDataParser parser2 = mapper.readValue( @@ -334,12 +326,11 @@ public void testSimpleToString() throws IOException "col1", "col2", "col3" - ), "col2", "col3", - null + ), "col2", "col3" ), new URIExtractionNamespace.ObjectMapperFlatDataParser(mapper), new 
URIExtractionNamespace.JSONFlatDataParser(mapper, "keyField", "valueField"), - new URIExtractionNamespace.TSVFlatDataParser(ImmutableList.of("A", "B"), ",", null, "A", "B", null) + new URIExtractionNamespace.TSVFlatDataParser(ImmutableList.of("A", "B"), ",", null, "A", "B") )) { Assert.assertFalse(parser.toString().contains("@")); } diff --git a/indexing-hadoop/src/main/java/io/druid/indexer/HadoopDruidIndexerConfig.java b/indexing-hadoop/src/main/java/io/druid/indexer/HadoopDruidIndexerConfig.java index 368b23ea59ae..77938c687021 100644 --- a/indexing-hadoop/src/main/java/io/druid/indexer/HadoopDruidIndexerConfig.java +++ b/indexing-hadoop/src/main/java/io/druid/indexer/HadoopDruidIndexerConfig.java @@ -39,10 +39,7 @@ import com.google.inject.Module; import io.druid.common.utils.JodaUtils; import io.druid.data.input.InputRow; -import io.druid.data.input.impl.CSVParseSpec; -import io.druid.data.input.impl.DelimitedParseSpec; import io.druid.data.input.impl.InputRowParser; -import io.druid.data.input.impl.ParseSpec; import io.druid.guice.GuiceInjectors; import io.druid.guice.JsonConfigProvider; import io.druid.guice.annotations.Self; @@ -361,15 +358,7 @@ public boolean isCombineText() public InputRowParser getParser() { - final InputRowParser parser = schema.getDataSchema().getParser(); - final ParseSpec parseSpec = parser.getParseSpec(); - if (parseSpec instanceof CSVParseSpec) { - return parser.withParseSpec(((CSVParseSpec) parseSpec).withMaxNumSkipHeadRows(0)); - } else if (parseSpec instanceof DelimitedParseSpec) { - return parser.withParseSpec(((DelimitedParseSpec) parseSpec).withMaxNumSkipHeadRows(0)); - } else { - return parser; - } + return schema.getDataSchema().getParser(); } public HadoopyShardSpec getShardSpec(Bucket bucket) diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java index 80e639025aeb..e45ed7e6acc7 100644 --- 
a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java @@ -33,24 +33,18 @@ public class CSVParser implements Parser { - private static final int DEFAULT_NUM_SKIP_HEAD_ROWS = 0; - private final String listDelimiter; private final Splitter listSplitter; private final Function valueFunction; - private final int maxNumSkipHeadRows; private final au.com.bytecode.opencsv.CSVParser parser = new au.com.bytecode.opencsv.CSVParser(); private ArrayList fieldNames = null; - private int numSkippedRows; - - public CSVParser(final Optional listDelimiter, Integer maxNumSkipHeadRows) + public CSVParser(final Optional listDelimiter) { this.listDelimiter = listDelimiter.isPresent() ? listDelimiter.get() : Parsers.DEFAULT_LIST_DELIMITER; this.listSplitter = Splitter.on(this.listDelimiter); - this.maxNumSkipHeadRows = maxNumSkipHeadRows == null ? DEFAULT_NUM_SKIP_HEAD_ROWS : maxNumSkipHeadRows; this.valueFunction = new Function() { @Override @@ -70,16 +64,16 @@ public Object apply(String input) }; } - public CSVParser(final Optional listDelimiter, final Iterable fieldNames, Integer maxNumSkipHeadRows) + public CSVParser(final Optional listDelimiter, final Iterable fieldNames) { - this(listDelimiter, maxNumSkipHeadRows); + this(listDelimiter); setFieldNames(fieldNames); } - public CSVParser(final Optional listDelimiter, final String header, Integer maxNumSkipHeadRows) + public CSVParser(final Optional listDelimiter, final String header) { - this(listDelimiter, maxNumSkipHeadRows); + this(listDelimiter); setFieldNames(header); } @@ -115,10 +109,6 @@ public void setFieldNames(final String header) @Override public Map parse(final String input) { - if (numSkippedRows++ < maxNumSkipHeadRows) { - return null; - } - try { String[] values = parser.parseLine(input); diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java 
b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java index 92605e96b06c..b0c5fd1a0142 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java @@ -34,12 +34,10 @@ public class DelimitedParser implements Parser { - private static final int DEFAULT_NUM_SKIP_HEAD_ROWS = 0; private static final String DEFAULT_DELIMITER = "\t"; private final String delimiter; private final String listDelimiter; - private final int maxNumSkipHeadRows; private final Splitter splitter; private final Splitter listSplitter; @@ -47,13 +45,10 @@ public class DelimitedParser implements Parser private ArrayList fieldNames = null; - private int numSkippedRows; - - public DelimitedParser(final Optional delimiter, Optional listDelimiter, Integer maxNumSkipHeadRows) + public DelimitedParser(final Optional delimiter, Optional listDelimiter) { this.delimiter = delimiter.isPresent() ? delimiter.get() : DEFAULT_DELIMITER; this.listDelimiter = listDelimiter.isPresent() ? listDelimiter.get() : Parsers.DEFAULT_LIST_DELIMITER; - this.maxNumSkipHeadRows = maxNumSkipHeadRows == null ? 
DEFAULT_NUM_SKIP_HEAD_ROWS : maxNumSkipHeadRows; Preconditions.checkState( !this.delimiter.equals(this.listDelimiter), @@ -85,11 +80,10 @@ public Object apply(String input) public DelimitedParser( final Optional delimiter, final Optional listDelimiter, - final Iterable fieldNames, - final Integer maxNumSkipHeadRows + final Iterable fieldNames ) { - this(delimiter, listDelimiter, maxNumSkipHeadRows); + this(delimiter, listDelimiter); setFieldNames(fieldNames); } @@ -97,12 +91,11 @@ public DelimitedParser( public DelimitedParser( final Optional delimiter, final Optional listDelimiter, - final String header, - final Integer maxNumSkipHeadRows + final String header ) { - this(delimiter, listDelimiter, maxNumSkipHeadRows); + this(delimiter, listDelimiter); setFieldNames(header); } @@ -143,10 +136,6 @@ public void setFieldNames(String header) @Override public Map parse(final String input) { - if (numSkippedRows++ < maxNumSkipHeadRows) { - return null; - } - try { Iterable values = splitter.split(input); diff --git a/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java b/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java index ee552dfffe8d..cc74942f186b 100644 --- a/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java +++ b/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java @@ -36,7 +36,7 @@ public void testValidHeader() final Parser csvParser; boolean parseable = true; try { - csvParser = new CSVParser(Optional.fromNullable(null), csv, 0); + csvParser = new CSVParser(Optional.fromNullable(null), csv); } catch (Exception e) { parseable = false; @@ -53,7 +53,7 @@ public void testInvalidHeader() final Parser csvParser; boolean parseable = true; try { - csvParser = new CSVParser(Optional.fromNullable(null), csv, 0); + csvParser = new CSVParser(Optional.fromNullable(null), csv); } catch (Exception e) { parseable = false; @@ -67,7 +67,7 @@ public void testInvalidHeader() public 
void testCSVParserWithHeader() { String header = "time,value1,value2"; - final Parser csvParser = new CSVParser(Optional.fromNullable(null), header, 0); + final Parser csvParser = new CSVParser(Optional.fromNullable(null), header); String body = "hello,world,foo"; final Map jsonMap = csvParser.parse(body); Assert.assertEquals( @@ -80,7 +80,7 @@ public void testCSVParserWithHeader() @Test public void testCSVParserWithoutHeader() { - final Parser csvParser = new CSVParser(Optional.fromNullable(null), 0); + final Parser csvParser = new CSVParser(Optional.fromNullable(null)); String body = "hello,world,foo"; final Map jsonMap = csvParser.parse(body); Assert.assertEquals( @@ -89,26 +89,4 @@ public void testCSVParserWithoutHeader() jsonMap ); } - - @Test - public void testCSVParserSkipHeadRows() - { - final Parser csvParser = new CSVParser(Optional.fromNullable(null), 2); - String[] texts = new String[] { - "1st,header,line", - "2nd,header,line", - "hello,world,foo" - }; - int i; - for (i = 0; i < 2; i++) { - Assert.assertNull(csvParser.parse(texts[i])); - } - - final Map jsonMap = csvParser.parse(texts[i]); - Assert.assertEquals( - "jsonMap", - ImmutableMap.of("column_1", "hello", "column_2", "world", "column_3", "foo"), - jsonMap - ); - } } diff --git a/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java b/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java index f79cd0a04464..fd7f795f2f6f 100644 --- a/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java +++ b/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java @@ -36,7 +36,7 @@ public void testValidHeader() final Parser delimitedParser; boolean parseable = true; try { - delimitedParser = new DelimitedParser(Optional.of("\t"), Optional.absent(), tsv, 0); + delimitedParser = new DelimitedParser(Optional.of("\t"), Optional.absent(), tsv); } catch (Exception e) { parseable = false; @@ -53,7 +53,7 @@ 
public void testInvalidHeader() final Parser delimitedParser; boolean parseable = true; try { - delimitedParser = new DelimitedParser(Optional.of("\t"), Optional.absent(), tsv, 0); + delimitedParser = new DelimitedParser(Optional.of("\t"), Optional.absent(), tsv); } catch (Exception e) { parseable = false; @@ -70,8 +70,7 @@ public void testTSVParserWithHeader() final Parser delimitedParser = new DelimitedParser( Optional.of("\t"), Optional.absent(), - header, - 0 + header ); String body = "hello\tworld\tfoo"; final Map jsonMap = delimitedParser.parse(body); @@ -85,7 +84,7 @@ public void testTSVParserWithHeader() @Test public void testTSVParserWithoutHeader() { - final Parser delimitedParser = new DelimitedParser(Optional.of("\t"), Optional.absent(), 0); + final Parser delimitedParser = new DelimitedParser(Optional.of("\t"), Optional.absent()); String body = "hello\tworld\tfoo"; final Map jsonMap = delimitedParser.parse(body); Assert.assertEquals( @@ -94,26 +93,4 @@ public void testTSVParserWithoutHeader() jsonMap ); } - - @Test - public void testTSVParserSkipHeadRows() - { - final Parser parser = new DelimitedParser(Optional.of("\t"), Optional.absent(), 2); - String[] texts = new String[] { - "1st\theader\tline", - "2nd\theader\tline", - "hello\tworld\tfoo" - }; - int i; - for (i = 0; i < 2; i++) { - org.junit.Assert.assertNull(parser.parse(texts[i])); - } - - final Map jsonMap = parser.parse(texts[i]); - Assert.assertEquals( - "jsonMap", - ImmutableMap.of("column_1", "hello", "column_2", "world", "column_3", "foo"), - jsonMap - ); - } } From e72338938e37be3d261fbb6d7c373a5c8d807b20 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Sun, 7 May 2017 16:08:35 +0900 Subject: [PATCH 12/20] Remove checking null iterators --- .../java/io/druid/data/input/impl/FileIteratingFirehose.java | 3 +-- .../io/druid/data/input/impl/FileIteratingFirehoseTest.java | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git 
a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java index 30f3d4f1758c..5a37a88459cb 100644 --- a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java +++ b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java @@ -46,8 +46,7 @@ public FileIteratingFirehose( StringInputRowParser parser ) { - // Skip empty files - this.lineIterators = Iterators.filter(lineIterators, iterator -> iterator != null && iterator.hasNext()); + this.lineIterators = lineIterators; this.parser = parser; final ParseSpec parseSpec = parser.getParseSpec(); if (parseSpec instanceof CSVParseSpec) { diff --git a/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java b/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java index 41a3bb16f536..915928562b5a 100644 --- a/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java +++ b/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java @@ -102,7 +102,6 @@ public void testFirehose() throws Exception final List lineIterators = inputs.stream() .map(s -> new LineIterator(new StringReader(s))) .collect(Collectors.toList()); - lineIterators.add(null); // test skip null iterator final FileIteratingFirehose firehose = new FileIteratingFirehose(lineIterators.iterator(), parser); final List results = Lists.newArrayList(); From aadaa7ab9acdb403bc175355c7988b2caf794b17 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Sun, 7 May 2017 16:24:04 +0900 Subject: [PATCH 13/20] Remove unused imports --- .../java/io/druid/data/input/impl/FileIteratingFirehose.java | 1 - 1 file changed, 1 deletion(-) diff --git a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java index 5a37a88459cb..3660788b49c2 100644 --- a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java +++ 
b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java @@ -19,7 +19,6 @@ package io.druid.data.input.impl; -import com.google.common.collect.Iterators; import io.druid.data.input.Firehose; import io.druid.data.input.InputRow; import io.druid.utils.Runnables; From ff74a92bdd517c7fd2d220f61f9218c32395f613 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Tue, 9 May 2017 10:26:44 +0900 Subject: [PATCH 14/20] Address comments --- .../druid/data/input/impl/CSVParseSpec.java | 18 +++++++-------- .../data/input/impl/DelimitedParseSpec.java | 22 +++++++++---------- .../input/impl/FileIteratingFirehose.java | 14 +++++------- docs/content/ingestion/data-formats.md | 12 +++++----- 4 files changed, 31 insertions(+), 35 deletions(-) diff --git a/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java b/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java index d0fbea5e7276..0b9a63ca6af5 100644 --- a/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java @@ -34,7 +34,7 @@ public class CSVParseSpec extends ParseSpec { private final String listDelimiter; private final List columns; - private final Integer maxNumSkipHeadRows; + private final int skipHeaderRows; @JsonCreator public CSVParseSpec( @@ -42,7 +42,7 @@ public CSVParseSpec( @JsonProperty("dimensionsSpec") DimensionsSpec dimensionsSpec, @JsonProperty("listDelimiter") String listDelimiter, @JsonProperty("columns") List columns, - @JsonProperty("maxNumSkipHeadRows") Integer maxNumSkipHeadRows + @JsonProperty("skipHeaderRows") int skipHeaderRows ) { super(timestampSpec, dimensionsSpec); @@ -54,7 +54,7 @@ public CSVParseSpec( } this.columns = columns; - this.maxNumSkipHeadRows = maxNumSkipHeadRows; + this.skipHeaderRows = skipHeaderRows; verify(dimensionsSpec.getDimensionNames()); } @@ -71,10 +71,10 @@ public List getColumns() return columns; } - @JsonProperty("maxNumSkipHeadRows") - public Integer getMaxNumSkipHeadRows() + 
@JsonProperty("skipHeaderRows") + public Integer getSkipHeaderRows() { - return maxNumSkipHeadRows; + return skipHeaderRows; } @Override @@ -94,17 +94,17 @@ public Parser makeParser() @Override public ParseSpec withTimestampSpec(TimestampSpec spec) { - return new CSVParseSpec(spec, getDimensionsSpec(), listDelimiter, columns, maxNumSkipHeadRows); + return new CSVParseSpec(spec, getDimensionsSpec(), listDelimiter, columns, skipHeaderRows); } @Override public ParseSpec withDimensionsSpec(DimensionsSpec spec) { - return new CSVParseSpec(getTimestampSpec(), spec, listDelimiter, columns, maxNumSkipHeadRows); + return new CSVParseSpec(getTimestampSpec(), spec, listDelimiter, columns, skipHeaderRows); } public ParseSpec withColumns(List cols) { - return new CSVParseSpec(getTimestampSpec(), getDimensionsSpec(), listDelimiter, cols, maxNumSkipHeadRows); + return new CSVParseSpec(getTimestampSpec(), getDimensionsSpec(), listDelimiter, cols, skipHeaderRows); } } diff --git a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java index 10de474b4839..e32cf9627ab1 100644 --- a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java @@ -35,7 +35,7 @@ public class DelimitedParseSpec extends ParseSpec private final String delimiter; private final String listDelimiter; private final List columns; - private final Integer maxNumSkipHeadRows; + private final int skipHeaderRows; @JsonCreator public DelimitedParseSpec( @@ -44,7 +44,7 @@ public DelimitedParseSpec( @JsonProperty("delimiter") String delimiter, @JsonProperty("listDelimiter") String listDelimiter, @JsonProperty("columns") List columns, - @JsonProperty("maxNumSkipHeadRows") Integer maxNumSkipHeadRows + @JsonProperty("skipHeaderRows") int skipHeaderRows ) { super(timestampSpec, dimensionsSpec); @@ -52,7 +52,7 @@ public DelimitedParseSpec( this.delimiter = delimiter; 
this.listDelimiter = listDelimiter; this.columns = Preconditions.checkNotNull(columns, "columns"); - this.maxNumSkipHeadRows = maxNumSkipHeadRows; + this.skipHeaderRows = skipHeaderRows; for (String column : this.columns) { Preconditions.checkArgument(!column.contains(","), "Column[%s] has a comma, it cannot", column); @@ -79,10 +79,10 @@ public List getColumns() return columns; } - @JsonProperty("maxNumSkipHeadRows") - public Integer getMaxNumSkipHeadRows() + @JsonProperty("skipHeaderRows") + public Integer getSkipHeaderRows() { - return maxNumSkipHeadRows; + return skipHeaderRows; } @Override @@ -107,13 +107,13 @@ public Parser makeParser() @Override public ParseSpec withTimestampSpec(TimestampSpec spec) { - return new DelimitedParseSpec(spec, getDimensionsSpec(), delimiter, listDelimiter, columns, maxNumSkipHeadRows); + return new DelimitedParseSpec(spec, getDimensionsSpec(), delimiter, listDelimiter, columns, skipHeaderRows); } @Override public ParseSpec withDimensionsSpec(DimensionsSpec spec) { - return new DelimitedParseSpec(getTimestampSpec(), spec, delimiter, listDelimiter, columns, maxNumSkipHeadRows); + return new DelimitedParseSpec(getTimestampSpec(), spec, delimiter, listDelimiter, columns, skipHeaderRows); } public ParseSpec withDelimiter(String delim) @@ -124,7 +124,7 @@ public ParseSpec withDelimiter(String delim) delim, listDelimiter, columns, - maxNumSkipHeadRows + skipHeaderRows ); } @@ -136,7 +136,7 @@ public ParseSpec withListDelimiter(String delim) delimiter, delim, columns, - maxNumSkipHeadRows + skipHeaderRows ); } @@ -148,7 +148,7 @@ public ParseSpec withColumns(List cols) delimiter, listDelimiter, cols, - maxNumSkipHeadRows + skipHeaderRows ); } } diff --git a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java index 3660788b49c2..395934d1c8c7 100644 --- a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java +++ 
b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java @@ -32,11 +32,9 @@ */ public class FileIteratingFirehose implements Firehose { - private static final int DEFAULT_NUM_SKIP_HEAD_ROWS = 0; - private final Iterator lineIterators; private final StringInputRowParser parser; - private final int maxNumSkipHeadRows; + private final int skipHeadRows; private LineIterator lineIterator = null; @@ -49,13 +47,11 @@ public FileIteratingFirehose( this.parser = parser; final ParseSpec parseSpec = parser.getParseSpec(); if (parseSpec instanceof CSVParseSpec) { - final Integer maxNumSkipHeadRows = ((CSVParseSpec) parseSpec).getMaxNumSkipHeadRows(); - this.maxNumSkipHeadRows = maxNumSkipHeadRows == null ? DEFAULT_NUM_SKIP_HEAD_ROWS : maxNumSkipHeadRows; + this.skipHeadRows = ((CSVParseSpec) parseSpec).getSkipHeaderRows(); } else if (parseSpec instanceof DelimitedParseSpec) { - final Integer maxNumSkipHeadRows = ((DelimitedParseSpec) parseSpec).getMaxNumSkipHeadRows(); - this.maxNumSkipHeadRows = maxNumSkipHeadRows == null ? 
DEFAULT_NUM_SKIP_HEAD_ROWS : maxNumSkipHeadRows; + this.skipHeadRows = ((DelimitedParseSpec) parseSpec).getSkipHeaderRows(); } else { - maxNumSkipHeadRows = DEFAULT_NUM_SKIP_HEAD_ROWS; + skipHeadRows = 0; } } @@ -64,7 +60,7 @@ public boolean hasMore() { while ((lineIterator == null || !lineIterator.hasNext()) && lineIterators.hasNext()) { lineIterator = getNextLineIterator(); - for (int i = 0; i < maxNumSkipHeadRows && lineIterator.hasNext(); i++) { + for (int i = 0; i < skipHeadRows && lineIterator.hasNext(); i++) { lineIterator.next(); } } diff --git a/docs/content/ingestion/data-formats.md b/docs/content/ingestion/data-formats.md index 5b13939152b6..b36f1efc76f4 100644 --- a/docs/content/ingestion/data-formats.md +++ b/docs/content/ingestion/data-formats.md @@ -84,13 +84,13 @@ Since the CSV data cannot contain the column names (no header is allowed), these "dimensionsSpec" : { "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] }, - "maxNumSkipHeadRows" : 5 + "skipHeaderRows" : 5 } ``` The `columns` field must match the columns of your input data in the same order. -Additionally, you can set the number of head rows to be skipped to `maxNumSkipHeadRows`. Note that this option is effective -only for non-Hadoop index tasks. +Additionally, you can set the number of head rows to be skipped to `skipHeaderRows`. Note that this option is effective +only for non-Hadoop batch index tasks. ### TSV @@ -105,15 +105,15 @@ only for non-Hadoop index tasks. "dimensionsSpec" : { "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] }, - "maxNumSkipHeadRows" : 5 + "skipHeaderRows" : 5 } ``` The `columns` field must match the columns of your input data in the same order. Be sure to change the `delimiter` to the appropriate delimiter for your data. 
Like CSV, you must specify the columns and which subset of the columns you want indexed. -Additionally, you can set the number of head rows to be skipped to `maxNumSkipHeadRows`. Note that this option is effective -only for non-Hadoop index tasks. +Additionally, you can set the number of head rows to be skipped to `skipHeaderRows`. Note that this option is effective +only for non-Hadoop batch index tasks. ### Regex From 92c5b3ea30802ecb1c67fe25d7a1bd55ac8e6435 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Tue, 9 May 2017 10:39:42 +0900 Subject: [PATCH 15/20] Fix compilation error --- .../java/io/druid/data/input/impl/CSVParseSpecTest.java | 4 ++-- .../io/druid/data/input/impl/DelimitedParseSpecTest.java | 8 ++++---- .../test/java/io/druid/data/input/impl/ParseSpecTest.java | 6 +++--- .../test/java/io/druid/segment/MapVirtualColumnTest.java | 2 +- .../java/io/druid/indexer/BatchDeltaIngestionTest.java | 2 +- .../druid/indexer/DetermineHashedPartitionsJobTest.java | 2 +- .../java/io/druid/indexer/DeterminePartitionsJobTest.java | 2 +- .../test/java/io/druid/indexer/IndexGeneratorJobTest.java | 8 ++++---- .../src/test/java/io/druid/indexer/JobHelperTest.java | 2 +- .../io/druid/indexer/path/DatasourcePathSpecTest.java | 2 +- .../io/druid/indexer/updater/HadoopConverterJobTest.java | 2 +- .../java/io/druid/indexing/common/task/IndexTaskTest.java | 2 +- .../java/io/druid/query/MultiValuedDimensionTest.java | 2 +- .../query/groupby/GroupByQueryRunnerFactoryTest.java | 2 +- processing/src/test/java/io/druid/segment/TestIndex.java | 2 +- .../realtime/firehose/IngestSegmentFirehoseTest.java | 2 +- 16 files changed, 25 insertions(+), 25 deletions(-) diff --git a/api/src/test/java/io/druid/data/input/impl/CSVParseSpecTest.java b/api/src/test/java/io/druid/data/input/impl/CSVParseSpecTest.java index ed13ca931123..35d7d1230def 100644 --- a/api/src/test/java/io/druid/data/input/impl/CSVParseSpecTest.java +++ b/api/src/test/java/io/druid/data/input/impl/CSVParseSpecTest.java @@ 
-42,7 +42,7 @@ public void testColumnMissing() throws Exception ), ",", Arrays.asList("a"), - null + 0 ); } @@ -62,7 +62,7 @@ public void testComma() throws Exception ), ",", Arrays.asList("a"), - null + 0 ); } } diff --git a/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java b/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java index 237894ff12cb..ec8f50d79b67 100644 --- a/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java +++ b/api/src/test/java/io/druid/data/input/impl/DelimitedParseSpecTest.java @@ -41,7 +41,7 @@ public void testSerde() throws IOException "\u0001", "\u0002", Arrays.asList("abc"), - null + 0 ); final DelimitedParseSpec serde = jsonMapper.readValue( jsonMapper.writeValueAsString(spec), @@ -73,7 +73,7 @@ public void testColumnMissing() throws Exception ",", " ", Arrays.asList("a"), - null + 0 ); } @@ -94,7 +94,7 @@ public void testComma() throws Exception ",", null, Arrays.asList("a"), - null + 0 ); } @@ -115,7 +115,7 @@ public void testDefaultColumnList(){ null, // pass null columns not allowed null, - null + 0 ); } } diff --git a/api/src/test/java/io/druid/data/input/impl/ParseSpecTest.java b/api/src/test/java/io/druid/data/input/impl/ParseSpecTest.java index 589b804fa851..076f5fcd5e51 100644 --- a/api/src/test/java/io/druid/data/input/impl/ParseSpecTest.java +++ b/api/src/test/java/io/druid/data/input/impl/ParseSpecTest.java @@ -46,7 +46,7 @@ public void testDuplicateNames() throws Exception ",", " ", Arrays.asList("a", "b"), - null + 0 ); } @@ -67,7 +67,7 @@ public void testDimAndDimExcluOverlap() throws Exception ",", null, Arrays.asList("a", "B"), - null + 0 ); } @@ -88,7 +88,7 @@ public void testDimExclusionDuplicate() throws Exception ",", null, Arrays.asList("a", "B"), - null + 0 ); } } diff --git a/extensions-contrib/virtual-columns/src/test/java/io/druid/segment/MapVirtualColumnTest.java 
b/extensions-contrib/virtual-columns/src/test/java/io/druid/segment/MapVirtualColumnTest.java index 6a4886e38e77..b28baa335f5b 100644 --- a/extensions-contrib/virtual-columns/src/test/java/io/druid/segment/MapVirtualColumnTest.java +++ b/extensions-contrib/virtual-columns/src/test/java/io/druid/segment/MapVirtualColumnTest.java @@ -97,7 +97,7 @@ public static Iterable constructorFeeder() throws IOException "\t", ",", Arrays.asList("ts", "dim", "keys", "values"), - null + 0 ) , "utf8" ); diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/BatchDeltaIngestionTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/BatchDeltaIngestionTest.java index 6b292a19095d..2f08df4221a4 100644 --- a/indexing-hadoop/src/test/java/io/druid/indexer/BatchDeltaIngestionTest.java +++ b/indexing-hadoop/src/test/java/io/druid/indexer/BatchDeltaIngestionTest.java @@ -347,7 +347,7 @@ private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig(Map constructFeed() new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, ImmutableList.of("timestamp", "host", "visited_num"), - null + 0 ), null ), @@ -190,7 +190,7 @@ public static Collection constructFeed() new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, ImmutableList.of("timestamp", "host", "visited_num"), - null + 0 ) ), null, @@ -236,7 +236,7 @@ public static Collection constructFeed() new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, ImmutableList.of("timestamp", "host", "visited_num"), - null + 0 ), null ), @@ -293,7 +293,7 @@ public static Collection constructFeed() new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, ImmutableList.of("timestamp", "host", "visited_num"), - null + 0 ) ), null, diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java index 
80ba3ba6b7ce..2864d9714402 100644 --- a/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java +++ b/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java @@ -78,7 +78,7 @@ public void setup() throws Exception new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, ImmutableList.of("timestamp", "host", "visited_num"), - null + 0 ), null ), diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/path/DatasourcePathSpecTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/path/DatasourcePathSpecTest.java index 1f5929fa938f..bb495f882b6b 100644 --- a/indexing-hadoop/src/test/java/io/druid/indexer/path/DatasourcePathSpecTest.java +++ b/indexing-hadoop/src/test/java/io/druid/indexer/path/DatasourcePathSpecTest.java @@ -266,7 +266,7 @@ private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig() new DimensionsSpec(null, null, null), null, ImmutableList.of("timestamp", "host", "visited"), - null + 0 ), null ), diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java index a1332296f6be..8bd86fac0e14 100644 --- a/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java +++ b/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java @@ -165,7 +165,7 @@ public InputStream openStream() throws IOException "\t", "\u0001", Arrays.asList(TestIndex.COLUMNS), - null + 0 ), null ), diff --git a/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java b/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java index c2f00ea737bc..2a252483b4d5 100644 --- a/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java +++ b/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java @@ -459,7 +459,7 @@ private IndexTask.IndexIngestionSpec 
createIngestionSpec( ), null, Arrays.asList("ts", "dim", "val"), - null + 0 ), null ), diff --git a/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java b/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java index 7ebf18d438ea..bde6394c0fc1 100644 --- a/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java +++ b/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java @@ -131,7 +131,7 @@ public static void setupClass() throws Exception new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("product", "tags")), null, null), "\t", ImmutableList.of("timestamp", "product", "tags"), - null + 0 ), "UTF-8" ); diff --git a/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerFactoryTest.java b/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerFactoryTest.java index ee3cd64c75db..f9a02b3e2e85 100644 --- a/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerFactoryTest.java +++ b/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerFactoryTest.java @@ -145,7 +145,7 @@ private Segment createSegment() throws Exception new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("product", "tags")), null, null), "\t", ImmutableList.of("timestamp", "product", "tags"), - null + 0 ), "UTF-8" ); diff --git a/processing/src/test/java/io/druid/segment/TestIndex.java b/processing/src/test/java/io/druid/segment/TestIndex.java index cdd17ebcb903..98aacad597c5 100644 --- a/processing/src/test/java/io/druid/segment/TestIndex.java +++ b/processing/src/test/java/io/druid/segment/TestIndex.java @@ -292,7 +292,7 @@ public static IncrementalIndex loadIncrementalIndex( "\t", "\u0001", Arrays.asList(COLUMNS), - null + 0 ) , "utf8" ); diff --git a/server/src/test/java/io/druid/segment/realtime/firehose/IngestSegmentFirehoseTest.java b/server/src/test/java/io/druid/segment/realtime/firehose/IngestSegmentFirehoseTest.java index 83f6a32e5ddd..3a4cfa8eacf8 
100644 --- a/server/src/test/java/io/druid/segment/realtime/firehose/IngestSegmentFirehoseTest.java +++ b/server/src/test/java/io/druid/segment/realtime/firehose/IngestSegmentFirehoseTest.java @@ -109,7 +109,7 @@ private void createTestIndex(File segmentDir) throws Exception new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null), null, ImmutableList.of("timestamp", "host", "visited"), - null + 0 ), Charsets.UTF_8.toString() ); From 7ffa9679309845bc115793220fe2e21b16d4938d Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Wed, 10 May 2017 08:53:44 +0900 Subject: [PATCH 16/20] Address comments --- .../java/io/druid/data/input/Firehose.java | 9 +- .../druid/data/input/impl/CSVParseSpec.java | 2 +- .../data/input/impl/DelimitedParseSpec.java | 3 +- .../input/impl/FileIteratingFirehose.java | 12 --- .../data/input/impl/StringInputRowParser.java | 11 ++- .../input/impl/FileIteratingFirehoseTest.java | 39 +++++--- .../extensions-core/lookups-cached-global.md | 8 +- docs/content/ingestion/data-formats.md | 26 ++--- .../namespace/URIExtractionNamespace.java | 37 +++++++- .../druid/indexing/common/task/IndexTask.java | 2 +- .../java/util/common/parsers/CSVParser.java | 57 +++++++---- .../util/common/parsers/DelimitedParser.java | 94 +++++++++---------- .../java/util/common/parsers/Parser.java | 20 +++- .../util/common/parsers/CSVParserTest.java | 29 +++++- .../common/parsers/DelimitedParserTest.java | 35 ++++++- .../firehose/ReplayableFirehoseFactory.java | 1 - 16 files changed, 255 insertions(+), 130 deletions(-) diff --git a/api/src/main/java/io/druid/data/input/Firehose.java b/api/src/main/java/io/druid/data/input/Firehose.java index a768e778d81a..4f4c640f1040 100644 --- a/api/src/main/java/io/druid/data/input/Firehose.java +++ b/api/src/main/java/io/druid/data/input/Firehose.java @@ -19,6 +19,7 @@ package io.druid.data.input; +import javax.annotation.Nullable; import java.io.Closeable; /** @@ -46,14 +47,16 @@ public interface Firehose 
extends Closeable * * @return true if and when there is another row available, false if the stream has dried up */ - public boolean hasMore(); + boolean hasMore(); /** * The next row available. Should only be called if hasMore returns true. + * The return value can be null which means the caller must skip this row. * * @return The next row */ - public InputRow nextRow(); + @Nullable + InputRow nextRow(); /** * Returns a runnable that will "commit" everything read up to the point at which commit() is called. This is @@ -74,5 +77,5 @@ public interface Firehose extends Closeable * because of InputRows delivered by prior calls to ##nextRow(). *

*/ - public Runnable commit(); + Runnable commit(); } diff --git a/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java b/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java index 15d1be6dedb7..1d71f34fb93a 100644 --- a/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java @@ -103,7 +103,7 @@ public void verify(List usedCols) @Override public Parser makeParser() { - return new CSVParser(Optional.fromNullable(listDelimiter), columns, hasHeaderRow); + return new CSVParser(Optional.fromNullable(listDelimiter), columns, hasHeaderRow, skipHeaderRows); } @Override diff --git a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java index 20a74d3c6e6c..05739f998b1e 100644 --- a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java @@ -116,7 +116,8 @@ public Parser makeParser() Optional.fromNullable(delimiter), Optional.fromNullable(listDelimiter), columns, - hasHeaderRow + hasHeaderRow, + skipHeaderRows ); } diff --git a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java index 2e2c20c4efa5..3a0327bb4d09 100644 --- a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java +++ b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java @@ -34,7 +34,6 @@ public class FileIteratingFirehose implements Firehose { private final Iterator lineIterators; private final StringInputRowParser parser; - private final int skipHeadRows; private LineIterator lineIterator = null; @@ -45,14 +44,6 @@ public FileIteratingFirehose( { this.lineIterators = lineIterators; this.parser = parser; - final ParseSpec parseSpec = parser.getParseSpec(); - if (parseSpec instanceof CSVParseSpec) { - this.skipHeadRows = ((CSVParseSpec) 
parseSpec).getSkipHeaderRows(); - } else if (parseSpec instanceof DelimitedParseSpec) { - this.skipHeadRows = ((DelimitedParseSpec) parseSpec).getSkipHeaderRows(); - } else { - skipHeadRows = 0; - } } @Override @@ -60,9 +51,6 @@ public boolean hasMore() { while ((lineIterator == null || !lineIterator.hasNext()) && lineIterators.hasNext()) { lineIterator = getNextLineIterator(); - for (int i = 0; i < skipHeadRows && lineIterator.hasNext(); i++) { - lineIterator.next(); - } } return lineIterator != null && lineIterator.hasNext(); diff --git a/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java b/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java index 3041c8ec3191..0c10256b6afb 100644 --- a/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java +++ b/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java @@ -27,6 +27,7 @@ import io.druid.java.util.common.parsers.ParseException; import io.druid.java.util.common.parsers.Parser; +import javax.annotation.Nullable; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; @@ -56,6 +57,7 @@ public StringInputRowParser( this.parseSpec = parseSpec; this.mapParser = new MapInputRowParser(parseSpec); this.parser = parseSpec.makeParser(); + parser.startFileFromBeginning(); if (encoding != null) { this.charset = Charset.forName(encoding); @@ -128,17 +130,20 @@ public void reset() parser.reset(); } - public InputRow parse(String input) + @Nullable + public InputRow parse(@Nullable String input) { return parseMap(parseString(input)); } - private Map parseString(String inputString) + @Nullable + private Map parseString(@Nullable String inputString) { return parser.parse(inputString); } - private InputRow parseMap(Map theMap) + @Nullable + private InputRow parseMap(@Nullable Map theMap) { // If a header is present in the data (and with proper configurations), a null is returned if (theMap == null) { diff --git 
a/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java b/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java index 294c45514def..30239233bcbd 100644 --- a/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java +++ b/api/src/test/java/io/druid/data/input/impl/FileIteratingFirehoseTest.java @@ -22,6 +22,7 @@ import com.google.common.base.Joiner; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; +import io.druid.data.input.InputRow; import org.apache.commons.io.LineIterator; import org.junit.Assert; import org.junit.Test; @@ -36,6 +37,7 @@ import java.util.Collection; import java.util.List; import java.util.stream.Collectors; +import java.util.stream.IntStream; @RunWith(Parameterized.class) public class FileIteratingFirehoseTest @@ -68,7 +70,7 @@ public static Collection constructorFeeder() throws IOException private final List inputs; private final List expectedResults; - public FileIteratingFirehoseTest(List texts, int numSkipHeadRows) + public FileIteratingFirehoseTest(List texts, int numSkipHeaderRows) { parser = new StringInputRowParser( new CSVParseSpec( @@ -77,7 +79,7 @@ public FileIteratingFirehoseTest(List texts, int numSkipHeadRows) ",", ImmutableList.of("ts", "x"), false, - numSkipHeadRows + numSkipHeaderRows ), null ); @@ -85,14 +87,15 @@ public FileIteratingFirehoseTest(List texts, int numSkipHeadRows) this.inputs = texts; this.expectedResults = inputs.stream() .map(input -> input.split("\n")) - .filter(lines -> numSkipHeadRows < lines.length) .flatMap(lines -> { - final List skippedLines = Arrays.asList(lines) - .subList(numSkipHeadRows, lines.length); - return skippedLines.stream() - .filter(line -> line.length() > 0) - .map(line -> line.split(",")[1]) - .collect(Collectors.toList()).stream(); + final List filteredLines = Arrays.asList(lines).stream() + .filter(line -> line.length() > 0) + .map(line -> line.split(",")[1]) + .collect(Collectors.toList()); 
+ + final int numRealSkippedRows = Math.min(filteredLines.size(), numSkipHeaderRows); + IntStream.range(0, numRealSkippedRows).forEach(i -> filteredLines.set(i, null)); + return filteredLines.stream(); }) .collect(Collectors.toList()); } @@ -104,13 +107,19 @@ public void testFirehose() throws Exception .map(s -> new LineIterator(new StringReader(s))) .collect(Collectors.toList()); - final FileIteratingFirehose firehose = new FileIteratingFirehose(lineIterators.iterator(), parser); - final List results = Lists.newArrayList(); + try (final FileIteratingFirehose firehose = new FileIteratingFirehose(lineIterators.iterator(), parser)) { + final List results = Lists.newArrayList(); - while (firehose.hasMore()) { - results.add(Joiner.on("|").join(firehose.nextRow().getDimension("x"))); - } + while (firehose.hasMore()) { + final InputRow inputRow = firehose.nextRow(); + if (inputRow == null) { + results.add(null); + } else { + results.add(Joiner.on("|").join(inputRow.getDimension("x"))); + } + } - Assert.assertEquals(expectedResults, results); + Assert.assertEquals(expectedResults, results); + } } } diff --git a/docs/content/development/extensions-core/lookups-cached-global.md b/docs/content/development/extensions-core/lookups-cached-global.md index 72d346201c1d..1fee40d12e56 100644 --- a/docs/content/development/extensions-core/lookups-cached-global.md +++ b/docs/content/development/extensions-core/lookups-cached-global.md @@ -195,12 +195,13 @@ The `namespaceParseSpec` can be one of a number of values. Each of the examples Only ONE file which matches the search will be used. For most implementations, the discriminator for choosing the URIs is by whichever one reports the most recent timestamp for its modification time. 
### csv lookupParseSpec - |Parameter|Description|Required|Default| |---------|-----------|--------|-------| -|`columns`|The list of columns in the csv file|yes|`null`| +|`columns`|The list of columns in the csv file|no if `hasHeaderRow` is set|`null`| |`keyColumn`|The name of the column containing the key|no|The first column| |`valueColumn`|The name of the column containing the value|no|The second column| +|`hasHeaderRow`|A flag to indicate that column information can be extracted from the input files' header row|no|false| +|`skipHeaderRows`|Number of header rows to be skipped|no|0| *example input* @@ -222,7 +223,6 @@ truck,something3,buck ``` ### tsv lookupParseSpec - |Parameter|Description|Required|Default| |---------|-----------|--------|-------| |`columns`|The list of columns in the csv file|yes|`null`| @@ -230,6 +230,8 @@ truck,something3,buck |`valueColumn`|The name of the column containing the value|no|The second column| |`delimiter`|The delimiter in the file|no|tab (`\t`)| |`listDelimiter`|The list delimiter in the file|no| (`\u0001`)| +|`hasHeaderRow`|A flag to indicate that column information can be extracted from the input files' header row|no|false| +|`skipHeaderRows`|Number of header rows to be skipped|no|0| *example input* diff --git a/docs/content/ingestion/data-formats.md b/docs/content/ingestion/data-formats.md index c7f4ba0741ea..381b3fdba851 100644 --- a/docs/content/ingestion/data-formats.md +++ b/docs/content/ingestion/data-formats.md @@ -73,7 +73,7 @@ If you have nested JSON, [Druid can automatically flatten it for you](flatten-js ### CSV ```json - "parseSpec":{ + "parseSpec": { "format" : "csv", "timestampSpec" : { "column" : "timestamp" @@ -81,16 +81,17 @@ If you have nested JSON, [Druid can automatically flatten it for you](flatten-js "columns" : ["timestamp","page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city","added","deleted","delta"], "dimensionsSpec" : { "dimensions" : 
["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] - }, - "skipHeaderRows" : 5 + } } ``` #### CSV Index Tasks -If your file does not have a header as the first line of the file, you must set the `columns` field and ensure that the order of the fields matches the columns of your input data in the same order. -If your file does have a header, you can set a field called `hasHeaderRow` to true, and do not include the `columns` key. -Additionally, you can set the number of head rows to be skipped to `skipHeaderRows`. Note that this option is effective +If your input files contain a header, the `columns` field is optional and you don't need to set. +Instead, you can set the `hasHeaderRow` field to true, which makes Druid automatically extract the column information from the header. +Otherwise, you must set the `columns` field and ensure that field must match the columns of your input data in the same order. + +Also, you can skip some header rows by setting `skipHeaderRows` in your parseSpec. Note that `hasHeaderRow` and `skipHeaderRows` are effective only for non-Hadoop batch index tasks. #### Other CSV Ingestion Tasks @@ -109,18 +110,19 @@ The `columns` field must be included and and ensure that the order of the fields "delimiter":"|", "dimensionsSpec" : { "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] - }, - "skipHeaderRows" : 5 + } } ``` +Be sure to change the `delimiter` to the appropriate delimiter for your data. Like CSV, you must specify the columns and which subset of the columns you want indexed. + #### TSV (Delimited) Index Tasks -If your file does not have a header as the first line of the file, you must set the `columns` field and ensure that the order of the fields matches the columns of your input data in the same order. 
-If your file does have a header, you can set a field called `hasHeaderRow` to true, and do not include the `columns` key. +If your input files contain a header, the `columns` field is optional and you don't need to set. +Instead, you can set the `hasHeaderRow` field to true, which makes Druid automatically extract the column information from the header. +Otherwise, you must set the `columns` field and ensure that field must match the columns of your input data in the same order. -Be sure to change the `delimiter` to the appropriate delimiter for your data. Like CSV, you must specify the columns and which subset of the columns you want indexed. -Additionally, you can set the number of head rows to be skipped to `skipHeaderRows`. Note that this option is effective +Also, you can skip some header rows by setting `skipHeaderRows` in your parseSpec. Note that `hasHeaderRow` and `skipHeaderRows` are effective only for non-Hadoop batch index tasks. #### Other TSV (Delimited) Ingestion Tasks diff --git a/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java b/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java index a5103a4e2485..1c6bfd0447ef 100644 --- a/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java +++ b/extensions-core/lookups-cached-global/src/main/java/io/druid/query/lookup/namespace/URIExtractionNamespace.java @@ -27,6 +27,7 @@ import com.fasterxml.jackson.annotation.JsonTypeName; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.base.Strings; @@ -262,7 +263,9 @@ public static class CSVFlatDataParser implements FlatDataParser public CSVFlatDataParser( 
@JsonProperty("columns") List columns, @JsonProperty("keyColumn") final String keyColumn, - @JsonProperty("valueColumn") final String valueColumn + @JsonProperty("valueColumn") final String valueColumn, + @JsonProperty("hasHeaderRow") boolean hasHeaderRow, + @JsonProperty("skipHeaderRows") int skipHeaderRows ) { Preconditions.checkArgument( @@ -291,12 +294,22 @@ public CSVFlatDataParser( ); this.parser = new DelegateParser( - new CSVParser(Optional.absent(), columns), + new CSVParser(Optional.absent(), columns, hasHeaderRow, skipHeaderRows), this.keyColumn, this.valueColumn ); } + @VisibleForTesting + CSVFlatDataParser( + List columns, + String keyColumn, + String valueColumn + ) + { + this(columns, keyColumn, valueColumn, false, 0); + } + @JsonProperty public List getColumns() { @@ -371,7 +384,9 @@ public TSVFlatDataParser( @JsonProperty("delimiter") String delimiter, @JsonProperty("listDelimiter") String listDelimiter, @JsonProperty("keyColumn") final String keyColumn, - @JsonProperty("valueColumn") final String valueColumn + @JsonProperty("valueColumn") final String valueColumn, + @JsonProperty("hasHeaderRow") boolean hasHeaderRow, + @JsonProperty("skipHeaderRows") int skipHeaderRows ) { Preconditions.checkArgument( @@ -380,7 +395,9 @@ public TSVFlatDataParser( ); final DelimitedParser delegate = new DelimitedParser( Optional.fromNullable(Strings.emptyToNull(delimiter)), - Optional.fromNullable(Strings.emptyToNull(listDelimiter)) + Optional.fromNullable(Strings.emptyToNull(listDelimiter)), + hasHeaderRow, + skipHeaderRows ); Preconditions.checkArgument( !(Strings.isNullOrEmpty(keyColumn) ^ Strings.isNullOrEmpty(valueColumn)), @@ -408,6 +425,18 @@ public TSVFlatDataParser( this.parser = new DelegateParser(delegate, this.keyColumn, this.valueColumn); } + @VisibleForTesting + TSVFlatDataParser( + List columns, + String delimiter, + String listDelimiter, + String keyColumn, + String valueColumn + ) + { + this(columns, delimiter, listDelimiter, keyColumn, 
valueColumn, false, 0); + } + @JsonProperty public List getColumns() { diff --git a/indexing-service/src/main/java/io/druid/indexing/common/task/IndexTask.java b/indexing-service/src/main/java/io/druid/indexing/common/task/IndexTask.java index 70f6140da070..fd36470b111d 100644 --- a/indexing-service/src/main/java/io/druid/indexing/common/task/IndexTask.java +++ b/indexing-service/src/main/java/io/druid/indexing/common/task/IndexTask.java @@ -272,7 +272,7 @@ private Map> determineShardSpecs( while (firehose.hasMore()) { final InputRow inputRow = firehose.nextRow(); - // If a header is present in the data (and with proper configurations), a null InputRow will get created + // The null inputRow means the caller must skip this row. if (inputRow == null) { continue; } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java index 377e8138ada4..a22636512d3a 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java @@ -19,6 +19,7 @@ package io.druid.java.util.common.parsers; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Function; import com.google.common.base.Optional; import com.google.common.base.Splitter; @@ -60,54 +61,59 @@ public Object apply(String input) private final String listDelimiter; private final Splitter listSplitter; private final Function valueFunction; - private final boolean hasHeaderRow; + private final int maxSkipHeaderRows; private final au.com.bytecode.opencsv.CSVParser parser = new au.com.bytecode.opencsv.CSVParser(); private ArrayList fieldNames = null; private boolean hasParsedHeader = false; + private int skippedHeaderRows; + private boolean supportSkipHeaderRows; - public CSVParser(final Optional listDelimiter) + public CSVParser( + final Optional listDelimiter, + final boolean hasHeaderRow, + 
final int maxSkipHeaderRows + ) { this.listDelimiter = listDelimiter.isPresent() ? listDelimiter.get() : Parsers.DEFAULT_LIST_DELIMITER; this.listSplitter = Splitter.on(this.listDelimiter); this.valueFunction = getValueFunction(this.listDelimiter, this.listSplitter); - this.hasHeaderRow = false; + this.hasHeaderRow = hasHeaderRow; + this.maxSkipHeaderRows = maxSkipHeaderRows; } - public CSVParser(final Optional listDelimiter, final Iterable fieldNames) + public CSVParser( + final Optional listDelimiter, + final Iterable fieldNames, + final boolean hasHeaderRow, + final int maxSkipHeaderRows + ) { - this(listDelimiter); + this(listDelimiter, hasHeaderRow, maxSkipHeaderRows); setFieldNames(fieldNames); } - public CSVParser(final Optional listDelimiter, final String header) + @VisibleForTesting + CSVParser(final Optional listDelimiter, final String header) { - this(listDelimiter); + this(listDelimiter, false, 0); setFieldNames(header); } - public CSVParser( - final Optional listDelimiter, - final Iterable fieldNames, - final boolean hasHeaderRow - ) + public String getListDelimiter() { - this.listDelimiter = listDelimiter.isPresent() ? listDelimiter.get() : Parsers.DEFAULT_LIST_DELIMITER; - this.listSplitter = Splitter.on(this.listDelimiter); - this.valueFunction = getValueFunction(this.listDelimiter, this.listSplitter); - this.hasHeaderRow = hasHeaderRow; - - setFieldNames(fieldNames); + return listDelimiter; } - public String getListDelimiter() + @Override + public void startFileFromBeginning() { - return listDelimiter; + supportSkipHeaderRows = true; } @Override @@ -138,6 +144,11 @@ public void setFieldNames(final String header) @Override public Map parse(final String input) { + if (!supportSkipHeaderRows && (hasHeaderRow || maxSkipHeaderRows > 0)) { + throw new UnsupportedOperationException("hasHeaderRow or maxSkipHeaderRows is not supported. 
" + + "Please check the indexTask supports these options."); + } + try { String[] values = parser.parseLine(input); @@ -149,6 +160,11 @@ public Map parse(final String input) return null; } + if (skippedHeaderRows < maxSkipHeaderRows) { + skippedHeaderRows++; + return null; + } + if (fieldNames == null) { setFieldNames(ParserUtils.generateFieldNames(values.length)); } @@ -164,5 +180,6 @@ public Map parse(final String input) public void reset() { hasParsedHeader = false; + skippedHeaderRows = 0; } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java index ddf43300fdc9..c90ae63c0f5c 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java @@ -19,6 +19,7 @@ package io.druid.java.util.common.parsers; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Function; import com.google.common.base.Optional; import com.google.common.base.Preconditions; @@ -36,42 +37,44 @@ public class DelimitedParser implements Parser { private static final String DEFAULT_DELIMITER = "\t"; - private static final Function getValueFunction( + private static Function getValueFunction( final String listDelimiter, final Splitter listSplitter ) { - return new Function() - { - @Override - public Object apply(String input) - { - if (input.contains(listDelimiter)) { - return Lists.newArrayList( - Iterables.transform( - listSplitter.split(input), - ParserUtils.nullEmptyStringFunction - ) - ); - } else { - return ParserUtils.nullEmptyStringFunction.apply(input); - } + return (input) -> { + if (input.contains(listDelimiter)) { + return Lists.newArrayList( + Iterables.transform( + listSplitter.split(input), + ParserUtils.nullEmptyStringFunction + ) + ); + } else { + return ParserUtils.nullEmptyStringFunction.apply(input); } }; } 
private final String delimiter; private final String listDelimiter; - private final Splitter splitter; private final Splitter listSplitter; private final Function valueFunction; private final boolean hasHeaderRow; + private final int maxSkipHeaderRows; private ArrayList fieldNames = null; private boolean hasParsedHeader = false; + private int skippedHeaderRows; + private boolean supportSkipHeaderRows; - public DelimitedParser(final Optional delimiter, Optional listDelimiter) + public DelimitedParser( + final Optional delimiter, + final Optional listDelimiter, + final boolean hasHeaderRow, + final int maxSkipHeaderRows + ) { this.delimiter = delimiter.isPresent() ? delimiter.get() : DEFAULT_DELIMITER; this.listDelimiter = listDelimiter.isPresent() ? listDelimiter.get() : Parsers.DEFAULT_LIST_DELIMITER; @@ -85,51 +88,31 @@ public DelimitedParser(final Optional delimiter, Optional listDe this.splitter = Splitter.on(this.delimiter); this.listSplitter = Splitter.on(this.listDelimiter); this.valueFunction = getValueFunction(this.listDelimiter, this.listSplitter); - this.hasHeaderRow = false; + this.hasHeaderRow = hasHeaderRow; + this.maxSkipHeaderRows = maxSkipHeaderRows; } public DelimitedParser( final Optional delimiter, final Optional listDelimiter, - final Iterable fieldNames + final Iterable fieldNames, + final boolean hasHeaderRow, + final int maxSkipHeaderRows ) { - this(delimiter, listDelimiter); + this(delimiter, listDelimiter, hasHeaderRow, maxSkipHeaderRows); setFieldNames(fieldNames); } - public DelimitedParser(final Optional delimiter, final Optional listDelimiter, final String header) + @VisibleForTesting + DelimitedParser(final Optional delimiter, final Optional listDelimiter, final String header) { - this(delimiter, listDelimiter); + this(delimiter, listDelimiter, false, 0); setFieldNames(header); } - public DelimitedParser( - final Optional delimiter, - final Optional listDelimiter, - final Iterable fieldNames, - final boolean hasHeaderRow - ) - { - 
this.delimiter = delimiter.isPresent() ? delimiter.get() : DEFAULT_DELIMITER; - this.listDelimiter = listDelimiter.isPresent() ? listDelimiter.get() : Parsers.DEFAULT_LIST_DELIMITER; - - Preconditions.checkState( - !this.delimiter.equals(this.listDelimiter), - "Cannot have same delimiter and list delimiter of [%s]", - this.delimiter - ); - - this.splitter = Splitter.on(this.delimiter); - this.listSplitter = Splitter.on(this.listDelimiter); - this.valueFunction = getValueFunction(this.listDelimiter, this.listSplitter); - this.hasHeaderRow = hasHeaderRow; - - setFieldNames(fieldNames); - } - public String getDelimiter() { return delimiter; @@ -140,6 +123,12 @@ public String getListDelimiter() return listDelimiter; } + @Override + public void startFileFromBeginning() + { + supportSkipHeaderRows = true; + } + @Override public List getFieldNames() { @@ -168,6 +157,11 @@ public void setFieldNames(String header) @Override public Map parse(final String input) { + if (!supportSkipHeaderRows && (hasHeaderRow || maxSkipHeaderRows > 0)) { + throw new UnsupportedOperationException("hasHeaderRow or maxSkipHeaderRows is not supported. 
" + + "Please check the indexTask supports these options."); + } + try { Iterable values = splitter.split(input); @@ -179,6 +173,11 @@ public Map parse(final String input) return null; } + if (skippedHeaderRows < maxSkipHeaderRows) { + skippedHeaderRows++; + return null; + } + if (fieldNames == null) { setFieldNames(ParserUtils.generateFieldNames(Iterators.size(values.iterator()))); } @@ -194,5 +193,6 @@ public Map parse(final String input) public void reset() { hasParsedHeader = false; + skippedHeaderRows = 0; } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java index d284317a620f..ca9b785f0fcb 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java @@ -19,6 +19,7 @@ package io.druid.java.util.common.parsers; +import javax.annotation.Nullable; import java.util.List; import java.util.Map; @@ -28,16 +29,25 @@ public interface Parser { /** - * Parse a String into a Map. + * Initialize this parser for centralized batch processing of files like IndexTask. + */ + default void startFileFromBeginning() + { + + } + + /** + * Parse a String into a Map. The result can be null which means the given input string will be ignored. * * @throws ParseException if the String cannot be parsed */ - public Map parse(String input); + @Nullable + Map parse(String input); /** * Resets state within a parser. This may or may not get called at the start of reading of every file. */ - public default void reset() + default void reset() { // do nothing } @@ -48,12 +58,12 @@ public default void reset() * parser) and those parsers have their own way of setting field names. */ @Deprecated - public void setFieldNames(Iterable fieldNames); + void setFieldNames(Iterable fieldNames); /** * Returns the fieldNames that we expect to see in parsed Maps, if known, or null otherwise. 
Deprecated; Parsers * should not, in general, be expected to know what fields they will return. */ @Deprecated - public List getFieldNames(); + List getFieldNames(); } diff --git a/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java b/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java index cc74942f186b..32ef1783ca3a 100644 --- a/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java +++ b/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java @@ -80,7 +80,7 @@ public void testCSVParserWithHeader() @Test public void testCSVParserWithoutHeader() { - final Parser csvParser = new CSVParser(Optional.fromNullable(null)); + final Parser csvParser = new CSVParser(Optional.fromNullable(null), false, 0); String body = "hello,world,foo"; final Map jsonMap = csvParser.parse(body); Assert.assertEquals( @@ -89,4 +89,31 @@ public void testCSVParserWithoutHeader() jsonMap ); } + + @Test + public void testCSVParserWithSkipHeaderRows() + { + final int skipHeaderRows = 2; + final Parser csvParser = new CSVParser( + Optional.absent(), + false, + skipHeaderRows + ); + csvParser.startFileFromBeginning(); + final String[] body = new String[] { + "header,line,1", + "header,line,2", + "hello,world,foo" + }; + int index; + for (index = 0; index < skipHeaderRows; index++) { + Assert.assertNull(csvParser.parse(body[index])); + } + final Map jsonMap = csvParser.parse(body[index]); + Assert.assertEquals( + "jsonMap", + ImmutableMap.of("column_1", "hello", "column_2", "world", "column_3", "foo"), + jsonMap + ); + } } diff --git a/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java b/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java index fd7f795f2f6f..a1659ea702e0 100644 --- a/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java +++ 
b/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java @@ -84,7 +84,12 @@ public void testTSVParserWithHeader() @Test public void testTSVParserWithoutHeader() { - final Parser delimitedParser = new DelimitedParser(Optional.of("\t"), Optional.absent()); + final Parser delimitedParser = new DelimitedParser( + Optional.of("\t"), + Optional.absent(), + false, + 0 + ); String body = "hello\tworld\tfoo"; final Map jsonMap = delimitedParser.parse(body); Assert.assertEquals( @@ -93,4 +98,32 @@ public void testTSVParserWithoutHeader() jsonMap ); } + + @Test + public void testTSVParserWithSkipHeaderRows() + { + final int skipHeaderRows = 2; + final Parser delimitedParser = new DelimitedParser( + Optional.of("\t"), + Optional.absent(), + false, + skipHeaderRows + ); + delimitedParser.startFileFromBeginning(); + final String[] body = new String[] { + "header\tline\t1", + "header\tline\t2", + "hello\tworld\tfoo" + }; + int index; + for (index = 0; index < skipHeaderRows; index++) { + Assert.assertNull(delimitedParser.parse(body[index])); + } + final Map jsonMap = delimitedParser.parse(body[index]); + Assert.assertEquals( + "jsonMap", + ImmutableMap.of("column_1", "hello", "column_2", "world", "column_3", "foo"), + jsonMap + ); + } } diff --git a/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java b/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java index c50367a796ba..2bca6eae939a 100644 --- a/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java +++ b/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java @@ -163,7 +163,6 @@ public ReplayableFirehose(InputRowParser parser) throws IOException try { InputRow row = delegateFirehose.nextRow(); - // If a header is present in the data (and with proper configurations), a null InputRow will get created if (row == null) { continue; } From 
d447f37fe5d91683fdfac9008c70f86c37631c53 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Wed, 10 May 2017 10:26:14 +0900 Subject: [PATCH 17/20] Add more tests --- .../util/common/parsers/CSVParserTest.java | 17 +++++++++++++++++ .../common/parsers/DelimitedParserTest.java | 18 ++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java b/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java index 32ef1783ca3a..37a589b276a2 100644 --- a/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java +++ b/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java @@ -116,4 +116,21 @@ public void testCSVParserWithSkipHeaderRows() jsonMap ); } + + @Test(expected = UnsupportedOperationException.class) + public void testCSVParserWithoutStartFileFromBeginning() + { + final int skipHeaderRows = 2; + final Parser csvParser = new CSVParser( + Optional.absent(), + false, + skipHeaderRows + ); + final String[] body = new String[] { + "header\tline\t1", + "header\tline\t2", + "hello\tworld\tfoo" + }; + csvParser.parse(body[0]); + } } diff --git a/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java b/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java index a1659ea702e0..d91ed25cbbc2 100644 --- a/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java +++ b/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java @@ -126,4 +126,22 @@ public void testTSVParserWithSkipHeaderRows() jsonMap ); } + + @Test(expected = UnsupportedOperationException.class) + public void testTSVParserWithoutStartFileFromBeginning() + { + final int skipHeaderRows = 2; + final Parser delimitedParser = new DelimitedParser( + Optional.of("\t"), + Optional.absent(), + false, + skipHeaderRows + ); + final String[] body = new String[] { + "header\tline\t1", + 
"header\tline\t2", + "hello\tworld\tfoo" + }; + delimitedParser.parse(body[0]); + } } From 2b25c9041fcba07ec0a5bfffd98c3f6b789bab85 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Wed, 10 May 2017 10:34:20 +0900 Subject: [PATCH 18/20] Add a comment to ReplayableFirehose --- .../segment/realtime/firehose/ReplayableFirehoseFactory.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java b/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java index 2bca6eae939a..08cd87ac0893 100644 --- a/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java +++ b/server/src/main/java/io/druid/segment/realtime/firehose/ReplayableFirehoseFactory.java @@ -163,6 +163,11 @@ public ReplayableFirehose(InputRowParser parser) throws IOException try { InputRow row = delegateFirehose.nextRow(); + // delegateFirehose may return a null row if the underlying parser returns null. + // This should be written, but deserialization of null values causes an error of + // 'com.fasterxml.jackson.databind.RuntimeJsonMappingException: Can not deserialize instance of io.druid.data.input.MapBasedRow out of VALUE_NULL token' + // Since ReplayableFirehoseFactory is planed to be removed in https://github.com/druid-io/druid/pull/4193, + // we simply skip null rows for now. 
if (row == null) { continue; } From a9e7679e756c4c55072ec04e35d275a102013d21 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 11 May 2017 16:28:46 +0900 Subject: [PATCH 19/20] Addressing comments --- .../druid/data/input/impl/CSVParseSpec.java | 2 +- .../data/input/impl/DelimitedParseSpec.java | 2 +- .../input/impl/FileIteratingFirehose.java | 2 +- .../data/input/impl/StringInputRowParser.java | 5 ++--- docs/content/ingestion/data-formats.md | 16 ++++++++++++---- .../java/util/common/parsers/CSVParser.java | 19 +++++++------------ .../util/common/parsers/DelimitedParser.java | 19 +++++++------------ .../java/util/common/parsers/Parser.java | 11 ++--------- .../common/parsers/ToLowerCaseParser.java | 12 ++++++------ 9 files changed, 39 insertions(+), 49 deletions(-) diff --git a/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java b/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java index 1d71f34fb93a..a4b09a2010a9 100644 --- a/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/CSVParseSpec.java @@ -87,7 +87,7 @@ public boolean isHasHeaderRow() } @JsonProperty("skipHeaderRows") - public Integer getSkipHeaderRows() + public int getSkipHeaderRows() { return skipHeaderRows; } diff --git a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java index 05739f998b1e..c3383eb351c4 100644 --- a/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java +++ b/api/src/main/java/io/druid/data/input/impl/DelimitedParseSpec.java @@ -96,7 +96,7 @@ public boolean isHasHeaderRow() } @JsonProperty("skipHeaderRows") - public Integer getSkipHeaderRows() + public int getSkipHeaderRows() { return skipHeaderRows; } diff --git a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java index 3a0327bb4d09..648831bece31 100644 --- 
a/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java +++ b/api/src/main/java/io/druid/data/input/impl/FileIteratingFirehose.java @@ -73,7 +73,7 @@ private LineIterator getNextLineIterator() } final LineIterator iterator = lineIterators.next(); - parser.reset(); + parser.startFileFromBeginning(); return iterator; } diff --git a/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java b/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java index 0c10256b6afb..a640ef10ac41 100644 --- a/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java +++ b/api/src/main/java/io/druid/data/input/impl/StringInputRowParser.java @@ -57,7 +57,6 @@ public StringInputRowParser( this.parseSpec = parseSpec; this.mapParser = new MapInputRowParser(parseSpec); this.parser = parseSpec.makeParser(); - parser.startFileFromBeginning(); if (encoding != null) { this.charset = Charset.forName(encoding); @@ -125,9 +124,9 @@ private Map buildStringKeyMap(ByteBuffer input) return theMap; } - public void reset() + public void startFileFromBeginning() { - parser.reset(); + parser.startFileFromBeginning(); } @Nullable diff --git a/docs/content/ingestion/data-formats.md b/docs/content/ingestion/data-formats.md index 381b3fdba851..266dc458ca4e 100644 --- a/docs/content/ingestion/data-formats.md +++ b/docs/content/ingestion/data-formats.md @@ -91,8 +91,12 @@ If your input files contain a header, the `columns` field is optional and you do Instead, you can set the `hasHeaderRow` field to true, which makes Druid automatically extract the column information from the header. Otherwise, you must set the `columns` field and ensure that field must match the columns of your input data in the same order. -Also, you can skip some header rows by setting `skipHeaderRows` in your parseSpec. Note that `hasHeaderRow` and `skipHeaderRows` are effective -only for non-Hadoop batch index tasks. 
+Also, you can skip some header rows by setting `skipHeaderRows` in your parseSpec. If both `skipHeaderRows` and `hasHeaderRow` options are set, +`skipHeaderRows` is fist applied. For example, if you set `skipHeaderRows` to 2 and `hasHeaderRow` to true, Druid will +skip the first two lines and then extract column information from the third line. + +Note that `hasHeaderRow` and `skipHeaderRows` are effective only for non-Hadoop batch index tasks. Other types of index +tasks will fail with an exception. #### Other CSV Ingestion Tasks @@ -122,8 +126,12 @@ If your input files contain a header, the `columns` field is optional and you do Instead, you can set the `hasHeaderRow` field to true, which makes Druid automatically extract the column information from the header. Otherwise, you must set the `columns` field and ensure that field must match the columns of your input data in the same order. -Also, you can skip some header rows by setting `skipHeaderRows` in your parseSpec. Note that `hasHeaderRow` and `skipHeaderRows` are effective -only for non-Hadoop batch index tasks. +Also, you can skip some header rows by setting `skipHeaderRows` in your parseSpec. If both `skipHeaderRows` and `hasHeaderRow` options are set, +`skipHeaderRows` is fist applied. For example, if you set `skipHeaderRows` to 2 and `hasHeaderRow` to true, Druid will +skip the first two lines and then extract column information from the third line. + +Note that `hasHeaderRow` and `skipHeaderRows` are effective only for non-Hadoop batch index tasks. Other types of index +tasks will fail with an exception. 
#### Other TSV (Delimited) Ingestion Tasks diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java index a22636512d3a..957e72560d2a 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java @@ -114,6 +114,8 @@ public String getListDelimiter() public void startFileFromBeginning() { supportSkipHeaderRows = true; + hasParsedHeader = false; + skippedHeaderRows = 0; } @Override @@ -152,6 +154,11 @@ public Map parse(final String input) try { String[] values = parser.parseLine(input); + if (skippedHeaderRows < maxSkipHeaderRows) { + skippedHeaderRows++; + return null; + } + if (hasHeaderRow && !hasParsedHeader) { if (fieldNames == null) { setFieldNames(Arrays.asList(values)); @@ -160,11 +167,6 @@ public Map parse(final String input) return null; } - if (skippedHeaderRows < maxSkipHeaderRows) { - skippedHeaderRows++; - return null; - } - if (fieldNames == null) { setFieldNames(ParserUtils.generateFieldNames(values.length)); } @@ -175,11 +177,4 @@ public Map parse(final String input) throw new ParseException(e, "Unable to parse row [%s]", input); } } - - @Override - public void reset() - { - hasParsedHeader = false; - skippedHeaderRows = 0; - } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java index c90ae63c0f5c..27114df31e95 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java @@ -127,6 +127,8 @@ public String getListDelimiter() public void startFileFromBeginning() { supportSkipHeaderRows = true; + hasParsedHeader = false; + skippedHeaderRows = 0; } @Override @@ -165,6 +167,11 @@ public Map parse(final String input) try 
{ Iterable values = splitter.split(input); + if (skippedHeaderRows < maxSkipHeaderRows) { + skippedHeaderRows++; + return null; + } + if (hasHeaderRow && !hasParsedHeader) { if (fieldNames == null) { setFieldNames(values); @@ -173,11 +180,6 @@ public Map parse(final String input) return null; } - if (skippedHeaderRows < maxSkipHeaderRows) { - skippedHeaderRows++; - return null; - } - if (fieldNames == null) { setFieldNames(ParserUtils.generateFieldNames(Iterators.size(values.iterator()))); } @@ -188,11 +190,4 @@ public Map parse(final String input) throw new ParseException(e, "Unable to parse row [%s]", input); } } - - @Override - public void reset() - { - hasParsedHeader = false; - skippedHeaderRows = 0; - } } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java index ca9b785f0fcb..8cc6fd6a1d6b 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/Parser.java @@ -29,7 +29,8 @@ public interface Parser { /** - * Initialize this parser for centralized batch processing of files like IndexTask. + * This method may or may not get called at the start of reading of every file depending on the type of IndexTasks. + * The parser state should be reset if it exists. */ default void startFileFromBeginning() { @@ -44,14 +45,6 @@ default void startFileFromBeginning() @Nullable Map parse(String input); - /** - * Resets state within a parser. This may or may not get called at the start of reading of every file. - */ - default void reset() - { - // do nothing - } - /** * Set the fieldNames that you expect to see in parsed Maps. Deprecated; Parsers should not, in general, be * expected to know what fields they will return.
Some individual types of parsers do need to know (like a TSV diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/ToLowerCaseParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/ToLowerCaseParser.java index 7eef985e53eb..fede1aa1f98c 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/ToLowerCaseParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/ToLowerCaseParser.java @@ -54,20 +54,20 @@ public Map parse(String input) } @Override - public void setFieldNames(Iterable fieldNames) + public void startFileFromBeginning() { - baseParser.setFieldNames(fieldNames); + baseParser.startFileFromBeginning(); } @Override - public List getFieldNames() + public void setFieldNames(Iterable fieldNames) { - return baseParser.getFieldNames(); + baseParser.setFieldNames(fieldNames); } @Override - public void reset() + public List getFieldNames() { - baseParser.reset(); + return baseParser.getFieldNames(); } } From 5290907169f91d6b98fec92497f77a03665c67a1 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Tue, 16 May 2017 09:45:24 +0900 Subject: [PATCH 20/20] Add docs and fix typos --- .../development/extensions-core/lookups-cached-global.md | 9 ++++++++- docs/content/ingestion/data-formats.md | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/content/development/extensions-core/lookups-cached-global.md b/docs/content/development/extensions-core/lookups-cached-global.md index 1fee40d12e56..385cc07ad7cd 100644 --- a/docs/content/development/extensions-core/lookups-cached-global.md +++ b/docs/content/development/extensions-core/lookups-cached-global.md @@ -203,6 +203,10 @@ Only ONE file which matches the search will be used. 
For most implementations, t |`hasHeaderRow`|A flag to indicate that column information can be extracted from the input files' header row|no|false| |`skipHeaderRows`|Number of header rows to be skipped|no|0| +If both `skipHeaderRows` and `hasHeaderRow` options are set, `skipHeaderRows` is first applied. For example, if you set +`skipHeaderRows` to 2 and `hasHeaderRow` to true, Druid will skip the first two lines and then extract column information +from the third line. + *example input* ``` @@ -225,7 +229,7 @@ truck,something3,buck ### tsv lookupParseSpec |Parameter|Description|Required|Default| |---------|-----------|--------|-------| -|`columns`|The list of columns in the csv file|yes|`null`| +|`columns`|The list of columns in the tsv file|yes|`null`| |`keyColumn`|The name of the column containing the key|no|The first column| |`valueColumn`|The name of the column containing the value|no|The second column| |`delimiter`|The delimiter in the file|no|tab (`\t`)| @@ -233,6 +237,9 @@ truck,something3,buck |`hasHeaderRow`|A flag to indicate that column information can be extracted from the input files' header row|no|false| |`skipHeaderRows`|Number of header rows to be skipped|no|0| +If both `skipHeaderRows` and `hasHeaderRow` options are set, `skipHeaderRows` is first applied. For example, if you set +`skipHeaderRows` to 2 and `hasHeaderRow` to true, Druid will skip the first two lines and then extract column information +from the third line. *example input* diff --git a/docs/content/ingestion/data-formats.md b/docs/content/ingestion/data-formats.md index 266dc458ca4e..859f75e12be8 100644 --- a/docs/content/ingestion/data-formats.md +++ b/docs/content/ingestion/data-formats.md @@ -92,7 +92,7 @@ Instead, you can set the `hasHeaderRow` field to true, which makes Druid automat Otherwise, you must set the `columns` field and ensure that field must match the columns of your input data in the same order. 
Also, you can skip some header rows by setting `skipHeaderRows` in your parseSpec. If both `skipHeaderRows` and `hasHeaderRow` options are set, -`skipHeaderRows` is fist applied. For example, if you set `skipHeaderRows` to 2 and `hasHeaderRow` to true, Druid will +`skipHeaderRows` is first applied. For example, if you set `skipHeaderRows` to 2 and `hasHeaderRow` to true, Druid will skip the first two lines and then extract column information from the third line. Note that `hasHeaderRow` and `skipHeaderRows` are effective only for non-Hadoop batch index tasks. Other types of index @@ -127,7 +127,7 @@ Instead, you can set the `hasHeaderRow` field to true, which makes Druid automat Otherwise, you must set the `columns` field and ensure that field must match the columns of your input data in the same order. Also, you can skip some header rows by setting `skipHeaderRows` in your parseSpec. If both `skipHeaderRows` and `hasHeaderRow` options are set, -`skipHeaderRows` is fist applied. For example, if you set `skipHeaderRows` to 2 and `hasHeaderRow` to true, Druid will +`skipHeaderRows` is first applied. For example, if you set `skipHeaderRows` to 2 and `hasHeaderRow` to true, Druid will skip the first two lines and then extract column information from the third line. Note that `hasHeaderRow` and `skipHeaderRows` are effective only for non-Hadoop batch index tasks. Other types of index