From fa5c3bb014e557ee6dc6f2278f2ed64cf1584700 Mon Sep 17 00:00:00 2001 From: Himanshu Gupta Date: Sat, 19 Dec 2015 02:58:54 -0600 Subject: [PATCH 1/3] adding decorate(DimensionSelector) to DimensionSpec to enable support for arbitrary filtering/transformations to returned dimension values --- .../CardinalityAggregatorFactory.java | 3 +- .../query/dimension/DefaultDimensionSpec.java | 7 ++ .../druid/query/dimension/DimensionSpec.java | 15 ++- .../dimension/ExtractionDimensionSpec.java | 7 ++ .../query/groupby/GroupByQueryEngine.java | 5 +- .../druid/query/search/SearchQueryRunner.java | 2 +- .../druid/query/select/SelectQueryEngine.java | 3 +- .../java/io/druid/query/topn/TopNMapFn.java | 3 +- .../druid/segment/ColumnSelectorFactory.java | 6 +- .../segment/QueryableIndexStorageAdapter.java | 15 ++- .../segment/filter/ExtractionFilter.java | 5 +- .../druid/segment/filter/SelectorFilter.java | 5 +- .../segment/incremental/IncrementalIndex.java | 15 ++- .../IncrementalIndexStorageAdapter.java | 14 ++- .../aggregation/FilteredAggregatorTest.java | 94 ++++++++++--------- .../IncrementalIndexStorageAdapterTest.java | 5 +- .../firehose/IngestSegmentFirehose.java | 5 +- 17 files changed, 136 insertions(+), 73 deletions(-) diff --git a/processing/src/main/java/io/druid/query/aggregation/cardinality/CardinalityAggregatorFactory.java b/processing/src/main/java/io/druid/query/aggregation/cardinality/CardinalityAggregatorFactory.java index a87dbb512406..6b177116e036 100644 --- a/processing/src/main/java/io/druid/query/aggregation/cardinality/CardinalityAggregatorFactory.java +++ b/processing/src/main/java/io/druid/query/aggregation/cardinality/CardinalityAggregatorFactory.java @@ -33,6 +33,7 @@ import io.druid.query.aggregation.BufferAggregator; import io.druid.query.aggregation.hyperloglog.HyperLogLogCollector; import io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory; +import io.druid.query.dimension.DefaultDimensionSpec; import io.druid.segment.ColumnSelectorFactory; import io.druid.segment.DimensionSelector; import org.apache.commons.codec.binary.Base64; @@ -107,7 +108,7 @@ private List makeDimensionSelectors(final ColumnSelectorFacto @Override public DimensionSelector apply(@Nullable String input) { - return columnFactory.makeDimensionSelector(input, null); + return columnFactory.makeDimensionSelector(new DefaultDimensionSpec(input, input)); } } ), Predicates.notNull() diff --git a/processing/src/main/java/io/druid/query/dimension/DefaultDimensionSpec.java b/processing/src/main/java/io/druid/query/dimension/DefaultDimensionSpec.java index 4ffb0a9fa673..86ecfd38f961 100644 --- a/processing/src/main/java/io/druid/query/dimension/DefaultDimensionSpec.java +++ b/processing/src/main/java/io/druid/query/dimension/DefaultDimensionSpec.java @@ -23,6 +23,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.metamx.common.StringUtils; import io.druid.query.extraction.ExtractionFn; +import io.druid.segment.DimensionSelector; import java.nio.ByteBuffer; @@ -66,6 +67,12 @@ public ExtractionFn getExtractionFn() return null; } + @Override + public DimensionSelector decorate(DimensionSelector selector) + { + return selector; + } + @Override public byte[] getCacheKey() { diff --git a/processing/src/main/java/io/druid/query/dimension/DimensionSpec.java b/processing/src/main/java/io/druid/query/dimension/DimensionSpec.java index 58b0f24fb74b..0b36f791a038 100644 --- a/processing/src/main/java/io/druid/query/dimension/DimensionSpec.java +++ b/processing/src/main/java/io/druid/query/dimension/DimensionSpec.java @@ -22,6 +22,7 @@ import com.fasterxml.jackson.annotation.JsonSubTypes; import com.fasterxml.jackson.annotation.JsonTypeInfo; import io.druid.query.extraction.ExtractionFn; +import io.druid.segment.DimensionSelector; /** */ @@ -32,13 +33,17 @@ }) public interface DimensionSpec { - public String getDimension(); + String getDimension(); - public String getOutputName(); + String getOutputName(); - public ExtractionFn getExtractionFn(); + //ExtractionFn can be implemented with decorate(..) fn + @Deprecated + ExtractionFn getExtractionFn(); - public byte[] getCacheKey(); + DimensionSelector decorate(DimensionSelector selector); - public boolean preservesOrdering(); + byte[] getCacheKey(); + + boolean preservesOrdering(); } diff --git a/processing/src/main/java/io/druid/query/dimension/ExtractionDimensionSpec.java b/processing/src/main/java/io/druid/query/dimension/ExtractionDimensionSpec.java index b5fb93329350..ca30b89b0d42 100644 --- a/processing/src/main/java/io/druid/query/dimension/ExtractionDimensionSpec.java +++ b/processing/src/main/java/io/druid/query/dimension/ExtractionDimensionSpec.java @@ -24,6 +24,7 @@ import com.google.common.base.Preconditions; import com.metamx.common.StringUtils; import io.druid.query.extraction.ExtractionFn; +import io.druid.segment.DimensionSelector; import java.nio.ByteBuffer; @@ -77,6 +78,12 @@ public ExtractionFn getExtractionFn() return extractionFn; } + @Override + public DimensionSelector decorate(DimensionSelector selector) + { + return selector; + } + @Override public byte[] getCacheKey() { diff --git a/processing/src/main/java/io/druid/query/groupby/GroupByQueryEngine.java b/processing/src/main/java/io/druid/query/groupby/GroupByQueryEngine.java index a750a273fe3b..bdb8dae2d849 100644 --- a/processing/src/main/java/io/druid/query/groupby/GroupByQueryEngine.java +++ b/processing/src/main/java/io/druid/query/groupby/GroupByQueryEngine.java @@ -316,10 +316,7 @@ public RowIterator(GroupByQuery query, final Cursor cursor, ByteBuffer metricsBu for (int i = 0; i < dimensionSpecs.size(); ++i) { final DimensionSpec dimSpec = dimensionSpecs.get(i); - final DimensionSelector selector = cursor.makeDimensionSelector( - dimSpec.getDimension(), - dimSpec.getExtractionFn() - ); + final DimensionSelector selector = cursor.makeDimensionSelector(dimSpec); if (selector != null) { dimensions.add(selector); dimNames.add(dimSpec.getOutputName()); diff --git a/processing/src/main/java/io/druid/query/search/SearchQueryRunner.java b/processing/src/main/java/io/druid/query/search/SearchQueryRunner.java index 102f70c34a3f..2be205b83e2f 100644 --- a/processing/src/main/java/io/druid/query/search/SearchQueryRunner.java +++ b/processing/src/main/java/io/druid/query/search/SearchQueryRunner.java @@ -175,7 +175,7 @@ public TreeSet accumulate(TreeSet set, Cursor cursor) for (DimensionSpec dim : dimsToSearch) { dimSelectors.put( dim.getOutputName(), - cursor.makeDimensionSelector(dim.getDimension(), dim.getExtractionFn()) + cursor.makeDimensionSelector(dim) ); } diff --git a/processing/src/main/java/io/druid/query/select/SelectQueryEngine.java b/processing/src/main/java/io/druid/query/select/SelectQueryEngine.java index d905ef232c6a..87d9735ad4e6 100644 --- a/processing/src/main/java/io/druid/query/select/SelectQueryEngine.java +++ b/processing/src/main/java/io/druid/query/select/SelectQueryEngine.java @@ -26,6 +26,7 @@ import com.metamx.common.guava.Sequence; import io.druid.query.QueryRunnerHelper; import io.druid.query.Result; +import io.druid.query.dimension.DefaultDimensionSpec; import io.druid.segment.Cursor; import io.druid.segment.DimensionSelector; import io.druid.segment.LongColumnSelector; @@ -89,7 +90,7 @@ public Result apply(Cursor cursor) final Map dimSelectors = Maps.newHashMap(); for (String dim : dims) { // switching to using DimensionSpec for select would allow the use of extractionFn here. - final DimensionSelector dimSelector = cursor.makeDimensionSelector(dim, null); + final DimensionSelector dimSelector = cursor.makeDimensionSelector(new DefaultDimensionSpec(dim, dim)); dimSelectors.put(dim, dimSelector); } diff --git a/processing/src/main/java/io/druid/query/topn/TopNMapFn.java b/processing/src/main/java/io/druid/query/topn/TopNMapFn.java index cbc586e51098..d31e84fa7910 100644 --- a/processing/src/main/java/io/druid/query/topn/TopNMapFn.java +++ b/processing/src/main/java/io/druid/query/topn/TopNMapFn.java @@ -43,8 +43,7 @@ public TopNMapFn( public Result apply(Cursor cursor) { final DimensionSelector dimSelector = cursor.makeDimensionSelector( - query.getDimensionSpec().getDimension(), - query.getDimensionSpec().getExtractionFn() + query.getDimensionSpec() ); if (dimSelector == null) { return null; diff --git a/processing/src/main/java/io/druid/segment/ColumnSelectorFactory.java b/processing/src/main/java/io/druid/segment/ColumnSelectorFactory.java index cae2784b2c94..2aff7f831c19 100644 --- a/processing/src/main/java/io/druid/segment/ColumnSelectorFactory.java +++ b/processing/src/main/java/io/druid/segment/ColumnSelectorFactory.java @@ -19,16 +19,14 @@ package io.druid.segment; -import io.druid.query.extraction.ExtractionFn; - -import javax.annotation.Nullable; +import io.druid.query.dimension.DimensionSpec; /** * Factory class for MetricSelectors */ public interface ColumnSelectorFactory { - public DimensionSelector makeDimensionSelector(String dimensionName, @Nullable ExtractionFn extractionFn); + public DimensionSelector makeDimensionSelector(DimensionSpec dimensionSpec); public FloatColumnSelector makeFloatColumnSelector(String columnName); public LongColumnSelector makeLongColumnSelector(String columnName); public ObjectColumnSelector makeObjectColumnSelector(String columnName); diff --git a/processing/src/main/java/io/druid/segment/QueryableIndexStorageAdapter.java b/processing/src/main/java/io/druid/segment/QueryableIndexStorageAdapter.java index f067ceb578c9..e67cf05df405 100644 --- a/processing/src/main/java/io/druid/segment/QueryableIndexStorageAdapter.java +++ b/processing/src/main/java/io/druid/segment/QueryableIndexStorageAdapter.java @@ -30,6 +30,7 @@ import com.metamx.common.guava.Sequences; import io.druid.granularity.QueryGranularity; import io.druid.query.QueryInterruptedException; +import io.druid.query.dimension.DimensionSpec; import io.druid.query.extraction.ExtractionFn; import io.druid.query.filter.Filter; import io.druid.segment.column.Column; @@ -44,7 +45,6 @@ import org.joda.time.DateTime; import org.joda.time.Interval; -import javax.annotation.Nullable; import java.io.Closeable; import java.io.IOException; import java.util.Iterator; @@ -296,10 +296,19 @@ public void reset() @Override public DimensionSelector makeDimensionSelector( - final String dimension, - @Nullable final ExtractionFn extractionFn + DimensionSpec dimensionSpec ) { + return dimensionSpec.decorate(makeDimensionSelectorUndecorated(dimensionSpec)); + } + + private DimensionSelector makeDimensionSelectorUndecorated( + DimensionSpec dimensionSpec + ) + { + final String dimension = dimensionSpec.getDimension(); + final ExtractionFn extractionFn = dimensionSpec.getExtractionFn(); + final Column columnDesc = index.getColumn(dimension); if (columnDesc == null) { return NULL_DIMENSION_SELECTOR; diff --git a/processing/src/main/java/io/druid/segment/filter/ExtractionFilter.java b/processing/src/main/java/io/druid/segment/filter/ExtractionFilter.java index cfea3f3737cf..7be2e97da449 100644 --- a/processing/src/main/java/io/druid/segment/filter/ExtractionFilter.java +++ b/processing/src/main/java/io/druid/segment/filter/ExtractionFilter.java @@ -23,6 +23,7 @@ import com.google.common.base.Strings; import com.google.common.collect.Lists; import com.metamx.collections.bitmap.ImmutableBitmap; +import io.druid.query.dimension.DefaultDimensionSpec; import io.druid.query.extraction.ExtractionFn; import io.druid.query.filter.BitmapIndexSelector; import io.druid.query.filter.Filter; @@ -124,7 +125,9 @@ public boolean apply(String input) @Override public ValueMatcher makeMatcher(ColumnSelectorFactory columnSelectorFactory) { - final DimensionSelector dimensionSelector = columnSelectorFactory.makeDimensionSelector(dimension, null); + final DimensionSelector dimensionSelector = columnSelectorFactory.makeDimensionSelector( + new DefaultDimensionSpec(dimension, dimension) + ); if (dimensionSelector == null) { return new BooleanValueMatcher(value.equals(Strings.nullToEmpty(fn.apply(null)))); } else { diff --git a/processing/src/main/java/io/druid/segment/filter/SelectorFilter.java b/processing/src/main/java/io/druid/segment/filter/SelectorFilter.java index aacf9a95de49..861eb35e245f 100644 --- a/processing/src/main/java/io/druid/segment/filter/SelectorFilter.java +++ b/processing/src/main/java/io/druid/segment/filter/SelectorFilter.java @@ -21,6 +21,7 @@ import com.google.common.base.Strings; import com.metamx.collections.bitmap.ImmutableBitmap; +import io.druid.query.dimension.DefaultDimensionSpec; import io.druid.query.filter.BitmapIndexSelector; import io.druid.query.filter.Filter; import io.druid.query.filter.ValueMatcher; @@ -60,7 +61,9 @@ public ValueMatcher makeMatcher(ValueMatcherFactory factory) @Override public ValueMatcher makeMatcher(ColumnSelectorFactory columnSelectorFactory) { - final DimensionSelector dimensionSelector = columnSelectorFactory.makeDimensionSelector(dimension, null); + final DimensionSelector dimensionSelector = columnSelectorFactory.makeDimensionSelector( + new DefaultDimensionSpec(dimension, dimension) + ); // Missing columns match a null or empty string value and don't match anything else if (dimensionSelector == null) { diff --git a/processing/src/main/java/io/druid/segment/incremental/IncrementalIndex.java b/processing/src/main/java/io/druid/segment/incremental/IncrementalIndex.java index 643239242996..cf7ff364c8c5 100644 --- a/processing/src/main/java/io/druid/segment/incremental/IncrementalIndex.java +++ b/processing/src/main/java/io/druid/segment/incremental/IncrementalIndex.java @@ -38,6 +38,7 @@ import io.druid.granularity.QueryGranularity; import io.druid.query.aggregation.AggregatorFactory; import io.druid.query.aggregation.PostAggregator; +import io.druid.query.dimension.DimensionSpec; import io.druid.query.extraction.ExtractionFn; import io.druid.segment.ColumnSelectorFactory; import io.druid.segment.DimensionSelector; @@ -169,8 +170,20 @@ public Object get() } @Override - public DimensionSelector makeDimensionSelector(final String dimension, final ExtractionFn extractionFn) + public DimensionSelector makeDimensionSelector( + DimensionSpec dimensionSpec + ) { + return dimensionSpec.decorate(makeDimensionSelectorUndecorated(dimensionSpec)); + } + + private DimensionSelector makeDimensionSelectorUndecorated( + DimensionSpec dimensionSpec + ) + { + final String dimension = dimensionSpec.getDimension(); + final ExtractionFn extractionFn = dimensionSpec.getExtractionFn(); + return new DimensionSelector() { @Override diff --git a/processing/src/main/java/io/druid/segment/incremental/IncrementalIndexStorageAdapter.java b/processing/src/main/java/io/druid/segment/incremental/IncrementalIndexStorageAdapter.java index b5163ce522ce..9f3b058e2b50 100644 --- a/processing/src/main/java/io/druid/segment/incremental/IncrementalIndexStorageAdapter.java +++ b/processing/src/main/java/io/druid/segment/incremental/IncrementalIndexStorageAdapter.java @@ -30,6 +30,7 @@ import com.metamx.common.guava.Sequences; import io.druid.granularity.QueryGranularity; import io.druid.query.QueryInterruptedException; +import io.druid.query.dimension.DimensionSpec; import io.druid.query.extraction.ExtractionFn; import io.druid.query.filter.Filter; import io.druid.query.filter.ValueMatcher; @@ -294,10 +295,19 @@ public void reset() @Override public DimensionSelector makeDimensionSelector( - final String dimension, - @Nullable final ExtractionFn extractionFn + DimensionSpec dimensionSpec ) { + return dimensionSpec.decorate(makeDimensionSelectorUndecorated(dimensionSpec)); + } + + private DimensionSelector makeDimensionSelectorUndecorated( + DimensionSpec dimensionSpec + ) + { + final String dimension = dimensionSpec.getDimension(); + final ExtractionFn extractionFn = dimensionSpec.getExtractionFn(); + if (dimension.equals(Column.TIME_COLUMN_NAME)) { return new SingleScanTimeDimSelector(makeLongColumnSelector(dimension), extractionFn); } diff --git a/processing/src/test/java/io/druid/query/aggregation/FilteredAggregatorTest.java b/processing/src/test/java/io/druid/query/aggregation/FilteredAggregatorTest.java index 8f1cbb8e2402..7f41e2be25c4 100644 --- a/processing/src/test/java/io/druid/query/aggregation/FilteredAggregatorTest.java +++ b/processing/src/test/java/io/druid/query/aggregation/FilteredAggregatorTest.java @@ -20,6 +20,7 @@ package io.druid.query.aggregation; import com.google.common.collect.Lists; +import io.druid.query.dimension.DimensionSpec; import io.druid.query.extraction.ExtractionFn; import io.druid.query.filter.AndDimFilter; import io.druid.query.filter.DimFilter; @@ -73,53 +74,58 @@ private ColumnSelectorFactory makeColumnSelector(final TestFloatColumnSelector s return new ColumnSelectorFactory() { @Override - public DimensionSelector makeDimensionSelector(String dimensionName, ExtractionFn extractionFn) + public DimensionSelector makeDimensionSelector(DimensionSpec dimensionSpec) { + final String dimensionName = dimensionSpec.getDimension(); + final ExtractionFn extractionFn = dimensionSpec.getExtractionFn(); + if (dimensionName.equals("dim")) { - return new DimensionSelector() - { - @Override - public IndexedInts getRow() - { - if (selector.getIndex() % 3 == 2) { - return new ArrayBasedIndexedInts(new int[]{1}); - } else { - return new ArrayBasedIndexedInts(new int[]{0}); - } - } - - @Override - public int getValueCardinality() - { - return 2; - } - - @Override - public String lookupName(int id) - { - switch (id) { - case 0: - return "a"; - case 1: - return "b"; - default: - throw new IllegalArgumentException(); - } - } - - @Override - public int lookupId(String name) - { - switch (name) { - case "a": - return 0; - case "b": - return 1; - default: - throw new IllegalArgumentException(); + return dimensionSpec.decorate( + new DimensionSelector() + { + @Override + public IndexedInts getRow() + { + if (selector.getIndex() % 3 == 2) { + return new ArrayBasedIndexedInts(new int[]{1}); + } else { + return new ArrayBasedIndexedInts(new int[]{0}); + } + } + + @Override + public int getValueCardinality() + { + return 2; + } + + @Override + public String lookupName(int id) + { + switch (id) { + case 0: + return "a"; + case 1: + return "b"; + default: + throw new IllegalArgumentException(); + } + } + + @Override + public int lookupId(String name) + { + switch (name) { + case "a": + return 0; + case "b": + return 1; + default: + throw new IllegalArgumentException(); + } + } } - } - }; + ); } else { throw new UnsupportedOperationException(); } diff --git a/processing/src/test/java/io/druid/segment/incremental/IncrementalIndexStorageAdapterTest.java b/processing/src/test/java/io/druid/segment/incremental/IncrementalIndexStorageAdapterTest.java index 6cea28faa4fb..6816f40ed99f 100644 --- a/processing/src/test/java/io/druid/segment/incremental/IncrementalIndexStorageAdapterTest.java +++ b/processing/src/test/java/io/druid/segment/incremental/IncrementalIndexStorageAdapterTest.java @@ -36,6 +36,7 @@ import io.druid.query.aggregation.CountAggregatorFactory; import io.druid.query.aggregation.JavaScriptAggregatorFactory; import io.druid.query.aggregation.LongSumAggregatorFactory; +import io.druid.query.dimension.DefaultDimensionSpec; import io.druid.query.filter.DimFilters; import io.druid.query.groupby.GroupByQuery; import io.druid.query.groupby.GroupByQueryConfig; @@ -260,7 +261,7 @@ public void testResetSanity() throws IOException Cursor cursor = Sequences.toList(Sequences.limit(cursorSequence, 1), Lists.newArrayList()).get(0); DimensionSelector dimSelector; - dimSelector = cursor.makeDimensionSelector("sally", null); + dimSelector = cursor.makeDimensionSelector(new DefaultDimensionSpec("sally", "sally")); Assert.assertEquals("bo", dimSelector.lookupName(dimSelector.getRow().get(0))); index.add( @@ -274,7 +275,7 @@ public void testResetSanity() throws IOException // Cursor reset should not be affected by out of order values cursor.reset(); - dimSelector = cursor.makeDimensionSelector("sally", null); + dimSelector = cursor.makeDimensionSelector(new DefaultDimensionSpec("sally", "sally")); Assert.assertEquals("bo", dimSelector.lookupName(dimSelector.getRow().get(0))); } diff --git a/server/src/main/java/io/druid/segment/realtime/firehose/IngestSegmentFirehose.java b/server/src/main/java/io/druid/segment/realtime/firehose/IngestSegmentFirehose.java index ad1b6f705bf6..3b885b0b8cb9 100644 --- a/server/src/main/java/io/druid/segment/realtime/firehose/IngestSegmentFirehose.java +++ b/server/src/main/java/io/druid/segment/realtime/firehose/IngestSegmentFirehose.java @@ -31,6 +31,7 @@ import io.druid.data.input.InputRow; import io.druid.data.input.MapBasedInputRow; import io.druid.granularity.QueryGranularity; +import io.druid.query.dimension.DefaultDimensionSpec; import io.druid.query.filter.DimFilter; import io.druid.query.select.EventHolder; import io.druid.segment.Cursor; @@ -85,7 +86,9 @@ public Sequence apply(final Cursor cursor) final Map dimSelectors = Maps.newHashMap(); for (String dim : dims) { - final DimensionSelector dimSelector = cursor.makeDimensionSelector(dim, null); + final DimensionSelector dimSelector = cursor.makeDimensionSelector( + new DefaultDimensionSpec(dim, dim) + ); // dimSelector is null if the dimension is not present if (dimSelector != null) { dimSelectors.put(dim, dimSelector); From b47d807738429dd0e54c3f1b5654cfcbba3dab2f Mon Sep 17 00:00:00 2001 From: Himanshu Gupta Date: Sat, 19 Dec 2015 03:01:20 -0600 Subject: [PATCH 2/3] Add support for filtering at DimensionSpec level so that multivalued dimensions can be filtered correctly also adding UTs for multi-valued dimensions --- docs/content/querying/dimensionspecs.md | 20 ++ .../dimension/BaseFilteredDimensionSpec.java | 68 +++++ .../druid/query/dimension/DimensionSpec.java | 4 +- .../dimension/ListFilteredDimensionSpec.java | 191 +++++++++++++ .../dimension/RegexFilteredDimensionSpec.java | 162 +++++++++++ .../query/filter/DimFilterCacheHelper.java | 4 +- .../segment/data/ListBasedIndexedInts.java | 63 +++++ .../druid/query/MultiValuedDimensionTest.java | 260 ++++++++++++++++++ .../aggregation/AggregationTestHelper.java | 82 +++--- .../ListFilteredDimensionSpecTest.java | 113 ++++++++ .../RegexFilteredDimensionSpecTest.java | 76 +++++ 11 files changed, 1001 insertions(+), 42 deletions(-) create mode 100644 processing/src/main/java/io/druid/query/dimension/BaseFilteredDimensionSpec.java create mode 100644 processing/src/main/java/io/druid/query/dimension/ListFilteredDimensionSpec.java create mode 100644 processing/src/main/java/io/druid/query/dimension/RegexFilteredDimensionSpec.java create mode 100644 processing/src/main/java/io/druid/segment/data/ListBasedIndexedInts.java create mode 100644 processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java create mode 100644 processing/src/test/java/io/druid/query/dimension/ListFilteredDimensionSpecTest.java create mode 100644 processing/src/test/java/io/druid/query/dimension/RegexFilteredDimensionSpecTest.java diff --git a/docs/content/querying/dimensionspecs.md b/docs/content/querying/dimensionspecs.md index d4e5a3350913..9f19c2698fc3 100644 --- a/docs/content/querying/dimensionspecs.md +++ b/docs/content/querying/dimensionspecs.md @@ -252,3 +252,23 @@ A null dimension value can be mapped to a specific value by specifying the empty This allows distinguishing between a null dimension and a lookup resulting in a null. For example, specifying `{"":"bar","bat":"baz"}` with dimension values `[null, "foo", "bat"]` and replacing missing values with `"oof"` will yield results of `["bar", "oof", "baz"]`. Omitting the empty string key will cause the missing value to take over. For example, specifying `{"bat":"baz"}` with dimension values `[null, "foo", "bat"]` and replacing missing values with `"oof"` will yield results of `["oof", "oof", "baz"]`. + +### Filtering DimensionSpecs +These are only valid for multi-valued dimensions. If you have a row in druid that has a multi-valued dimension with values ["v1", "v2", "v3"] and you send a groupBy/topN query grouping by that dimension with [query filter](filter.html) for value "v1". In the response you will get 3 rows containing "v1", "v2" and "v3". This behavior might be unintuitive for some use cases. + +It happens because `query filter` is internally used on the bitmaps and only used to match the row to be included in the query result processing. With multivalued dimensions, "query filter" behaves like a contains check, which will match the row with dimension value ["v1", "v2", "v3"]. Please see the section on "Multi-value columns" in [segment](../design/segments.html) for more details. +Then groupBy/topN processing pipeline "explodes" all multi-valued dimensions resulting 3 rows for "v1", "v2" and "v3" each. + +In addition to "query filter" which efficiently selects the rows to be processed, you can use the filtering dimension spec to filter for specific values within the values of a multi-valued dimension. These dimensionSpecs take a delegate DimensionSpec and a filtering criteria. From the "exploded" rows, only rows matching the given filtering criteria are returned in the query result. + +The following filtered dimension spec acts as a whiltelist or blacklist for values as per the "isWhitelist" attribute value. +```json +{ "type" : "listFiltered", "delegate" : , "values": , "isWhitelist": } +``` + +Following filtered dimension spec retains only the values matching regex. Note that `listFiltered` is faster than this and one should use that for whitelist or blacklist usecase. +```json +{ "type" : "regexFiltered", "delegate" : , "pattern": } +``` + +For more details and examples, see [multi-valued dimensions](multi-valued-dimensions.html). diff --git a/processing/src/main/java/io/druid/query/dimension/BaseFilteredDimensionSpec.java b/processing/src/main/java/io/druid/query/dimension/BaseFilteredDimensionSpec.java new file mode 100644 index 000000000000..50ff76e88bb7 --- /dev/null +++ b/processing/src/main/java/io/druid/query/dimension/BaseFilteredDimensionSpec.java @@ -0,0 +1,68 @@ +/* + * Licensed to Metamarkets Group Inc. (Metamarkets) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Metamarkets licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.druid.query.dimension; + +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.Preconditions; +import io.druid.query.extraction.ExtractionFn; + +/** + */ +public abstract class BaseFilteredDimensionSpec implements DimensionSpec +{ + protected final DimensionSpec delegate; + + public BaseFilteredDimensionSpec( + @JsonProperty("delegate") DimensionSpec delegate + ) + { + this.delegate = Preconditions.checkNotNull(delegate, "delegate must not be null"); + } + + @JsonProperty + public DimensionSpec getDelegate() + { + return delegate; + } + + @Override + public String getDimension() + { + return delegate.getDimension(); + } + + @Override + public String getOutputName() + { + return delegate.getOutputName(); + } + + @Override + public ExtractionFn getExtractionFn() + { + return delegate.getExtractionFn(); + } + + @Override + public boolean preservesOrdering() + { + return delegate.preservesOrdering(); + } +} diff --git a/processing/src/main/java/io/druid/query/dimension/DimensionSpec.java b/processing/src/main/java/io/druid/query/dimension/DimensionSpec.java index 0b36f791a038..807da4d2c9b3 100644 --- a/processing/src/main/java/io/druid/query/dimension/DimensionSpec.java +++ b/processing/src/main/java/io/druid/query/dimension/DimensionSpec.java @@ -29,7 +29,9 @@ @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type", defaultImpl = LegacyDimensionSpec.class) @JsonSubTypes(value = { @JsonSubTypes.Type(name = "default", value = DefaultDimensionSpec.class), - @JsonSubTypes.Type(name = "extraction", value = ExtractionDimensionSpec.class) + @JsonSubTypes.Type(name = "extraction", value = ExtractionDimensionSpec.class), + @JsonSubTypes.Type(name = "regexFiltered", value = RegexFilteredDimensionSpec.class), + @JsonSubTypes.Type(name = "listFiltered", value = ListFilteredDimensionSpec.class) }) public interface DimensionSpec { diff --git a/processing/src/main/java/io/druid/query/dimension/ListFilteredDimensionSpec.java b/processing/src/main/java/io/druid/query/dimension/ListFilteredDimensionSpec.java new file mode 100644 index 000000000000..e17e3f70f356 --- /dev/null +++ b/processing/src/main/java/io/druid/query/dimension/ListFilteredDimensionSpec.java @@ -0,0 +1,191 @@ +/* + * Licensed to Metamarkets Group Inc. (Metamarkets) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Metamarkets licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.druid.query.dimension; + +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.Preconditions; +import com.metamx.common.StringUtils; +import io.druid.query.filter.DimFilterCacheHelper; +import io.druid.segment.DimensionSelector; +import io.druid.segment.data.IndexedInts; +import io.druid.segment.data.ListBasedIndexedInts; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + */ +public class ListFilteredDimensionSpec extends BaseFilteredDimensionSpec +{ + + private static final byte CACHE_TYPE_ID = 0x3; + + private final List values; + private final boolean isWhitelist; + + public ListFilteredDimensionSpec( + @JsonProperty("delegate") DimensionSpec delegate, + @JsonProperty("values") List values, + @JsonProperty("isWhitelist") Boolean isWhitelist + ) + { + super(delegate); + + Preconditions.checkArgument(values != null && values.size() > 0, "values list must be non-empty"); + this.values = values; + + this.isWhitelist = isWhitelist == null ? true : isWhitelist.booleanValue(); + } + + @JsonProperty + public List getValues() + { + return values; + } + + @JsonProperty("isWhitelist") + public boolean isWhitelist() + { + return isWhitelist; + } + + @Override + public DimensionSelector decorate(final DimensionSelector selector) + { + if (selector == null) { + return selector; + } + + final Set matched = new HashSet<>(values.size()); + for (String value : values) { + int i = selector.lookupId(value); + if (i >= 0) { + matched.add(i); + } + }; + + return new DimensionSelector() + { + @Override + public IndexedInts getRow() + { + IndexedInts baseRow = selector.getRow(); + List result = new ArrayList<>(baseRow.size()); + + for (int i : baseRow) { + if (matched.contains(i)) { + if (isWhitelist) { + result.add(i); + } + } else { + if (!isWhitelist) { + result.add(i); + } + } + } + + return new ListBasedIndexedInts(result); + } + + @Override + public int getValueCardinality() + { + return matched.size(); + } + + @Override + public String lookupName(int id) + { + return selector.lookupName(id); + } + + @Override + public int lookupId(String name) + { + return selector.lookupId(name); + } + }; + } + + @Override + public byte[] getCacheKey() + { + byte[] delegateCacheKey = delegate.getCacheKey(); + + byte[][] valuesBytes = new byte[values.size()][]; + int valuesBytesSize = 0; + int index = 0; + for (String value : values) { + valuesBytes[index] = StringUtils.toUtf8(value); + valuesBytesSize += valuesBytes[index].length + 1; + ++index; + } + + ByteBuffer filterCacheKey = ByteBuffer.allocate(3 + delegateCacheKey.length + valuesBytesSize) + .put(CACHE_TYPE_ID) + .put(delegateCacheKey) + .put((byte) (isWhitelist ? 1 : 0)) + .put(DimFilterCacheHelper.STRING_SEPARATOR); + for (byte[] bytes : valuesBytes) { + filterCacheKey.put(bytes) + .put(DimFilterCacheHelper.STRING_SEPARATOR); + } + return filterCacheKey.array(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + ListFilteredDimensionSpec that = (ListFilteredDimensionSpec) o; + + if (isWhitelist != that.isWhitelist) { + return false; + } + return values.equals(that.values); + + } + + @Override + public int hashCode() + { + int result = values.hashCode(); + result = 31 * result + (isWhitelist ? 1 : 0); + return result; + } + + @Override + public String toString() + { + return "ListFilteredDimensionSpec{" + + "values=" + values + + ", isWhitelist=" + isWhitelist + + '}'; + } +} diff --git a/processing/src/main/java/io/druid/query/dimension/RegexFilteredDimensionSpec.java b/processing/src/main/java/io/druid/query/dimension/RegexFilteredDimensionSpec.java new file mode 100644 index 000000000000..07cda27c04ef --- /dev/null +++ b/processing/src/main/java/io/druid/query/dimension/RegexFilteredDimensionSpec.java @@ -0,0 +1,162 @@ +/* + * Licensed to Metamarkets Group Inc. (Metamarkets) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Metamarkets licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.druid.query.dimension; + +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.Preconditions; +import com.google.common.base.Strings; +import com.metamx.common.StringUtils; +import io.druid.query.filter.DimFilterCacheHelper; +import io.druid.segment.DimensionSelector; +import io.druid.segment.data.IndexedInts; +import io.druid.segment.data.ListBasedIndexedInts; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.List; +import java.util.regex.Pattern; + +/** + */ +public class RegexFilteredDimensionSpec extends BaseFilteredDimensionSpec +{ + + private static final byte CACHE_TYPE_ID = 0x2; + + private final String pattern; + + private final Pattern compiledRegex; + + public RegexFilteredDimensionSpec( + @JsonProperty("delegate") DimensionSpec delegate, + @JsonProperty("pattern") String pattern //rows not matching the pattern will be discarded + ) + { + super(delegate); + this.pattern = Preconditions.checkNotNull(pattern, "pattern must not be null"); + this.compiledRegex = Pattern.compile(pattern); + } + + @JsonProperty + public String getPattern() + { + return pattern; + } + + @Override + public DimensionSelector decorate(final DimensionSelector selector) + { + if (selector == null) { + return selector; + } + + final BitSet bitSetOfIds = new BitSet(selector.getValueCardinality()); + for (int i = 0; i < selector.getValueCardinality(); i++) { + if (compiledRegex.matcher(Strings.nullToEmpty(selector.lookupName(i))).matches()) { + bitSetOfIds.set(i); + } + } + + return new DimensionSelector() + { + @Override + public IndexedInts getRow() + { + IndexedInts baseRow = selector.getRow(); + List result = new ArrayList<>(baseRow.size()); + + for (int i : baseRow) { + if (bitSetOfIds.get(i)) { + result.add(i); + } + } + + return new ListBasedIndexedInts(result); + } + + @Override + public int getValueCardinality() + { + return bitSetOfIds.cardinality(); + } + + @Override + public String lookupName(int id) + { + return selector.lookupName(id); + } + + @Override + public int lookupId(String name) + { + return selector.lookupId(name); + } + }; + } + + @Override + public byte[] getCacheKey() + { + byte[] delegateCacheKey = delegate.getCacheKey(); + byte[] regexBytes = StringUtils.toUtf8(pattern); + return ByteBuffer.allocate(2 + delegateCacheKey.length + regexBytes.length) + .put(CACHE_TYPE_ID) + .put(delegateCacheKey) + .put(DimFilterCacheHelper.STRING_SEPARATOR) + .put(regexBytes) + .array(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + RegexFilteredDimensionSpec that = (RegexFilteredDimensionSpec) o; + + if (!delegate.equals(that.delegate)) { + return false; + } + return pattern.equals(that.pattern); + + } + + @Override + public int hashCode() + { + int result = delegate.hashCode(); + result = 31 * result + pattern.hashCode(); + return result; + } + + @Override + public String toString() + { + return "RegexFilteredDimensionSpec{" + + "pattern='" + pattern + '\'' + + '}'; + } +} diff --git a/processing/src/main/java/io/druid/query/filter/DimFilterCacheHelper.java b/processing/src/main/java/io/druid/query/filter/DimFilterCacheHelper.java index d6e970d54427..706e5ce5471a 100644 --- a/processing/src/main/java/io/druid/query/filter/DimFilterCacheHelper.java +++ b/processing/src/main/java/io/druid/query/filter/DimFilterCacheHelper.java @@ -24,7 +24,7 @@ /** */ -class DimFilterCacheHelper +public class DimFilterCacheHelper { static final byte NOOP_CACHE_ID = -0x4; static final byte SELECTOR_CACHE_ID = 0x0; @@ -37,7 +37,7 @@ class DimFilterCacheHelper static final byte JAVASCRIPT_CACHE_ID = 0x7; static final byte SPATIAL_CACHE_ID = 0x8; static final byte IN_CACHE_ID = 0x9; - static final byte STRING_SEPARATOR = (byte) 0xFF; + public static final byte STRING_SEPARATOR = (byte) 0xFF; public static byte BOUND_CACHE_ID = 0xA; static byte[] computeCacheKey(byte cacheIdKey, List filters) diff --git a/processing/src/main/java/io/druid/segment/data/ListBasedIndexedInts.java b/processing/src/main/java/io/druid/segment/data/ListBasedIndexedInts.java new file mode 100644 index 000000000000..df44e5aa3754 --- /dev/null +++ b/processing/src/main/java/io/druid/segment/data/ListBasedIndexedInts.java @@ -0,0 +1,63 @@ +/* + * Licensed to Metamarkets Group Inc. (Metamarkets) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Metamarkets licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.druid.segment.data; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; + +/** + */ +public class ListBasedIndexedInts implements IndexedInts +{ + private final List expansion; + + public ListBasedIndexedInts(List expansion) {this.expansion = expansion;} + + @Override + public int size() + { + return expansion.size(); + } + + @Override + public int get(int index) + { + return expansion.get(index); + } + + @Override + public Iterator iterator() + { + return new IndexedIntsIterator(this); + } + + @Override + public void fill(int index, int[] toFill) + { + throw new UnsupportedOperationException("fill not supported"); + } + + @Override + public void close() throws IOException + { + + } +} diff --git a/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java b/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java new file mode 100644 index 000000000000..0fefbce30a61 --- /dev/null +++ b/processing/src/test/java/io/druid/query/MultiValuedDimensionTest.java @@ -0,0 +1,260 @@ +/* + * Licensed to Metamarkets Group Inc. (Metamarkets) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Metamarkets licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.druid.query; + +import com.fasterxml.jackson.databind.Module; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import com.google.common.io.Files; +import com.metamx.common.guava.Sequence; +import com.metamx.common.guava.Sequences; +import io.druid.data.input.Row; +import io.druid.data.input.impl.CSVParseSpec; +import io.druid.data.input.impl.DimensionsSpec; +import io.druid.data.input.impl.StringInputRowParser; +import io.druid.data.input.impl.TimestampSpec; +import io.druid.granularity.QueryGranularity; +import io.druid.query.aggregation.AggregationTestHelper; +import io.druid.query.aggregation.AggregatorFactory; +import io.druid.query.aggregation.CountAggregatorFactory; +import io.druid.query.dimension.DefaultDimensionSpec; +import io.druid.query.dimension.DimensionSpec; +import io.druid.query.dimension.RegexFilteredDimensionSpec; +import io.druid.query.filter.SelectorDimFilter; +import io.druid.query.groupby.GroupByQuery; +import io.druid.query.groupby.GroupByQueryRunnerTestHelper; +import io.druid.query.spec.LegacySegmentSpec; +import io.druid.segment.IncrementalIndexSegment; +import io.druid.segment.IndexSpec; +import io.druid.segment.QueryableIndex; +import io.druid.segment.QueryableIndexSegment; +import io.druid.segment.TestHelper; +import io.druid.segment.incremental.IncrementalIndex; +import io.druid.segment.incremental.OnheapIncrementalIndex; +import org.apache.commons.io.FileUtils; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + */ +public class MultiValuedDimensionTest +{ + private AggregationTestHelper helper; + + private static IncrementalIndex incrementalIndex; + private static QueryableIndex queryableIndex; + + private static File persistedSegmentDir; + + public MultiValuedDimensionTest() throws Exception + { + helper = new AggregationTestHelper( + ImmutableList.of(), null + ); + } + + @BeforeClass + public static void setupClass() throws Exception + { + incrementalIndex = new OnheapIncrementalIndex( + 0, + QueryGranularity.NONE, + new AggregatorFactory[]{ + new CountAggregatorFactory("count") + }, + true, + 5000 + ); + + StringInputRowParser parser = new StringInputRowParser( + new CSVParseSpec( + new TimestampSpec("timestamp", "iso", null), + new DimensionsSpec(ImmutableList.of("product", "tags"), null, null), + "\t", + ImmutableList.of("timestamp", "product", "tags") + ), + "UTF-8" + ); + + String[] rows = new String[]{ + "2011-01-12T00:00:00.000Z,product_1,t1\tt2\tt3", + "2011-01-13T00:00:00.000Z,product_2,t3\tt4\tt5", + "2011-01-14T00:00:00.000Z,product_3,t5\tt6\tt7", + }; + + for (String row : rows) { + incrementalIndex.add(parser.parse(row)); + } + + persistedSegmentDir = Files.createTempDir(); + TestHelper.getTestIndexMerger() + .persist(incrementalIndex, persistedSegmentDir, ImmutableMap.of(), new IndexSpec()); + + queryableIndex = TestHelper.getTestIndexIO().loadIndex(persistedSegmentDir); + } + + @Test + public void testGroupByNoFilter() throws Exception + { + GroupByQuery query = GroupByQuery + .builder() + .setDataSource("xx") + .setQuerySegmentSpec(new LegacySegmentSpec("1970/3000")) + .setGranularity(QueryGranularity.ALL) + .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("tags", "tags"))) + .setAggregatorSpecs( + Arrays.asList( + new AggregatorFactory[] + { + new CountAggregatorFactory("count") + } + ) + ) + .build(); + + Sequence result = helper.runQueryOnSegmentsObjs( + ImmutableList.of( + new QueryableIndexSegment("sid1", queryableIndex), + new IncrementalIndexSegment(incrementalIndex, "sid2") + ), + query + ); + + List expectedResults = Arrays.asList( + GroupByQueryRunnerTestHelper.createExpectedRow("1970-01-01T00:00:00.000Z", "tags", "t1", "count", 2L), + GroupByQueryRunnerTestHelper.createExpectedRow("1970-01-01T00:00:00.000Z", "tags", "t2", "count", 2L), + GroupByQueryRunnerTestHelper.createExpectedRow("1970-01-01T00:00:00.000Z", "tags", "t3", "count", 4L), + GroupByQueryRunnerTestHelper.createExpectedRow("1970-01-01T00:00:00.000Z", "tags", "t4", "count", 2L), + GroupByQueryRunnerTestHelper.createExpectedRow("1970-01-01T00:00:00.000Z", "tags", "t5", "count", 4L), + GroupByQueryRunnerTestHelper.createExpectedRow("1970-01-01T00:00:00.000Z", "tags", "t6", "count", 2L), + GroupByQueryRunnerTestHelper.createExpectedRow("1970-01-01T00:00:00.000Z", "tags", "t7", "count", 2L) + ); + + TestHelper.assertExpectedObjects(expectedResults, Sequences.toList(result, new ArrayList()), ""); + + result = helper.runQueryOnSegmentsObjs( + ImmutableList.of( + new QueryableIndexSegment("sid1", queryableIndex), + new IncrementalIndexSegment(incrementalIndex, "sid2") + ), + query + ); + } + + @Test + public void testGroupByWithDimFilter() throws Exception + { + GroupByQuery query = GroupByQuery + .builder() + .setDataSource("xx") + .setQuerySegmentSpec(new LegacySegmentSpec("1970/3000")) + .setGranularity(QueryGranularity.ALL) + .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("tags", "tags"))) + .setAggregatorSpecs( + Arrays.asList( + new AggregatorFactory[] + { + new CountAggregatorFactory("count") + } + ) + ) + .setDimFilter( + new SelectorDimFilter("tags", "t3") + ) + .build(); + + Sequence result = helper.runQueryOnSegmentsObjs( + ImmutableList.of( + new QueryableIndexSegment("sid1", queryableIndex), + new IncrementalIndexSegment(incrementalIndex, "sid2") + ), + query + ); + + List expectedResults = Arrays.asList( + GroupByQueryRunnerTestHelper.createExpectedRow("1970-01-01T00:00:00.000Z", "tags", "t1", "count", 2L), + GroupByQueryRunnerTestHelper.createExpectedRow("1970-01-01T00:00:00.000Z", "tags", "t2", "count", 2L), + GroupByQueryRunnerTestHelper.createExpectedRow("1970-01-01T00:00:00.000Z", "tags", "t3", "count", 4L), + GroupByQueryRunnerTestHelper.createExpectedRow("1970-01-01T00:00:00.000Z", "tags", "t4", "count", 2L), + GroupByQueryRunnerTestHelper.createExpectedRow("1970-01-01T00:00:00.000Z", "tags", "t5", "count", 2L) + ); + + TestHelper.assertExpectedObjects(expectedResults, Sequences.toList(result, new ArrayList()), ""); + } + + @Test + public void testGroupByWithDimFilterAndWithFilteredDimSpec() throws Exception + { + GroupByQuery query = GroupByQuery + .builder() + .setDataSource("xx") + .setQuerySegmentSpec(new LegacySegmentSpec("1970/3000")) + .setGranularity(QueryGranularity.ALL) + .setDimensions( + Lists.newArrayList( + new RegexFilteredDimensionSpec( + new DefaultDimensionSpec("tags", "tags"), + "t3" + ) + ) + ) + .setAggregatorSpecs( + Arrays.asList( + new AggregatorFactory[] + { + new CountAggregatorFactory("count") + } + ) + ) + .setDimFilter( + new SelectorDimFilter("tags", "t3") + ) + .build(); + + Sequence result = helper.runQueryOnSegmentsObjs( + ImmutableList.of( + new QueryableIndexSegment("sid1", queryableIndex), + new IncrementalIndexSegment(incrementalIndex, "sid2") + ), + query + ); + + List expectedResults = Arrays.asList( + GroupByQueryRunnerTestHelper.createExpectedRow("1970-01-01T00:00:00.000Z", "tags", "t3", "count", 4L) + ); + + TestHelper.assertExpectedObjects(expectedResults, Sequences.toList(result, new ArrayList()), ""); + } + + @AfterClass + public static void cleanup() throws Exception + { + queryableIndex.close(); + incrementalIndex.close(); + FileUtils.deleteDirectory(persistedSegmentDir); + } +} diff --git a/processing/src/test/java/io/druid/query/aggregation/AggregationTestHelper.java b/processing/src/test/java/io/druid/query/aggregation/AggregationTestHelper.java index f0cc8a320fa5..e3adad4f8df0 100644 --- a/processing/src/test/java/io/druid/query/aggregation/AggregationTestHelper.java +++ b/processing/src/test/java/io/druid/query/aggregation/AggregationTestHelper.java @@ -296,12 +296,12 @@ public Sequence runQueryOnSegments(final List segmentDirs, final Stri public Sequence runQueryOnSegments(final List segmentDirs, final GroupByQuery query) { - final List segments = Lists.transform( + final List segments = Lists.transform( segmentDirs, - new Function() + new Function() { @Override - public QueryableIndexSegment apply(File segmentDir) + public Segment apply(File segmentDir) { try { return new QueryableIndexSegment("", indexIO.loadIndex(segmentDir)); @@ -314,48 +314,52 @@ public QueryableIndexSegment apply(File segmentDir) ); try { - final FinalizeResultsQueryRunner baseRunner = new FinalizeResultsQueryRunner( - toolChest.postMergeQueryDecoration( - toolChest.mergeResults( - toolChest.preMergeQueryDecoration( - new ConcatQueryRunner( - Sequences.simple( - Lists.transform( - segments, - new Function() - { - @Override - public QueryRunner apply(final Segment segment) - { - try { - return makeStringSerdeQueryRunner( - mapper, - toolChest, - query, - factory.createRunner(segment) - ); - } - catch (Exception ex) { - throw Throwables.propagate(ex); - } - } - } - ) - ) - ) - ) - ) - ), - toolChest - ); - - return baseRunner.run(query, Maps.newHashMap()); + return runQueryOnSegmentsObjs(segments, query); } finally { for(Segment segment: segments) { CloseQuietly.close(segment); } } + } + + public Sequence runQueryOnSegmentsObjs(final List segments, final GroupByQuery query) + { + final FinalizeResultsQueryRunner baseRunner = new FinalizeResultsQueryRunner( + toolChest.postMergeQueryDecoration( + toolChest.mergeResults( + toolChest.preMergeQueryDecoration( + new ConcatQueryRunner( + Sequences.simple( + Lists.transform( + segments, + new Function() + { + @Override + public QueryRunner apply(final Segment segment) + { + try { + return makeStringSerdeQueryRunner( + mapper, + toolChest, + query, + factory.createRunner(segment) + ); + } + catch (Exception ex) { + throw Throwables.propagate(ex); + } + } + } + ) + ) + ) + ) + ) + ), + toolChest + ); + return baseRunner.run(query, Maps.newHashMap()); } public QueryRunner makeStringSerdeQueryRunner(final ObjectMapper mapper, final QueryToolChest toolChest, final Query query, final QueryRunner baseRunner) diff --git a/processing/src/test/java/io/druid/query/dimension/ListFilteredDimensionSpecTest.java b/processing/src/test/java/io/druid/query/dimension/ListFilteredDimensionSpecTest.java new file mode 100644 index 000000000000..cb5d61a19df7 --- /dev/null +++ b/processing/src/test/java/io/druid/query/dimension/ListFilteredDimensionSpecTest.java @@ -0,0 +1,113 @@ +/* + * Licensed to Metamarkets Group Inc. (Metamarkets) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Metamarkets licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.druid.query.dimension; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.ImmutableList; +import io.druid.segment.TestHelper; +import org.junit.Assert; +import org.junit.Test; + +import java.util.Arrays; + +/** + */ +public class ListFilteredDimensionSpecTest +{ + + @Test + public void testSerde() throws Exception + { + ObjectMapper mapper = TestHelper.getObjectMapper(); + + //isWhitelist = true + String jsonStr = "{\n" + + " \"type\": \"listFiltered\",\n" + + " \"delegate\": {\n" + + " \"type\": \"default\",\n" + + " \"dimension\": \"foo\",\n" + + " \"outputName\": \"bar\"\n" + + " },\n" + + " \"values\": [\"xxx\"]\n" + + "}"; + + ListFilteredDimensionSpec actual = (ListFilteredDimensionSpec) mapper.readValue( + mapper.writeValueAsString(mapper.readValue(jsonStr, DimensionSpec.class)), + DimensionSpec.class); + + ListFilteredDimensionSpec expected = new ListFilteredDimensionSpec( + new DefaultDimensionSpec("foo", "bar"), + ImmutableList.of("xxx"), + true + ); + + Assert.assertEquals(expected, actual); + + //isWhitelist = false + jsonStr = "{\n" + + " \"type\": \"listFiltered\",\n" + + " \"delegate\": {\n" + + " \"type\": \"default\",\n" + + " \"dimension\": \"foo\",\n" + + " \"outputName\": \"bar\"\n" + + " },\n" + + " \"values\": [\"xxx\"],\n" + + " \"isWhitelist\": false\n" + + "}"; + + actual = (ListFilteredDimensionSpec) mapper.readValue( + mapper.writeValueAsString(mapper.readValue(jsonStr, DimensionSpec.class)), + DimensionSpec.class); + + expected = new ListFilteredDimensionSpec( + new DefaultDimensionSpec("foo", "bar"), + ImmutableList.of("xxx"), + false + ); + + Assert.assertEquals(expected, actual); + } + + @Test + public void testGetCacheKey() + { + ListFilteredDimensionSpec spec1 = new ListFilteredDimensionSpec( + new DefaultDimensionSpec("foo", "bar"), + ImmutableList.of("xxx"), + null + ); + + ListFilteredDimensionSpec spec2 = new ListFilteredDimensionSpec( + new DefaultDimensionSpec("foo", "bar"), + ImmutableList.of("xyz"), + null + ); + + Assert.assertFalse(Arrays.equals(spec1.getCacheKey(), spec2.getCacheKey())); + + ListFilteredDimensionSpec spec3 = new ListFilteredDimensionSpec( + new DefaultDimensionSpec("foo", "bar"), + ImmutableList.of("xxx"), + false + ); + + Assert.assertFalse(Arrays.equals(spec1.getCacheKey(), spec3.getCacheKey())); + } +} diff --git a/processing/src/test/java/io/druid/query/dimension/RegexFilteredDimensionSpecTest.java b/processing/src/test/java/io/druid/query/dimension/RegexFilteredDimensionSpecTest.java new file mode 100644 index 000000000000..42815d698c70 --- /dev/null +++ b/processing/src/test/java/io/druid/query/dimension/RegexFilteredDimensionSpecTest.java @@ -0,0 +1,76 @@ +/* + * Licensed to Metamarkets Group Inc. (Metamarkets) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Metamarkets licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.druid.query.dimension; + +import com.fasterxml.jackson.databind.ObjectMapper; +import io.druid.segment.TestHelper; +import org.junit.Assert; +import org.junit.Test; + +import java.util.Arrays; + +/** + */ +public class RegexFilteredDimensionSpecTest +{ + + @Test + public void testSerde() throws Exception + { + ObjectMapper mapper = TestHelper.getObjectMapper(); + + String jsonStr = "{\n" + + " \"type\": \"regexFiltered\",\n" + + " \"delegate\": {\n" + + " \"type\": \"default\",\n" + + " \"dimension\": \"foo\",\n" + + " \"outputName\": \"bar\"\n" + + " },\n" + + " \"pattern\": \"xxx\"\n" + + "}"; + + RegexFilteredDimensionSpec actual = (RegexFilteredDimensionSpec) mapper.readValue( + mapper.writeValueAsString(mapper.readValue(jsonStr, DimensionSpec.class)), + DimensionSpec.class); + + RegexFilteredDimensionSpec expected = new RegexFilteredDimensionSpec( + new DefaultDimensionSpec("foo", "bar"), + "xxx" + ); + + Assert.assertEquals(expected, actual); + } + + @Test + public void testGetCacheKey() + { + RegexFilteredDimensionSpec spec1 = new RegexFilteredDimensionSpec( + new DefaultDimensionSpec("foo", "bar"), + "xxx" + ); + + RegexFilteredDimensionSpec spec2 = new RegexFilteredDimensionSpec( + new DefaultDimensionSpec("foo", "bar"), + "xyz" + ); + + Assert.assertFalse(Arrays.equals(spec1.getCacheKey(), spec2.getCacheKey())); + } +} From e1ea93b6fc8120dfee742ab6ad3cf2ba2790e7da Mon Sep 17 00:00:00 2001 From: Himanshu Gupta Date: Wed, 30 Dec 2015 18:01:22 -0600 Subject: [PATCH 3/3] documenting querying behavior on multi-valued dimensions --- .../querying/multi-valued-dimensions.md | 238 ++++++++++++++++++ docs/content/toc.textile | 1 + 2 files changed, 239 insertions(+) create mode 100644 docs/content/querying/multi-valued-dimensions.md diff --git a/docs/content/querying/multi-valued-dimensions.md b/docs/content/querying/multi-valued-dimensions.md new file mode 100644 index 000000000000..65c4fa3d558c --- /dev/null +++ b/docs/content/querying/multi-valued-dimensions.md @@ -0,0 +1,238 @@ +--- +layout: doc_page +--- + +Druid supports "multi-valued" dimensions. See the section on multi-valued columns in [segments](../design/segments.html) for internal representation details. This document describes the behavior of groupBy(topN has similar behavior) queries on multi-valued dimensions when they are used as a dimension being grouped by. + +Suppose, you have a dataSource with a segment that contains following rows with a multi-valued dimension called tags. + +``` +2772011-01-12T00:00:00.000Z,["t1","t2","t3"], #row1 +2782011-01-13T00:00:00.000Z,["t3","t4","t5"], #row2 +2792011-01-14T00:00:00.000Z,["t5","t6","t7"] #row3 +``` + +### Group-By query with no filtering + +See [GroupBy querying](groupbyquery.html) for details. + +```json +{ + "queryType": "groupBy", + "dataSource": "test", + "intervals": [ + "1970-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z" + ], + "granularity": { + "type": "all" + }, + "dimensions": [ + { + "type": "default", + "dimension": "tags", + "outputName": "tags" + } + ], + "aggregations": [ + { + "type": "count", + "name": "count" + } + ] +} +``` + +returns following result. + +```json +[ + { + "timestamp": "1970-01-01T00:00:00.000Z", + "event": { + "count": 1, + "tags": "t1" + } + }, + { + "timestamp": "1970-01-01T00:00:00.000Z", + "event": { + "count": 1, + "tags": "t2" + } + }, + { + "timestamp": "1970-01-01T00:00:00.000Z", + "event": { + "count": 2, + "tags": "t3" + } + }, + { + "timestamp": "1970-01-01T00:00:00.000Z", + "event": { + "count": 1, + "tags": "t4" + } + }, + { + "timestamp": "1970-01-01T00:00:00.000Z", + "event": { + "count": 2, + "tags": "t5" + } + }, + { + "timestamp": "1970-01-01T00:00:00.000Z", + "event": { + "count": 1, + "tags": "t6" + } + }, + { + "timestamp": "1970-01-01T00:00:00.000Z", + "event": { + "count": 1, + "tags": "t7" + } + } +] +``` + +notice how original rows are "exploded" into multiple rows and merged. + +### Group-By query with a selector query filter + +See [query filters](filters.html) for details of selector query filter. + +```json +{ + "queryType": "groupBy", + "dataSource": "test", + "intervals": [ + "1970-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z" + ], + "filter": { + "type": "selector", + "dimension": "tags", + "value": "t3" + }, + "granularity": { + "type": "all" + }, + "dimensions": [ + { + "type": "default", + "dimension": "tags", + "outputName": "tags" + } + ], + "aggregations": [ + { + "type": "count", + "name": "count" + } + ] +} +``` + +returns following result. + +```json +[ + { + "timestamp": "1970-01-01T00:00:00.000Z", + "event": { + "count": 1, + "tags": "t1" + } + }, + { + "timestamp": "1970-01-01T00:00:00.000Z", + "event": { + "count": 1, + "tags": "t2" + } + }, + { + "timestamp": "1970-01-01T00:00:00.000Z", + "event": { + "count": 2, + "tags": "t3" + } + }, + { + "timestamp": "1970-01-01T00:00:00.000Z", + "event": { + "count": 1, + "tags": "t4" + } + }, + { + "timestamp": "1970-01-01T00:00:00.000Z", + "event": { + "count": 1, + "tags": "t5" + } + } +] +``` + +You might be surprised to see inclusion of "t1", "t2", "t4" and "t5" in the results. It happens because query filter is applied on the row before explosion. For multi-valued dimensions, selector filter for "t3" would match row1 and row2, after which exploding is done. For multi-valued dimensions, query filter matches a row if any individual value inside the multiple values matches the query filter. + +### Group-By query with a selector query filter and additional filter in "dimensions" attributes + +To solve the problem above and to get only rows for "t3" returned, you would have to use a "filtered dimension spec" as in the query below. + +See section on filtered dimensionSpecs in [dimensionSpecs](dimensionspecs.html) for details. + +```json +{ + "queryType": "groupBy", + "dataSource": "test", + "intervals": [ + "1970-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z" + ], + "filter": { + "type": "selector", + "dimension": "tags", + "value": "t3" + }, + "granularity": { + "type": "all" + }, + "dimensions": [ + { + "type": "listFiltered", + "delegate": { + "type": "default", + "dimension": "tags", + "outputName": "tags" + }, + "values": ["t3"] + } + ], + "aggregations": [ + { + "type": "count", + "name": "count" + } + ] +} +``` + +returns following result. + +```json +[ + { + "timestamp": "1970-01-01T00:00:00.000Z", + "event": { + "count": 2, + "tags": "t3" + } + } +] +``` + +Note that, for groupBy queries, you could get similar result with a [having spec](having.html) but using a filtered dimensionSpec would be much more efficient because that gets applied at the lowest level in the query processing pipeline while having spec is applied at the highest level of groupBy query processing. + diff --git a/docs/content/toc.textile b/docs/content/toc.textile index 3e695048006a..576e4160b8e8 100644 --- a/docs/content/toc.textile +++ b/docs/content/toc.textile @@ -38,6 +38,7 @@ h2. Querying ** "Context":../querying/query-context.html * "SQL":../querying/sql.html * "Joins":../querying/joins.html +* "Multi-Valued Dimensions":../querying/multi-valued-dimensions.html h2. Design * "Overview":../design/design.html