apache · fjy · Dec 31, 2015 · Dec 19, 2015 · Dec 19, 2015 · Dec 31, 2015
diff --git a/docs/content/querying/dimensionspecs.md b/docs/content/querying/dimensionspecs.md
@@ -252,3 +252,23 @@ A null dimension value can be mapped to a specific value by specifying the empty
 This allows distinguishing between a null dimension and a lookup resulting in a null.
 For example, specifying `{"":"bar","bat":"baz"}` with dimension values `[null, "foo", "bat"]` and replacing missing values with `"oof"` will yield results of `["bar", "oof", "baz"]`.
 Omitting the empty string key will cause the missing value to take over. For example, specifying `{"bat":"baz"}` with dimension values `[null, "foo", "bat"]` and replacing missing values with `"oof"` will yield results of `["oof", "oof", "baz"]`.
+
+### Filtering DimensionSpecs
+These are only valid for multi-valued dimensions. If you have a row in Druid that has a multi-valued dimension with values ["v1", "v2", "v3"] and you send a groupBy/topN query grouping by that dimension with a [query filter](filters.html) for value "v1", the response will contain 3 rows: one each for "v1", "v2" and "v3". This behavior might be unintuitive for some use cases.
+
+This happens because the "query filter" is applied internally to the bitmaps and is only used to match the rows to be included in query result processing. With multi-valued dimensions, the "query filter" behaves like a contains check, which will match the row with dimension value ["v1", "v2", "v3"]. Please see the section on "Multi-value columns" in [segment](../design/segments.html) for more details.
+The groupBy/topN processing pipeline then "explodes" all multi-valued dimensions, resulting in 3 rows: one each for "v1", "v2" and "v3".
+
+In addition to the "query filter", which efficiently selects the rows to be processed, you can use a filtering dimension spec to filter for specific values within the values of a multi-valued dimension. These dimensionSpecs take a delegate DimensionSpec and a filtering criterion. From the "exploded" rows, only rows matching the given filtering criterion are returned in the query result.
+
+The following filtered dimension spec acts as a whitelist or blacklist for values, as per the "isWhitelist" attribute value.
+```json
+{ "type" : "listFiltered", "delegate" : <dimensionSpec>, "values": <array of strings>, "isWhitelist": <optional attribute for true/false, default is true> }
+```
+
+The following filtered dimension spec retains only the values matching the regex. Note that `listFiltered` is faster than this and should be preferred for whitelist or blacklist use cases.
+```json
+{ "type" : "regexFiltered", "delegate" : <dimensionSpec>, "pattern": <java regex pattern> }
+```
+
+For more details and examples, see [multi-valued dimensions](multi-valued-dimensions.html).
diff --git a/docs/content/querying/multi-valued-dimensions.md b/docs/content/querying/multi-valued-dimensions.md
@@ -0,0 +1,238 @@
+---
+layout: doc_page
+---
+
+Druid supports "multi-valued" dimensions. See the section on multi-valued columns in [segments](../design/segments.html) for internal representation details. This document describes the behavior of groupBy queries (topN queries behave similarly) on multi-valued dimensions when they are used as a dimension being grouped by.
+
+Suppose you have a dataSource with a segment that contains the following rows, with a multi-valued dimension called `tags`.
+
+```
+2011-01-12T00:00:00.000Z,["t1","t2","t3"],  #row1
+2011-01-13T00:00:00.000Z,["t3","t4","t5"],  #row2
+2011-01-14T00:00:00.000Z,["t5","t6","t7"]   #row3
+```
+
+### Group-By query with no filtering
+
+See [GroupBy querying](groupbyquery.html) for details.
+
+```json
+{
+  "queryType": "groupBy",
+  "dataSource": "test",
+  "intervals": [
+    "1970-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"
+  ],
+  "granularity": {
+    "type": "all"
+  },
+  "dimensions": [
+    {
+      "type": "default",
+      "dimension": "tags",
+      "outputName": "tags"
+    }
+  ],
+  "aggregations": [
+    {
+      "type": "count",
+      "name": "count"
+    }
+  ]
+}
+```
+
+returns the following result.
+
+```json
+[
+  {
+    "timestamp": "1970-01-01T00:00:00.000Z",
+    "event": {
+      "count": 1,
+      "tags": "t1"
+    }
+  },
+  {
+    "timestamp": "1970-01-01T00:00:00.000Z",
+    "event": {
+      "count": 1,
+      "tags": "t2"
+    }
+  },
+  {
+    "timestamp": "1970-01-01T00:00:00.000Z",
+    "event": {
+      "count": 2,
+      "tags": "t3"
+    }
+  },
+  {
+    "timestamp": "1970-01-01T00:00:00.000Z",
+    "event": {
+      "count": 1,
+      "tags": "t4"
+    }
+  },
+  {
+    "timestamp": "1970-01-01T00:00:00.000Z",
+    "event": {
+      "count": 2,
+      "tags": "t5"
+    }
+  },
+  {
+    "timestamp": "1970-01-01T00:00:00.000Z",
+    "event": {
+      "count": 1,
+      "tags": "t6"
+    }
+  },
+  {
+    "timestamp": "1970-01-01T00:00:00.000Z",
+    "event": {
+      "count": 1,
+      "tags": "t7"
+    }
+  }
+]
+```
+
+Notice how the original rows are "exploded" into multiple rows and merged.
+
+### Group-By query with a selector query filter
+
+See [query filters](filters.html) for details of selector query filter.
+
+```json
+{
+  "queryType": "groupBy",
+  "dataSource": "test",
+  "intervals": [
+    "1970-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"
+  ],
+  "filter": {
+    "type": "selector",
+    "dimension": "tags",
+    "value": "t3"
+  },
+  "granularity": {
+    "type": "all"
+  },
+  "dimensions": [
+    {
+      "type": "default",
+      "dimension": "tags",
+      "outputName": "tags"
+    }
+  ],
+  "aggregations": [
+    {
+      "type": "count",
+      "name": "count"
+    }
+  ]
+}
+```
+
+returns the following result.
+
+```json
+[
+  {
+    "timestamp": "1970-01-01T00:00:00.000Z",
+    "event": {
+      "count": 1,
+      "tags": "t1"
+    }
+  },
+  {
+    "timestamp": "1970-01-01T00:00:00.000Z",
+    "event": {
+      "count": 1,
+      "tags": "t2"
+    }
+  },
+  {
+    "timestamp": "1970-01-01T00:00:00.000Z",
+    "event": {
+      "count": 2,
+      "tags": "t3"
+    }
+  },
+  {
+    "timestamp": "1970-01-01T00:00:00.000Z",
+    "event": {
+      "count": 1,
+      "tags": "t4"
+    }
+  },
+  {
+    "timestamp": "1970-01-01T00:00:00.000Z",
+    "event": {
+      "count": 1,
+      "tags": "t5"
+    }
+  }
+]
+```
+
+You might be surprised to see the inclusion of "t1", "t2", "t4" and "t5" in the results. This happens because the query filter is applied to the row before explosion. For multi-valued dimensions, a selector filter for "t3" matches row1 and row2, after which exploding is done. For multi-valued dimensions, a query filter matches a row if any individual value inside the multiple values matches the query filter.
+
+### Group-By query with a selector query filter and additional filter in "dimensions" attributes
+
+To solve the problem above and to get only rows for "t3" returned, you would have to use a "filtered dimension spec" as in the query below.
+
+See section on filtered dimensionSpecs in [dimensionSpecs](dimensionspecs.html) for details.
+
+```json
+{
+  "queryType": "groupBy",
+  "dataSource": "test",
+  "intervals": [
+    "1970-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"
+  ],
+  "filter": {
+    "type": "selector",
+    "dimension": "tags",
+    "value": "t3"
+  },
+  "granularity": {
+    "type": "all"
+  },
+  "dimensions": [
+    {
+      "type": "listFiltered",
+      "delegate": {
+        "type": "default",
+        "dimension": "tags",
+        "outputName": "tags"
+      },
+      "values": ["t3"]
+    }
+  ],
+  "aggregations": [
+    {
+      "type": "count",
+      "name": "count"
+    }
+  ]
+}
+```
+
+returns the following result.
+
+```json
+[
+  {
+    "timestamp": "1970-01-01T00:00:00.000Z",
+    "event": {
+      "count": 2,
+      "tags": "t3"
+    }
+  }
+]
+```
+
+Note that, for groupBy queries, you could get a similar result with a [having spec](having.html), but using a filtered dimensionSpec is much more efficient because it gets applied at the lowest level in the query processing pipeline, while a having spec is applied at the highest level of groupBy query processing.
+
diff --git a/docs/content/toc.textile b/docs/content/toc.textile
@@ -38,6 +38,7 @@ h2. Querying
 ** "Context":../querying/query-context.html
 * "SQL":../querying/sql.html
 * "Joins":../querying/joins.html
+* "Multi-Valued Dimensions":../querying/multi-valued-dimensions.html
 
 h2. Design
 * "Overview":../design/design.html

diff --git a/...ng/src/main/java/io/druid/query/aggregation/cardinality/CardinalityAggregatorFactory.java b/...ng/src/main/java/io/druid/query/aggregation/cardinality/CardinalityAggregatorFactory.java
@@ -33,6 +33,7 @@
 import io.druid.query.aggregation.BufferAggregator;
 import io.druid.query.aggregation.hyperloglog.HyperLogLogCollector;
 import io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory;
+import io.druid.query.dimension.DefaultDimensionSpec;
 import io.druid.segment.ColumnSelectorFactory;
 import io.druid.segment.DimensionSelector;
 import org.apache.commons.codec.binary.Base64;
@@ -107,7 +108,7 @@ private List<DimensionSelector> makeDimensionSelectors(final ColumnSelectorFacto
               @Override
               public DimensionSelector apply(@Nullable String input)
               {
-                return columnFactory.makeDimensionSelector(input, null);
+                return columnFactory.makeDimensionSelector(new DefaultDimensionSpec(input, input));
               }
             }
             ), Predicates.notNull()

diff --git a/processing/src/main/java/io/druid/query/dimension/BaseFilteredDimensionSpec.java b/processing/src/main/java/io/druid/query/dimension/BaseFilteredDimensionSpec.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to Metamarkets Group Inc. (Metamarkets) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Metamarkets licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package io.druid.query.dimension;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.google.common.base.Preconditions;
+import io.druid.query.extraction.ExtractionFn;
+
+/**
+ * Abstract base for {@link DimensionSpec} implementations that wrap another DimensionSpec
+ * (the "delegate").
+ *
+ * <p>The delegate is mandatory: the constructor rejects a null delegate through
+ * {@link Preconditions#checkNotNull}. The accessors implemented here, namely
+ * {@code getDimension()}, {@code getOutputName()}, {@code getExtractionFn()} and
+ * {@code preservesOrdering()}, simply forward to the delegate. {@code getDelegate()}
+ * returns the delegate itself and is annotated with {@link JsonProperty}, mirroring the
+ * "delegate" property bound on the constructor parameter.
+ *
+ * <p>NOTE(review): no other DimensionSpec member is implemented in this class, so concrete
+ * subclasses presumably supply the actual value-filtering behavior. Confirm against the
+ * interface and the subclasses, which are not visible here.
+ */
+public abstract class BaseFilteredDimensionSpec implements DimensionSpec
+{
+  protected final DimensionSpec delegate;
+
+  public BaseFilteredDimensionSpec(
+      @JsonProperty("delegate") DimensionSpec delegate
+  )
+  {
+    this.delegate = Preconditions.checkNotNull(delegate, "delegate must not be null"); // fail fast: every accessor below dereferences the delegate
+  }
+
+  @JsonProperty
+  public DimensionSpec getDelegate()
+  {
+    return delegate;
+  }
+
+  @Override
+  public String getDimension()
+  {
+    return delegate.getDimension();
+  }
+
+  @Override
+  public String getOutputName()
+  {
+    return delegate.getOutputName();
+  }
+
+  @Override
+  public ExtractionFn getExtractionFn()
+  {
+    return delegate.getExtractionFn();
+  }
+
+  @Override
+  public boolean preservesOrdering()
+  {
+    return delegate.preservesOrdering();
+  }
+}
diff --git a/processing/src/main/java/io/druid/query/dimension/DefaultDimensionSpec.java b/processing/src/main/java/io/druid/query/dimension/DefaultDimensionSpec.java
@@ -23,6 +23,7 @@
 import com.fasterxml.jackson.annotation.JsonProperty;
 import com.metamx.common.StringUtils;
 import io.druid.query.extraction.ExtractionFn;
+import io.druid.segment.DimensionSelector;
 
 import java.nio.ByteBuffer;
 
@@ -66,6 +67,12 @@ public ExtractionFn getExtractionFn()
     return null;
   }
 
+  @Override
+  public DimensionSelector decorate(DimensionSelector selector)
+  {
+    return selector; // this spec applies no decoration: the given selector is handed back unchanged
+  }
+
   @Override
   public byte[] getCacheKey()
   {