2 changes: 1 addition & 1 deletion docs/content/design/broker.md
@@ -25,7 +25,7 @@ To determine which nodes to forward queries to, the Broker node first builds a v
Caching
-------

Broker nodes employ a cache with an LRU cache invalidation strategy. The broker cache stores per segment results. The cache can be local to each broker node or shared across multiple nodes using an external distributed cache such as [memcached](http://memcached.org/). Each time a broker node receives a query, it first maps the query to a set of segments. A subset of these segment results may already exist in the cache and the results can be directly pulled from the cache. For any segment results that do not exist in the cache, the broker node will forward the query to the
Broker nodes employ a cache with an LRU cache invalidation strategy. The broker cache stores per-segment results. The cache can be local to each broker node or shared across multiple nodes using an external distributed cache such as [memcached](http://memcached.org/). Each time a broker node receives a query, it first maps the query to a set of segments. A subset of these segment results may already exist in the cache and the results can be directly pulled from the cache. For any segment results that do not exist in the cache, the broker node will forward the query to the
historical nodes. Once the historical nodes return their results, the broker will store those results in the cache. Real-time segments are never cached and hence requests for real-time data will always be forwarded to real-time nodes. Real-time data is perpetually changing and caching the results would be unreliable.

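The per-segment lookup that the Caching section describes can be sketched roughly as follows. Every name and type here is a hypothetical stand-in, not Druid's actual broker or cache API, and real-time segments (which are never cached) are omitted:

```java
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.function.BiFunction;
import java.util.function.Function;

// Minimal sketch of per-segment result caching, assuming string-keyed segments
// and string results for simplicity.
public class PerSegmentCacheSketch
{
  // Stand-in for the LRU cache; a real one would cap its size and evict.
  private final Map<String, String> cache = new LinkedHashMap<>();

  public List<String> run(
      String query,
      Function<String, List<String>> mapQueryToSegments,     // query -> segment ids
      BiFunction<String, String, String> forwardToHistorical // (query, segment) -> result
  )
  {
    final List<String> results = new ArrayList<>();
    for (String segmentId : mapQueryToSegments.apply(query)) {
      String result = cache.get(segmentId); // per-segment cache lookup
      if (result == null) {
        result = forwardToHistorical.apply(query, segmentId); // cache miss: forward
        cache.put(segmentId, result); // store the returned per-segment result
      }
      results.add(result);
    }
    return results; // the broker merges per-segment results (not shown)
  }
}
```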
HTTP Endpoints
2 changes: 1 addition & 1 deletion docs/content/design/index.md
@@ -90,7 +90,7 @@ Druid is a column store, which means each individual column is stored separately
in that query, and Druid is pretty good about only scanning exactly what it needs for a query.
Different columns can employ different compression methods, and different columns can have different indexes associated with them.

Druid indexes data on a per shard (segment) level.
Druid indexes data on a per-shard (segment) level.

## Loading the Data

9 changes: 7 additions & 2 deletions docs/content/querying/segmentmetadataquery.md
@@ -2,9 +2,10 @@
layout: doc_page
---
# Segment Metadata Queries
Segment metadata queries return per segment information about:
Segment metadata queries return per-segment information about:

* Cardinality of all columns in the segment
* Min/max values of string type columns in the segment
* Estimated byte size for the segment columns if they were stored in a flat format
* Number of rows stored inside the segment
* Interval the segment covers
@@ -103,13 +104,17 @@ This is a list of properties that determines the amount of information returned

By default, all analysis types will be used. If a property is not needed, omitting it from this list will result in a more efficient query.

There are four types of column analyses:
There are five types of column analyses:

#### cardinality

* `cardinality` in the result will return the estimated floor of cardinality for each column. Only relevant for
dimension columns.

#### minmax

* Estimated min/max values for each column. Only relevant for dimension columns.

#### size

* `size` in the result will contain the estimated total segment byte size as if the data were stored in text format
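The analysis-type gating described above is an EnumSet membership check, mirroring `analyzingMinMax()` in the SegmentAnalyzer diff below. A minimal self-contained sketch (the enum here is abbreviated; the real one lives in `SegmentMetadataQuery.AnalysisType`):

```java
import java.util.EnumSet;

// Omitting a type from the set skips that analysis entirely, which is why
// trimming the analysisTypes list makes the query cheaper.
public class AnalysisTypesSketch
{
  enum AnalysisType { CARDINALITY, SIZE, MINMAX }

  private final EnumSet<AnalysisType> analysisTypes =
      EnumSet.of(AnalysisType.CARDINALITY, AnalysisType.MINMAX); // "size" omitted

  public boolean analyzingMinMax()
  {
    return analysisTypes.contains(AnalysisType.MINMAX);
  }
}
```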
126 changes: 114 additions & 12 deletions processing/src/main/java/io/druid/query/metadata/SegmentAnalyzer.java
@@ -21,14 +21,21 @@

import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.primitives.Longs;
import com.metamx.common.StringUtils;
import com.metamx.common.guava.Accumulator;
import com.metamx.common.guava.Sequence;
import com.metamx.common.logger.Logger;
import io.druid.granularity.QueryGranularity;
import io.druid.query.dimension.DefaultDimensionSpec;
import io.druid.query.metadata.metadata.ColumnAnalysis;
import io.druid.query.metadata.metadata.SegmentMetadataQuery;
import io.druid.segment.Cursor;
import io.druid.segment.DimensionSelector;
import io.druid.segment.QueryableIndex;
import io.druid.segment.Segment;
import io.druid.segment.StorageAdapter;
@@ -38,8 +45,10 @@
import io.druid.segment.column.ColumnCapabilitiesImpl;
import io.druid.segment.column.ComplexColumn;
import io.druid.segment.column.ValueType;
import io.druid.segment.data.IndexedInts;
import io.druid.segment.serde.ComplexMetricSerde;
import io.druid.segment.serde.ComplexMetrics;
import org.joda.time.Interval;

import javax.annotation.Nullable;
import java.util.EnumSet;
@@ -104,7 +113,11 @@ public Map<String, ColumnAnalysis> analyze(Segment segment)
analysis = analyzeNumericColumn(capabilities, length, NUM_BYTES_IN_TEXT_FLOAT);
break;
case STRING:
analysis = analyzeStringColumn(capabilities, column, storageAdapter.getDimensionCardinality(columnName));
if (index != null) {
analysis = analyzeStringColumn(capabilities, column);
} else {
analysis = analyzeStringColumn(capabilities, storageAdapter, columnName);
}
break;
case COMPLEX:
analysis = analyzeComplexColumn(capabilities, column, storageAdapter.getColumnTypeName(columnName));
@@ -140,6 +153,11 @@ public boolean analyzingCardinality()
return analysisTypes.contains(SegmentMetadataQuery.AnalysisType.CARDINALITY);
}

public boolean analyzingMinMax()
{
return analysisTypes.contains(SegmentMetadataQuery.AnalysisType.MINMAX);
}

private ColumnAnalysis analyzeNumericColumn(
final ColumnCapabilities capabilities,
final int length,
@@ -161,28 +179,30 @@ private ColumnAnalysis analyzeNumericColumn(
capabilities.hasMultipleValues(),
size,
null,
null,
null,
null
);
}

private ColumnAnalysis analyzeStringColumn(
Contributor: can this not just call the other impl? why 2 impls?

Contributor Author: `column` is null for the incremental index, which makes it impossible to use the other impl.

final ColumnCapabilities capabilities,
@Nullable final Column column,
final int cardinality
final Column column
)
{
long size = 0;

if (column != null && analyzingSize()) {
if (!capabilities.hasBitmapIndexes()) {
return ColumnAnalysis.error("string_no_bitmap");
}
Comparable min = null;
Comparable max = null;

final BitmapIndex bitmapIndex = column.getBitmapIndex();
if (cardinality != bitmapIndex.getCardinality()) {
return ColumnAnalysis.error("bitmap_wrong_cardinality");
}
if (!capabilities.hasBitmapIndexes()) {
return ColumnAnalysis.error("string_no_bitmap");
}

final BitmapIndex bitmapIndex = column.getBitmapIndex();
final int cardinality = bitmapIndex.getCardinality();

if (analyzingSize()) {
for (int i = 0; i < cardinality; ++i) {
String value = bitmapIndex.getValue(i);
if (value != null) {
@@ -191,11 +211,91 @@ private ColumnAnalysis analyzeStringColumn(
}
}

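// The bitmap index's value dictionary is sorted, so the first and last entries
// are the column's min and max.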
if (analyzingMinMax() && cardinality > 0) {
min = Strings.nullToEmpty(bitmapIndex.getValue(0));
max = Strings.nullToEmpty(bitmapIndex.getValue(cardinality - 1));
}

return new ColumnAnalysis(
capabilities.getType().name(),
capabilities.hasMultipleValues(),
size,
analyzingCardinality() ? cardinality : 0,
min,
max,
null
);
}

private ColumnAnalysis analyzeStringColumn(
final ColumnCapabilities capabilities,
final StorageAdapter storageAdapter,
final String columnName
)
{
int cardinality = 0;
long size = 0;

Comparable min = null;
Comparable max = null;

if (analyzingCardinality()) {
cardinality = storageAdapter.getDimensionCardinality(columnName);
}

if (analyzingSize()) {
Contributor: I'm not sure I understand why this needs to be reimplemented.

Contributor Author: I cannot understand why patches like that (returning zero for the size of an incremental index) were allowed to be committed, but it's already done, and I just wanted it to return a value more useful than zero.

Contributor: @navis Mostly because "size" is assumed by many committers to not be very useful, as the docs on it are vague enough that it is not really clear what it's supposed to be doing. Are you getting value from it? If so, maybe we should also tighten up the docs, behavior, and tests to make it a really useful thing. Or we could deprecate it if we are not aware of anyone getting value from it.

Contributor: @navis Also, this is incremental improvement: at first introduction, segmentMetadata did not return any info at all, for any analysis type, for the incremental index. That has been improved over time for various analysis types.

Contributor Author: @gianm Thanks for the detailed explanation. I was just surprised to see that the semantics of a query can be changed so easily. It would be better to return -1 or something if we cannot calculate it, rather than omitting part of the value.

Contributor: Can we replace the previous impl? I don't think anyone is getting value from the "size" field right now. I agree the previous "size" is not useful at all.

Contributor: @navis I agree in general for general queries; the "size" field here is just a little "special" due to its history and generally being under-specified as to what it actually should be doing. I think in the long run we should either fully specify what it does, or remove it if nobody cares enough to do that. (My vote is for removing it, although that would probably have to wait for 0.10 at this point.)

Contributor: By the way, I have nothing against adding this "size" functionality in this PR. It's at least better than what we're doing now. Although the under-specification remains a problem, in my opinion.

Contributor: Just for discussion, I think this PR is fine:

How useful is the current method of size calculation?

If we keep the "size" functionality, is the current "flat file" size estimate more or less useful than returning the actual on-disk size of the segments (or expected size, for IncrementalIndexes)?

Contributor: I would think the actual on-disk size is more useful. FYI, the original use case was for a somewhat esoteric metric that was needed at Metamarkets a long time ago, but as far as I know it is no longer needed there. So maybe one of the Metamarketers can chime in on that.


final long start = storageAdapter.getMinTime().getMillis();
final long end = storageAdapter.getMaxTime().getMillis();

final Sequence<Cursor> cursors =
storageAdapter.makeCursors(null, new Interval(start, end), QueryGranularity.ALL, false);

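// No flat column exists to measure here (e.g. an incremental index), so walk
// every row with a cursor and sum the UTF-8 byte size of each dimension value.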
size = cursors.accumulate(
0L,
new Accumulator<Long, Cursor>()
{
@Override
public Long accumulate(Long accumulated, Cursor cursor)
{
DimensionSelector selector = cursor.makeDimensionSelector(
new DefaultDimensionSpec(
columnName,
columnName
)
);
if (selector == null) {
return accumulated;
}
long current = accumulated;
while (!cursor.isDone()) {
final IndexedInts vals = selector.getRow();
for (int i = 0; i < vals.size(); ++i) {
final String dimVal = selector.lookupName(vals.get(i));
if (dimVal != null && !dimVal.isEmpty()) {
current += StringUtils.toUtf8(dimVal).length;
}
}
cursor.advance();
}

return current;
}
}
);
}

if (analyzingMinMax()) {
min = storageAdapter.getMinValue(columnName);
max = storageAdapter.getMaxValue(columnName);
}

return new ColumnAnalysis(
capabilities.getType().name(),
capabilities.hasMultipleValues(),
size,
cardinality,
min,
max,
null
);
}
@@ -218,7 +318,7 @@ private ColumnAnalysis analyzeComplexColumn(

final Function<Object, Long> inputSizeFn = serde.inputSizeFn();
if (inputSizeFn == null) {
return new ColumnAnalysis(typeName, hasMultipleValues, 0, null, null);
return new ColumnAnalysis(typeName, hasMultipleValues, 0, null, null, null, null);
}

final int length = column.getLength();
@@ -232,6 +332,8 @@
hasMultipleValues,
size,
null,
null,
null,
null
);
}
processing/src/main/java/io/druid/query/metadata/metadata/ColumnAnalysis.java
@@ -21,6 +21,7 @@

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonTypeInfo;

import java.util.Objects;

@@ -32,13 +33,15 @@ public class ColumnAnalysis

public static ColumnAnalysis error(String reason)
{
return new ColumnAnalysis("STRING", false, -1, null, ERROR_PREFIX + reason);
return new ColumnAnalysis("STRING", false, -1, null, null, null, ERROR_PREFIX + reason);
}

private final String type;
private final boolean hasMultipleValues;
private final long size;
private final Integer cardinality;
private final Comparable minValue;
private final Comparable maxValue;
private final String errorMessage;

@JsonCreator
@@ -47,13 +50,17 @@ public ColumnAnalysis(
@JsonProperty("hasMultipleValues") boolean hasMultipleValues,
@JsonProperty("size") long size,
@JsonProperty("cardinality") Integer cardinality,
@JsonProperty("minValue") Comparable minValue,
@JsonProperty("maxValue") Comparable maxValue,
Contributor: can we add a serde test checking for this?

@JsonProperty("errorMessage") String errorMessage
)
{
this.type = type;
this.hasMultipleValues = hasMultipleValues;
this.size = size;
this.cardinality = cardinality;
this.minValue = minValue;
this.maxValue = maxValue;
this.errorMessage = errorMessage;
}

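A minimal sketch of the round-trip test requested in the review comment above. The test class and name are hypothetical, and it assumes an ObjectMapper configured the way Druid configures Jackson; the @JsonTypeInfo annotations on the new getters may need matching mapper or subtype configuration to deserialize the Comparable values:

```java
import com.fasterxml.jackson.databind.ObjectMapper;
import org.junit.Assert;
import org.junit.Test;

public class ColumnAnalysisSerdeTest
{
  @Test
  public void testMinMaxSurviveRoundTrip() throws Exception
  {
    final ObjectMapper mapper = new ObjectMapper(); // Druid would use its configured mapper
    final ColumnAnalysis original =
        new ColumnAnalysis("STRING", false, 100L, 10, "abc", "xyz", null);
    final String json = mapper.writeValueAsString(original);
    final ColumnAnalysis roundTripped = mapper.readValue(json, ColumnAnalysis.class);
    Assert.assertEquals(original, roundTripped);
  }
}
```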
@@ -81,6 +88,20 @@ public Integer getCardinality()
return cardinality;
}

@JsonTypeInfo(use = JsonTypeInfo.Id.NAME)
@JsonProperty
public Comparable getMinValue()
{
return minValue;
}

@JsonTypeInfo(use = JsonTypeInfo.Id.NAME)
@JsonProperty
public Comparable getMaxValue()
{
return maxValue;
}

@JsonProperty
public String getErrorMessage()
{
@@ -113,21 +134,29 @@ public ColumnAnalysis fold(ColumnAnalysis rhs)
Integer cardinality = getCardinality();
final Integer rhsCardinality = rhs.getCardinality();
if (cardinality == null) {

cardinality = rhsCardinality;
} else {
if (rhsCardinality != null) {
cardinality = Math.max(cardinality, rhsCardinality);
}
} else if (rhsCardinality != null) {
cardinality = Math.max(cardinality, rhsCardinality);
}

return new ColumnAnalysis(
type,
hasMultipleValues || rhs.isHasMultipleValues(),
size + rhs.getSize(),
cardinality,
null
);
final boolean multipleValues = hasMultipleValues || rhs.isHasMultipleValues();

Comparable newMin = choose(minValue, rhs.minValue, false);
Comparable newMax = choose(maxValue, rhs.maxValue, true);

return new ColumnAnalysis(type, multipleValues, size + rhs.getSize(), cardinality, newMin, newMax, null);
}

private <T extends Comparable> T choose(T obj1, T obj2, boolean max)
{
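// Folding min is conservative: a null (unknown) on either side makes the folded
// min null. Folding max falls back to whichever value is non-null, else the larger.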
if (obj1 == null) {
return max ? obj2 : null;
}
if (obj2 == null) {
return max ? obj1 : null;
}
int compare = max ? obj1.compareTo(obj2) : obj2.compareTo(obj1);
return compare > 0 ? obj1 : obj2;
}
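// Illustrative fold of two per-segment analyses (hypothetical values):
//   new ColumnAnalysis("STRING", false, 100, 10, "apple", "pear", null)
//     .fold(new ColumnAnalysis("STRING", false, 200, 25, null, "zebra", null))
// yields size=300, cardinality=25, minValue=null (unknown on one side), maxValue="zebra".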

@Override
@@ -138,6 +167,8 @@ public String toString()
", hasMultipleValues=" + hasMultipleValues +
", size=" + size +
", cardinality=" + cardinality +
", minValue=" + minValue +
", maxValue=" + maxValue +
", errorMessage='" + errorMessage + '\'' +
'}';
}
Expand All @@ -156,12 +187,14 @@ public boolean equals(Object o)
size == that.size &&
Objects.equals(type, that.type) &&
Objects.equals(cardinality, that.cardinality) &&
Objects.equals(minValue, that.minValue) &&
Objects.equals(maxValue, that.maxValue) &&
Objects.equals(errorMessage, that.errorMessage);
}

@Override
public int hashCode()
{
return Objects.hash(type, hasMultipleValues, size, cardinality, errorMessage);
return Objects.hash(type, hasMultipleValues, size, cardinality, minValue, maxValue, errorMessage);
}
}