-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Support min/max values for metadata query #2208
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,14 +21,21 @@ | |
|
|
||
| import com.google.common.base.Function; | ||
| import com.google.common.base.Preconditions; | ||
| import com.google.common.base.Strings; | ||
| import com.google.common.collect.Iterables; | ||
| import com.google.common.collect.Maps; | ||
| import com.google.common.collect.Sets; | ||
| import com.google.common.primitives.Longs; | ||
| import com.metamx.common.StringUtils; | ||
| import com.metamx.common.guava.Accumulator; | ||
| import com.metamx.common.guava.Sequence; | ||
| import com.metamx.common.logger.Logger; | ||
| import io.druid.granularity.QueryGranularity; | ||
| import io.druid.query.dimension.DefaultDimensionSpec; | ||
| import io.druid.query.metadata.metadata.ColumnAnalysis; | ||
| import io.druid.query.metadata.metadata.SegmentMetadataQuery; | ||
| import io.druid.segment.Cursor; | ||
| import io.druid.segment.DimensionSelector; | ||
| import io.druid.segment.QueryableIndex; | ||
| import io.druid.segment.Segment; | ||
| import io.druid.segment.StorageAdapter; | ||
|
|
@@ -38,8 +45,10 @@ | |
| import io.druid.segment.column.ColumnCapabilitiesImpl; | ||
| import io.druid.segment.column.ComplexColumn; | ||
| import io.druid.segment.column.ValueType; | ||
| import io.druid.segment.data.IndexedInts; | ||
| import io.druid.segment.serde.ComplexMetricSerde; | ||
| import io.druid.segment.serde.ComplexMetrics; | ||
| import org.joda.time.Interval; | ||
|
|
||
| import javax.annotation.Nullable; | ||
| import java.util.EnumSet; | ||
|
|
@@ -104,7 +113,11 @@ public Map<String, ColumnAnalysis> analyze(Segment segment) | |
| analysis = analyzeNumericColumn(capabilities, length, NUM_BYTES_IN_TEXT_FLOAT); | ||
| break; | ||
| case STRING: | ||
| analysis = analyzeStringColumn(capabilities, column, storageAdapter.getDimensionCardinality(columnName)); | ||
| if (index != null) { | ||
| analysis = analyzeStringColumn(capabilities, column); | ||
| } else { | ||
| analysis = analyzeStringColumn(capabilities, storageAdapter, columnName); | ||
| } | ||
| break; | ||
| case COMPLEX: | ||
| analysis = analyzeComplexColumn(capabilities, column, storageAdapter.getColumnTypeName(columnName)); | ||
|
|
@@ -140,6 +153,11 @@ public boolean analyzingCardinality() | |
| return analysisTypes.contains(SegmentMetadataQuery.AnalysisType.CARDINALITY); | ||
| } | ||
|
|
||
| public boolean analyzingMinMax() | ||
| { | ||
| return analysisTypes.contains(SegmentMetadataQuery.AnalysisType.MINMAX); | ||
| } | ||
|
|
||
| private ColumnAnalysis analyzeNumericColumn( | ||
| final ColumnCapabilities capabilities, | ||
| final int length, | ||
|
|
@@ -161,28 +179,30 @@ private ColumnAnalysis analyzeNumericColumn( | |
| capabilities.hasMultipleValues(), | ||
| size, | ||
| null, | ||
| null, | ||
| null, | ||
| null | ||
| ); | ||
| } | ||
|
|
||
| private ColumnAnalysis analyzeStringColumn( | ||
| final ColumnCapabilities capabilities, | ||
| @Nullable final Column column, | ||
| final int cardinality | ||
| final Column column | ||
| ) | ||
| { | ||
| long size = 0; | ||
|
|
||
| if (column != null && analyzingSize()) { | ||
| if (!capabilities.hasBitmapIndexes()) { | ||
| return ColumnAnalysis.error("string_no_bitmap"); | ||
| } | ||
| Comparable min = null; | ||
| Comparable max = null; | ||
|
|
||
| final BitmapIndex bitmapIndex = column.getBitmapIndex(); | ||
| if (cardinality != bitmapIndex.getCardinality()) { | ||
| return ColumnAnalysis.error("bitmap_wrong_cardinality"); | ||
| } | ||
| if (!capabilities.hasBitmapIndexes()) { | ||
| return ColumnAnalysis.error("string_no_bitmap"); | ||
| } | ||
|
|
||
| final BitmapIndex bitmapIndex = column.getBitmapIndex(); | ||
| final int cardinality = bitmapIndex.getCardinality(); | ||
|
|
||
| if (analyzingSize()) { | ||
| for (int i = 0; i < cardinality; ++i) { | ||
| String value = bitmapIndex.getValue(i); | ||
| if (value != null) { | ||
|
|
@@ -191,11 +211,91 @@ private ColumnAnalysis analyzeStringColumn( | |
| } | ||
| } | ||
|
|
||
| if (analyzingMinMax() && cardinality > 0) { | ||
| min = Strings.nullToEmpty(bitmapIndex.getValue(0)); | ||
| max = Strings.nullToEmpty(bitmapIndex.getValue(cardinality - 1)); | ||
| } | ||
|
|
||
| return new ColumnAnalysis( | ||
| capabilities.getType().name(), | ||
| capabilities.hasMultipleValues(), | ||
| size, | ||
| analyzingCardinality() ? cardinality : 0, | ||
| min, | ||
| max, | ||
| null | ||
| ); | ||
| } | ||
|
|
||
| private ColumnAnalysis analyzeStringColumn( | ||
| final ColumnCapabilities capabilities, | ||
| final StorageAdapter storageAdapter, | ||
| final String columnName | ||
| ) | ||
| { | ||
| int cardinality = 0; | ||
| long size = 0; | ||
|
|
||
| Comparable min = null; | ||
| Comparable max = null; | ||
|
|
||
| if (analyzingCardinality()) { | ||
| cardinality = storageAdapter.getDimensionCardinality(columnName); | ||
| } | ||
|
|
||
| if (analyzingSize()) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. im not sure i understand why this needs to be reimplemented
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I cannot understand why those kind of patches, returning zero for size for incremental index, allowed to be committed but it's already done and I've just wanted it to return some value which is more useful than zero.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @navis mostly because "size" is assumed by many committers to not be very useful, as the docs on it are vague enough that it is not really clear what it's supposed to be doing. Are you getting value from it? If so maybe we should also tighten up the docs & behavior & tests to make it a really useful thing. or, we could also deprecate it if we are not aware of anyone getting value from it.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @navis also, incremental improvement- at first introduction, segmentMetadata did not return any info at all, for any analysis type, for incremental index. that has been improved over time for various analysis types.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @gianm Thanks for the detailed explanation. I just surprised to see the semantic of query can be changed so easily. It would be better to return -1 or something if we cannot calculate it, not omitting some part of the value.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we replace the previous impl? I don' think anyone is getting value from the "size" field right now. I agree the previous "size" is not useful at all
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @navis I agree in general for general queries, the "size" field here is just a little "special" due to its history and generally being under-specified as to what it actually should be doing. I think in the long run we should either fully specify what it does, or remove it if nobody cares enough to do that. (my vote is for removing it although that would probably have to wait to 0.10 at this point)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. btw, I have nothing against adding this "size" functionality in this PR. It's at least better than what we're doing now. Although the under-specification remains a problem imo.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just for discussion, I think this PR Is fine: How useful is the current method of size calculation? If we keep the "size" functionality, is the current "flat file" size estimate more or less useful than returning the actual on-disk size of the segments (or expected size, for IncrementalIndexes)?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would think the actual on disk size is more useful. FYI the original use case was for a somewhat esoteric metric that was needed at Metamarkets a long time ago, but as far as I know is no longer needed there. So maybe one of the Metamarketers can chime in on that. |
||
| final long start = storageAdapter.getMinTime().getMillis(); | ||
| final long end = storageAdapter.getMaxTime().getMillis(); | ||
|
|
||
| final Sequence<Cursor> cursors = | ||
| storageAdapter.makeCursors(null, new Interval(start, end), QueryGranularity.ALL, false); | ||
|
|
||
| size = cursors.accumulate( | ||
| 0L, | ||
| new Accumulator<Long, Cursor>() | ||
| { | ||
| @Override | ||
| public Long accumulate(Long accumulated, Cursor cursor) | ||
| { | ||
| DimensionSelector selector = cursor.makeDimensionSelector( | ||
| new DefaultDimensionSpec( | ||
| columnName, | ||
| columnName | ||
| ) | ||
| ); | ||
| if (selector == null) { | ||
| return accumulated; | ||
| } | ||
| long current = accumulated; | ||
| while (!cursor.isDone()) { | ||
| final IndexedInts vals = selector.getRow(); | ||
| for (int i = 0; i < vals.size(); ++i) { | ||
| final String dimVal = selector.lookupName(vals.get(i)); | ||
| if (dimVal != null && !dimVal.isEmpty()) { | ||
| current += StringUtils.toUtf8(dimVal).length; | ||
| } | ||
| } | ||
| cursor.advance(); | ||
| } | ||
|
|
||
| return current; | ||
| } | ||
| } | ||
| ); | ||
| } | ||
|
|
||
| if (analyzingMinMax()) { | ||
| min = storageAdapter.getMinValue(columnName); | ||
| max = storageAdapter.getMaxValue(columnName); | ||
| } | ||
|
|
||
| return new ColumnAnalysis( | ||
| capabilities.getType().name(), | ||
| capabilities.hasMultipleValues(), | ||
| size, | ||
| cardinality, | ||
| min, | ||
| max, | ||
| null | ||
| ); | ||
| } | ||
|
|
@@ -218,7 +318,7 @@ private ColumnAnalysis analyzeComplexColumn( | |
|
|
||
| final Function<Object, Long> inputSizeFn = serde.inputSizeFn(); | ||
| if (inputSizeFn == null) { | ||
| return new ColumnAnalysis(typeName, hasMultipleValues, 0, null, null); | ||
| return new ColumnAnalysis(typeName, hasMultipleValues, 0, null, null, null, null); | ||
| } | ||
|
|
||
| final int length = column.getLength(); | ||
|
|
@@ -232,6 +332,8 @@ private ColumnAnalysis analyzeComplexColumn( | |
| hasMultipleValues, | ||
| size, | ||
| null, | ||
| null, | ||
| null, | ||
| null | ||
| ); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ | |
|
|
||
| import com.fasterxml.jackson.annotation.JsonCreator; | ||
| import com.fasterxml.jackson.annotation.JsonProperty; | ||
| import com.fasterxml.jackson.annotation.JsonTypeInfo; | ||
|
|
||
| import java.util.Objects; | ||
|
|
||
|
|
@@ -32,13 +33,15 @@ public class ColumnAnalysis | |
|
|
||
| public static ColumnAnalysis error(String reason) | ||
| { | ||
| return new ColumnAnalysis("STRING", false, -1, null, ERROR_PREFIX + reason); | ||
| return new ColumnAnalysis("STRING", false, -1, null, null, null, ERROR_PREFIX + reason); | ||
| } | ||
|
|
||
| private final String type; | ||
| private final boolean hasMultipleValues; | ||
| private final long size; | ||
| private final Integer cardinality; | ||
| private final Comparable minValue; | ||
| private final Comparable maxValue; | ||
| private final String errorMessage; | ||
|
|
||
| @JsonCreator | ||
|
|
@@ -47,13 +50,17 @@ public ColumnAnalysis( | |
| @JsonProperty("hasMultipleValues") boolean hasMultipleValues, | ||
| @JsonProperty("size") long size, | ||
| @JsonProperty("cardinality") Integer cardinality, | ||
| @JsonProperty("minValue") Comparable minValue, | ||
| @JsonProperty("maxValue") Comparable maxValue, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we add a serde test checking for this? |
||
| @JsonProperty("errorMessage") String errorMessage | ||
| ) | ||
| { | ||
| this.type = type; | ||
| this.hasMultipleValues = hasMultipleValues; | ||
| this.size = size; | ||
| this.cardinality = cardinality; | ||
| this.minValue = minValue; | ||
| this.maxValue = maxValue; | ||
| this.errorMessage = errorMessage; | ||
| } | ||
|
|
||
|
|
@@ -81,6 +88,20 @@ public Integer getCardinality() | |
| return cardinality; | ||
| } | ||
|
|
||
| @JsonTypeInfo(use = JsonTypeInfo.Id.NAME) | ||
| @JsonProperty | ||
| public Comparable getMinValue() | ||
| { | ||
| return minValue; | ||
| } | ||
|
|
||
| @JsonTypeInfo(use = JsonTypeInfo.Id.NAME) | ||
| @JsonProperty | ||
| public Comparable getMaxValue() | ||
| { | ||
| return maxValue; | ||
| } | ||
|
|
||
| @JsonProperty | ||
| public String getErrorMessage() | ||
| { | ||
|
|
@@ -113,21 +134,29 @@ public ColumnAnalysis fold(ColumnAnalysis rhs) | |
| Integer cardinality = getCardinality(); | ||
| final Integer rhsCardinality = rhs.getCardinality(); | ||
| if (cardinality == null) { | ||
|
|
||
| cardinality = rhsCardinality; | ||
| } else { | ||
| if (rhsCardinality != null) { | ||
| cardinality = Math.max(cardinality, rhsCardinality); | ||
| } | ||
| } else if (rhsCardinality != null) { | ||
| cardinality = Math.max(cardinality, rhsCardinality); | ||
| } | ||
|
|
||
| return new ColumnAnalysis( | ||
| type, | ||
| hasMultipleValues || rhs.isHasMultipleValues(), | ||
| size + rhs.getSize(), | ||
| cardinality, | ||
| null | ||
| ); | ||
| final boolean multipleValues = hasMultipleValues || rhs.isHasMultipleValues(); | ||
|
|
||
| Comparable newMin = choose(minValue, rhs.minValue, false); | ||
| Comparable newMax = choose(maxValue, rhs.maxValue, true); | ||
|
|
||
| return new ColumnAnalysis(type, multipleValues, size + rhs.getSize(), cardinality, newMin, newMax, null); | ||
| } | ||
|
|
||
| private <T extends Comparable> T choose(T obj1, T obj2, boolean max) | ||
| { | ||
| if (obj1 == null) { | ||
| return max ? obj2 : null; | ||
| } | ||
| if (obj2 == null) { | ||
| return max ? obj1 : null; | ||
| } | ||
| int compare = max ? obj1.compareTo(obj2) : obj2.compareTo(obj1); | ||
| return compare > 0 ? obj1 : obj2; | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -138,6 +167,8 @@ public String toString() | |
| ", hasMultipleValues=" + hasMultipleValues + | ||
| ", size=" + size + | ||
| ", cardinality=" + cardinality + | ||
| ", minValue=" + minValue + | ||
| ", maxValue=" + maxValue + | ||
| ", errorMessage='" + errorMessage + '\'' + | ||
| '}'; | ||
| } | ||
|
|
@@ -156,12 +187,14 @@ public boolean equals(Object o) | |
| size == that.size && | ||
| Objects.equals(type, that.type) && | ||
| Objects.equals(cardinality, that.cardinality) && | ||
| Objects.equals(minValue, that.minValue) && | ||
| Objects.equals(maxValue, that.maxValue) && | ||
| Objects.equals(errorMessage, that.errorMessage); | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() | ||
| { | ||
| return Objects.hash(type, hasMultipleValues, size, cardinality, errorMessage); | ||
| return Objects.hash(type, hasMultipleValues, size, cardinality, minValue, maxValue, errorMessage); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can this not just call the other impl? why 2 impls?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
columnis null for incremental index, which make impossible to use other impl.