-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Lazy-ify ValueMatcher BitSet optimization for string dimensions. #5403
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,9 +20,9 @@ | |
| package io.druid.benchmark; | ||
|
|
||
| import com.fasterxml.jackson.databind.ObjectMapper; | ||
| import com.google.common.base.Function; | ||
| import com.google.common.base.Predicate; | ||
| import com.google.common.base.Strings; | ||
| import com.google.common.collect.ImmutableList; | ||
| import com.google.common.io.Files; | ||
| import io.druid.benchmark.datagen.BenchmarkDataGenerator; | ||
| import io.druid.benchmark.datagen.BenchmarkSchemaInfo; | ||
|
|
@@ -242,12 +242,7 @@ public void stringRead(Blackhole blackhole) | |
| { | ||
| StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex); | ||
| Sequence<Cursor> cursors = makeCursors(sa, null); | ||
|
|
||
| Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole); | ||
| List<String> strings = stringListSeq.limit(1).toList().get(0); | ||
| for (String st : strings) { | ||
| blackhole.consume(st); | ||
| } | ||
| readCursors(cursors, blackhole); | ||
| } | ||
|
|
||
| @Benchmark | ||
|
|
@@ -258,11 +253,7 @@ public void longRead(Blackhole blackhole) | |
| StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex); | ||
| Sequence<Cursor> cursors = makeCursors(sa, null); | ||
|
|
||
| Sequence<List<Long>> longListSeq = readCursorsLong(cursors, blackhole); | ||
| List<Long> strings = longListSeq.limit(1).toList().get(0); | ||
| for (Long st : strings) { | ||
| blackhole.consume(st); | ||
| } | ||
| readCursorsLong(cursors, blackhole); | ||
| } | ||
|
|
||
| @Benchmark | ||
|
|
@@ -273,11 +264,7 @@ public void timeFilterNone(Blackhole blackhole) | |
| StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex); | ||
| Sequence<Cursor> cursors = makeCursors(sa, timeFilterNone); | ||
|
|
||
| Sequence<List<Long>> longListSeq = readCursorsLong(cursors, blackhole); | ||
| List<Long> strings = longListSeq.limit(1).toList().get(0); | ||
| for (Long st : strings) { | ||
| blackhole.consume(st); | ||
| } | ||
| readCursorsLong(cursors, blackhole); | ||
| } | ||
|
|
||
| @Benchmark | ||
|
|
@@ -288,11 +275,7 @@ public void timeFilterHalf(Blackhole blackhole) | |
| StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex); | ||
| Sequence<Cursor> cursors = makeCursors(sa, timeFilterHalf); | ||
|
|
||
| Sequence<List<Long>> longListSeq = readCursorsLong(cursors, blackhole); | ||
| List<Long> strings = longListSeq.limit(1).toList().get(0); | ||
| for (Long st : strings) { | ||
| blackhole.consume(st); | ||
| } | ||
| readCursorsLong(cursors, blackhole); | ||
| } | ||
|
|
||
| @Benchmark | ||
|
|
@@ -303,11 +286,7 @@ public void timeFilterAll(Blackhole blackhole) | |
| StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex); | ||
| Sequence<Cursor> cursors = makeCursors(sa, timeFilterAll); | ||
|
|
||
| Sequence<List<Long>> longListSeq = readCursorsLong(cursors, blackhole); | ||
| List<Long> strings = longListSeq.limit(1).toList().get(0); | ||
| for (Long st : strings) { | ||
| blackhole.consume(st); | ||
| } | ||
| readCursorsLong(cursors, blackhole); | ||
| } | ||
|
|
||
| @Benchmark | ||
|
|
@@ -319,12 +298,7 @@ public void readWithPreFilter(Blackhole blackhole) | |
|
|
||
| StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex); | ||
| Sequence<Cursor> cursors = makeCursors(sa, filter); | ||
|
|
||
| Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole); | ||
| List<String> strings = stringListSeq.limit(1).toList().get(0); | ||
| for (String st : strings) { | ||
| blackhole.consume(st); | ||
| } | ||
| readCursors(cursors, blackhole); | ||
| } | ||
|
|
||
| @Benchmark | ||
|
|
@@ -336,12 +310,7 @@ public void readWithPostFilter(Blackhole blackhole) | |
|
|
||
| StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex); | ||
| Sequence<Cursor> cursors = makeCursors(sa, filter); | ||
|
|
||
| Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole); | ||
| List<String> strings = stringListSeq.limit(1).toList().get(0); | ||
| for (String st : strings) { | ||
| blackhole.consume(st); | ||
| } | ||
| readCursors(cursors, blackhole); | ||
| } | ||
|
|
||
| @Benchmark | ||
|
|
@@ -353,12 +322,7 @@ public void readWithExFnPreFilter(Blackhole blackhole) | |
|
|
||
| StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex); | ||
| Sequence<Cursor> cursors = makeCursors(sa, filter); | ||
|
|
||
| Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole); | ||
| List<String> strings = stringListSeq.limit(1).toList().get(0); | ||
| for (String st : strings) { | ||
| blackhole.consume(st); | ||
| } | ||
| readCursors(cursors, blackhole); | ||
| } | ||
|
|
||
| @Benchmark | ||
|
|
@@ -370,12 +334,24 @@ public void readWithExFnPostFilter(Blackhole blackhole) | |
|
|
||
| StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex); | ||
| Sequence<Cursor> cursors = makeCursors(sa, filter); | ||
| readCursors(cursors, blackhole); | ||
| } | ||
|
|
||
| Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole); | ||
| List<String> strings = stringListSeq.limit(1).toList().get(0); | ||
| for (String st : strings) { | ||
| blackhole.consume(st); | ||
| } | ||
| @Benchmark | ||
| @BenchmarkMode(Mode.AverageTime) | ||
| @OutputTimeUnit(TimeUnit.MICROSECONDS) | ||
| public void readAndFilter(Blackhole blackhole) | ||
| { | ||
| Filter andFilter = new AndFilter( | ||
| ImmutableList.of( | ||
| new SelectorFilter("dimUniform", "199"), | ||
| new NoBitmapSelectorDimFilter("dimUniform", "super-199", JS_EXTRACTION_FN).toFilter() | ||
| ) | ||
| ); | ||
|
|
||
| StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex); | ||
| Sequence<Cursor> cursors = makeCursors(sa, andFilter); | ||
| readCursors(cursors, blackhole); | ||
| } | ||
|
|
||
| @Benchmark | ||
|
|
@@ -389,12 +365,7 @@ public void readOrFilter(Blackhole blackhole) | |
|
|
||
| StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex); | ||
| Sequence<Cursor> cursors = makeCursors(sa, orFilter); | ||
|
|
||
| Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole); | ||
| List<String> strings = stringListSeq.limit(1).toList().get(0); | ||
| for (String st : strings) { | ||
| blackhole.consume(st); | ||
| } | ||
| readCursors(cursors, blackhole); | ||
| } | ||
|
|
||
| @Benchmark | ||
|
|
@@ -408,12 +379,7 @@ public void readOrFilterCNF(Blackhole blackhole) | |
|
|
||
| StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex); | ||
| Sequence<Cursor> cursors = makeCursors(sa, Filters.convertToCNF(orFilter)); | ||
|
|
||
| Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole); | ||
| List<String> strings = stringListSeq.limit(1).toList().get(0); | ||
| for (String st : strings) { | ||
| blackhole.consume(st); | ||
| } | ||
| readCursors(cursors, blackhole); | ||
| } | ||
|
|
||
| @Benchmark | ||
|
|
@@ -450,12 +416,7 @@ public void readComplexOrFilter(Blackhole blackhole) | |
|
|
||
| StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex); | ||
| Sequence<Cursor> cursors = makeCursors(sa, dimFilter3.toFilter()); | ||
|
|
||
| Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole); | ||
| List<String> strings = stringListSeq.limit(1).toList().get(0); | ||
| for (String st : strings) { | ||
| blackhole.consume(st); | ||
| } | ||
| readCursors(cursors, blackhole); | ||
| } | ||
|
|
||
| @Benchmark | ||
|
|
@@ -492,68 +453,54 @@ public void readComplexOrFilterCNF(Blackhole blackhole) | |
|
|
||
| StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex); | ||
| Sequence<Cursor> cursors = makeCursors(sa, Filters.convertToCNF(dimFilter3.toFilter())); | ||
|
|
||
| Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole); | ||
| List<String> strings = stringListSeq.limit(1).toList().get(0); | ||
| for (String st : strings) { | ||
| blackhole.consume(st); | ||
| } | ||
| readCursors(cursors, blackhole); | ||
| } | ||
|
|
||
| private Sequence<Cursor> makeCursors(StorageAdapter sa, Filter filter) | ||
| { | ||
| return sa.makeCursors(filter, schemaInfo.getDataInterval(), VirtualColumns.EMPTY, Granularities.ALL, false, null); | ||
| } | ||
|
|
||
| private Sequence<List<String>> readCursors(Sequence<Cursor> cursors, final Blackhole blackhole) | ||
| private void readCursors(Sequence<Cursor> cursors, Blackhole blackhole) | ||
| { | ||
| return Sequences.map( | ||
| final Sequence<Void> voids = Sequences.map( | ||
| cursors, | ||
| new Function<Cursor, List<String>>() | ||
| { | ||
| @Override | ||
| public List<String> apply(Cursor input) | ||
| { | ||
| List<String> strings = new ArrayList<String>(); | ||
| List<DimensionSelector> selectors = new ArrayList<>(); | ||
| selectors.add( | ||
| input.getColumnSelectorFactory().makeDimensionSelector(new DefaultDimensionSpec("dimSequential", null)) | ||
| ); | ||
| //selectors.add(input.makeDimensionSelector(new DefaultDimensionSpec("dimB", null))); | ||
| while (!input.isDone()) { | ||
| for (DimensionSelector selector : selectors) { | ||
| IndexedInts row = selector.getRow(); | ||
| blackhole.consume(selector.lookupName(row.get(0))); | ||
| //strings.add(selector.lookupName(row.get(0))); | ||
| } | ||
| input.advance(); | ||
| input -> { | ||
| List<DimensionSelector> selectors = new ArrayList<>(); | ||
| selectors.add( | ||
| input.getColumnSelectorFactory().makeDimensionSelector(new DefaultDimensionSpec("dimSequential", null)) | ||
| ); | ||
| while (!input.isDone()) { | ||
| for (DimensionSelector selector : selectors) { | ||
| IndexedInts row = selector.getRow(); | ||
| blackhole.consume(selector.lookupName(row.get(0))); | ||
| } | ||
| return strings; | ||
| input.advance(); | ||
| } | ||
| return null; | ||
| } | ||
| ); | ||
|
|
||
| blackhole.consume(voids.toList()); | ||
| } | ||
|
|
||
| private Sequence<List<Long>> readCursorsLong(Sequence<Cursor> cursors, final Blackhole blackhole) | ||
| private void readCursorsLong(Sequence<Cursor> cursors, final Blackhole blackhole) | ||
| { | ||
| return Sequences.map( | ||
| final Sequence<Void> voids = Sequences.map( | ||
| cursors, | ||
| new Function<Cursor, List<Long>>() | ||
| { | ||
| @Override | ||
| public List<Long> apply(Cursor input) | ||
| { | ||
| List<Long> longvals = new ArrayList<Long>(); | ||
| BaseLongColumnValueSelector selector = input.getColumnSelectorFactory().makeColumnValueSelector("sumLongSequential"); | ||
| while (!input.isDone()) { | ||
| long rowval = selector.getLong(); | ||
| blackhole.consume(rowval); | ||
| input.advance(); | ||
| } | ||
| return longvals; | ||
| input -> { | ||
| BaseLongColumnValueSelector selector = input.getColumnSelectorFactory() | ||
| .makeColumnValueSelector("sumLongSequential"); | ||
| while (!input.isDone()) { | ||
| long rowval = selector.getLong(); | ||
| blackhole.consume(rowval); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I suggest using a real aggregator, like LongSum, to make the load more realistic.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you think it matters as long as the load is consistent between tests? I'd rather leave it using blackhole (especially since the string one can't switch to an aggregator anyway) |
||
| input.advance(); | ||
| } | ||
| return null; | ||
| } | ||
| ); | ||
|
|
||
| blackhole.consume(voids.toList()); | ||
| } | ||
|
|
||
| private static class NoBitmapSelectorFilter extends SelectorFilter | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -170,8 +170,11 @@ private static ValueMatcher makeDictionaryEncodedValueMatcherGeneric( | |
| Predicate<String> predicate | ||
| ) | ||
| { | ||
| final BitSet predicateMatchingValueIds = makePredicateMatchingSet(selector, predicate); | ||
| final BitSet checkedIds = new BitSet(selector.getValueCardinality()); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better to use one bitset (with two-bit "blocks") than two separate bitsets, for locality. Also, in the future it could be replaced with specialized class that fetches and writes two bits at a time, in order to make less ops in the hot loop.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried this transformation and it ended up slower. Timings after the change are below (compare to the numbers from the PR description). I'm not sure why. Is it possible that the JVM is so smart as to only compute things like bitIndex (inside the BitSets) once, in the case where there are two different BitSets and you get the same index from both in immediate succession? The transformed, slower code was: @Override
public ValueMatcher makeValueMatcher(final Predicate<String> predicate)
{
final BitSet matchingIds = new BitSet(getCardinality() * 2);
// Lazy matcher; only check an id if matches() is called.
return new ValueMatcher()
{
@Override
public boolean matches()
{
final int id = getRowValue();
final int checkBit = id * 2;
final int matchBit = checkBit + 1;
if (matchingIds.get(checkBit)) {
return matchingIds.get(matchBit);
} else {
final boolean matches = predicate.apply(lookupName(id));
matchingIds.set(checkBit);
if (matches) {
matchingIds.set(matchBit);
}
return matches;
}
}
@Override
public void inspectRuntimeShape(RuntimeShapeInspector inspector)
{
inspector.visit("column", SimpleDictionaryEncodedColumn.this);
}
};
}
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Based on this experiment, and the other, simpler one below (#5403 (comment)), I guess the JVM is that smart. So the original code is fastest, because it doesn't have as many branches and can also reuse computations internal to `BitSet`.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This would probably be fastest of all, but since the current code is not too bad in terms of overhead even on the worse case, I think the optimization is not needed for this patch. I suspect that in the real world, it's more common to get the good case (the lazy optimization is useful and saves time) rather than the bad case (lazy optimization is useless, so we just hope it doesn't add too much overhead).
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Could you verify this theory by looking at the assembly with `-prof perfasm`? It's important to understand clearly why some counter-intuitive results are observed.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @leventov do you have some tips on how to do this from within intellij on a Mac? I tried adding an annotation
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I always run (and never heard anybody does anything different) JMH with profilers from the command line. Running benchmarks from an IDE is not a good idea anyway (might it affect the results that you have obtained previously?)
If you are on Mac, you should use `-prof dtraceasm` (the Mac equivalent of `-prof perfasm`).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I use a JMH IDEA plugin: https://github.com/artyushov/idea-jmh-plugin. Although I have never read the readme until now, and apparently it can affect the results slightly (they say they observed up to a couple of %). What do you do to run the Druid JMH benchmarks? Do you know offhand what needs to be done to build them before they can be run? It'd be nice to add something to the Druid developer docs (https://github.com/druid-io/druid/blob/master/CONTRIBUTING.md) teaching people like me what to do. |
||
| final BitSet matchingIds = new BitSet(selector.getValueCardinality()); | ||
| final boolean matchNull = predicate.apply(null); | ||
|
|
||
| // Lazy matcher; only check an id if matches() is called. | ||
| return new ValueMatcher() | ||
| { | ||
| @Override | ||
|
|
@@ -184,7 +187,20 @@ public boolean matches() | |
| return matchNull; | ||
| } else { | ||
| for (int i = 0; i < size; ++i) { | ||
| if (predicateMatchingValueIds.get(row.get(i))) { | ||
| final int id = row.get(i); | ||
| final boolean matches; | ||
|
|
||
| if (checkedIds.get(id)) { | ||
| matches = matchingIds.get(id); | ||
| } else { | ||
| matches = predicate.apply(selector.lookupName(id)); | ||
| checkedIds.set(id); | ||
| if (matches) { | ||
| matchingIds.set(id); | ||
| } | ||
| } | ||
|
|
||
| if (matches) { | ||
| return true; | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -267,16 +267,27 @@ public void inspectRuntimeShape(RuntimeShapeInspector inspector) | |
| @Override | ||
| public ValueMatcher makeValueMatcher(final Predicate<String> predicate) | ||
| { | ||
| final BitSet predicateMatchingValueIds = DimensionSelectorUtils.makePredicateMatchingSet( | ||
| this, | ||
| predicate | ||
| ); | ||
| final BitSet checkedIds = new BitSet(getCardinality()); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same |
||
| final BitSet matchingIds = new BitSet(getCardinality()); | ||
|
|
||
| // Lazy matcher; only check an id if matches() is called. | ||
| return new ValueMatcher() | ||
| { | ||
| @Override | ||
| public boolean matches() | ||
| { | ||
| return predicateMatchingValueIds.get(getRowValue()); | ||
| final int id = getRowValue(); | ||
|
|
||
| if (checkedIds.get(id)) { | ||
| return matchingIds.get(id); | ||
| } else { | ||
| final boolean matches = predicate.apply(lookupName(id)); | ||
| checkedIds.set(id); | ||
| if (matches) { | ||
| matchingIds.set(id); | ||
| } | ||
| return matches; | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggested to use a real aggregator to make the load more realistic
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The input is a string so aggregators cannot run on it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, I guess "cardinality" could, but that seems like a bad load to test with since it's pretty heavy.