Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
package io.druid.benchmark;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.io.Files;
import io.druid.benchmark.datagen.BenchmarkDataGenerator;
import io.druid.benchmark.datagen.BenchmarkSchemaInfo;
Expand Down Expand Up @@ -242,12 +242,7 @@ public void stringRead(Blackhole blackhole)
{
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, null);

Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole);
List<String> strings = stringListSeq.limit(1).toList().get(0);
for (String st : strings) {
blackhole.consume(st);
}
readCursors(cursors, blackhole);
}

@Benchmark
Expand All @@ -258,11 +253,7 @@ public void longRead(Blackhole blackhole)
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, null);

Sequence<List<Long>> longListSeq = readCursorsLong(cursors, blackhole);
List<Long> strings = longListSeq.limit(1).toList().get(0);
for (Long st : strings) {
blackhole.consume(st);
}
readCursorsLong(cursors, blackhole);
}

@Benchmark
Expand All @@ -273,11 +264,7 @@ public void timeFilterNone(Blackhole blackhole)
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, timeFilterNone);

Sequence<List<Long>> longListSeq = readCursorsLong(cursors, blackhole);
List<Long> strings = longListSeq.limit(1).toList().get(0);
for (Long st : strings) {
blackhole.consume(st);
}
readCursorsLong(cursors, blackhole);
}

@Benchmark
Expand All @@ -288,11 +275,7 @@ public void timeFilterHalf(Blackhole blackhole)
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, timeFilterHalf);

Sequence<List<Long>> longListSeq = readCursorsLong(cursors, blackhole);
List<Long> strings = longListSeq.limit(1).toList().get(0);
for (Long st : strings) {
blackhole.consume(st);
}
readCursorsLong(cursors, blackhole);
}

@Benchmark
Expand All @@ -303,11 +286,7 @@ public void timeFilterAll(Blackhole blackhole)
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, timeFilterAll);

Sequence<List<Long>> longListSeq = readCursorsLong(cursors, blackhole);
List<Long> strings = longListSeq.limit(1).toList().get(0);
for (Long st : strings) {
blackhole.consume(st);
}
readCursorsLong(cursors, blackhole);
}

@Benchmark
Expand All @@ -319,12 +298,7 @@ public void readWithPreFilter(Blackhole blackhole)

StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, filter);

Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole);
List<String> strings = stringListSeq.limit(1).toList().get(0);
for (String st : strings) {
blackhole.consume(st);
}
readCursors(cursors, blackhole);
}

@Benchmark
Expand All @@ -336,12 +310,7 @@ public void readWithPostFilter(Blackhole blackhole)

StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, filter);

Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole);
List<String> strings = stringListSeq.limit(1).toList().get(0);
for (String st : strings) {
blackhole.consume(st);
}
readCursors(cursors, blackhole);
}

@Benchmark
Expand All @@ -353,12 +322,7 @@ public void readWithExFnPreFilter(Blackhole blackhole)

StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, filter);

Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole);
List<String> strings = stringListSeq.limit(1).toList().get(0);
for (String st : strings) {
blackhole.consume(st);
}
readCursors(cursors, blackhole);
}

@Benchmark
Expand All @@ -370,12 +334,24 @@ public void readWithExFnPostFilter(Blackhole blackhole)

StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, filter);
readCursors(cursors, blackhole);
}

Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole);
List<String> strings = stringListSeq.limit(1).toList().get(0);
for (String st : strings) {
blackhole.consume(st);
}
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
// Benchmarks reading through an AND filter whose two legs target the same column
// ("dimUniform"): a plain SelectorFilter and a NoBitmapSelectorDimFilter with a
// JS extraction fn. NOTE(review): presumably the first leg can use bitmap indexes
// while the "NoBitmap" leg must be evaluated row-by-row — confirm against the
// filter implementations.
public void readAndFilter(Blackhole blackhole)
{
Filter andFilter = new AndFilter(
ImmutableList.of(
new SelectorFilter("dimUniform", "199"),
new NoBitmapSelectorDimFilter("dimUniform", "super-199", JS_EXTRACTION_FN).toFilter()
)
);

StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, andFilter);
// readCursors drains every cursor row into the JMH blackhole so the read
// cannot be dead-code-eliminated.
readCursors(cursors, blackhole);
}

@Benchmark
Expand All @@ -389,12 +365,7 @@ public void readOrFilter(Blackhole blackhole)

StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, orFilter);

Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole);
List<String> strings = stringListSeq.limit(1).toList().get(0);
for (String st : strings) {
blackhole.consume(st);
}
readCursors(cursors, blackhole);
}

@Benchmark
Expand All @@ -408,12 +379,7 @@ public void readOrFilterCNF(Blackhole blackhole)

StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, Filters.convertToCNF(orFilter));

Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole);
List<String> strings = stringListSeq.limit(1).toList().get(0);
for (String st : strings) {
blackhole.consume(st);
}
readCursors(cursors, blackhole);
}

@Benchmark
Expand Down Expand Up @@ -450,12 +416,7 @@ public void readComplexOrFilter(Blackhole blackhole)

StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, dimFilter3.toFilter());

Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole);
List<String> strings = stringListSeq.limit(1).toList().get(0);
for (String st : strings) {
blackhole.consume(st);
}
readCursors(cursors, blackhole);
}

@Benchmark
Expand Down Expand Up @@ -492,68 +453,54 @@ public void readComplexOrFilterCNF(Blackhole blackhole)

StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, Filters.convertToCNF(dimFilter3.toFilter()));

Sequence<List<String>> stringListSeq = readCursors(cursors, blackhole);
List<String> strings = stringListSeq.limit(1).toList().get(0);
for (String st : strings) {
blackhole.consume(st);
}
readCursors(cursors, blackhole);
}

// Builds cursors over the benchmark schema's full data interval with the given
// filter (null means unfiltered), no virtual columns, and ALL granularity.
// NOTE(review): the trailing `false, null` args presumably mean non-descending
// order and no query metrics — confirm against StorageAdapter.makeCursors.
private Sequence<Cursor> makeCursors(StorageAdapter sa, Filter filter)
{
return sa.makeCursors(filter, schemaInfo.getDataInterval(), VirtualColumns.EMPTY, Granularities.ALL, false, null);
}

private Sequence<List<String>> readCursors(Sequence<Cursor> cursors, final Blackhole blackhole)
private void readCursors(Sequence<Cursor> cursors, Blackhole blackhole)
{
return Sequences.map(
final Sequence<Void> voids = Sequences.map(
cursors,
new Function<Cursor, List<String>>()
{
@Override
public List<String> apply(Cursor input)
{
List<String> strings = new ArrayList<String>();
List<DimensionSelector> selectors = new ArrayList<>();
selectors.add(
input.getColumnSelectorFactory().makeDimensionSelector(new DefaultDimensionSpec("dimSequential", null))
);
//selectors.add(input.makeDimensionSelector(new DefaultDimensionSpec("dimB", null)));
while (!input.isDone()) {
for (DimensionSelector selector : selectors) {
IndexedInts row = selector.getRow();
blackhole.consume(selector.lookupName(row.get(0)));
//strings.add(selector.lookupName(row.get(0)));
}
input.advance();
input -> {
List<DimensionSelector> selectors = new ArrayList<>();
selectors.add(
input.getColumnSelectorFactory().makeDimensionSelector(new DefaultDimensionSpec("dimSequential", null))
);
while (!input.isDone()) {
for (DimensionSelector selector : selectors) {
IndexedInts row = selector.getRow();
blackhole.consume(selector.lookupName(row.get(0)));
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested to use a real aggregator to make the load more realistic

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The input is a string so aggregators cannot run on it.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, I guess "cardinality" could, but that seems like a bad load to test with since it's pretty heavy.

}
return strings;
input.advance();
}
return null;
}
);

blackhole.consume(voids.toList());
}

private Sequence<List<Long>> readCursorsLong(Sequence<Cursor> cursors, final Blackhole blackhole)
private void readCursorsLong(Sequence<Cursor> cursors, final Blackhole blackhole)
{
return Sequences.map(
final Sequence<Void> voids = Sequences.map(
cursors,
new Function<Cursor, List<Long>>()
{
@Override
public List<Long> apply(Cursor input)
{
List<Long> longvals = new ArrayList<Long>();
BaseLongColumnValueSelector selector = input.getColumnSelectorFactory().makeColumnValueSelector("sumLongSequential");
while (!input.isDone()) {
long rowval = selector.getLong();
blackhole.consume(rowval);
input.advance();
}
return longvals;
input -> {
BaseLongColumnValueSelector selector = input.getColumnSelectorFactory()
.makeColumnValueSelector("sumLongSequential");
while (!input.isDone()) {
long rowval = selector.getLong();
blackhole.consume(rowval);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest to use some real aggregator, like LongSum, to make the load more realistic

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think it matters as long as the load is consistent between tests? I'd rather leave it using blackhole (especially since the string one can't switch to an aggregator anyway)

input.advance();
}
return null;
}
);

blackhole.consume(voids.toList());
}

private static class NoBitmapSelectorFilter extends SelectorFilter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,11 @@ private static ValueMatcher makeDictionaryEncodedValueMatcherGeneric(
Predicate<String> predicate
)
{
final BitSet predicateMatchingValueIds = makePredicateMatchingSet(selector, predicate);
final BitSet checkedIds = new BitSet(selector.getValueCardinality());
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to use one bitset (with two-bit "blocks") than two separate bitsets, for locality. Also, in the future it could be replaced with a specialized class that fetches and writes two bits at a time, in order to perform fewer ops in the hot loop.

Copy link
Copy Markdown
Contributor Author

@gianm gianm Feb 21, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried this transformation and it ended up slower. Timings after the change are below (compare to the numbers from the PR description). I'm not sure why. Is it possible that the JVM is so smart as to only compute things like bitIndex (inside the BitSets) once, in the case where there are two different BitSets and you get the same index from both in immediate succession?

Benchmark                                        (rowsPerSegment)  (schema)  Mode  Cnt    Score       Error  Units
FilterPartitionBenchmark.readWithExFnPostFilter            750000     basic  avgt   25  10690.447 ± 222.691  us/op
FilterPartitionBenchmark.readAndFilter                     750000     basic  avgt   25  305.222   ±   5.011  us/op

The transformed, slower code was:

        @Override
        public ValueMatcher makeValueMatcher(final Predicate<String> predicate)
        {
          final BitSet matchingIds = new BitSet(getCardinality() * 2);

          // Lazy matcher; only check an id if matches() is called.
          return new ValueMatcher()
          {
            @Override
            public boolean matches()
            {
              final int id = getRowValue();
              final int checkBit = id * 2;
              final int matchBit = checkBit + 1;

              if (matchingIds.get(checkBit)) {
                return matchingIds.get(matchBit);
              } else {
                final boolean matches = predicate.apply(lookupName(id));
                matchingIds.set(checkBit);
                if (matches) {
                  matchingIds.set(matchBit);
                }
                return matches;
              }
            }

            @Override
            public void inspectRuntimeShape(RuntimeShapeInspector inspector)
            {
              inspector.visit("column", SimpleDictionaryEncodedColumn.this);
            }
          };
        }

Copy link
Copy Markdown
Contributor Author

@gianm gianm Feb 21, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on this experiment, and the other, simpler one below (#5403 (comment)), I guess the JVM is that smart. So the original code is fastest, because it doesn't have as many branches and can also reuse computations internal to BitSet.get.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

specialized class that fetches and writes two bits at a time, in order to make less ops in the hot loop.

This would probably be fastest of all, but since the current code is not too bad in terms of overhead even on the worse case, I think the optimization is not needed for this patch. I suspect that in the real world, it's more common to get the good case (the lazy optimization is useful and saves time) rather than the bad case (lazy optimization is useless, so we just hope it doesn't add too much overhead).

Copy link
Copy Markdown
Member

@leventov leventov Feb 21, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible that the JVM is so smart as to only compute things like bitIndex (inside the BitSets) once, in the case where there are two different BitSets and you get the same index from both in immediate succession?

Could you verify this theory by looking at the assembly with -prof perfasm (dtraceasm on Mac)?

It's important to understand clearly why some counter intuitive results are observed.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @leventov do you have some tips on how to do this from within intellij on a Mac? I tried adding an annotation @Fork(value = 1, jvmArgs = "-prof dtraceasm") but I got a bunch of errors like this. I am hoping this is something you have run into in the past and may know how to fix.

Exception in thread "main" java.lang.NoClassDefFoundError: java/lang/invoke/LambdaForm$MH
	at com.sun.demo.jvmti.hprof.Tracker.nativeCallSite(Native Method)
	at com.sun.demo.jvmti.hprof.Tracker.CallSite(Tracker.java:99)
	at java.lang.invoke.InvokerBytecodeGenerator.emitNewArray(InvokerBytecodeGenerator.java:889)
	at java.lang.invoke.InvokerBytecodeGenerator.generateCustomizedCodeBytes(InvokerBytecodeGenerator.java:688)
	at java.lang.invoke.InvokerBytecodeGenerator.generateCustomizedCode(InvokerBytecodeGenerator.java:618)
	at java.lang.invoke.LambdaForm.compileToBytecode(LambdaForm.java:654)
	at java.lang.invoke.LambdaForm.prepare(LambdaForm.java:635)
	at java.lang.invoke.MethodHandle.<init>(MethodHandle.java:461)
	at java.lang.invoke.BoundMethodHandle.<init>(BoundMethodHandle.java:58)
	at java.lang.invoke.BoundMethodHandle$Species_L.<init>(BoundMethodHandle.java:211)
	at java.lang.invoke.BoundMethodHandle$Species_L.copyWith(BoundMethodHandle.java:228)
	at java.lang.invoke.MethodHandle.asCollector(MethodHandle.java:1002)
	at java.lang.invoke.MethodHandleImpl$AsVarargsCollector.<init>(MethodHandleImpl.java:460)
	at java.lang.invoke.MethodHandleImpl$AsVarargsCollector.<init>(MethodHandleImpl.java:454)
	at java.lang.invoke.MethodHandleImpl.makeVarargsCollector(MethodHandleImpl.java:445)
	at java.lang.invoke.MethodHandle.setVarargs(MethodHandle.java:1325)
	at java.lang.invoke.MethodHandles$Lookup.getDirectMethodCommon(MethodHandles.java:1665)
	at java.lang.invoke.MethodHandles$Lookup.getDirectMethod(MethodHandles.java:1600)
	at java.lang.invoke.MethodHandles$Lookup.findStatic(MethodHandles.java:779)
	at java.lang.invoke.MethodHandleImpl$Lazy.<clinit>(MethodHandleImpl.java:627)
	at java.lang.invoke.MethodHandleImpl.varargsArray(MethodHandleImpl.java:1506)
	at java.lang.invoke.MethodHandleImpl.varargsArray(MethodHandleImpl.java:1623)
	at java.lang.invoke.MethodHandle.asCollector(MethodHandle.java:999)
	at java.lang.invoke.MethodHandleImpl$AsVarargsCollector.<init>(MethodHandleImpl.java:460)
	at java.lang.invoke.MethodHandleImpl$AsVarargsCollector.<init>(MethodHandleImpl.java:454)
	at java.lang.invoke.MethodHandleImpl.makeVarargsCollector(MethodHandleImpl.java:445)
	at java.lang.invoke.MethodHandle.setVarargs(MethodHandle.java:1325)
	at java.lang.invoke.MethodHandles$Lookup.getDirectMethodCommon(MethodHandles.java:1665)
	at java.lang.invoke.MethodHandles$Lookup.getDirectMethod(MethodHandles.java:1600)
	at java.lang.invoke.MethodHandles$Lookup.findStatic(MethodHandles.java:779)
	at java.lang.invoke.CallSite.<clinit>(CallSite.java:226)
	at java.lang.invoke.MethodHandleNatives.linkCallSiteImpl(MethodHandleNatives.java:307)
	at java.lang.invoke.MethodHandleNatives.linkCallSite(MethodHandleNatives.java:297)
	at java.io.ObjectInputStream.<clinit>(ObjectInputStream.java:3578)
	at org.openjdk.jmh.runner.link.BinaryLinkClient.<init>(BinaryLinkClient.java:74)
	at org.openjdk.jmh.runner.ForkedMain.main(ForkedMain.java:72)
HPROF ERROR: Unexpected Exception found afterward [hprof_util.c:494]
HPROF TERMINATED PROCESS

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I always run (and never heard anybody does anything different) JMH with profilers from the command line. Running benchmarks from an IDE is not a good idea anyway (might it affect the results that you have obtained previously?)

-prof perfasm is not a JVM arg, it's an arg of the JMH Main. Try java -jar target/benchmarks -prof list first.

If you are on Mac, you should use dtraceasm. You should also disable "System integrity protection" in order to be able to do this: http://osxdaily.com/2015/10/05/disable-rootless-system-integrity-protection-mac-os-x/

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I use a JMH IDEA plugin: https://github.com/artyushov/idea-jmh-plugin. Although I have never read the readme until now, and apparently it can affect the results slightly (they say they observed up to a couple of %).

What do you do to run the Druid JMH benchmarks? Do you know offhand what needs to be done to build them before they can be run? It'd be nice to add something to the Druid developer docs (https://github.com/druid-io/druid/blob/master/CONTRIBUTING.md) teaching people like me what to do.

final BitSet matchingIds = new BitSet(selector.getValueCardinality());
final boolean matchNull = predicate.apply(null);

// Lazy matcher; only check an id if matches() is called.
return new ValueMatcher()
{
@Override
Expand All @@ -184,7 +187,20 @@ public boolean matches()
return matchNull;
} else {
for (int i = 0; i < size; ++i) {
if (predicateMatchingValueIds.get(row.get(i))) {
final int id = row.get(i);
final boolean matches;

if (checkedIds.get(id)) {
matches = matchingIds.get(id);
} else {
matches = predicate.apply(selector.lookupName(id));
checkedIds.set(id);
if (matches) {
matchingIds.set(id);
}
}

if (matches) {
return true;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,16 +267,27 @@ public void inspectRuntimeShape(RuntimeShapeInspector inspector)
@Override
public ValueMatcher makeValueMatcher(final Predicate<String> predicate)
{
final BitSet predicateMatchingValueIds = DimensionSelectorUtils.makePredicateMatchingSet(
this,
predicate
);
final BitSet checkedIds = new BitSet(getCardinality());
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same

final BitSet matchingIds = new BitSet(getCardinality());

// Lazy matcher; only check an id if matches() is called.
return new ValueMatcher()
{
@Override
public boolean matches()
{
return predicateMatchingValueIds.get(getRowValue());
final int id = getRowValue();

if (checkedIds.get(id)) {
return matchingIds.get(id);
} else {
final boolean matches = predicate.apply(lookupName(id));
checkedIds.set(id);
if (matches) {
matchingIds.set(id);
}
return matches;
}
}

@Override
Expand Down