From 91fdebdfc7885cbe434fcf04ccf72791a76633b0 Mon Sep 17 00:00:00 2001 From: Charles Allen Date: Tue, 15 Jan 2019 14:40:52 -0800 Subject: [PATCH 1/4] Densify swapped hll buffer --- .../druid/hll/HyperLogLogCollector.java | 7 ++ .../druid/hll/HyperLogLogCollectorTest.java | 86 ++++++++++++++++++- 2 files changed, 91 insertions(+), 2 deletions(-) diff --git a/hll/src/main/java/org/apache/druid/hll/HyperLogLogCollector.java b/hll/src/main/java/org/apache/druid/hll/HyperLogLogCollector.java index 5fd7df778deb..82912146625d 100644 --- a/hll/src/main/java/org/apache/druid/hll/HyperLogLogCollector.java +++ b/hll/src/main/java/org/apache/druid/hll/HyperLogLogCollector.java @@ -387,6 +387,13 @@ public HyperLogLogCollector fold(@Nullable HyperLogLogCollector other) storageBuffer.duplicate().put(other.storageBuffer.asReadOnlyBuffer()); + if (other.storageBuffer.remaining() != other.getNumBytesForDenseStorage()) { + // The other buffer was sparse, densify it + final int newLImit = storageBuffer.position() + other.storageBuffer.remaining(); + storageBuffer.limit(newLImit); + convertToDenseStorage(); + } + other = HyperLogLogCollector.makeCollector(tmpBuffer); } diff --git a/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java b/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java index 64c19158f468..e45806ecdcd6 100644 --- a/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java +++ b/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java @@ -22,6 +22,7 @@ import com.google.common.collect.Collections2; import com.google.common.collect.Lists; import com.google.common.hash.HashFunction; +import com.google.common.hash.Hasher; import com.google.common.hash.Hashing; import org.apache.commons.codec.binary.Base64; import org.apache.druid.java.util.common.StringUtils; @@ -31,11 +32,13 @@ import org.junit.Test; import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.security.MessageDigest; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Random; +import java.util.function.Predicate; import java.util.concurrent.ThreadLocalRandom; /** @@ -503,7 +506,8 @@ private short computeNumNonZero(byte theByte) return retVal; } - @Ignore @Test // This test can help when finding potential combinations that are weird, but it's non-deterministic + @Ignore + @Test // This test can help when finding potential combinations that are weird, but it's non-deterministic public void testFoldingwithDifferentOffsets() { // final Random random = new Random(37); // this seed will cause this test to fail because of slightly larger errors @@ -534,7 +538,8 @@ public void testFoldingwithDifferentOffsets() } } - @Ignore @Test + @Ignore + @Test public void testFoldingwithDifferentOffsets2() throws Exception { final Random random = new Random(0); @@ -708,6 +713,83 @@ public void testMaxOverflow() Assert.assertEquals(0, collector.getNumNonZeroRegisters()); } + @Test + public void testRegisterSwapWithSparse() + { + final HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); + // Skip the first bucket + for (int i = 1; i < HyperLogLogCollector.NUM_BUCKETS; i++) { + collector.add((short) i, (byte) 1); + Assert.assertEquals(i, collector.getNumNonZeroRegisters()); + Assert.assertEquals(0, collector.getRegisterOffset()); + } + Assert.assertEquals( + 15615.219683654448D, + HyperLogLogCollector.makeCollector(collector.toByteBuffer().asReadOnlyBuffer()) + .estimateCardinality(), + 1e-5D + ); + + final byte[] hash = new byte[10]; + hash[0] = 1; // Bucket 0, 1 offset of 0 + collector.add(hash); + Assert.assertEquals(0, collector.getNumNonZeroRegisters()); + Assert.assertEquals(1, collector.getRegisterOffset()); + + // We have a REALLY bad distribution, Sketch as 0 is fine. + Assert.assertEquals( + 0.0D, + HyperLogLogCollector.makeCollector(collector.toByteBuffer().asReadOnlyBuffer()) + .estimateCardinality(), + 1e-5D + ); + final ByteBuffer buffer = collector.toByteBuffer(); + Assert.assertEquals(collector.getNumHeaderBytes(), buffer.remaining()); + + final HyperLogLogCollector denseCollector = HyperLogLogCollector.makeLatestCollector(); + for (int i = 0; i < HyperLogLogCollector.NUM_BUCKETS - 1; i++) { + denseCollector.add((short) i, (byte) 1); + } + + Assert.assertEquals(HyperLogLogCollector.NUM_BUCKETS - 1, denseCollector.getNumNonZeroRegisters()); + final HyperLogLogCollector folded = denseCollector.fold(HyperLogLogCollector.makeCollector(buffer)); + Assert.assertNotNull(folded.toByteBuffer()); + Assert.assertEquals(folded.getStorageBuffer().remaining(), denseCollector.getNumBytesForDenseStorage()); + } + + // Example of a terrible sampling filter + @Test + public void testCanFillUpOnMod() + { + final HashFunction fn = Hashing.murmur3_128(); + final HyperLogLogCollector hyperLogLogCollector = HyperLogLogCollector.makeLatestCollector(); + final byte[] b = new byte[10]; + b[0] = 1; + hyperLogLogCollector.add(b); + final Random random = new Random(347893248701078L); + long loops = 0; + // Do a 1% "sample" where the mod of the hash is 43 + final Predicate pass = i -> { + // ByteOrder.nativeOrder() on lots of systems is ByteOrder.LITTLE_ENDIAN + final ByteBuffer bb = ByteBuffer.wrap(fn.hashInt(i).asBytes()).order(ByteOrder.LITTLE_ENDIAN); + return (bb.getInt() % 100) == 43; + }; + final long loopLimit = 1_000_000L; + do { + final int rnd = random.nextInt(); + if (!pass.test(rnd)) { + continue; + } + final Hasher hasher = fn.newHasher(); + hasher.putInt(rnd); + hyperLogLogCollector.add(hasher.hash().asBytes()); + loops++; + } while (hyperLogLogCollector.getNumNonZeroRegisters() > 0 && loops < loopLimit); + Assert.assertNotEquals(loopLimit, loops); + Assert.assertEquals(hyperLogLogCollector.getNumHeaderBytes(), hyperLogLogCollector.toByteBuffer().remaining()); + log.info("Filled up registers after %s random numbers", loops); + } + @Test public void testMergeMaxOverflow() { From 95e7806ffc2ce8c17f4992eca9181723ffcbda6e Mon Sep 17 00:00:00 2001 From: Charles Allen Date: Tue, 15 Jan 2019 15:08:37 -0800 Subject: [PATCH 2/4] Make test loop limit pre-increment --- .../java/org/apache/druid/hll/HyperLogLogCollectorTest.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java b/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java index e45806ecdcd6..30dc4fdca8a2 100644 --- a/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java +++ b/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java @@ -774,7 +774,7 @@ public void testCanFillUpOnMod() final ByteBuffer bb = ByteBuffer.wrap(fn.hashInt(i).asBytes()).order(ByteOrder.LITTLE_ENDIAN); return (bb.getInt() % 100) == 43; }; - final long loopLimit = 1_000_000L; + final long loopLimit = 1_000_000_000L; do { final int rnd = random.nextInt(); if (!pass.test(rnd)) { @@ -783,8 +783,7 @@ public void testCanFillUpOnMod() final Hasher hasher = fn.newHasher(); hasher.putInt(rnd); hyperLogLogCollector.add(hasher.hash().asBytes()); - loops++; - } while (hyperLogLogCollector.getNumNonZeroRegisters() > 0 && loops < loopLimit); + } while (hyperLogLogCollector.getNumNonZeroRegisters() > 0 && ++loops < loopLimit); Assert.assertNotEquals(loopLimit, loops); Assert.assertEquals(hyperLogLogCollector.getNumHeaderBytes(), hyperLogLogCollector.toByteBuffer().remaining()); log.info("Filled up registers after %s random numbers", loops); From 211ecb84597b35b0c0cc223db9a666814b5035c2 Mon Sep 17 00:00:00 2001 From: Charles Allen Date: Tue, 15 Jan 2019 15:12:36 -0800 Subject: [PATCH 3/4] Reformat --- .../druid/hll/HyperLogLogCollectorTest.java | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java b/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java index 30dc4fdca8a2..0677a0d41402 100644 --- a/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java +++ b/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java @@ -38,10 +38,11 @@ import java.util.Collection; import java.util.List; import java.util.Random; -import java.util.function.Predicate; import java.util.concurrent.ThreadLocalRandom; +import java.util.function.Predicate; /** + * */ public class HyperLogLogCollectorTest { @@ -49,6 +50,18 @@ public class HyperLogLogCollectorTest private final HashFunction fn = Hashing.murmur3_128(); + private static void fillBuckets(HyperLogLogCollector collector, byte startOffset, byte endOffset) + { + byte offset = startOffset; + while (offset <= endOffset) { + // fill buckets to shift registerOffset + for (short bucket = 0; bucket < 2048; ++bucket) { + collector.add(bucket, offset); + } + offset++; + } + } + @Test public void testFolding() { @@ -82,14 +95,13 @@ public void testFolding() } } - /** * This is a very long running test, disabled by default. * It is meant to catch issues when combining a large numer of HLL objects. - * + *

* It compares adding all the values to one HLL vs. * splitting up values into HLLs of 100 values each, and folding those HLLs into a single main HLL. - * + *

* When reaching very large cardinalities (>> 50,000,000), offsets are mismatched between the main HLL and the ones * with 100 values, requiring a floating max as described in * http://druid.io/blog/2014/02/18/hyperloglog-optimizations-for-real-world-systems.html @@ -818,19 +830,6 @@ public void testMergeMaxOverflow() Assert.assertEquals(67, collector.getMaxOverflowValue()); } - - private static void fillBuckets(HyperLogLogCollector collector, byte startOffset, byte endOffset) - { - byte offset = startOffset; - while (offset <= endOffset) { - // fill buckets to shift registerOffset - for (short bucket = 0; bucket < 2048; ++bucket) { - collector.add(bucket, offset); - } - offset++; - } - } - @Test public void testFoldOrder() { From 92e2db0f6203ec98807e518942847a617ede00b7 Mon Sep 17 00:00:00 2001 From: Charles Allen Date: Wed, 6 Mar 2019 12:26:17 -0800 Subject: [PATCH 4/4] Fix test comments --- .../org/apache/druid/hll/HyperLogLogCollectorTest.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java b/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java index c3384ff075a6..3727717d1c65 100644 --- a/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java +++ b/hll/src/test/java/org/apache/druid/hll/HyperLogLogCollectorTest.java @@ -97,10 +97,10 @@ public void testFolding() /** * This is a very long running test, disabled by default. * It is meant to catch issues when combining a large numer of HLL objects. - *

+ * * It compares adding all the values to one HLL vs. * splitting up values into HLLs of 100 values each, and folding those HLLs into a single main HLL. - *

+ * * When reaching very large cardinalities (>> 50,000,000), offsets are mismatched between the main HLL and the ones * with 100 values, requiring a floating max as described in * http://druid.io/blog/2014/02/18/hyperloglog-optimizations-for-real-world-systems.html @@ -768,7 +768,7 @@ public void testRegisterSwapWithSparse() Assert.assertEquals(folded.getStorageBuffer().remaining(), denseCollector.getNumBytesForDenseStorage()); } - // Example of a terrible sampling filter + // Example of a terrible sampling filter. Don't use this method @Test public void testCanFillUpOnMod() { @@ -797,7 +797,6 @@ public void testCanFillUpOnMod() } while (hyperLogLogCollector.getNumNonZeroRegisters() > 0 && ++loops < loopLimit); Assert.assertNotEquals(loopLimit, loops); Assert.assertEquals(hyperLogLogCollector.getNumHeaderBytes(), hyperLogLogCollector.toByteBuffer().remaining()); - log.info("Filled up registers after %s random numbers", loops); } @Test