From 266ff6dc710a9e7a253cbbb2d013b3a3230c7dd1 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Wed, 17 Oct 2018 18:42:46 -0700 Subject: [PATCH 1/7] use a sha512 hash of bloom filter for cache key instead of filter bytes --- .../guice/BloomFilterSerializersModule.java | 24 ++++++- .../druid/query/filter/BloomDimFilter.java | 21 ++---- .../query/filter/BloomKFilterHolder.java | 60 ++++++++++++++++ .../query/filter/BloomDimFilterTest.java | 69 ++++++++++++++----- 4 files changed, 140 insertions(+), 34 deletions(-) create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java index 21af16d649f7..ad5a99f220a4 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java @@ -20,14 +20,15 @@ import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.DeserializationContext; import com.fasterxml.jackson.databind.SerializerProvider; import com.fasterxml.jackson.databind.deser.std.StdDeserializer; import com.fasterxml.jackson.databind.jsontype.NamedType; import com.fasterxml.jackson.databind.module.SimpleModule; import com.fasterxml.jackson.databind.ser.std.StdSerializer; +import com.google.common.hash.Hashing; import org.apache.druid.query.filter.BloomDimFilter; +import org.apache.druid.query.filter.BloomKFilterHolder; import org.apache.hive.common.util.BloomKFilter; import java.io.ByteArrayInputStream; @@ -45,6 +46,7 @@ public BloomFilterSerializersModule() ); addSerializer(BloomKFilter.class, new BloomKFilterSerializer()); addDeserializer(BloomKFilter.class, new BloomKFilterDeserializer()); + addDeserializer(BloomKFilterHolder.class, new BloomKFilterHolderDeserializer()); } public static class BloomKFilterSerializer extends StdSerializer @@ -78,11 +80,29 @@ protected BloomKFilterDeserializer() @Override public BloomKFilter deserialize( JsonParser jsonParser, DeserializationContext deserializationContext - ) throws IOException, JsonProcessingException + ) throws IOException { byte[] bytes = jsonParser.getBinaryValue(); return BloomKFilter.deserialize(new ByteArrayInputStream(bytes)); + } + } + + public static class BloomKFilterHolderDeserializer extends StdDeserializer + { + protected BloomKFilterHolderDeserializer() + { + super(BloomKFilterHolder.class); + } + @Override + public BloomKFilterHolder deserialize( + JsonParser jsonParser, DeserializationContext deserializationContext + ) throws IOException + { + byte[] bytes = jsonParser.getBinaryValue(); + BloomKFilter filter = BloomKFilter.deserialize(new ByteArrayInputStream(bytes)); + byte[] hash = Hashing.sha512().hashBytes(bytes).asBytes(); + return new BloomKFilterHolder(filter, hash); } } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java index bfaa37c8857a..814b1281aa66 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java @@ -25,15 +25,12 @@ import com.google.common.base.Predicate; import com.google.common.collect.RangeSet; import com.google.common.collect.Sets; -import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.query.cache.CacheKeyBuilder; import org.apache.druid.query.extraction.ExtractionFn; import org.apache.druid.segment.filter.DimensionPredicateFilter; import org.apache.hive.common.util.BloomKFilter; -import java.io.ByteArrayOutputStream; -import java.io.IOException; import java.util.HashSet; /** @@ -43,39 +40,33 @@ public class BloomDimFilter implements DimFilter private final String dimension; private final BloomKFilter bloomKFilter; + private final byte[] hash; private final ExtractionFn extractionFn; @JsonCreator public BloomDimFilter( @JsonProperty("dimension") String dimension, - @JsonProperty("bloomKFilter") BloomKFilter bloomKFilter, + @JsonProperty("bloomKFilter") BloomKFilterHolder bloomKFilterHolder, @JsonProperty("extractionFn") ExtractionFn extractionFn ) { Preconditions.checkArgument(dimension != null, "dimension must not be null"); - Preconditions.checkNotNull(bloomKFilter); + Preconditions.checkNotNull(bloomKFilterHolder); this.dimension = dimension; - this.bloomKFilter = bloomKFilter; + this.bloomKFilter = bloomKFilterHolder.getFilter(); + this.hash = bloomKFilterHolder.getFilterHash(); this.extractionFn = extractionFn; } @Override public byte[] getCacheKey() { - ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); - try { - BloomKFilter.serialize(byteArrayOutputStream, bloomKFilter); - } - catch (IOException e) { - throw new ISE(e, "Exception when generating cache key for [%s]", this); - } - byte[] bloomFilterBytes = byteArrayOutputStream.toByteArray(); return new CacheKeyBuilder(DimFilterUtils.BLOOM_DIM_FILTER_CACHE_ID) .appendString(dimension) .appendByte(DimFilterUtils.STRING_SEPARATOR) .appendByteArray(extractionFn == null ? new byte[0] : extractionFn.getCacheKey()) .appendByte(DimFilterUtils.STRING_SEPARATOR) - .appendByteArray(bloomFilterBytes) + .appendByteArray(hash) .build(); } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java new file mode 100644 index 000000000000..5dae67d7f9b3 --- /dev/null +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.filter; + +import org.apache.hive.common.util.BloomKFilter; + +import java.util.Arrays; + +public class BloomKFilterHolder +{ + private BloomKFilter filter; + private byte[] hash; + + public BloomKFilterHolder(BloomKFilter filter, byte[] hash) + { + this.filter = filter; + this.hash = hash; + } + + BloomKFilter getFilter() + { + return filter; + } + + byte[] getFilterHash() + { + return hash; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + BloomKFilterHolder that = (BloomKFilterHolder) o; + return Arrays.equals(this.hash, that.hash); + } +} diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java index 181235a34778..bd2cf45b205f 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java @@ -22,6 +22,7 @@ import com.google.common.base.Function; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.hash.Hashing; import org.apache.druid.common.config.NullHandling; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.impl.DimensionsSpec; @@ -49,6 +50,8 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; +import javax.annotation.Nonnull; +import java.io.ByteArrayOutputStream; import java.io.Closeable; import java.io.IOException; import java.util.List; @@ -130,9 +133,10 @@ public void testSerde() throws IOException { BloomKFilter bloomFilter = new BloomKFilter(1500); bloomFilter.addString("myTestString"); + BloomKFilterHolder holder = new BloomKFilterHolder(bloomFilter, null); BloomDimFilter bloomDimFilter = new BloomDimFilter( "abc", - bloomFilter, + holder, new TimeDimExtractionFn("yyyy-MM-dd", "yyyy-MM", true) ); DimFilter filter = mapper.readValue(mapper.writeValueAsBytes(bloomDimFilter), DimFilter.class); @@ -145,7 +149,7 @@ public void testSerde() throws IOException } @Test - public void testWithTimeExtractionFnNull() + public void testWithTimeExtractionFnNull() throws IOException { assertFilterMatches(new BloomDimFilter( "dim0", @@ -170,7 +174,7 @@ public void testWithTimeExtractionFnNull() } @Test - public void testSingleValueStringColumnWithoutNulls() + public void testSingleValueStringColumnWithoutNulls() throws IOException { assertFilterMatches(new BloomDimFilter("dim0", bloomKFilter(1000, (String) null), null), ImmutableList.of()); assertFilterMatches(new BloomDimFilter("dim0", bloomKFilter(1000, ""), null), ImmutableList.of()); @@ -179,7 +183,7 @@ public void testSingleValueStringColumnWithoutNulls() } @Test - public void testSingleValueStringColumnWithNulls() + public void testSingleValueStringColumnWithNulls() throws IOException { if (NullHandling.replaceWithDefault()) { assertFilterMatches(new BloomDimFilter("dim1", bloomKFilter(1000, (String) null), null), ImmutableList.of("0")); @@ -196,7 +200,7 @@ public void testSingleValueStringColumnWithNulls() } @Test - public void testMultiValueStringColumn() + public void testMultiValueStringColumn() throws IOException { if (NullHandling.replaceWithDefault()) { assertFilterMatches( @@ -217,7 +221,7 @@ public void testMultiValueStringColumn() } @Test - public void testMissingColumnSpecifiedInDimensionList() + public void testMissingColumnSpecifiedInDimensionList() throws IOException { assertFilterMatches( new BloomDimFilter("dim3", bloomKFilter(1000, (String) null), null), @@ -230,7 +234,7 @@ public void testMissingColumnSpecifiedInDimensionList() } @Test - public void testMissingColumnNotSpecifiedInDimensionList() + public void testMissingColumnNotSpecifiedInDimensionList() throws IOException { assertFilterMatches( new BloomDimFilter("dim4", bloomKFilter(1000, (String) null), null), @@ -243,7 +247,7 @@ public void testMissingColumnNotSpecifiedInDimensionList() } @Test - public void testExpressionVirtualColumn() + public void testExpressionVirtualColumn() throws IOException { assertFilterMatches( new BloomDimFilter("expr", bloomKFilter(1000, 1.1F), null), @@ -263,7 +267,7 @@ public void testExpressionVirtualColumn() } @Test - public void testSelectorWithLookupExtractionFn() + public void testSelectorWithLookupExtractionFn() throws IOException { final Map stringMap = ImmutableMap.of( "1", "HELLO", @@ -334,7 +338,27 @@ public void testSelectorWithLookupExtractionFn() } } - private static BloomKFilter bloomKFilter(int expectedEntries, String... values) + @Test + public void testCacheKeyIsNotGiantIfFilterIsGiant() throws IOException + { + BloomKFilter bloomFilter = new BloomKFilter(10_000_000); + // FILL IT UP! + bloomFilter.addString("myTestString"); + + BloomKFilterHolder holder = getBloomKFilterHolder(bloomFilter); + + BloomDimFilter bloomDimFilter = new BloomDimFilter( + "abc", + holder, + new TimeDimExtractionFn("yyyy-MM-dd", "yyyy-MM", true) + ); + + // actual size is 86 bytes instead of 7794075 bytes + final int actualSize = bloomDimFilter.getCacheKey().length; + Assert.assertTrue(actualSize < 100); + } + + private static BloomKFilterHolder bloomKFilter(int expectedEntries, String... values) throws IOException { BloomKFilter filter = new BloomKFilter(expectedEntries); for (String value : values) { @@ -344,10 +368,21 @@ private static BloomKFilter bloomKFilter(int expectedEntries, String... values) filter.addString(value); } } - return filter; + + return getBloomKFilterHolder(filter); + } + + @Nonnull + private static BloomKFilterHolder getBloomKFilterHolder(BloomKFilter filter) throws IOException + { + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + BloomKFilter.serialize(byteArrayOutputStream, filter); + byte[] bytes = byteArrayOutputStream.toByteArray(); + + return new BloomKFilterHolder(filter, Hashing.sha512().hashBytes(bytes).asBytes()); } - private static BloomKFilter bloomKFilter(int expectedEntries, Float... values) + private static BloomKFilterHolder bloomKFilter(int expectedEntries, Float... values) throws IOException { BloomKFilter filter = new BloomKFilter(expectedEntries); for (Float value : values) { @@ -357,10 +392,10 @@ private static BloomKFilter bloomKFilter(int expectedEntries, Float... values) filter.addFloat(value); } } - return filter; + return getBloomKFilterHolder(filter); } - private static BloomKFilter bloomKFilter(int expectedEntries, Double... values) + private static BloomKFilterHolder bloomKFilter(int expectedEntries, Double... values) throws IOException { BloomKFilter filter = new BloomKFilter(expectedEntries); for (Double value : values) { @@ -370,10 +405,10 @@ private static BloomKFilter bloomKFilter(int expectedEntries, Double... values) filter.addDouble(value); } } - return filter; + return getBloomKFilterHolder(filter); } - private static BloomKFilter bloomKFilter(int expectedEntries, Long... values) + private static BloomKFilterHolder bloomKFilter(int expectedEntries, Long... values) throws IOException { BloomKFilter filter = new BloomKFilter(expectedEntries); for (Long value : values) { @@ -383,6 +418,6 @@ private static BloomKFilter bloomKFilter(int expectedEntries, Long... values) filter.addLong(value); } } - return filter; + return getBloomKFilterHolder(filter); } } From e290af36b61472974bef695f68bedab3987f6db6 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 19 Oct 2018 16:44:01 -0700 Subject: [PATCH 2/7] make serde private, BloomDimFilter.toString and BloomDimFilter.equals use hash instead of bloomKFilter which has no tostring or equals of its own --- .../druid/guice/BloomFilterSerializersModule.java | 12 ++++++------ .../apache/druid/query/filter/BloomDimFilter.java | 8 +++++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java index ad5a99f220a4..64816cc4ebbb 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java @@ -49,10 +49,10 @@ public BloomFilterSerializersModule() addDeserializer(BloomKFilterHolder.class, new BloomKFilterHolderDeserializer()); } - public static class BloomKFilterSerializer extends StdSerializer + private static class BloomKFilterSerializer extends StdSerializer { - public BloomKFilterSerializer() + BloomKFilterSerializer() { super(BloomKFilter.class); } @@ -69,10 +69,10 @@ public void serialize( } } - public static class BloomKFilterDeserializer extends StdDeserializer + private static class BloomKFilterDeserializer extends StdDeserializer { - protected BloomKFilterDeserializer() + BloomKFilterDeserializer() { super(BloomKFilter.class); } @@ -87,9 +87,9 @@ public BloomKFilter deserialize( } } - public static class BloomKFilterHolderDeserializer extends StdDeserializer + private static class BloomKFilterHolderDeserializer extends StdDeserializer { - protected BloomKFilterHolderDeserializer() + BloomKFilterHolderDeserializer() { super(BloomKFilterHolder.class); } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java index 814b1281aa66..cfde28e1c7c6 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java @@ -25,12 +25,14 @@ import com.google.common.base.Predicate; import com.google.common.collect.RangeSet; import com.google.common.collect.Sets; +import org.apache.commons.codec.binary.Base64; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.query.cache.CacheKeyBuilder; import org.apache.druid.query.extraction.ExtractionFn; import org.apache.druid.segment.filter.DimensionPredicateFilter; import org.apache.hive.common.util.BloomKFilter; +import java.util.Arrays; import java.util.HashSet; /** @@ -178,9 +180,9 @@ public ExtractionFn getExtractionFn() public String toString() { if (extractionFn != null) { - return StringUtils.format("%s(%s) = %s", extractionFn, dimension, bloomKFilter); + return StringUtils.format("%s(%s) = %s", extractionFn, dimension, Base64.encodeBase64String(hash)); } else { - return StringUtils.format("%s = %s", dimension, bloomKFilter); + return StringUtils.format("%s = %s", dimension, Base64.encodeBase64String(hash)); } } @@ -199,7 +201,7 @@ public boolean equals(Object o) if (!dimension.equals(that.dimension)) { return false; } - if (bloomKFilter != null ? !bloomKFilter.equals(that.bloomKFilter) : that.bloomKFilter != null) { + if (hash != null ? !Arrays.equals(hash, that.hash) : that.hash != null) { return false; } return extractionFn != null ? extractionFn.equals(that.extractionFn) : that.extractionFn == null; From 1abd3c6daae511429560c893849f0852aa8a0b10 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 19 Oct 2018 16:55:27 -0700 Subject: [PATCH 3/7] keep and use HashCode object instead of converting to bytes up front --- .../druid/guice/BloomFilterSerializersModule.java | 3 ++- .../apache/druid/query/filter/BloomDimFilter.java | 11 ++++++----- .../druid/query/filter/BloomKFilterHolder.java | 12 +++++++----- .../druid/query/filter/BloomDimFilterTest.java | 2 +- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java index 64816cc4ebbb..9b437c23db8e 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java @@ -26,6 +26,7 @@ import com.fasterxml.jackson.databind.jsontype.NamedType; import com.fasterxml.jackson.databind.module.SimpleModule; import com.fasterxml.jackson.databind.ser.std.StdSerializer; +import com.google.common.hash.HashCode; import com.google.common.hash.Hashing; import org.apache.druid.query.filter.BloomDimFilter; import org.apache.druid.query.filter.BloomKFilterHolder; @@ -101,7 +102,7 @@ public BloomKFilterHolder deserialize( { byte[] bytes = jsonParser.getBinaryValue(); BloomKFilter filter = BloomKFilter.deserialize(new ByteArrayInputStream(bytes)); - byte[] hash = Hashing.sha512().hashBytes(bytes).asBytes(); + HashCode hash = Hashing.sha512().hashBytes(bytes); return new BloomKFilterHolder(filter, hash); } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java index cfde28e1c7c6..7628c9477e8c 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java @@ -25,6 +25,7 @@ import com.google.common.base.Predicate; import com.google.common.collect.RangeSet; import com.google.common.collect.Sets; +import com.google.common.hash.HashCode; import org.apache.commons.codec.binary.Base64; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.query.cache.CacheKeyBuilder; @@ -42,7 +43,7 @@ public class BloomDimFilter implements DimFilter private final String dimension; private final BloomKFilter bloomKFilter; - private final byte[] hash; + private final HashCode hash; private final ExtractionFn extractionFn; @JsonCreator @@ -68,7 +69,7 @@ public byte[] getCacheKey() .appendByte(DimFilterUtils.STRING_SEPARATOR) .appendByteArray(extractionFn == null ? new byte[0] : extractionFn.getCacheKey()) .appendByte(DimFilterUtils.STRING_SEPARATOR) - .appendByteArray(hash) + .appendByteArray(hash.asBytes()) .build(); } @@ -180,9 +181,9 @@ public ExtractionFn getExtractionFn() public String toString() { if (extractionFn != null) { - return StringUtils.format("%s(%s) = %s", extractionFn, dimension, Base64.encodeBase64String(hash)); + return StringUtils.format("%s(%s) = %s", extractionFn, dimension, hash.toString()); } else { - return StringUtils.format("%s = %s", dimension, Base64.encodeBase64String(hash)); + return StringUtils.format("%s = %s", dimension, hash.toString()); } } @@ -201,7 +202,7 @@ public boolean equals(Object o) if (!dimension.equals(that.dimension)) { return false; } - if (hash != null ? !Arrays.equals(hash, that.hash) : that.hash != null) { + if (hash != null ? !hash.equals(that.hash) : that.hash != null) { return false; } return extractionFn != null ? extractionFn.equals(that.extractionFn) : that.extractionFn == null; diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java index 5dae67d7f9b3..517bb214d576 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java @@ -19,16 +19,18 @@ package org.apache.druid.query.filter; +import com.google.common.hash.HashCode; import org.apache.hive.common.util.BloomKFilter; import java.util.Arrays; +import java.util.Objects; public class BloomKFilterHolder { - private BloomKFilter filter; - private byte[] hash; + private final BloomKFilter filter; + private final HashCode hash; - public BloomKFilterHolder(BloomKFilter filter, byte[] hash) + public BloomKFilterHolder(BloomKFilter filter, HashCode hash) { this.filter = filter; this.hash = hash; @@ -39,7 +41,7 @@ BloomKFilter getFilter() return filter; } - byte[] getFilterHash() + HashCode getFilterHash() { return hash; } @@ -55,6 +57,6 @@ public boolean equals(Object o) } BloomKFilterHolder that = (BloomKFilterHolder) o; - return Arrays.equals(this.hash, that.hash); + return Objects.equals(this.hash, that.hash); } } diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java index bd2cf45b205f..7cd69150a9d0 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java @@ -379,7 +379,7 @@ private static BloomKFilterHolder getBloomKFilterHolder(BloomKFilter filter) thr BloomKFilter.serialize(byteArrayOutputStream, filter); byte[] bytes = byteArrayOutputStream.toByteArray(); - return new BloomKFilterHolder(filter, Hashing.sha512().hashBytes(bytes).asBytes()); + return new BloomKFilterHolder(filter, Hashing.sha512().hashBytes(bytes)); } private static BloomKFilterHolder bloomKFilter(int expectedEntries, Float... values) throws IOException From 9c5ab3db804900d76c93cf1d43ce4001121484f5 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 19 Oct 2018 16:56:12 -0700 Subject: [PATCH 4/7] uneeded imports oops --- .../main/java/org/apache/druid/query/filter/BloomDimFilter.java | 2 -- .../java/org/apache/druid/query/filter/BloomKFilterHolder.java | 1 - 2 files changed, 3 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java index 7628c9477e8c..911387d970ba 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java @@ -26,14 +26,12 @@ import com.google.common.collect.RangeSet; import com.google.common.collect.Sets; import com.google.common.hash.HashCode; -import org.apache.commons.codec.binary.Base64; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.query.cache.CacheKeyBuilder; import org.apache.druid.query.extraction.ExtractionFn; import org.apache.druid.segment.filter.DimensionPredicateFilter; import org.apache.hive.common.util.BloomKFilter; -import java.util.Arrays; import java.util.HashSet; /** diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java index 517bb214d576..5fc2246cffc8 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java @@ -22,7 +22,6 @@ import com.google.common.hash.HashCode; import org.apache.hive.common.util.BloomKFilter; -import java.util.Arrays; import java.util.Objects; public class BloomKFilterHolder From 779972d6f3ebfde68dfc5cb9bd62563997e6b7d5 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Sun, 21 Oct 2018 01:13:18 -0700 Subject: [PATCH 5/7] tweaks from review --- .../org/apache/druid/query/filter/BloomDimFilter.java | 2 +- .../apache/druid/query/filter/BloomKFilterHolder.java | 6 ++++++ .../apache/druid/query/filter/BloomDimFilterTest.java | 9 ++++++++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java index 911387d970ba..383488cc41ac 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomDimFilter.java @@ -222,7 +222,7 @@ public HashSet getRequiredColumns() public int hashCode() { int result = dimension.hashCode(); - result = 31 * result + (bloomKFilter != null ? bloomKFilter.hashCode() : 0); + result = 31 * result + (hash != null ? hash.hashCode() : 0); result = 31 * result + (extractionFn != null ? extractionFn.hashCode() : 0); return result; } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java index 5fc2246cffc8..5d540a1b5dc3 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java @@ -58,4 +58,10 @@ public boolean equals(Object o) BloomKFilterHolder that = (BloomKFilterHolder) o; return Objects.equals(this.hash, that.hash); } + + @Override + public int hashCode() + { + return Objects.hash(filter, hash); + } } diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java index 7cd69150a9d0..0cd83bfc80ff 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java @@ -353,7 +353,14 @@ public void testCacheKeyIsNotGiantIfFilterIsGiant() throws IOException new TimeDimExtractionFn("yyyy-MM-dd", "yyyy-MM", true) ); - // actual size is 86 bytes instead of 7794075 bytes + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + BloomKFilter.serialize(byteArrayOutputStream, bloomFilter); + byte[] bloomFilterBytes = byteArrayOutputStream.toByteArray(); + + // serialized filter can be quite large for high capacity bloom filters... + Assert.assertTrue(bloomFilterBytes.length > 7794000); + + // actual size is 86 bytes instead of 7794075 bytes of old key format final int actualSize = bloomDimFilter.getCacheKey().length; Assert.assertTrue(actualSize < 100); } From ca04a6f27fc5c7cb69854b97a3ca416a43ccd592 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Sun, 21 Oct 2018 01:18:51 -0700 Subject: [PATCH 6/7] refactor dupe code --- .../query/filter/BloomDimFilterTest.java | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java index 0cd83bfc80ff..c83cb9bf5bcd 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java @@ -93,6 +93,8 @@ public class BloomDimFilterTest extends BaseFilterTest PARSER.parseBatch(ImmutableMap.of("dim0", "5", "dim1", "abc")).get(0) ); + private static DefaultObjectMapper mapper = new DefaultObjectMapper(); + public BloomDimFilterTest( String testName, IndexBuilder indexBuilder, @@ -114,8 +116,6 @@ public BloomDimFilterTest( ); } - private static DefaultObjectMapper mapper = new DefaultObjectMapper(); - @BeforeClass public static void beforeClass() { @@ -353,9 +353,7 @@ public void testCacheKeyIsNotGiantIfFilterIsGiant() throws IOException new TimeDimExtractionFn("yyyy-MM-dd", "yyyy-MM", true) ); - ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); - BloomKFilter.serialize(byteArrayOutputStream, bloomFilter); - byte[] bloomFilterBytes = byteArrayOutputStream.toByteArray(); + byte[] bloomFilterBytes = getFilterBytes(bloomFilter); // serialized filter can be quite large for high capacity bloom filters... Assert.assertTrue(bloomFilterBytes.length > 7794000); @@ -379,16 +377,6 @@ private static BloomKFilterHolder bloomKFilter(int expectedEntries, String... va return getBloomKFilterHolder(filter); } - @Nonnull - private static BloomKFilterHolder getBloomKFilterHolder(BloomKFilter filter) throws IOException - { - ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); - BloomKFilter.serialize(byteArrayOutputStream, filter); - byte[] bytes = byteArrayOutputStream.toByteArray(); - - return new BloomKFilterHolder(filter, Hashing.sha512().hashBytes(bytes)); - } - private static BloomKFilterHolder bloomKFilter(int expectedEntries, Float... values) throws IOException { BloomKFilter filter = new BloomKFilter(expectedEntries); @@ -427,4 +415,19 @@ private static BloomKFilterHolder bloomKFilter(int expectedEntries, Long... valu } return getBloomKFilterHolder(filter); } + + @Nonnull + private static BloomKFilterHolder getBloomKFilterHolder(BloomKFilter filter) throws IOException + { + byte[] bytes = getFilterBytes(filter); + + return new BloomKFilterHolder(filter, Hashing.sha512().hashBytes(bytes)); + } + + private static byte[] getFilterBytes(BloomKFilter filter) throws IOException + { + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + BloomKFilter.serialize(byteArrayOutputStream, filter); + return byteArrayOutputStream.toByteArray(); + } } From 5a4d10e45cb367be3a3e194503af6d3d8d944d8b Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Mon, 22 Oct 2018 01:07:43 -0700 Subject: [PATCH 7/7] refactor --- .../guice/BloomFilterSerializersModule.java | 27 +++++++++-------- .../query/filter/BloomKFilterHolder.java | 18 +++++++++++ .../query/filter/BloomDimFilterTest.java | 30 ++++--------------- 3 files changed, 39 insertions(+), 36 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java index 9b437c23db8e..011e091d3dd1 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java @@ -26,8 +26,6 @@ import com.fasterxml.jackson.databind.jsontype.NamedType; import com.fasterxml.jackson.databind.module.SimpleModule; import com.fasterxml.jackson.databind.ser.std.StdSerializer; -import com.google.common.hash.HashCode; -import com.google.common.hash.Hashing; import org.apache.druid.query.filter.BloomDimFilter; import org.apache.druid.query.filter.BloomKFilterHolder; import org.apache.hive.common.util.BloomKFilter; @@ -63,10 +61,7 @@ public void serialize( BloomKFilter bloomKFilter, JsonGenerator jsonGenerator, SerializerProvider serializerProvider ) throws IOException { - ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); - BloomKFilter.serialize(byteArrayOutputStream, bloomKFilter); - byte[] bytes = byteArrayOutputStream.toByteArray(); - jsonGenerator.writeBinary(bytes); + jsonGenerator.writeBinary(bloomKFilterToBytes(bloomKFilter)); } } @@ -83,8 +78,7 @@ public BloomKFilter deserialize( JsonParser jsonParser, DeserializationContext deserializationContext ) throws IOException { - byte[] bytes = jsonParser.getBinaryValue(); - return BloomKFilter.deserialize(new ByteArrayInputStream(bytes)); + return bloomKFilterFromBytes(jsonParser.getBinaryValue()); } } @@ -100,10 +94,19 @@ public BloomKFilterHolder deserialize( JsonParser jsonParser, DeserializationContext deserializationContext ) throws IOException { - byte[] bytes = jsonParser.getBinaryValue(); - BloomKFilter filter = BloomKFilter.deserialize(new ByteArrayInputStream(bytes)); - HashCode hash = Hashing.sha512().hashBytes(bytes); - return new BloomKFilterHolder(filter, hash); + return BloomKFilterHolder.fromBytes(jsonParser.getBinaryValue()); } } + + public static byte[] bloomKFilterToBytes(BloomKFilter bloomKFilter) throws IOException + { + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + BloomKFilter.serialize(byteArrayOutputStream, bloomKFilter); + return byteArrayOutputStream.toByteArray(); + } + + public static BloomKFilter bloomKFilterFromBytes(byte[] bytes) throws IOException + { + return BloomKFilter.deserialize(new ByteArrayInputStream(bytes)); + } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java index 5d540a1b5dc3..e06632f0a60c 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilterHolder.java @@ -20,8 +20,11 @@ package org.apache.druid.query.filter; import com.google.common.hash.HashCode; +import com.google.common.hash.Hashing; +import org.apache.druid.guice.BloomFilterSerializersModule; import org.apache.hive.common.util.BloomKFilter; +import java.io.IOException; import java.util.Objects; public class BloomKFilterHolder @@ -45,6 +48,21 @@ HashCode getFilterHash() return hash; } + public static BloomKFilterHolder fromBloomKFilter(BloomKFilter filter) throws IOException + { + byte[] bytes = BloomFilterSerializersModule.bloomKFilterToBytes(filter); + + return new BloomKFilterHolder(filter, Hashing.sha512().hashBytes(bytes)); + } + + public static BloomKFilterHolder fromBytes(byte[] bytes) throws IOException + { + return new BloomKFilterHolder( + BloomFilterSerializersModule.bloomKFilterFromBytes(bytes), + Hashing.sha512().hashBytes(bytes) + ); + } + @Override public boolean equals(Object o) { diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java index c83cb9bf5bcd..81a272f5f95a 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomDimFilterTest.java @@ -22,7 +22,6 @@ import com.google.common.base.Function; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; -import com.google.common.hash.Hashing; import org.apache.druid.common.config.NullHandling; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.impl.DimensionsSpec; @@ -50,8 +49,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import javax.annotation.Nonnull; -import java.io.ByteArrayOutputStream; import java.io.Closeable; import java.io.IOException; import java.util.List; @@ -345,7 +342,7 @@ public void testCacheKeyIsNotGiantIfFilterIsGiant() throws IOException // FILL IT UP! bloomFilter.addString("myTestString"); - BloomKFilterHolder holder = getBloomKFilterHolder(bloomFilter); + BloomKFilterHolder holder = BloomKFilterHolder.fromBloomKFilter(bloomFilter); BloomDimFilter bloomDimFilter = new BloomDimFilter( "abc", @@ -353,7 +350,7 @@ public void testCacheKeyIsNotGiantIfFilterIsGiant() throws IOException new TimeDimExtractionFn("yyyy-MM-dd", "yyyy-MM", true) ); - byte[] bloomFilterBytes = getFilterBytes(bloomFilter); + byte[] bloomFilterBytes = BloomFilterSerializersModule.bloomKFilterToBytes(bloomFilter); // serialized filter can be quite large for high capacity bloom filters... Assert.assertTrue(bloomFilterBytes.length > 7794000); @@ -374,7 +371,7 @@ private static BloomKFilterHolder bloomKFilter(int expectedEntries, String... va } } - return getBloomKFilterHolder(filter); + return BloomKFilterHolder.fromBloomKFilter(filter); } private static BloomKFilterHolder bloomKFilter(int expectedEntries, Float... values) throws IOException @@ -387,7 +384,7 @@ private static BloomKFilterHolder bloomKFilter(int expectedEntries, Float... val filter.addFloat(value); } } - return getBloomKFilterHolder(filter); + return BloomKFilterHolder.fromBloomKFilter(filter); } private static BloomKFilterHolder bloomKFilter(int expectedEntries, Double... values) throws IOException @@ -400,7 +397,7 @@ private static BloomKFilterHolder bloomKFilter(int expectedEntries, Double... va filter.addDouble(value); } } - return getBloomKFilterHolder(filter); + return BloomKFilterHolder.fromBloomKFilter(filter); } private static BloomKFilterHolder bloomKFilter(int expectedEntries, Long... values) throws IOException @@ -413,21 +410,6 @@ private static BloomKFilterHolder bloomKFilter(int expectedEntries, Long... valu filter.addLong(value); } } - return getBloomKFilterHolder(filter); - } - - @Nonnull - private static BloomKFilterHolder getBloomKFilterHolder(BloomKFilter filter) throws IOException - { - byte[] bytes = getFilterBytes(filter); - - return new BloomKFilterHolder(filter, Hashing.sha512().hashBytes(bytes)); - } - - private static byte[] getFilterBytes(BloomKFilter filter) throws IOException - { - ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); - BloomKFilter.serialize(byteArrayOutputStream, filter); - return byteArrayOutputStream.toByteArray(); + return BloomKFilterHolder.fromBloomKFilter(filter); } }