From 7dbc75e6b78f3f0f656e955dd0e1e879f4e9e71d Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Sat, 22 Sep 2018 15:38:22 -0700 Subject: [PATCH 01/36] blooming aggs --- .../extensions-core/bloom-filter.md | 74 +- .../guice/BloomFilterSerializersModule.java | 6 +- .../bloom/BloomFilterAggregator.java | 79 +++ .../bloom/BloomFilterAggregatorFactory.java | 233 +++++++ .../bloom/BloomFilterBufferAggregator.java | 127 ++++ .../bloom/BloomFilterMergeAggregator.java | 82 +++ .../BloomFilterMergeAggregatorFactory.java | 85 +++ .../BloomFilterMergeBufferAggregator.java | 128 ++++ ...ilterAggregatorColumnSelectorStrategy.java | 31 + ...gregatorColumnSelectorStrategyFactory.java | 50 ++ ...ilterAggregatorColumnSelectorStrategy.java | 36 + ...ilterAggregatorColumnSelectorStrategy.java | 36 + ...AggregatorColumnValueSelectorStrategy.java | 36 + ...ilterAggregatorColumnSelectorStrategy.java | 49 ++ .../bloom/BloomFilterAggregatorTest.java | 630 ++++++++++++++++++ .../query/aggregation/AggregatorUtil.java | 3 + 16 files changed, 1675 insertions(+), 10 deletions(-) create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java create mode 100644 
extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategyFactory.java create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java create mode 100644 extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java diff --git a/docs/content/development/extensions-core/bloom-filter.md b/docs/content/development/extensions-core/bloom-filter.md index d01835d0e034..304606580e08 100644 --- a/docs/content/development/extensions-core/bloom-filter.md +++ b/docs/content/development/extensions-core/bloom-filter.md @@ -24,22 +24,29 @@ title: "Bloom Filter" # Bloom Filter -Make sure to [include](../../operations/including-extensions.html) `druid-bloom-filter` as an extension. +This extension adds the ability to both construct bloom filters from query results, and filter query results by testing +against a bloom filter. Make sure to [include](../../operations/including-extensions.html) `druid-bloom-filter` as an +extension. -BloomFilter is a probabilistic data structure for set membership check. 
-Following are some characterstics of BloomFilter +A BloomFilter is a probabilistic data structure for set membership check. +Following are some characteristics of BloomFilter - BloomFilters are highly space efficient when compared to using a HashSet. -- Because of the probabilistic nature of bloom filter false positive (element not present in bloom filter but test() says true) are possible -- false negatives are not possible (if element is present then test() will never say false). -- The false positive probability is configurable (default: 5%) depending on which storage requirement may increase or decrease. +- Because of the probabilistic nature of bloom filter false positive results are possible (e.g. element was not actually +present in bloom filter construction, but `test()` says true) +- False negatives are not possible (if element is present then `test()` will never say false). +- The false positive probability is configurable (default: 5%) depending on which storage requirement may increase or + decrease. - Lower the false positive probability greater is the space requirement. - Bloom filters are sensitive to number of elements that will be inserted in the bloom filter. -- During the creation of bloom filter expected number of entries must be specified.If the number of insertions exceed the specified initial number of entries then false positive probability will increase accordingly. +- During the creation of bloom filter expected number of entries must be specified. If the number of insertions exceeds + the specified initial number of entries then false positive probability will increase accordingly. -Internally, this implementation of bloom filter uses Murmur3 fast non-cryptographic hash algorithm. +This extension is built on top of `org.apache.hive.common.util.BloomKFilter`. Internally, this implementation of bloom +filter uses Murmur3 fast non-cryptographic hash algorithm. 
-### JSON Representation of Bloom Filter +## Filtering queries with a Bloom Filter +### JSON Specification of Bloom Filter ```json { "type" : "bloom", @@ -73,3 +80,52 @@ SELECT COUNT(*) FROM druid.foo WHERE bloom_filter_test(, ', + "maxNumEntries": + "field": + } +``` + +|Property |Description |required? | +|-------------------------|------------------------------|----------------------------------| +|`type` |Aggregator Type. Should always be `bloom`|yes| +|`name` |Output field name |yes| +|`field` |[DimensionSpec](./../dimensionspecs.html) to add to `org.apache.hive.common.util.BloomKFilter` | yes | +|`maxNumEntries` |Maximum number of distinct values supported by `org.apache.hive.common.util.BloomKFilter`, default `1500`| no | + +### Example +```json +{ + "queryType": "timeseries", + "dataSource": "wikiticker", + "intervals": [ "2015-09-12T00:00:00.000/2015-09-13T00:00:00.000" ], + "granularity": "day", + "aggregations": [ + { + "type": "bloom", + "name": "userBloom", + "maxNumEntries": 100000, + "field": { + "type":"default", + "dimension":"user", + "outputType": "STRING" + } + } + ] +} +``` + +response +```json +[{"timestamp":"2015-09-12T00:00:00.000Z","result":{"userBloom":"BAAAJhAAAA..."}}] +``` + +These values can then be set in the filter specification above. 
\ No newline at end of file diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java index 5162abb1d059..2bd5007ac44e 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java @@ -27,6 +27,7 @@ import com.fasterxml.jackson.databind.jsontype.NamedType; import com.fasterxml.jackson.databind.module.SimpleModule; import com.fasterxml.jackson.databind.ser.std.StdSerializer; +import org.apache.druid.query.aggregation.bloom.BloomFilterAggregatorFactory; import org.apache.druid.query.filter.BloomDimFilter; import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.query.filter.BloomKFilterHolder; @@ -41,7 +42,10 @@ public class BloomFilterSerializersModule extends SimpleModule public BloomFilterSerializersModule() { - registerSubtypes(new NamedType(BloomDimFilter.class, BLOOM_FILTER_TYPE_NAME)); + registerSubtypes( + new NamedType(BloomDimFilter.class, BLOOM_FILTER_TYPE_NAME), + new NamedType(BloomFilterAggregatorFactory.class, BLOOM_FILTER_TYPE_NAME) + ); addSerializer(BloomKFilter.class, new BloomKFilterSerializer()); addDeserializer(BloomKFilter.class, new BloomKFilterDeserializer()); addDeserializer(BloomKFilterHolder.class, new BloomKFilterHolderDeserializer()); diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java new file mode 100644 index 000000000000..a0d67850ba80 --- /dev/null +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java @@ -0,0 +1,79 @@ +/* + 
* Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.query.aggregation.bloom;
+
+import org.apache.druid.query.ColumnSelectorPlus;
+import org.apache.druid.query.aggregation.Aggregator;
+import org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategy;
+import org.apache.hive.common.util.BloomKFilter;
+
+import javax.annotation.Nullable;
+
+/**
+ * On-heap {@link Aggregator} that builds a single {@link BloomKFilter} from the values of one column.
+ *
+ * The filter is created once in the constructor; every {@link #aggregate()} call hands the column
+ * selector and the filter to the column-type specific strategy, which adds the current value(s).
+ */
+public class BloomFilterAggregator implements Aggregator
+{
+  // The strategy type argument is required so that add(...) is visible on getColumnSelectorStrategy().
+  private final ColumnSelectorPlus<BloomFilterAggregatorColumnSelectorStrategy> selectorPlus;
+  private final BloomKFilter collector;
+
+  /**
+   * @param selectorPlus  column selector paired with the strategy that knows how to add its values to a filter
+   * @param maxNumEntries value passed to the {@link BloomKFilter} constructor to size the filter
+   */
+  public BloomFilterAggregator(
+      ColumnSelectorPlus<BloomFilterAggregatorColumnSelectorStrategy> selectorPlus,
+      int maxNumEntries
+  )
+  {
+    this.selectorPlus = selectorPlus;
+    this.collector = new BloomKFilter(maxNumEntries);
+  }
+
+  /**
+   * Adds whatever the selector currently points at to the filter, via the column strategy.
+   */
+  @Override
+  public void aggregate()
+  {
+    selectorPlus.getColumnSelectorStrategy().add(selectorPlus.getSelector(), collector);
+  }
+
+  /**
+   * @return the live filter instance held by this aggregator (not a copy)
+   */
+  @Nullable
+  @Override
+  public Object get()
+  {
+    return collector;
+  }
+
+  @Override
+  public float getFloat()
+  {
+    throw new UnsupportedOperationException("BloomFilterAggregator does not support getFloat()");
+  }
+
+  @Override
+  public long getLong()
+  {
+    throw new UnsupportedOperationException("BloomFilterAggregator does not support getLong()");
+  }
+
+  @Override
+  public double getDouble()
+  {
+    throw new UnsupportedOperationException("BloomFilterAggregator does not support getDouble()");
+  }
+
+  @Override
+  public void close()
+  {
+    // nothing to release, the filter lives on heap
+  }
+}
diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java
new file mode 100644
index 000000000000..de5dcb94ff49
--- /dev/null
+++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java
@@ -0,0 +1,233 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.query.aggregation.bloom;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.apache.commons.codec.binary.Base64;
+import org.apache.druid.io.ByteBufferInputStream;
+import org.apache.druid.java.util.common.StringUtils;
+import org.apache.druid.query.ColumnSelectorPlus;
+import org.apache.druid.query.aggregation.Aggregator;
+import org.apache.druid.query.aggregation.AggregatorFactory;
+import org.apache.druid.query.aggregation.AggregatorUtil;
+import org.apache.druid.query.aggregation.BufferAggregator;
+import org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategy;
+import org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategyFactory;
+import org.apache.druid.query.cache.CacheKeyBuilder;
+import org.apache.druid.query.dimension.DimensionSpec;
+import org.apache.druid.segment.ColumnSelectorFactory;
+import org.apache.druid.segment.DimensionHandlerUtils;
+import org.apache.hive.common.util.BloomKFilter;
+
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Aggregator factory for the {@code bloom} aggregator: builds a {@link BloomKFilter} from the values of the
+ * column described by {@link #getField()}.
+ *
+ * JSON properties are {@code name}, {@code field} and the optional {@code maxNumEntries}.
+ */
+public class BloomFilterAggregatorFactory extends AggregatorFactory
+{
+  protected static final BloomFilterAggregatorColumnSelectorStrategyFactory STRATEGY_FACTORY =
+      new BloomFilterAggregatorColumnSelectorStrategyFactory();
+
+  /**
+   * Value used for {@code maxNumEntries} when the JSON spec does not provide one.
+   */
+  private static final int DEFAULT_NUM_ENTRIES = 1500;
+
+  private final String name;
+  private final DimensionSpec field;
+  private final int maxNumEntries;
+
+  /**
+   * @param name          output name of the aggregator
+   * @param field         dimension spec of the column whose values are added to the filter
+   * @param maxNumEntries value passed to the {@link BloomKFilter} constructor; {@code null} selects the default
+   */
+  @JsonCreator
+  public BloomFilterAggregatorFactory(
+      @JsonProperty("name") String name,
+      @JsonProperty("field") final DimensionSpec field,
+      @JsonProperty("maxNumEntries") Integer maxNumEntries
+  )
+  {
+    this.name = name;
+    this.field = field;
+    this.maxNumEntries = maxNumEntries != null ? maxNumEntries : DEFAULT_NUM_ENTRIES;
+  }
+
+  @Override
+  public Aggregator factorize(ColumnSelectorFactory columnFactory)
+  {
+    return new BloomFilterAggregator(makeSelectorPlus(columnFactory), maxNumEntries);
+  }
+
+  @Override
+  public BufferAggregator factorizeBuffered(ColumnSelectorFactory columnFactory)
+  {
+    return new BloomFilterBufferAggregator(makeSelectorPlus(columnFactory), maxNumEntries);
+  }
+
+  /**
+   * Creates the selector for {@link #field} together with the strategy chosen by {@link #STRATEGY_FACTORY}.
+   */
+  private ColumnSelectorPlus<BloomFilterAggregatorColumnSelectorStrategy> makeSelectorPlus(
+      ColumnSelectorFactory columnFactory
+  )
+  {
+    return DimensionHandlerUtils.createColumnSelectorPlus(
+        STRATEGY_FACTORY,
+        field,
+        columnFactory
+    );
+  }
+
+  /**
+   * @return a comparator that treats every pair of filters as equal; no ordering is defined for bloom filters here
+   */
+  @Override
+  public Comparator getComparator()
+  {
+    return (Comparator) (o1, o2) -> 0;
+  }
+
+  /**
+   * Merges {@code rhs} into {@code lhs} in place and returns {@code lhs}. If either side is {@code null} the
+   * other side is returned unchanged.
+   */
+  @Override
+  public Object combine(Object lhs, Object rhs)
+  {
+    if (rhs == null) {
+      return lhs;
+    }
+    if (lhs == null) {
+      return rhs;
+    }
+    ((BloomKFilter) lhs).merge((BloomKFilter) rhs);
+    return lhs;
+  }
+
+  @Override
+  public AggregatorFactory getCombiningFactory()
+  {
+    // the combining factory reads the already aggregated filters from the column named like this aggregator
+    return new BloomFilterMergeAggregatorFactory(name, name, maxNumEntries);
+  }
+
+  @Override
+  public List<AggregatorFactory> getRequiredColumns()
+  {
+    return Collections.singletonList(new BloomFilterAggregatorFactory(name, field, maxNumEntries));
+  }
+
+  /**
+   * Turns a serialized filter into a {@link BloomKFilter}. Accepts {@code byte[]}, {@link ByteBuffer} and
+   * base64 encoded {@link String} input; any other object is returned as-is.
+   *
+   * @throws RuntimeException if the bytes cannot be deserialized
+   */
+  @Override
+  public Object deserialize(Object object)
+  {
+    final ByteBuffer buffer;
+
+    if (object instanceof byte[]) {
+      buffer = ByteBuffer.wrap((byte[]) object);
+    } else if (object instanceof ByteBuffer) {
+      // Be conservative, don't assume we own this buffer.
+      buffer = ((ByteBuffer) object).duplicate();
+    } else if (object instanceof String) {
+      buffer = ByteBuffer.wrap(Base64.decodeBase64(StringUtils.toUtf8((String) object)));
+    } else {
+      return object;
+    }
+
+    ByteBufferInputStream byteBufferInputStream = new ByteBufferInputStream(buffer);
+    try {
+      return BloomKFilter.deserialize(byteBufferInputStream);
+    }
+    catch (Exception ex) {
+      throw new RuntimeException("Failed to deserialize bloomK filter", ex);
+    }
+  }
+
+  @Override
+  public Object finalizeComputation(Object object)
+  {
+    return object;
+  }
+
+  @JsonProperty
+  @Override
+  public String getName()
+  {
+    return name;
+  }
+
+  @JsonProperty
+  public DimensionSpec getField()
+  {
+    return field;
+  }
+
+  @JsonProperty
+  public int getMaxNumEntries()
+  {
+    return maxNumEntries;
+  }
+
+  @Override
+  public List<String> requiredFields()
+  {
+    return Collections.singletonList(field.getDimension());
+  }
+
+  @Override
+  public String getTypeName()
+  {
+    return "bloomFilter";
+  }
+
+  /**
+   * Sizes the buffer slot by building a throwaway filter with the configured {@code maxNumEntries}.
+   */
+  @Override
+  public int getMaxIntermediateSize()
+  {
+    BloomKFilter throwaway = new BloomKFilter(maxNumEntries);
+    // NOTE(review): the extra 5 bytes are presumably the serialization header written before the bit set
+    // - confirm against BloomKFilter.serialize
+    return (throwaway.getBitSet().length * Long.BYTES) + 5;
+  }
+
+  @Override
+  public byte[] getCacheKey()
+  {
+    return new CacheKeyBuilder(AggregatorUtil.BLOOM_FILTER_CACHE_TYPE_ID)
+        .appendCacheable(field)
+        .appendInt(maxNumEntries)
+        .build();
+  }
+
+  @Override
+  public boolean equals(Object o)
+  {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+    BloomFilterAggregatorFactory that = (BloomFilterAggregatorFactory) o;
+    return maxNumEntries == that.maxNumEntries &&
+           Objects.equals(name, that.name) &&
+           Objects.equals(field, that.field);
+  }
+
+  @Override
+  public int hashCode()
+  {
+    return Objects.hash(name, field, maxNumEntries);
+  }
+
+  @Override
+  public String toString()
+  {
+    return "BloomFilterAggregatorFactory{" +
+           "name='" + name + '\'' +
+           ", field=" + field +
+           ", maxNumEntries=" + maxNumEntries +
+           '}';
+  }
+}
diff
--git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java new file mode 100644 index 000000000000..7e55935c03e8 --- /dev/null +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.druid.query.aggregation.bloom;
+
+import com.fasterxml.jackson.databind.util.ByteBufferBackedOutputStream;
+import org.apache.druid.io.ByteBufferInputStream;
+import org.apache.druid.query.ColumnSelectorPlus;
+import org.apache.druid.query.aggregation.BufferAggregator;
+import org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategy;
+import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector;
+import org.apache.hive.common.util.BloomKFilter;
+
+import java.nio.ByteBuffer;
+
+
+/**
+ * {@link BufferAggregator} that keeps a serialized {@link BloomKFilter} in the aggregation buffer.
+ *
+ * Every {@link #aggregate} call deserializes the filter stored at {@code position}, lets the column selector
+ * strategy add the current value(s) to it and serializes the whole filter back to the same position.
+ *
+ * NOTE(review): {@code ColumnSelectorPlus} is used as a raw type below; the strategy type argument
+ * (see the otherwise unused BloomFilterAggregatorColumnSelectorStrategy import) was presumably lost - confirm.
+ */
+public class BloomFilterBufferAggregator implements BufferAggregator
+{
+  // selector for the aggregated column plus the strategy that adds its values to a filter
+  private ColumnSelectorPlus selectorPlus;
+  // passed to the BloomKFilter constructor in init()
+  private int maxNumEntries;
+
+  public BloomFilterBufferAggregator(
+      ColumnSelectorPlus selectorPlus,
+      int maxNumEntries
+  )
+  {
+    this.selectorPlus = selectorPlus;
+    this.maxNumEntries = maxNumEntries;
+  }
+
+  /**
+   * Writes a freshly constructed, serialized filter at {@code position}. Works on a duplicate of
+   * {@code buf}, so the position and limit of the caller's buffer are left untouched.
+   */
+  @Override
+  public void init(ByteBuffer buf, int position)
+  {
+    final ByteBuffer mutationBuffer = buf.duplicate();
+    mutationBuffer.position(position);
+    BloomKFilter filter = new BloomKFilter(maxNumEntries);
+    ByteBufferBackedOutputStream wat = new ByteBufferBackedOutputStream(mutationBuffer);
+    try {
+      BloomKFilter.serialize(wat, filter);
+    }
+    catch (Exception ex) {
+      throw new RuntimeException("Failed to initialize bloomK filter", ex);
+    }
+  }
+
+  /**
+   * Reads the filter at {@code position}, adds the current selector value(s) and writes it back.
+   * Unlike {@link #init}, this moves the position of {@code buf} itself, which is why the original
+   * position and limit are captured first and restored in the {@code finally} block.
+   */
+  @Override
+  public void aggregate(ByteBuffer buf, int position)
+  {
+    final int oldPosition = buf.position();
+    final int oldLimit = buf.limit();
+    try {
+      buf.position(position);
+      BloomKFilter collector = BloomKFilter.deserialize(new ByteBufferInputStream(buf));
+      selectorPlus.getColumnSelectorStrategy().add(selectorPlus.getSelector(), collector);
+      // position is reset before writing so the updated filter is written from the slot start again
+      buf.position(position);
+      ByteBufferBackedOutputStream out = new ByteBufferBackedOutputStream(buf);
+      BloomKFilter.serialize(out, collector);
+    }
+    catch (Exception ex) {
+      // NOTE(review): message says "merge" although this aggregator adds column values - confirm wording
+      throw new RuntimeException("Failed to merge bloomK filters", ex);
+    }
+    finally {
+      buf.position(oldPosition);
+      buf.limit(oldLimit);
+    }
+  }
+
+  /**
+   * Deserializes and returns the filter stored at {@code position}, reading through a duplicate of
+   * {@code buf}.
+   */
+  @Override
+  public Object get(ByteBuffer buf, int position)
+  {
+    try {
+      ByteBuffer mutationBuffer = buf.duplicate();
+      mutationBuffer.position(position);
+      BloomKFilter collector = BloomKFilter.deserialize(new ByteBufferInputStream(mutationBuffer));
+      return collector;
+    }
+    catch (Exception ex) {
+      throw new RuntimeException("Failed to deserialize bloomK filter", ex);
+    }
+  }
+
+  @Override
+  public float getFloat(ByteBuffer buf, int position)
+  {
+    throw new UnsupportedOperationException("BloomFilterBufferAggregator does not support getFloat()");
+  }
+
+  @Override
+  public long getLong(ByteBuffer buf, int position)
+  {
+    throw new UnsupportedOperationException("BloomFilterBufferAggregator does not support getLong()");
+  }
+
+  @Override
+  public double getDouble(ByteBuffer buf, int position)
+  {
+    throw new UnsupportedOperationException("BloomFilterBufferAggregator does not support getDouble()");
+  }
+
+  @Override
+  public void close()
+  {
+
+  }
+
+  @Override
+  public void inspectRuntimeShape(RuntimeShapeInspector inspector)
+  {
+    inspector.visit("selectorPlus", selectorPlus);
+  }
+}
diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java
new file mode 100644
index 000000000000..309d8f56d664
--- /dev/null
+++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.query.aggregation.bloom;
+
+import org.apache.druid.query.aggregation.Aggregator;
+import org.apache.druid.segment.ColumnValueSelector;
+import org.apache.hive.common.util.BloomKFilter;
+
+import javax.annotation.Nullable;
+
+/**
+ * On-heap {@link Aggregator} that merges already built {@link BloomKFilter} objects, read from a
+ * column selector, into one filter.
+ */
+public class BloomFilterMergeAggregator implements Aggregator
+{
+  // typed selector: getObject() must yield a BloomKFilter for the assignment in aggregate() to type-check
+  private final ColumnValueSelector<BloomKFilter> selector;
+  private final BloomKFilter collector;
+
+  /**
+   * @param selector      selector returning the filter to merge for the current row, or {@code null}
+   * @param maxNumEntries value passed to the {@link BloomKFilter} constructor of the merge target
+   */
+  public BloomFilterMergeAggregator(
+      ColumnValueSelector<BloomKFilter> selector,
+      int maxNumEntries
+  )
+  {
+    this.selector = selector;
+    this.collector = new BloomKFilter(maxNumEntries);
+  }
+
+  /**
+   * Merges the selector's current filter into the collector; rows without a filter are skipped.
+   */
+  @Override
+  public void aggregate()
+  {
+    final BloomKFilter other = selector.getObject();
+    if (other != null) {
+      collector.merge(other);
+    }
+  }
+
+  /**
+   * @return the live merge target held by this aggregator (not a copy)
+   */
+  @Nullable
+  @Override
+  public Object get()
+  {
+    return collector;
+  }
+
+
+  @Override
+  public float getFloat()
+  {
+    throw new UnsupportedOperationException("BloomFilterMergeAggregator does not support getFloat()");
+  }
+
+  @Override
+  public long getLong()
+  {
+    throw new UnsupportedOperationException("BloomFilterMergeAggregator does not support getLong()");
+  }
+
+  @Override
+  public double getDouble()
+  {
+    throw new UnsupportedOperationException("BloomFilterMergeAggregator does not support getDouble()");
+  }
+
+  @Override
+  public void close()
+  {
+    // nothing to release, the filter lives on heap
+  }
+}
diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java
new file mode 100644 index 000000000000..c56ad462cd8f --- /dev/null +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.druid.query.aggregation.bloom;
+
+import org.apache.druid.query.aggregation.Aggregator;
+import org.apache.druid.query.aggregation.AggregatorFactory;
+import org.apache.druid.query.aggregation.AggregatorUtil;
+import org.apache.druid.query.aggregation.BufferAggregator;
+import org.apache.druid.query.aggregation.NoopAggregator;
+import org.apache.druid.query.aggregation.NoopBufferAggregator;
+import org.apache.druid.query.cache.CacheKeyBuilder;
+import org.apache.druid.segment.ColumnSelectorFactory;
+import org.apache.druid.segment.ColumnValueSelector;
+import org.apache.druid.segment.NilColumnValueSelector;
+import org.apache.hive.common.util.BloomKFilter;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Combining variant of {@link BloomFilterAggregatorFactory}: instead of adding raw column values it merges
+ * {@link BloomKFilter} objects read from the column {@code fieldName}.
+ *
+ * The parent is constructed with a {@code null} dimension spec, so every inherited method that touches the
+ * parent's field must be overridden here.
+ */
+public class BloomFilterMergeAggregatorFactory extends BloomFilterAggregatorFactory
+{
+  private final String fieldName;
+
+  /**
+   * @param name          output name of the aggregator
+   * @param field         name of the column holding the filters to merge
+   * @param maxNumEntries value passed to the {@link BloomKFilter} constructor; {@code null} selects the
+   *                      parent's default
+   */
+  BloomFilterMergeAggregatorFactory(
+      String name,
+      String field,
+      Integer maxNumEntries
+  )
+  {
+    super(name, null, maxNumEntries);
+    this.fieldName = field;
+  }
+
+  @Override
+  public Aggregator factorize(final ColumnSelectorFactory metricFactory)
+  {
+    final ColumnValueSelector<BloomKFilter> selector = metricFactory.makeColumnValueSelector(fieldName);
+    // a nil selector means the column does not exist, so there is nothing to merge
+    if (selector instanceof NilColumnValueSelector) {
+      return NoopAggregator.instance();
+    }
+    return new BloomFilterMergeAggregator(selector, getMaxNumEntries());
+  }
+
+  @Override
+  public BufferAggregator factorizeBuffered(final ColumnSelectorFactory metricFactory)
+  {
+    final ColumnValueSelector<BloomKFilter> selector = metricFactory.makeColumnValueSelector(fieldName);
+    // a nil selector means the column does not exist, so there is nothing to merge
+    if (selector instanceof NilColumnValueSelector) {
+      return NoopBufferAggregator.instance();
+    }
+    return new BloomFilterMergeBufferAggregator(selector, getMaxNumEntries());
+  }
+
+  @Override
+  public List<AggregatorFactory> getRequiredColumns()
+  {
+    return Collections.singletonList(new BloomFilterMergeAggregatorFactory(getName(), fieldName, getMaxNumEntries()));
+  }
+
+  /**
+   * Overridden because the inherited implementation dereferences the parent's dimension spec, which is
+   * {@code null} for this class and would throw a {@link NullPointerException}.
+   */
+  @Override
+  public List<String> requiredFields()
+  {
+    return Collections.singletonList(fieldName);
+  }
+
+  @Override
+  public byte[] getCacheKey()
+  {
+    return new CacheKeyBuilder(AggregatorUtil.BLOOM_FILTER_MERGE_CACHE_TYPE_ID)
+        .appendString(fieldName)
+        .appendInt(getMaxNumEntries())
+        .build();
+  }
+
+  @Override
+  public boolean equals(Object o)
+  {
+    // super.equals() already checks for null, class identity, name and maxNumEntries
+    if (!super.equals(o)) {
+      return false;
+    }
+    return Objects.equals(fieldName, ((BloomFilterMergeAggregatorFactory) o).fieldName);
+  }
+
+  @Override
+  public int hashCode()
+  {
+    return Objects.hash(super.hashCode(), fieldName);
+  }
+
+  @Override
+  public String toString()
+  {
+    return "BloomFilterMergeAggregatorFactory{" +
+           "name='" + getName() + '\'' +
+           ", fieldName='" + fieldName + '\'' +
+           ", maxNumEntries=" + getMaxNumEntries() +
+           '}';
+  }
+}
diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java
new file mode 100644
index 000000000000..191c1be96d01
--- /dev/null
+++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.query.aggregation.bloom;
+
+import com.fasterxml.jackson.databind.util.ByteBufferBackedOutputStream;
+import org.apache.druid.io.ByteBufferInputStream;
+import org.apache.druid.query.aggregation.BufferAggregator;
+import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector;
+import org.apache.druid.segment.ColumnValueSelector;
+import org.apache.hive.common.util.BloomKFilter;
+
+import java.nio.ByteBuffer;
+
+/**
+ * {@link BufferAggregator} that merges {@link BloomKFilter} objects read from a selector into a serialized
+ * filter kept in the aggregation buffer.
+ *
+ * NOTE(review): {@code ColumnValueSelector} is used as a raw type below although {@code getObject()} is
+ * assigned to a {@code BloomKFilter}; the type argument was presumably lost - confirm.
+ */
+public class BloomFilterMergeBufferAggregator implements BufferAggregator
+{
+  // yields the filter to merge for the current row
+  private ColumnValueSelector selector;
+  // passed to the BloomKFilter constructor in init()
+  private int maxNumEntries;
+
+  public BloomFilterMergeBufferAggregator(
+      ColumnValueSelector selector,
+      int maxNumEntries
+  )
+  {
+    this.selector = selector;
+    this.maxNumEntries = maxNumEntries;
+  }
+
+  /**
+   * Writes a freshly constructed, serialized filter at {@code position}. Works on a duplicate of
+   * {@code buf}, so the position and limit of the caller's buffer are left untouched.
+   */
+  @Override
+  public void init(ByteBuffer buf, int position)
+  {
+    final ByteBuffer mutationBuffer = buf.duplicate();
+    mutationBuffer.position(position);
+    BloomKFilter filter = new BloomKFilter(maxNumEntries);
+    ByteBufferBackedOutputStream outputStream = new ByteBufferBackedOutputStream(mutationBuffer);
+    try {
+      BloomKFilter.serialize(outputStream, filter);
+    }
+    catch (Exception ex) {
+      throw new RuntimeException("Failed to initialize bloomK filter", ex);
+    }
+  }
+
+  /**
+   * Reads the filter at {@code position} and, if the selector currently yields a filter, merges it in and
+   * writes the result back. This moves the position of {@code buf} itself, which is why the original
+   * position and limit are captured first and restored in the {@code finally} block.
+   */
+  @Override
+  public void aggregate(ByteBuffer buf, int position)
+  {
+    final int oldPosition = buf.position();
+    final int oldLimit = buf.limit();
+    try {
+      buf.position(position);
+      // the stored filter is deserialized before the null check, i.e. also for rows that contribute nothing
+      BloomKFilter collector = BloomKFilter.deserialize(new ByteBufferInputStream(buf));
+      BloomKFilter other = selector.getObject();
+      if (other != null) {
+        collector.merge(other);
+        // position is reset before writing so the merged filter is written from the slot start again
+        buf.position(position);
+        ByteBufferBackedOutputStream out = new ByteBufferBackedOutputStream(buf);
+        BloomKFilter.serialize(out, collector);
+      }
+    }
+    catch (Exception ex) {
+      throw new RuntimeException("Failed to merge bloomK filters", ex);
+    }
+    finally {
+      buf.position(oldPosition);
+      buf.limit(oldLimit);
+    }
+  }
+
+  /**
+   * Deserializes and returns the filter stored at {@code position}, reading through a duplicate of
+   * {@code buf}.
+   */
+  @Override
+  public Object get(ByteBuffer buf, int position)
+  {
+    try {
+      ByteBuffer mutationBuffer = buf.duplicate();
+      mutationBuffer.position(position);
+      BloomKFilter collector = BloomKFilter.deserialize(new ByteBufferInputStream(mutationBuffer));
+      return collector;
+    }
+    catch (Exception ex) {
+      throw new RuntimeException("Failed to deserialize bloomK filter", ex);
+    }
+  }
+
+  @Override
+  public float getFloat(ByteBuffer buf, int position)
+  {
+    throw new UnsupportedOperationException("BloomFilterMergeBufferAggregator does not support getFloat()");
+  }
+
+  @Override
+  public long getLong(ByteBuffer buf, int position)
+  {
+    throw new UnsupportedOperationException("BloomFilterMergeBufferAggregator does not support getLong()");
+  }
+
+  @Override
+  public double getDouble(ByteBuffer buf, int position)
+  {
+    throw new UnsupportedOperationException("BloomFilterMergeBufferAggregator does not support getDouble()");
+  }
+
+  @Override
+  public void close()
+  {
+
+  }
+
+  @Override
+  public void inspectRuntimeShape(RuntimeShapeInspector inspector)
+  {
+    inspector.visit("selector", selector);
+  }
+}
diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java
new file mode 100644
index 000000000000..531efdca980b
--- /dev/null
+++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.aggregation.bloom.types; + +import org.apache.druid.query.dimension.ColumnSelectorStrategy; +import org.apache.hive.common.util.BloomKFilter; + +public interface BloomFilterAggregatorColumnSelectorStrategy extends ColumnSelectorStrategy +{ + /** + * Add column value to bloomK filter + */ + void add(TValueSelector selector, BloomKFilter bloomFilter); +} diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategyFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategyFactory.java new file mode 100644 index 000000000000..c84696a67c7b --- /dev/null +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategyFactory.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.aggregation.bloom.types; + +import org.apache.druid.java.util.common.IAE; +import org.apache.druid.query.dimension.ColumnSelectorStrategyFactory; +import org.apache.druid.segment.ColumnValueSelector; +import org.apache.druid.segment.column.ColumnCapabilities; +import org.apache.druid.segment.column.ValueType; + +public class BloomFilterAggregatorColumnSelectorStrategyFactory + implements ColumnSelectorStrategyFactory +{ + @Override + public BloomFilterAggregatorColumnSelectorStrategy makeColumnSelectorStrategy( + ColumnCapabilities capabilities, ColumnValueSelector selector + ) + { + ValueType type = capabilities.getType(); + switch (type) { + case STRING: + return new StringBloomFilterAggregatorColumnSelectorStrategy(); + case LONG: + return new LongBloomFilterAggregatorColumnValueSelectorStrategy(); + case FLOAT: + return new FloatBloomFilterAggregatorColumnSelectorStrategy(); + case DOUBLE: + return new DoubleBloomFilterAggregatorColumnSelectorStrategy(); + default: + throw new IAE("Cannot create query type helper from invalid type [%s]", type); + } + } +} diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java new file mode 100644 index 000000000000..848e4e4285af --- /dev/null +++ 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.aggregation.bloom.types; + +import org.apache.druid.common.config.NullHandling; +import org.apache.druid.segment.DoubleColumnSelector; +import org.apache.hive.common.util.BloomKFilter; + +public class DoubleBloomFilterAggregatorColumnSelectorStrategy + implements BloomFilterAggregatorColumnSelectorStrategy +{ + @Override + public void add(DoubleColumnSelector selector, BloomKFilter bloomFilter) + { + if (NullHandling.replaceWithDefault() || !selector.isNull()) { + bloomFilter.addDouble(selector.getDouble()); + } + } +} diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java new file mode 100644 index 000000000000..ae2c5e522b12 --- /dev/null +++ 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.aggregation.bloom.types; + +import org.apache.druid.common.config.NullHandling; +import org.apache.druid.segment.FloatColumnSelector; +import org.apache.hive.common.util.BloomKFilter; + +public class FloatBloomFilterAggregatorColumnSelectorStrategy + implements BloomFilterAggregatorColumnSelectorStrategy +{ + @Override + public void add(FloatColumnSelector selector, BloomKFilter bloomFilter) + { + if (NullHandling.replaceWithDefault() || !selector.isNull()) { + bloomFilter.addFloat(selector.getFloat()); + } + } +} diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java new file mode 100644 index 000000000000..4b6d931e8823 --- /dev/null +++ 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.aggregation.bloom.types; + +import org.apache.druid.common.config.NullHandling; +import org.apache.druid.segment.LongColumnSelector; +import org.apache.hive.common.util.BloomKFilter; + +public class LongBloomFilterAggregatorColumnValueSelectorStrategy + implements BloomFilterAggregatorColumnSelectorStrategy +{ + @Override + public void add(LongColumnSelector selector, BloomKFilter bloomFilter) + { + if (NullHandling.replaceWithDefault() || !selector.isNull()) { + bloomFilter.addLong(selector.getLong()); + } + } +} diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java new file mode 100644 index 000000000000..700f17275b1e --- /dev/null +++ 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.aggregation.bloom.types; + +import org.apache.druid.segment.DimensionSelector; +import org.apache.hive.common.util.BloomKFilter; + +public class StringBloomFilterAggregatorColumnSelectorStrategy + implements BloomFilterAggregatorColumnSelectorStrategy +{ + @Override + public void add(DimensionSelector selector, BloomKFilter bloomFilter) + { + if (selector.getRow().size() > 1) { + String[] strings = (String[]) selector.getObject(); + for (String value : strings) { + if (value == null) { + bloomFilter.addBytes(null, 0, 0); + } else { + bloomFilter.addString(value); + } + } + } else { + String value = (String) selector.getObject(); + if (value == null) { + bloomFilter.addBytes(null, 0, 0); + } else { + bloomFilter.addString(value); + } + } + } +} diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java 
b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java new file mode 100644 index 000000000000..57443683b847 --- /dev/null +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java @@ -0,0 +1,630 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.query.aggregation.bloom; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Function; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; +import org.apache.commons.codec.binary.Base64; +import org.apache.druid.common.config.NullHandling; +import org.apache.druid.guice.BloomFilterExtensionModule; +import org.apache.druid.jackson.DefaultObjectMapper; +import org.apache.druid.query.ColumnSelectorPlus; +import org.apache.druid.query.aggregation.Aggregator; +import org.apache.druid.query.aggregation.AggregatorFactory; +import org.apache.druid.query.aggregation.BufferAggregator; +import org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategy; +import org.apache.druid.query.aggregation.bloom.types.DoubleBloomFilterAggregatorColumnSelectorStrategy; +import org.apache.druid.query.aggregation.bloom.types.FloatBloomFilterAggregatorColumnSelectorStrategy; +import org.apache.druid.query.aggregation.bloom.types.LongBloomFilterAggregatorColumnValueSelectorStrategy; +import org.apache.druid.query.aggregation.bloom.types.StringBloomFilterAggregatorColumnSelectorStrategy; +import org.apache.druid.query.aggregation.cardinality.CardinalityAggregatorTest; +import org.apache.druid.query.dimension.DefaultDimensionSpec; +import org.apache.druid.query.dimension.DimensionSpec; +import org.apache.druid.query.dimension.ExtractionDimensionSpec; +import org.apache.druid.query.dimension.RegexFilteredDimensionSpec; +import org.apache.druid.query.extraction.RegexDimExtractionFn; +import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector; +import org.apache.druid.segment.ColumnValueSelector; +import org.apache.druid.segment.DimensionSelector; +import org.apache.druid.segment.DoubleColumnSelector; +import org.apache.druid.segment.FloatColumnSelector; +import org.apache.druid.segment.LongColumnSelector; +import 
org.apache.hive.common.util.BloomKFilter; +import org.junit.Assert; +import org.junit.Test; + +import javax.annotation.Nullable; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class BloomFilterAggregatorTest +{ + private static final String nullish = NullHandling.replaceWithDefault() ? "" : null; + private static final List values1 = dimensionValues( + "a", + "b", + "c", + "a", + "a", + nullish, + "b", + "b", + "b", + "b", + "a", + "a" + ); + private static final List values2 = dimensionValues( + "a", + "b", + "c", + "x", + "a", + "e", + "b", + new String[]{nullish, "x"}, + new String[]{"x", nullish}, + new String[]{"y", "x"}, + new String[]{"x", "y"}, + new String[]{"x", "y", "a"} + ); + private static final Double[] doubleValues1 = new Double[]{0.1, 1.5, 18.3, 0.1}; + private static final Float[] floatValues1 = new Float[]{0.4f, 0.8f, 23.2f}; + private static final Long[] longValues1 = new Long[]{10241L, 12312355L, 0L, 81L}; + + private static final int maxNumValues = 15; + + private static BloomKFilter filter1; + private static BloomKFilter filter2; + + private static String serializedFilter1; + private static String serializedFilter2; + private static String serializedCombinedFilter; + private static String serializedLongFilter; + private static String serializedDoubleFilter; + private static String serializedFloatFilter; + + static { + try { + filter1 = new BloomKFilter(maxNumValues); + filter2 = new BloomKFilter(maxNumValues); + BloomKFilter combinedValuesFilter = new BloomKFilter(maxNumValues); + + for (String[] values : values1) { + for (String val : values) { + if (!NullHandling.replaceWithDefault() && val == null) { + filter1.addBytes(null, 0, 0); + combinedValuesFilter.addBytes(null, 0, 0); + } else { + filter1.addString(NullHandling.nullToEmptyIfNeeded(val)); + 
combinedValuesFilter.addString(NullHandling.nullToEmptyIfNeeded(val)); + } + } + } + for (String[] values : values2) { + for (String val : values) { + if (!NullHandling.replaceWithDefault() && val == null) { + filter2.addBytes(null, 0, 0); + combinedValuesFilter.addBytes(null, 0, 0); + } else { + filter2.addString(NullHandling.nullToEmptyIfNeeded(val)); + combinedValuesFilter.addString(NullHandling.nullToEmptyIfNeeded(val)); + } + } + } + + serializedFilter1 = filterToString(filter1); + serializedFilter2 = filterToString(filter2); + serializedCombinedFilter = filterToString(combinedValuesFilter); + + BloomKFilter longFilter = new BloomKFilter(maxNumValues); + for (long val : longValues1) { + longFilter.addLong(val); + } + serializedLongFilter = filterToString(longFilter); + + BloomKFilter floatFilter = new BloomKFilter(maxNumValues); + for (float val : floatValues1) { + floatFilter.addFloat(val); + } + serializedFloatFilter = filterToString(floatFilter); + + BloomKFilter doubleFilter = new BloomKFilter(maxNumValues); + for (double val : doubleValues1) { + doubleFilter.addDouble(val); + } + serializedDoubleFilter = filterToString(doubleFilter); + + } + catch (Exception ex) { + throw new RuntimeException(ex); + } + } + + private final DimensionSpec dimSpec = new DefaultDimensionSpec("dim1", "dim1"); + private BloomFilterAggregatorFactory valueAggregatorFactory; + public BloomFilterAggregatorTest() + { + + valueAggregatorFactory = new BloomFilterAggregatorFactory( + "billy", + dimSpec, + maxNumValues + ); + + } + + private static List dimensionValues(Object... 
values) + { + return Lists.transform( + Lists.newArrayList(values), new Function() + { + @Nullable + @Override + public String[] apply(@Nullable Object input) + { + if (input instanceof String[]) { + return (String[]) input; + } else { + return new String[]{(String) input}; + } + } + } + ); + } + + private static void aggregateDimension(List selectorList, Aggregator agg) + { + agg.aggregate(); + + for (DimensionSelector selector : selectorList) { + ((CardinalityAggregatorTest.TestDimensionSelector) selector).increment(); + } + } + + private static void bufferAggregateDimension( + List selectorList, + BufferAggregator agg, + ByteBuffer buf, + int pos + ) + { + agg.aggregate(buf, pos); + + for (DimensionSelector selector : selectorList) { + ((CardinalityAggregatorTest.TestDimensionSelector) selector).increment(); + } + } + + private static void aggregateColumn(List selectorList, Aggregator agg) + { + agg.aggregate(); + + for (SteppableSelector selector : selectorList) { + selector.increment(); + } + } + + private static void bufferAggregateColumn( + List selectorList, + BufferAggregator agg, + ByteBuffer buf, + int pos + ) + { + agg.aggregate(buf, pos); + + for (SteppableSelector selector : selectorList) { + selector.increment(); + } + } + + private static String filterToString(BloomKFilter bloomKFilter) throws IOException + { + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + BloomKFilter.serialize(byteArrayOutputStream, bloomKFilter); + byte[] bytes = byteArrayOutputStream.toByteArray(); + return Base64.encodeBase64String(bytes); + } + + @Test + public void testAggregateValues() throws IOException + { + DimensionSelector dimSelector = new CardinalityAggregatorTest.TestDimensionSelector(values1, null); + BloomFilterAggregator agg = new BloomFilterAggregator( + new ColumnSelectorPlus<>( + dimSpec.getDimension(), + dimSpec.getOutputName(), + new StringBloomFilterAggregatorColumnSelectorStrategy(), + dimSelector + ), + maxNumValues + ); + + 
for (int i = 0; i < values1.size(); ++i) { + aggregateDimension(Collections.singletonList(dimSelector), agg); + } + + BloomKFilter bloomKFilter = (BloomKFilter) valueAggregatorFactory.finalizeComputation(agg.get()); + String serialized = filterToString(bloomKFilter); + Assert.assertEquals(serializedFilter1, serialized); + } + + @Test + public void testAggregateLongValues() throws IOException + { + TestLongColumnSelector selector = new TestLongColumnSelector(Arrays.asList(longValues1)); + BloomFilterAggregator agg = new BloomFilterAggregator( + new ColumnSelectorPlus<>( + "longColumn", + "longlongman", + new LongBloomFilterAggregatorColumnValueSelectorStrategy(), + selector + ), + maxNumValues + ); + + for (Long ignored : longValues1) { + aggregateColumn(Collections.singletonList(selector), agg); + } + + BloomKFilter bloomKFilter = (BloomKFilter) valueAggregatorFactory.finalizeComputation(agg.get()); + String serialized = filterToString(bloomKFilter); + Assert.assertEquals(serializedLongFilter, serialized); + } + + @Test + public void testAggregateFloatValues() throws IOException + { + TestFloatColumnSelector selector = new TestFloatColumnSelector(Arrays.asList(floatValues1)); + BloomFilterAggregator agg = new BloomFilterAggregator( + new ColumnSelectorPlus<>( + "floatColumn", + "floatColumn", + new FloatBloomFilterAggregatorColumnSelectorStrategy(), + selector + ), + maxNumValues + ); + + for (Float ignored : floatValues1) { + aggregateColumn(Collections.singletonList(selector), agg); + } + + BloomKFilter bloomKFilter = (BloomKFilter) valueAggregatorFactory.finalizeComputation(agg.get()); + String serialized = filterToString(bloomKFilter); + Assert.assertEquals(serializedFloatFilter, serialized); + } + + @Test + public void testAggregateDoubleValues() throws IOException + { + TestDoubleColumnSelector selector = new TestDoubleColumnSelector(Arrays.asList(doubleValues1)); + BloomFilterAggregator agg = new BloomFilterAggregator( + new ColumnSelectorPlus<>( + 
"doubleColumn", + "doubleColumn", + new DoubleBloomFilterAggregatorColumnSelectorStrategy(), + selector + ), + maxNumValues + ); + + for (Double ignored : doubleValues1) { + aggregateColumn(Collections.singletonList(selector), agg); + } + + BloomKFilter bloomKFilter = (BloomKFilter) valueAggregatorFactory.finalizeComputation(agg.get()); + String serialized = filterToString(bloomKFilter); + Assert.assertEquals(serializedDoubleFilter, serialized); + } + + @Test + public void testBufferAggregateValues() throws IOException + { + DimensionSelector dimSelector = new CardinalityAggregatorTest.TestDimensionSelector(values2, null); + BloomFilterBufferAggregator agg = new BloomFilterBufferAggregator( + new ColumnSelectorPlus<>( + dimSpec.getDimension(), + dimSpec.getOutputName(), + new StringBloomFilterAggregatorColumnSelectorStrategy(), + dimSelector + ), + maxNumValues + ); + + int maxSize = valueAggregatorFactory.getMaxIntermediateSizeWithNulls(); + ByteBuffer buf = ByteBuffer.allocate(maxSize + 64); + int pos = 10; + buf.limit(pos + maxSize); + + agg.init(buf, pos); + + for (int i = 0; i < values2.size(); ++i) { + bufferAggregateDimension(Collections.singletonList(dimSelector), agg, buf, pos); + } + BloomKFilter bloomKFilter = (BloomKFilter) valueAggregatorFactory.finalizeComputation(agg.get(buf, pos)); + String serialized = filterToString(bloomKFilter); + Assert.assertEquals(serializedFilter2, serialized); + } + + @Test + public void testCombineValues() throws IOException + { + DimensionSelector dimSelector1 = new CardinalityAggregatorTest.TestDimensionSelector(values1, null); + DimensionSelector dimSelector2 = new CardinalityAggregatorTest.TestDimensionSelector(values2, null); + + ColumnSelectorPlus selector1 = new ColumnSelectorPlus<>( + dimSpec.getDimension(), + dimSpec.getOutputName(), + new StringBloomFilterAggregatorColumnSelectorStrategy(), + dimSelector1 + ); + + ColumnSelectorPlus selector2 = new ColumnSelectorPlus<>( + dimSpec.getDimension(), + 
dimSpec.getOutputName(), + new StringBloomFilterAggregatorColumnSelectorStrategy(), + dimSelector2 + ); + + BloomFilterAggregator agg1 = new BloomFilterAggregator( + selector1, + maxNumValues + ); + BloomFilterAggregator agg2 = new BloomFilterAggregator( + selector2, + maxNumValues + ); + + for (int i = 0; i < values1.size(); ++i) { + aggregateDimension(Collections.singletonList(dimSelector1), agg1); + } + for (int i = 0; i < values2.size(); ++i) { + aggregateDimension(Collections.singletonList(dimSelector2), agg2); + } + + BloomKFilter combined = (BloomKFilter) valueAggregatorFactory.finalizeComputation( + valueAggregatorFactory.combine( + agg1.get(), + agg2.get() + ) + ); + + String serialized = filterToString(combined); + Assert.assertEquals(serializedCombinedFilter, serialized); + } + + @Test + public void testMergeValues() throws IOException + { + final TestBloomFilterColumnSelector mergeDim = + new TestBloomFilterColumnSelector(ImmutableList.of(filter1, filter2)); + + BloomFilterMergeAggregator mergeAggregator = new BloomFilterMergeAggregator(mergeDim, maxNumValues); + + for (int i = 0; i < 2; ++i) { + aggregateColumn(Collections.singletonList(mergeDim), mergeAggregator); + } + + + BloomKFilter merged = (BloomKFilter) valueAggregatorFactory.getCombiningFactory() + .finalizeComputation(mergeAggregator.get()); + String serialized = filterToString(merged); + Assert.assertEquals(serializedCombinedFilter, serialized); + } + + @Test + public void testBuferMergeValues() throws IOException + { + final TestBloomFilterColumnSelector mergeDim = + new TestBloomFilterColumnSelector(ImmutableList.of(filter1, filter2)); + + BloomFilterMergeBufferAggregator mergeAggregator = new BloomFilterMergeBufferAggregator(mergeDim, maxNumValues); + + int maxSize = valueAggregatorFactory.getCombiningFactory().getMaxIntermediateSizeWithNulls(); + ByteBuffer buf = ByteBuffer.allocate(maxSize + 64); + int pos = 10; + buf.limit(pos + maxSize); + + mergeAggregator.init(buf, pos); + + for 
(int i = 0; i < 2; ++i) { + bufferAggregateColumn(Collections.singletonList(mergeDim), mergeAggregator, buf, pos); + } + + BloomKFilter merged = (BloomKFilter) valueAggregatorFactory.getCombiningFactory() + .finalizeComputation(mergeAggregator.get(buf, pos)); + String serialized = filterToString(merged); + + Assert.assertEquals(serializedCombinedFilter, serialized); + } + + @Test + public void testSerde() throws Exception + { + BloomFilterAggregatorFactory factory = new BloomFilterAggregatorFactory( + "billy", + new DefaultDimensionSpec("b", "b"), + maxNumValues + ); + ObjectMapper objectMapper = new DefaultObjectMapper(); + new BloomFilterExtensionModule().getJacksonModules().forEach(objectMapper::registerModule); + Assert.assertEquals( + factory, + objectMapper.readValue(objectMapper.writeValueAsString(factory), AggregatorFactory.class) + ); + + String fieldNamesOnly = "{" + + "\"type\":\"bloom\"," + + "\"name\":\"billy\"," + + "\"field\":\"b\"," + + "\"maxNumEntries\":15" + + "}"; + Assert.assertEquals( + factory, + objectMapper.readValue(fieldNamesOnly, AggregatorFactory.class) + ); + + BloomFilterAggregatorFactory factory2 = new BloomFilterAggregatorFactory( + "billy", + new ExtractionDimensionSpec("b", "b", new RegexDimExtractionFn(".*", false, null)), + maxNumValues + ); + + Assert.assertEquals( + factory2, + objectMapper.readValue(objectMapper.writeValueAsString(factory2), AggregatorFactory.class) + ); + + BloomFilterAggregatorFactory factory3 = new BloomFilterAggregatorFactory( + "billy", + new RegexFilteredDimensionSpec(new DefaultDimensionSpec("a", "a"), ".*"), + maxNumValues + ); + Assert.assertEquals( + factory3, + objectMapper.readValue(objectMapper.writeValueAsString(factory3), AggregatorFactory.class) + ); + } + + private abstract static class SteppableSelector implements ColumnValueSelector + { + List values; + int pos; + + public SteppableSelector(List values) + { + this.values = values; + this.pos = 0; + } + + @Nullable + @Override + public T 
getObject() + { + return values.get(pos); + } + + public void increment() + { + pos++; + } + + public void reset() + { + pos = 0; + } + + + @Override + public double getDouble() + { + throw new UnsupportedOperationException(); + } + + @Override + public float getFloat() + { + throw new UnsupportedOperationException(); + } + + @Override + public long getLong() + { + throw new UnsupportedOperationException(); + } + + + @Override + public void inspectRuntimeShape(RuntimeShapeInspector inspector) + { + + } + + @Override + public Class classOfObject() + { + return null; + } + + @Override + public boolean isNull() + { + return false; + } + } + + public static class TestBloomFilterColumnSelector extends SteppableSelector + { + public TestBloomFilterColumnSelector(List values) + { + super(values); + } + } + + public static class TestLongColumnSelector extends SteppableSelector implements LongColumnSelector + { + public TestLongColumnSelector(List values) + { + super(values); + } + + @Override + public long getLong() + { + return values.get(pos); + } + } + + public static class TestFloatColumnSelector extends SteppableSelector implements FloatColumnSelector + { + public TestFloatColumnSelector(List values) + { + super(values); + } + + @Override + public float getFloat() + { + return values.get(pos); + } + } + + public static class TestDoubleColumnSelector extends SteppableSelector implements DoubleColumnSelector + { + public TestDoubleColumnSelector(List values) + { + super(values); + } + + @Override + public double getDouble() + { + return values.get(pos); + } + } +} diff --git a/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java b/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java index 1d291b7360d4..aa2b44c4b015 100644 --- a/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java +++ b/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java @@ -100,6 +100,9 @@ public 
class AggregatorUtil public static final byte STRING_FIRST_CACHE_TYPE_ID = 0x2B; public static final byte STRING_LAST_CACHE_TYPE_ID = 0x2C; + public static final byte BLOOM_FILTER_CACHE_TYPE_ID = 0x30; + public static final byte BLOOM_FILTER_MERGE_CACHE_TYPE_ID = 0x31; + // Suppressed aggregator public static final byte SUPPRESSED_AGG_CACHE_TYPE_ID = 0x2D; From e1c9f77d8072c4f607863e6586814d2a0956cceb Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 5 Oct 2018 17:25:30 -0700 Subject: [PATCH 02/36] partially address review --- .../bloom/BloomFilterAggregator.java | 6 +++--- .../bloom/BloomFilterAggregatorFactory.java | 11 ++++++---- .../bloom/BloomFilterBufferAggregator.java | 20 +++++++++--------- .../bloom/BloomFilterMergeAggregator.java | 11 ++++------ .../BloomFilterMergeAggregatorFactory.java | 8 ++----- .../BloomFilterMergeBufferAggregator.java | 21 ++++++++----------- ...gregatorColumnSelectorStrategyFactory.java | 3 ++- ...ilterAggregatorColumnSelectorStrategy.java | 6 +++--- ...ilterAggregatorColumnSelectorStrategy.java | 6 +++--- ...AggregatorColumnValueSelectorStrategy.java | 6 +++--- ...ilterAggregatorColumnSelectorStrategy.java | 6 +++--- .../query/aggregation/AggregatorUtil.java | 7 ++++--- 12 files changed, 53 insertions(+), 58 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java index a0d67850ba80..f6514bfc378e 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java @@ -28,8 +28,8 @@ public class BloomFilterAggregator implements Aggregator { - private ColumnSelectorPlus selectorPlus; - private BloomKFilter collector; + private final 
ColumnSelectorPlus selectorPlus; + private final BloomKFilter collector; public BloomFilterAggregator( ColumnSelectorPlus selectorPlus, @@ -74,6 +74,6 @@ public double getDouble() @Override public void close() { - + // nothing to close } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index de5dcb94ff49..53c3c03b073a 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -37,6 +37,8 @@ import org.apache.druid.segment.DimensionHandlerUtils; import org.apache.hive.common.util.BloomKFilter; +import javax.annotation.Nullable; +import java.io.IOException; import java.nio.ByteBuffer; import java.util.Collections; import java.util.Comparator; @@ -45,6 +47,7 @@ public class BloomFilterAggregatorFactory extends AggregatorFactory { + private static final int DEFAULT_NUM_ENTRIES = 1500; protected static final BloomFilterAggregatorColumnSelectorStrategyFactory STRATEGY_FACTORY = new BloomFilterAggregatorColumnSelectorStrategyFactory(); @@ -56,12 +59,12 @@ public class BloomFilterAggregatorFactory extends AggregatorFactory public BloomFilterAggregatorFactory( @JsonProperty("name") String name, @JsonProperty("field") final DimensionSpec field, - @JsonProperty("maxNumEntries") Integer maxNumEntries + @Nullable @JsonProperty("maxNumEntries") Integer maxNumEntries ) { this.name = name; this.field = field; - this.maxNumEntries = maxNumEntries != null ? maxNumEntries : 1500; + this.maxNumEntries = maxNumEntries != null ? 
maxNumEntries : DEFAULT_NUM_ENTRIES; } @Override @@ -98,7 +101,7 @@ public Comparator getComparator() } @Override - public Object combine(Object lhs, Object rhs) + public Object combine(@Nullable Object lhs, @Nullable Object rhs) { if (rhs == null) { return lhs; @@ -142,7 +145,7 @@ public Object deserialize(Object object) try { return BloomKFilter.deserialize(byteBufferInputStream); } - catch (Exception ex) { + catch (IOException ex) { throw new RuntimeException("Failed to deserialize bloomK filter", ex); } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java index 7e55935c03e8..ec2b3dfc0146 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java @@ -27,13 +27,14 @@ import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector; import org.apache.hive.common.util.BloomKFilter; +import java.io.IOException; import java.nio.ByteBuffer; public class BloomFilterBufferAggregator implements BufferAggregator { - private ColumnSelectorPlus selectorPlus; - private int maxNumEntries; + private final ColumnSelectorPlus selectorPlus; + private final int maxNumEntries; public BloomFilterBufferAggregator( ColumnSelectorPlus selectorPlus, @@ -50,11 +51,11 @@ public void init(ByteBuffer buf, int position) final ByteBuffer mutationBuffer = buf.duplicate(); mutationBuffer.position(position); BloomKFilter filter = new BloomKFilter(maxNumEntries); - ByteBufferBackedOutputStream wat = new ByteBufferBackedOutputStream(mutationBuffer); + ByteBufferBackedOutputStream outputStream = new ByteBufferBackedOutputStream(mutationBuffer); try { - BloomKFilter.serialize(wat, 
filter); + BloomKFilter.serialize(outputStream, filter); } - catch (Exception ex) { + catch (IOException ex) { throw new RuntimeException("Failed to initialize bloomK filter", ex); } } @@ -72,7 +73,7 @@ public void aggregate(ByteBuffer buf, int position) ByteBufferBackedOutputStream out = new ByteBufferBackedOutputStream(buf); BloomKFilter.serialize(out, collector); } - catch (Exception ex) { + catch (IOException ex) { throw new RuntimeException("Failed to merge bloomK filters", ex); } finally { @@ -87,10 +88,9 @@ public Object get(ByteBuffer buf, int position) try { ByteBuffer mutationBuffer = buf.duplicate(); mutationBuffer.position(position); - BloomKFilter collector = BloomKFilter.deserialize(new ByteBufferInputStream(mutationBuffer)); - return collector; + return BloomKFilter.deserialize(new ByteBufferInputStream(mutationBuffer)); } - catch (Exception ex) { + catch (IOException ex) { throw new RuntimeException("Failed to deserialize bloomK filter", ex); } } @@ -116,7 +116,7 @@ public double getDouble(ByteBuffer buf, int position) @Override public void close() { - + // nothing to close } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java index 309d8f56d664..6a6d2ced6cdf 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java @@ -27,13 +27,10 @@ public class BloomFilterMergeAggregator implements Aggregator { - private ColumnValueSelector selector; - private BloomKFilter collector; + private final ColumnValueSelector selector; + private final BloomKFilter collector; - public BloomFilterMergeAggregator( - ColumnValueSelector selector, - int 
maxNumEntries - ) + public BloomFilterMergeAggregator(ColumnValueSelector selector, int maxNumEntries) { this.selector = selector; this.collector = new BloomKFilter(maxNumEntries); @@ -77,6 +74,6 @@ public double getDouble() @Override public void close() { - + // nothing to close } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java index c56ad462cd8f..a53a5eab54d6 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java @@ -36,13 +36,9 @@ public class BloomFilterMergeAggregatorFactory extends BloomFilterAggregatorFactory { - private String fieldName; + private final String fieldName; - BloomFilterMergeAggregatorFactory( - String name, - String field, - Integer maxNumEntries - ) + BloomFilterMergeAggregatorFactory(String name, String field, Integer maxNumEntries) { super(name, null, maxNumEntries); this.fieldName = field; diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java index 191c1be96d01..007e4fa88706 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java @@ -26,17 +26,15 @@ import org.apache.druid.segment.ColumnValueSelector; import org.apache.hive.common.util.BloomKFilter; +import 
java.io.IOException; import java.nio.ByteBuffer; public class BloomFilterMergeBufferAggregator implements BufferAggregator { - private ColumnValueSelector selector; - private int maxNumEntries; + private final ColumnValueSelector selector; + private final int maxNumEntries; - public BloomFilterMergeBufferAggregator( - ColumnValueSelector selector, - int maxNumEntries - ) + public BloomFilterMergeBufferAggregator(ColumnValueSelector selector, int maxNumEntries) { this.selector = selector; this.maxNumEntries = maxNumEntries; @@ -52,7 +50,7 @@ public void init(ByteBuffer buf, int position) try { BloomKFilter.serialize(outputStream, filter); } - catch (Exception ex) { + catch (IOException ex) { throw new RuntimeException("Failed to initialize bloomK filter", ex); } } @@ -73,7 +71,7 @@ public void aggregate(ByteBuffer buf, int position) BloomKFilter.serialize(out, collector); } } - catch (Exception ex) { + catch (IOException ex) { throw new RuntimeException("Failed to merge bloomK filters", ex); } finally { @@ -88,10 +86,9 @@ public Object get(ByteBuffer buf, int position) try { ByteBuffer mutationBuffer = buf.duplicate(); mutationBuffer.position(position); - BloomKFilter collector = BloomKFilter.deserialize(new ByteBufferInputStream(mutationBuffer)); - return collector; + return BloomKFilter.deserialize(new ByteBufferInputStream(mutationBuffer)); } - catch (Exception ex) { + catch (IOException ex) { throw new RuntimeException("Failed to deserialize bloomK filter", ex); } } @@ -117,7 +114,7 @@ public double getDouble(ByteBuffer buf, int position) @Override public void close() { - + // nothing to close } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategyFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategyFactory.java index c84696a67c7b..65197271c45f 100644 --- 
a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategyFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategyFactory.java @@ -30,7 +30,8 @@ public class BloomFilterAggregatorColumnSelectorStrategyFactory { @Override public BloomFilterAggregatorColumnSelectorStrategy makeColumnSelectorStrategy( - ColumnCapabilities capabilities, ColumnValueSelector selector + ColumnCapabilities capabilities, + ColumnValueSelector selector ) { ValueType type = capabilities.getType(); diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java index 848e4e4285af..cbb5b0465aad 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java @@ -20,14 +20,14 @@ package org.apache.druid.query.aggregation.bloom.types; import org.apache.druid.common.config.NullHandling; -import org.apache.druid.segment.DoubleColumnSelector; +import org.apache.druid.segment.BaseDoubleColumnValueSelector; import org.apache.hive.common.util.BloomKFilter; public class DoubleBloomFilterAggregatorColumnSelectorStrategy - implements BloomFilterAggregatorColumnSelectorStrategy + implements BloomFilterAggregatorColumnSelectorStrategy { @Override - public void add(DoubleColumnSelector selector, BloomKFilter bloomFilter) + public void add(BaseDoubleColumnValueSelector selector, BloomKFilter bloomFilter) { if 
(NullHandling.replaceWithDefault() || !selector.isNull()) { bloomFilter.addDouble(selector.getDouble()); diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java index ae2c5e522b12..812f1082e0ba 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java @@ -20,14 +20,14 @@ package org.apache.druid.query.aggregation.bloom.types; import org.apache.druid.common.config.NullHandling; -import org.apache.druid.segment.FloatColumnSelector; +import org.apache.druid.segment.BaseFloatColumnValueSelector; import org.apache.hive.common.util.BloomKFilter; public class FloatBloomFilterAggregatorColumnSelectorStrategy - implements BloomFilterAggregatorColumnSelectorStrategy + implements BloomFilterAggregatorColumnSelectorStrategy { @Override - public void add(FloatColumnSelector selector, BloomKFilter bloomFilter) + public void add(BaseFloatColumnValueSelector selector, BloomKFilter bloomFilter) { if (NullHandling.replaceWithDefault() || !selector.isNull()) { bloomFilter.addFloat(selector.getFloat()); diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java index 4b6d931e8823..27e7ee928eca 100644 --- 
a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java @@ -20,14 +20,14 @@ package org.apache.druid.query.aggregation.bloom.types; import org.apache.druid.common.config.NullHandling; -import org.apache.druid.segment.LongColumnSelector; +import org.apache.druid.segment.BaseLongColumnValueSelector; import org.apache.hive.common.util.BloomKFilter; public class LongBloomFilterAggregatorColumnValueSelectorStrategy - implements BloomFilterAggregatorColumnSelectorStrategy + implements BloomFilterAggregatorColumnSelectorStrategy { @Override - public void add(LongColumnSelector selector, BloomKFilter bloomFilter) + public void add(BaseLongColumnValueSelector selector, BloomKFilter bloomFilter) { if (NullHandling.replaceWithDefault() || !selector.isNull()) { bloomFilter.addLong(selector.getLong()); diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java index 700f17275b1e..043e39b2c49b 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java @@ -29,14 +29,14 @@ public class StringBloomFilterAggregatorColumnSelectorStrategy public void add(DimensionSelector selector, BloomKFilter bloomFilter) { if (selector.getRow().size() > 1) { - String[] strings = (String[]) selector.getObject(); - for (String value : 
strings) { + selector.getRow().forEach(v -> { + String value = selector.lookupName(v); if (value == null) { bloomFilter.addBytes(null, 0, 0); } else { bloomFilter.addString(value); } - } + }); } else { String value = (String) selector.getObject(); if (value == null) { diff --git a/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java b/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java index aa2b44c4b015..1fd81d71e5b8 100644 --- a/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java +++ b/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java @@ -100,9 +100,6 @@ public class AggregatorUtil public static final byte STRING_FIRST_CACHE_TYPE_ID = 0x2B; public static final byte STRING_LAST_CACHE_TYPE_ID = 0x2C; - public static final byte BLOOM_FILTER_CACHE_TYPE_ID = 0x30; - public static final byte BLOOM_FILTER_MERGE_CACHE_TYPE_ID = 0x31; - // Suppressed aggregator public static final byte SUPPRESSED_AGG_CACHE_TYPE_ID = 0x2D; @@ -113,6 +110,10 @@ public class AggregatorUtil public static final byte HLL_SKETCH_TO_STRING_CACHE_TYPE_ID = 0x31; public static final byte HLL_SKETCH_TO_ESTIMATE_AND_BOUNDS_CACHE_TYPE_ID = 0x32; + // bloom filter extension + public static final byte BLOOM_FILTER_CACHE_TYPE_ID = 0x33; + public static final byte BLOOM_FILTER_MERGE_CACHE_TYPE_ID = 0x34; + /** * returns the list of dependent postAggregators that should be calculated in order to calculate given postAgg * From 935a28a49a3c1f8648ec6286b1e0f576e1fe8399 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Mon, 8 Oct 2018 15:12:28 -0700 Subject: [PATCH 03/36] fix docs --- docs/content/development/extensions-core/bloom-filter.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/content/development/extensions-core/bloom-filter.md b/docs/content/development/extensions-core/bloom-filter.md index 304606580e08..b95dc326b648 100644 --- 
a/docs/content/development/extensions-core/bloom-filter.md +++ b/docs/content/development/extensions-core/bloom-filter.md @@ -38,7 +38,7 @@ present in bloom filter construction, but `test()` says true) decrease. - Lower the false positive probability greater is the space requirement. - Bloom filters are sensitive to number of elements that will be inserted in the bloom filter. -- During the creation of bloom filter expected number of entries must be specified.If the number of insertions exceed +- During the creation of bloom filter expected number of entries must be specified. If the number of insertions exceed the specified initial number of entries then false positive probability will increase accordingly. This extension is built on top of `org.apache.hive.common.util.BloomKFilter`. Internally, this implementation of bloom @@ -87,7 +87,7 @@ Input for a `bloomKFilter` can also be created from a druid query with the `bloo ### JSON Specification of Bloom Filter Aggregator ```json { - "type": "bloomFilter", + "type": "bloom", "name": , "maxNumEntries": "field": From c17f8b59bce86a143459cdc8282756d3ff804092 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 23 Oct 2018 14:16:16 -0700 Subject: [PATCH 04/36] minor test refactor after rebase --- .../bloom/BloomFilterAggregatorTest.java | 66 ++++++++----------- 1 file changed, 29 insertions(+), 37 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java index 57443683b847..60957b787f8d 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java @@ -26,6 +26,7 @@ import org.apache.commons.codec.binary.Base64; 
import org.apache.druid.common.config.NullHandling; import org.apache.druid.guice.BloomFilterExtensionModule; +import org.apache.druid.guice.BloomFilterSerializersModule; import org.apache.druid.jackson.DefaultObjectMapper; import org.apache.druid.query.ColumnSelectorPlus; import org.apache.druid.query.aggregation.Aggregator; @@ -53,7 +54,6 @@ import org.junit.Test; import javax.annotation.Nullable; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.util.Arrays; @@ -113,28 +113,8 @@ public class BloomFilterAggregatorTest filter2 = new BloomKFilter(maxNumValues); BloomKFilter combinedValuesFilter = new BloomKFilter(maxNumValues); - for (String[] values : values1) { - for (String val : values) { - if (!NullHandling.replaceWithDefault() && val == null) { - filter1.addBytes(null, 0, 0); - combinedValuesFilter.addBytes(null, 0, 0); - } else { - filter1.addString(NullHandling.nullToEmptyIfNeeded(val)); - combinedValuesFilter.addString(NullHandling.nullToEmptyIfNeeded(val)); - } - } - } - for (String[] values : values2) { - for (String val : values) { - if (!NullHandling.replaceWithDefault() && val == null) { - filter2.addBytes(null, 0, 0); - combinedValuesFilter.addBytes(null, 0, 0); - } else { - filter2.addString(NullHandling.nullToEmptyIfNeeded(val)); - combinedValuesFilter.addString(NullHandling.nullToEmptyIfNeeded(val)); - } - } - } + createStringFilter(values1, filter1, combinedValuesFilter); + createStringFilter(values2, filter2, combinedValuesFilter); serializedFilter1 = filterToString(filter1); serializedFilter2 = filterToString(filter2); @@ -164,17 +144,19 @@ public class BloomFilterAggregatorTest } } - private final DimensionSpec dimSpec = new DefaultDimensionSpec("dim1", "dim1"); - private BloomFilterAggregatorFactory valueAggregatorFactory; - public BloomFilterAggregatorTest() + private static void createStringFilter(List values, BloomKFilter filter, BloomKFilter combinedValuesFilter) { - - 
valueAggregatorFactory = new BloomFilterAggregatorFactory( - "billy", - dimSpec, - maxNumValues - ); - + for (String[] vals : values) { + for (String val : vals) { + if (!NullHandling.replaceWithDefault() && val == null) { + filter.addBytes(null, 0, 0); + combinedValuesFilter.addBytes(null, 0, 0); + } else { + filter.addString(NullHandling.nullToEmptyIfNeeded(val)); + combinedValuesFilter.addString(NullHandling.nullToEmptyIfNeeded(val)); + } + } + } } private static List dimensionValues(Object... values) @@ -244,12 +226,22 @@ private static void bufferAggregateColumn( private static String filterToString(BloomKFilter bloomKFilter) throws IOException { - ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); - BloomKFilter.serialize(byteArrayOutputStream, bloomKFilter); - byte[] bytes = byteArrayOutputStream.toByteArray(); - return Base64.encodeBase64String(bytes); + return Base64.encodeBase64String(BloomFilterSerializersModule.bloomKFilterToBytes(bloomKFilter)); + } + + private final DimensionSpec dimSpec = new DefaultDimensionSpec("dim1", "dim1"); + private BloomFilterAggregatorFactory valueAggregatorFactory; + + public BloomFilterAggregatorTest() + { + valueAggregatorFactory = new BloomFilterAggregatorFactory( + "billy", + dimSpec, + maxNumValues + ); } + @Test public void testAggregateValues() throws IOException { From 03a99bce6a3e24f9a6c31bf49024ec3aa22d9c76 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 13 Nov 2018 13:17:46 -0800 Subject: [PATCH 05/36] use copied bloomkfilter --- .../druid/query/aggregation/bloom/BloomFilterAggregator.java | 2 +- .../query/aggregation/bloom/BloomFilterAggregatorFactory.java | 2 +- .../query/aggregation/bloom/BloomFilterBufferAggregator.java | 2 +- .../query/aggregation/bloom/BloomFilterMergeAggregator.java | 2 +- .../aggregation/bloom/BloomFilterMergeAggregatorFactory.java | 2 +- .../aggregation/bloom/BloomFilterMergeBufferAggregator.java | 2 +- 
.../types/BloomFilterAggregatorColumnSelectorStrategy.java | 2 +- .../DoubleBloomFilterAggregatorColumnSelectorStrategy.java | 2 +- .../types/FloatBloomFilterAggregatorColumnSelectorStrategy.java | 2 +- .../LongBloomFilterAggregatorColumnValueSelectorStrategy.java | 2 +- .../StringBloomFilterAggregatorColumnSelectorStrategy.java | 2 +- .../query/aggregation/bloom/BloomFilterAggregatorTest.java | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java index f6514bfc378e..7fecf8138080 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java @@ -22,7 +22,7 @@ import org.apache.druid.query.ColumnSelectorPlus; import org.apache.druid.query.aggregation.Aggregator; import org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategy; -import org.apache.hive.common.util.BloomKFilter; +import org.apache.druid.query.filter.BloomKFilter; import javax.annotation.Nullable; diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index 53c3c03b073a..859d60f70ffc 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -33,9 +33,9 @@ import 
org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategyFactory; import org.apache.druid.query.cache.CacheKeyBuilder; import org.apache.druid.query.dimension.DimensionSpec; +import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.segment.ColumnSelectorFactory; import org.apache.druid.segment.DimensionHandlerUtils; -import org.apache.hive.common.util.BloomKFilter; import javax.annotation.Nullable; import java.io.IOException; diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java index ec2b3dfc0146..48242a3e5da0 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java @@ -24,8 +24,8 @@ import org.apache.druid.query.ColumnSelectorPlus; import org.apache.druid.query.aggregation.BufferAggregator; import org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategy; +import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector; -import org.apache.hive.common.util.BloomKFilter; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java index 6a6d2ced6cdf..afb88ed6beef 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java +++ 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java @@ -20,8 +20,8 @@ package org.apache.druid.query.aggregation.bloom; import org.apache.druid.query.aggregation.Aggregator; +import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.segment.ColumnValueSelector; -import org.apache.hive.common.util.BloomKFilter; import javax.annotation.Nullable; diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java index a53a5eab54d6..c6f3f3fd8a23 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java @@ -26,10 +26,10 @@ import org.apache.druid.query.aggregation.NoopAggregator; import org.apache.druid.query.aggregation.NoopBufferAggregator; import org.apache.druid.query.cache.CacheKeyBuilder; +import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.segment.ColumnSelectorFactory; import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.segment.NilColumnValueSelector; -import org.apache.hive.common.util.BloomKFilter; import java.util.Collections; import java.util.List; diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java index 007e4fa88706..f102123fc147 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java +++ 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java @@ -22,9 +22,9 @@ import com.fasterxml.jackson.databind.util.ByteBufferBackedOutputStream; import org.apache.druid.io.ByteBufferInputStream; import org.apache.druid.query.aggregation.BufferAggregator; +import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector; import org.apache.druid.segment.ColumnValueSelector; -import org.apache.hive.common.util.BloomKFilter; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java index 531efdca980b..f1d43937c909 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java @@ -20,7 +20,7 @@ package org.apache.druid.query.aggregation.bloom.types; import org.apache.druid.query.dimension.ColumnSelectorStrategy; -import org.apache.hive.common.util.BloomKFilter; +import org.apache.druid.query.filter.BloomKFilter; public interface BloomFilterAggregatorColumnSelectorStrategy extends ColumnSelectorStrategy { diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java index cbb5b0465aad..7d161455894a 100644 --- 
a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java @@ -20,8 +20,8 @@ package org.apache.druid.query.aggregation.bloom.types; import org.apache.druid.common.config.NullHandling; +import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.segment.BaseDoubleColumnValueSelector; -import org.apache.hive.common.util.BloomKFilter; public class DoubleBloomFilterAggregatorColumnSelectorStrategy implements BloomFilterAggregatorColumnSelectorStrategy diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java index 812f1082e0ba..c70a466bca71 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java @@ -20,8 +20,8 @@ package org.apache.druid.query.aggregation.bloom.types; import org.apache.druid.common.config.NullHandling; +import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.segment.BaseFloatColumnValueSelector; -import org.apache.hive.common.util.BloomKFilter; public class FloatBloomFilterAggregatorColumnSelectorStrategy implements BloomFilterAggregatorColumnSelectorStrategy diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java index 27e7ee928eca..5d63f0d943d7 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java @@ -20,8 +20,8 @@ package org.apache.druid.query.aggregation.bloom.types; import org.apache.druid.common.config.NullHandling; +import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.segment.BaseLongColumnValueSelector; -import org.apache.hive.common.util.BloomKFilter; public class LongBloomFilterAggregatorColumnValueSelectorStrategy implements BloomFilterAggregatorColumnSelectorStrategy diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java index 043e39b2c49b..3058cb13564b 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java @@ -19,8 +19,8 @@ package org.apache.druid.query.aggregation.bloom.types; +import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.segment.DimensionSelector; -import org.apache.hive.common.util.BloomKFilter; public class StringBloomFilterAggregatorColumnSelectorStrategy implements BloomFilterAggregatorColumnSelectorStrategy diff --git 
a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java index 60957b787f8d..0c5e6c06a1e3 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java @@ -43,13 +43,13 @@ import org.apache.druid.query.dimension.ExtractionDimensionSpec; import org.apache.druid.query.dimension.RegexFilteredDimensionSpec; import org.apache.druid.query.extraction.RegexDimExtractionFn; +import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector; import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.segment.DimensionSelector; import org.apache.druid.segment.DoubleColumnSelector; import org.apache.druid.segment.FloatColumnSelector; import org.apache.druid.segment.LongColumnSelector; -import org.apache.hive.common.util.BloomKFilter; import org.junit.Assert; import org.junit.Test; From 21eb78f01efdf98e1b4a9e197bc4ef3de77e1b6d Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 8 Jan 2019 13:13:34 -0800 Subject: [PATCH 06/36] add ByteBuffer methods to BloomKFilter to allow agg to use in place, simplify some things, more tests --- .../guice/BloomFilterSerializersModule.java | 6 + .../bloom/BaseBloomFilterAggregator.java | 66 +++ .../BaseBloomFilterBufferAggregator.java | 84 +++ .../bloom/BloomFilterAggregator.java | 39 +- .../bloom/BloomFilterAggregatorFactory.java | 36 +- .../bloom/BloomFilterBufferAggregator.java | 81 +-- .../bloom/BloomFilterMergeAggregator.java | 59 +- .../BloomFilterMergeAggregatorFactory.java | 3 +- .../BloomFilterMergeBufferAggregator.java | 90 +--- .../aggregation/bloom/BloomFilterSerde.java | 68 
+++ ...ilterAggregatorColumnSelectorStrategy.java | 4 + ...ilterAggregatorColumnSelectorStrategy.java | 12 + ...ilterAggregatorColumnSelectorStrategy.java | 12 + ...AggregatorColumnValueSelectorStrategy.java | 12 + ...ilterAggregatorColumnSelectorStrategy.java | 24 + .../druid/query/filter/BloomKFilter.java | 208 ++++++- .../bloom/BloomFilterAggregatorTest.java | 17 +- .../bloom/BloomFilterGroupByQueryTest.java | 149 +++++ .../druid/query/filter/BloomKFilterTest.java | 508 ++++++++++++++++++ .../src/test/resources/sample.data.tsv | 13 + .../apache/druid/sql/guice/SqlBindings.java | 4 +- 21 files changed, 1234 insertions(+), 261 deletions(-) create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterAggregator.java create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterSerde.java create mode 100644 extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java create mode 100644 extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomKFilterTest.java create mode 100644 extensions-core/druid-bloom-filter/src/test/resources/sample.data.tsv diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java index 2bd5007ac44e..0ac4f728a243 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterSerializersModule.java @@ -28,9 +28,11 @@ import com.fasterxml.jackson.databind.module.SimpleModule; import 
com.fasterxml.jackson.databind.ser.std.StdSerializer; import org.apache.druid.query.aggregation.bloom.BloomFilterAggregatorFactory; +import org.apache.druid.query.aggregation.bloom.BloomFilterSerde; import org.apache.druid.query.filter.BloomDimFilter; import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.query.filter.BloomKFilterHolder; +import org.apache.druid.segment.serde.ComplexMetrics; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; @@ -49,6 +51,10 @@ public BloomFilterSerializersModule() addSerializer(BloomKFilter.class, new BloomKFilterSerializer()); addDeserializer(BloomKFilter.class, new BloomKFilterDeserializer()); addDeserializer(BloomKFilterHolder.class, new BloomKFilterHolderDeserializer()); + + if (ComplexMetrics.getSerdeForType(BLOOM_FILTER_TYPE_NAME) == null) { + ComplexMetrics.registerSerde(BLOOM_FILTER_TYPE_NAME, new BloomFilterSerde()); + } } private static class BloomKFilterSerializer extends StdSerializer diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterAggregator.java new file mode 100644 index 000000000000..e250c7e41131 --- /dev/null +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterAggregator.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.aggregation.bloom; + +import org.apache.druid.query.aggregation.Aggregator; +import org.apache.druid.query.filter.BloomKFilter; + +import javax.annotation.Nullable; + +public abstract class BaseBloomFilterAggregator implements Aggregator +{ + protected final BloomKFilter collector; + + public BaseBloomFilterAggregator(BloomKFilter filter) + { + this.collector = filter; + } + + @Nullable + @Override + public Object get() + { + return collector; + } + + @Override + public float getFloat() + { + throw new UnsupportedOperationException("BloomFilterAggregator does not support getFloat()"); + } + + @Override + public long getLong() + { + throw new UnsupportedOperationException("BloomFilterAggregator does not support getLong()"); + } + + @Override + public double getDouble() + { + throw new UnsupportedOperationException("BloomFilterAggregator does not support getDouble()"); + } + + @Override + public void close() + { + // nothing to close + } +} diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java new file mode 100644 index 000000000000..59d9e832ca02 --- /dev/null +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.aggregation.bloom; + +import org.apache.druid.query.aggregation.BufferAggregator; +import org.apache.druid.query.filter.BloomKFilter; + +import java.io.IOException; +import java.nio.ByteBuffer; + +public abstract class BaseBloomFilterBufferAggregator implements BufferAggregator +{ + private final int maxNumEntries; + + public BaseBloomFilterBufferAggregator(int maxNumEntries) + { + this.maxNumEntries = maxNumEntries; + } + + @Override + public void init(ByteBuffer buf, int position) + { + final ByteBuffer mutationBuffer = buf.duplicate(); + mutationBuffer.position(position); + BloomKFilter filter = new BloomKFilter(maxNumEntries); + try { + BloomKFilter.serialize(mutationBuffer, filter); + } + catch (IOException ex) { + throw new RuntimeException("Failed to initialize bloomK filter", ex); + } + } + + @Override + public Object get(ByteBuffer buf, int position) + { + ByteBuffer mutationBuffer = buf.duplicate(); + mutationBuffer.position(position); + int sizeBytes = 5 + (buf.getInt(position + 1) << 3); + mutationBuffer.limit(position + sizeBytes); + return mutationBuffer.slice(); + } + + @Override + public float getFloat(ByteBuffer buf, int position) + { + throw new 
UnsupportedOperationException("BloomFilterBufferAggregator does not support getFloat()"); + } + + @Override + public long getLong(ByteBuffer buf, int position) + { + throw new UnsupportedOperationException("BloomFilterBufferAggregator does not support getLong()"); + } + + @Override + public double getDouble(ByteBuffer buf, int position) + { + throw new UnsupportedOperationException("BloomFilterBufferAggregator does not support getDouble()"); + } + + @Override + public void close() + { + // nothing to close + } +} diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java index 7fecf8138080..7372dd95ec83 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java @@ -20,24 +20,20 @@ package org.apache.druid.query.aggregation.bloom; import org.apache.druid.query.ColumnSelectorPlus; -import org.apache.druid.query.aggregation.Aggregator; import org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategy; import org.apache.druid.query.filter.BloomKFilter; -import javax.annotation.Nullable; - -public class BloomFilterAggregator implements Aggregator +public class BloomFilterAggregator extends BaseBloomFilterAggregator { private final ColumnSelectorPlus selectorPlus; - private final BloomKFilter collector; public BloomFilterAggregator( ColumnSelectorPlus selectorPlus, int maxNumEntries ) { + super(new BloomKFilter(maxNumEntries)); this.selectorPlus = selectorPlus; - this.collector = new BloomKFilter(maxNumEntries); } @Override @@ -45,35 +41,4 @@ public void aggregate() { selectorPlus.getColumnSelectorStrategy().add(selectorPlus.getSelector(), collector); } - - 
@Nullable - @Override - public Object get() - { - return collector; - } - - @Override - public float getFloat() - { - throw new UnsupportedOperationException("BloomFilterAggregator does not support getFloat()"); - } - - @Override - public long getLong() - { - throw new UnsupportedOperationException("BloomFilterAggregator does not support getLong()"); - } - - @Override - public double getDouble() - { - throw new UnsupportedOperationException("BloomFilterAggregator does not support getDouble()"); - } - - @Override - public void close() - { - // nothing to close - } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index 859d60f70ffc..f18568ed6f34 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -22,7 +22,7 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import org.apache.commons.codec.binary.Base64; -import org.apache.druid.io.ByteBufferInputStream; +import org.apache.druid.guice.BloomFilterSerializersModule; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.query.ColumnSelectorPlus; import org.apache.druid.query.aggregation.Aggregator; @@ -128,31 +128,24 @@ public List getRequiredColumns() @Override public Object deserialize(Object object) { - final ByteBuffer buffer; - - if (object instanceof byte[]) { - buffer = ByteBuffer.wrap((byte[]) object); - } else if (object instanceof ByteBuffer) { - // Be conservative, don't assume we own this buffer. 
- buffer = ((ByteBuffer) object).duplicate(); - } else if (object instanceof String) { - buffer = ByteBuffer.wrap(Base64.decodeBase64(StringUtils.toUtf8((String) object))); + if (object instanceof String) { + return ByteBuffer.wrap(Base64.decodeBase64(StringUtils.toUtf8((String) object))); } else { - return object; - } - - ByteBufferInputStream byteBufferInputStream = new ByteBufferInputStream(buffer); - try { - return BloomKFilter.deserialize(byteBufferInputStream); - } - catch (IOException ex) { - throw new RuntimeException("Failed to deserialize bloomK filter", ex); + throw new RuntimeException("Failed to deserialize BloomKFilter"); } } @Override public Object finalizeComputation(Object object) { + if (object instanceof ByteBuffer) { + try { + return BloomKFilter.deserialize((ByteBuffer) object); + } + catch (IOException ioe) { + throw new RuntimeException("Failed to deserialize BloomKFilter"); + } + } return object; } @@ -184,14 +177,13 @@ public List requiredFields() @Override public String getTypeName() { - return "bloomFilter"; + return BloomFilterSerializersModule.BLOOM_FILTER_TYPE_NAME; } @Override public int getMaxIntermediateSize() { - BloomKFilter throwaway = new BloomKFilter(maxNumEntries); - return (throwaway.getBitSet().length * Long.BYTES) + 5; + return BloomKFilter.computeSizeBytes(maxNumEntries); } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java index 48242a3e5da0..1a50a52f3703 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java @@ -19,104 +19,33 @@ package org.apache.druid.query.aggregation.bloom; -import 
com.fasterxml.jackson.databind.util.ByteBufferBackedOutputStream; -import org.apache.druid.io.ByteBufferInputStream; import org.apache.druid.query.ColumnSelectorPlus; -import org.apache.druid.query.aggregation.BufferAggregator; import org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategy; -import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector; -import java.io.IOException; import java.nio.ByteBuffer; -public class BloomFilterBufferAggregator implements BufferAggregator +public class BloomFilterBufferAggregator extends BaseBloomFilterBufferAggregator { private final ColumnSelectorPlus selectorPlus; - private final int maxNumEntries; public BloomFilterBufferAggregator( ColumnSelectorPlus selectorPlus, int maxNumEntries ) { + super(maxNumEntries); this.selectorPlus = selectorPlus; - this.maxNumEntries = maxNumEntries; - } - - @Override - public void init(ByteBuffer buf, int position) - { - final ByteBuffer mutationBuffer = buf.duplicate(); - mutationBuffer.position(position); - BloomKFilter filter = new BloomKFilter(maxNumEntries); - ByteBufferBackedOutputStream outputStream = new ByteBufferBackedOutputStream(mutationBuffer); - try { - BloomKFilter.serialize(outputStream, filter); - } - catch (IOException ex) { - throw new RuntimeException("Failed to initialize bloomK filter", ex); - } } @Override public void aggregate(ByteBuffer buf, int position) { final int oldPosition = buf.position(); - final int oldLimit = buf.limit(); - try { - buf.position(position); - BloomKFilter collector = BloomKFilter.deserialize(new ByteBufferInputStream(buf)); - selectorPlus.getColumnSelectorStrategy().add(selectorPlus.getSelector(), collector); - buf.position(position); - ByteBufferBackedOutputStream out = new ByteBufferBackedOutputStream(buf); - BloomKFilter.serialize(out, collector); - } - catch (IOException ex) { - throw new RuntimeException("Failed to merge bloomK filters", 
ex); - } - finally { - buf.position(oldPosition); - buf.limit(oldLimit); - } - } - - @Override - public Object get(ByteBuffer buf, int position) - { - try { - ByteBuffer mutationBuffer = buf.duplicate(); - mutationBuffer.position(position); - return BloomKFilter.deserialize(new ByteBufferInputStream(mutationBuffer)); - } - catch (IOException ex) { - throw new RuntimeException("Failed to deserialize bloomK filter", ex); - } - } - - @Override - public float getFloat(ByteBuffer buf, int position) - { - throw new UnsupportedOperationException("BloomFilterBufferAggregator does not support getFloat()"); - } - - @Override - public long getLong(ByteBuffer buf, int position) - { - throw new UnsupportedOperationException("BloomFilterBufferAggregator does not support getLong()"); - } - - @Override - public double getDouble(ByteBuffer buf, int position) - { - throw new UnsupportedOperationException("BloomFilterBufferAggregator does not support getDouble()"); - } - - @Override - public void close() - { - // nothing to close + buf.position(position); + selectorPlus.getColumnSelectorStrategy().bufferAdd(selectorPlus.getSelector(), buf); + buf.position(oldPosition); } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java index afb88ed6beef..a0cdbcea53ff 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java @@ -19,61 +19,42 @@ package org.apache.druid.query.aggregation.bloom; -import org.apache.druid.query.aggregation.Aggregator; import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.segment.ColumnValueSelector; -import javax.annotation.Nullable; 
+import java.io.IOException; +import java.nio.ByteBuffer; -public class BloomFilterMergeAggregator implements Aggregator +public class BloomFilterMergeAggregator extends BaseBloomFilterAggregator { private final ColumnValueSelector selector; - private final BloomKFilter collector; public BloomFilterMergeAggregator(ColumnValueSelector selector, int maxNumEntries) { + super(new BloomKFilter(maxNumEntries)); this.selector = selector; - this.collector = new BloomKFilter(maxNumEntries); } @Override public void aggregate() { - BloomKFilter other = selector.getObject(); + Object other = selector.getObject(); if (other != null) { - collector.merge(other); + if (other instanceof BloomKFilter) { + collector.merge((BloomKFilter) other); + } else if (other instanceof ByteBuffer) { + // The bloom filter agg factory's deserialize returns a ByteBuffer to avoid unnecessary serde, but group by v1 + // ends up merging ByteBuffers from the buffer aggs with this agg instead of the buffer merge agg, so + // ByteBuffer values must be handled here. Group by v1 also requires a 'ComplexMetricSerde' to be registered + // even for query-time-only aggs, although it never uses it.
+ try { + BloomKFilter otherFilter = BloomKFilter.deserialize((ByteBuffer) other); + collector.merge(otherFilter); + } + catch (IOException ioe) { + throw new RuntimeException("Failed to deserialize BloomKFilter", ioe); + } + } } } - - @Nullable - @Override - public Object get() - { - return collector; - } - - - @Override - public float getFloat() - { - throw new UnsupportedOperationException("BloomFilterMergeAggregator does not support getFloat()"); - } - - @Override - public long getLong() - { - throw new UnsupportedOperationException("BloomFilterMergeAggregator does not support getLong()"); - } - - @Override - public double getDouble() - { - throw new UnsupportedOperationException("BloomFilterMergeAggregator does not support getDouble()"); - } - - @Override - public void close() - { - // nothing to close - } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java index c6f3f3fd8a23..1136a3136eae 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java @@ -31,6 +31,7 @@ import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.segment.NilColumnValueSelector; +import java.nio.ByteBuffer; import java.util.Collections; import java.util.List; @@ -57,7 +58,7 @@ public Aggregator factorize(final ColumnSelectorFactory metricFactory) @Override public BufferAggregator factorizeBuffered(final ColumnSelectorFactory metricFactory) { - final ColumnValueSelector selector = metricFactory.makeColumnValueSelector(fieldName); + final ColumnValueSelector selector = metricFactory.makeColumnValueSelector(fieldName); if (selector 
instanceof NilColumnValueSelector) { return NoopBufferAggregator.instance(); } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java index f102123fc147..1bafcb4c01d6 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java @@ -19,102 +19,32 @@ package org.apache.druid.query.aggregation.bloom; -import com.fasterxml.jackson.databind.util.ByteBufferBackedOutputStream; -import org.apache.druid.io.ByteBufferInputStream; -import org.apache.druid.query.aggregation.BufferAggregator; import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector; import org.apache.druid.segment.ColumnValueSelector; -import java.io.IOException; import java.nio.ByteBuffer; -public class BloomFilterMergeBufferAggregator implements BufferAggregator +public class BloomFilterMergeBufferAggregator extends BaseBloomFilterBufferAggregator { - private final ColumnValueSelector selector; - private final int maxNumEntries; + private final ColumnValueSelector selector; - public BloomFilterMergeBufferAggregator(ColumnValueSelector selector, int maxNumEntries) + public BloomFilterMergeBufferAggregator(ColumnValueSelector selector, int maxNumEntries) { + super(maxNumEntries); this.selector = selector; - this.maxNumEntries = maxNumEntries; - } - - @Override - public void init(ByteBuffer buf, int position) - { - final ByteBuffer mutationBuffer = buf.duplicate(); - mutationBuffer.position(position); - BloomKFilter filter = new BloomKFilter(maxNumEntries); - ByteBufferBackedOutputStream outputStream = new 
ByteBufferBackedOutputStream(mutationBuffer); - try { - BloomKFilter.serialize(outputStream, filter); - } - catch (IOException ex) { - throw new RuntimeException("Failed to initialize bloomK filter", ex); - } } @Override public void aggregate(ByteBuffer buf, int position) { final int oldPosition = buf.position(); - final int oldLimit = buf.limit(); - try { - buf.position(position); - BloomKFilter collector = BloomKFilter.deserialize(new ByteBufferInputStream(buf)); - BloomKFilter other = selector.getObject(); - if (other != null) { - collector.merge(other); - buf.position(position); - ByteBufferBackedOutputStream out = new ByteBufferBackedOutputStream(buf); - BloomKFilter.serialize(out, collector); - } - } - catch (IOException ex) { - throw new RuntimeException("Failed to merge bloomK filters", ex); - } - finally { - buf.position(oldPosition); - buf.limit(oldLimit); - } - } - - @Override - public Object get(ByteBuffer buf, int position) - { - try { - ByteBuffer mutationBuffer = buf.duplicate(); - mutationBuffer.position(position); - return BloomKFilter.deserialize(new ByteBufferInputStream(mutationBuffer)); - } - catch (IOException ex) { - throw new RuntimeException("Failed to deserialize bloomK filter", ex); - } - } - - @Override - public float getFloat(ByteBuffer buf, int position) - { - throw new UnsupportedOperationException("BloomFilterMergeBufferAggregator does not support getFloat()"); - } - - @Override - public long getLong(ByteBuffer buf, int position) - { - throw new UnsupportedOperationException("BloomFilterMergeBufferAggregator does not support getLong()"); - } - - @Override - public double getDouble(ByteBuffer buf, int position) - { - throw new UnsupportedOperationException("BloomFilterMergeBufferAggregator does not support getDouble()"); - } - - @Override - public void close() - { - // nothing to close + buf.position(position); + // size is 5 header bytes + length of long array + int sizeBytes = 5 + (buf.getInt(position + 1) << 3); + ByteBuffer other 
= selector.getObject(); + BloomKFilter.mergeBloomFilterByteBuffers(buf, position, sizeBytes, other, other.position(), sizeBytes); + buf.position(oldPosition); } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterSerde.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterSerde.java new file mode 100644 index 000000000000..6ffcfb8a721e --- /dev/null +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterSerde.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.query.aggregation.bloom; + +import org.apache.druid.guice.BloomFilterSerializersModule; +import org.apache.druid.query.filter.BloomKFilter; +import org.apache.druid.segment.GenericColumnSerializer; +import org.apache.druid.segment.column.ColumnBuilder; +import org.apache.druid.segment.data.ObjectStrategy; +import org.apache.druid.segment.serde.ComplexMetricExtractor; +import org.apache.druid.segment.serde.ComplexMetricSerde; +import org.apache.druid.segment.writeout.SegmentWriteOutMedium; + +import java.nio.ByteBuffer; + +/** + * This exists so bloom filter agg has something to register so group by v1 will work, but isn't actually used + * because bloom filter agg is currently query time only + */ +public class BloomFilterSerde extends ComplexMetricSerde +{ + @Override + public String getTypeName() + { + return BloomFilterSerializersModule.BLOOM_FILTER_TYPE_NAME; + } + + @Override + public ComplexMetricExtractor getExtractor() + { + throw new UnsupportedOperationException("How can this be?"); + } + + @Override + public void deserializeColumn(ByteBuffer byteBuffer, ColumnBuilder columnBuilder) + { + throw new UnsupportedOperationException("How can this be?"); + } + + @Override + public GenericColumnSerializer getSerializer(SegmentWriteOutMedium segmentWriteOutMedium, String column) + { + throw new UnsupportedOperationException("How can this be?"); + } + + @Override + public ObjectStrategy getObjectStrategy() + { + throw new UnsupportedOperationException("How can this be?"); + } +} diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java index f1d43937c909..c908f52ce6c0 100644 --- 
a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java @@ -22,10 +22,14 @@ import org.apache.druid.query.dimension.ColumnSelectorStrategy; import org.apache.druid.query.filter.BloomKFilter; +import java.nio.ByteBuffer; + public interface BloomFilterAggregatorColumnSelectorStrategy extends ColumnSelectorStrategy { /** * Add column value to bloomK filter */ void add(TValueSelector selector, BloomKFilter bloomFilter); + + void bufferAdd(TValueSelector selector, ByteBuffer buffer); } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java index 7d161455894a..1174cd187e01 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java @@ -23,6 +23,8 @@ import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.segment.BaseDoubleColumnValueSelector; +import java.nio.ByteBuffer; + public class DoubleBloomFilterAggregatorColumnSelectorStrategy implements BloomFilterAggregatorColumnSelectorStrategy { @@ -33,4 +35,14 @@ public void add(BaseDoubleColumnValueSelector selector, BloomKFilter bloomFilter bloomFilter.addDouble(selector.getDouble()); } } + + @Override + public void bufferAdd(BaseDoubleColumnValueSelector selector, ByteBuffer buffer) + { + if (NullHandling.replaceWithDefault() || !selector.isNull()) { + 
BloomKFilter.addDouble(buffer, selector.getDouble()); + } else { + BloomKFilter.addBytes(buffer, null, 0, 0); + } + } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java index c70a466bca71..eddab583e00f 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java @@ -23,6 +23,8 @@ import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.segment.BaseFloatColumnValueSelector; +import java.nio.ByteBuffer; + public class FloatBloomFilterAggregatorColumnSelectorStrategy implements BloomFilterAggregatorColumnSelectorStrategy { @@ -33,4 +35,14 @@ public void add(BaseFloatColumnValueSelector selector, BloomKFilter bloomFilter) bloomFilter.addFloat(selector.getFloat()); } } + + @Override + public void bufferAdd(BaseFloatColumnValueSelector selector, ByteBuffer buffer) + { + if (NullHandling.replaceWithDefault() || !selector.isNull()) { + BloomKFilter.addFloat(buffer, selector.getFloat()); + } else { + BloomKFilter.addBytes(buffer, null, 0, 0); + } + } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java index 5d63f0d943d7..f7cc36053d5e 100644 --- 
a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java @@ -23,6 +23,8 @@ import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.segment.BaseLongColumnValueSelector; +import java.nio.ByteBuffer; + public class LongBloomFilterAggregatorColumnValueSelectorStrategy implements BloomFilterAggregatorColumnSelectorStrategy { @@ -33,4 +35,14 @@ public void add(BaseLongColumnValueSelector selector, BloomKFilter bloomFilter) bloomFilter.addLong(selector.getLong()); } } + + @Override + public void bufferAdd(BaseLongColumnValueSelector selector, ByteBuffer buffer) + { + if (NullHandling.replaceWithDefault() || !selector.isNull()) { + BloomKFilter.addLong(buffer, selector.getLong()); + } else { + BloomKFilter.addBytes(buffer, null, 0, 0); + } + } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java index 3058cb13564b..db714fdc11dc 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java @@ -22,6 +22,8 @@ import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.segment.DimensionSelector; +import java.nio.ByteBuffer; + public class StringBloomFilterAggregatorColumnSelectorStrategy implements BloomFilterAggregatorColumnSelectorStrategy { @@ -46,4 +48,26 @@ public void 
add(DimensionSelector selector, BloomKFilter bloomFilter) } } } + + @Override + public void bufferAdd(DimensionSelector selector, ByteBuffer buffer) + { + if (selector.getRow().size() > 1) { + selector.getRow().forEach(v -> { + String value = selector.lookupName(v); + if (value == null) { + BloomKFilter.addBytes(buffer, null, 0, 0); + } else { + BloomKFilter.addString(buffer, value); + } + }); + } else { + String value = (String) selector.getObject(); + if (value == null) { + BloomKFilter.addBytes(buffer, null, 0, 0); + } else { + BloomKFilter.addString(buffer, value); + } + } + } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java index 29492872966b..908d4c45dc67 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java @@ -27,6 +27,8 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.Arrays; /** @@ -38,7 +40,13 @@ * https://github.com/apache/hive/commit/87ce36b458350db141c4cb4b6336a9a01796370f#diff-e65fc506757ee058dc951d15a9a526c3L238 * and this linked issue https://issues.apache.org/jira/browse/HIVE-20101. * - * Todo: remove this and begin using hive-storage-api version again once https://issues.apache.org/jira/browse/HIVE-20893 is released + * Additionally, a handful of methods have been added to in situ work with BloomKFilters that have been serialized to a + * ByteBuffer, e.g. all add and merge methods. Test methods were not added because we don't need them.. but would + * probably be chill to do so it is symmetrical.
+ * + * Todo: remove this and begin using hive-storage-api version again once + * https://issues.apache.org/jira/browse/HIVE-20893 is released and if/when static ByteBuffer methods have been merged + * (or alternatively, move them to some sort of utils class) * * begin copy-pasta: * @@ -62,7 +70,7 @@ public class BloomKFilter private static final int DEFAULT_BLOCK_SIZE_BITS = (int) (Math.log(DEFAULT_BLOCK_SIZE) / Math.log(2)); private static final int DEFAULT_BLOCK_OFFSET_MASK = DEFAULT_BLOCK_SIZE - 1; private static final int DEFAULT_BIT_OFFSET_MASK = Long.SIZE - 1; - private final ThreadLocal BYTE_ARRAY_4 = ThreadLocal.withInitial(() -> new byte[4]); + private static final ThreadLocal BYTE_ARRAY_4 = ThreadLocal.withInitial(() -> new byte[4]); private final BitSet bitSet; private final int m; private final int k; @@ -210,6 +218,200 @@ public static void mergeBloomFilterBytes( } } + /** + * Serialize a bloom filter + * + * @param out output buffer to write to + * @param bloomFilter BloomKFilter that needs to be serialized + */ + public static void serialize(ByteBuffer out, BloomKFilter bloomFilter) throws IOException + { + /** + * Serialized BloomKFilter format: + * 1 byte for the number of hash functions.
+ * 1 big endian int (that is how OutputStream works) for the number of longs in the bitset + * big endian longs in the BloomKFilter bitset + */ + ByteBuffer view = out.duplicate().order(ByteOrder.BIG_ENDIAN); + view.put((byte) bloomFilter.k); + view.putInt(bloomFilter.getBitSet().length); + for (long value : bloomFilter.getBitSet()) { + view.putLong(value); + } + } + + /** + * Deserialize a bloom filter + * Read a byte stream, which was written by {@linkplain #serialize(OutputStream, BloomKFilter)} + * into a {@code BloomKFilter} + * + * @param in input bytestream + * + * @return deserialized BloomKFilter + */ + public static BloomKFilter deserialize(ByteBuffer in) throws IOException + { + if (in == null) { + throw new IOException("Input stream is null"); + } + + try { + ByteBuffer dataBuffer = in.duplicate().order(ByteOrder.BIG_ENDIAN); + int numHashFunc = dataBuffer.get(); + int bitsetArrayLen = dataBuffer.getInt(); + long[] data = new long[bitsetArrayLen]; + for (int i = 0; i < bitsetArrayLen; i++) { + data[i] = dataBuffer.getLong(); + } + return new BloomKFilter(data, numHashFunc); + } + catch (RuntimeException e) { + IOException io = new IOException("Unable to deserialize BloomKFilter"); + io.initCause(e); + throw io; + } + } + + /** + * Merges BloomKFilter bf2 into bf1. + * Assumes 2 BloomKFilters with the same size/hash functions are serialized to ByteBuffers + * + * @param bf1Bytes + * @param bf1Start + * @param bf1Length + * @param bf2Bytes + * @param bf2Start + * @param bf2Length + */ + public static void mergeBloomFilterByteBuffers( + ByteBuffer bf1Bytes, + int bf1Start, + int bf1Length, + ByteBuffer bf2Bytes, + int bf2Start, + int bf2Length + ) + { + if (bf1Length != bf2Length) { + throw new IllegalArgumentException("bf1Length " + bf1Length + " does not match bf2Length " + bf2Length); + } + + // Validation on the bitset size/3 hash functions.
+ for (int idx = 0; idx < START_OF_SERIALIZED_LONGS; ++idx) { + if (bf1Bytes.get(bf1Start + idx) != bf2Bytes.get(bf2Start + idx)) { + throw new IllegalArgumentException("bf1 NumHashFunctions/NumBits does not match bf2"); + } + } + + // Just bitwise-OR the bits together - size/# functions should be the same, + // rest of the data is serialized long values for the bitset which are supposed to be bitwise-ORed. + for (int idx = START_OF_SERIALIZED_LONGS; idx < bf1Length; ++idx) { + final int pos1 = bf1Start + idx; + final int pos2 = bf2Start + idx; + final byte val = (byte) (bf1Bytes.get(pos1) | bf2Bytes.get(pos2)); + bf1Bytes.put(pos1, val); + } + } + + /** + * Calculate size in bytes of a BloomKFilter for a given number of entries + * @param maxNumEntries + * @return + */ + public static int computeSizeBytes(long maxNumEntries) + { + checkArgument(maxNumEntries > 0, "expectedEntries should be > 0"); + long numBits = optimalNumOfBits(maxNumEntries, DEFAULT_FPP); + + int nLongs = (int) Math.ceil((double) numBits / (double) Long.SIZE); + int padLongs = DEFAULT_BLOCK_SIZE - nLongs % DEFAULT_BLOCK_SIZE; + return START_OF_SERIALIZED_LONGS + ((nLongs + padLongs) * Long.BYTES); + } + + public static void add(ByteBuffer buffer, byte[] val) + { + addBytes(buffer, val); + } + + public static void addBytes(ByteBuffer buffer, byte[] val, int offset, int length) + { + long hash64 = val == null ?
Murmur3.NULL_HASHCODE : + Murmur3.hash64(val, offset, length); + addHash(buffer, hash64); + } + + public static void addBytes(ByteBuffer buffer, byte[] val) + { + addBytes(buffer, val, 0, val.length); + } + + public static void addHash(ByteBuffer buffer, long hash64) + { + final int hash1 = (int) hash64; + final int hash2 = (int) (hash64 >>> 32); + + int firstHash = hash1 + hash2; + // hashcode should be positive, flip all the bits if it's negative + if (firstHash < 0) { + firstHash = ~firstHash; + } + + ByteBuffer view = buffer.duplicate().order(ByteOrder.BIG_ENDIAN); + int startPosition = view.position(); + int numHashFuncs = view.get(startPosition); + int totalBlockCount = view.getInt(startPosition + 1) / DEFAULT_BLOCK_SIZE; + // first hash is used to locate start of the block (blockBaseOffset) + // subsequent K hashes are used to generate K bits within a block of words + final int blockIdx = firstHash % totalBlockCount; + final int blockBaseOffset = blockIdx << DEFAULT_BLOCK_SIZE_BITS; + for (int i = 1; i <= numHashFuncs; i++) { + int combinedHash = hash1 + ((i + 1) * hash2); + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = ~combinedHash; + } + // LSB 3 bits is used to locate offset within the block + final int absOffset = blockBaseOffset + (combinedHash & DEFAULT_BLOCK_OFFSET_MASK); + // Next 6 bits are used to locate offset within a long/word + final int bitPos = (combinedHash >>> DEFAULT_BLOCK_SIZE_BITS) & DEFAULT_BIT_OFFSET_MASK; + + final int bufPos = startPosition + START_OF_SERIALIZED_LONGS + (absOffset * Long.BYTES); + view.putLong(bufPos, view.getLong(bufPos) | (1L << bitPos)); + } + } + + public static void addString(ByteBuffer buffer, String val) + { + addBytes(buffer, StringUtils.toUtf8(val)); + } + + public static void addByte(ByteBuffer buffer, byte val) + { + addBytes(buffer, new byte[]{val}); + } + + public static void addInt(ByteBuffer buffer, int val) + { + // puts int in little 
endian order + addBytes(buffer, intToByteArrayLE(val)); + } + + public static void addLong(ByteBuffer buffer, long val) + { + // puts long in little endian order + addHash(buffer, Murmur3.hash64(val)); + } + + public static void addFloat(ByteBuffer buffer, float val) + { + addInt(buffer, Float.floatToIntBits(val)); + } + + public static void addDouble(ByteBuffer buffer, double val) + { + addLong(buffer, Double.doubleToLongBits(val)); + } + public void add(byte[] val) { addBytes(val); @@ -381,7 +583,7 @@ public boolean testDouble(double val) return testLong(Double.doubleToLongBits(val)); } - private byte[] intToByteArrayLE(int val) + private static byte[] intToByteArrayLE(int val) { byte[] bytes = BYTE_ARRAY_4.get(); bytes[0] = (byte) (val >> 0); diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java index 0c5e6c06a1e3..45b62cbb9a7a 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java @@ -432,8 +432,13 @@ public void testMergeValues() throws IOException @Test public void testBuferMergeValues() throws IOException { - final TestBloomFilterColumnSelector mergeDim = - new TestBloomFilterColumnSelector(ImmutableList.of(filter1, filter2)); + final TestBloomFilterBufferColumnSelector mergeDim = + new TestBloomFilterBufferColumnSelector( + ImmutableList.of( + ByteBuffer.wrap(BloomFilterSerializersModule.bloomKFilterToBytes(filter1)), + ByteBuffer.wrap(BloomFilterSerializersModule.bloomKFilterToBytes(filter2)) + ) + ); BloomFilterMergeBufferAggregator mergeAggregator = new BloomFilterMergeBufferAggregator(mergeDim, maxNumValues); @@ -578,6 +583,14 @@ public 
TestBloomFilterColumnSelector(List values) } } + public static class TestBloomFilterBufferColumnSelector extends SteppableSelector + { + public TestBloomFilterBufferColumnSelector(List values) + { + super(values); + } + } + public static class TestLongColumnSelector extends SteppableSelector implements LongColumnSelector { public TestLongColumnSelector(List values) diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java new file mode 100644 index 000000000000..542acdf454ad --- /dev/null +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.query.aggregation.bloom; + +import com.google.common.collect.Lists; +import org.apache.druid.common.config.NullHandling; +import org.apache.druid.data.input.MapBasedRow; +import org.apache.druid.guice.BloomFilterExtensionModule; +import org.apache.druid.java.util.common.granularity.Granularities; +import org.apache.druid.java.util.common.guava.Sequence; +import org.apache.druid.query.aggregation.AggregationTestHelper; +import org.apache.druid.query.filter.BloomKFilter; +import org.apache.druid.query.groupby.GroupByQueryConfig; +import org.apache.druid.query.groupby.GroupByQueryRunnerTest; +import org.junit.After; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +@RunWith(Parameterized.class) +public class BloomFilterGroupByQueryTest +{ + private AggregationTestHelper helper; + + @Rule + public final TemporaryFolder tempFolder = new TemporaryFolder(); + + public BloomFilterGroupByQueryTest(final GroupByQueryConfig config) + { + BloomFilterExtensionModule module = new BloomFilterExtensionModule(); + module.configure(null); + helper = AggregationTestHelper.createGroupByQueryAggregationTestHelper( + Lists.newArrayList(module.getJacksonModules()), + config, + tempFolder + ); + } + + @Parameterized.Parameters(name = "{0}") + public static Collection constructorFeeder() + { + final List constructors = new ArrayList<>(); + for (GroupByQueryConfig config : GroupByQueryRunnerTest.testConfigs()) { + constructors.add(new Object[]{config}); + } + return constructors; + } + + @After + public void teardown() throws IOException + { + helper.close(); + } + + @Test + public void testIngestWithNullsIgnoredAndQuery() throws Exception + { + MapBasedRow row = ingestAndQuery(true); + Object o = 
row.getRaw("blooming_quality"); + Assert.assertTrue(((BloomKFilter) row.getRaw("blooming_quality")).testString("mezzanine")); + Assert.assertTrue(((BloomKFilter) row.getRaw("blooming_quality")).testString("premium")); + Assert.assertFalse(((BloomKFilter) row.getRaw("blooming_quality")).testString("entertainment")); + + } + + @Test + public void testIngestWithNullsToZeroAndQuery() throws Exception + { + // Nulls are ignored and not replaced with default for SQL compatible null handling. + // This is already tested in testIngestWithNullsIgnoredAndQuery() + if (NullHandling.replaceWithDefault()) { + MapBasedRow row = ingestAndQuery(false); + Assert.assertTrue(((BloomKFilter) row.getRaw("blooming_quality")).testString("mezzanine")); + Assert.assertTrue(((BloomKFilter) row.getRaw("blooming_quality")).testString("premium")); + Assert.assertFalse(((BloomKFilter) row.getRaw("blooming_quality")).testString("entertainment")); + } + } + + private MapBasedRow ingestAndQuery(boolean ignoreNulls) throws Exception + { + String metricSpec = "[{ \"type\": \"count\", \"name\": \"count\"}]"; + + String parseSpec = "{" + + "\"type\" : \"string\"," + + "\"parseSpec\" : {" + + " \"format\" : \"tsv\"," + + " \"timestampSpec\" : {" + + " \"column\" : \"timestamp\"," + + " \"format\" : \"auto\"" + + "}," + + " \"dimensionsSpec\" : {" + + " \"dimensions\": []," + + " \"dimensionExclusions\" : []," + + " \"spatialDimensions\" : []" + + " }," + + " \"columns\": [\"timestamp\", \"market\", \"quality\", \"placement\", \"placementish\", \"index\"]" + + " }" + + "}"; + + String query = "{" + + "\"queryType\": \"groupBy\"," + + "\"dataSource\": \"test_datasource\"," + + "\"granularity\": \"ALL\"," + + "\"dimensions\": []," + + "\"filter\":{ \"type\":\"selector\", \"dimension\":\"market\", \"value\":\"upfront\"}," + + "\"aggregations\": [" + + " { \"type\": \"bloom\", \"name\": \"blooming_quality\", \"field\": \"quality\" }" + + "]," + + "\"intervals\": [ \"1970/2050\" ]" + + "}"; + + Sequence seq 
= helper.createIndexAndRunQueryOnSegment( + this.getClass().getClassLoader().getResourceAsStream("sample.data.tsv"), + parseSpec, + metricSpec, + 0, + Granularities.NONE, + 50000, + query + ); + + return (MapBasedRow) seq.toList().get(0); + } +} diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomKFilterTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomKFilterTest.java new file mode 100644 index 000000000000..5da1059354f7 --- /dev/null +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomKFilterTest.java @@ -0,0 +1,508 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.query.filter; + +import org.apache.druid.io.ByteBufferInputStream; +import org.junit.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; + +import static org.junit.Assert.assertEquals; + +public class BloomKFilterTest +{ + private static final int COUNT = 100; + Random rand = new Random(123); + + @Test + public void testBloomKFilterBytes() throws IOException + { + BloomKFilter bf = new BloomKFilter(10000); + ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(); + BloomKFilter.serialize(bytesOut, bf); + byte[] bfBytes = bytesOut.toByteArray(); + ByteBuffer buffer = ByteBuffer.wrap(bfBytes); + + byte[] val = new byte[]{1, 2, 3}; + byte[] val1 = new byte[]{1, 2, 3, 4}; + byte[] val2 = new byte[]{1, 2, 3, 4, 5}; + byte[] val3 = new byte[]{1, 2, 3, 4, 5, 6}; + + + bf.add(val); + BloomKFilter.add(buffer, val); + BloomKFilter rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.test(val)); + assertEquals(false, rehydrated.test(val1)); + assertEquals(false, rehydrated.test(val2)); + assertEquals(false, rehydrated.test(val3)); + BloomKFilter.add(buffer, val1); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.test(val)); + assertEquals(true, rehydrated.test(val1)); + assertEquals(false, rehydrated.test(val2)); + assertEquals(false, rehydrated.test(val3)); + BloomKFilter.add(buffer, val2); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.test(val)); + assertEquals(true, rehydrated.test(val1)); + assertEquals(true, rehydrated.test(val2)); + assertEquals(false, rehydrated.test(val3)); + BloomKFilter.add(buffer, val3); + rehydrated = BloomKFilter.deserialize(new 
ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.test(val)); + assertEquals(true, rehydrated.test(val1)); + assertEquals(true, rehydrated.test(val2)); + assertEquals(true, rehydrated.test(val3)); + + byte[] randVal = new byte[COUNT]; + for (int i = 0; i < COUNT; i++) { + rand.nextBytes(randVal); + BloomKFilter.add(buffer, randVal); + } + // last value should be present + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + assertEquals(true, rehydrated.test(randVal)); + // most likely this value should not exist + randVal[0] = 0; + randVal[1] = 0; + randVal[2] = 0; + randVal[3] = 0; + randVal[4] = 0; + assertEquals(false, rehydrated.test(randVal)); + + assertEquals(7808, rehydrated.sizeInBytes()); + } + + @Test + public void testBloomKFilterByte() throws IOException + { + BloomKFilter bf = new BloomKFilter(10000); + ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(); + BloomKFilter.serialize(bytesOut, bf); + byte[] bfBytes = bytesOut.toByteArray(); + ByteBuffer buffer = ByteBuffer.wrap(bfBytes); + + byte val = Byte.MIN_VALUE; + byte val1 = 1; + byte val2 = 2; + byte val3 = Byte.MAX_VALUE; + + BloomKFilter.addLong(buffer, val); + BloomKFilter rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testLong(val)); + assertEquals(false, rehydrated.testLong(val1)); + assertEquals(false, rehydrated.testLong(val2)); + assertEquals(false, rehydrated.testLong(val3)); + BloomKFilter.addLong(buffer, val1); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testLong(val)); + assertEquals(true, rehydrated.testLong(val1)); + assertEquals(false, rehydrated.testLong(val2)); + assertEquals(false, rehydrated.testLong(val3)); + BloomKFilter.addLong(buffer, val2); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + 
assertEquals(true, rehydrated.testLong(val)); + assertEquals(true, rehydrated.testLong(val1)); + assertEquals(true, rehydrated.testLong(val2)); + assertEquals(false, rehydrated.testLong(val3)); + BloomKFilter.addLong(buffer, val3); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testLong(val)); + assertEquals(true, rehydrated.testLong(val1)); + assertEquals(true, rehydrated.testLong(val2)); + assertEquals(true, rehydrated.testLong(val3)); + + byte randVal = 0; + for (int i = 0; i < COUNT; i++) { + randVal = (byte) rand.nextInt(Byte.MAX_VALUE); + BloomKFilter.addLong(buffer, randVal); + } + + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + + // last value should be present + assertEquals(true, rehydrated.testLong(randVal)); + // most likely this value should not exist + assertEquals(false, rehydrated.testLong((byte) -120)); + + assertEquals(7808, rehydrated.sizeInBytes()); + } + + @Test + public void testBloomKFilterInt() throws IOException + { + BloomKFilter bf = new BloomKFilter(10000); + ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(); + BloomKFilter.serialize(bytesOut, bf); + byte[] bfBytes = bytesOut.toByteArray(); + ByteBuffer buffer = ByteBuffer.wrap(bfBytes); + + int val = Integer.MIN_VALUE; + int val1 = 1; + int val2 = 2; + int val3 = Integer.MAX_VALUE; + + BloomKFilter.addLong(buffer, val); + BloomKFilter rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testLong(val)); + assertEquals(false, rehydrated.testLong(val1)); + assertEquals(false, rehydrated.testLong(val2)); + assertEquals(false, rehydrated.testLong(val3)); + BloomKFilter.addLong(buffer, val1); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testLong(val)); + assertEquals(true, rehydrated.testLong(val1)); + 
assertEquals(false, rehydrated.testLong(val2)); + assertEquals(false, rehydrated.testLong(val3)); + BloomKFilter.addLong(buffer, val2); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testLong(val)); + assertEquals(true, rehydrated.testLong(val1)); + assertEquals(true, rehydrated.testLong(val2)); + assertEquals(false, rehydrated.testLong(val3)); + BloomKFilter.addLong(buffer, val3); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testLong(val)); + assertEquals(true, rehydrated.testLong(val1)); + assertEquals(true, rehydrated.testLong(val2)); + assertEquals(true, rehydrated.testLong(val3)); + + int randVal = 0; + for (int i = 0; i < COUNT; i++) { + randVal = rand.nextInt(); + BloomKFilter.addLong(buffer, randVal); + } + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + // last value should be present + assertEquals(true, rehydrated.testLong(randVal)); + // most likely this value should not exist + assertEquals(false, rehydrated.testLong(-120)); + + assertEquals(7808, rehydrated.sizeInBytes()); + } + + @Test + public void testBloomKFilterLong() throws IOException + { + BloomKFilter bf = new BloomKFilter(10000); + ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(); + BloomKFilter.serialize(bytesOut, bf); + byte[] bfBytes = bytesOut.toByteArray(); + ByteBuffer buffer = ByteBuffer.wrap(bfBytes); + + long val = Long.MIN_VALUE; + long val1 = 1; + long val2 = 2; + long val3 = Long.MAX_VALUE; + + BloomKFilter.addLong(buffer, val); + BloomKFilter rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testLong(val)); + assertEquals(false, rehydrated.testLong(val1)); + assertEquals(false, rehydrated.testLong(val2)); + assertEquals(false, rehydrated.testLong(val3)); + BloomKFilter.addLong(buffer, val1); + 
rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testLong(val)); + assertEquals(true, rehydrated.testLong(val1)); + assertEquals(false, rehydrated.testLong(val2)); + assertEquals(false, rehydrated.testLong(val3)); + BloomKFilter.addLong(buffer, val2); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testLong(val)); + assertEquals(true, rehydrated.testLong(val1)); + assertEquals(true, rehydrated.testLong(val2)); + assertEquals(false, rehydrated.testLong(val3)); + BloomKFilter.addLong(buffer, val3); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testLong(val)); + assertEquals(true, rehydrated.testLong(val1)); + assertEquals(true, rehydrated.testLong(val2)); + assertEquals(true, rehydrated.testLong(val3)); + + int randVal = 0; + for (int i = 0; i < COUNT; i++) { + randVal = rand.nextInt(); + BloomKFilter.addLong(buffer, randVal); + } + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + // last value should be present + assertEquals(true, rehydrated.testLong(randVal)); + // most likely this value should not exist + assertEquals(false, rehydrated.testLong(-120)); + + assertEquals(7808, rehydrated.sizeInBytes()); + } + + @Test + public void testBloomKFilterFloat() throws IOException + { + BloomKFilter bf = new BloomKFilter(10000); + ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(); + BloomKFilter.serialize(bytesOut, bf); + byte[] bfBytes = bytesOut.toByteArray(); + ByteBuffer buffer = ByteBuffer.wrap(bfBytes); + + float val = Float.NEGATIVE_INFINITY; + float val1 = 1.1f; + float val2 = 2.2f; + float val3 = Float.POSITIVE_INFINITY; + + BloomKFilter.addFloat(buffer, val); + BloomKFilter rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + 
assertEquals(true, rehydrated.testFloat(val)); + assertEquals(false, rehydrated.testFloat(val1)); + assertEquals(false, rehydrated.testFloat(val2)); + assertEquals(false, rehydrated.testFloat(val3)); + BloomKFilter.addFloat(buffer, val1); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testFloat(val)); + assertEquals(true, rehydrated.testFloat(val1)); + assertEquals(false, rehydrated.testFloat(val2)); + assertEquals(false, rehydrated.testFloat(val3)); + BloomKFilter.addFloat(buffer, val2); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testFloat(val)); + assertEquals(true, rehydrated.testFloat(val1)); + assertEquals(true, rehydrated.testFloat(val2)); + assertEquals(false, rehydrated.testFloat(val3)); + BloomKFilter.addFloat(buffer, val3); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testFloat(val)); + assertEquals(true, rehydrated.testFloat(val1)); + assertEquals(true, rehydrated.testFloat(val2)); + assertEquals(true, rehydrated.testFloat(val3)); + + float randVal = 0; + for (int i = 0; i < COUNT; i++) { + randVal = rand.nextFloat(); + BloomKFilter.addFloat(buffer, randVal); + } + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + + // last value should be present + assertEquals(true, rehydrated.testFloat(randVal)); + // most likely this value should not exist + assertEquals(false, rehydrated.testFloat(-120.2f)); + + assertEquals(7808, rehydrated.sizeInBytes()); + } + + @Test + public void testBloomKFilterDouble() throws IOException + { + BloomKFilter bf = new BloomKFilter(10000); + ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(); + BloomKFilter.serialize(bytesOut, bf); + byte[] bfBytes = bytesOut.toByteArray(); + ByteBuffer buffer = ByteBuffer.wrap(bfBytes); + + double val = 
Double.NEGATIVE_INFINITY; + double val1 = 1.1d; + double val2 = 2.2d; + double val3 = Double.POSITIVE_INFINITY; + + BloomKFilter.addDouble(buffer, val); + BloomKFilter rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testDouble(val)); + assertEquals(false, rehydrated.testDouble(val1)); + assertEquals(false, rehydrated.testDouble(val2)); + assertEquals(false, rehydrated.testDouble(val3)); + BloomKFilter.addDouble(buffer, val1); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testDouble(val)); + assertEquals(true, rehydrated.testDouble(val1)); + assertEquals(false, rehydrated.testDouble(val2)); + assertEquals(false, rehydrated.testDouble(val3)); + BloomKFilter.addDouble(buffer, val2); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testDouble(val)); + assertEquals(true, rehydrated.testDouble(val1)); + assertEquals(true, rehydrated.testDouble(val2)); + assertEquals(false, rehydrated.testDouble(val3)); + BloomKFilter.addDouble(buffer, val3); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testDouble(val)); + assertEquals(true, rehydrated.testDouble(val1)); + assertEquals(true, rehydrated.testDouble(val2)); + assertEquals(true, rehydrated.testDouble(val3)); + + double randVal = 0; + for (int i = 0; i < COUNT; i++) { + randVal = rand.nextDouble(); + BloomKFilter.addDouble(buffer, randVal); + } + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + + // last value should be present + assertEquals(true, rehydrated.testDouble(randVal)); + // most likely this value should not exist + assertEquals(false, rehydrated.testDouble(-120.2d)); + + assertEquals(7808, rehydrated.sizeInBytes()); + } + + @Test + public void 
testBloomKFilterString() throws IOException + { + BloomKFilter bf = new BloomKFilter(100000); + ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(); + BloomKFilter.serialize(bytesOut, bf); + byte[] bfBytes = bytesOut.toByteArray(); + ByteBuffer buffer = ByteBuffer.wrap(bfBytes); + + String val = "bloo"; + String val1 = "bloom fil"; + String val2 = "bloom filter"; + String val3 = "cuckoo filter"; + + BloomKFilter.addString(buffer, val); + BloomKFilter rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testString(val)); + assertEquals(false, rehydrated.testString(val1)); + assertEquals(false, rehydrated.testString(val2)); + assertEquals(false, rehydrated.testString(val3)); + BloomKFilter.addString(buffer, val1); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testString(val)); + assertEquals(true, rehydrated.testString(val1)); + assertEquals(false, rehydrated.testString(val2)); + assertEquals(false, rehydrated.testString(val3)); + BloomKFilter.addString(buffer, val2); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testString(val)); + assertEquals(true, rehydrated.testString(val1)); + assertEquals(true, rehydrated.testString(val2)); + assertEquals(false, rehydrated.testString(val3)); + BloomKFilter.addString(buffer, val3); + rehydrated = BloomKFilter.deserialize(new ByteBufferInputStream(buffer)); + buffer.position(0); + assertEquals(true, rehydrated.testString(val)); + assertEquals(true, rehydrated.testString(val1)); + assertEquals(true, rehydrated.testString(val2)); + assertEquals(true, rehydrated.testString(val3)); + + long randVal = 0; + for (int i = 0; i < COUNT; i++) { + randVal = rand.nextLong(); + BloomKFilter.addString(buffer, Long.toString(randVal)); + } + rehydrated = BloomKFilter.deserialize(new 
ByteBufferInputStream(buffer)); + // last value should be present + assertEquals(true, rehydrated.testString(Long.toString(randVal))); + // most likely this value should not exist + assertEquals(false, rehydrated.testString(Long.toString(-120))); + + assertEquals(77952, rehydrated.sizeInBytes()); + } + @Test + public void testMergeBloomKFilterByteBuffers() throws Exception + { + BloomKFilter bf1 = new BloomKFilter(10000); + BloomKFilter bf2 = new BloomKFilter(10000); + + String[] inputs1 = { + "bloo", + "bloom fil", + "bloom filter", + "cuckoo filter", + }; + + String[] inputs2 = { + "2_bloo", + "2_bloom fil", + "2_bloom filter", + "2_cuckoo filter", + }; + + for (String val : inputs1) { + bf1.addString(val); + } + for (String val : inputs2) { + bf2.addString(val); + } + + ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(); + BloomKFilter.serialize(bytesOut, bf1); + byte[] bf1Bytes = bytesOut.toByteArray(); + bytesOut.reset(); + BloomKFilter.serialize(bytesOut, bf2); + byte[] bf2Bytes = bytesOut.toByteArray(); + + ByteBuffer buf1 = ByteBuffer.wrap(bf1Bytes); + ByteBuffer buf2 = ByteBuffer.wrap(bf2Bytes); + + // Merge bytes + BloomKFilter.mergeBloomFilterByteBuffers( + buf1, + 0, + bf1Bytes.length, + buf2, + 0, + bf2Bytes.length + ); + + // Deserialize and test + byte[] merged = new byte[bf1Bytes.length]; + buf1.get(merged, 0, bf1Bytes.length); + + ByteArrayInputStream bytesIn = new ByteArrayInputStream(merged, 0, bf1Bytes.length); + BloomKFilter bfMerged = BloomKFilter.deserialize(bytesIn); + // All values should pass test + for (String val : inputs1) { + assert bfMerged.testString(val); + } + for (String val : inputs2) { + assert bfMerged.testString(val); + } + } +} diff --git a/extensions-core/druid-bloom-filter/src/test/resources/sample.data.tsv b/extensions-core/druid-bloom-filter/src/test/resources/sample.data.tsv new file mode 100644 index 000000000000..674d86cefe9f --- /dev/null +++ 
b/extensions-core/druid-bloom-filter/src/test/resources/sample.data.tsv @@ -0,0 +1,13 @@ +2011-04-15T00:00:00.000Z spot automotive preferred apreferred 106.793700 +2011-04-15T00:00:00.000Z spot business preferred bpreferred 94.469747 +2011-04-15T00:00:00.000Z spot entertainment preferred epreferred 135.109191 +2011-04-15T00:00:00.000Z spot health preferred hpreferred 99.596909 +2011-04-15T00:00:00.000Z spot mezzanine preferred mpreferred 92.782760 +2011-04-15T00:00:00.000Z spot news preferred npreferred +2011-04-15T00:00:00.000Z spot premium preferred ppreferred +2011-04-15T00:00:00.000Z spot technology preferred tpreferred +2011-04-15T00:00:00.000Z spot travel preferred tpreferred +2011-04-15T00:00:00.000Z total_market mezzanine preferred mpreferred +2011-04-15T00:00:00.000Z total_market premium preferred ppreferred +2011-04-15T00:00:00.000Z upfront mezzanine preferred mpreferred +2011-04-15T00:00:00.000Z upfront premium preferred ppreferred diff --git a/sql/src/main/java/org/apache/druid/sql/guice/SqlBindings.java b/sql/src/main/java/org/apache/druid/sql/guice/SqlBindings.java index a6b41ff64bc2..133e7338d384 100644 --- a/sql/src/main/java/org/apache/druid/sql/guice/SqlBindings.java +++ b/sql/src/main/java/org/apache/druid/sql/guice/SqlBindings.java @@ -39,6 +39,8 @@ public static void addOperatorConversion( final Class clazz ) { - Multibinder.newSetBinder(binder, SqlOperatorConversion.class).addBinding().to(clazz); + if (binder != null) { + Multibinder.newSetBinder(binder, SqlOperatorConversion.class).addBinding().to(clazz); + } } } From d1ba9d4041e46f4c5889fd6966141df96c497478 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 8 Jan 2019 13:27:15 -0800 Subject: [PATCH 07/36] add methods to BloomKFilter to get number of set bits, use in comparator, fixes --- .../bloom/BloomFilterAggregatorFactory.java | 54 +++++++++++++++---- .../druid/query/filter/BloomKFilter.java | 17 ++++++ 2 files changed, 60 insertions(+), 11 deletions(-) diff --git 
a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index f18568ed6f34..1a3f818192b4 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -21,8 +21,11 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.primitives.Ints; +import io.netty.buffer.ByteBuf; import org.apache.commons.codec.binary.Base64; import org.apache.druid.guice.BloomFilterSerializersModule; +import org.apache.druid.java.util.common.RE; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.query.ColumnSelectorPlus; import org.apache.druid.query.aggregation.Aggregator; @@ -96,8 +99,24 @@ public BufferAggregator factorizeBuffered(ColumnSelectorFactory columnFactory) @Override public Comparator getComparator() { - // idk how to compare? 
- return (Comparator) (o1, o2) -> 0; + return (Comparator) (o1, o2) -> { + try { + if (o1 instanceof ByteBuffer && o2 instanceof ByteBuffer) { + BloomKFilter o1f = BloomKFilter.deserialize((ByteBuffer) o1); + BloomKFilter o2f = BloomKFilter.deserialize((ByteBuffer) o2); + return Ints.compare(o1f.getNumSetBits(), o2f.getNumSetBits()); + } else if (o1 instanceof BloomKFilter && o2 instanceof BloomKFilter) { + BloomKFilter o1f = (BloomKFilter) o1; + BloomKFilter o2f = (BloomKFilter) o2; + return Ints.compare(o1f.getNumSetBits(), o2f.getNumSetBits()); + } else { + throw new RE("Unable to compare unexpected types [%s]", o1.getClass().getName()); + } + } + catch (IOException ioe) { + throw new RuntimeException("Failed to deserialize BloomKFilter"); + } + }; } @Override @@ -109,8 +128,17 @@ public Object combine(@Nullable Object lhs, @Nullable Object rhs) if (lhs == null) { return rhs; } - ((BloomKFilter) lhs).merge((BloomKFilter) rhs); - return lhs; + if (rhs instanceof BloomKFilter) { + ((BloomKFilter) lhs).merge((BloomKFilter) rhs); + return lhs; + } else { + ByteBuffer buf = (ByteBuffer) lhs; + int position = buf.position(); + int sizeBytes = 5 + (buf.getInt(position + 1) << 3); + ByteBuffer other = (ByteBuffer) rhs; + BloomKFilter.mergeBloomFilterByteBuffers(buf, position, sizeBytes, other, other.position(), sizeBytes); + return lhs; + } } @Override @@ -131,22 +159,26 @@ public Object deserialize(Object object) if (object instanceof String) { return ByteBuffer.wrap(Base64.decodeBase64(StringUtils.toUtf8((String) object))); } else { - throw new RuntimeException("Failed to deserialize BloomKFilter"); + return object; } } @Override public Object finalizeComputation(Object object) { - if (object instanceof ByteBuffer) { - try { + + try { + if (object instanceof ByteBuffer) { return BloomKFilter.deserialize((ByteBuffer) object); + } else if (object instanceof byte[]) { + BloomKFilter.deserialize(ByteBuffer.wrap((byte[]) object)); + } else { + return object; } - catch 
(IOException ioe) { - throw new RuntimeException("Failed to deserialize BloomKFilter"); - } } - return object; + catch(IOException ioe){ + throw new RuntimeException("Failed to deserialize BloomKFilter"); + } } @JsonProperty diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java index 908d4c45dc67..d95fc848137e 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java @@ -603,6 +603,11 @@ public int getBitSize() return bitSet.getData().length * Long.SIZE; } + public int getNumSetBits() + { + return bitSet.setBitsCount(); + } + public int getNumHashFunctions() { return k; @@ -690,6 +695,18 @@ public boolean get(int index) return (data[index >>> 6] & (1L << index)) != 0; } + + public int setBitsCount() + { + int setCount = 0; + for (int i = 0; i < bitSize(); i++) { + if (get(i)) { + setCount++; + } + } + return setCount; + } + /** * Number of bits */ From f284aeb7f72bcc60306f7183ccd33ebf80fd7247 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 8 Jan 2019 16:45:02 -0800 Subject: [PATCH 08/36] more docs --- .../extensions-core/bloom-filter.md | 36 +++++++++++++------ .../bloom/BloomFilterAggregatorFactory.java | 1 - .../druid/query/filter/BloomKFilter.java | 9 ++--- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/docs/content/development/extensions-core/bloom-filter.md b/docs/content/development/extensions-core/bloom-filter.md index b95dc326b648..d02dc7e85ae1 100644 --- a/docs/content/development/extensions-core/bloom-filter.md +++ b/docs/content/development/extensions-core/bloom-filter.md @@ -28,21 +28,35 @@ This extension adds the ability to both construct bloom filters from query resul against a bloom filter. 
Make sure to [include](../../operations/including-extensions.html) `druid-bloom-filter` as an extension. -A BloomFilter is a probabilistic data structure for set membership check. -Following are some characterstics of BloomFilter +A BloomFilter is a probabilistic data structure for performing a set membership check. A bloom filter is a good candidate +to use with Druid for cases where an explicit filter is impossible, e.g. filtering a query against a set of millions of + values. + +Following are some characterstics of BloomFilters: - BloomFilters are highly space efficient when compared to using a HashSet. -- Because of the probabilistic nature of bloom filter false positive results are possible (e.g. element was not actually -present in bloom filter construction, but `test()` says true) +- Because of the probabilistic nature of bloom filters, false positive results are possible (e.g. element was not actually +inserted into a bloom filter during construction, but `test()` says true) - False negatives are not possible (if element is present then `test()` will never say false). -- The false positive probability is configurable (default: 5%) depending on which storage requirement may increase or - decrease. -- Lower the false positive probability greater is the space requirement. -- Bloom filters are sensitive to number of elements that will be inserted in the bloom filter. -- During the creation of bloom filter expected number of entries must be specified. If the number of insertions exceed +- The false positive probability of this implementation is currently fixed at 5%, but increasing the number of entries +that the filter can hold can decrease this false positive rate in exchange for overall size. +- Bloom filters are sensitive to number of elements that will be inserted in the bloom filter. During the creation of bloom filter expected number of entries must be specified. 
If the number of insertions exceed the specified initial number of entries then false positive probability will increase accordingly. -This extension is built on top of `org.apache.hive.common.util.BloomKFilter`. Internally, this implementation of bloom -filter uses Murmur3 fast non-cryptographic hash algorithm. +This extension is currently based on `org.apache.hive.common.util.BloomKFilter` from `hive-storage-api`. Internally, +this implementation uses Murmur3 as the hash algorithm. + +To construct a BloomKFilter externally with Java to use as a filter in a Druid query: + +```java +BloomKFilter bloomFilter = new BloomKFilter(1500); +bloomFilter.addString("some string"); +bloomFilter.addString("some other string"); +bloomFilter.addString("striiings!"); +ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); +BloomKFilter.serialize(byteArrayOutputStream, bloomFilter); +String base64Serialized= Base64.encodeBase64String(bytes); +``` +This string can then be used in the native or sql Druid query. 
## Filtering queries with a Bloom Filter diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index 1a3f818192b4..7edd854d328b 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -166,7 +166,6 @@ public Object deserialize(Object object) @Override public Object finalizeComputation(Object object) { - try { if (object instanceof ByteBuffer) { return BloomKFilter.deserialize((ByteBuffer) object); diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java index d95fc848137e..6d2b4961c3cb 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java @@ -221,10 +221,10 @@ public static void mergeBloomFilterBytes( /** * Serialize a bloom filter * - * @param out output stream to write to + * @param out output buffer to write to * @param bloomFilter BloomKFilter that needs to be seralized */ - public static void serialize(ByteBuffer out, BloomKFilter bloomFilter) throws IOException + public static void serialize(ByteBuffer out, BloomKFilter bloomFilter) { /** * Serialized BloomKFilter format: @@ -242,10 +242,11 @@ public static void serialize(ByteBuffer out, BloomKFilter bloomFilter) throws IO /** * Deserialize a bloom filter - * Read a byte stream, which was written by {@linkplain #serialize(OutputStream, BloomKFilter)} + * Read a byte buffer, which was 
written by {@linkplain #serialize(OutputStream, BloomKFilter)} or + * {@linkplain #serialize(ByteBuffer, BloomKFilter)} * into a {@code BloomKFilter} * - * @param in input bytestream + * @param in input ByteBuffer * * @return deserialized BloomKFilter */ From 71d00cf195d6f4c5d777256cbe9049100b54662f Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 8 Jan 2019 16:50:42 -0800 Subject: [PATCH 09/36] fix --- .../query/aggregation/bloom/BloomFilterAggregatorFactory.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index 7edd854d328b..da443b3bc36e 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -170,12 +170,12 @@ public Object finalizeComputation(Object object) if (object instanceof ByteBuffer) { return BloomKFilter.deserialize((ByteBuffer) object); } else if (object instanceof byte[]) { - BloomKFilter.deserialize(ByteBuffer.wrap((byte[]) object)); + return BloomKFilter.deserialize(ByteBuffer.wrap((byte[]) object)); } else { return object; } } - catch(IOException ioe){ + catch(IOException ioe) { throw new RuntimeException("Failed to deserialize BloomKFilter"); } } From cec770654336bdceb70c234680697e4e2d19156c Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 8 Jan 2019 20:14:52 -0800 Subject: [PATCH 10/36] fix style --- .../query/aggregation/bloom/BloomFilterAggregatorFactory.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index da443b3bc36e..25fbe328cfb5 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -22,7 +22,6 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.primitives.Ints; -import io.netty.buffer.ByteBuf; import org.apache.commons.codec.binary.Base64; import org.apache.druid.guice.BloomFilterSerializersModule; import org.apache.druid.java.util.common.RE; @@ -175,7 +174,7 @@ public Object finalizeComputation(Object object) return object; } } - catch(IOException ioe) { + catch (IOException ioe) { throw new RuntimeException("Failed to deserialize BloomKFilter"); } } From ee91f3ba634b85eb99a2bf0e9dd7f702c5b7f52f Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 8 Jan 2019 21:13:29 -0800 Subject: [PATCH 11/36] simplify bloomfilter bytebuffer merge, change methods to allow passing buffer offsets --- .../BaseBloomFilterBufferAggregator.java | 8 +---- .../bloom/BloomFilterAggregatorFactory.java | 4 +-- .../BloomFilterMergeBufferAggregator.java | 4 +-- .../druid/query/filter/BloomKFilter.java | 30 +++++++++++++------ .../druid/query/filter/BloomKFilterTest.java | 4 +-- 5 files changed, 25 insertions(+), 25 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java index 59d9e832ca02..412363eafd33 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java +++ 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java @@ -22,7 +22,6 @@ import org.apache.druid.query.aggregation.BufferAggregator; import org.apache.druid.query.filter.BloomKFilter; -import java.io.IOException; import java.nio.ByteBuffer; public abstract class BaseBloomFilterBufferAggregator implements BufferAggregator @@ -40,12 +39,7 @@ public void init(ByteBuffer buf, int position) final ByteBuffer mutationBuffer = buf.duplicate(); mutationBuffer.position(position); BloomKFilter filter = new BloomKFilter(maxNumEntries); - try { - BloomKFilter.serialize(mutationBuffer, filter); - } - catch (IOException ex) { - throw new RuntimeException("Failed to initialize bloomK filter", ex); - } + BloomKFilter.serialize(mutationBuffer, filter); } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index 25fbe328cfb5..bd56386f1df7 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -132,10 +132,8 @@ public Object combine(@Nullable Object lhs, @Nullable Object rhs) return lhs; } else { ByteBuffer buf = (ByteBuffer) lhs; - int position = buf.position(); - int sizeBytes = 5 + (buf.getInt(position + 1) << 3); ByteBuffer other = (ByteBuffer) rhs; - BloomKFilter.mergeBloomFilterByteBuffers(buf, position, sizeBytes, other, other.position(), sizeBytes); + BloomKFilter.mergeBloomFilterByteBuffers(buf, buf.position(), other, other.position()); return lhs; } } diff --git 
a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java index 1bafcb4c01d6..5fb40712321e 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java @@ -40,10 +40,8 @@ public void aggregate(ByteBuffer buf, int position) { final int oldPosition = buf.position(); buf.position(position); - // size is 5 header bytes + length of long array - int sizeBytes = 5 + (buf.getInt(position + 1) << 3); ByteBuffer other = selector.getObject(); - BloomKFilter.mergeBloomFilterByteBuffers(buf, position, sizeBytes, other, other.position(), sizeBytes); + BloomKFilter.mergeBloomFilterByteBuffers(buf, position, other, other.position()); buf.position(oldPosition); } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java index 6d2b4961c3cb..053196d86c81 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java @@ -218,13 +218,19 @@ public static void mergeBloomFilterBytes( } } + public static void serialize(ByteBuffer out, BloomKFilter bloomFilter) + { + serialize(out, out.position(), bloomFilter); + } + /** - * Serialize a bloom filter + * Serialize a bloom filter to a ByteBuffer * * @param out output buffer to write to + * @param position output buffer position * @param bloomFilter BloomKFilter that needs to be seralized */ - public static void serialize(ByteBuffer out, 
BloomKFilter bloomFilter) + public static void serialize(ByteBuffer out, int position, BloomKFilter bloomFilter) { /** * Serialized BloomKFilter format: @@ -233,6 +239,7 @@ public static void serialize(ByteBuffer out, BloomKFilter bloomFilter) * big endina longs in the BloomKFilter bitset */ ByteBuffer view = out.duplicate().order(ByteOrder.BIG_ENDIAN); + view.position(position); view.put((byte) bloomFilter.k); view.putInt(bloomFilter.getBitSet().length); for (long value : bloomFilter.getBitSet()) { @@ -240,17 +247,22 @@ public static void serialize(ByteBuffer out, BloomKFilter bloomFilter) } } + public static BloomKFilter deserialize(ByteBuffer in) throws IOException + { + return deserialize(in, in.position()); + } + /** * Deserialize a bloom filter * Read a byte buffer, which was written by {@linkplain #serialize(OutputStream, BloomKFilter)} or - * {@linkplain #serialize(ByteBuffer, BloomKFilter)} + * {@linkplain #serialize(ByteBuffer, int, BloomKFilter)} * into a {@code BloomKFilter} * * @param in input ByteBuffer * * @return deserialized BloomKFilter */ - public static BloomKFilter deserialize(ByteBuffer in) throws IOException + public static BloomKFilter deserialize(ByteBuffer in, int position) throws IOException { if (in == null) { throw new IOException("Input stream is null"); @@ -258,6 +270,7 @@ public static BloomKFilter deserialize(ByteBuffer in) throws IOException try { ByteBuffer dataBuffer = in.duplicate().order(ByteOrder.BIG_ENDIAN); + dataBuffer.position(position); int numHashFunc = dataBuffer.get(); int bitsetArrayLen = dataBuffer.getInt(); long[] data = new long[bitsetArrayLen]; @@ -279,20 +292,19 @@ public static BloomKFilter deserialize(ByteBuffer in) throws IOException * * @param bf1Bytes * @param bf1Start - * @param bf1Length * @param bf2Bytes * @param bf2Start - * @param bf2Length */ public static void mergeBloomFilterByteBuffers( ByteBuffer bf1Bytes, int bf1Start, - int bf1Length, ByteBuffer bf2Bytes, - int bf2Start, - int bf2Length + int 
bf2Start ) { + final int bf1Length = bf1Bytes.getInt(1 + bf1Start); + final int bf2Length = bf2Bytes.getInt(1 + bf2Start); + if (bf1Length != bf2Length) { throw new IllegalArgumentException("bf1Length " + bf1Length + " does not match bf2Length " + bf2Length); } diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomKFilterTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomKFilterTest.java index 5da1059354f7..e261d7c226ca 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomKFilterTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomKFilterTest.java @@ -485,10 +485,8 @@ public void testMergeBloomKFilterByteBuffers() throws Exception BloomKFilter.mergeBloomFilterByteBuffers( buf1, 0, - bf1Bytes.length, buf2, - 0, - bf2Bytes.length + 0 ); // Deserialize and test From 6470dc606bbe14a4f461120196d6438a2d62b0d7 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 8 Jan 2019 21:17:19 -0800 Subject: [PATCH 12/36] oof, more fixes --- .../druid/query/filter/BloomKFilter.java | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java index 053196d86c81..f20b45265e5a 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java @@ -290,20 +290,22 @@ public static BloomKFilter deserialize(ByteBuffer in, int position) throws IOExc * Merges BloomKFilter bf2 into bf1. 
* Assumes 2 BloomKFilters with the same size/hash functions are serialized to ByteBuffers * - * @param bf1Bytes + * @param bf1Buffer * @param bf1Start - * @param bf2Bytes + * @param bf2Buffer * @param bf2Start */ public static void mergeBloomFilterByteBuffers( - ByteBuffer bf1Bytes, + ByteBuffer bf1Buffer, int bf1Start, - ByteBuffer bf2Bytes, + ByteBuffer bf2Buffer, int bf2Start ) { - final int bf1Length = bf1Bytes.getInt(1 + bf1Start); - final int bf2Length = bf2Bytes.getInt(1 + bf2Start); + ByteBuffer view1 = bf1Buffer.duplicate().order(ByteOrder.BIG_ENDIAN); + ByteBuffer view2 = bf2Buffer.duplicate().order(ByteOrder.BIG_ENDIAN); + final int bf1Length = view1.getInt(1 + bf1Start); + final int bf2Length = view2.getInt(1 + bf2Start); if (bf1Length != bf2Length) { throw new IllegalArgumentException("bf1Length " + bf1Length + " does not match bf2Length " + bf2Length); @@ -311,7 +313,7 @@ public static void mergeBloomFilterByteBuffers( // Validation on the bitset size/3 hash functions. for (int idx = 0; idx < START_OF_SERIALIZED_LONGS; ++idx) { - if (bf1Bytes.get(bf1Start + idx) != bf2Bytes.get(bf2Start + idx)) { + if (view1.get(bf1Start + idx) != view2.get(bf2Start + idx)) { throw new IllegalArgumentException("bf1 NumHashFunctions/NumBits does not match bf2"); } } @@ -321,8 +323,7 @@ public static void mergeBloomFilterByteBuffers( for (int idx = START_OF_SERIALIZED_LONGS; idx < bf1Length; ++idx) { final int pos1 = bf1Start + idx; final int pos2 = bf2Start + idx; - final byte val = (byte) (bf1Bytes.get(pos1) | bf2Bytes.get(pos2)); - bf1Bytes.put(pos1, val); + view1.put(pos1, (byte) (view1.get(pos1) | view2.get(pos2))); } } From 233aa9e071e380b3bcc5dcc34903e2952833208b Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 8 Jan 2019 21:20:25 -0800 Subject: [PATCH 13/36] more sane docs example --- docs/content/development/extensions-core/bloom-filter.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/docs/content/development/extensions-core/bloom-filter.md b/docs/content/development/extensions-core/bloom-filter.md index d02dc7e85ae1..b37c30f9d18e 100644 --- a/docs/content/development/extensions-core/bloom-filter.md +++ b/docs/content/development/extensions-core/bloom-filter.md @@ -49,9 +49,9 @@ To construct a BloomKFilter externally with Java to use as a filter in a Druid q ```java BloomKFilter bloomFilter = new BloomKFilter(1500); -bloomFilter.addString("some string"); -bloomFilter.addString("some other string"); -bloomFilter.addString("striiings!"); +bloomFilter.addString("value 1"); +bloomFilter.addString("value 2"); +bloomFilter.addString("value 3"); ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); BloomKFilter.serialize(byteArrayOutputStream, bloomFilter); String base64Serialized= Base64.encodeBase64String(bytes); From 654a994615b803d309da20804852431ff230a806 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 8 Jan 2019 22:52:24 -0800 Subject: [PATCH 14/36] fix it --- .../main/java/org/apache/druid/query/filter/BloomKFilter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java index f20b45265e5a..7e4dfd8f91c7 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java @@ -304,8 +304,8 @@ public static void mergeBloomFilterByteBuffers( { ByteBuffer view1 = bf1Buffer.duplicate().order(ByteOrder.BIG_ENDIAN); ByteBuffer view2 = bf2Buffer.duplicate().order(ByteOrder.BIG_ENDIAN); - final int bf1Length = view1.getInt(1 + bf1Start); - final int bf2Length = view2.getInt(1 + bf2Start); + final int bf1Length = START_OF_SERIALIZED_LONGS + (view1.getInt(1 + bf1Start) * 
Long.BYTES); + final int bf2Length = START_OF_SERIALIZED_LONGS + (view2.getInt(1 + bf2Start) * Long.BYTES); if (bf1Length != bf2Length) { throw new IllegalArgumentException("bf1Length " + bf1Length + " does not match bf2Length " + bf2Length); From 6c04d24274d713870c449c89ac01d6fcedb53c31 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 11 Jan 2019 02:55:22 -0800 Subject: [PATCH 15/36] do the right thing in the right place --- .../org/apache/druid/guice/BloomFilterExtensionModule.java | 4 +++- sql/src/main/java/org/apache/druid/sql/guice/SqlBindings.java | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterExtensionModule.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterExtensionModule.java index a163d1ad0472..59e784dcfe97 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterExtensionModule.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterExtensionModule.java @@ -40,6 +40,8 @@ public List getJacksonModules() @Override public void configure(Binder binder) { - SqlBindings.addOperatorConversion(binder, BloomFilterOperatorConversion.class); + if (binder != null) { + SqlBindings.addOperatorConversion(binder, BloomFilterOperatorConversion.class); + } } } diff --git a/sql/src/main/java/org/apache/druid/sql/guice/SqlBindings.java b/sql/src/main/java/org/apache/druid/sql/guice/SqlBindings.java index 133e7338d384..5617d1f120e1 100644 --- a/sql/src/main/java/org/apache/druid/sql/guice/SqlBindings.java +++ b/sql/src/main/java/org/apache/druid/sql/guice/SqlBindings.java @@ -39,8 +39,6 @@ public static void addOperatorConversion( final Class clazz ) { - if (binder != null) { Multibinder.newSetBinder(binder, SqlOperatorConversion.class).addBinding().to(clazz); - } } } From 2e5f43d0173772811799f196ed52945e8e282618 Mon Sep 17 00:00:00 2001 From: Clint 
Wylie Date: Fri, 11 Jan 2019 02:57:23 -0800 Subject: [PATCH 16/36] formatting --- sql/src/main/java/org/apache/druid/sql/guice/SqlBindings.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/src/main/java/org/apache/druid/sql/guice/SqlBindings.java b/sql/src/main/java/org/apache/druid/sql/guice/SqlBindings.java index 5617d1f120e1..a6b41ff64bc2 100644 --- a/sql/src/main/java/org/apache/druid/sql/guice/SqlBindings.java +++ b/sql/src/main/java/org/apache/druid/sql/guice/SqlBindings.java @@ -39,6 +39,6 @@ public static void addOperatorConversion( final Class clazz ) { - Multibinder.newSetBinder(binder, SqlOperatorConversion.class).addBinding().to(clazz); + Multibinder.newSetBinder(binder, SqlOperatorConversion.class).addBinding().to(clazz); } } From a12bad1ec3315ba6813f8633464c27c38a56f145 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 11 Jan 2019 14:33:04 -0800 Subject: [PATCH 17/36] fix --- .../DoubleBloomFilterAggregatorColumnSelectorStrategy.java | 2 ++ .../types/FloatBloomFilterAggregatorColumnSelectorStrategy.java | 2 ++ .../LongBloomFilterAggregatorColumnValueSelectorStrategy.java | 2 ++ 3 files changed, 6 insertions(+) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java index 1174cd187e01..241712145491 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java @@ -33,6 +33,8 @@ public void add(BaseDoubleColumnValueSelector selector, BloomKFilter bloomFilter { if (NullHandling.replaceWithDefault() || 
!selector.isNull()) { bloomFilter.addDouble(selector.getDouble()); + } else { + bloomFilter.addBytes(null, 0, 0); } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java index eddab583e00f..7efc95d4a7d6 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java @@ -33,6 +33,8 @@ public void add(BaseFloatColumnValueSelector selector, BloomKFilter bloomFilter) { if (NullHandling.replaceWithDefault() || !selector.isNull()) { bloomFilter.addFloat(selector.getFloat()); + } else { + bloomFilter.addBytes(null, 0, 0); } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java index f7cc36053d5e..2cc5de9c3113 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java @@ -33,6 +33,8 @@ public void add(BaseLongColumnValueSelector selector, BloomKFilter bloomFilter) { if (NullHandling.replaceWithDefault() || !selector.isNull()) { bloomFilter.addLong(selector.getLong()); + } else { + bloomFilter.addBytes(null, 0, 0); } } From 
ee6ecd660f8ed686813dfe9d98f73dba491557e5 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 11 Jan 2019 14:35:15 -0800 Subject: [PATCH 18/36] avoid conflict --- .../org/apache/druid/query/aggregation/AggregatorUtil.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java b/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java index 1fd81d71e5b8..87690e2d7a4d 100644 --- a/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java +++ b/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java @@ -111,8 +111,8 @@ public class AggregatorUtil public static final byte HLL_SKETCH_TO_ESTIMATE_AND_BOUNDS_CACHE_TYPE_ID = 0x32; // bloom filter extension - public static final byte BLOOM_FILTER_CACHE_TYPE_ID = 0x33; - public static final byte BLOOM_FILTER_MERGE_CACHE_TYPE_ID = 0x34; + public static final byte BLOOM_FILTER_CACHE_TYPE_ID = 0x34; + public static final byte BLOOM_FILTER_MERGE_CACHE_TYPE_ID = 0x35; /** * returns the list of dependent postAggregators that should be calculated in order to calculate given postAgg From 70882c9bd9a0283563ce0972eef864e437a50038 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Sat, 12 Jan 2019 01:31:48 -0800 Subject: [PATCH 19/36] typo fixes, faster comparator, docs for comparator behavior --- .../extensions-core/bloom-filter.md | 15 ++-- .../DerivativeDataSourceManager.java | 2 +- .../MaterializedViewUtils.java | 2 +- .../BloomFilterMergeAggregatorFactory.java | 5 +- .../druid/query/filter/BloomKFilter.java | 85 ++++++++++++++----- .../bloom/BloomFilterAggregatorTest.java | 2 +- .../bloom/BloomFilterGroupByQueryTest.java | 70 +++++++++------ .../druid/query/filter/BloomKFilterTest.java | 37 +++++++- 8 files changed, 159 insertions(+), 59 deletions(-) diff --git a/docs/content/development/extensions-core/bloom-filter.md 
b/docs/content/development/extensions-core/bloom-filter.md index b37c30f9d18e..48e79b64bc05 100644 --- a/docs/content/development/extensions-core/bloom-filter.md +++ b/docs/content/development/extensions-core/bloom-filter.md @@ -32,10 +32,10 @@ A BloomFilter is a probabilistic data structure for performing a set membership to use with Druid for cases where an explicit filter is impossible, e.g. filtering a query against a set of millions of values. -Following are some characterstics of BloomFilters: +Following are some characteristics of BloomFilters: - BloomFilters are highly space efficient when compared to using a HashSet. -- Because of the probabilistic nature of bloom filters, false positive results are possible (e.g. element was not actually -inserted into a bloom filter during construction, but `test()` says true) +- Because of the probabilistic nature of bloom filters, false positive results are possible (element was not actually +inserted into a bloom filter during construction, but `test()` says true) - False negatives are not possible (if element is present then `test()` will never say false). - The false positive probability of this implementation is currently fixed at 5%, but increasing the number of entries that the filter can hold can decrease this false positive rate in exchange for overall size. @@ -54,8 +54,9 @@ bloomFilter.addString("value 2"); bloomFilter.addString("value 3"); ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); BloomKFilter.serialize(byteArrayOutputStream, bloomFilter); -String base64Serialized= Base64.encodeBase64String(bytes); +String base64Serialized = Base64.encodeBase64String(byteArrayOutputStream.toByteArray()); ``` + This string can then be used in the native or sql Druid query. ## Filtering queries with a Bloom Filter @@ -142,4 +143,8 @@ response [{"timestamp":"2015-09-12T00:00:00.000Z","result":{"userBloom":"BAAAJhAAAA..."}}] ``` -These values can then be set in the filter specification above. 
\ No newline at end of file +These values can then be set in the filter specification above. + +Ordering results by a bloom filter aggregator, for example in a TopN query, will perform a comparatively expensive +linear scan _of the filter itself_ to count the number of set bits as a means of approximating how many items have been +added to the set. As such, ordering by an alternate aggregation is recommended if possible. \ No newline at end of file diff --git a/extensions-contrib/materialized-view-selection/src/main/java/org/apache/druid/query/materializedview/DerivativeDataSourceManager.java b/extensions-contrib/materialized-view-selection/src/main/java/org/apache/druid/query/materializedview/DerivativeDataSourceManager.java index 45ab136e443d..35a5c282ef8a 100644 --- a/extensions-contrib/materialized-view-selection/src/main/java/org/apache/druid/query/materializedview/DerivativeDataSourceManager.java +++ b/extensions-contrib/materialized-view-selection/src/main/java/org/apache/druid/query/materializedview/DerivativeDataSourceManager.java @@ -210,7 +210,7 @@ public Pair map(int index, ResultSet r, St } /** - * caculate the average data size per segment granularity for a given datasource. + * calculate the average data size per segment granularity for a given datasource. * * e.g. 
for a datasource, there're 5 segments as follows, * interval = "2018-04-01/2017-04-02", segment size = 1024 * 1024 * 2 diff --git a/extensions-contrib/materialized-view-selection/src/main/java/org/apache/druid/query/materializedview/MaterializedViewUtils.java b/extensions-contrib/materialized-view-selection/src/main/java/org/apache/druid/query/materializedview/MaterializedViewUtils.java index 92eff78a0315..2e96c941fa57 100644 --- a/extensions-contrib/materialized-view-selection/src/main/java/org/apache/druid/query/materializedview/MaterializedViewUtils.java +++ b/extensions-contrib/materialized-view-selection/src/main/java/org/apache/druid/query/materializedview/MaterializedViewUtils.java @@ -85,7 +85,7 @@ private static Set extractFieldsFromAggregations(List } /** - * caculate the intervals which are covered by interval2, but not covered by interval1. + * calculate the intervals which are covered by interval2, but not covered by interval1. * result intervals = interval2 - interval1 ∩ interval2 * e.g. 
* a list of interval2: ["2018-04-01T00:00:00.000Z/2018-04-02T00:00:00.000Z", diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java index 1136a3136eae..f0c99b24405b 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java @@ -19,6 +19,7 @@ package org.apache.druid.query.aggregation.bloom; +import org.apache.druid.java.util.common.ISE; import org.apache.druid.query.aggregation.Aggregator; import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.AggregatorUtil; @@ -50,7 +51,7 @@ public Aggregator factorize(final ColumnSelectorFactory metricFactory) { final ColumnValueSelector selector = metricFactory.makeColumnValueSelector(fieldName); if (selector instanceof NilColumnValueSelector) { - return NoopAggregator.instance(); + throw new ISE("WTF?! Unexpected NilColumnValueSelector"); } return new BloomFilterMergeAggregator(selector, getMaxNumEntries()); } @@ -60,7 +61,7 @@ public BufferAggregator factorizeBuffered(final ColumnSelectorFactory metricFact { final ColumnValueSelector selector = metricFactory.makeColumnValueSelector(fieldName); if (selector instanceof NilColumnValueSelector) { - return NoopBufferAggregator.instance(); + throw new ISE("WTF?! 
Unexpected NilColumnValueSelector"); } return new BloomFilterMergeBufferAggregator(selector, getMaxNumEntries()); } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java index 7e4dfd8f91c7..60a5ba96a88c 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java @@ -180,6 +180,8 @@ public static BloomKFilter deserialize(InputStream in) throws IOException } } + // custom Druid ByteBuffer methods start here + /** * Merges BloomKFilter bf2 into bf1. * Assumes 2 BloomKFilters with the same size/hash functions are serialized to byte arrays @@ -224,7 +226,7 @@ public static void serialize(ByteBuffer out, BloomKFilter bloomFilter) } /** - * Serialize a bloom filter to a ByteBuffer + * Serialize a bloom filter to a ByteBuffer. Does not mutate buffer position. * * @param out output buffer to write to * @param position output buffer position @@ -235,8 +237,8 @@ public static void serialize(ByteBuffer out, int position, BloomKFilter bloomFil /** * Serialized BloomKFilter format: * 1 byte for the number of hash functions. 
- * 1 big endian int(That is how OutputStream works) for the number of longs in the bitset - * big endina longs in the BloomKFilter bitset + * 1 big endian int(to match OutputStream) for the number of longs in the bitset + * big endian longs in the BloomKFilter bitset */ ByteBuffer view = out.duplicate().order(ByteOrder.BIG_ENDIAN); view.position(position); @@ -256,7 +258,7 @@ public static BloomKFilter deserialize(ByteBuffer in) throws IOException * Deserialize a bloom filter * Read a byte buffer, which was written by {@linkplain #serialize(OutputStream, BloomKFilter)} or * {@linkplain #serialize(ByteBuffer, int, BloomKFilter)} - * into a {@code BloomKFilter} + * into a {@code BloomKFilter}. Does not mutate buffer position. * * @param in input ByteBuffer * @@ -287,7 +289,7 @@ public static BloomKFilter deserialize(ByteBuffer in, int position) throws IOExc } /** - * Merges BloomKFilter bf2 into bf1. + * Merges BloomKFilter bf2Buffer into bf1Buffer in place. Does not mutate buffer positions. 
* Assumes 2 BloomKFilters with the same size/hash functions are serialized to ByteBuffers * * @param bf1Buffer @@ -328,12 +330,29 @@ public static void mergeBloomFilterByteBuffers( } /** - * Caculate size in bytes of a BloomKFilter for a given number of entries - * @param maxNumEntries + * ByteBuffer based copy of logic of {@link BloomKFilter#getNumSetBits()} + * @param bfBuffer + * @param start * @return */ + public static int getNumSetBits(ByteBuffer bfBuffer, int start) + { + ByteBuffer view = bfBuffer.duplicate().order(ByteOrder.BIG_ENDIAN); + view.position(start); + int numLongs = view.getInt(1 + start); + int setBits = 0; + for (int i = 0, pos = START_OF_SERIALIZED_LONGS + start; i < numLongs; i++, pos += Long.BYTES) { + setBits += Long.bitCount(view.getLong(pos)); + } + return setBits; + } + + /** + * Calculate size in bytes of a BloomKFilter for a given number of entries + */ public static int computeSizeBytes(long maxNumEntries) { + // copied from constructor checkArgument(maxNumEntries > 0, "expectedEntries should be > 0"); long numBits = optimalNumOfBits(maxNumEntries, DEFAULT_FPP); @@ -342,11 +361,18 @@ public static int computeSizeBytes(long maxNumEntries) return START_OF_SERIALIZED_LONGS + ((nLongs + padLongs) * Long.BYTES); } + /** + * ByteBuffer based copy of {@link BloomKFilter#add(byte[])} that adds a value to the ByteBuffer in place. + */ public static void add(ByteBuffer buffer, byte[] val) { addBytes(buffer, val); } + /** + * ByteBuffer based copy of {@link BloomKFilter#addBytes(byte[], int, int)} that adds a value to the ByteBuffer + * in place. + */ public static void addBytes(ByteBuffer buffer, byte[] val, int offset, int length) { long hash64 = val == null ? Murmur3.NULL_HASHCODE : @@ -354,11 +380,17 @@ public static void addBytes(ByteBuffer buffer, byte[] val, int offset, int lengt addHash(buffer, hash64); } + /** + * ByteBuffer based copy of {@link BloomKFilter#addBytes(byte[])} that adds a value to the ByteBuffer in place. 
+ */ public static void addBytes(ByteBuffer buffer, byte[] val) { addBytes(buffer, val, 0, val.length); } + /** + * ByteBuffer based copy of {@link BloomKFilter#addHash(long)} that adds a value to the ByteBuffer in place. + */ public static void addHash(ByteBuffer buffer, long hash64) { final int hash1 = (int) hash64; @@ -394,37 +426,54 @@ public static void addHash(ByteBuffer buffer, long hash64) } } + /** + * ByteBuffer based copy of {@link BloomKFilter#addString(String)} that adds a value to the ByteBuffer in place. + */ public static void addString(ByteBuffer buffer, String val) { addBytes(buffer, StringUtils.toUtf8(val)); } + /** + * ByteBuffer based copy of {@link BloomKFilter#addByte(byte)} that adds a value to the ByteBuffer in place. + */ public static void addByte(ByteBuffer buffer, byte val) { addBytes(buffer, new byte[]{val}); } + /** + * ByteBuffer based copy of {@link BloomKFilter#addInt(int)} that adds a value to the ByteBuffer in place. + */ public static void addInt(ByteBuffer buffer, int val) { - // puts int in little endian order addBytes(buffer, intToByteArrayLE(val)); } + /** + * ByteBuffer based copy of {@link BloomKFilter#addLong(long)} that adds a value to the ByteBuffer in place. + */ public static void addLong(ByteBuffer buffer, long val) { - // puts long in little endian order addHash(buffer, Murmur3.hash64(val)); } + /** + * ByteBuffer based copy of {@link BloomKFilter#addFloat(float)} that adds a value to the ByteBuffer in place. 
+ */ public static void addFloat(ByteBuffer buffer, float val) { addInt(buffer, Float.floatToIntBits(val)); } + /** + * ByteBuffer based copy of {@link BloomKFilter#addDouble(double)} + */ public static void addDouble(ByteBuffer buffer, double val) { addLong(buffer, Double.doubleToLongBits(val)); } + // custom Druid ByteBuffer methods end here public void add(byte[] val) { @@ -619,7 +668,11 @@ public int getBitSize() public int getNumSetBits() { - return bitSet.setBitsCount(); + int setCount = 0; + for (long datum : bitSet.getData()) { + setCount += Long.bitCount(datum); + } + return setCount; } public int getNumHashFunctions() @@ -709,18 +762,6 @@ public boolean get(int index) return (data[index >>> 6] & (1L << index)) != 0; } - - public int setBitsCount() - { - int setCount = 0; - for (int i = 0; i < bitSize(); i++) { - if (get(i)) { - setCount++; - } - } - return setCount; - } - /** * Number of bits */ diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java index 45b62cbb9a7a..cd8a05c3c7b8 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java @@ -224,7 +224,7 @@ private static void bufferAggregateColumn( } } - private static String filterToString(BloomKFilter bloomKFilter) throws IOException + static String filterToString(BloomKFilter bloomKFilter) throws IOException { return Base64.encodeBase64String(BloomFilterSerializersModule.bloomKFilterToBytes(bloomKFilter)); } diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java 
b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java index 542acdf454ad..bd137c8bf339 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java @@ -20,6 +20,7 @@ package org.apache.druid.query.aggregation.bloom; import com.google.common.collect.Lists; +import org.apache.commons.codec.binary.Base64; import org.apache.druid.common.config.NullHandling; import org.apache.druid.data.input.MapBasedRow; import org.apache.druid.guice.BloomFilterExtensionModule; @@ -37,6 +38,7 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; @@ -78,30 +80,58 @@ public void teardown() throws IOException } @Test - public void testIngestWithNullsIgnoredAndQuery() throws Exception + public void testQuery() throws Exception { - MapBasedRow row = ingestAndQuery(true); - Object o = row.getRaw("blooming_quality"); + + String query = "{" + + "\"queryType\": \"groupBy\"," + + "\"dataSource\": \"test_datasource\"," + + "\"granularity\": \"ALL\"," + + "\"dimensions\": []," + + "\"filter\":{ \"type\":\"selector\", \"dimension\":\"market\", \"value\":\"upfront\"}," + + "\"aggregations\": [" + + " { \"type\": \"bloom\", \"name\": \"blooming_quality\", \"field\": \"quality\" }" + + "]," + + "\"intervals\": [ \"1970/2050\" ]" + + "}"; + + MapBasedRow row = ingestAndQuery(query); + + Assert.assertTrue(((BloomKFilter) row.getRaw("blooming_quality")).testString("mezzanine")); Assert.assertTrue(((BloomKFilter) row.getRaw("blooming_quality")).testString("premium")); Assert.assertFalse(((BloomKFilter) row.getRaw("blooming_quality")).testString("entertainment")); - } @Test - public void 
testIngestWithNullsToZeroAndQuery() throws Exception + public void testQueryFakeDimension() throws Exception { - // Nulls are ignored and not replaced with default for SQL compatible null handling. - // This is already tested in testIngestWithNullsIgnoredAndQuery() - if (NullHandling.replaceWithDefault()) { - MapBasedRow row = ingestAndQuery(false); - Assert.assertTrue(((BloomKFilter) row.getRaw("blooming_quality")).testString("mezzanine")); - Assert.assertTrue(((BloomKFilter) row.getRaw("blooming_quality")).testString("premium")); - Assert.assertFalse(((BloomKFilter) row.getRaw("blooming_quality")).testString("entertainment")); - } + String query = "{" + + "\"queryType\": \"groupBy\"," + + "\"dataSource\": \"test_datasource\"," + + "\"granularity\": \"ALL\"," + + "\"dimensions\": []," + + "\"filter\":{ \"type\":\"selector\", \"dimension\":\"market\", \"value\":\"upfront\"}," + + "\"aggregations\": [" + + " { \"type\": \"bloom\", \"name\": \"blooming_quality\", \"field\": \"nope\" }" + + "]," + + "\"intervals\": [ \"1970/2050\" ]" + + "}"; + + MapBasedRow row = ingestAndQuery(query); + + BloomKFilter filter = new BloomKFilter(1500); + filter.addBytes(null, 0, 0); + + Object val = row.getRaw("blooming_quality"); + + String serialized = BloomFilterAggregatorTest.filterToString((BloomKFilter) val); + String empty = BloomFilterAggregatorTest.filterToString(filter); + + Assert.assertEquals(empty, serialized); } - private MapBasedRow ingestAndQuery(boolean ignoreNulls) throws Exception + private MapBasedRow ingestAndQuery(String query) throws Exception { String metricSpec = "[{ \"type\": \"count\", \"name\": \"count\"}]"; @@ -122,18 +152,6 @@ private MapBasedRow ingestAndQuery(boolean ignoreNulls) throws Exception + " }" + "}"; - String query = "{" - + "\"queryType\": \"groupBy\"," - + "\"dataSource\": \"test_datasource\"," - + "\"granularity\": \"ALL\"," - + "\"dimensions\": []," - + "\"filter\":{ \"type\":\"selector\", \"dimension\":\"market\", \"value\":\"upfront\"}," 
- + "\"aggregations\": [" - + " { \"type\": \"bloom\", \"name\": \"blooming_quality\", \"field\": \"quality\" }" - + "]," - + "\"intervals\": [ \"1970/2050\" ]" - + "}"; - Sequence seq = helper.createIndexAndRunQueryOnSegment( this.getClass().getClassLoader().getResourceAsStream("sample.data.tsv"), parseSpec, diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomKFilterTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomKFilterTest.java index e261d7c226ca..3385924e4e1a 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomKFilterTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/filter/BloomKFilterTest.java @@ -20,6 +20,7 @@ package org.apache.druid.query.filter; import org.apache.druid.io.ByteBufferInputStream; +import org.junit.Assert; import org.junit.Test; import java.io.ByteArrayInputStream; @@ -27,13 +28,14 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.Random; +import java.util.concurrent.ThreadLocalRandom; import static org.junit.Assert.assertEquals; public class BloomKFilterTest { private static final int COUNT = 100; - Random rand = new Random(123); + Random rand = ThreadLocalRandom.current(); @Test public void testBloomKFilterBytes() throws IOException @@ -444,6 +446,7 @@ public void testBloomKFilterString() throws IOException assertEquals(77952, rehydrated.sizeInBytes()); } + @Test public void testMergeBloomKFilterByteBuffers() throws Exception { @@ -503,4 +506,36 @@ public void testMergeBloomKFilterByteBuffers() throws Exception assert bfMerged.testString(val); } } + + @Test + public void testCountBitBloomKFilterByteBuffersEmpty() throws Exception + { + BloomKFilter bfWithValues = new BloomKFilter(10000); + BloomKFilter bfEmpty = new BloomKFilter(10000); + BloomKFilter bfNull = new BloomKFilter(10000); + + for (int i = 0; i < 1000; i++) { + 
bfWithValues.addInt(rand.nextInt()); + } + + bfNull.addBytes(null, 0, 0); + + ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(); + BloomKFilter.serialize(bytesOut, bfWithValues); + ByteBuffer bufWithValues = ByteBuffer.wrap(bytesOut.toByteArray()); + bytesOut.reset(); + BloomKFilter.serialize(bytesOut, bfEmpty); + ByteBuffer bufEmpty = ByteBuffer.wrap(bytesOut.toByteArray()); + bytesOut.reset(); + BloomKFilter.serialize(bytesOut, bfNull); + ByteBuffer bufWithNull = ByteBuffer.wrap(bytesOut.toByteArray()); + + + Assert.assertTrue(BloomKFilter.getNumSetBits(bufWithValues, 0) > 0); + Assert.assertFalse(BloomKFilter.getNumSetBits(bufEmpty, 0) > 0); + Assert.assertTrue(BloomKFilter.getNumSetBits(bufWithNull, 0) > 0); + Assert.assertTrue( + BloomKFilter.getNumSetBits(bufWithValues, 0) > BloomKFilter.getNumSetBits(bufWithNull, 0) + ); + } } From 3858cb871a99c10d7b2cf36a3c0651676eec18c2 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Sat, 12 Jan 2019 01:58:30 -0800 Subject: [PATCH 20/36] unused imports --- .../aggregation/bloom/BloomFilterMergeAggregatorFactory.java | 2 -- .../query/aggregation/bloom/BloomFilterGroupByQueryTest.java | 3 --- 2 files changed, 5 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java index f0c99b24405b..73cf78c8efcf 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java @@ -24,8 +24,6 @@ import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.AggregatorUtil; import org.apache.druid.query.aggregation.BufferAggregator; -import 
org.apache.druid.query.aggregation.NoopAggregator; -import org.apache.druid.query.aggregation.NoopBufferAggregator; import org.apache.druid.query.cache.CacheKeyBuilder; import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.segment.ColumnSelectorFactory; diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java index bd137c8bf339..f9678c1ad9f5 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java @@ -20,8 +20,6 @@ package org.apache.druid.query.aggregation.bloom; import com.google.common.collect.Lists; -import org.apache.commons.codec.binary.Base64; -import org.apache.druid.common.config.NullHandling; import org.apache.druid.data.input.MapBasedRow; import org.apache.druid.guice.BloomFilterExtensionModule; import org.apache.druid.java.util.common.granularity.Granularities; @@ -38,7 +36,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; From 2ccc137651e35c7a13b4ac6faf33b5e5da37841f Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Sat, 12 Jan 2019 02:04:45 -0800 Subject: [PATCH 21/36] use buffer comparator instead of deserializing --- .../bloom/BloomFilterAggregatorFactory.java | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index bd56386f1df7..5168e4686a28 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -99,21 +99,19 @@ public BufferAggregator factorizeBuffered(ColumnSelectorFactory columnFactory) public Comparator getComparator() { return (Comparator) (o1, o2) -> { - try { - if (o1 instanceof ByteBuffer && o2 instanceof ByteBuffer) { - BloomKFilter o1f = BloomKFilter.deserialize((ByteBuffer) o1); - BloomKFilter o2f = BloomKFilter.deserialize((ByteBuffer) o2); - return Ints.compare(o1f.getNumSetBits(), o2f.getNumSetBits()); - } else if (o1 instanceof BloomKFilter && o2 instanceof BloomKFilter) { - BloomKFilter o1f = (BloomKFilter) o1; - BloomKFilter o2f = (BloomKFilter) o2; - return Ints.compare(o1f.getNumSetBits(), o2f.getNumSetBits()); - } else { - throw new RE("Unable to compare unexpected types [%s]", o1.getClass().getName()); - } - } - catch (IOException ioe) { - throw new RuntimeException("Failed to deserialize BloomKFilter"); + if (o1 instanceof ByteBuffer && o2 instanceof ByteBuffer) { + ByteBuffer buf1 = (ByteBuffer) o1; + ByteBuffer buf2 = (ByteBuffer) o2; + return Ints.compare( + BloomKFilter.getNumSetBits(buf1, buf1.position()), + BloomKFilter.getNumSetBits(buf2, buf2.position()) + ); + } else if (o1 instanceof BloomKFilter && o2 instanceof BloomKFilter) { + BloomKFilter o1f = (BloomKFilter) o1; + BloomKFilter o2f = (BloomKFilter) o2; + return Ints.compare(o1f.getNumSetBits(), o2f.getNumSetBits()); + } else { + throw new RE("Unable to compare unexpected types [%s]", o1.getClass().getName()); } }; } From ff87a3785353069dfd125e05ad94842d9c1775ed Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Wed, 16 Jan 2019 13:51:23 -0800 Subject: 
[PATCH 22/36] striped readwrite lock for buffer agg, null handling comparator, other review changes --- .../guice/BloomFilterExtensionModule.java | 4 +--- .../bloom/BaseBloomFilterAggregator.java | 4 ++-- .../BaseBloomFilterBufferAggregator.java | 24 +++++++++++++++---- .../bloom/BloomFilterAggregatorFactory.java | 9 ++++--- .../bloom/BloomFilterBufferAggregator.java | 16 +++++++++---- .../BloomFilterMergeBufferAggregator.java | 18 ++++++++++---- .../bloom/BloomFilterGroupByQueryTest.java | 19 +++++++++++++-- 7 files changed, 68 insertions(+), 26 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterExtensionModule.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterExtensionModule.java index 59e784dcfe97..a163d1ad0472 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterExtensionModule.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/guice/BloomFilterExtensionModule.java @@ -40,8 +40,6 @@ public List getJacksonModules() @Override public void configure(Binder binder) { - if (binder != null) { - SqlBindings.addOperatorConversion(binder, BloomFilterOperatorConversion.class); - } + SqlBindings.addOperatorConversion(binder, BloomFilterOperatorConversion.class); } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterAggregator.java index e250c7e41131..279dc6448352 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterAggregator.java @@ -28,9 +28,9 @@ public abstract class BaseBloomFilterAggregator implements Aggregator { protected 
final BloomKFilter collector; - public BaseBloomFilterAggregator(BloomKFilter filter) + public BaseBloomFilterAggregator(BloomKFilter collector) { - this.collector = filter; + this.collector = collector; } @Nullable diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java index 412363eafd33..29e92e8f52bb 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java @@ -19,14 +19,20 @@ package org.apache.druid.query.aggregation.bloom; +import com.google.common.util.concurrent.Striped; import org.apache.druid.query.aggregation.BufferAggregator; import org.apache.druid.query.filter.BloomKFilter; import java.nio.ByteBuffer; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReadWriteLock; public abstract class BaseBloomFilterBufferAggregator implements BufferAggregator { + private static final int NUM_STRIPES = 64; // for locking per buffer position + private final int maxNumEntries; + protected final Striped striped = Striped.readWriteLock(NUM_STRIPES); public BaseBloomFilterBufferAggregator(int maxNumEntries) { @@ -45,11 +51,19 @@ public void init(ByteBuffer buf, int position) @Override public Object get(ByteBuffer buf, int position) { - ByteBuffer mutationBuffer = buf.duplicate(); - mutationBuffer.position(position); - int sizeBytes = 5 + (buf.getInt(position + 1) << 3); - mutationBuffer.limit(position + sizeBytes); - return mutationBuffer.slice(); + int index = (System.identityHashCode(buf) + 31 * position) & 63; + Lock lock = striped.getAt(index).readLock(); + lock.lock(); + try { + ByteBuffer mutationBuffer = 
buf.duplicate(); + mutationBuffer.position(position); + // | k (byte) | numLongs (int) | bitset (long[numLongs]) | + int sizeBytes = 1 + Integer.BYTES + (buf.getInt(position + 1) * Long.BYTES); + mutationBuffer.limit(position + sizeBytes); + return mutationBuffer.slice(); + } finally { + lock.unlock(); + } } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index 5168e4686a28..6615aee50010 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -21,7 +21,6 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; -import com.google.common.primitives.Ints; import org.apache.commons.codec.binary.Base64; import org.apache.druid.guice.BloomFilterSerializersModule; import org.apache.druid.java.util.common.RE; @@ -98,22 +97,22 @@ public BufferAggregator factorizeBuffered(ColumnSelectorFactory columnFactory) @Override public Comparator getComparator() { - return (Comparator) (o1, o2) -> { + return Comparator.nullsFirst((o1, o2) -> { if (o1 instanceof ByteBuffer && o2 instanceof ByteBuffer) { ByteBuffer buf1 = (ByteBuffer) o1; ByteBuffer buf2 = (ByteBuffer) o2; - return Ints.compare( + return Integer.compare( BloomKFilter.getNumSetBits(buf1, buf1.position()), BloomKFilter.getNumSetBits(buf2, buf2.position()) ); } else if (o1 instanceof BloomKFilter && o2 instanceof BloomKFilter) { BloomKFilter o1f = (BloomKFilter) o1; BloomKFilter o2f = (BloomKFilter) o2; - return Ints.compare(o1f.getNumSetBits(), o2f.getNumSetBits()); + return Integer.compare(o1f.getNumSetBits(), o2f.getNumSetBits()); } else 
{ throw new RE("Unable to compare unexpected types [%s]", o1.getClass().getName()); } - }; + }); } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java index 1a50a52f3703..757bac6a39ef 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java @@ -24,6 +24,7 @@ import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector; import java.nio.ByteBuffer; +import java.util.concurrent.locks.Lock; public class BloomFilterBufferAggregator extends BaseBloomFilterBufferAggregator @@ -42,10 +43,17 @@ public BloomFilterBufferAggregator( @Override public void aggregate(ByteBuffer buf, int position) { - final int oldPosition = buf.position(); - buf.position(position); - selectorPlus.getColumnSelectorStrategy().bufferAdd(selectorPlus.getSelector(), buf); - buf.position(oldPosition); + int index = (System.identityHashCode(buf) + 31 * position) & 63; + Lock lock = striped.getAt(index).writeLock(); + lock.lock(); + try { + final int oldPosition = buf.position(); + buf.position(position); + selectorPlus.getColumnSelectorStrategy().bufferAdd(selectorPlus.getSelector(), buf); + buf.position(oldPosition); + } finally { + lock.unlock(); + } } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java index 5fb40712321e..01fe4dc0331d 100644 --- 
a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java @@ -24,6 +24,7 @@ import org.apache.druid.segment.ColumnValueSelector; import java.nio.ByteBuffer; +import java.util.concurrent.locks.Lock; public class BloomFilterMergeBufferAggregator extends BaseBloomFilterBufferAggregator { @@ -38,11 +39,18 @@ public BloomFilterMergeBufferAggregator(ColumnValueSelector selector @Override public void aggregate(ByteBuffer buf, int position) { - final int oldPosition = buf.position(); - buf.position(position); - ByteBuffer other = selector.getObject(); - BloomKFilter.mergeBloomFilterByteBuffers(buf, position, other, other.position()); - buf.position(oldPosition); + int index = (System.identityHashCode(buf) + 31 * position) & 63; + Lock lock = striped.getAt(index).writeLock(); + lock.lock(); + try { + final int oldPosition = buf.position(); + buf.position(position); + ByteBuffer other = selector.getObject(); + BloomKFilter.mergeBloomFilterByteBuffers(buf, position, other, other.position()); + buf.position(oldPosition); + } finally { + lock.unlock(); + } } @Override diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java index f9678c1ad9f5..498a12656c4b 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java @@ -19,15 +19,24 @@ package org.apache.druid.query.aggregation.bloom; +import com.fasterxml.jackson.databind.ObjectMapper; +import 
com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; +import com.google.inject.Guice; +import com.google.inject.Injector; +import com.google.inject.Key; import org.apache.druid.data.input.MapBasedRow; import org.apache.druid.guice.BloomFilterExtensionModule; +import org.apache.druid.guice.annotations.Json; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.guava.Sequence; import org.apache.druid.query.aggregation.AggregationTestHelper; +import org.apache.druid.query.expression.LookupEnabledTestExprMacroTable; import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.query.groupby.GroupByQueryConfig; import org.apache.druid.query.groupby.GroupByQueryRunnerTest; +import org.apache.druid.query.lookup.LookupReferencesManager; +import org.apache.druid.segment.TestHelper; import org.junit.After; import org.junit.Assert; import org.junit.Rule; @@ -44,6 +53,14 @@ @RunWith(Parameterized.class) public class BloomFilterGroupByQueryTest { + private static final BloomFilterExtensionModule module = new BloomFilterExtensionModule(); + + private static final Injector injector = Guice.createInjector( + binder -> { + binder.bind(Key.get(ObjectMapper.class, Json.class)).toInstance(TestHelper.makeJsonMapper()); + }, + module + ); private AggregationTestHelper helper; @Rule @@ -51,8 +68,6 @@ public class BloomFilterGroupByQueryTest public BloomFilterGroupByQueryTest(final GroupByQueryConfig config) { - BloomFilterExtensionModule module = new BloomFilterExtensionModule(); - module.configure(null); helper = AggregationTestHelper.createGroupByQueryAggregationTestHelper( Lists.newArrayList(module.getJacksonModules()), config, From a635a09f7b87c8a56f20c6e4451cafce02d6dd8c Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Wed, 16 Jan 2019 15:17:54 -0800 Subject: [PATCH 23/36] style fixes --- .../bloom/BaseBloomFilterBufferAggregator.java | 3 ++- .../bloom/BloomFilterBufferAggregator.java | 3 
++- .../bloom/BloomFilterMergeBufferAggregator.java | 3 ++- .../bloom/BloomFilterGroupByQueryTest.java | 17 +++++++---------- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java index 29e92e8f52bb..01918175745d 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java @@ -61,7 +61,8 @@ public Object get(ByteBuffer buf, int position) int sizeBytes = 1 + Integer.BYTES + (buf.getInt(position + 1) * Long.BYTES); mutationBuffer.limit(position + sizeBytes); return mutationBuffer.slice(); - } finally { + } + finally { lock.unlock(); } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java index 757bac6a39ef..253b2f475c1c 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java @@ -51,7 +51,8 @@ public void aggregate(ByteBuffer buf, int position) buf.position(position); selectorPlus.getColumnSelectorStrategy().bufferAdd(selectorPlus.getSelector(), buf); buf.position(oldPosition); - } finally { + } + finally { lock.unlock(); } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java index 01fe4dc0331d..9f1c3c2fa533 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java @@ -48,7 +48,8 @@ public void aggregate(ByteBuffer buf, int position) ByteBuffer other = selector.getObject(); BloomKFilter.mergeBloomFilterByteBuffers(buf, position, other, other.position()); buf.position(oldPosition); - } finally { + } + finally { lock.unlock(); } } diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java index 498a12656c4b..36e2e02de967 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java @@ -20,10 +20,8 @@ package org.apache.druid.query.aggregation.bloom; import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; import com.google.inject.Guice; -import com.google.inject.Injector; import com.google.inject.Key; import org.apache.druid.data.input.MapBasedRow; import org.apache.druid.guice.BloomFilterExtensionModule; @@ -31,11 +29,9 @@ import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.guava.Sequence; import org.apache.druid.query.aggregation.AggregationTestHelper; -import org.apache.druid.query.expression.LookupEnabledTestExprMacroTable; import 
org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.query.groupby.GroupByQueryConfig; import org.apache.druid.query.groupby.GroupByQueryRunnerTest; -import org.apache.druid.query.lookup.LookupReferencesManager; import org.apache.druid.segment.TestHelper; import org.junit.After; import org.junit.Assert; @@ -55,12 +51,13 @@ public class BloomFilterGroupByQueryTest { private static final BloomFilterExtensionModule module = new BloomFilterExtensionModule(); - private static final Injector injector = Guice.createInjector( - binder -> { - binder.bind(Key.get(ObjectMapper.class, Json.class)).toInstance(TestHelper.makeJsonMapper()); - }, - module - ); + static { + // throwaway, just using to properly initialize jackson modules + Guice.createInjector( + binder -> binder.bind(Key.get(ObjectMapper.class, Json.class)).toInstance(TestHelper.makeJsonMapper()), + module + ); + } private AggregationTestHelper helper; @Rule From 34183ac66084802bc532005ee002b89b26635ddb Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Wed, 16 Jan 2019 17:14:04 -0800 Subject: [PATCH 24/36] style --- .../query/aggregation/bloom/BloomFilterGroupByQueryTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java index 36e2e02de967..763e0f1d45a9 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java @@ -58,6 +58,7 @@ public class BloomFilterGroupByQueryTest module ); } + private AggregationTestHelper helper; @Rule From daad5a6372df6a5dfd5ff46732af989850f6f509 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 18 Jan 2019 14:48:50 -0800 
Subject: [PATCH 25/36] remove sync for now --- .../BaseBloomFilterBufferAggregator.java | 28 +++++-------------- .../bloom/BloomFilterBufferAggregator.java | 17 +++-------- .../BloomFilterMergeBufferAggregator.java | 19 ++++--------- .../druid/query/filter/BloomKFilter.java | 5 ++-- 4 files changed, 18 insertions(+), 51 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java index 01918175745d..5151d5a26a11 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java @@ -19,21 +19,15 @@ package org.apache.druid.query.aggregation.bloom; -import com.google.common.util.concurrent.Striped; import org.apache.druid.query.aggregation.BufferAggregator; import org.apache.druid.query.filter.BloomKFilter; import java.nio.ByteBuffer; -import java.util.concurrent.locks.Lock; -import java.util.concurrent.locks.ReadWriteLock; public abstract class BaseBloomFilterBufferAggregator implements BufferAggregator { - private static final int NUM_STRIPES = 64; // for locking per buffer position - private final int maxNumEntries; - protected final Striped striped = Striped.readWriteLock(NUM_STRIPES); - + public BaseBloomFilterBufferAggregator(int maxNumEntries) { this.maxNumEntries = maxNumEntries; @@ -51,20 +45,12 @@ public void init(ByteBuffer buf, int position) @Override public Object get(ByteBuffer buf, int position) { - int index = (System.identityHashCode(buf) + 31 * position) & 63; - Lock lock = striped.getAt(index).readLock(); - lock.lock(); - try { - ByteBuffer mutationBuffer = buf.duplicate(); - mutationBuffer.position(position); - // | k (byte) | 
numLongs (int) | bitset (long[numLongs]) | - int sizeBytes = 1 + Integer.BYTES + (buf.getInt(position + 1) * Long.BYTES); - mutationBuffer.limit(position + sizeBytes); - return mutationBuffer.slice(); - } - finally { - lock.unlock(); - } + ByteBuffer mutationBuffer = buf.duplicate(); + mutationBuffer.position(position); + // | k (byte) | numLongs (int) | bitset (long[numLongs]) | + int sizeBytes = 1 + Integer.BYTES + (buf.getInt(position + 1) * Long.BYTES); + mutationBuffer.limit(position + sizeBytes); + return mutationBuffer.slice(); } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java index 253b2f475c1c..1a50a52f3703 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java @@ -24,7 +24,6 @@ import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector; import java.nio.ByteBuffer; -import java.util.concurrent.locks.Lock; public class BloomFilterBufferAggregator extends BaseBloomFilterBufferAggregator @@ -43,18 +42,10 @@ public BloomFilterBufferAggregator( @Override public void aggregate(ByteBuffer buf, int position) { - int index = (System.identityHashCode(buf) + 31 * position) & 63; - Lock lock = striped.getAt(index).writeLock(); - lock.lock(); - try { - final int oldPosition = buf.position(); - buf.position(position); - selectorPlus.getColumnSelectorStrategy().bufferAdd(selectorPlus.getSelector(), buf); - buf.position(oldPosition); - } - finally { - lock.unlock(); - } + final int oldPosition = buf.position(); + buf.position(position); + selectorPlus.getColumnSelectorStrategy().bufferAdd(selectorPlus.getSelector(), buf); + 
buf.position(oldPosition); } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java index 9f1c3c2fa533..5fb40712321e 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java @@ -24,7 +24,6 @@ import org.apache.druid.segment.ColumnValueSelector; import java.nio.ByteBuffer; -import java.util.concurrent.locks.Lock; public class BloomFilterMergeBufferAggregator extends BaseBloomFilterBufferAggregator { @@ -39,19 +38,11 @@ public BloomFilterMergeBufferAggregator(ColumnValueSelector selector @Override public void aggregate(ByteBuffer buf, int position) { - int index = (System.identityHashCode(buf) + 31 * position) & 63; - Lock lock = striped.getAt(index).writeLock(); - lock.lock(); - try { - final int oldPosition = buf.position(); - buf.position(position); - ByteBuffer other = selector.getObject(); - BloomKFilter.mergeBloomFilterByteBuffers(buf, position, other, other.position()); - buf.position(oldPosition); - } - finally { - lock.unlock(); - } + final int oldPosition = buf.position(); + buf.position(position); + ByteBuffer other = selector.getObject(); + BloomKFilter.mergeBloomFilterByteBuffers(buf, position, other, other.position()); + buf.position(oldPosition); } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java index 60a5ba96a88c..98be27d99f9d 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java +++ 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java @@ -19,6 +19,7 @@ package org.apache.druid.query.filter; +import org.apache.druid.java.util.common.IOE; import org.apache.druid.java.util.common.StringUtils; import org.apache.hive.common.util.Murmur3; @@ -282,9 +283,7 @@ public static BloomKFilter deserialize(ByteBuffer in, int position) throws IOExc return new BloomKFilter(data, numHashFunc); } catch (RuntimeException e) { - IOException io = new IOException("Unable to deserialize BloomKFilter"); - io.initCause(e); - throw io; + throw new IOE(e, "Unable to deserialize BloomKFilter"); } } From b310e5293a40e3b8bc2c2053eb1609bfc32e25ad Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 18 Jan 2019 14:51:13 -0800 Subject: [PATCH 26/36] oops --- .../aggregation/bloom/BaseBloomFilterBufferAggregator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java index 5151d5a26a11..1294ea457723 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java @@ -27,7 +27,7 @@ public abstract class BaseBloomFilterBufferAggregator implements BufferAggregator { private final int maxNumEntries; - + public BaseBloomFilterBufferAggregator(int maxNumEntries) { this.maxNumEntries = maxNumEntries; From d6a3809f713650567c3443ca877bf815a4dac7a9 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 18 Jan 2019 15:16:00 -0800 Subject: [PATCH 27/36] consistency --- .../main/java/org/apache/druid/query/filter/BloomKFilter.java | 3 +-- 1 file changed, 1 insertion(+), 2 
deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java index 98be27d99f9d..12533a20c7cd 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/filter/BloomKFilter.java @@ -19,7 +19,6 @@ package org.apache.druid.query.filter; -import org.apache.druid.java.util.common.IOE; import org.apache.druid.java.util.common.StringUtils; import org.apache.hive.common.util.Murmur3; @@ -283,7 +282,7 @@ public static BloomKFilter deserialize(ByteBuffer in, int position) throws IOExc return new BloomKFilter(data, numHashFunc); } catch (RuntimeException e) { - throw new IOE(e, "Unable to deserialize BloomKFilter"); + throw new IOException("Unable to deserialize BloomKFilter", e); } } From d0b90b2a7d85314697d390f31af47460b20ba03d Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Mon, 21 Jan 2019 15:07:49 -0800 Subject: [PATCH 28/36] inspect runtime shape of selector instead of selector plus, static comparator, add inner exception on serde exception --- .../bloom/BloomFilterAggregatorFactory.java | 39 ++++++++++--------- .../bloom/BloomFilterBufferAggregator.java | 2 +- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index 6615aee50010..c55755a98509 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -49,7 +49,25 @@ 
public class BloomFilterAggregatorFactory extends AggregatorFactory { private static final int DEFAULT_NUM_ENTRIES = 1500; - protected static final BloomFilterAggregatorColumnSelectorStrategyFactory STRATEGY_FACTORY = + + private static final Comparator COMPARATOR = Comparator.nullsFirst((o1, o2) -> { + if (o1 instanceof ByteBuffer && o2 instanceof ByteBuffer) { + ByteBuffer buf1 = (ByteBuffer) o1; + ByteBuffer buf2 = (ByteBuffer) o2; + return Integer.compare( + BloomKFilter.getNumSetBits(buf1, buf1.position()), + BloomKFilter.getNumSetBits(buf2, buf2.position()) + ); + } else if (o1 instanceof BloomKFilter && o2 instanceof BloomKFilter) { + BloomKFilter o1f = (BloomKFilter) o1; + BloomKFilter o2f = (BloomKFilter) o2; + return Integer.compare(o1f.getNumSetBits(), o2f.getNumSetBits()); + } else { + throw new RE("Unable to compare unexpected types [%s]", o1.getClass().getName()); + } + }); + + private static final BloomFilterAggregatorColumnSelectorStrategyFactory STRATEGY_FACTORY = new BloomFilterAggregatorColumnSelectorStrategyFactory(); private final String name; @@ -97,22 +115,7 @@ public BufferAggregator factorizeBuffered(ColumnSelectorFactory columnFactory) @Override public Comparator getComparator() { - return Comparator.nullsFirst((o1, o2) -> { - if (o1 instanceof ByteBuffer && o2 instanceof ByteBuffer) { - ByteBuffer buf1 = (ByteBuffer) o1; - ByteBuffer buf2 = (ByteBuffer) o2; - return Integer.compare( - BloomKFilter.getNumSetBits(buf1, buf1.position()), - BloomKFilter.getNumSetBits(buf2, buf2.position()) - ); - } else if (o1 instanceof BloomKFilter && o2 instanceof BloomKFilter) { - BloomKFilter o1f = (BloomKFilter) o1; - BloomKFilter o2f = (BloomKFilter) o2; - return Integer.compare(o1f.getNumSetBits(), o2f.getNumSetBits()); - } else { - throw new RE("Unable to compare unexpected types [%s]", o1.getClass().getName()); - } - }); + return COMPARATOR; } @Override @@ -170,7 +173,7 @@ public Object finalizeComputation(Object object) } } catch (IOException ioe) 
{ - throw new RuntimeException("Failed to deserialize BloomKFilter"); + throw new RuntimeException("Failed to deserialize BloomKFilter", ioe); } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java index 1a50a52f3703..98cd92227cbb 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java @@ -51,6 +51,6 @@ public void aggregate(ByteBuffer buf, int position) @Override public void inspectRuntimeShape(RuntimeShapeInspector inspector) { - inspector.visit("selectorPlus", selectorPlus); + inspector.visit("selector", selectorPlus.getSelector()); } } From 435e784edd0f5e76c34d05fb9600590018e72be4 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Mon, 21 Jan 2019 15:14:34 -0800 Subject: [PATCH 29/36] CardinalityBufferAggregator inspect selectors instead of selectorPluses --- .../aggregation/cardinality/CardinalityBufferAggregator.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/CardinalityBufferAggregator.java b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/CardinalityBufferAggregator.java index abb157738d4d..e5c6fac79807 100644 --- a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/CardinalityBufferAggregator.java +++ b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/CardinalityBufferAggregator.java @@ -20,6 +20,7 @@ package org.apache.druid.query.aggregation.cardinality; import org.apache.druid.hll.HyperLogLogCollector; +import org.apache.druid.java.util.common.StringUtils; import 
org.apache.druid.query.ColumnSelectorPlus; import org.apache.druid.query.aggregation.BufferAggregator; import org.apache.druid.query.aggregation.cardinality.types.CardinalityAggregatorColumnSelectorStrategy; @@ -112,6 +113,8 @@ public void close() @Override public void inspectRuntimeShape(RuntimeShapeInspector inspector) { - inspector.visit("selectorPluses", selectorPluses); + for (int i = 0; i < selectorPluses.length; i++) { + inspector.visit(StringUtils.format("selector-%d", i) , selectorPluses[i].getSelector()); + } } } From 3bdddb13151082554a87f6d74f3c0f5e7e53917e Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Mon, 21 Jan 2019 15:21:52 -0800 Subject: [PATCH 30/36] fix style --- .../aggregation/cardinality/CardinalityBufferAggregator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/CardinalityBufferAggregator.java b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/CardinalityBufferAggregator.java index e5c6fac79807..875cd3d8be82 100644 --- a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/CardinalityBufferAggregator.java +++ b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/CardinalityBufferAggregator.java @@ -114,7 +114,7 @@ public void close() public void inspectRuntimeShape(RuntimeShapeInspector inspector) { for (int i = 0; i < selectorPluses.length; i++) { - inspector.visit(StringUtils.format("selector-%d", i) , selectorPluses[i].getSelector()); + inspector.visit(StringUtils.format("selector-%d", i), selectorPluses[i].getSelector()); } } } From 74feb9750eeca367728a6ee60b677994fe7e18f0 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Thu, 24 Jan 2019 15:33:34 -0800 Subject: [PATCH 31/36] refactor away from using ColumnSelectorPlus and ColumnSelectorStrategyFactory to instead use specialized aggregators for each supported column type, other review comments --- 
.../bloom/BaseBloomFilterAggregator.java | 9 +- .../BaseBloomFilterBufferAggregator.java | 28 ++- .../bloom/BloomFilterAggregatorFactory.java | 82 +++++--- .../bloom/BloomFilterMergeAggregator.java | 15 +- .../BloomFilterMergeAggregatorFactory.java | 5 +- .../BloomFilterMergeBufferAggregator.java | 21 +-- .../aggregation/bloom/BloomFilterSerde.java | 13 +- ....java => DoubleBloomFilterAggregator.java} | 22 +-- ...=> DoubleBloomFilterBufferAggregator.java} | 20 +- .../bloom/FloatBloomFilterAggregator.java | 42 +++++ ... => FloatBloomFilterBufferAggregator.java} | 20 +- .../bloom/LongBloomFilterAggregator.java | 42 +++++ ...a => LongBloomFilterBufferAggregator.java} | 20 +- ...egy.java => NilBloomFilterAggregator.java} | 22 ++- ...va => NilBloomFilterBufferAggregator.java} | 28 +-- .../bloom/StringBloomFilterAggregator.java | 54 ++++++ ...=> StringBloomFilterBufferAggregator.java} | 37 +--- ...gregatorColumnSelectorStrategyFactory.java | 51 ----- .../bloom/BloomFilterAggregatorTest.java | 177 ++++++++++-------- .../bloom/BloomFilterGroupByQueryTest.java | 2 +- 20 files changed, 412 insertions(+), 298 deletions(-) rename extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/{BloomFilterAggregator.java => DoubleBloomFilterAggregator.java} (60%) rename extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/{types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java => DoubleBloomFilterBufferAggregator.java} (62%) create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/FloatBloomFilterAggregator.java rename extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/{types/FloatBloomFilterAggregatorColumnSelectorStrategy.java => FloatBloomFilterBufferAggregator.java} (62%) create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/LongBloomFilterAggregator.java rename 
extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/{types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java => LongBloomFilterBufferAggregator.java} (62%) rename extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/{types/BloomFilterAggregatorColumnSelectorStrategy.java => NilBloomFilterAggregator.java} (66%) rename extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/{BloomFilterBufferAggregator.java => NilBloomFilterBufferAggregator.java} (51%) create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/StringBloomFilterAggregator.java rename extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/{types/StringBloomFilterAggregatorColumnSelectorStrategy.java => StringBloomFilterBufferAggregator.java} (55%) delete mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategyFactory.java diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterAggregator.java index 279dc6448352..652236b7c68a 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterAggregator.java @@ -21,16 +21,19 @@ import org.apache.druid.query.aggregation.Aggregator; import org.apache.druid.query.filter.BloomKFilter; +import org.apache.druid.segment.BaseNullableColumnValueSelector; import javax.annotation.Nullable; -public abstract class BaseBloomFilterAggregator implements Aggregator +public abstract class BaseBloomFilterAggregator implements 
Aggregator { - protected final BloomKFilter collector; + final BloomKFilter collector; + protected final TSelector selector; - public BaseBloomFilterAggregator(BloomKFilter collector) + BaseBloomFilterAggregator(TSelector selector, BloomKFilter collector) { this.collector = collector; + this.selector = selector; } @Nullable diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java index 1294ea457723..74def15c0626 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BaseBloomFilterBufferAggregator.java @@ -21,18 +21,24 @@ import org.apache.druid.query.aggregation.BufferAggregator; import org.apache.druid.query.filter.BloomKFilter; +import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector; +import org.apache.druid.segment.BaseNullableColumnValueSelector; import java.nio.ByteBuffer; -public abstract class BaseBloomFilterBufferAggregator implements BufferAggregator +public abstract class BaseBloomFilterBufferAggregator implements BufferAggregator { - private final int maxNumEntries; + protected final int maxNumEntries; + protected final TSelector selector; - public BaseBloomFilterBufferAggregator(int maxNumEntries) + BaseBloomFilterBufferAggregator(TSelector selector, int maxNumEntries) { + this.selector = selector; this.maxNumEntries = maxNumEntries; } + abstract void bufferAdd(ByteBuffer buf); + @Override public void init(ByteBuffer buf, int position) { @@ -42,6 +48,16 @@ public void init(ByteBuffer buf, int position) BloomKFilter.serialize(mutationBuffer, filter); } + @Override + public void aggregate(ByteBuffer buf, int position) + { + final int oldPosition = 
buf.position(); + buf.position(position); + bufferAdd(buf); + buf.position(oldPosition); + } + + @Override public Object get(ByteBuffer buf, int position) { @@ -76,4 +92,10 @@ public void close() { // nothing to close } + + @Override + public void inspectRuntimeShape(RuntimeShapeInspector inspector) + { + inspector.visit("selector", selector); + } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index c55755a98509..9ffcdc4ac5e2 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -23,20 +23,21 @@ import com.fasterxml.jackson.annotation.JsonProperty; import org.apache.commons.codec.binary.Base64; import org.apache.druid.guice.BloomFilterSerializersModule; +import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.RE; import org.apache.druid.java.util.common.StringUtils; -import org.apache.druid.query.ColumnSelectorPlus; import org.apache.druid.query.aggregation.Aggregator; import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.AggregatorUtil; import org.apache.druid.query.aggregation.BufferAggregator; -import org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategy; -import org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategyFactory; import org.apache.druid.query.cache.CacheKeyBuilder; import org.apache.druid.query.dimension.DimensionSpec; import org.apache.druid.query.filter.BloomKFilter; +import org.apache.druid.segment.BaseNullableColumnValueSelector; import 
org.apache.druid.segment.ColumnSelectorFactory; -import org.apache.druid.segment.DimensionHandlerUtils; +import org.apache.druid.segment.NilColumnValueSelector; +import org.apache.druid.segment.column.ColumnCapabilities; +import org.apache.druid.segment.column.ValueType; import javax.annotation.Nullable; import java.io.IOException; @@ -67,9 +68,6 @@ public class BloomFilterAggregatorFactory extends AggregatorFactory } }); - private static final BloomFilterAggregatorColumnSelectorStrategyFactory STRATEGY_FACTORY = - new BloomFilterAggregatorColumnSelectorStrategyFactory(); - private final String name; private final DimensionSpec field; private final int maxNumEntries; @@ -89,27 +87,69 @@ public BloomFilterAggregatorFactory( @Override public Aggregator factorize(ColumnSelectorFactory columnFactory) { - ColumnSelectorPlus selectorPlus = - DimensionHandlerUtils.createColumnSelectorPlus( - STRATEGY_FACTORY, - field, - columnFactory - ); + BloomKFilter filter = new BloomKFilter(maxNumEntries); + ColumnCapabilities capabilities = columnFactory.getColumnCapabilities(field.getDimension()); + BaseNullableColumnValueSelector selector = columnFactory.makeColumnValueSelector(field.getDimension()); - return new BloomFilterAggregator(selectorPlus, maxNumEntries); + if (selector instanceof NilColumnValueSelector) { + return new NilBloomFilterAggregator((NilColumnValueSelector) selector, filter); + } + if (capabilities == null) { + throw new IAE( + "Cannot create bloom filter buffer aggregator for column selector type [%s]", + selector.getClass().getName() + ); + } + ValueType type = capabilities.getType(); + switch (type) { + case STRING: + return new StringBloomFilterAggregator(columnFactory.makeDimensionSelector(field), filter); + case LONG: + return new LongBloomFilterAggregator(columnFactory.makeColumnValueSelector(field.getDimension()), filter); + case FLOAT: + return new FloatBloomFilterAggregator(columnFactory.makeColumnValueSelector(field.getDimension()), filter); + case 
DOUBLE: + return new DoubleBloomFilterAggregator(columnFactory.makeColumnValueSelector(field.getDimension()), filter); + default: + throw new IAE("Cannot create bloom filter aggregator for invalid column type [%s]", type); + } } @Override public BufferAggregator factorizeBuffered(ColumnSelectorFactory columnFactory) { - ColumnSelectorPlus selectorPlus = - DimensionHandlerUtils.createColumnSelectorPlus( - STRATEGY_FACTORY, - field, - columnFactory - ); + ColumnCapabilities capabilities = columnFactory.getColumnCapabilities(field.getDimension()); + BaseNullableColumnValueSelector selector = columnFactory.makeColumnValueSelector(field.getDimension()); - return new BloomFilterBufferAggregator(selectorPlus, maxNumEntries); + if (selector instanceof NilColumnValueSelector) { + return new NilBloomFilterBufferAggregator((NilColumnValueSelector) selector, maxNumEntries); + } + if (capabilities == null) { + throw new IAE( + "Cannot create bloom filter buffer aggregator for column selector type [%s]", + selector.getClass().getName() + ); + } + + ValueType type = capabilities.getType(); + switch (type) { + case STRING: + return new StringBloomFilterBufferAggregator(columnFactory.makeDimensionSelector(field), maxNumEntries); + case LONG: + return new LongBloomFilterBufferAggregator( + columnFactory.makeColumnValueSelector(field.getDimension()), maxNumEntries + ); + case FLOAT: + return new FloatBloomFilterBufferAggregator( + columnFactory.makeColumnValueSelector(field.getDimension()), maxNumEntries + ); + case DOUBLE: + return new DoubleBloomFilterBufferAggregator( + columnFactory.makeColumnValueSelector(field.getDimension()), maxNumEntries + ); + default: + throw new IAE("Cannot create bloom filter buffer aggregator for invalid column type [%s]", type); + } } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java index a0cdbcea53ff..67d7a70cb418 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregator.java @@ -25,14 +25,11 @@ import java.io.IOException; import java.nio.ByteBuffer; -public class BloomFilterMergeAggregator extends BaseBloomFilterAggregator +public final class BloomFilterMergeAggregator extends BaseBloomFilterAggregator> { - private final ColumnValueSelector selector; - - public BloomFilterMergeAggregator(ColumnValueSelector selector, int maxNumEntries) + public BloomFilterMergeAggregator(ColumnValueSelector selector, BloomKFilter collector) { - super(new BloomKFilter(maxNumEntries)); - this.selector = selector; + super(selector, collector); } @Override @@ -44,9 +41,9 @@ public void aggregate() collector.merge((BloomKFilter) other); } else if (other instanceof ByteBuffer) { // fun fact: because bloom filter agg factory deserialize returns a byte buffer to avoid unnecessary serde, - // but group by v1 ends up trying to merge bytebuffers from buffer aggs with this agg instead of the buffer - // merge agg. fun! Also, it requires a 'ComplexMetricSerde' to be registered even for query time only aggs, but - // then never uses it. also fun! + // but GroupByQueryEngine (group by v1) ends up trying to merge ByteBuffers from buffer aggs with this agg + // instead of the BloomFilterBufferMergeAggregator. fun! Also, it requires a 'ComplexMetricSerde' to be + // registered even for query time only aggs, but then never uses it. also fun! 
try { BloomKFilter otherFilter = BloomKFilter.deserialize((ByteBuffer) other); collector.merge(otherFilter); diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java index 73cf78c8efcf..83c0729b0793 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java @@ -26,6 +26,7 @@ import org.apache.druid.query.aggregation.BufferAggregator; import org.apache.druid.query.cache.CacheKeyBuilder; import org.apache.druid.query.filter.BloomKFilter; +import org.apache.druid.segment.BaseNullableColumnValueSelector; import org.apache.druid.segment.ColumnSelectorFactory; import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.segment.NilColumnValueSelector; @@ -47,11 +48,11 @@ public class BloomFilterMergeAggregatorFactory extends BloomFilterAggregatorFact @Override public Aggregator factorize(final ColumnSelectorFactory metricFactory) { - final ColumnValueSelector selector = metricFactory.makeColumnValueSelector(fieldName); + final BaseNullableColumnValueSelector selector = metricFactory.makeColumnValueSelector(fieldName); if (selector instanceof NilColumnValueSelector) { throw new ISE("WTF?! 
Unexpected NilColumnValueSelector"); } - return new BloomFilterMergeAggregator(selector, getMaxNumEntries()); + return new BloomFilterMergeAggregator((ColumnValueSelector) selector, new BloomKFilter(getMaxNumEntries())); } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java index 5fb40712321e..026a23e7285d 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeBufferAggregator.java @@ -20,34 +20,21 @@ package org.apache.druid.query.aggregation.bloom; import org.apache.druid.query.filter.BloomKFilter; -import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector; import org.apache.druid.segment.ColumnValueSelector; import java.nio.ByteBuffer; -public class BloomFilterMergeBufferAggregator extends BaseBloomFilterBufferAggregator +public final class BloomFilterMergeBufferAggregator extends BaseBloomFilterBufferAggregator> { - private final ColumnValueSelector selector; - public BloomFilterMergeBufferAggregator(ColumnValueSelector selector, int maxNumEntries) { - super(maxNumEntries); - this.selector = selector; + super(selector, maxNumEntries); } @Override - public void aggregate(ByteBuffer buf, int position) + public void bufferAdd(ByteBuffer buf) { - final int oldPosition = buf.position(); - buf.position(position); ByteBuffer other = selector.getObject(); - BloomKFilter.mergeBloomFilterByteBuffers(buf, position, other, other.position()); - buf.position(oldPosition); - } - - @Override - public void inspectRuntimeShape(RuntimeShapeInspector inspector) - { - inspector.visit("selector", selector); + 
BloomKFilter.mergeBloomFilterByteBuffers(buf, buf.position(), other, other.position()); } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterSerde.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterSerde.java index 6ffcfb8a721e..227fe705ccfc 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterSerde.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterSerde.java @@ -31,8 +31,9 @@ import java.nio.ByteBuffer; /** - * This exists so bloom filter agg has something to register so group by v1 will work, but isn't actually used - * because bloom filter agg is currently query time only + * Dummy {@link ComplexMetricSerde} that exists so {@link BloomFilterAggregatorFactory} has something to register so + * {@link org.apache.druid.query.groupby.GroupByQueryEngine} will work, but isn't actually used because bloom filter + * aggregators are currently only implemented for use at query time */ public class BloomFilterSerde extends ComplexMetricSerde { @@ -45,24 +46,24 @@ public String getTypeName() @Override public ComplexMetricExtractor getExtractor() { - throw new UnsupportedOperationException("How can this be?"); + throw new UnsupportedOperationException("Bloom filter aggregators are query-time only"); } @Override public void deserializeColumn(ByteBuffer byteBuffer, ColumnBuilder columnBuilder) { - throw new UnsupportedOperationException("How can this be?"); + throw new UnsupportedOperationException("Bloom filter aggregators are query-time only"); } @Override public GenericColumnSerializer getSerializer(SegmentWriteOutMedium segmentWriteOutMedium, String column) { - throw new UnsupportedOperationException("How can this be?"); + throw new UnsupportedOperationException("Bloom filter aggregators are query-time only"); } @Override public 
ObjectStrategy getObjectStrategy() { - throw new UnsupportedOperationException("How can this be?"); + throw new UnsupportedOperationException("Bloom filter aggregators are query-time only"); } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/DoubleBloomFilterAggregator.java similarity index 60% rename from extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java rename to extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/DoubleBloomFilterAggregator.java index 7372dd95ec83..dfdae6c20d6e 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/DoubleBloomFilterAggregator.java @@ -19,26 +19,24 @@ package org.apache.druid.query.aggregation.bloom; -import org.apache.druid.query.ColumnSelectorPlus; -import org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategy; +import org.apache.druid.common.config.NullHandling; import org.apache.druid.query.filter.BloomKFilter; +import org.apache.druid.segment.BaseDoubleColumnValueSelector; -public class BloomFilterAggregator extends BaseBloomFilterAggregator +public final class DoubleBloomFilterAggregator extends BaseBloomFilterAggregator { - private final ColumnSelectorPlus selectorPlus; - - public BloomFilterAggregator( - ColumnSelectorPlus selectorPlus, - int maxNumEntries - ) + DoubleBloomFilterAggregator(BaseDoubleColumnValueSelector selector, BloomKFilter collector) { - super(new BloomKFilter(maxNumEntries)); - this.selectorPlus = selectorPlus; + super(selector, collector); } @Override public void aggregate() { - 
selectorPlus.getColumnSelectorStrategy().add(selectorPlus.getSelector(), collector); + if (NullHandling.replaceWithDefault() || !selector.isNull()) { + collector.addDouble(selector.getDouble()); + } else { + collector.addBytes(null, 0, 0); + } } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/DoubleBloomFilterBufferAggregator.java similarity index 62% rename from extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java rename to extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/DoubleBloomFilterBufferAggregator.java index 241712145491..e84b9fc70df4 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/DoubleBloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/DoubleBloomFilterBufferAggregator.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.druid.query.aggregation.bloom.types; +package org.apache.druid.query.aggregation.bloom; import org.apache.druid.common.config.NullHandling; import org.apache.druid.query.filter.BloomKFilter; @@ -25,26 +25,20 @@ import java.nio.ByteBuffer; -public class DoubleBloomFilterAggregatorColumnSelectorStrategy - implements BloomFilterAggregatorColumnSelectorStrategy +public final class DoubleBloomFilterBufferAggregator extends BaseBloomFilterBufferAggregator { - @Override - public void add(BaseDoubleColumnValueSelector selector, BloomKFilter bloomFilter) + DoubleBloomFilterBufferAggregator(BaseDoubleColumnValueSelector selector, int maxNumEntries) { - if (NullHandling.replaceWithDefault() || !selector.isNull()) { - bloomFilter.addDouble(selector.getDouble()); - } else { - bloomFilter.addBytes(null, 0, 0); - } + super(selector, maxNumEntries); } @Override - public void bufferAdd(BaseDoubleColumnValueSelector selector, ByteBuffer buffer) + public void bufferAdd(ByteBuffer buf) { if (NullHandling.replaceWithDefault() || !selector.isNull()) { - BloomKFilter.addDouble(buffer, selector.getDouble()); + BloomKFilter.addDouble(buf, selector.getDouble()); } else { - BloomKFilter.addBytes(buffer, null, 0, 0); + BloomKFilter.addBytes(buf, null, 0, 0); } } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/FloatBloomFilterAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/FloatBloomFilterAggregator.java new file mode 100644 index 000000000000..ae53d165b96e --- /dev/null +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/FloatBloomFilterAggregator.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.aggregation.bloom; + +import org.apache.druid.common.config.NullHandling; +import org.apache.druid.query.filter.BloomKFilter; +import org.apache.druid.segment.BaseFloatColumnValueSelector; + +public final class FloatBloomFilterAggregator extends BaseBloomFilterAggregator +{ + FloatBloomFilterAggregator(BaseFloatColumnValueSelector selector, BloomKFilter collector) + { + super(selector, collector); + } + + @Override + public void aggregate() + { + if (NullHandling.replaceWithDefault() || !selector.isNull()) { + collector.addFloat(selector.getFloat()); + } else { + collector.addBytes(null, 0, 0); + } + } +} diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/FloatBloomFilterBufferAggregator.java similarity index 62% rename from extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java rename to extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/FloatBloomFilterBufferAggregator.java index 7efc95d4a7d6..27e88d48d7a9 100644 --- 
a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/FloatBloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/FloatBloomFilterBufferAggregator.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.druid.query.aggregation.bloom.types; +package org.apache.druid.query.aggregation.bloom; import org.apache.druid.common.config.NullHandling; import org.apache.druid.query.filter.BloomKFilter; @@ -25,26 +25,20 @@ import java.nio.ByteBuffer; -public class FloatBloomFilterAggregatorColumnSelectorStrategy - implements BloomFilterAggregatorColumnSelectorStrategy +public final class FloatBloomFilterBufferAggregator extends BaseBloomFilterBufferAggregator { - @Override - public void add(BaseFloatColumnValueSelector selector, BloomKFilter bloomFilter) + FloatBloomFilterBufferAggregator(BaseFloatColumnValueSelector selector, int maxNumEntries) { - if (NullHandling.replaceWithDefault() || !selector.isNull()) { - bloomFilter.addFloat(selector.getFloat()); - } else { - bloomFilter.addBytes(null, 0, 0); - } + super(selector, maxNumEntries); } @Override - public void bufferAdd(BaseFloatColumnValueSelector selector, ByteBuffer buffer) + public void bufferAdd(ByteBuffer buf) { if (NullHandling.replaceWithDefault() || !selector.isNull()) { - BloomKFilter.addFloat(buffer, selector.getFloat()); + BloomKFilter.addFloat(buf, selector.getFloat()); } else { - BloomKFilter.addBytes(buffer, null, 0, 0); + BloomKFilter.addBytes(buf, null, 0, 0); } } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/LongBloomFilterAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/LongBloomFilterAggregator.java new file mode 100644 index 000000000000..caa47397df11 --- /dev/null +++ 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/LongBloomFilterAggregator.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.aggregation.bloom; + +import org.apache.druid.common.config.NullHandling; +import org.apache.druid.query.filter.BloomKFilter; +import org.apache.druid.segment.BaseLongColumnValueSelector; + +public final class LongBloomFilterAggregator extends BaseBloomFilterAggregator +{ + LongBloomFilterAggregator(BaseLongColumnValueSelector selector, BloomKFilter collector) + { + super(selector, collector); + } + + @Override + public void aggregate() + { + if (NullHandling.replaceWithDefault() || !selector.isNull()) { + collector.addLong(selector.getLong()); + } else { + collector.addBytes(null, 0, 0); + } + } +} diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/LongBloomFilterBufferAggregator.java similarity index 62% rename from 
extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java rename to extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/LongBloomFilterBufferAggregator.java index 2cc5de9c3113..13a6634cda10 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/LongBloomFilterAggregatorColumnValueSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/LongBloomFilterBufferAggregator.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.druid.query.aggregation.bloom.types; +package org.apache.druid.query.aggregation.bloom; import org.apache.druid.common.config.NullHandling; import org.apache.druid.query.filter.BloomKFilter; @@ -25,26 +25,20 @@ import java.nio.ByteBuffer; -public class LongBloomFilterAggregatorColumnValueSelectorStrategy - implements BloomFilterAggregatorColumnSelectorStrategy +public final class LongBloomFilterBufferAggregator extends BaseBloomFilterBufferAggregator { - @Override - public void add(BaseLongColumnValueSelector selector, BloomKFilter bloomFilter) + LongBloomFilterBufferAggregator(BaseLongColumnValueSelector selector, int maxNumEntries) { - if (NullHandling.replaceWithDefault() || !selector.isNull()) { - bloomFilter.addLong(selector.getLong()); - } else { - bloomFilter.addBytes(null, 0, 0); - } + super(selector, maxNumEntries); } @Override - public void bufferAdd(BaseLongColumnValueSelector selector, ByteBuffer buffer) + public void bufferAdd(ByteBuffer buf) { if (NullHandling.replaceWithDefault() || !selector.isNull()) { - BloomKFilter.addLong(buffer, selector.getLong()); + BloomKFilter.addLong(buf, selector.getLong()); } else { - BloomKFilter.addBytes(buffer, null, 0, 0); + BloomKFilter.addBytes(buf, null, 0, 0); } } } diff --git 
a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/NilBloomFilterAggregator.java similarity index 66% rename from extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java rename to extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/NilBloomFilterAggregator.java index c908f52ce6c0..7748ff3bbeee 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/NilBloomFilterAggregator.java @@ -17,19 +17,21 @@ * under the License. */ -package org.apache.druid.query.aggregation.bloom.types; +package org.apache.druid.query.aggregation.bloom; -import org.apache.druid.query.dimension.ColumnSelectorStrategy; import org.apache.druid.query.filter.BloomKFilter; +import org.apache.druid.segment.NilColumnValueSelector; -import java.nio.ByteBuffer; - -public interface BloomFilterAggregatorColumnSelectorStrategy extends ColumnSelectorStrategy +public final class NilBloomFilterAggregator extends BaseBloomFilterAggregator { - /** - * Add column value to bloomK filter - */ - void add(TValueSelector selector, BloomKFilter bloomFilter); + NilBloomFilterAggregator(NilColumnValueSelector selector, BloomKFilter collector) + { + super(selector, collector); + } - void bufferAdd(TValueSelector selector, ByteBuffer buffer); + @Override + public void aggregate() + { + // nothing to do + } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/NilBloomFilterBufferAggregator.java similarity index 51% rename from extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java rename to extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/NilBloomFilterBufferAggregator.java index 98cd92227cbb..a2df50fa65ed 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/NilBloomFilterBufferAggregator.java @@ -19,38 +19,26 @@ package org.apache.druid.query.aggregation.bloom; -import org.apache.druid.query.ColumnSelectorPlus; -import org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategy; -import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector; +import org.apache.druid.segment.NilColumnValueSelector; import java.nio.ByteBuffer; - -public class BloomFilterBufferAggregator extends BaseBloomFilterBufferAggregator +public final class NilBloomFilterBufferAggregator extends BaseBloomFilterBufferAggregator { - private final ColumnSelectorPlus selectorPlus; - - public BloomFilterBufferAggregator( - ColumnSelectorPlus selectorPlus, - int maxNumEntries - ) + NilBloomFilterBufferAggregator(NilColumnValueSelector selector, int maxNumEntries) { - super(maxNumEntries); - this.selectorPlus = selectorPlus; + super(selector, maxNumEntries); } @Override - public void aggregate(ByteBuffer buf, int position) + public void bufferAdd(ByteBuffer buf) { - final int oldPosition = buf.position(); - buf.position(position); - selectorPlus.getColumnSelectorStrategy().bufferAdd(selectorPlus.getSelector(), buf); - buf.position(oldPosition); + // nothing to do } @Override - public void inspectRuntimeShape(RuntimeShapeInspector inspector) 
+ public void aggregate(ByteBuffer buf, int position) { - inspector.visit("selector", selectorPlus.getSelector()); + // nothing to do } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/StringBloomFilterAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/StringBloomFilterAggregator.java new file mode 100644 index 000000000000..351ef8413a92 --- /dev/null +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/StringBloomFilterAggregator.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.query.aggregation.bloom; + +import org.apache.druid.query.filter.BloomKFilter; +import org.apache.druid.segment.DimensionSelector; + +public final class StringBloomFilterAggregator extends BaseBloomFilterAggregator +{ + StringBloomFilterAggregator(DimensionSelector selector, BloomKFilter collector) + { + super(selector, collector); + } + + @Override + public void aggregate() + { + // note: there might be room for optimization here but behavior must match BloomDimFilter implementation + if (selector.getRow().size() > 1) { + selector.getRow().forEach(v -> { + String value = selector.lookupName(v); + if (value == null) { + collector.addBytes(null, 0, 0); + } else { + collector.addString(value); + } + }); + } else { + String value = (String) selector.getObject(); + if (value == null) { + collector.addBytes(null, 0, 0); + } else { + collector.addString(value); + } + } + } +} diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/StringBloomFilterBufferAggregator.java similarity index 55% rename from extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java rename to extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/StringBloomFilterBufferAggregator.java index db714fdc11dc..c7c17c940e0d 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/StringBloomFilterAggregatorColumnSelectorStrategy.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/StringBloomFilterBufferAggregator.java @@ -17,56 +17,39 @@ * under the License. 
*/ -package org.apache.druid.query.aggregation.bloom.types; +package org.apache.druid.query.aggregation.bloom; import org.apache.druid.query.filter.BloomKFilter; import org.apache.druid.segment.DimensionSelector; import java.nio.ByteBuffer; -public class StringBloomFilterAggregatorColumnSelectorStrategy - implements BloomFilterAggregatorColumnSelectorStrategy +public final class StringBloomFilterBufferAggregator extends BaseBloomFilterBufferAggregator { - @Override - public void add(DimensionSelector selector, BloomKFilter bloomFilter) + + StringBloomFilterBufferAggregator(DimensionSelector selector, int maxNumEntries) { - if (selector.getRow().size() > 1) { - selector.getRow().forEach(v -> { - String value = selector.lookupName(v); - if (value == null) { - bloomFilter.addBytes(null, 0, 0); - } else { - bloomFilter.addString(value); - } - }); - } else { - String value = (String) selector.getObject(); - if (value == null) { - bloomFilter.addBytes(null, 0, 0); - } else { - bloomFilter.addString(value); - } - } + super(selector, maxNumEntries); } @Override - public void bufferAdd(DimensionSelector selector, ByteBuffer buffer) + public void bufferAdd(ByteBuffer buf) { if (selector.getRow().size() > 1) { selector.getRow().forEach(v -> { String value = selector.lookupName(v); if (value == null) { - BloomKFilter.addBytes(buffer, null, 0, 0); + BloomKFilter.addBytes(buf, null, 0, 0); } else { - BloomKFilter.addString(buffer, value); + BloomKFilter.addString(buf, value); } }); } else { String value = (String) selector.getObject(); if (value == null) { - BloomKFilter.addBytes(buffer, null, 0, 0); + BloomKFilter.addBytes(buf, null, 0, 0); } else { - BloomKFilter.addString(buffer, value); + BloomKFilter.addString(buf, value); } } } diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategyFactory.java 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategyFactory.java deleted file mode 100644 index 65197271c45f..000000000000 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/types/BloomFilterAggregatorColumnSelectorStrategyFactory.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.druid.query.aggregation.bloom.types; - -import org.apache.druid.java.util.common.IAE; -import org.apache.druid.query.dimension.ColumnSelectorStrategyFactory; -import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.column.ColumnCapabilities; -import org.apache.druid.segment.column.ValueType; - -public class BloomFilterAggregatorColumnSelectorStrategyFactory - implements ColumnSelectorStrategyFactory -{ - @Override - public BloomFilterAggregatorColumnSelectorStrategy makeColumnSelectorStrategy( - ColumnCapabilities capabilities, - ColumnValueSelector selector - ) - { - ValueType type = capabilities.getType(); - switch (type) { - case STRING: - return new StringBloomFilterAggregatorColumnSelectorStrategy(); - case LONG: - return new LongBloomFilterAggregatorColumnValueSelectorStrategy(); - case FLOAT: - return new FloatBloomFilterAggregatorColumnSelectorStrategy(); - case DOUBLE: - return new DoubleBloomFilterAggregatorColumnSelectorStrategy(); - default: - throw new IAE("Cannot create query type helper from invalid type [%s]", type); - } - } -} diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java index cd8a05c3c7b8..57ec520247db 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java @@ -28,15 +28,9 @@ import org.apache.druid.guice.BloomFilterExtensionModule; import org.apache.druid.guice.BloomFilterSerializersModule; import org.apache.druid.jackson.DefaultObjectMapper; -import org.apache.druid.query.ColumnSelectorPlus; import org.apache.druid.query.aggregation.Aggregator; import 
org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.BufferAggregator; -import org.apache.druid.query.aggregation.bloom.types.BloomFilterAggregatorColumnSelectorStrategy; -import org.apache.druid.query.aggregation.bloom.types.DoubleBloomFilterAggregatorColumnSelectorStrategy; -import org.apache.druid.query.aggregation.bloom.types.FloatBloomFilterAggregatorColumnSelectorStrategy; -import org.apache.druid.query.aggregation.bloom.types.LongBloomFilterAggregatorColumnValueSelectorStrategy; -import org.apache.druid.query.aggregation.bloom.types.StringBloomFilterAggregatorColumnSelectorStrategy; import org.apache.druid.query.aggregation.cardinality.CardinalityAggregatorTest; import org.apache.druid.query.dimension.DefaultDimensionSpec; import org.apache.druid.query.dimension.DimensionSpec; @@ -246,15 +240,7 @@ public BloomFilterAggregatorTest() public void testAggregateValues() throws IOException { DimensionSelector dimSelector = new CardinalityAggregatorTest.TestDimensionSelector(values1, null); - BloomFilterAggregator agg = new BloomFilterAggregator( - new ColumnSelectorPlus<>( - dimSpec.getDimension(), - dimSpec.getOutputName(), - new StringBloomFilterAggregatorColumnSelectorStrategy(), - dimSelector - ), - maxNumValues - ); + StringBloomFilterAggregator agg = new StringBloomFilterAggregator(dimSelector, new BloomKFilter(maxNumValues)); for (int i = 0; i < values1.size(); ++i) { aggregateDimension(Collections.singletonList(dimSelector), agg); @@ -269,15 +255,7 @@ public void testAggregateValues() throws IOException public void testAggregateLongValues() throws IOException { TestLongColumnSelector selector = new TestLongColumnSelector(Arrays.asList(longValues1)); - BloomFilterAggregator agg = new BloomFilterAggregator( - new ColumnSelectorPlus<>( - "longColumn", - "longlongman", - new LongBloomFilterAggregatorColumnValueSelectorStrategy(), - selector - ), - maxNumValues - ); + LongBloomFilterAggregator agg = new 
LongBloomFilterAggregator(selector, new BloomKFilter(maxNumValues)); for (Long ignored : longValues1) { aggregateColumn(Collections.singletonList(selector), agg); @@ -292,15 +270,7 @@ public void testAggregateLongValues() throws IOException public void testAggregateFloatValues() throws IOException { TestFloatColumnSelector selector = new TestFloatColumnSelector(Arrays.asList(floatValues1)); - BloomFilterAggregator agg = new BloomFilterAggregator( - new ColumnSelectorPlus<>( - "floatColumn", - "floatColumn", - new FloatBloomFilterAggregatorColumnSelectorStrategy(), - selector - ), - maxNumValues - ); + FloatBloomFilterAggregator agg = new FloatBloomFilterAggregator(selector, new BloomKFilter(maxNumValues)); for (Float ignored : floatValues1) { aggregateColumn(Collections.singletonList(selector), agg); @@ -315,15 +285,7 @@ public void testAggregateFloatValues() throws IOException public void testAggregateDoubleValues() throws IOException { TestDoubleColumnSelector selector = new TestDoubleColumnSelector(Arrays.asList(doubleValues1)); - BloomFilterAggregator agg = new BloomFilterAggregator( - new ColumnSelectorPlus<>( - "doubleColumn", - "doubleColumn", - new DoubleBloomFilterAggregatorColumnSelectorStrategy(), - selector - ), - maxNumValues - ); + DoubleBloomFilterAggregator agg = new DoubleBloomFilterAggregator(selector, new BloomKFilter(maxNumValues)); for (Double ignored : doubleValues1) { aggregateColumn(Collections.singletonList(selector), agg); @@ -335,18 +297,10 @@ public void testAggregateDoubleValues() throws IOException } @Test - public void testBufferAggregateValues() throws IOException + public void testBufferAggregateStringValues() throws IOException { DimensionSelector dimSelector = new CardinalityAggregatorTest.TestDimensionSelector(values2, null); - BloomFilterBufferAggregator agg = new BloomFilterBufferAggregator( - new ColumnSelectorPlus<>( - dimSpec.getDimension(), - dimSpec.getOutputName(), - new 
StringBloomFilterAggregatorColumnSelectorStrategy(), - dimSelector - ), - maxNumValues - ); + StringBloomFilterBufferAggregator agg = new StringBloomFilterBufferAggregator(dimSelector, maxNumValues); int maxSize = valueAggregatorFactory.getMaxIntermediateSizeWithNulls(); ByteBuffer buf = ByteBuffer.allocate(maxSize + 64); @@ -363,34 +317,77 @@ public void testBufferAggregateValues() throws IOException Assert.assertEquals(serializedFilter2, serialized); } + @Test + public void testBufferAggregateLongValues() throws IOException + { + TestLongColumnSelector selector = new TestLongColumnSelector(Arrays.asList(longValues1)); + LongBloomFilterBufferAggregator agg = new LongBloomFilterBufferAggregator(selector, maxNumValues); + + int maxSize = valueAggregatorFactory.getMaxIntermediateSizeWithNulls(); + ByteBuffer buf = ByteBuffer.allocate(maxSize + 64); + int pos = 10; + buf.limit(pos + maxSize); + + agg.init(buf, pos); + + for (int i = 0; i < longValues1.length; ++i) { + bufferAggregateColumn(Collections.singletonList(selector), agg, buf, pos); + } + BloomKFilter bloomKFilter = (BloomKFilter) valueAggregatorFactory.finalizeComputation(agg.get(buf, pos)); + String serialized = filterToString(bloomKFilter); + Assert.assertEquals(serializedLongFilter, serialized); + } + + @Test + public void testBufferAggregateFloatValues() throws IOException + { + TestFloatColumnSelector selector = new TestFloatColumnSelector(Arrays.asList(floatValues1)); + FloatBloomFilterBufferAggregator agg = new FloatBloomFilterBufferAggregator(selector, maxNumValues); + + int maxSize = valueAggregatorFactory.getMaxIntermediateSizeWithNulls(); + ByteBuffer buf = ByteBuffer.allocate(maxSize + 64); + int pos = 10; + buf.limit(pos + maxSize); + + agg.init(buf, pos); + + for (int i = 0; i < floatValues1.length; ++i) { + bufferAggregateColumn(Collections.singletonList(selector), agg, buf, pos); + } + BloomKFilter bloomKFilter = (BloomKFilter) valueAggregatorFactory.finalizeComputation(agg.get(buf, pos)); + 
String serialized = filterToString(bloomKFilter); + Assert.assertEquals(serializedFloatFilter, serialized); + } + + @Test + public void testBufferAggregateDoubleValues() throws IOException + { + TestDoubleColumnSelector selector = new TestDoubleColumnSelector(Arrays.asList(doubleValues1)); + DoubleBloomFilterBufferAggregator agg = new DoubleBloomFilterBufferAggregator(selector, maxNumValues); + + int maxSize = valueAggregatorFactory.getMaxIntermediateSizeWithNulls(); + ByteBuffer buf = ByteBuffer.allocate(maxSize + 64); + int pos = 10; + buf.limit(pos + maxSize); + + agg.init(buf, pos); + + for (int i = 0; i < doubleValues1.length; ++i) { + bufferAggregateColumn(Collections.singletonList(selector), agg, buf, pos); + } + BloomKFilter bloomKFilter = (BloomKFilter) valueAggregatorFactory.finalizeComputation(agg.get(buf, pos)); + String serialized = filterToString(bloomKFilter); + Assert.assertEquals(serializedDoubleFilter, serialized); + } + @Test public void testCombineValues() throws IOException { DimensionSelector dimSelector1 = new CardinalityAggregatorTest.TestDimensionSelector(values1, null); DimensionSelector dimSelector2 = new CardinalityAggregatorTest.TestDimensionSelector(values2, null); - ColumnSelectorPlus selector1 = new ColumnSelectorPlus<>( - dimSpec.getDimension(), - dimSpec.getOutputName(), - new StringBloomFilterAggregatorColumnSelectorStrategy(), - dimSelector1 - ); - - ColumnSelectorPlus selector2 = new ColumnSelectorPlus<>( - dimSpec.getDimension(), - dimSpec.getOutputName(), - new StringBloomFilterAggregatorColumnSelectorStrategy(), - dimSelector2 - ); - - BloomFilterAggregator agg1 = new BloomFilterAggregator( - selector1, - maxNumValues - ); - BloomFilterAggregator agg2 = new BloomFilterAggregator( - selector2, - maxNumValues - ); + StringBloomFilterAggregator agg1 = new StringBloomFilterAggregator(dimSelector1, new BloomKFilter(maxNumValues)); + StringBloomFilterAggregator agg2 = new StringBloomFilterAggregator(dimSelector2, new 
BloomKFilter(maxNumValues)); for (int i = 0; i < values1.size(); ++i) { aggregateDimension(Collections.singletonList(dimSelector1), agg1); @@ -416,7 +413,33 @@ public void testMergeValues() throws IOException final TestBloomFilterColumnSelector mergeDim = new TestBloomFilterColumnSelector(ImmutableList.of(filter1, filter2)); - BloomFilterMergeAggregator mergeAggregator = new BloomFilterMergeAggregator(mergeDim, maxNumValues); + BloomFilterMergeAggregator mergeAggregator = + new BloomFilterMergeAggregator(mergeDim, new BloomKFilter(maxNumValues)); + + for (int i = 0; i < 2; ++i) { + aggregateColumn(Collections.singletonList(mergeDim), mergeAggregator); + } + + + BloomKFilter merged = (BloomKFilter) valueAggregatorFactory.getCombiningFactory() + .finalizeComputation(mergeAggregator.get()); + String serialized = filterToString(merged); + Assert.assertEquals(serializedCombinedFilter, serialized); + } + + @Test + public void testMergeValuesWithBuffersForGroupByV1() throws IOException + { + final TestBloomFilterColumnSelector mergeDim = + new TestBloomFilterColumnSelector( + ImmutableList.of( + ByteBuffer.wrap(BloomFilterSerializersModule.bloomKFilterToBytes(filter1)), + ByteBuffer.wrap(BloomFilterSerializersModule.bloomKFilterToBytes(filter2)) + ) + ); + + BloomFilterMergeAggregator mergeAggregator = + new BloomFilterMergeAggregator(mergeDim, new BloomKFilter(maxNumValues)); for (int i = 0; i < 2; ++i) { aggregateColumn(Collections.singletonList(mergeDim), mergeAggregator); @@ -575,9 +598,9 @@ public boolean isNull() } } - public static class TestBloomFilterColumnSelector extends SteppableSelector + public static class TestBloomFilterColumnSelector extends SteppableSelector { - public TestBloomFilterColumnSelector(List values) + public TestBloomFilterColumnSelector(List values) { super(values); } diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java 
b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java index 763e0f1d45a9..a2207f2e6423 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterGroupByQueryTest.java @@ -130,8 +130,8 @@ public void testQueryFakeDimension() throws Exception MapBasedRow row = ingestAndQuery(query); + // a nil column results in a totally empty bloom filter BloomKFilter filter = new BloomKFilter(1500); - filter.addBytes(null, 0, 0); Object val = row.getRaw("blooming_quality"); From 3136ce73e169fafdd49da649874386ae981c8380 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Thu, 24 Jan 2019 15:42:08 -0800 Subject: [PATCH 32/36] adjustment --- .../bloom/BloomFilterAggregatorFactory.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index 9ffcdc4ac5e2..075e8a511ede 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -89,12 +89,12 @@ public Aggregator factorize(ColumnSelectorFactory columnFactory) { BloomKFilter filter = new BloomKFilter(maxNumEntries); ColumnCapabilities capabilities = columnFactory.getColumnCapabilities(field.getDimension()); - BaseNullableColumnValueSelector selector = columnFactory.makeColumnValueSelector(field.getDimension()); - if (selector instanceof NilColumnValueSelector) { - return new 
NilBloomFilterAggregator((NilColumnValueSelector) selector, filter); - } if (capabilities == null) { + BaseNullableColumnValueSelector selector = columnFactory.makeColumnValueSelector(field.getDimension()); + if (selector instanceof NilColumnValueSelector) { + return new NilBloomFilterAggregator((NilColumnValueSelector) selector, filter); + } throw new IAE( "Cannot create bloom filter buffer aggregator for column selector type [%s]", selector.getClass().getName() @@ -119,12 +119,12 @@ public Aggregator factorize(ColumnSelectorFactory columnFactory) public BufferAggregator factorizeBuffered(ColumnSelectorFactory columnFactory) { ColumnCapabilities capabilities = columnFactory.getColumnCapabilities(field.getDimension()); - BaseNullableColumnValueSelector selector = columnFactory.makeColumnValueSelector(field.getDimension()); - if (selector instanceof NilColumnValueSelector) { - return new NilBloomFilterBufferAggregator((NilColumnValueSelector) selector, maxNumEntries); - } if (capabilities == null) { + BaseNullableColumnValueSelector selector = columnFactory.makeColumnValueSelector(field.getDimension()); + if (selector instanceof NilColumnValueSelector) { + return new NilBloomFilterBufferAggregator((NilColumnValueSelector) selector, maxNumEntries); + } throw new IAE( "Cannot create bloom filter buffer aggregator for column selector type [%s]", selector.getClass().getName() From a50b2b2d9acc66bc36754fc7a2d90aa394c9c457 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Thu, 24 Jan 2019 18:38:29 -0800 Subject: [PATCH 33/36] fix teamcity error? 
--- .../bloom/BloomFilterAggregatorTest.java | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java index 57ec520247db..99b1075cb3eb 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java @@ -53,6 +53,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.stream.IntStream; public class BloomFilterAggregatorTest { @@ -330,9 +331,8 @@ public void testBufferAggregateLongValues() throws IOException agg.init(buf, pos); - for (int i = 0; i < longValues1.length; ++i) { - bufferAggregateColumn(Collections.singletonList(selector), agg, buf, pos); - } + IntStream.range(0, longValues1.length) + .forEach(i -> bufferAggregateColumn(Collections.singletonList(selector), agg, buf, pos)); BloomKFilter bloomKFilter = (BloomKFilter) valueAggregatorFactory.finalizeComputation(agg.get(buf, pos)); String serialized = filterToString(bloomKFilter); Assert.assertEquals(serializedLongFilter, serialized); @@ -351,9 +351,8 @@ public void testBufferAggregateFloatValues() throws IOException agg.init(buf, pos); - for (int i = 0; i < floatValues1.length; ++i) { - bufferAggregateColumn(Collections.singletonList(selector), agg, buf, pos); - } + IntStream.range(0, floatValues1.length) + .forEach(i -> bufferAggregateColumn(Collections.singletonList(selector), agg, buf, pos)); BloomKFilter bloomKFilter = (BloomKFilter) valueAggregatorFactory.finalizeComputation(agg.get(buf, pos)); String serialized = filterToString(bloomKFilter); Assert.assertEquals(serializedFloatFilter, serialized); @@ 
-372,9 +371,8 @@ public void testBufferAggregateDoubleValues() throws IOException agg.init(buf, pos); - for (int i = 0; i < doubleValues1.length; ++i) { - bufferAggregateColumn(Collections.singletonList(selector), agg, buf, pos); - } + IntStream.range(0, doubleValues1.length) + .forEach(i -> bufferAggregateColumn(Collections.singletonList(selector), agg, buf, pos)); BloomKFilter bloomKFilter = (BloomKFilter) valueAggregatorFactory.finalizeComputation(agg.get(buf, pos)); String serialized = filterToString(bloomKFilter); Assert.assertEquals(serializedDoubleFilter, serialized); From 68bb28fedb7d35a772e3f0cbab1fb92e07080d94 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 25 Jan 2019 12:54:39 -0800 Subject: [PATCH 34/36] rename nil aggs to empty, change empty agg constructor signature, add comments --- .../aggregation/bloom/BloomFilterAggregatorFactory.java | 4 ++-- .../bloom/BloomFilterMergeAggregatorFactory.java | 6 ++++-- ...ilterAggregator.java => EmptyBloomFilterAggregator.java} | 6 +++--- ...ggregator.java => EmptyBloomFilterBufferAggregator.java} | 6 +++--- 4 files changed, 12 insertions(+), 10 deletions(-) rename extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/{NilBloomFilterAggregator.java => EmptyBloomFilterAggregator.java} (82%) rename extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/{NilBloomFilterBufferAggregator.java => EmptyBloomFilterBufferAggregator.java} (82%) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index 075e8a511ede..60ec529a6537 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -93,7 +93,7 @@ public Aggregator factorize(ColumnSelectorFactory columnFactory) if (capabilities == null) { BaseNullableColumnValueSelector selector = columnFactory.makeColumnValueSelector(field.getDimension()); if (selector instanceof NilColumnValueSelector) { - return new NilBloomFilterAggregator((NilColumnValueSelector) selector, filter); + return new EmptyBloomFilterAggregator(filter); } throw new IAE( "Cannot create bloom filter buffer aggregator for column selector type [%s]", @@ -123,7 +123,7 @@ public BufferAggregator factorizeBuffered(ColumnSelectorFactory columnFactory) if (capabilities == null) { BaseNullableColumnValueSelector selector = columnFactory.makeColumnValueSelector(field.getDimension()); if (selector instanceof NilColumnValueSelector) { - return new NilBloomFilterBufferAggregator((NilColumnValueSelector) selector, maxNumEntries); + return new EmptyBloomFilterBufferAggregator(maxNumEntries); } throw new IAE( "Cannot create bloom filter buffer aggregator for column selector type [%s]", diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java index 83c0729b0793..8dab8676e1ce 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterMergeAggregatorFactory.java @@ -49,6 +49,7 @@ public class BloomFilterMergeAggregatorFactory extends BloomFilterAggregatorFact public Aggregator factorize(final ColumnSelectorFactory metricFactory) { final BaseNullableColumnValueSelector selector = 
metricFactory.makeColumnValueSelector(fieldName); + // null columns should be empty bloom filters by this point, so encountering a nil column in merge agg is unexpected if (selector instanceof NilColumnValueSelector) { throw new ISE("WTF?! Unexpected NilColumnValueSelector"); } @@ -58,11 +59,12 @@ public Aggregator factorize(final ColumnSelectorFactory metricFactory) @Override public BufferAggregator factorizeBuffered(final ColumnSelectorFactory metricFactory) { - final ColumnValueSelector selector = metricFactory.makeColumnValueSelector(fieldName); + final BaseNullableColumnValueSelector selector = metricFactory.makeColumnValueSelector(fieldName); + // null columns should be empty bloom filters by this point, so encountering a nil column in merge agg is unexpected if (selector instanceof NilColumnValueSelector) { throw new ISE("WTF?! Unexpected NilColumnValueSelector"); } - return new BloomFilterMergeBufferAggregator(selector, getMaxNumEntries()); + return new BloomFilterMergeBufferAggregator((ColumnValueSelector) selector, getMaxNumEntries()); } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/NilBloomFilterAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/EmptyBloomFilterAggregator.java similarity index 82% rename from extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/NilBloomFilterAggregator.java rename to extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/EmptyBloomFilterAggregator.java index 7748ff3bbeee..57df6f2fe2c6 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/NilBloomFilterAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/EmptyBloomFilterAggregator.java @@ -22,11 +22,11 @@ import org.apache.druid.query.filter.BloomKFilter; import 
org.apache.druid.segment.NilColumnValueSelector; -public final class NilBloomFilterAggregator extends BaseBloomFilterAggregator +public final class EmptyBloomFilterAggregator extends BaseBloomFilterAggregator { - NilBloomFilterAggregator(NilColumnValueSelector selector, BloomKFilter collector) + EmptyBloomFilterAggregator(BloomKFilter collector) { - super(selector, collector); + super(NilColumnValueSelector.instance(), collector); } @Override diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/NilBloomFilterBufferAggregator.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/EmptyBloomFilterBufferAggregator.java similarity index 82% rename from extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/NilBloomFilterBufferAggregator.java rename to extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/EmptyBloomFilterBufferAggregator.java index a2df50fa65ed..7b6301d37ee9 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/NilBloomFilterBufferAggregator.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/EmptyBloomFilterBufferAggregator.java @@ -23,11 +23,11 @@ import java.nio.ByteBuffer; -public final class NilBloomFilterBufferAggregator extends BaseBloomFilterBufferAggregator +public final class EmptyBloomFilterBufferAggregator extends BaseBloomFilterBufferAggregator { - NilBloomFilterBufferAggregator(NilColumnValueSelector selector, int maxNumEntries) + EmptyBloomFilterBufferAggregator(int maxNumEntries) { - super(selector, maxNumEntries); + super(NilColumnValueSelector.instance(), maxNumEntries); } @Override From 8ebe1d998583a841b0665b1dcf8b79abd6e6f8e2 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 25 Jan 2019 17:43:41 -0800 Subject: [PATCH 35/36] use stringutils base64 stuff to be chill with 
master --- .../query/aggregation/bloom/BloomFilterAggregatorFactory.java | 3 +-- .../query/aggregation/bloom/BloomFilterAggregatorTest.java | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index 60ec529a6537..b074824896c3 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -21,7 +21,6 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.commons.codec.binary.Base64; import org.apache.druid.guice.BloomFilterSerializersModule; import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.RE; @@ -194,7 +193,7 @@ public List getRequiredColumns() public Object deserialize(Object object) { if (object instanceof String) { - return ByteBuffer.wrap(Base64.decodeBase64(StringUtils.toUtf8((String) object))); + return ByteBuffer.wrap(StringUtils.decodeBase64String((String) object)); } else { return object; } diff --git a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java index 99b1075cb3eb..790cf8c5c01d 100644 --- a/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java +++ b/extensions-core/druid-bloom-filter/src/test/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorTest.java @@ -23,11 +23,11 @@ import 
com.google.common.base.Function; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; -import org.apache.commons.codec.binary.Base64; import org.apache.druid.common.config.NullHandling; import org.apache.druid.guice.BloomFilterExtensionModule; import org.apache.druid.guice.BloomFilterSerializersModule; import org.apache.druid.jackson.DefaultObjectMapper; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.query.aggregation.Aggregator; import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.BufferAggregator; @@ -221,7 +221,7 @@ private static void bufferAggregateColumn( static String filterToString(BloomKFilter bloomKFilter) throws IOException { - return Base64.encodeBase64String(BloomFilterSerializersModule.bloomKFilterToBytes(bloomKFilter)); + return StringUtils.encodeBase64String(BloomFilterSerializersModule.bloomKFilterToBytes(bloomKFilter)); } private final DimensionSpec dimSpec = new DefaultDimensionSpec("dim1", "dim1"); From a56615bf12575d3a487f6fc68bc11e3d39402271 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Mon, 28 Jan 2019 14:06:13 -0800 Subject: [PATCH 36/36] add aggregate combiner, comment --- .../bloom/BloomFilterAggregateCombiner.java | 72 +++++++++++++++++++ .../bloom/BloomFilterAggregatorFactory.java | 19 ++--- 2 files changed, 82 insertions(+), 9 deletions(-) create mode 100644 extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregateCombiner.java diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregateCombiner.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregateCombiner.java new file mode 100644 index 000000000000..6fc4bf9379e4 --- /dev/null +++ 
b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregateCombiner.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.aggregation.bloom; + +import org.apache.druid.query.aggregation.ObjectAggregateCombiner; +import org.apache.druid.query.filter.BloomKFilter; +import org.apache.druid.segment.ColumnValueSelector; + +import javax.annotation.Nullable; + +public class BloomFilterAggregateCombiner extends ObjectAggregateCombiner +{ + @Nullable + private BloomKFilter combined; + + private final int maxNumEntries; + + public BloomFilterAggregateCombiner(int maxNumEntries) + { + this.maxNumEntries = maxNumEntries; + } + + @Override + public void reset(ColumnValueSelector selector) + { + combined = null; + fold(selector); + } + + @Override + public void fold(ColumnValueSelector selector) + { + BloomKFilter other = (BloomKFilter) selector.getObject(); + if (other == null) { + return; + } + if (combined == null) { + combined = new BloomKFilter(maxNumEntries); + } + combined.merge(other); + } + + @Nullable + @Override + public BloomKFilter getObject() + { + return combined; + } + + @Override + public Class 
classOfObject() + { + return BloomKFilter.class; + } +} diff --git a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java index b074824896c3..af60135afe38 100644 --- a/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java +++ b/extensions-core/druid-bloom-filter/src/main/java/org/apache/druid/query/aggregation/bloom/BloomFilterAggregatorFactory.java @@ -25,6 +25,7 @@ import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.RE; import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.query.aggregation.AggregateCombiner; import org.apache.druid.query.aggregation.Aggregator; import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.AggregatorUtil; @@ -92,6 +93,7 @@ public Aggregator factorize(ColumnSelectorFactory columnFactory) if (capabilities == null) { BaseNullableColumnValueSelector selector = columnFactory.makeColumnValueSelector(field.getDimension()); if (selector instanceof NilColumnValueSelector) { + // BloomKFilter must be the same size so we cannot use a constant for the empty agg return new EmptyBloomFilterAggregator(filter); } throw new IAE( @@ -166,15 +168,14 @@ public Object combine(@Nullable Object lhs, @Nullable Object rhs) if (lhs == null) { return rhs; } - if (rhs instanceof BloomKFilter) { - ((BloomKFilter) lhs).merge((BloomKFilter) rhs); - return lhs; - } else { - ByteBuffer buf = (ByteBuffer) lhs; - ByteBuffer other = (ByteBuffer) rhs; - BloomKFilter.mergeBloomFilterByteBuffers(buf, buf.position(), other, other.position()); - return lhs; - } + ((BloomKFilter) lhs).merge((BloomKFilter) rhs); + return lhs; + } + + @Override + public AggregateCombiner makeAggregateCombiner() + { + return new 
BloomFilterAggregateCombiner(maxNumEntries); } @Override