Skip to content

Commit 65e1f1d

Browse files
committed
Add initial bloom filter Python bindings and tests
1 parent 66d1d52 commit 65e1f1d

File tree

5 files changed

+203
-0
lines changed

5 files changed

+203
-0
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ target_sources(python
110110
src/count_wrapper.cpp
111111
src/tdigest_wrapper.cpp
112112
src/vector_of_kll.cpp
113+
src/bloom_filter_wrapper.cpp
113114
src/py_serde.cpp
114115
)
115116

_datasketches.so

1.29 MB
Binary file not shown.

src/bloom_filter_wrapper.cpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include <nanobind/nanobind.h>
21+
#include <nanobind/stl/string.h>
22+
23+
#include "bloom_filter.hpp"
24+
#include "common_defs.hpp"
25+
26+
namespace nb = nanobind;
27+
28+
template<typename A>
29+
void bind_bloom_filter(nb::module_ &m, const char* name) {
30+
using namespace datasketches;
31+
using bloom_filter_type = bloom_filter_alloc<A>;
32+
33+
// Start with just one simple function
34+
m.def("create_bloom_filter",
35+
[](uint64_t max_distinct_items, double target_false_positive_prob) {
36+
return bloom_filter_type::builder::create_by_accuracy(max_distinct_items, target_false_positive_prob);
37+
},
38+
nb::arg("max_distinct_items"), nb::arg("target_false_positive_prob"),
39+
"Creates a Bloom filter with optimal parameters for the given accuracy requirements");
40+
41+
// Bind the class with minimal methods
42+
nb::class_<bloom_filter_type>(m, name)
43+
.def("is_empty", &bloom_filter_type::is_empty,
44+
"Returns True if the filter has seen no items, otherwise False")
45+
.def("update", static_cast<void (bloom_filter_type::*)(const std::string&)>(&bloom_filter_type::update),
46+
nb::arg("item"),
47+
"Updates the filter with the given string")
48+
.def("query", static_cast<bool (bloom_filter_type::*)(const std::string&) const>(&bloom_filter_type::query),
49+
nb::arg("item"),
50+
"Queries the filter for the given string");
51+
}
52+
53+
/**
 * Module initialization hook for the Bloom filter bindings.
 *
 * Instantiates bind_bloom_filter with std::allocator<uint8_t> and exposes the
 * resulting class under the name "bloom_filter".
 *
 * @param m nanobind module receiving the bindings
 */
void init_bloom_filter(nb::module_ &m) {
  using byte_allocator = std::allocator<uint8_t>;
  bind_bloom_filter<byte_allocator>(m, "bloom_filter");
}

src/datasketches.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ void init_count_min(nb::module_& m);
4141
void init_density(nb::module_& m);
4242
void init_tdigest(nb::module_& m);
4343
void init_vector_of_kll(nb::module_& m);
44+
void init_bloom_filter(nb::module_& m);
4445

4546
// supporting objects
4647
void init_kolmogorov_smirnov(nb::module_& m);
@@ -73,6 +74,7 @@ NB_MODULE(_datasketches, m) {
7374
init_density(m);
7475
init_tdigest(m);
7576
init_vector_of_kll(m);
77+
init_bloom_filter(m);
7678

7779
init_kolmogorov_smirnov(m);
7880
init_serde(m);

tests/bloom_filter_test.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
import unittest
19+
from datasketches import create_bloom_filter
20+
21+
class BloomFilterTest(unittest.TestCase):
    """Tests for the bloom filter bindings obtained through create_bloom_filter."""

    def test_create_bloom_filter(self):
        """A filter can be built from basic parameters and starts out empty"""
        bloom = create_bloom_filter(1000, 0.01)
        self.assertIsNotNone(bloom)
        self.assertTrue(bloom.is_empty())

    def test_bloom_filter_empty_state(self):
        """A freshly built filter reports itself as empty"""
        bloom = create_bloom_filter(100, 0.05)
        self.assertTrue(bloom.is_empty())

    def test_bloom_filter_update_and_query(self):
        """Single-item update followed by queries"""
        bloom = create_bloom_filter(1000, 0.01)
        present = "test_item"
        absent = "other_item"

        # Nothing has been inserted yet
        self.assertTrue(bloom.is_empty())
        self.assertFalse(bloom.query(present))

        # Insert one item, then look it up again
        bloom.update(present)
        self.assertFalse(bloom.is_empty())
        self.assertTrue(bloom.query(present))

        # An item that was never inserted
        self.assertFalse(bloom.query(absent))

    def test_bloom_filter_multiple_items(self):
        """Several items are inserted, then membership is checked"""
        bloom = create_bloom_filter(1000, 0.01)

        items = [f"item{index}" for index in range(1, 6)]

        # Insert everything first
        for item in items:
            bloom.update(item)

        # Every inserted item must be reported as present
        for item in items:
            self.assertTrue(bloom.query(item), f"Item {item} should be found")

        # Items that were never inserted must be reported as absent
        non_items = [f"not_item{index}" for index in range(1, 4)]
        for item in non_items:
            self.assertFalse(bloom.query(item), f"Item {item} should not be found")

    def test_bloom_filter_false_positives(self):
        """A small, high-error filter still finds the items that were added"""
        # Small capacity and a higher false positive rate
        bloom = create_bloom_filter(10, 0.1)
        added = ("item1", "item2")

        # Insert a couple of items
        for item in added:
            bloom.update(item)

        # Both inserted items must be found
        for item in added:
            self.assertTrue(bloom.query(item))

        # False positives may occur with such a configuration; that is
        # expected for bloom filters, so no specific false positive is
        # checked here - only that the filter works.

    def test_bloom_filter_parameters(self):
        """Filters can be built from a range of size / error-rate combinations"""
        test_cases = (
            (100, 0.01),
            (1000, 0.05),
            (10000, 0.001),
            (100, 0.1),
        )

        for max_items, false_positive_rate in test_cases:
            with self.subTest(max_items=max_items, false_positive_rate=false_positive_rate):
                bloom = create_bloom_filter(max_items, false_positive_rate)
                self.assertIsNotNone(bloom)
                self.assertTrue(bloom.is_empty())

    def test_bloom_filter_string_types(self):
        """Strings with varied content can be inserted and found again"""
        bloom = create_bloom_filter(1000, 0.01)

        test_strings = (
            "simple",
            "string with spaces",
            "string_with_underscores",
            "string-with-dashes",
            "string123with456numbers",
            "string.with.dots",
            "string!with@special#chars$",
        )

        for test_string in test_strings:
            with self.subTest(test_string=test_string):
                bloom.update(test_string)
                self.assertTrue(bloom.query(test_string))

        # The empty string is handled on its own: the implementation might
        # ignore it, so the call is made without asserting on the outcome.
        bloom.update("")

    def test_bloom_filter_edge_cases(self):
        """Long, unicode and numeric strings round-trip through the filter"""
        bloom = create_bloom_filter(1000, 0.01)

        edge_inputs = (
            "a" * 1000,           # very long string
            "café résumé naïve",  # unicode string
            "12345",              # number given as a string
        )

        for value in edge_inputs:
            bloom.update(value)
            self.assertTrue(bloom.query(value))
144+
if __name__ == '__main__':
145+
unittest.main()

0 commit comments

Comments
 (0)