1+ # Licensed to the Apache Software Foundation (ASF) under one
2+ # or more contributor license agreements. See the NOTICE file
3+ # distributed with this work for additional information
4+ # regarding copyright ownership. The ASF licenses this file
5+ # to you under the Apache License, Version 2.0 (the
6+ # "License"); you may not use this file except in compliance
7+ # with the License. You may obtain a copy of the License at
8+ #
9+ # http://www.apache.org/licenses/LICENSE-2.0
10+ #
11+ # Unless required by applicable law or agreed to in writing,
12+ # software distributed under the License is distributed on an
13+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+ # KIND, either express or implied. See the License for the
15+ # specific language governing permissions and limitations
16+ # under the License.
17+
18+ import unittest
19+ from datasketches import create_bloom_filter
20+
class BloomFilterTest(unittest.TestCase):
    """Exercise filters returned by ``create_bloom_filter``.

    The tests rely only on the three filter methods used below:
    ``is_empty()``, ``update(item)`` and ``query(item)``.
    """

    def test_create_bloom_filter(self):
        """A filter built from (1000, 0.01) exists and starts out empty."""
        sketch = create_bloom_filter(1000, 0.01)
        self.assertIsNotNone(sketch)
        self.assertTrue(sketch.is_empty())

    def test_bloom_filter_empty_state(self):
        """A freshly built filter reports itself as empty."""
        sketch = create_bloom_filter(100, 0.05)
        self.assertTrue(sketch.is_empty())

    def test_bloom_filter_update_and_query(self):
        """One update makes the filter non-empty and the item queryable."""
        present, absent = "test_item", "other_item"
        sketch = create_bloom_filter(1000, 0.01)

        # Before any update: empty, and the item is not reported.
        self.assertTrue(sketch.is_empty())
        self.assertFalse(sketch.query(present))

        # After the update: non-empty, and the item is reported.
        sketch.update(present)
        self.assertFalse(sketch.is_empty())
        self.assertTrue(sketch.query(present))

        # An item that was never added is expected to be reported absent.
        self.assertFalse(sketch.query(absent))

    def test_bloom_filter_multiple_items(self):
        """Every added item is found; a few never-added items are not."""
        sketch = create_bloom_filter(1000, 0.01)
        added = ("item1", "item2", "item3", "item4", "item5")
        never_added = ("not_item1", "not_item2", "not_item3")

        # Insert everything first, then verify, so each lookup runs
        # against the fully populated filter.
        for name in added:
            sketch.update(name)

        for name in added:
            self.assertTrue(sketch.query(name), f"Item {name} should be found")

        for name in never_added:
            self.assertFalse(
                sketch.query(name), f"Item {name} should not be found"
            )

    def test_bloom_filter_false_positives(self):
        """Items added to a deliberately small filter are still found.

        The filter is built with a small capacity and a high false-positive
        rate. False positives for other items are acceptable for a bloom
        filter, so nothing is asserted about items that were never added.
        """
        sketch = create_bloom_filter(10, 0.1)

        sketch.update("item1")
        sketch.update("item2")

        self.assertTrue(sketch.query("item1"))
        self.assertTrue(sketch.query("item2"))

    def test_bloom_filter_parameters(self):
        """Several (size, false-positive rate) pairs each yield an empty filter."""
        configurations = (
            (100, 0.01),
            (1000, 0.05),
            (10000, 0.001),
            (100, 0.1),
        )

        for capacity, fpp in configurations:
            # One sub-test per configuration so a failure names the pair.
            with self.subTest(max_items=capacity, false_positive_rate=fpp):
                sketch = create_bloom_filter(capacity, fpp)
                self.assertIsNotNone(sketch)
                self.assertTrue(sketch.is_empty())

    def test_bloom_filter_string_types(self):
        """Strings with assorted punctuation are found after being added."""
        sketch = create_bloom_filter(1000, 0.01)
        samples = (
            "simple",
            "string with spaces",
            "string_with_underscores",
            "string-with-dashes",
            "string123with456numbers",
            "string.with.dots",
            "string!with@special#chars$",
        )

        for sample in samples:
            with self.subTest(test_string=sample):
                sketch.update(sample)
                self.assertTrue(sketch.query(sample))

        # The empty string is handled on its own: the implementation might
        # ignore it, so the call is made but no result is asserted.
        sketch.update("")

    def test_bloom_filter_edge_cases(self):
        """Long, non-ASCII and digit-only strings are found after being added."""
        sketch = create_bloom_filter(1000, 0.01)
        edge_inputs = (
            "a" * 1000,            # very long string
            "café résumé naïve",   # unicode string
            "12345",               # number written as a string
        )

        # All inputs go into the same filter, one after another.
        for text in edge_inputs:
            sketch.update(text)
            self.assertTrue(sketch.query(text))
143+
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
0 commit comments