From af46992c05cdbb10cba517d1eafdb27868d897f6 Mon Sep 17 00:00:00 2001 From: Charles Smith Date: Sun, 14 Apr 2019 13:57:42 -0400 Subject: [PATCH 1/4] Fix tests filter size and false pos test size - The test filter's max_elements parameter is set to twice the number of elements that are being tested as false positives. This doesn't test the functionality of the filter, since max_elements should be related to the number of elements that we expect to put into the filter. - The tests currently aren't aggressive enough with respect to the magnitude of false positives being checked; this adds a test for checking a million false positives that fails given expectations. --- tests/test_bloom_filter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_bloom_filter.py b/tests/test_bloom_filter.py index a94508c..ce28353 100755 --- a/tests/test_bloom_filter.py +++ b/tests/test_bloom_filter.py @@ -46,7 +46,7 @@ def _test(description, values, trials, error_rate, probe_bitnoer=None, filename= divisor = 100000 bloom = bloom_filter.BloomFilter( - max_elements=trials * 2, + max_elements=values.length() * 2, error_rate=error_rate, probe_bitnoer=probe_bitnoer, filename=filename, @@ -294,6 +294,7 @@ def test_bloom_filter(): all_good &= _test('states', States(), trials=100000, error_rate=0.01) all_good &= _test('random', Random_content(), trials=10000, error_rate=0.1) + all_good &= _test('random', Random_content(), trials=1000000, error_rate=1E-9) all_good &= _test('random', Random_content(), trials=10000, error_rate=0.1, probe_bitnoer=bloom_filter.get_bitno_seed_rnd) From a75d926e5860ba143d126c98926285792ae6658b Mon Sep 17 00:00:00 2001 From: Charles Smith Date: Sun, 14 Apr 2019 12:34:06 -0400 Subject: [PATCH 2/4] Use exponents to make uncorrelated hash probe values - The previous linear combination hashing algorithm was flawed because the Hamming weight of the entire probe is guaranteed to be equal to min(num_bits_n / gcd(num_bits_n, hash2), num_probes_k) since we're
treating hash2 as the generator of a cyclic subgroup of the integers modulo num_bits_n. Since the order of that subgroup is num_bits_n / gcd(num_bits_n, hash2), the Hamming weight of the probe may be significantly smaller than num_probes_k, especially if hash2 is a multiple of num_bits_n (which will happen 1 out of num_bits_n times). A probe with a low Hamming weight is much more likely to be a positive, and therefore also much more likely to be a false positive. --- src/bloom_filter/bloom_filter.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/bloom_filter/bloom_filter.py b/src/bloom_filter/bloom_filter.py index e16c952..0c47297 100644 --- a/src/bloom_filter/bloom_filter.py +++ b/src/bloom_filter/bloom_filter.py @@ -487,11 +487,13 @@ def get_bitno_lin_comb(bloom_filter, key): hash_value1 = hash1(int_list) hash_value2 = hash2(int_list) + probe_value = hash_value1 - # We're using linear combinations of hash_value1 and hash_value2 to obtain num_probes_k hash functions + # Use exponentiation to generate uncorrelated values for hash probes for probeno in range(1, bloom_filter.num_probes_k + 1): - bit_index = hash_value1 + probeno * hash_value2 - yield bit_index % bloom_filter.num_bits_m + probe_value *= probeno * hash_value2 + probe_value %= bloom_filter.num_bits_m + yield probe_value def try_unlink(filename): From 182c39d22b4dc9edcd6e41c9922782570ed4326d Mon Sep 17 00:00:00 2001 From: Charles Smith Date: Sun, 14 Apr 2019 12:34:06 -0400 Subject: [PATCH 3/4] Remove lin_comb from probe function name --- src/bloom_filter/__init__.py | 4 ++-- src/bloom_filter/bloom_filter.py | 4 ++-- tests/test_bloom_filter.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/bloom_filter/__init__.py b/src/bloom_filter/__init__.py index 0f3379c..4092527 100644 --- a/src/bloom_filter/__init__.py +++ b/src/bloom_filter/__init__.py @@ -3,12 +3,12 @@ from .bloom_filter import ( BloomFilter, - get_bitno_lin_comb, + get_filter_bitno_probes, get_bitno_seed_rnd, ) __all__ = [
'BloomFilter', - 'get_bitno_lin_comb', + 'get_filter_bitno_probes', 'get_bitno_seed_rnd', ] diff --git a/src/bloom_filter/bloom_filter.py b/src/bloom_filter/bloom_filter.py index 0c47297..263a20d 100644 --- a/src/bloom_filter/bloom_filter.py +++ b/src/bloom_filter/bloom_filter.py @@ -465,7 +465,7 @@ def hash2(int_list): return simple_hash(int_list, MERSENNES2[0], MERSENNES2[1], MERSENNES2[2]) -def get_bitno_lin_comb(bloom_filter, key): +def get_filter_bitno_probes(bloom_filter, key): """Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result""" # This one assumes key is either bytes or str (or other list of integers) @@ -510,7 +510,7 @@ class BloomFilter(object): def __init__(self, max_elements=10000, error_rate=0.1, - probe_bitnoer=get_bitno_lin_comb, + probe_bitnoer=get_filter_bitno_probes, filename=None, start_fresh=False): # pylint: disable=R0913 diff --git a/tests/test_bloom_filter.py b/tests/test_bloom_filter.py index ce28353..0e4292f 100755 --- a/tests/test_bloom_filter.py +++ b/tests/test_bloom_filter.py @@ -39,7 +39,7 @@ def _test(description, values, trials, error_rate, probe_bitnoer=None, filename= # R0914: We want some local variables too. This is just test code. 
"""Some quick automatic tests for the bloom filter class""" if not probe_bitnoer: - probe_bitnoer = bloom_filter.get_bitno_lin_comb + probe_bitnoer = bloom_filter.get_filter_bitno_probes all_good = True From f97b5b02309d18b950812f5ef43de2b1215bc6b0 Mon Sep 17 00:00:00 2001 From: Charles Smith Date: Sun, 14 Apr 2019 15:33:21 -0400 Subject: [PATCH 4/4] Increase modulo space to prevent collisions --- src/bloom_filter/bloom_filter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bloom_filter/bloom_filter.py b/src/bloom_filter/bloom_filter.py index 263a20d..d8ced40 100644 --- a/src/bloom_filter/bloom_filter.py +++ b/src/bloom_filter/bloom_filter.py @@ -489,11 +489,11 @@ def get_filter_bitno_probes(bloom_filter, key): hash_value2 = hash2(int_list) probe_value = hash_value1 - # Use exponentiation to generate uncorrelated values for hash probes for probeno in range(1, bloom_filter.num_probes_k + 1): - probe_value *= probeno * hash_value2 - probe_value %= bloom_filter.num_bits_m - yield probe_value + probe_value *= hash_value1 + probe_value += hash_value2 + probe_value %= MERSENNES1[2] + yield probe_value % bloom_filter.num_bits_m def try_unlink(filename):