Skip to content

Commit abfccef

Browse files
committed
Merge pull request jaybaird#6 from etrepum/master
Fix ScalableBloomFilter and tests on top of jaybaird#5
2 parents f5f5aae + 64e42c8 commit abfccef

File tree

6 files changed

+84
-24
lines changed

6 files changed

+84
-24
lines changed

CHANGES.txt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
Changes in 2.0
2+
==============
3+
Made major corrections to the algorithms for both BloomFilter and
4+
ScalableBloomFilter. Not numerically compatible with serialized
5+
representations of filters from previous versions. Specifically,
6+
BloomFilter was more accurate than requested and ScalableBloomFilter
7+
was much less accurate than requested.
8+
19
Changes in 1.1
210
==============
3-
Added copy, intersection and union functions to BloomFilter
11+
Added copy, intersection and union functions to BloomFilter

README.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ True
2727
>>> f = BloomFilter(capacity=1000, error_rate=0.001)
2828
>>> for i in xrange(0, f.capacity):
2929
... _ = f.add(i)
30-
>>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
30+
>>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
3131
True
3232

3333
>>> from pybloom import ScalableBloomFilter
@@ -36,8 +36,9 @@ True
3636
>>> for i in xrange(0, count):
3737
... _ = sbf.add(i)
3838
...
39-
>>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
39+
>>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
4040
True
4141

42-
# len(sbf) may not equal the entire input length. 0.006% error is well
43-
# below the default 0.1% error threshold
42+
# len(sbf) may not equal the entire input length. 0.01% error is well
43+
# below the default 0.1% error threshold. As the capacity goes up, the
44+
# error will approach 0.1%.

pybloom/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
"""pybloom
2-
2+
33
"""
44
from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__
5-

pybloom/benchmarks.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/usr/bin/env python
2+
#
3+
"""Test performance of BloomFilter at a set capacity and error rate."""
4+
import sys
5+
from pybloom import BloomFilter
6+
import bitarray, math, time
7+
8+
def main(capacity=100000, request_error_rate=0.1):
9+
f = BloomFilter(capacity=capacity, error_rate=request_error_rate)
10+
assert (capacity == f.capacity)
11+
start = time.time()
12+
for i in xrange(0, f.capacity):
13+
f.add(i, skip_check=True)
14+
end = time.time()
15+
print "{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format(
16+
end - start, f.capacity / (end - start))
17+
oneBits = f.bitarray.count(True)
18+
zeroBits = f.bitarray.count(False)
19+
#print "Number of 1 bits:", oneBits
20+
#print "Number of 0 bits:", zeroBits
21+
print "Number of Filter Bits:", f.num_bits
22+
print "Number of slices:", f.num_slices
23+
print "Bits per slice:", f.bits_per_slice
24+
print "------"
25+
print "Fraction of 1 bits at capacity: {:5.3f}".format(
26+
oneBits / float(f.num_bits))
27+
# Look for false positives and measure the actual fp rate
28+
trials = f.capacity
29+
fp = 0
30+
start = time.time()
31+
for i in xrange(f.capacity, f.capacity + trials + 1):
32+
if i in f:
33+
fp += 1
34+
end = time.time()
35+
print ("{:5.3f} seconds to check false positives, "
36+
"{:10.2f} checks/second".format(end - start, trials / (end - start)))
37+
print "Requested FP rate: {:2.4f}".format(request_error_rate)
38+
print "Experimental false positive rate: {:2.4f}".format(fp / float(trials))
39+
# Compute theoretical fp max (Goel/Gupta)
40+
k = f.num_slices
41+
m = f.num_bits
42+
n = f.capacity
43+
fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k)
44+
print "Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory)
45+
46+
if __name__ == '__main__' :
47+
status = main()
48+
sys.exit(status)

pybloom/pybloom.py

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
False
1717
>>> len(f) <= f.capacity
1818
True
19-
>>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
19+
>>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
2020
True
2121
2222
>>> from pybloom import ScalableBloomFilter
@@ -29,7 +29,7 @@
2929
True
3030
>>> len(sbf) <= count
3131
True
32-
>>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
32+
>>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
3333
True
3434
3535
"""
@@ -42,8 +42,8 @@
4242
except ImportError:
4343
raise ImportError('pybloom requires bitarray >= 0.3.4')
4444

45-
__version__ = '1.1'
46-
__author__ = "Jay Baird <jay@mochimedia.com>, Bob Ippolito <bob@redivi.com>,\
45+
__version__ = '2.0'
46+
__author__ = "Jay Baird <jay.baird@me.com>, Bob Ippolito <bob@redivi.com>,\
4747
Marius Eriksen <marius@monkey.org>,\
4848
Alex Brasetvik <alex@brasetvik.com>"
4949

@@ -111,16 +111,15 @@ def __init__(self, capacity, error_rate=0.001):
111111
raise ValueError("Error_Rate must be between 0 and 1.")
112112
if not capacity > 0:
113113
raise ValueError("Capacity must be > 0")
114-
# given M = num_bits, k = num_slices, p = error_rate, n = capacity
114+
# given M = num_bits, k = num_slices, P = error_rate, n = capacity
115+
# k = log2(1/P)
115116
# solving for m = bits_per_slice
116117
# n ~= M * ((ln(2) ** 2) / abs(ln(P)))
117118
# n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
118119
# m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
119-
num_slices = int(math.ceil(math.log(1 / error_rate, 2)))
120-
# the error_rate constraint assumes a fill rate of 1/2
121-
# so we double the capacity to simplify the API
120+
num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
122121
bits_per_slice = int(math.ceil(
123-
(2 * capacity * abs(math.log(error_rate))) /
122+
(capacity * abs(math.log(error_rate))) /
124123
(num_slices * (math.log(2) ** 2))))
125124
self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
126125
self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
@@ -339,13 +338,18 @@ def add(self, key):
339338
"""
340339
if key in self:
341340
return True
342-
filter = self.filters[-1] if self.filters else None
343-
if filter is None or filter.count >= filter.capacity:
344-
num_filters = len(self.filters)
341+
if not self.filters:
345342
filter = BloomFilter(
346-
capacity=self.initial_capacity * (self.scale ** num_filters),
347-
error_rate=self.error_rate * (self.ratio ** num_filters))
343+
capacity=self.initial_capacity,
344+
error_rate=self.error_rate * (1.0 - self.ratio))
348345
self.filters.append(filter)
346+
else:
347+
filter = self.filters[-1]
348+
if filter.count >= filter.capacity:
349+
filter = BloomFilter(
350+
capacity=filter.capacity * self.scale,
351+
error_rate=filter.error_rate * self.ratio)
352+
self.filters.append(filter)
349353
filter.add(key, skip_check=True)
350354
return False
351355

setup.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,16 @@
66

77
from setuptools import setup, find_packages, Extension
88

9-
VERSION = '1.0.3'
9+
VERSION = '2.0.0'
1010
DESCRIPTION = "PyBloom: A Probabilistic data structure"
1111
LONG_DESCRIPTION = """
1212
pybloom is a Python implementation of the bloom filter probabilistic data
13-
structure. The module also provides a Scalable Bloom Filter that allows a
13+
structure. The module also provides a Scalable Bloom Filter that allows a
1414
bloom filter to grow without knowing the original set size.
1515
"""
1616

1717
CLASSIFIERS = filter(None, map(str.strip,
18-
"""
18+
"""
1919
Intended Audience :: Developers
2020
License :: OSI Approved :: MIT License
2121
Programming Language :: Python

0 commit comments

Comments
 (0)