Merge pull request jaybaird#6 from etrepum/master

jaybaird · jaybaird · commit abfccef7929f · 2012-12-27T09:19:27.000-08:00
Fix ScalableBloomFilter and tests on top of jaybaird#5
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,3 +1,11 @@
+Changes in 2.0
+==============
+Made major corrections to the algorithms for both BloomFilter and
+ScalableBloomFilter. Not numerically compatible with serialized
+representations of filters from previous versions. Specifically, in
+previous versions BloomFilter was more accurate than requested and
+ScalableBloomFilter was much less accurate than requested.
+
 Changes in 1.1
 ==============
-Added copy, intersection and union functions to BloomFilter
+Added copy, intersection and union functions to BloomFilter
diff --git a/README.txt b/README.txt
@@ -27,7 +27,7 @@ True
 >>> f = BloomFilter(capacity=1000, error_rate=0.001)
 >>> for i in xrange(0, f.capacity):
 ...     _ = f.add(i)
->>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
+>>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
 True
 
 >>> from pybloom import ScalableBloomFilter
@@ -36,8 +36,9 @@ True
 >>> for i in xrange(0, count):
 ...     _ = sbf.add(i)
 ...
->>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
+>>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
 True
 
-# len(sbf) may not equal the entire input length. 0.006% error is well
-# below the default 0.1% error threshold
+# len(sbf) may not equal the entire input length. 0.01% error is well
+# below the default 0.1% error threshold. As the capacity goes up, the
+# error will approach 0.1%.
diff --git a/pybloom/__init__.py b/pybloom/__init__.py
@@ -1,5 +1,4 @@
 """pybloom
- 
+
 """
 from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__
-    
diff --git a/pybloom/benchmarks.py b/pybloom/benchmarks.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+#
+"""Test performance of BloomFilter at a set capacity and error rate."""
+import sys
+from pybloom import BloomFilter
+import bitarray, math, time
+
+def main(capacity=100000, request_error_rate=0.1):
+    """Benchmark a BloomFilter filled to ``capacity`` entries.
+
+    Prints add/lookup throughput, bit statistics and false positive rates."""
+    f = BloomFilter(capacity=capacity, error_rate=request_error_rate)
+    assert (capacity == f.capacity)
+    start = time.time()
+    for i in xrange(0, f.capacity):
+        f.add(i, skip_check=True)
+    end = time.time()
+    print "{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format(
+            end - start, f.capacity / (end - start))
+    oneBits = f.bitarray.count(True)
+    print "Number of Filter Bits:", f.num_bits
+    print "Number of slices:", f.num_slices
+    print "Bits per slice:", f.bits_per_slice
+    print "------"
+    print "Fraction of 1 bits at capacity: {:5.3f}".format(
+            oneBits / float(f.num_bits))
+    # Probe exactly `trials` never-added keys; every hit is a false positive
+    trials = f.capacity
+    fp = 0
+    start = time.time()
+    for i in xrange(f.capacity, f.capacity + trials):
+        if i in f:
+            fp += 1
+    end = time.time()
+    print ("{:5.3f} seconds to check false positives, "
+           "{:10.2f} checks/second".format(end - start, trials / (end - start)))
+    print "Requested FP rate: {:2.4f}".format(request_error_rate)
+    print "Experimental false positive rate: {:2.4f}".format(fp / float(trials))
+    # Compute theoretical fp max (Goel/Gupta) from the filter's geometry
+    k = f.num_slices
+    m = f.num_bits
+    n = f.capacity
+    fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k)
+    print "Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory)
+
+if __name__ == '__main__' :
+    status = main()
+    sys.exit(status)
diff --git a/pybloom/pybloom.py b/pybloom/pybloom.py
@@ -16,7 +16,7 @@
     False
     >>> len(f) <= f.capacity
     True
-    >>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
+    >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
     True
 
     >>> from pybloom import ScalableBloomFilter
@@ -29,7 +29,7 @@
     True
     >>> len(sbf) <= count
     True
-    >>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
+    >>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
     True
 
 """
@@ -42,8 +42,8 @@
 except ImportError:
     raise ImportError('pybloom requires bitarray >= 0.3.4')
 
-__version__ = '1.1'
-__author__  = "Jay Baird <jay@mochimedia.com>, Bob Ippolito <bob@redivi.com>,\
+__version__ = '2.0'
+__author__  = "Jay Baird <jay.baird@me.com>, Bob Ippolito <bob@redivi.com>,\
                Marius Eriksen <marius@monkey.org>,\
                Alex Brasetvik <alex@brasetvik.com>"
 
@@ -111,16 +111,15 @@ def __init__(self, capacity, error_rate=0.001):
             raise ValueError("Error_Rate must be between 0 and 1.")
         if not capacity > 0:
             raise ValueError("Capacity must be > 0")
-        # given M = num_bits, k = num_slices, p = error_rate, n = capacity
+        # given M = num_bits, k = num_slices, P = error_rate, n = capacity
+        #       k = log2(1/P)
         # solving for m = bits_per_slice
         # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
         # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
         # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
-        num_slices = int(math.ceil(math.log(1 / error_rate, 2)))
-        # the error_rate constraint assumes a fill rate of 1/2
-        # so we double the capacity to simplify the API
+        num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
         bits_per_slice = int(math.ceil(
-            (2 * capacity * abs(math.log(error_rate))) /
+            (capacity * abs(math.log(error_rate))) /
             (num_slices * (math.log(2) ** 2))))
         self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
         self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
@@ -339,13 +338,18 @@ def add(self, key):
         """
         if key in self:
             return True
-        filter = self.filters[-1] if self.filters else None
-        if filter is None or filter.count >= filter.capacity:
-            num_filters = len(self.filters)
+        if not self.filters:
             filter = BloomFilter(
-                capacity=self.initial_capacity * (self.scale ** num_filters),
-                error_rate=self.error_rate * (self.ratio ** num_filters))
+                capacity=self.initial_capacity,
+                error_rate=self.error_rate * (1.0 - self.ratio))
             self.filters.append(filter)
+        else:
+            filter = self.filters[-1]
+            if filter.count >= filter.capacity:
+                filter = BloomFilter(
+                    capacity=filter.capacity * self.scale,
+                    error_rate=filter.error_rate * self.ratio)
+                self.filters.append(filter)
         filter.add(key, skip_check=True)
         return False
 
diff --git a/setup.py b/setup.py
@@ -6,16 +6,16 @@
 
 from setuptools import setup, find_packages, Extension
 
-VERSION = '1.0.3'
+VERSION = '2.0.0'
 DESCRIPTION = "PyBloom: A Probabilistic data structure"
 LONG_DESCRIPTION = """
 pybloom is a Python implementation of the bloom filter probabilistic data
-structure. The module also provides a Scalable Bloom Filter that allows a 
+structure. The module also provides a Scalable Bloom Filter that allows a
 bloom filter to grow without knowing the original set size.
 """
 
 CLASSIFIERS = filter(None, map(str.strip,
-"""                 
+"""
 Intended Audience :: Developers
 License :: OSI Approved :: MIT License
 Programming Language :: Python

-Original file line number
+Diff line change
@@ @@ -1,5 +1,4 @@ @@
 """pybloom
+-
++
 """
 from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__
+-