Skip to content

Commit 2db78a4

Browse files
DengYipingmgechev
authored andcommitted
[feature] add probablistic data structure bloomfilter (mgechev#151)
* add probablistic data structure bloomfilter * Use Error in the code, change comment style, update bitmap interfaces
1 parent e5b50fa commit 2db78a4

File tree

2 files changed

+278
-0
lines changed

2 files changed

+278
-0
lines changed

src/data-structures/bloomfilter.js

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
/**
2+
* Bloomfilter and a bitmap.
3+
* Probablistic data structure useful for deduplication
4+
*
5+
* @example
6+
* // create a bloom filter with capacity of 1024 and 0.01 error rat
7+
* var bloomfilter = new Bloomfilter(1024, 0.01);
8+
* bloomfilter.set('hello');
9+
* bloomfilter.get('hello') === true;
10+
* bloomfilter.get('world') === false;
11+
* @module data-structures/bloomfilter
12+
*/
13+
(function(exports) {
14+
'use strict';
15+
16+
function randomUint32() {
17+
return Math.floor(Math.random() * Math.pow(2, 32));
18+
}
19+
/**
20+
* Calculate a 32 bit FNV-1a hash
21+
* Found here: https://gist.github.com/vaiorabbit/5657561
22+
* Ref.: http://isthe.com/chongo/tech/comp/fnv/
23+
*
24+
* @param {string} str the input value
25+
* @param {boolean} [asString=false] set to true to return the hash value as
26+
* 8-digit hex string instead of an integer
27+
* @param {integer} [seed] optionally pass the hash of the previous chunk
28+
* @returns {integer | string}
29+
*/
30+
function hashFnv32a(str, asString, seed) {
31+
/*jshint bitwise:false */
32+
var i;
33+
var l;
34+
var hval = seed === undefined ? 0x811c9dc5 : seed;
35+
36+
for (i = 0, l = str.length; i < l; i = i + 1) {
37+
hval ^= str.charCodeAt(i);
38+
hval +=
39+
(hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24);
40+
}
41+
if (asString) {
42+
// Convert to 8 digit hex string
43+
return ('0000000' + (hval >>> 0).toString(16)).substr(-8);
44+
}
45+
return hval >>> 0;
46+
}
47+
48+
// Make a hash function
49+
function mkHashFun(seed, limit) {
50+
return function(value) {
51+
return hashFnv32a(value, false, seed) % limit;
52+
};
53+
}
54+
/**
55+
* Create a bitmap with a given size
56+
* Bit map is not resizeable
57+
* @public
58+
* @constructor
59+
* @param {Number} size the size of the bitmap
60+
*/
61+
exports.Bitmap = function(size) {
62+
size = size || 1024;
63+
if (size < 0) {
64+
throw new Error('The size cannot be negative');
65+
}
66+
this.size = size;
67+
this.intSize = Math.ceil(size / 32); // number of underlying integers
68+
// Create a 0 initialized array
69+
this.intArray = Array.from({ length: this.intSize }, function() {
70+
return 0;
71+
});
72+
};
73+
74+
/**
75+
* Get the size of the bit map
76+
* @public
77+
* @return {Number} size of the bit map
78+
*/
79+
exports.Bitmap.prototype.getSize = function() {
80+
return this.size;
81+
};
82+
83+
/**
84+
* Get if a bit is set at a particular index
85+
* @public
86+
* @return {Boolean} true or false value of the bit
87+
*/
88+
exports.Bitmap.prototype.exists = function(index) {
89+
// Necessary boundary check
90+
if (index < 0 || index > this.size) {
91+
throw new Error('Index out of bound')
92+
}
93+
94+
// Calculate the offset within the int
95+
var intOffset = index % 32;
96+
var arrayOffset = Math.floor(index / 32); // the offset for the array
97+
var mask = 1 << intOffset;
98+
return (mask & this.intArray[arrayOffset]) !== 0;
99+
};
100+
101+
/**
102+
* Get the bit at a particular index
103+
* @public
104+
* @return {Number} true or false value of the bit
105+
*/
106+
exports.Bitmap.prototype.get = function(index) {
107+
// Necessary boundary check
108+
if (index < 0 || index > this.size) {
109+
throw new Error('Index out of bound')
110+
}
111+
112+
// Calculate the offset within the int
113+
var intOffset = index % 32;
114+
var arrayOffset = Math.floor(index / 32); // the offset for the array
115+
var mask = 1 << intOffset;
116+
if ((mask & this.intArray[arrayOffset]) !== 0) {
117+
return 1;
118+
} else {
119+
return 0;
120+
}
121+
};
122+
123+
/**
124+
* Set the bit at a particular index
125+
* @public
126+
* @param {Number} index the index to set
127+
* @param {Boolean} value the value that is necessary
128+
*/
129+
exports.Bitmap.prototype.set = function(index, value) {
130+
// necessary boundary check
131+
if (index < 0 || index > this.size) {
132+
throw new Error('Index out of bound')
133+
}
134+
135+
var intOffset = index % 32; //calculate the offset within the int
136+
var arrayOffset = Math.floor(index / 32); // the offset for the array
137+
var mask = 1 << intOffset;
138+
139+
// Check trutyness
140+
if (value) {
141+
this.intArray[arrayOffset] |= mask; // or operation
142+
} else {
143+
this.intArray[arrayOffset] &= ~mask; // and opertation to set 0
144+
}
145+
};
146+
147+
/**
148+
* Construct a bloom filter by given the capacity and the error rate, default error rate is 0.001
149+
* @public
150+
* @constructor
151+
* @param {Number} capacity the maximum capacity to maintain the given error rate
152+
* @param {Number} errorRate the error rate expected under maximum capacity
153+
*/
154+
exports.Bloomfilter = function(capacity, errorRate) {
155+
errorRate = errorRate || 0.001; // default error rate
156+
if (errorRate > 1 || errorRate < 0) {
157+
throw new Error('The error rate range is outside of bound');
158+
}
159+
if (capacity < 0) {
160+
throw new Error('The capacity cannot be negative');
161+
}
162+
this.capacity = capacity;
163+
this.errorRate = errorRate;
164+
165+
// Calculate the optimal size of the bitmap
166+
var numBit = Math.ceil(
167+
(capacity * Math.log(1.0 / errorRate)) / Math.pow(Math.log(2), 2)
168+
);
169+
170+
// Calculate the optimal number of hash functions
171+
this.numHashFunction = Math.ceil(Math.log(2), numBit / capacity);
172+
173+
// Create a bitmap
174+
this.bitmap = new exports.Bitmap(numBit);
175+
176+
// Generate an array of hash functions
177+
this.hashFunctions = Array.from(
178+
{ length: this.numHashFunction },
179+
function() {
180+
return mkHashFun(randomUint32(), numBit);
181+
}.bind(this)
182+
);
183+
};
184+
185+
/**
186+
* To check if an item is in the filter. If it is in, it will return true,
187+
* if it is not in the filter, highly likely it will return false, but guaranteed
188+
* @param {Number | String} value the value that we are trying to check in the filter
189+
*/
190+
exports.Bloomfilter.prototype.get = function(value) {
191+
value = String(value); // make it string
192+
var hashes = this.hashFunctions.map(function(hashFct) {
193+
return hashFct(value);
194+
});
195+
196+
// if one of the bits is not set
197+
for (var i = 0; i < hashes.length; i = i + 1) {
198+
if (this.bitmap.exists(hashes[i]) === false) {
199+
return false;
200+
}
201+
}
202+
return true;
203+
};
204+
205+
/**
206+
* To set(put) an item in the bloom filter
207+
* @public
208+
* @param {Number | String} value the value that is been set in the filter
209+
*/
210+
exports.Bloomfilter.prototype.set = function(value) {
211+
value = String(value); // make it string
212+
var hashes = this.hashFunctions.map(function(hashFct) {
213+
return hashFct(value);
214+
});
215+
216+
// Set all the corresponding bits
217+
for (var i = 0; i < hashes.length; i = i + 1) {
218+
this.bitmap.set(hashes[i], true);
219+
}
220+
};
221+
})(typeof window === 'undefined' ? module.exports : window);
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
var mod = require('../../src/data-structures/bloomfilter.js');
2+
var Bitmap = mod.Bitmap;
3+
var Bloomfilter = mod.Bloomfilter;
4+
5+
describe('Bitmap', function() {
6+
'use strict';
7+
8+
it('should be able to get and set values', function() {
9+
var bitmap = new Bitmap(1024);
10+
expect(bitmap.exists(0)).toBe(false);
11+
bitmap.set(0, true);
12+
expect(bitmap.exists(0)).toBe(true);
13+
expect(bitmap.exists(1023)).toBe(false);
14+
bitmap.set(1023, 1);
15+
expect(bitmap.exists(1023)).toBe(true);
16+
});
17+
18+
it('should be able to change everthing back', function() {
19+
var bitmap = new Bitmap(2048);
20+
for (var i = 0; i < 2048; i = i + 1) {
21+
expect(bitmap.get(i)).toBe(0);
22+
bitmap.set(i, 1);
23+
expect(bitmap.get(i)).toBe(1);
24+
bitmap.set(i, 0);
25+
expect(bitmap.get(i)).toBe(0);
26+
}
27+
});
28+
});
29+
30+
describe('Bloomfilter', function() {
31+
'use strict';
32+
it('should be able to identify duplicates', function() {
33+
var bloomfilter = new Bloomfilter(1024, 0.01);
34+
expect(bloomfilter.get('a')).toBe(false);
35+
expect(bloomfilter.get('b')).toBe(false);
36+
bloomfilter.set('a');
37+
expect(bloomfilter.get('a')).toBe(true);
38+
expect(bloomfilter.get('b')).toBe(false);
39+
bloomfilter.set('b');
40+
expect(bloomfilter.get('a')).toBe(true);
41+
expect(bloomfilter.get('b')).toBe(true);
42+
});
43+
44+
it('should handle large amount of data inside', function() {
45+
var bloomfilter = new Bloomfilter(4096, 0.001); // high precision
46+
47+
var falsePositive = 0;
48+
for (var i = 0; i < 1024; i = i + 1) {
49+
if (bloomfilter.get(i)) {
50+
falsePositive = falsePositive + 1;
51+
}
52+
bloomfilter.set(i, true);
53+
expect(bloomfilter.get(i)).toBe(true);
54+
}
55+
expect(falsePositive).toBeLessThan(100); // set a high theshold
56+
});
57+
});

0 commit comments

Comments
 (0)