Skip to content

Commit 7842c01

Browse files
committed
add BloomFilter
1 parent c2e2402 commit 7842c01

File tree

2 files changed

+260
-0
lines changed

2 files changed

+260
-0
lines changed
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
package src.main.java.com.search;
2+
3+
import java.io.Serializable;
4+
import java.util.ArrayList;
5+
import java.util.Collections;
6+
import java.util.List;
7+
import java.util.function.Function;
8+
9+
/**
10+
* A simple implementation of Bloom filter.
11+
* <p>
12+
* Bloom filter have a chance of being wrong.
13+
* <p>
14+
* The Bloom filter assert that elements that do not exist must not exist,
15+
* if assert an element exists, but not necessarily.
16+
* <p>
17+
* The accuracy rate depends on capacity and hash functions.
18+
*
19+
* @author yangxf
20+
*/
21+
public class BloomFilter implements Serializable {
22+
private static final long serialVersionUID = -4466610350741278658L;
23+
24+
private static final int LONG_SHIFT = 6;
25+
private static final int LONG_MASK = 63;
26+
27+
/**
28+
* hash functions
29+
*/
30+
private final List<Function<String, Integer>> hashFunctions;
31+
32+
private final long[] table;
33+
private final int tableMask;
34+
private int size;
35+
36+
/**
37+
* @param capacity the filter capacity
38+
* @param hashFunctions hash functions
39+
* @see Builder must be build by {@link Builder}.
40+
*/
41+
private BloomFilter(int capacity, List<Function<String, Integer>> hashFunctions) {
42+
this.hashFunctions = hashFunctions;
43+
int cap = nextPowerOf2(capacity);
44+
tableMask = (cap << LONG_SHIFT) - 1;
45+
table = new long[cap];
46+
}
47+
48+
public static Builder builder(int capacity) {
49+
if (capacity < 1) {
50+
throw new IllegalStateException("capacity must be > 0");
51+
}
52+
53+
return new Builder(capacity);
54+
}
55+
56+
/**
57+
* Add an element to the Bloom filter.
58+
*/
59+
public void add(String element) {
60+
checkNotNull(element, "element");
61+
62+
for (Function<String, Integer> hashFunction : hashFunctions) {
63+
int key = hashFunction.apply(element) & tableMask;
64+
table[key >>> LONG_SHIFT] |= (1 << (key & LONG_MASK));
65+
}
66+
size++;
67+
}
68+
69+
/**
70+
* @return true if the element exists, otherwise false.
71+
*/
72+
public boolean contains(String element) {
73+
if (element == null) {
74+
return false;
75+
}
76+
77+
for (Function<String, Integer> hashFunction : hashFunctions) {
78+
int key = hashFunction.apply(element) & tableMask;
79+
if ((table[key >>> LONG_SHIFT] & (1 << (key & LONG_MASK))) == 0) {
80+
return false;
81+
}
82+
}
83+
return true;
84+
}
85+
86+
public List<Function<String, Integer>> getHashFunctions() {
87+
return hashFunctions;
88+
}
89+
90+
public int size() {
91+
return size;
92+
}
93+
94+
private static void checkNotNull(String element, String msg) {
95+
if (element == null) {
96+
throw new NullPointerException(msg + " must be not null");
97+
}
98+
}
99+
100+
private static int nextPowerOf2(int i) {
101+
int n = i - 1;
102+
n |= n >>> 1;
103+
n |= n >>> 2;
104+
n |= n >>> 4;
105+
n |= n >>> 8;
106+
n |= n >>> 16;
107+
return (n < 0) ? 1 : (n >= 0x40000000) ? 0x40000000 : n + 1;
108+
}
109+
110+
/**
111+
* We need a list of unmodifiable hash functions.
112+
*/
113+
public static class Builder {
114+
private int capacity;
115+
private List<Function<String, Integer>> hashFunctions = new ArrayList<>();
116+
117+
private Builder(int capacity) {
118+
this.capacity = capacity;
119+
}
120+
121+
public Builder addHashFunction(Function<String, Integer> function) {
122+
hashFunctions.add(function);
123+
return this;
124+
}
125+
126+
public BloomFilter build() {
127+
if (hashFunctions.isEmpty()) {
128+
addDefaultHashFunction();
129+
}
130+
return new BloomFilter(capacity, Collections.unmodifiableList(hashFunctions));
131+
}
132+
133+
/**
134+
* I provides several default hash functions
135+
*/
136+
private void addDefaultHashFunction() {
137+
// Java String Hash Function
138+
hashFunctions.add(String::hashCode);
139+
140+
// SDBM Hash Function
141+
hashFunctions.add(key -> {
142+
if (key == null || key.isEmpty()) {
143+
return 0;
144+
}
145+
146+
int hash = 0;
147+
for (int i = 0; i < key.length(); i++) {
148+
hash = key.charAt(i) + (hash << 6) + (hash << 16) - hash;
149+
}
150+
hash &= 0x7ffffff;
151+
return hash;
152+
});
153+
154+
// Robert Sedgwicks Hash Function
155+
hashFunctions.add(key -> {
156+
if (key == null || key.isEmpty()) {
157+
return 0;
158+
}
159+
160+
int hash = 0;
161+
int magic = 63689;
162+
for (int i = 0; i < key.length(); i++) {
163+
hash = hash * magic + key.charAt(i);
164+
magic *= 378551;
165+
}
166+
return hash;
167+
});
168+
169+
// Arash Partow Hash Function
170+
hashFunctions.add(key -> {
171+
if (key == null || key.isEmpty()) {
172+
return 0;
173+
}
174+
175+
int hash = 0;
176+
for (int i = 0; i < key.length(); i++) {
177+
char ch = key.charAt(i);
178+
if ((i & 1) == 0) {
179+
hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
180+
} else {
181+
hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
182+
}
183+
}
184+
return hash;
185+
});
186+
}
187+
188+
}
189+
190+
}
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
package src.test.java.com.search;
2+
3+
import org.junit.Test;
4+
import src.main.java.com.search.BloomFilter;
5+
6+
import java.util.HashSet;
7+
import java.util.Set;
8+
import java.util.concurrent.ThreadLocalRandom;
9+
10+
import static org.junit.Assert.*;
11+
12+
public class BloomFilterTest {
13+
14+
@Test
15+
public void test() {
16+
int count = 100000;
17+
int low = 50, up = 100;
18+
BloomFilter filter = BloomFilter.builder(10000).build();
19+
String[] data = new String[count];
20+
Set<String> dataSet = new HashSet<>();
21+
for (int i = 0; i < count; i++) {
22+
String str = randomString(low, up);
23+
data[i] = str;
24+
if (i % 2 == 0) {
25+
dataSet.add(str);
26+
filter.add(str);
27+
}
28+
}
29+
30+
int error = 0, total = 0;
31+
for (int i = 0; i < count; i++) {
32+
String str = data[i];
33+
if (filter.contains(str)) {
34+
total++;
35+
if (!dataSet.contains(str)) {
36+
error++;
37+
}
38+
} else {
39+
assertFalse(dataSet.contains(str));
40+
}
41+
}
42+
System.out.println("error: " + error);
43+
System.out.println("total: " + total);
44+
System.out.println("error rate : " + (double) error / total);
45+
}
46+
47+
public static String randomString(int minLength, int maxLength) {
48+
ThreadLocalRandom r = ThreadLocalRandom.current();
49+
int chLen = r.nextInt(minLength, maxLength),
50+
poolSize = CHAR_POOL.length;
51+
char[] chars = new char[chLen];
52+
for (int i = 0; i < chLen; i++) {
53+
chars[i] = CHAR_POOL[r.nextInt(poolSize)];
54+
}
55+
56+
return new String(chars);
57+
}
58+
59+
private static final char[] CHAR_POOL;
60+
61+
static {
62+
CHAR_POOL = new char[52];
63+
int i = 0;
64+
for (char c = 'a'; c <= 'z'; c++) {
65+
CHAR_POOL[i++] = c;
66+
CHAR_POOL[i++] = (char) (c - 32);
67+
}
68+
}
69+
70+
}

0 commit comments

Comments
 (0)