Skip to content

Commit 81ec274

Browse files
authored
Add HyperLogLog (TheAlgorithms#269)
1 parent 493792b commit 81ec274

File tree

3 files changed

+132
-0
lines changed

3 files changed

+132
-0
lines changed
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using DataStructures.Probabilistic;
4+
using FluentAssertions;
5+
using NUnit.Framework;
6+
7+
namespace DataStructures.Tests.Probabilistic
8+
{
9+
public class HyperLogLogTest
10+
{
11+
[Test]
12+
public void TestHyperLogLog()
13+
{
14+
var hll = new HyperLogLog<int>();
15+
HashSet<int> actual = new ();
16+
17+
var rand = new Random();
18+
var tolerance = .05;
19+
for (var i = 0; i < 10000; i++)
20+
{
21+
var k = rand.Next(20000);
22+
hll.Add(k);
23+
actual.Add(k);
24+
}
25+
26+
hll.Cardinality().Should()
27+
.BeGreaterOrEqualTo((int)(actual.Count * (1 - tolerance)))
28+
.And
29+
.BeLessOrEqualTo((int)(actual.Count * (1 + tolerance)));
30+
}
31+
32+
[Test]
33+
public void TestHyperLogLogMerge()
34+
{
35+
var hll1 = new HyperLogLog<int>();
36+
var hll2 = new HyperLogLog<int>();
37+
var rand = new Random();
38+
var tolerance = .05;
39+
HashSet<int> actual = new ();
40+
for (var i = 0; i < 5000; i++)
41+
{
42+
var k = rand.Next(20000);
43+
hll1.Add(k);
44+
actual.Add(k);
45+
}
46+
47+
for (var i = 0; i < 5000; i++)
48+
{
49+
var k = rand.Next(20000);
50+
hll2.Add(k);
51+
actual.Add(k);
52+
}
53+
54+
var hll = HyperLogLog<int>.Merge(hll1, hll2);
55+
hll.Cardinality().Should()
56+
.BeGreaterOrEqualTo((int)(actual.Count * (1 - tolerance)))
57+
.And
58+
.BeLessOrEqualTo((int)(actual.Count * (1 + tolerance)));
59+
}
60+
}
61+
}
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
5+
namespace DataStructures.Probabilistic
6+
{
7+
public class HyperLogLog<T> where T : notnull
8+
{
9+
private const int P = 16;
10+
private const double Alpha = .673;
11+
private readonly int[] registers;
12+
private readonly HashSet<int> setRegisters;
13+
14+
/// <summary>
15+
/// Initializes a new instance of the <see cref="HyperLogLog{T}"/> class.
16+
/// </summary>
17+
public HyperLogLog()
18+
{
19+
var m = 1 << P;
20+
registers = new int[m];
21+
setRegisters = new HashSet<int>();
22+
}
23+
24+
/// <summary>
25+
/// Merge's two HyperLogLog's together to form a union HLL.
26+
/// </summary>
27+
/// <param name="first">the first HLL.</param>
28+
/// <param name="second">The second HLL.</param>
29+
/// <returns>A HyperLogLog with the combined values of the two sets of registers.</returns>
30+
public static HyperLogLog<T> Merge(HyperLogLog<T> first, HyperLogLog<T> second)
31+
{
32+
var output = new HyperLogLog<T>();
33+
for (var i = 0; i < second.registers.Length; i++)
34+
{
35+
output.registers[i] = Math.Max(first.registers[i], second.registers[i]);
36+
}
37+
38+
output.setRegisters.UnionWith(first.setRegisters);
39+
output.setRegisters.UnionWith(second.setRegisters);
40+
return output;
41+
}
42+
43+
/// <summary>
44+
/// Adds an item to the HyperLogLog.
45+
/// </summary>
46+
/// <param name="item">The Item to be added.</param>
47+
public void Add(T item)
48+
{
49+
var x = item.GetHashCode();
50+
var binString = Convert.ToString(x, 2); // converts hash to binary
51+
var j = Convert.ToInt32(binString.Substring(0, Math.Min(P, binString.Length)), 2); // convert first b bits to register index
52+
var w = (int)Math.Log2(x ^ (x & (x - 1))); // find position of the right most 1.
53+
registers[j] = Math.Max(registers[j], w); // set the appropriate register to the appropriate value.
54+
setRegisters.Add(j);
55+
}
56+
57+
/// <summary>
58+
/// Determines the approximate cardinality of the HyperLogLog.
59+
/// </summary>
60+
/// <returns>the approximate cardinality.</returns>
61+
public int Cardinality()
62+
{
63+
// calculate the bottom part of the harmonic mean of the registers
64+
double z = setRegisters.Sum(index => Math.Pow(2, -1 * registers[index]));
65+
66+
// calculate the harmonic mean of the set registers
67+
return (int)Math.Ceiling(Alpha * setRegisters.Count * (setRegisters.Count / z));
68+
}
69+
}
70+
}

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ This repository contains algorithms and data structures implemented in C# for ed
141141
* [Probabilistic](./DataStructures/Probabilistic)
142142
* [BloomFilter](./DataStructures/Probabilistic/BloomFilter.cs)
143143
* [Count-Min Sketch](./DataStructures/Probabilistic/CountMinSketch.cs)
144+
* [HyperLogLog](./DataStructures/Probabilistic/HyperLogLog.cs)
144145
* [Queue](./DataStructures/Queue)
145146
* [Array-based Queue](./DataStructures/Queue/ArrayBasedQueue.cs)
146147
* [List-based Queue](./DataStructures/Queue/ListBasedQueue.cs)

0 commit comments

Comments
 (0)