Skip to content

Introduce LongVariableByte #55

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Nov 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.classpath
.settings
.project
*.class
*.csv
Expand Down
6 changes: 6 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@
<version>4.13.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.roaringbitmap</groupId>
<artifactId>RoaringBitmap</artifactId>
<version>0.9.35</version>
<scope>test</scope>
</dependency>
</dependencies>
<issueManagement>
<system>GitHub Issue Tracking</system>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ public interface ByteIntegerCODEC {
* Compress data from an array to another array.
*
* Both inpos and outpos are modified to represent how much data was
* read and written to if 12 ints (inlength = 12) are compressed to 3
* read and written to. If 12 ints (inlength = 12) are compressed to 3
* bytes, then inpos will be incremented by 12 while outpos will be
* incremented by 3 we use IntWrapper to pass the values by reference.
* incremented by 3. We use IntWrapper to pass the values by reference.
*
* @param in
* input array
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ public void uncompress(int[] inBuf, IntWrapper inPos, int inLen,

int ip = inPos.get();
int op = outPos.get();
int vbcNum = 0, vbcShift = 24; // Varialbe Byte Context.
int vbcNum = 0, vbcShift = 24; // Variable Byte Context.
final int inPosLast = ip + inLen;
while (ip < inPosLast) {
// Fetch a byte value.
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/me/lemire/integercompression/IntegerCODEC.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ public interface IntegerCODEC {
* Compress data from an array to another array.
*
* Both inpos and outpos are modified to represent how much data was
* read and written to if 12 ints (inlength = 12) are compressed to 3
* read and written to. If 12 ints (inlength = 12) are compressed to 3
* ints, then inpos will be incremented by 12 while outpos will be
* incremented by 3 we use IntWrapper to pass the values by reference.
* incremented by 3. We use IntWrapper to pass the values by reference.
*
* @param in
* input array
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

/**
* Interface describing a standard CODEC to compress integers. This is a
* variation on the IntegerCODEC interface meant to be used for random access.
* variation on the IntegerCODEC interface meant to be used for head access.
*
* The main difference is that we must specify the number of integers we wish to
* decode. This information should be stored elsewhere.
Expand All @@ -25,8 +25,8 @@ public interface SkippableIntegerCODEC {
* Compress data from an array to another array.
*
* Both inpos and outpos are modified to represent how much data was read
* and written to if 12 ints (inlength = 12) are compressed to 3 ints, then
* inpos will be incremented by 12 while outpos will be incremented by 3 we
* and written to. If 12 ints (inlength = 12) are compressed to 3 ints, then
* inpos will be incremented by 12 while outpos will be incremented by 3. We
* use IntWrapper to pass the values by reference.
*
* @param in
Expand Down
6 changes: 6 additions & 0 deletions src/main/java/me/lemire/integercompression/VariableByte.java
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,11 @@ public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
for (int v = 0, shift = 0; p < finalp;) {
val = in[p];
int c = (byte) (val >>> s);
// Shift to next byte
s += 8;
// Shift to next integer if s==32
p += s>>5;
// cycle from 31 to 0
s = s & 31;
v += ((c & 127) << shift);
if ((c & 128) == 128) {
Expand Down Expand Up @@ -187,8 +190,11 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o
for (int v = 0, shift = 0; tmpoutpos < finaloutpos;) {
val = in[p];
int c = val >>> s;
// Shift to next byte
s += 8;
// Shift to next integer if s==32
p += s>>5;
// cycle from 31 to 0
s = s & 31;
v += ((c & 127) << shift);
if ((c & 128) == 128) {
Expand Down
62 changes: 62 additions & 0 deletions src/main/java/me/lemire/longcompression/ByteLongCODEC.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/**
* This code is released under the
* Apache License Version 2.0 http://www.apache.org/licenses/.
*
* (c) Daniel Lemire, http://lemire.me/en/
*/

package me.lemire.longcompression;

import me.lemire.integercompression.IntWrapper;

/**
* Interface describing a CODEC to compress longs to bytes.
*
* @author Benoit Lacelle
*
*/
public interface ByteLongCODEC {
/**
* Compress data from an array to another array.
*
* Both inpos and outpos are modified to represent how much data was
* read and written to. If 12 longs (inlength = 12) are compressed to 3
* bytes, then inpos will be incremented by 12 while outpos will be
* incremented by 3. We use IntWrapper to pass the values by reference.
*
* @param in
* input array
* @param inpos
* location in the input array
* @param inlength
* how many longs to compress
* @param out
* output array
* @param outpos
* where to write in the output array
*/
public void compress(long[] in, IntWrapper inpos, int inlength,
byte[] out, IntWrapper outpos);

/**
* Uncompress data from an array to another array.
*
* Both inpos and outpos parameters are modified to indicate new
* positions after read/write.
*
* @param in
* array containing data in compressed form
* @param inpos
* where to start reading in the array
* @param inlength
* length of the compressed data (ignored by some
* schemes)
* @param out
* array where to write the compressed output
* @param outpos
* where to write the compressed output in out
*/
public void uncompress(byte[] in, IntWrapper inpos, int inlength,
long[] out, IntWrapper outpos);

}
11 changes: 11 additions & 0 deletions src/main/java/me/lemire/longcompression/IntegratedLongCODEC.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package me.lemire.longcompression;

/**
* This is just like LongCODEC, except that it indicates that delta coding is
* "integrated", so that you don't need a separate step for delta coding.
*
* @author Benoit Lacelle
*/
public interface IntegratedLongCODEC extends LongCODEC {

}
189 changes: 189 additions & 0 deletions src/main/java/me/lemire/longcompression/LongAs2IntsCodec.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
package me.lemire.longcompression;

import java.util.Arrays;

import me.lemire.integercompression.BinaryPacking;
import me.lemire.integercompression.Composition;
import me.lemire.integercompression.IntCompressor;
import me.lemire.integercompression.IntWrapper;
import me.lemire.integercompression.IntegerCODEC;
import me.lemire.integercompression.VariableByte;

/**
* A {@link LongCODEC} which split each long in a highpart (32 first bits) and a low part (32 last bits).
*
* @author Benoit Lacelle
*
*/
public class LongAs2IntsCodec implements LongCODEC {
final IntegerCODEC highPartsCodec;
final IntegerCODEC lowPartsCodec;

public LongAs2IntsCodec(IntegerCODEC highPartsCodec, IntegerCODEC lowPartsCodec) {
this.highPartsCodec = highPartsCodec;
this.lowPartsCodec = lowPartsCodec;
}

/**
* By default, we expect longs to be slightly above Integer.MAX_VALUE. Hence highParts to be small and positive
* integers. For lowParts, we rely on {@link IntCompressor} default IntegerCODEC
*/
public LongAs2IntsCodec() {
this(new VariableByte(), new Composition(new BinaryPacking(), new VariableByte()));
}

@Override
public void compress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos) {
if (inlength == 0) {
return;
}

int[] highParts = new int[inlength];
int[] lowParts = new int[inlength];

for (int i = 0; i < inlength; i++) {
int inPosition = inpos.get() + i;

highParts[i] = RoaringIntPacking.high(in[inPosition]);
lowParts[i] = RoaringIntPacking.low(in[inPosition]);
}

// TODO What would be a relevant buffer size?
int[] buffer = new int[inlength * 16];

int outPosition = outpos.get();

boolean hasLeftover;
{
// The first integer is reserved to hold the number of compressed ints
IntWrapper highPartsOutPosition = new IntWrapper(1);

highPartsCodec.compress(highParts, new IntWrapper(), inlength, buffer, highPartsOutPosition);

// Record the compressedHighparts length
buffer[0] = highPartsOutPosition.get() - 1;

for (int i = 0; i < highPartsOutPosition.get() / 2; i++) {
long pack = RoaringIntPacking.pack(buffer[i * 2], buffer[i * 2 + 1]);
out[outPosition++] = pack;
}

if (1 == highPartsOutPosition.get() % 2) {
// Shift the trailing integer as first in the buffer
hasLeftover = true;
buffer[0] = buffer[highPartsOutPosition.get() - 1];
} else {
hasLeftover = false;
}
}

{
// The first integer is reserved to hold the number of compressed ints
IntWrapper lowPartsOutPosition = new IntWrapper(1);
if (hasLeftover) {
// Keep the trailing int from highParts before the reserved int from lowParts compressed length
lowPartsOutPosition.set(2);
}

lowPartsCodec.compress(lowParts, new IntWrapper(0), inlength, buffer, lowPartsOutPosition);

// Record the compressedHighparts length
buffer[hasLeftover ? 1 : 0] = lowPartsOutPosition.get() - (hasLeftover ? 2 : 1);

for (int i = 0; i < lowPartsOutPosition.get() / 2; i++) {
long pack = RoaringIntPacking.pack(buffer[i * 2], buffer[i * 2 + 1]);
out[outPosition++] = pack;
}

if (1 == lowPartsOutPosition.get() % 2) {
// The trailing integer is packed with a 0
long pack = RoaringIntPacking.pack(buffer[lowPartsOutPosition.get() - 1], 0);
out[outPosition++] = pack;
}
}

inpos.add(inlength);
outpos.set(outPosition);
}

/**
* inlength is ignored by this codec. We may rely on it instead of storing the compressedLowPart length
*/
@Override
public void uncompress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos) {
if (inlength == 0) {
return;
}

int longIndex = inpos.get();

int nbCompressedHighParts = RoaringIntPacking.high(in[longIndex]);
int[] compressedHighParts = new int[nbCompressedHighParts];

// !highPart as we just read the highPart for nbCompressedHighParts
boolean highPart = false;
for (int i = 0; i < nbCompressedHighParts; i++) {
int nextInt;
if (highPart) {
nextInt = RoaringIntPacking.high(in[longIndex + (i + 1) / 2]);
} else {
nextInt = RoaringIntPacking.low(in[longIndex + (i + 1) / 2]);
}
compressedHighParts[i] = nextInt;

highPart = !highPart;
}

// TODO What would be a relevant buffer size?
int[] buffer = new int[inlength * 16];

IntWrapper highPartsOutPosition = new IntWrapper();
highPartsCodec.uncompress(compressedHighParts,
new IntWrapper(),
compressedHighParts.length,
buffer,
highPartsOutPosition);
int[] highParts = Arrays.copyOf(buffer, highPartsOutPosition.get());

// +1 as we initially read nbCompressedHighParts
int intIndexNbCompressedLowParts = longIndex * 2 + 1 + nbCompressedHighParts;
int nbCompressedLowParts;
if (highPart) {
nbCompressedLowParts = RoaringIntPacking.high(in[intIndexNbCompressedLowParts / 2]);
} else {
nbCompressedLowParts = RoaringIntPacking.low(in[intIndexNbCompressedLowParts / 2]);
}
highPart = !highPart;

int[] compressedLowParts = new int[nbCompressedLowParts];
for (int i = 0; i < nbCompressedLowParts; i++) {
int nextInt;
if (highPart) {
nextInt = RoaringIntPacking.high(in[(intIndexNbCompressedLowParts + 1 + i) / 2]);
} else {
nextInt = RoaringIntPacking.low(in[(intIndexNbCompressedLowParts + 1 + i) / 2]);
}
compressedLowParts[i] = nextInt;

highPart = !highPart;
}

IntWrapper lowPartsOutPosition = new IntWrapper();
lowPartsCodec.uncompress(compressedLowParts,
new IntWrapper(),
compressedLowParts.length,
buffer,
lowPartsOutPosition);
int[] lowParts = Arrays.copyOf(buffer, lowPartsOutPosition.get());
assert highParts.length == lowParts.length;

int outposition = outpos.get();
for (int i = 0; i < highParts.length; i++) {
out[outposition++] = RoaringIntPacking.pack(highParts[i], lowParts[i]);
}

inpos.add(inlength);
outpos.set(outposition);
}

}
Loading