diff --git a/.gitignore b/.gitignore index 53960d2..88b3320 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .classpath +.settings .project *.class *.csv diff --git a/pom.xml b/pom.xml index fc6912b..a960b2c 100644 --- a/pom.xml +++ b/pom.xml @@ -48,6 +48,12 @@ 4.13.1 test + + org.roaringbitmap + RoaringBitmap + 0.9.35 + test + GitHub Issue Tracking diff --git a/src/main/java/me/lemire/integercompression/ByteIntegerCODEC.java b/src/main/java/me/lemire/integercompression/ByteIntegerCODEC.java index 47d4f57..6e8f903 100644 --- a/src/main/java/me/lemire/integercompression/ByteIntegerCODEC.java +++ b/src/main/java/me/lemire/integercompression/ByteIntegerCODEC.java @@ -18,9 +18,9 @@ public interface ByteIntegerCODEC { * Compress data from an array to another array. * * Both inpos and outpos are modified to represent how much data was - * read and written to if 12 ints (inlength = 12) are compressed to 3 + * read and written to. If 12 ints (inlength = 12) are compressed to 3 * bytes, then inpos will be incremented by 12 while outpos will be - * incremented by 3 we use IntWrapper to pass the values by reference. + * incremented by 3. We use IntWrapper to pass the values by reference. * * @param in * input array diff --git a/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java b/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java index 4b2f896..ca9d0ad 100644 --- a/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java +++ b/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java @@ -105,7 +105,7 @@ public void uncompress(int[] inBuf, IntWrapper inPos, int inLen, int ip = inPos.get(); int op = outPos.get(); - int vbcNum = 0, vbcShift = 24; // Varialbe Byte Context. + int vbcNum = 0, vbcShift = 24; // Variable Byte Context. final int inPosLast = ip + inLen; while (ip < inPosLast) { // Fetch a byte value. 
diff --git a/src/main/java/me/lemire/integercompression/IntegerCODEC.java b/src/main/java/me/lemire/integercompression/IntegerCODEC.java index 7929e48..f2c9c7a 100644 --- a/src/main/java/me/lemire/integercompression/IntegerCODEC.java +++ b/src/main/java/me/lemire/integercompression/IntegerCODEC.java @@ -18,9 +18,9 @@ public interface IntegerCODEC { * Compress data from an array to another array. * * Both inpos and outpos are modified to represent how much data was - * read and written to if 12 ints (inlength = 12) are compressed to 3 + * read and written to. If 12 ints (inlength = 12) are compressed to 3 * ints, then inpos will be incremented by 12 while outpos will be - * incremented by 3 we use IntWrapper to pass the values by reference. + * incremented by 3. We use IntWrapper to pass the values by reference. * * @param in * input array diff --git a/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java b/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java index c10d2f0..4568d71 100644 --- a/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java +++ b/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java @@ -10,7 +10,7 @@ /** * Interface describing a standard CODEC to compress integers. This is a - * variation on the IntegerCODEC interface meant to be used for random access. + * variation on the IntegerCODEC interface meant to be used for head access. * * The main difference is that we must specify the number of integers we wish to * decode. This information should be stored elsewhere. @@ -25,8 +25,8 @@ public interface SkippableIntegerCODEC { * Compress data from an array to another array. * * Both inpos and outpos are modified to represent how much data was read - * and written to if 12 ints (inlength = 12) are compressed to 3 ints, then - * inpos will be incremented by 12 while outpos will be incremented by 3 we + * and written to. 
/**
 * Interface describing a CODEC to compress longs to bytes.
 *
 * Positions are exchanged through {@link IntWrapper} instances so that an
 * implementation can report how far it advanced in each array.
 *
 * @author Benoit Lacelle
 *
 */
public interface ByteLongCODEC {
    /**
     * Compress data from an array to another array.
     *
     * Both inpos and outpos are modified to represent how much data was
     * read and written to. If 12 longs (inlength = 12) are compressed to 3
     * bytes, then inpos will be incremented by 12 while outpos will be
     * incremented by 3. We use IntWrapper to pass the values by reference.
     *
     * @param in
     *            input array
     * @param inpos
     *            location in the input array
     * @param inlength
     *            how many longs to compress
     * @param out
     *            output array (receives the compressed bytes)
     * @param outpos
     *            where to write in the output array
     */
    public void compress(long[] in, IntWrapper inpos, int inlength,
            byte[] out, IntWrapper outpos);

    /**
     * Uncompress data from an array to another array.
     *
     * Both inpos and outpos parameters are modified to indicate new
     * positions after read/write.
     *
     * @param in
     *            array containing data in compressed form
     * @param inpos
     *            where to start reading in the array
     * @param inlength
     *            length of the compressed data (ignored by some
     *            schemes)
     * @param out
     *            array where to write the uncompressed output
     * @param outpos
     *            where to write the uncompressed output in out
     */
    public void uncompress(byte[] in, IntWrapper inpos, int inlength,
            long[] out, IntWrapper outpos);

}
/**
 * A {@link LongCODEC} which splits each long into a high part (32 first bits) and a low part (32 last bits).
 *
 * The two resulting int arrays are compressed independently, each with its own {@link IntegerCODEC}, and the
 * two compressed int streams are then packed pairwise into the output long array.
 *
 * @author Benoit Lacelle
 *
 */
public class LongAs2IntsCodec implements LongCODEC {
    // Codec applied to the 32 most significant bits of every input long
    final IntegerCODEC highPartsCodec;
    // Codec applied to the 32 least significant bits of every input long
    final IntegerCODEC lowPartsCodec;

    /**
     * @param highPartsCodec
     *            codec used for the high 32 bits of each long
     * @param lowPartsCodec
     *            codec used for the low 32 bits of each long
     */
    public LongAs2IntsCodec(IntegerCODEC highPartsCodec, IntegerCODEC lowPartsCodec) {
        this.highPartsCodec = highPartsCodec;
        this.lowPartsCodec = lowPartsCodec;
    }

    /**
     * By default, we expect longs to be slightly above Integer.MAX_VALUE. Hence highParts to be small and positive
     * integers. For lowParts, we rely on {@link IntCompressor} default IntegerCODEC
     */
    public LongAs2IntsCodec() {
        this(new VariableByte(), new Composition(new BinaryPacking(), new VariableByte()));
    }

    /**
     * Compresses {@code inlength} longs starting at {@code inpos}.
     *
     * Output layout, seen as a sequence of ints packed two per long (first int in the upper half):
     * the number of compressed high-part ints, the compressed high parts, the number of compressed
     * low-part ints, the compressed low parts, and a trailing 0 int if needed to complete the last long.
     *
     * On return inpos is advanced by inlength and outpos points just after the last written long.
     */
    @Override
    public void compress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos) {
        if (inlength == 0) {
            return;
        }

        int[] highParts = new int[inlength];
        int[] lowParts = new int[inlength];

        // Split every input long into its two 32-bit halves
        for (int i = 0; i < inlength; i++) {
            int inPosition = inpos.get() + i;

            highParts[i] = RoaringIntPacking.high(in[inPosition]);
            lowParts[i] = RoaringIntPacking.low(in[inPosition]);
        }

        // TODO What would be a relevant buffer size?
        // Scratch area shared by both passes: it first receives the compressed high parts, then the compressed low parts
        int[] buffer = new int[inlength * 16];

        int outPosition = outpos.get();

        // true when the high-parts stream (length prefix included) holds an odd number of ints,
        // i.e. its last int has not been packed into a long yet
        boolean hasLeftover;
        {
            // The first integer is reserved to hold the number of compressed ints
            IntWrapper highPartsOutPosition = new IntWrapper(1);

            highPartsCodec.compress(highParts, new IntWrapper(), inlength, buffer, highPartsOutPosition);

            // Record the compressedHighparts length
            buffer[0] = highPartsOutPosition.get() - 1;

            // Pack every complete pair of ints into one output long
            for (int i = 0; i < highPartsOutPosition.get() / 2; i++) {
                long pack = RoaringIntPacking.pack(buffer[i * 2], buffer[i * 2 + 1]);
                out[outPosition++] = pack;
            }

            if (1 == highPartsOutPosition.get() % 2) {
                // Shift the trailing integer as first in the buffer
                hasLeftover = true;
                buffer[0] = buffer[highPartsOutPosition.get() - 1];
            } else {
                hasLeftover = false;
            }
        }

        {
            // The first integer is reserved to hold the number of compressed ints
            IntWrapper lowPartsOutPosition = new IntWrapper(1);
            if (hasLeftover) {
                // Keep the trailing int from highParts before the reserved int from lowParts compressed length
                lowPartsOutPosition.set(2);
            }

            lowPartsCodec.compress(lowParts, new IntWrapper(0), inlength, buffer, lowPartsOutPosition);

            // Record the compressed lowParts length, in the slot reserved right before the low-parts data
            buffer[hasLeftover ? 1 : 0] = lowPartsOutPosition.get() - (hasLeftover ? 2 : 1);

            // Pack every complete pair of ints (including the high-parts leftover, if any) into one output long
            for (int i = 0; i < lowPartsOutPosition.get() / 2; i++) {
                long pack = RoaringIntPacking.pack(buffer[i * 2], buffer[i * 2 + 1]);
                out[outPosition++] = pack;
            }

            if (1 == lowPartsOutPosition.get() % 2) {
                // The trailing integer is packed with a 0
                long pack = RoaringIntPacking.pack(buffer[lowPartsOutPosition.get() - 1], 0);
                out[outPosition++] = pack;
            }
        }

        inpos.add(inlength);
        outpos.set(outPosition);
    }

    /**
     * Reverses {@link #compress(long[], IntWrapper, int, long[], IntWrapper)}.
     *
     * The lengths of both compressed streams are read from the data itself. inlength is nevertheless used:
     * it sizes the temporary decoding buffer (inlength * 16 ints) and inpos is advanced by inlength on return.
     *
     * NOTE(review): the decoding buffer is sized from the compressed length; presumably a codec pair expanding
     * by more than 16 ints per compressed long would overflow it - confirm.
     */
    @Override
    public void uncompress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos) {
        if (inlength == 0) {
            return;
        }

        int longIndex = inpos.get();

        // The very first int of the stream (upper half of the first long) is the high-parts compressed length
        int nbCompressedHighParts = RoaringIntPacking.high(in[longIndex]);
        int[] compressedHighParts = new int[nbCompressedHighParts];

        // !highPart as we just read the highPart for nbCompressedHighParts
        // From here on, highPart tells which half of the current long holds the next int to read
        boolean highPart = false;
        for (int i = 0; i < nbCompressedHighParts; i++) {
            int nextInt;
            // (i + 1) skips the length prefix; dividing by 2 converts an int offset into a long offset
            if (highPart) {
                nextInt = RoaringIntPacking.high(in[longIndex + (i + 1) / 2]);
            } else {
                nextInt = RoaringIntPacking.low(in[longIndex + (i + 1) / 2]);
            }
            compressedHighParts[i] = nextInt;

            highPart = !highPart;
        }

        // TODO What would be a relevant buffer size?
        int[] buffer = new int[inlength * 16];

        IntWrapper highPartsOutPosition = new IntWrapper();
        highPartsCodec.uncompress(compressedHighParts,
                new IntWrapper(),
                compressedHighParts.length,
                buffer,
                highPartsOutPosition);
        // Copy out the decoded high parts, as the buffer is reused for the low parts below
        int[] highParts = Arrays.copyOf(buffer, highPartsOutPosition.get());

        // +1 as we initially read nbCompressedHighParts
        // Absolute int index (2 ints per long) of the low-parts length prefix
        int intIndexNbCompressedLowParts = longIndex * 2 + 1 + nbCompressedHighParts;
        int nbCompressedLowParts;
        if (highPart) {
            nbCompressedLowParts = RoaringIntPacking.high(in[intIndexNbCompressedLowParts / 2]);
        } else {
            nbCompressedLowParts = RoaringIntPacking.low(in[intIndexNbCompressedLowParts / 2]);
        }
        highPart = !highPart;

        int[] compressedLowParts = new int[nbCompressedLowParts];
        for (int i = 0; i < nbCompressedLowParts; i++) {
            int nextInt;
            if (highPart) {
                nextInt = RoaringIntPacking.high(in[(intIndexNbCompressedLowParts + 1 + i) / 2]);
            } else {
                nextInt = RoaringIntPacking.low(in[(intIndexNbCompressedLowParts + 1 + i) / 2]);
            }
            compressedLowParts[i] = nextInt;

            highPart = !highPart;
        }

        IntWrapper lowPartsOutPosition = new IntWrapper();
        lowPartsCodec.uncompress(compressedLowParts,
                new IntWrapper(),
                compressedLowParts.length,
                buffer,
                lowPartsOutPosition);
        int[] lowParts = Arrays.copyOf(buffer, lowPartsOutPosition.get());
        // Both halves were produced from the same number of input longs
        assert highParts.length == lowParts.length;

        // Re-assemble each long from its two halves
        int outposition = outpos.get();
        for (int i = 0; i < highParts.length; i++) {
            out[outposition++] = RoaringIntPacking.pack(highParts[i], lowParts[i]);
        }

        inpos.add(inlength);
        outpos.set(outposition);
    }

}
/**
 * Interface describing a standard CODEC to compress longs.
 *
 * Positions are exchanged through {@link IntWrapper} instances so that an
 * implementation can report how far it advanced in each array.
 *
 * @author Benoit Lacelle
 *
 */
public interface LongCODEC {
    /**
     * Compress data from an array to another array.
     *
     * Both inpos and outpos are modified to represent how much data was
     * read and written to. If 12 longs (inlength = 12) are compressed to 3
     * longs, then inpos will be incremented by 12 while outpos will be
     * incremented by 3. We use IntWrapper to pass the values by reference.
     *
     * @param in
     *            input array
     * @param inpos
     *            location in the input array
     * @param inlength
     *            how many longs to compress
     * @param out
     *            output array (receives the compressed longs)
     * @param outpos
     *            where to write in the output array
     */
    public void compress(long[] in, IntWrapper inpos, int inlength,
            long[] out, IntWrapper outpos);

    /**
     * Uncompress data from an array to another array.
     *
     * Both inpos and outpos parameters are modified to indicate new
     * positions after read/write.
     *
     * @param in
     *            array containing data in compressed form
     * @param inpos
     *            where to start reading in the array
     * @param inlength
     *            length of the compressed data (ignored by some
     *            schemes)
     * @param out
     *            array where to write the uncompressed output
     * @param outpos
     *            where to write the uncompressed output in out
     */
    public void uncompress(long[] in, IntWrapper inpos, int inlength,
            long[] out, IntWrapper outpos);

}
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +package me.lemire.longcompression; + +import me.lemire.integercompression.IntWrapper; + +/** + * Helper class to compose schemes. + * + * @author Benoit Lacelle + */ +public class LongComposition implements LongCODEC { + LongCODEC F1, F2; + + /** + * Compose a scheme from a first one (f1) and a second one (f2). The + * first one is called first and then the second one tries to compress + * whatever remains from the first run. + * + * By convention, the first scheme should be such that if, during + * decoding, a 32-bit zero is first encountered, then there is no + * output. + * + * @param f1 + * first codec + * @param f2 + * second codec + */ + public LongComposition(LongCODEC f1, LongCODEC f2) { + F1 = f1; + F2 = f2; + } + + @Override + public void compress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + if (inlength == 0) { + return; + } + int inposInit = inpos.get(); + int outposInit = outpos.get(); + F1.compress(in, inpos, inlength, out, outpos); + if (outpos.get() == outposInit) { + out[outposInit] = 0; + outpos.increment(); + } + inlength -= inpos.get() - inposInit; + F2.compress(in, inpos, inlength, out, outpos); + } + + @Override + public void uncompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + if (inlength == 0) + return; + final int init = inpos.get(); + F1.uncompress(in, inpos, inlength, out, outpos); + inlength -= inpos.get() - init; + F2.uncompress(in, inpos, inlength, out, outpos); + } + + @Override + public String toString() { + return F1.toString() + " + " + F2.toString(); + } + +} diff --git a/src/main/java/me/lemire/longcompression/LongJustCopy.java b/src/main/java/me/lemire/longcompression/LongJustCopy.java new file mode 100644 index 0000000..7a5a67a --- /dev/null +++ b/src/main/java/me/lemire/longcompression/LongJustCopy.java @@ -0,0 +1,52 @@ +/** + * This code is released under the + * Apache License Version 2.0 
http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +package me.lemire.longcompression; + +import me.lemire.integercompression.IntWrapper; + +/** + * @author Benoit lacelle + * + */ +public final class LongJustCopy implements LongCODEC, SkippableLongCODEC { + + @Override + public void headlessCompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + System.arraycopy(in, inpos.get(), out, outpos.get(), inlength); + inpos.add(inlength); + outpos.add(inlength); + } + + @Override + public void uncompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + headlessUncompress(in,inpos,inlength,out,outpos,inlength); + } + + @Override + public String toString() { + return this.getClass().getSimpleName(); + } + + @Override + public void headlessUncompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos, int num) { + System.arraycopy(in, inpos.get(), out, outpos.get(), num); + inpos.add(num); + outpos.add(num); + + } + + @Override + public void compress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + headlessCompress(in,inpos,inlength,out,outpos); + } + +} diff --git a/src/main/java/me/lemire/longcompression/LongUtil.java b/src/main/java/me/lemire/longcompression/LongUtil.java new file mode 100644 index 0000000..c06433f --- /dev/null +++ b/src/main/java/me/lemire/longcompression/LongUtil.java @@ -0,0 +1,22 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
/**
 * Unofficial debugging helpers related to long compression.
 *
 * @author Benoit Lacelle
 *
 */
@Deprecated
public class LongUtil {

    /**
     * Renders a long as exactly 64 binary digits, left-padded with zeros.
     *
     * @param l
     *            the value to render
     * @return the 64-character two's-complement bit string of l
     */
    protected static String longToBinaryWithLeading(long l) {
        final String digits = Long.toBinaryString(l);
        final StringBuilder padded = new StringBuilder(Long.SIZE);
        // Long.toBinaryString drops leading zeros: put them back
        for (int missing = Long.SIZE - digits.length(); missing > 0; missing--) {
            padded.append('0');
        }
        padded.append(digits);
        return padded.toString();
    }
}
+ * + * @author Benoit Lacelle + */ +public class LongVariableByte implements LongCODEC, ByteLongCODEC, SkippableLongCODEC { + + private static byte extract7bits(int i, long val) { + return (byte) ((val >>> (7 * i)) & ((1 << 7) - 1)); + } + + private static byte extract7bitsmaskless(int i, long val) { + return (byte) ((val >>> (7 * i))); + } + @Override + public void compress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos) { + headlessCompress(in, inpos, inlength, out, outpos); + } + + @Override + public void headlessCompress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos) { + if (inlength == 0) + return; + // Worst case: we write 10 bytes per long, hence 2 longs for a long, hence 16 bytes per long + ByteBuffer buf = makeBuffer(inlength * 16); + buf.order(ByteOrder.LITTLE_ENDIAN); + for (int k = inpos.get(); k < inpos.get() + inlength; ++k) { + final long val = in[k]; + // System.out.println(LongUtil.longToBinaryWithLeading(val)); + if (val >= 0 && val < (1 << 7)) { + buf.put((byte) (val | (1 << 7))); + } else if (val >= 0 && val < (1 << 14)) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) (extract7bitsmaskless(1, (val)) | (1 << 7))); + } else if (val >= 0 && val < (1 << 21)) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) extract7bits(1, val)); + buf.put((byte) (extract7bitsmaskless(2, (val)) | (1 << 7))); + } else if (val >= 0 && val < (1 << 28)) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) extract7bits(1, val)); + buf.put((byte) extract7bits(2, val)); + buf.put((byte) (extract7bitsmaskless(3, (val)) | (1 << 7))); + } else if (val >= 0 && val < (1L << 35)) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) extract7bits(1, val)); + buf.put((byte) extract7bits(2, val)); + buf.put((byte) extract7bits(3, val)); + buf.put((byte) (extract7bitsmaskless(4, (val)) | (1 << 7))); + } else if (val >= 0 && val < (1L << 42)) { + buf.put((byte) extract7bits(0, val)); + 
buf.put((byte) extract7bits(1, val)); + buf.put((byte) extract7bits(2, val)); + buf.put((byte) extract7bits(3, val)); + buf.put((byte) extract7bits(4, val)); + buf.put((byte) (extract7bitsmaskless(5, (val)) | (1 << 7))); + } else if (val >= 0 && val < (1L << 49)) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) extract7bits(1, val)); + buf.put((byte) extract7bits(2, val)); + buf.put((byte) extract7bits(3, val)); + buf.put((byte) extract7bits(4, val)); + buf.put((byte) extract7bits(5, val)); + buf.put((byte) (extract7bitsmaskless(6, (val)) | (1 << 7))); + } else if (val >= 0 && val < (1L << 56)) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) extract7bits(1, val)); + buf.put((byte) extract7bits(2, val)); + buf.put((byte) extract7bits(3, val)); + buf.put((byte) extract7bits(4, val)); + buf.put((byte) extract7bits(5, val)); + buf.put((byte) extract7bits(6, val)); + buf.put((byte) (extract7bitsmaskless(7, (val)) | (1 << 7))); + } else if (val >= 0 && val < (1L << 63)) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) extract7bits(1, val)); + buf.put((byte) extract7bits(2, val)); + buf.put((byte) extract7bits(3, val)); + buf.put((byte) extract7bits(4, val)); + buf.put((byte) extract7bits(5, val)); + buf.put((byte) extract7bits(6, val)); + buf.put((byte) extract7bits(7, val)); + buf.put((byte) (extract7bitsmaskless(8, (val)) | (1 << 7))); + } else { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) extract7bits(1, val)); + buf.put((byte) extract7bits(2, val)); + buf.put((byte) extract7bits(3, val)); + buf.put((byte) extract7bits(4, val)); + buf.put((byte) extract7bits(5, val)); + buf.put((byte) extract7bits(6, val)); + buf.put((byte) extract7bits(7, val)); + buf.put((byte) extract7bits(8, val)); + buf.put((byte) (extract7bitsmaskless(9, (val)) | (1 << 7))); + } + } + while (buf.position() % 8 != 0) + buf.put((byte) 0); + final int length = buf.position(); + buf.flip(); + LongBuffer ibuf = buf.asLongBuffer(); + ibuf.get(out, 
outpos.get(), length / 8); + outpos.add(length / 8); + inpos.add(inlength); + } + + @Override + public void compress(long[] in, IntWrapper inpos, int inlength, byte[] out, + IntWrapper outpos) { + if (inlength == 0) + return; + int outpostmp = outpos.get(); + for (int k = inpos.get(); k < inpos.get() + inlength; ++k) { + final long val = in[k]; + if (val >= 0 && val < (1 << 7)) { + out[outpostmp++] = (byte) (val | (1 << 7)); + } else if (val >= 0 && val < (1 << 14)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(1, (val)) | (1 << 7)); + } else if (val >= 0 && val < (1 << 21)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(2, (val)) | (1 << 7)); + } else if (val >= 0 && val < (1 << 28)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) extract7bits(2, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(3, (val)) | (1 << 7)); + } else if (val >= 0 && val < (1L << 35)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) extract7bits(2, val); + out[outpostmp++] = (byte) extract7bits(3, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(4, (val)) | (1 << 7)); + } else if (val >= 0 && val < (1L << 42)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) extract7bits(2, val); + out[outpostmp++] = (byte) extract7bits(3, val); + out[outpostmp++] = (byte) extract7bits(4, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(5, (val)) | (1 << 7)); + } else if (val >= 0 && val < (1L << 49)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) extract7bits(2, val); + out[outpostmp++] = (byte) 
extract7bits(3, val); + out[outpostmp++] = (byte) extract7bits(4, val); + out[outpostmp++] = (byte) extract7bits(5, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(6, (val)) | (1 << 7)); + } else if (val >= 0 && val < (1L << 56)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) extract7bits(2, val); + out[outpostmp++] = (byte) extract7bits(3, val); + out[outpostmp++] = (byte) extract7bits(4, val); + out[outpostmp++] = (byte) extract7bits(5, val); + out[outpostmp++] = (byte) extract7bits(6, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(7, (val)) | (1 << 7)); + } else if (val >= 0 && val < (1L << 63)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) extract7bits(2, val); + out[outpostmp++] = (byte) extract7bits(3, val); + out[outpostmp++] = (byte) extract7bits(4, val); + out[outpostmp++] = (byte) extract7bits(5, val); + out[outpostmp++] = (byte) extract7bits(6, val); + out[outpostmp++] = (byte) extract7bits(7, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(8, (val)) | (1 << 7)); + } else { + // System.out.println(LongUtil.longToBinaryWithLeading(val)); + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) extract7bits(2, val); + out[outpostmp++] = (byte) extract7bits(3, val); + out[outpostmp++] = (byte) extract7bits(4, val); + out[outpostmp++] = (byte) extract7bits(5, val); + out[outpostmp++] = (byte) extract7bits(6, val); + out[outpostmp++] = (byte) extract7bits(7, val); + out[outpostmp++] = (byte) extract7bits(8, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(9, (val)) | (1 << 7)); + } + } + outpos.set(outpostmp); + inpos.add(inlength); + } + + @Override + public void uncompress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos) { + int s = 0; + long val = 0; + int p = 
inpos.get(); + int finalp = inpos.get() + inlength; + int tmpoutpos = outpos.get(); + for (long v = 0, shift = 0; p < finalp;) { + val = in[p]; + // System.out.println(LongUtil.longToBinaryWithLeading(val)); + long c = (byte) (val >>> s); + // Shift to next byte + s += 8; + // Shift to next long if s==64 + p += s>>6; + // cycle from 63 to 0 + s = s & 63; + v += ((c & 127) << shift); + if ((c & 128) == 128) { + out[tmpoutpos++] = v; + v = 0; + shift = 0; + } else + shift += 7; + assert shift < 64; + } + outpos.set(tmpoutpos); + inpos.add(inlength); + } + + @Override + public void uncompress(byte[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + int p = inpos.get(); + int finalp = inpos.get() + inlength; + int tmpoutpos = outpos.get(); + for (long v = 0; p < finalp; out[tmpoutpos++] = v) { + v = in[p] & 0x7F; + if (in[p] < 0) { + p += 1; + continue; + } + v = ((in[p + 1] & 0x7F) << 7) | v; + if (in[p + 1] < 0) { + p += 2; + continue; + } + v = ((in[p + 2] & 0x7F) << 14) | v; + if (in[p + 2] < 0 ) { + p += 3; + continue; + } + v = ((in[p + 3] & 0x7F) << 21) | v; + if (in[p + 3] < 0) { + p += 4; + continue; + } + v = (((long) in[p + 4] & 0x7F) << 28) | v; + if (in[p + 4] < 0) { + p += 5; + continue; + } + v = (((long) in[p + 5] & 0x7F) << 35) | v; + if (in[p + 5] < 0) { + p += 6; + continue; + } + v = (((long) in[p + 6] & 0x7F) << 42) | v; + if (in[p + 6] < 0) { + p += 7; + continue; + } + v = (((long) in[p + 7] & 0x7F) << 49) | v; + if (in[p + 7] < 0) { + p += 8; + continue; + } + v = (((long) in[p + 8] & 0x7F) << 56) | v; + if (in[p + 8] < 0) { + p += 9; + continue; + } + v = (((long) in[p + 9] & 0x7F) << 63) | v; + p += 10; + } + outpos.set(tmpoutpos); + inpos.add(p); + } + + @Override + public String toString() { + return this.getClass().getSimpleName(); + } + + @Override + public void headlessUncompress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos, int num) { + int s = 0; + long val = 0; + int p = inpos.get(); 
+ int tmpoutpos = outpos.get(); + int finaloutpos = num + tmpoutpos; + for (long v = 0, shift = 0; tmpoutpos < finaloutpos;) { + val = in[p]; + // System.out.println(longToBinaryWithLeading(val)); + long c = val >>> s; + // Shift to next byte + s += 8; + // Shift to next long if s == 64 + p += s>>6; + // cycle from 63 to 0 + s = s & 63; + v += ((c & 127) << shift); + if ((c & 128) == 128) { + out[tmpoutpos++] = v; + v = 0; + shift = 0; + } else + shift += 7; + assert shift < 64; + } + outpos.set(tmpoutpos); + inpos.set(p + (s!=0 ? 1 : 0)); + } + + /** + * Creates a new buffer of the requested size. + * + * In case you need a different way to allocate buffers, you can override this method + * with a custom behavior. The default implementation allocates a new Java direct + * {@link ByteBuffer} on each invocation. + */ + protected ByteBuffer makeBuffer(int sizeInBytes) { + return ByteBuffer.allocateDirect(sizeInBytes); + } +} diff --git a/src/main/java/me/lemire/longcompression/RoaringIntPacking.java b/src/main/java/me/lemire/longcompression/RoaringIntPacking.java new file mode 100644 index 0000000..f109ab3 --- /dev/null +++ b/src/main/java/me/lemire/longcompression/RoaringIntPacking.java @@ -0,0 +1,108 @@ +/* + * (c) the authors Licensed under the Apache License, Version 2.0. + */ +package me.lemire.longcompression; + +import java.math.BigInteger; +import java.util.Comparator; + +/** + * Used to hold the logic packing 2 integers in a long, and separating a long in two integers. 
/**
 * Holds the logic for packing two ints into a long and for splitting a long
 * into two ints. It is useful in {@code Roaring64NavigableMap} as the
 * implementation splits the input long in two integers, one used as key of a
 * NavigableMap while the other is added in a Bitmap.
 *
 * @author Benoit Lacelle
 *
 */
// Duplicated from RoaringBitmap
class RoaringIntPacking {

    /** the constant 2^64 */
    private static final BigInteger TWO_64 = BigInteger.ONE.shiftLeft(64);

    /**
     * Extracts the upper half of a long.
     *
     * @param id any long, positive or negative
     * @return an int holding the 32 highest order bits of information of the input long
     */
    public static int high(long id) {
        return (int) (id >> 32);
    }

    /**
     * Extracts the lower half of a long.
     *
     * @param id any long, positive or negative
     * @return an int holding the 32 lowest order bits of information of the input long
     */
    public static int low(long id) {
        return (int) id;
    }

    /**
     * Combines two ints into one long; the inverse of {@link #high(long)} and
     * {@link #low(long)}.
     *
     * @param high an integer representing the highest order bits of the output long
     * @param low an integer representing the lowest order bits of the output long
     * @return a long packing together the integers as computed by
     *         {@link RoaringIntPacking#high(long)} and {@link RoaringIntPacking#low(long)}
     */
    // https://stackoverflow.com/questions/12772939/java-storing-two-ints-in-a-long
    public static long pack(int high, int low) {
        // Mask the low word so that its sign bit is not extended into the high word.
        return (((long) high) << 32) | (low & 0xffffffffL);
    }

    /**
     * Gives the largest possible high word under the chosen interpretation.
     *
     * @param signedLongs true if longs should be considered as signed longs
     * @return {@link Integer#MAX_VALUE} for signed longs, and {@code -1} (all
     *         bits set, the largest unsigned int) otherwise
     */
    public static int highestHigh(boolean signedLongs) {
        if (signedLongs) {
            return Integer.MAX_VALUE;
        } else {
            return -1;
        }
    }

    /**
     * Builds a comparator ordering ints as unsigned values.
     *
     * @return a comparator delegating to {@link #compareUnsigned(int, int)}: a
     *         negative int is considered greater than {@link Integer#MAX_VALUE}
     */
    public static Comparator<Integer> unsignedComparator() {
        return new Comparator<Integer>() {

            @Override
            public int compare(Integer o1, Integer o2) {
                return compareUnsigned(o1, o2);
            }
        };
    }

    /**
     * Compares two {@code int} values numerically treating the values as unsigned.
     *
     * @param x the first {@code int} to compare
     * @param y the second {@code int} to compare
     * @return the value {@code 0} if {@code x == y}; a value less than {@code 0} if {@code x < y} as
     *         unsigned values; and a value greater than {@code 0} if {@code x > y} as unsigned values
     * @since 1.8
     */
    // Duplicated from jdk8 Integer.compareUnsigned
    public static int compareUnsigned(int x, int y) {
        // Adding MIN_VALUE flips the sign bit, mapping unsigned order onto signed order.
        return Integer.compare(x + Integer.MIN_VALUE, y + Integer.MIN_VALUE);
    }

    /**
     * JDK8 Long.toUnsignedString was too complex to backport. Go for a slow version relying on
     * BigInteger.
     *
     * @param l the long to render as an unsigned value
     * @return the decimal representation of {@code l} read as an unsigned 64-bit integer
     */
    // https://stackoverflow.com/questions/7031198/java-signed-long-to-unsigned-long-string
    static String toUnsignedString(long l) {
        BigInteger b = BigInteger.valueOf(l);
        if (b.signum() < 0) {
            b = b.add(TWO_64);
        }
        return b.toString();
    }
}
+ * + * @author Benoit Lacelle + * + */ +public interface SkippableLongCODEC { + /** + * Compress data from an array to another array. + * + * Both inpos and outpos are modified to represent how much data was read + * and written to. If 12 longs (inlength = 12) are compressed to 3 longs, then + * inpos will be incremented by 12 while outpos will be incremented by 3. We + * use IntWrapper to pass the values by reference. + * + * @param in + * input array + * @param inpos + * location in the input array + * @param inlength + * how many longs to compress + * @param out + * output array + * @param outpos + * where to write in the output array + */ + public void headlessCompress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos); + + /** + * Uncompress data from an array to another array. + * + * Both inpos and outpos parameters are modified to indicate new positions + * after read/write. + * + * @param in + * array containing data in compressed form + * @param inpos + * where to start reading in the array + * @param inlength + * length of the compressed data (ignored by some schemes) + * @param out + * array where to write the compressed output + * @param outpos + * where to write the compressed output in out + * @param num + * number of longs we want to decode, the actual number of longs decoded can be less + */ + public void headlessUncompress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos, int num); + +} diff --git a/src/main/java/me/lemire/longcompression/SkippableLongComposition.java b/src/main/java/me/lemire/longcompression/SkippableLongComposition.java new file mode 100644 index 0000000..5568489 --- /dev/null +++ b/src/main/java/me/lemire/longcompression/SkippableLongComposition.java @@ -0,0 +1,70 @@ +/** + * This is code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +package me.lemire.longcompression; + +import me.lemire.integercompression.IntWrapper; + +/** + * Helper class to compose schemes. + * + * @author Benoit Lacelle + */ +public class SkippableLongComposition implements SkippableLongCODEC { + SkippableLongCODEC F1, F2; + + /** + * Compose a scheme from a first one (f1) and a second one (f2). The first + * one is called first and then the second one tries to compress whatever + * remains from the first run. + * + * By convention, the first scheme should be such that if, during decoding, + * a 32-bit zero is first encountered, then there is no output. + * + * @param f1 + * first codec + * @param f2 + * second codec + */ + public SkippableLongComposition(SkippableLongCODEC f1, + SkippableLongCODEC f2) { + F1 = f1; + F2 = f2; + } + + @Override + public void headlessCompress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos) { + int init = inpos.get(); + int outposInit = outpos.get(); + F1.headlessCompress(in, inpos, inlength, out, outpos); + if (outpos.get() == outposInit) { + out[outposInit] = 0; + outpos.increment(); + } + inlength -= inpos.get() - init; + F2.headlessCompress(in, inpos, inlength, out, outpos); + } + + @Override + public void headlessUncompress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos, int num) { + int init = inpos.get(); + F1.headlessUncompress(in, inpos, inlength, out, outpos, num); + if (inpos.get() == init) { + inpos.increment(); + } + inlength -= inpos.get() - init; + num -= outpos.get(); + F2.headlessUncompress(in, inpos, inlength, out, outpos, num); + } + + @Override + public String toString() { + return F1.toString() + "+" + F2.toString(); + } + +} diff --git a/src/main/java/me/lemire/longcompression/differential/LongDelta.java b/src/main/java/me/lemire/longcompression/differential/LongDelta.java new file mode 100644 index 0000000..2b0e077 --- /dev/null +++ 
/**
 * Generic class to compute differential coding.
 *
 * @author Benoit Lacelle
 *
 */
public final class LongDelta {

    /**
     * Apply differential coding (in-place).
     *
     * @param data
     *            data to be modified
     */
    public static void delta(long[] data) {
        // Walk backwards so that each difference uses the untouched predecessor.
        for (int i = data.length - 1; i > 0; --i) {
            data[i] -= data[i - 1];
        }
    }

    /**
     * Apply differential coding (in-place) given an initial value.
     *
     * @param data
     *            data to be modified
     * @param start
     *            starting index
     * @param length
     *            number of longs to process
     * @param init
     *            initial value, subtracted from the first element
     * @return next initial value: the last original element of the range, or
     *         {@code init} when the range is empty
     */
    public static long delta(long[] data, int start, int length, long init) {
        if (length <= 0) {
            // Nothing to encode: leave the array alone and keep the running value.
            return init;
        }
        final long nextinit = data[start + length - 1];
        for (int i = length - 1; i > 0; --i) {
            data[start + i] -= data[start + i - 1];
        }
        data[start] -= init;
        return nextinit;
    }

    /**
     * Compute differential coding given an initial value. Output is written
     * to a provided array: must have length "length" or better.
     *
     * @param data
     *            data to be read (left unmodified)
     * @param start
     *            starting index
     * @param length
     *            number of longs to process
     * @param init
     *            initial value, subtracted from the first element
     * @param out
     *            output array, filled from index 0
     * @return next initial value: the last element of the range, or
     *         {@code init} when the range is empty
     */
    public static long delta(long[] data, int start, int length, long init,
            long[] out) {
        if (length <= 0) {
            return init;
        }
        for (int i = length - 1; i > 0; --i) {
            out[i] = data[start + i] - data[start + i - 1];
        }
        out[0] = data[start] - init;
        return data[start + length - 1];
    }

    /**
     * Undo differential coding (in-place). Effectively computes a prefix
     * sum.
     *
     * @param data
     *            to be modified.
     */
    public static void inverseDelta(long[] data) {
        for (int i = 1; i < data.length; ++i) {
            data[i] += data[i - 1];
        }
    }

    /**
     * Undo differential coding (in-place). Effectively computes a prefix
     * sum. Like inverseDelta, only faster.
     *
     * @param data
     *            to be modified
     */
    public static void fastinverseDelta(long[] data) {
        int sz0 = data.length / 4 * 4;
        int i = 1;
        if (sz0 >= 4) {
            // Unrolled by four, carrying the running sum in a local.
            long a = data[0];
            for (; i < sz0 - 4; i += 4) {
                a = data[i] += a;
                a = data[i + 1] += a;
                a = data[i + 2] += a;
                a = data[i + 3] += a;
            }
        }

        // Remaining elements, one at a time.
        for (; i != data.length; ++i) {
            data[i] += data[i - 1];
        }
    }

    /**
     * Undo differential coding (in-place). Effectively computes a prefix
     * sum. Like inverseDelta, only faster. Uses an initial value.
     *
     * @param data
     *            to be modified
     * @param start
     *            starting index
     * @param length
     *            number of longs to process
     * @param init
     *            initial value, added to the first element
     * @return next initial value: the last decoded element of the range, or
     *         {@code init} when the range is empty
     */
    public static long fastinverseDelta(long[] data, int start, int length,
            long init) {
        if (length <= 0) {
            // Nothing to decode: leave the array alone and keep the running value.
            return init;
        }
        data[start] += init;
        int sz0 = length / 4 * 4;
        int i = 1;
        if (sz0 >= 4) {
            long a = data[start];
            for (; i < sz0 - 4; i += 4) {
                a = data[start + i] += a;
                a = data[start + i + 1] += a;
                a = data[start + i + 2] += a;
                a = data[start + i + 3] += a;
            }
        }

        for (; i != length; ++i) {
            data[start + i] += data[start + i - 1];
        }
        return data[start + length - 1];
    }

}
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import org.junit.Assert; diff --git a/src/test/java/me/lemire/integercompression/BasicTest.java b/src/test/java/me/lemire/integercompression/BasicTest.java index e88293e..b5f292e 100644 --- a/src/test/java/me/lemire/integercompression/BasicTest.java +++ b/src/test/java/me/lemire/integercompression/BasicTest.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import java.util.Arrays; @@ -22,7 +29,7 @@ */ @SuppressWarnings({ "static-method" }) public class BasicTest { - IntegerCODEC[] codecs = { + final IntegerCODEC[] codecs = { new IntegratedComposition(new IntegratedBinaryPacking(), new IntegratedVariableByte()), new JustCopy(), @@ -44,7 +51,7 @@ public class BasicTest { new DeltaZigzagVariableByte()) }; /** - * + * This tests with a compressed array with various offset */ @Test public void saulTest() { diff --git a/src/test/java/me/lemire/integercompression/BoundaryTest.java b/src/test/java/me/lemire/integercompression/BoundaryTest.java index ede2e9f..128b431 100644 --- a/src/test/java/me/lemire/integercompression/BoundaryTest.java +++ b/src/test/java/me/lemire/integercompression/BoundaryTest.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import java.util.Arrays; diff --git a/src/test/java/me/lemire/integercompression/ByteBasicTest.java b/src/test/java/me/lemire/integercompression/ByteBasicTest.java index c2f5b6f..93112c3 100644 --- a/src/test/java/me/lemire/integercompression/ByteBasicTest.java +++ b/src/test/java/me/lemire/integercompression/ByteBasicTest.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import java.util.Arrays; diff --git a/src/test/java/me/lemire/integercompression/DeltaZigzagEncodingTest.java b/src/test/java/me/lemire/integercompression/DeltaZigzagEncodingTest.java index 5e0923d..ae42c1d 100644 --- a/src/test/java/me/lemire/integercompression/DeltaZigzagEncodingTest.java +++ b/src/test/java/me/lemire/integercompression/DeltaZigzagEncodingTest.java @@ -1,7 +1,10 @@ -/* +/** * This code is released under the * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ */ + package me.lemire.integercompression; import org.junit.Test; diff --git a/src/test/java/me/lemire/integercompression/ExampleTest.java b/src/test/java/me/lemire/integercompression/ExampleTest.java index 300983c..f6038b8 100644 --- a/src/test/java/me/lemire/integercompression/ExampleTest.java +++ b/src/test/java/me/lemire/integercompression/ExampleTest.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import me.lemire.integercompression.differential.*; diff --git a/src/test/java/me/lemire/integercompression/IntCompressorTest.java b/src/test/java/me/lemire/integercompression/IntCompressorTest.java index 34b8946..79e51fc 100644 --- a/src/test/java/me/lemire/integercompression/IntCompressorTest.java +++ b/src/test/java/me/lemire/integercompression/IntCompressorTest.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import java.util.Arrays; diff --git a/src/test/java/me/lemire/integercompression/ResourcedTest.java b/src/test/java/me/lemire/integercompression/ResourcedTest.java index 61b8e58..34f1d05 100644 --- a/src/test/java/me/lemire/integercompression/ResourcedTest.java +++ b/src/test/java/me/lemire/integercompression/ResourcedTest.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import java.util.ArrayList; diff --git a/src/test/java/me/lemire/integercompression/SkippableBasicTest.java b/src/test/java/me/lemire/integercompression/SkippableBasicTest.java index d965992..93c1784 100644 --- a/src/test/java/me/lemire/integercompression/SkippableBasicTest.java +++ b/src/test/java/me/lemire/integercompression/SkippableBasicTest.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import java.util.Arrays; @@ -12,7 +19,7 @@ */ @SuppressWarnings({ "static-method" }) public class SkippableBasicTest { - SkippableIntegerCODEC[] codecs = { + final SkippableIntegerCODEC[] codecs = { new JustCopy(), new VariableByte(), new SkippableComposition(new BinaryPacking(), new VariableByte()), diff --git a/src/test/java/me/lemire/integercompression/TestUtils.java b/src/test/java/me/lemire/integercompression/TestUtils.java index a0820ab..7ce51b3 100644 --- a/src/test/java/me/lemire/integercompression/TestUtils.java +++ b/src/test/java/me/lemire/integercompression/TestUtils.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import java.util.Arrays; @@ -123,7 +130,7 @@ public static void assertSymmetry(IntegerCODEC codec, int... orig) { assertArrayEquals(orig, target); } - protected static int[] compress(IntegerCODEC codec, int[] data) { + public static int[] compress(IntegerCODEC codec, int[] data) { int[] outBuf = new int[data.length * 4]; IntWrapper inPos = new IntWrapper(); IntWrapper outPos = new IntWrapper(); diff --git a/src/test/java/me/lemire/integercompression/XorBinaryPackingTest.java b/src/test/java/me/lemire/integercompression/XorBinaryPackingTest.java index 3201b02..650eb4b 100644 --- a/src/test/java/me/lemire/integercompression/XorBinaryPackingTest.java +++ b/src/test/java/me/lemire/integercompression/XorBinaryPackingTest.java @@ -1,7 +1,10 @@ /** * This code is released under the * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ */ + package me.lemire.integercompression; import java.util.Arrays; diff --git a/src/test/java/me/lemire/longcompression/LongBasicTest.java b/src/test/java/me/lemire/longcompression/LongBasicTest.java new file mode 100644 index 0000000..5aa3551 --- /dev/null +++ b/src/test/java/me/lemire/longcompression/LongBasicTest.java @@ -0,0 +1,396 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +package me.lemire.longcompression; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; + +import java.util.Arrays; + +import org.junit.Test; + +import me.lemire.integercompression.BinaryPacking; +import me.lemire.integercompression.Composition; +import me.lemire.integercompression.FastPFOR; +import me.lemire.integercompression.FastPFOR128; +import me.lemire.integercompression.IntWrapper; +import me.lemire.integercompression.JustCopy; +import me.lemire.integercompression.NewPFD; +import me.lemire.integercompression.NewPFDS16; +import me.lemire.integercompression.NewPFDS9; +import me.lemire.integercompression.OptPFD; +import me.lemire.integercompression.OptPFDS16; +import me.lemire.integercompression.OptPFDS9; +import me.lemire.integercompression.Simple9; +import me.lemire.integercompression.VariableByte; +import me.lemire.integercompression.differential.Delta; +import me.lemire.integercompression.differential.IntegratedBinaryPacking; +import me.lemire.integercompression.differential.IntegratedComposition; +import me.lemire.integercompression.differential.IntegratedVariableByte; +import me.lemire.longcompression.differential.LongDelta; +import me.lemire.longcompression.synth.LongClusteredDataGenerator; + +/** + * Just some basic sanity tests. 
+ * + * @author Benoit Lacelle + */ +@SuppressWarnings({ "static-method" }) +public class LongBasicTest { + final LongCODEC[] codecs = { + new LongJustCopy(), + new LongVariableByte(), + new LongAs2IntsCodec()}; + + /** + * This tests with a compressed array with various offset + */ + @Test + public void saulTest() { + for (LongCODEC C : codecs) { + for (int x = 0; x < 50; ++x) { + long[] a = { 2, 3, 4, 5 }; + long[] b = new long[90]; + long[] c = new long[a.length]; + + IntWrapper aOffset = new IntWrapper(0); + IntWrapper bOffset = new IntWrapper(x); + C.compress(a, aOffset, a.length, b, bOffset); + int len = bOffset.get() - x; + + bOffset.set(x); + IntWrapper cOffset = new IntWrapper(0); + C.uncompress(b, bOffset, len, c, cOffset); + if(!Arrays.equals(a, c)) { + System.out.println("Problem with "+C); + } + assertArrayEquals(a, c); + + } + } + } + /** + * + */ + @Test + public void varyingLengthTest() { + int N = 4096; + long[] data = new long[N]; + for (int k = 0; k < N; ++k) + data[k] = k; + for (LongCODEC c : codecs) { + System.out.println("[BasicTest.varyingLengthTest] codec = " + c); + for (int L = 1; L <= 128; L++) { + long[] comp = LongTestUtils.compress(c, Arrays.copyOf(data, L)); + long[] answer = LongTestUtils.uncompress(c, comp, L); + for (int k = 0; k < L; ++k) + if (answer[k] != data[k]) + throw new RuntimeException("bug"); + } + for (int L = 128; L <= N; L *= 2) { + long[] comp = LongTestUtils.compress(c, Arrays.copyOf(data, L)); + long[] answer = LongTestUtils.uncompress(c, comp, L); + for (int k = 0; k < L; ++k) + if (answer[k] != data[k]) { + System.out.println(Arrays.toString(Arrays.copyOf( + answer, L))); + System.out.println(Arrays.toString(Arrays.copyOf(data, + L))); + throw new RuntimeException("bug"); + } + } + + } + } + + /** + * + */ + @Test + public void varyingLengthTest2() { + int N = 128; + long[] data = new long[N]; + data[127] = -1; + for (LongCODEC c : codecs) { + System.out.println("[BasicTest.varyingLengthTest2] codec = " + c); + 
try { + // CODEC Simple9 is limited to "small" integers. + if (c.getClass().equals( + Class.forName("me.lemire.integercompression.Simple9"))) + continue; + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } + try { + // CODEC Simple16 is limited to "small" integers. + if (c.getClass().equals( + Class.forName("me.lemire.integercompression.Simple16"))) + continue; + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } + try { + // CODEC GroupSimple9 is limited to "small" integers. + if (c.getClass().equals( + Class.forName("me.lemire.integercompression.GroupSimple9"))) + continue; + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } + + for (int L = 1; L <= 128; L++) { + long[] comp = LongTestUtils.compress(c, Arrays.copyOf(data, L)); + long[] answer = LongTestUtils.uncompress(c, comp, L); + for (int k = 0; k < L; ++k) + if (answer[k] != data[k]) + throw new RuntimeException("bug"); + } + for (int L = 128; L <= N; L *= 2) { + long[] comp = LongTestUtils.compress(c, Arrays.copyOf(data, L)); + long[] answer = LongTestUtils.uncompress(c, comp, L); + for (int k = 0; k < L; ++k) + if (answer[k] != data[k]) + throw new RuntimeException("bug"); + } + + } + } + + /** + * + */ + @Test + public void checkVariousCases() { + for (LongCODEC c : codecs) { + testZeroInZeroOut(c); + test(c, c, 5, 10); + test(c, c, 5, 14); + test(c, c, 2, 18); + // TODO Unclear which codec should manage an empty output array or not + // Some IntegerCodec does not output anything if the input is smaller than some block size + // testSpurious(c); + testUnsorted(c); + testUnsorted2(c); + testUnsorted3(c); + } + } + + /** + * check that the codecs can be inverted. 
+ */ + @Test + public void basictest() { + for (LongCODEC codec : codecs) { + test(codec, 5, 10); + test(codec, 5, 14); + test(codec, 2, 18); + } + } + + private static void testSpurious(LongCODEC c) { + long[] x = new long[1024]; + long[] y = new long[0]; + IntWrapper i0 = new IntWrapper(0); + IntWrapper i1 = new IntWrapper(0); + for (int inlength = 0; inlength < 32; ++inlength) { + c.compress(x, i0, inlength, y, i1); + assertEquals(0, i1.intValue()); + } + } + + private static void testZeroInZeroOut(LongCODEC c) { + long[] x = new long[0]; + long[] y = new long[0]; + IntWrapper i0 = new IntWrapper(0); + IntWrapper i1 = new IntWrapper(0); + c.compress(x, i0, 0, y, i1); + assertEquals(0, i1.intValue()); + + long[] out = new long[0]; + IntWrapper outpos = new IntWrapper(0); + c.uncompress(y, i1, 0, out, outpos); + assertEquals(0, outpos.intValue()); + } + + private static void test(LongCODEC c, LongCODEC co, int N, int nbr) { + LongClusteredDataGenerator cdg = new LongClusteredDataGenerator(); + for (int sparsity = 1; sparsity < 31 - nbr; sparsity += 4) { + long[][] data = new long[N][]; + int max = (1 << (nbr + sparsity)); + for (int k = 0; k < N; ++k) { + data[k] = cdg.generateClustered((1 << nbr), max); + } + testCodec(c, co, data, max); + } + } + + private static void test(LongCODEC codec, int N, int nbr) { + LongClusteredDataGenerator cdg = new LongClusteredDataGenerator(); + System.out.println("[BasicTest.test] N = " + N + " " + nbr); + for (int sparsity = 1; sparsity < 63 - nbr; sparsity += 4) { + long[][] data = new long[N][]; + long max = (1L << (nbr + sparsity)); + for (int k = 0; k < N; ++k) { + data[k] = cdg.generateClustered((1 << nbr), max); + } + + testCodec(codec, codec, data, max); + } + } + + private static void testCodec(LongCODEC c, LongCODEC co, + long[][] data, long max) { + int N = data.length; + int maxlength = 0; + for (int k = 0; k < N; ++k) { + if (data[k].length > maxlength) + maxlength = data[k].length; + } + long[] buffer = new 
long[maxlength + 1024]; + long[] dataout = new long[4 * maxlength + 1024]; + // 4x + 1024 to account for the possibility of some negative + // compression. + IntWrapper inpos = new IntWrapper(); + IntWrapper outpos = new IntWrapper(); + for (int k = 0; k < N; ++k) { + long[] backupdata = Arrays.copyOf(data[k], data[k].length); + + inpos.set(1); + outpos.set(0); + if (!(c instanceof IntegratedLongCODEC)) { + LongDelta.delta(backupdata); + } + c.compress(backupdata, inpos, backupdata.length - inpos.get(), + dataout, outpos); + final int thiscompsize = outpos.get() + 1; + inpos.set(0); + outpos.set(1); + buffer[0] = backupdata[0]; + co.uncompress(dataout, inpos, thiscompsize - 1, buffer, outpos); + if (!(c instanceof IntegratedLongCODEC)) + LongDelta.fastinverseDelta(buffer); + + // Check assertions. + assertEquals("length is not match", outpos.get(), data[k].length); + long[] bufferCutout = Arrays.copyOf(buffer, outpos.get()); + assertArrayEquals("failed to reconstruct original data", data[k], + bufferCutout); + } + } + + /** + * @param codec + * provided codec + */ + public void testUnsorted(LongCODEC codec) { + int[] lengths = { 133, 1026, 1333333 }; + for (int N : lengths) { + long[] data = new long[N]; + // initialize the data (most will be small) + for (int k = 0; k < N; k += 1) + data[k] = 3; + // throw some larger values + for (int k = 0; k < N; k += 5) + data[k] = 100; + for (int k = 0; k < N; k += 533) + data[k] = 10000; + data[5] = -311; + // could need more compressing + long[] compressed = new long[(int) Math.ceil(N * 1.01) + 1024]; + IntWrapper inputoffset = new IntWrapper(0); + IntWrapper outputoffset = new IntWrapper(0); + codec.compress(data, inputoffset, data.length, compressed, + outputoffset); + // we can repack the data: (optional) + compressed = Arrays.copyOf(compressed, outputoffset.intValue()); + + long[] recovered = new long[N]; + IntWrapper recoffset = new IntWrapper(0); + codec.uncompress(compressed, new IntWrapper(0), compressed.length, + 
recovered, recoffset); + assertArrayEquals(data, recovered); + } + } + + private void testUnsorted2(LongCODEC codec) { + long[] data = new long[128]; + data[5] = -1; + long[] compressed = new long[1024]; + IntWrapper inputoffset = new IntWrapper(0); + IntWrapper outputoffset = new IntWrapper(0); + codec.compress(data, inputoffset, data.length, compressed, outputoffset); + // we can repack the data: (optional) + compressed = Arrays.copyOf(compressed, outputoffset.intValue()); + + long[] recovered = new long[128]; + IntWrapper recoffset = new IntWrapper(0); + codec.uncompress(compressed, new IntWrapper(0), compressed.length, + recovered, recoffset); + assertArrayEquals(data, recovered); + } + + private void testUnsorted3(LongCODEC codec) { + long[] data = new long[128]; + data[127] = -1; + long[] compressed = new long[1024]; + IntWrapper inputoffset = new IntWrapper(0); + IntWrapper outputoffset = new IntWrapper(0); + codec.compress(data, inputoffset, data.length, compressed, outputoffset); + // we can repack the data: (optional) + compressed = Arrays.copyOf(compressed, outputoffset.intValue()); + + long[] recovered = new long[128]; + IntWrapper recoffset = new IntWrapper(0); + codec.uncompress(compressed, new IntWrapper(0), compressed.length, + recovered, recoffset); + assertArrayEquals(data, recovered); + } + + /** + * + */ + @Test + public void fastPforTest() { + // proposed by Stefan Ackermann (https://github.com/Stivo) + for (LongCODEC codec : codecs) { + int N = FastPFOR.BLOCK_SIZE; + long[] data = new long[N]; + for (int i = 0; i < N; i++) + data[i] = 0; + data[126] = -1; + long[] comp = LongTestUtils.compress(codec, Arrays.copyOf(data, N)); + long[] answer = LongTestUtils.uncompress(codec, comp, N); + for (int k = 0; k < N; ++k) + if (answer[k] != data[k]) + throw new RuntimeException("bug " + k + " " + answer[k] + + " != " + data[k]); + } + } + + /** + * + */ + @Test + public void fastPfor128Test() { + // proposed by Stefan Ackermann 
(https://github.com/Stivo) + for (LongCODEC codec : codecs) { + int N = FastPFOR128.BLOCK_SIZE; + long[] data = new long[N]; + for (int i = 0; i < N; i++) + data[i] = 0; + data[126] = -1; + long[] comp = LongTestUtils.compress(codec, Arrays.copyOf(data, N)); + long[] answer = LongTestUtils.uncompress(codec, comp, N); + for (int k = 0; k < N; ++k) + if (answer[k] != data[k]) + throw new RuntimeException("bug " + k + " " + answer[k] + + " != " + data[k]); + } + } + +} diff --git a/src/test/java/me/lemire/longcompression/LongTestUtils.java b/src/test/java/me/lemire/longcompression/LongTestUtils.java new file mode 100644 index 0000000..a44e665 --- /dev/null +++ b/src/test/java/me/lemire/longcompression/LongTestUtils.java @@ -0,0 +1,133 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +package me.lemire.longcompression; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertTrue; + +import java.util.Arrays; + +import me.lemire.integercompression.IntWrapper; + +/** + * Static utility methods for test. + */ +public class LongTestUtils { + + protected static void dumpIntArray(long[] data, String label) { + System.out.print(label); + for (int i = 0; i < data.length; ++i) { + if (i % 6 == 0) { + System.out.println(); + } + System.out.format(" %1$11d", data[i]); + } + System.out.println(); + } + + protected static void dumpIntArrayAsHex(long[] data, String label) { + System.out.print(label); + for (int i = 0; i < data.length; ++i) { + if (i % 8 == 0) { + System.out.println(); + } + System.out.format(" %1$08X", data[i]); + } + System.out.println(); + } + + /** + * Check that compress and uncompress keep original array. + * + * @param codec CODEC to test. + * @param orig original integers + */ + public static void assertSymmetry(LongCODEC codec, long... 
orig) { + // There are some cases that compressed array is bigger than original + // array. So output array for compress must be larger. + // + // Example: + // - VariableByte compresses an array like [ -1 ]. + // - Composition compresses a short array. + final int EXTEND = 1; + + long[] compressed = new long[orig.length + EXTEND]; + IntWrapper c_inpos = new IntWrapper(0); + IntWrapper c_outpos = new IntWrapper(0); + codec.compress(orig, c_inpos, orig.length, compressed, + c_outpos); + + assertTrue(c_outpos.get() <= orig.length + EXTEND); + + // Uncompress an array. + long[] uncompressed = new long[orig.length]; + IntWrapper u_inpos = new IntWrapper(0); + IntWrapper u_outpos = new IntWrapper(0); + codec.uncompress(compressed, u_inpos, c_outpos.get(), + uncompressed, u_outpos); + + // Compare between uncompressed and orig arrays. + long[] target = Arrays.copyOf(uncompressed, u_outpos.get()); + assertArrayEquals(orig, target); + } + + protected static long[] compress(LongCODEC codec, long[] data) { + long[] outBuf = new long[data.length * 8]; + IntWrapper inPos = new IntWrapper(); + IntWrapper outPos = new IntWrapper(); + codec.compress(data, inPos, data.length, outBuf, outPos); + return Arrays.copyOf(outBuf, outPos.get()); + } + + protected static long[] uncompress(LongCODEC codec, long[] data, int len) { + long[] outBuf = new long[len + 1024]; + IntWrapper inPos = new IntWrapper(); + IntWrapper outPos = new IntWrapper(); + codec.uncompress(data, inPos, data.length, outBuf, outPos); + return Arrays.copyOf(outBuf, outPos.get()); + } + + + + protected static byte[] compress(ByteLongCODEC codec, long[] data) { + byte[] outBuf = new byte[data.length * 4 * 4]; + IntWrapper inPos = new IntWrapper(); + IntWrapper outPos = new IntWrapper(); + codec.compress(data, inPos, data.length, outBuf, outPos); + return Arrays.copyOf(outBuf, outPos.get()); + } + + protected static long[] uncompress(ByteLongCODEC codec, byte[] data, int len) { + long[] outBuf = new long[len + 1024]; + 
IntWrapper inPos = new IntWrapper(); + IntWrapper outPos = new IntWrapper(); + codec.uncompress(data, inPos, data.length, outBuf, outPos); + return Arrays.copyOf(outBuf, outPos.get()); + } + + protected static long[] compressHeadless(SkippableLongCODEC codec, long[] data) { + long[] outBuf = new long[data.length * 4]; + IntWrapper inPos = new IntWrapper(); + IntWrapper outPos = new IntWrapper(); + codec.headlessCompress(data, inPos, data.length, outBuf, outPos); + return Arrays.copyOf(outBuf, outPos.get()); + } + + protected static long[] uncompressHeadless(SkippableLongCODEC codec, long[] data, int len) { + long[] outBuf = new long[len + 1024]; + IntWrapper inPos = new IntWrapper(); + IntWrapper outPos = new IntWrapper(); + codec.headlessUncompress(data, inPos, data.length, outBuf, outPos,len); + if(outPos.get() < len) throw new RuntimeException("Insufficient output."); + return Arrays.copyOf(outBuf, outPos.get()); + } + + public static String longToBinaryWithLeading(long l) { + return String.format("%64s", Long.toBinaryString(l)).replace(' ', '0'); + } +} diff --git a/src/test/java/me/lemire/longcompression/SkippableLongBasicTest.java b/src/test/java/me/lemire/longcompression/SkippableLongBasicTest.java new file mode 100644 index 0000000..e900c9c --- /dev/null +++ b/src/test/java/me/lemire/longcompression/SkippableLongBasicTest.java @@ -0,0 +1,145 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +package me.lemire.longcompression; + +import java.util.Arrays; + +import org.junit.Test; + +import me.lemire.integercompression.IntWrapper; +import me.lemire.integercompression.TestUtils; +import me.lemire.integercompression.VariableByte; + + +/** + * Just some basic sanity tests. 
+ * + * @author Benoit Lacelle + */ +@SuppressWarnings({ "static-method" }) +public class SkippableLongBasicTest { + final SkippableLongCODEC[] codecs = { + new LongJustCopy(), + new LongVariableByte(), }; + + + /** + * + */ + @Test + public void consistentTest() { + int N = 4096; + long[] data = new long[N]; + long[] rev = new long[N]; + for (int k = 0; k < N; ++k) + data[k] = k % 128; + for (SkippableLongCODEC c : codecs) { + System.out.println("[SkippeableBasicTest.consistentTest] codec = " + + c); + long[] outBuf = new long[N + 1024]; + for (int n = 0; n <= N; ++n) { + IntWrapper inPos = new IntWrapper(); + IntWrapper outPos = new IntWrapper(); + c.headlessCompress(data, inPos, n, outBuf, outPos); + + IntWrapper inPoso = new IntWrapper(); + IntWrapper outPoso = new IntWrapper(); + c.headlessUncompress(outBuf, inPoso, outPos.get(), rev, + outPoso, n); + if (outPoso.get() != n) { + throw new RuntimeException("bug "+n); + } + if (inPoso.get() != outPos.get()) { + throw new RuntimeException("bug "+n+" "+inPoso.get()+" "+outPos.get()); + } + for (int j = 0; j < n; ++j) + if (data[j] != rev[j]) { + throw new RuntimeException("bug"); + } + } + } + } + + + /** + * + */ + @Test + public void varyingLengthTest() { + int N = 4096; + long[] data = new long[N]; + for (int k = 0; k < N; ++k) + data[k] = k; + for (SkippableLongCODEC c : codecs) { + System.out.println("[SkippeableBasicTest.varyingLengthTest] codec = "+c); + for (int L = 1; L <= 128; L++) { + long[] comp = LongTestUtils.compressHeadless(c, Arrays.copyOf(data, L)); + long[] answer = LongTestUtils.uncompressHeadless(c, comp, L); + for (int k = 0; k < L; ++k) + if (answer[k] != data[k]) + throw new RuntimeException("bug "+c.toString()+" "+k+" "+answer[k]+" "+data[k]); + } + for (int L = 128; L <= N; L *= 2) { + long[] comp = LongTestUtils.compressHeadless(c, Arrays.copyOf(data, L)); + long[] answer = LongTestUtils.uncompressHeadless(c, comp, L); + for (int k = 0; k < L; ++k) + if (answer[k] != data[k]) + throw new 
RuntimeException("bug"); + } + + } + } + + /** + * + */ + @Test + public void varyingLengthTest2() { + int N = 128; + long[] data = new long[N]; + data[127] = -1; + for (SkippableLongCODEC c : codecs) { + System.out.println("[SkippeableBasicTest.varyingLengthTest2] codec = "+c); + + try { + // CODEC Simple9 is limited to "small" integers. + if (c.getClass().equals( + Class.forName("me.lemire.integercompression.Simple9"))) + continue; + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } + try { + // CODEC Simple16 is limited to "small" integers. + if (c.getClass().equals( + Class.forName("me.lemire.integercompression.Simple16"))) + continue; + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } + for (int L = 1; L <= 128; L++) { + long[] comp = LongTestUtils.compressHeadless(c, Arrays.copyOf(data, L)); + long[] answer = LongTestUtils.uncompressHeadless(c, comp, L); + for (int k = 0; k < L; ++k) + if (answer[k] != data[k]) { + throw new RuntimeException("L=" + L + ": bug at k = "+k+" "+answer[k]+" "+data[k]+" for "+c.toString()); + } + } + for (int L = 128; L <= N; L *= 2) { + long[] comp = LongTestUtils.compressHeadless(c, Arrays.copyOf(data, L)); + long[] answer = LongTestUtils.uncompressHeadless(c, comp, L); + for (int k = 0; k < L; ++k) + if (answer[k] != data[k]) + throw new RuntimeException("bug"); + } + + } + } + + +} diff --git a/src/test/java/me/lemire/longcompression/TestLongAs2IntsCodec.java b/src/test/java/me/lemire/longcompression/TestLongAs2IntsCodec.java new file mode 100644 index 0000000..00bb52a --- /dev/null +++ b/src/test/java/me/lemire/longcompression/TestLongAs2IntsCodec.java @@ -0,0 +1,106 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +package me.lemire.longcompression; + +import java.util.stream.LongStream; + +import org.junit.Assert; +import org.junit.Test; + +/** + * Edge-cases having caused issue specifically with LongVariableByte. + * + * @author Benoit Lacelle + */ +public class TestLongAs2IntsCodec { + final LongAs2IntsCodec codec = new LongAs2IntsCodec(); + + private void checkConsistency(LongCODEC codec, long[] array) { + { + long[] compressed = LongTestUtils.compress(codec, array); + long[] uncompressed = LongTestUtils.uncompress(codec, compressed, array.length); + + Assert.assertArrayEquals(array, uncompressed); + } + + if (codec instanceof ByteLongCODEC) { + byte[] compressed = LongTestUtils.compress((ByteLongCODEC) codec, array); + long[] uncompressed = LongTestUtils.uncompress((ByteLongCODEC) codec, compressed, array.length); + + Assert.assertArrayEquals(array, uncompressed); + } + + if (codec instanceof SkippableLongCODEC) { + long[] compressed = LongTestUtils.compressHeadless((SkippableLongCODEC) codec, array); + long[] uncompressed = + LongTestUtils.uncompressHeadless((SkippableLongCODEC) codec, compressed, array.length); + + Assert.assertArrayEquals(array, uncompressed); + } + } + + @Test + public void testCodec_Zero() { + checkConsistency(codec, new long[] { 0 }); + } + + @Test + public void testCodec_Minus1() { + checkConsistency(codec, new long[] { -1 }); + } + + @Test + public void testCodec_ZeroTimes8Minus1() { + checkConsistency(codec, new long[] { 0, 0, 0, 0, 0, 0, 0, 0, -1 }); + } + + @Test + public void testCodec_ZeroTimes127Minus1() { + long[] array = LongStream.concat(LongStream.range(0, 127).map(l -> 0), LongStream.of(-1)).toArray(); + + checkConsistency(codec, array); + } + + @Test + public void testCodec_ZeroTimes128Minus1() { + long[] array = LongStream.concat(LongStream.range(0, 128).map(l -> 0), LongStream.of(-1)).toArray(); + + checkConsistency(codec, array); + } + + @Test + public void 
testCodec_MinValue() { + checkConsistency(codec, new long[] { Long.MIN_VALUE }); + } + + @Test + public void testCodec_ZeroMinValue() { + checkConsistency(codec, new long[] { 0, Long.MIN_VALUE }); + } + + @Test + public void testCodec_allPowerOfTwo() { + checkConsistency(codec, new long[] { 1L << 42 }); + for (int i = 0; i < 64; i++) { + checkConsistency(codec, new long[] { 1L << i }); + } + } + + @Test + public void testCodec_ZeroThenAllPowerOfTwo() { + for (int i = 0; i < 64; i++) { + checkConsistency(codec, new long[] { 0, 1L << i }); + } + } + + @Test + public void testCodec_intermediateHighPowerOfTwo() { + Assert.assertEquals(3, LongTestUtils.compress((LongCODEC) codec, new long[] { 1L << 42 }).length); + } + +} diff --git a/src/test/java/me/lemire/longcompression/TestLongVariableByte.java b/src/test/java/me/lemire/longcompression/TestLongVariableByte.java new file mode 100644 index 0000000..15613f2 --- /dev/null +++ b/src/test/java/me/lemire/longcompression/TestLongVariableByte.java @@ -0,0 +1,103 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +package me.lemire.longcompression; + +import java.util.stream.LongStream; + +import org.junit.Assert; +import org.junit.Test; + +/** + * Edge-cases having caused issue specifically with LongVariableByte. 
+ * + * @author Benoit Lacelle + */ +public class TestLongVariableByte { + final LongVariableByte codec = new LongVariableByte(); + + private void checkConsistency(LongCODEC codec, long[] array) { + { + long[] compressed = LongTestUtils.compress(codec, array); + long[] uncompressed = LongTestUtils.uncompress(codec, compressed, array.length); + + Assert.assertArrayEquals(array, uncompressed); + } + + if (codec instanceof ByteLongCODEC) { + byte[] compressed = LongTestUtils.compress((ByteLongCODEC) codec, array); + long[] uncompressed = LongTestUtils.uncompress((ByteLongCODEC) codec, compressed, array.length); + + Assert.assertArrayEquals(array, uncompressed); + } + + if (codec instanceof SkippableLongCODEC) { + long[] compressed = LongTestUtils.compressHeadless((SkippableLongCODEC) codec, array); + long[] uncompressed = + LongTestUtils.uncompressHeadless((SkippableLongCODEC) codec, compressed, array.length); + + Assert.assertArrayEquals(array, uncompressed); + } + } + + @Test + public void testCodec_ZeroMinus1() { + checkConsistency(codec, new long[] { -1 }); + } + + @Test + public void testCodec_ZeroTimes8Minus1() { + checkConsistency(codec, new long[] { 0, 0, 0, 0, 0, 0, 0, 0, -1 }); + } + + @Test + public void testCodec_ZeroTimes127Minus1() { + long[] array = LongStream.concat(LongStream.range(0, 127).map(l -> 0), LongStream.of(-1)).toArray(); + + checkConsistency(codec, array); + } + + @Test + public void testCodec_ZeroTimes128Minus1() { + long[] array = LongStream.concat(LongStream.range(0, 128).map(l -> 0), LongStream.of(-1)).toArray(); + + checkConsistency(codec, array); + } + + @Test + public void testCodec_MinValue() { + checkConsistency(codec, new long[] { Long.MIN_VALUE }); + } + + @Test + public void testCodec_ZeroMinValue() { + checkConsistency(codec, new long[] { 0, Long.MIN_VALUE }); + } + + @Test + public void testCodec_allPowerOfTwo() { + checkConsistency(codec, new long[] { 1L << 42 }); + for (int i = 0; i < 64; i++) { + checkConsistency(codec, new 
long[] { 1L << i }); + } + } + + @Test + public void testCodec_ZeroThenAllPowerOfTwo() { + for (int i = 0; i < 64; i++) { + checkConsistency(codec, new long[] { 0, 1L << i }); + } + } + + @Test + public void testCodec_intermediateHighPowerOfTwo() { + Assert.assertEquals(1, LongTestUtils.compress((LongCODEC) codec, new long[] { 1L << 42 }).length); + Assert.assertEquals(7, LongTestUtils.compress((ByteLongCODEC) codec, new long[] { 1L << 42 }).length); + Assert.assertEquals(1, LongTestUtils.compressHeadless((SkippableLongCODEC) codec, new long[] { 1L << 42 }).length); + } + +} diff --git a/src/test/java/me/lemire/longcompression/synth/LongClusteredDataGenerator.java b/src/test/java/me/lemire/longcompression/synth/LongClusteredDataGenerator.java new file mode 100644 index 0000000..5b90ee0 --- /dev/null +++ b/src/test/java/me/lemire/longcompression/synth/LongClusteredDataGenerator.java @@ -0,0 +1,91 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +package me.lemire.longcompression.synth; + +import me.lemire.integercompression.synth.ClusteredDataGenerator; + +/** + * This class will generate lists of random longs based on the clustered + * model: + * + * Reference: Vo Ngoc Anh and Alistair Moffat. 2010. Index compression using + * 64-bit words. Softw. Pract. Exper.40, 2 (February 2010), 131-147. + * + * @author Benoit Lacelle + * @see ClusteredDataGenerator + */ +public class LongClusteredDataGenerator { + + final LongUniformDataGenerator unidg = new LongUniformDataGenerator(); + + /** + * Creating random array generator. 
+ */ + public LongClusteredDataGenerator() { + } + + void fillUniform(long[] array, int offset, int length, long Min, long Max) { + long[] v = this.unidg.generateUniform(length, Max - Min); + for (int k = 0; k < v.length; ++k) + array[k + offset] = Min + v[k]; + } + + void fillClustered(long[] array, int offset, int length, long Min, long Max) { + final long range = Max - Min; + if ((range == length) || (length <= 10)) { + fillUniform(array, offset, length, Min, Max); + return; + } + final long cut = length + / 2 + + ((range - length - 1 > 0) ? (long)this.unidg.rand + .nextDouble() * (range - length - 1) : 0); + final double p = this.unidg.rand.nextDouble(); + if (p < 0.25) { + fillUniform(array, offset, length / 2, Min, Min + cut); + fillClustered(array, offset + length / 2, length + - length / 2, Min + cut, Max); + } else if (p < 0.5) { + fillClustered(array, offset, length / 2, Min, Min + cut); + fillUniform(array, offset + length / 2, length - length + / 2, Min + cut, Max); + } else { + fillClustered(array, offset, length / 2, Min, Min + cut); + fillClustered(array, offset + length / 2, length + - length / 2, Min + cut, Max); + } + } + + /** + * generates randomly N distinct integers from 0 to Max. + * + * @param N + * number of integers to generate + * @param Max + * maximal value of the integers + * @return array containing the integers + */ + public long[] generateClustered(int N, long Max) { + long[] array = new long[N]; + fillClustered(array, 0, N, 0, Max); + return array; + } + + /** + * Little test program. 
+ * + * @param args + * arguments are ignored + */ + public static void main(final String[] args) { + long[] example = (new LongClusteredDataGenerator()) + .generateClustered(20, 1000); + for (int k = 0; k < example.length; ++k) + System.out.println(example[k]); + } + +} diff --git a/src/test/java/me/lemire/longcompression/synth/LongUniformDataGenerator.java b/src/test/java/me/lemire/longcompression/synth/LongUniformDataGenerator.java new file mode 100644 index 0000000..4d435f2 --- /dev/null +++ b/src/test/java/me/lemire/longcompression/synth/LongUniformDataGenerator.java @@ -0,0 +1,125 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +package me.lemire.longcompression.synth; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Random; +import java.util.Set; + +import org.roaringbitmap.longlong.Roaring64Bitmap; + +import me.lemire.integercompression.synth.UniformDataGenerator; + +/** + * This class will generate "uniform" lists of random longs. + * + * @author Benoit Lacelle + * @see UniformDataGenerator + */ +public class LongUniformDataGenerator { + /** + * construct generator of random arrays. + */ + public LongUniformDataGenerator() { + this.rand = new Random(); + } + + /** + * @param seed + * random seed + */ + public LongUniformDataGenerator(final int seed) { + this.rand = new Random(seed); + } + + /** + * generates randomly N distinct longs from 0 to Max. 
+ */ + long[] generateUniformHash(int N, long Max) { + if (N > Max) + throw new RuntimeException("not possible"); + long[] ans = new long[N]; + Set s = new HashSet<>(); + while (s.size() < N) + s.add((long) (this.rand.nextDouble() * Max)); + Iterator i = s.iterator(); + for (int k = 0; k < N; ++k) + ans[k] = i.next().longValue(); + Arrays.sort(ans); + return ans; + } + + /** + * output all longs from the range [0,Max) that are not in the array + */ + static long[] negate(long[] x, long Max) { + int newLength = saturatedCast(Max - x.length); + long[] ans = new long[newLength]; + int i = 0; + int c = 0; + for (int j = 0; j < x.length; ++j) { + long v = x[j]; + for (; i < v; ++i) + ans[c++] = i; + ++i; + } + while (c < ans.length) + ans[c++] = i++; + return ans; + } + + private static int saturatedCast(long toInt) { + if (toInt > Integer.MAX_VALUE) { + return Integer.MAX_VALUE; + } else { + return (int) toInt; + } + } + + /** + * generates randomly N distinct longs from 0 to Max. + * + * @param N + * number of longs to generate + * @param Max + * bound on the value of longs + * @return an array containing randomly selected longs + */ + public long[] generateUniform(int N, long Max) { + assert N >= 0; + assert Max >= 0; + if (N * 2 > Max) { + return negate(generateUniform(saturatedCast(Max - N), Max), Max); + } + if (2048 * N > Max) + return generateUniformBitmap(N, Max); + return generateUniformHash(N, Max); + } + + /** + * generates randomly N distinct longs from 0 to Max. + */ + long[] generateUniformBitmap(int N, long Max) { + if (N > Max) + throw new RuntimeException("not possible"); + Roaring64Bitmap bs = new Roaring64Bitmap(); + int cardinality = 0; + while (cardinality < N) { + long v = (long) (rand.nextDouble() * Max); + if (!bs.contains(v)) { + bs.add(v); + cardinality++; + } + } + return bs.toArray(); + } + + Random rand = new Random(); + +} \ No newline at end of file