diff --git a/src/main/java/org/apache/datasketches/hash/MurmurHash3.java b/src/main/java/org/apache/datasketches/hash/MurmurHash3.java index 38236560a..c4deef60d 100644 --- a/src/main/java/org/apache/datasketches/hash/MurmurHash3.java +++ b/src/main/java/org/apache/datasketches/hash/MurmurHash3.java @@ -20,6 +20,8 @@ package org.apache.datasketches.hash; import java.io.Serializable; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; /** @@ -177,6 +179,57 @@ public static long[] hash(final char[] key, final long seed) { return hashState.finalMix128(k1, k2, chars << 1); //convert to bytes } + //--Hash of ByteBuffer------------------------------------------------ + /** + * Returns a long array of size 2, which is a 128-bit hash of the input. + * + * @param buf The input byte buffer. Must be non-null and non-empty. + * @param seed A long valued seed. + * @return the hash. + */ + public static long[] hash(final ByteBuffer buf, final long seed) { + final HashState hashState = new HashState(seed, seed); + final ByteBuffer littleEndianBuf; + + if (buf.order() == ByteOrder.LITTLE_ENDIAN) { + littleEndianBuf = buf; + } else { + littleEndianBuf = buf.duplicate().order(ByteOrder.LITTLE_ENDIAN); + } + + final int bytes = littleEndianBuf.remaining(); //in bytes + final int offset = littleEndianBuf.position(); + + // Number of full 128-bit blocks of 16 bytes. + // Possible exclusion of a remainder of up to 15 bytes. + final int nblocks = bytes >>> 4; //bytes / 16 + + // Process the 128-bit blocks (the body) into the hash + for (int i = 0; i < nblocks; i++ ) { //16 bytes per block + final long k1 = getLong(littleEndianBuf, offset + (i << 4), 8); //0, 16, 32, ... + final long k2 = getLong(littleEndianBuf, offset + (i << 4) + 8, 8); //8, 24, 40, ... + hashState.blockMix128(k1, k2); + } + + // Get the tail index, remainder length + final int tail = nblocks << 4; //16 bytes per block + final int rem = bytes - tail; // remainder bytes: 0,1,...,15 + + // Get the tail + final long k1; + final long k2; + if (rem > 8) { //k1 -> whole; k2 -> partial + k1 = getLong(littleEndianBuf, offset + tail, 8); + k2 = getLong(littleEndianBuf, offset + tail + 8, rem - 8); + } + else { //k1 -> whole, partial or 0; k2 == 0 + k1 = (rem == 0) ? 0 : getLong(littleEndianBuf, offset + tail, rem); + k2 = 0; + } + // Mix the tail into the hash and return + return hashState.finalMix128(k1, k2, bytes); + } + //--Hash of byte[]---------------------------------------------------- /** * Returns a long array of size 2, which is a 128-bit hash of the input. @@ -308,6 +361,43 @@ private static long mixK2(long k2) { } } + //--Helper methods---------------------------------------------------- + /** + * Gets a long from the given byte buffer starting at the given position index and continuing for + * remainder (rem) bytes. The buffer must be in little-endian order. The bytes are extracted in + * little-endian order. The buffer endianness and limit are not checked. + * + * @param buf The given input byte buffer. + * @param index Zero-based index from the start of the byte array. + * @param rem Remainder bytes. An integer in the range [1,8]. + * @return long + */ + private static long getLong(final ByteBuffer buf, final int index, final int rem) { + long out = 0L; + + switch (rem) { + case 8: + out = buf.getLong(index); + break; + case 7: + out ^= (buf.get(index + 6) & 0xFFL) << 48; + case 6: + out ^= (buf.get(index + 5) & 0xFFL) << 40; + case 5: + out ^= (buf.get(index + 4) & 0xFFL) << 32; + case 4: + out ^= (buf.get(index + 3) & 0xFFL) << 24; + case 3: + out ^= (buf.get(index + 2) & 0xFFL) << 16; + case 2: + out ^= (buf.get(index + 1) & 0xFFL) << 8; + case 1: + out ^= buf.get(index) & 0xFFL; + } + + return out; + } + //--Helper methods---------------------------------------------------- /** * Gets a long from the given byte array starting at the given byte array index and continuing for diff --git a/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java b/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java index bfb679711..076cd0b4e 100644 --- a/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java +++ b/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java @@ -27,6 +27,8 @@ import org.apache.datasketches.memory.Memory; +import java.nio.ByteBuffer; + /** * Although this class is package-private, it provides a single place to define and document * the common public API for both HllSketch and Union. @@ -327,6 +329,23 @@ public void update(final String datum) { couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED))); } + /** + * Present the given byte buffer as a potential unique item. + * Bytes are read from the current position of the buffer until its limit. + * If the byte buffer is null or has no bytes remaining, no update attempt is made and the method returns. + * + * This method will not modify the position, mark, limit, or byte order of the buffer. + * + * Little-endian order is preferred, but not required. This method may perform better if the provided byte + * buffer is in little-endian order. + * + * @param data The given byte buffer. + */ + public void update(final ByteBuffer data) { + if ((data == null) || (data.remaining() == 0)) { return; } + couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED))); + } + /** * Present the given byte array as a potential unique item. * If the byte array is null or empty no update attempt is made and the method returns. diff --git a/src/test/java/org/apache/datasketches/hash/MurmurHash3Test.java b/src/test/java/org/apache/datasketches/hash/MurmurHash3Test.java index 3bfbcd26e..7fea07638 100644 --- a/src/test/java/org/apache/datasketches/hash/MurmurHash3Test.java +++ b/src/test/java/org/apache/datasketches/hash/MurmurHash3Test.java @@ -19,12 +19,18 @@ package org.apache.datasketches.hash; +import static java.nio.charset.StandardCharsets.UTF_16; import static org.apache.datasketches.hash.MurmurHash3.hash; import static java.nio.charset.StandardCharsets.UTF_8; import org.testng.Assert; import org.testng.annotations.Test; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.List; + /** * Tests the MurmurHash3 against specific, known hash results given known * inputs obtained from the public domain C++ version 150. @@ -46,6 +52,18 @@ public void checkByteArrRemainderGT8() { //byte[], remainder > 8 Assert.assertEquals(result[1], h2); } + @Test + public void checkByteBufRemainderGT8() { //byte buffer, remainder > 8 + String keyStr = "The quick brown fox jumps over the lazy dog"; + byte[] key = keyStr.getBytes(UTF_8); + + //Should be: + long h1 = 0xe34bbc7bbc071b6cL; + long h2 = 0x7a433ca9c49a9347L; + + checkHashByteBuf(key, h1, h2); + } + @Test public void checkByteArrChange1bit() { //byte[], change one bit String keyStr = "The quick brown fox jumps over the lazy eog"; @@ -58,6 +76,18 @@ public void checkByteArrChange1bit() { //byte[], change one bit Assert.assertEquals(result[1], h2); } + @Test + public void checkByteBufChange1bit() { //byte buffer, change one bit + String keyStr = "The quick brown fox jumps over the lazy eog"; + byte[] key = keyStr.getBytes(UTF_8); + + //Should be: + long h1 = 0x362108102c62d1c9L; + long h2 = 0x3285cd100292b305L; + + checkHashByteBuf(key, h1, h2); + } + @Test public void checkByteArrRemainderLt8() { //byte[], test a remainder < 8 String keyStr = "The quick brown fox jumps over the lazy dogdogdog"; @@ -70,6 +100,18 @@ public void checkByteArrRemainderLt8() { //byte[], test a remainder < 8 Assert.assertEquals(result[1], h2); } + @Test + public void checkByteBufRemainderLt8() { //byte buffer, test a remainder < 8 + String keyStr = "The quick brown fox jumps over the lazy dogdogdog"; + byte[] key = keyStr.getBytes(UTF_8); + + //Should be; + long h1 = 0x9c8205300e612fc4L; + long h2 = 0xcbc0af6136aa3df9L; + + checkHashByteBuf(key, h1, h2); + } + @Test public void checkByteArrReaminderEQ8() { //byte[], test a remainder = 8 String keyStr = "The quick brown fox jumps over the lazy1"; @@ -80,7 +122,18 @@ public void checkByteArrReaminderEQ8() { //byte[], test a remainder = 8 long h2 = 0xbdbf05f8da0f0392L; Assert.assertEquals(result[0], h1); Assert.assertEquals(result[1], h2); + } + + @Test + public void checkByteBufReaminderEQ8() { //byte buffer, test a remainder = 8 + String keyStr = "The quick brown fox jumps over the lazy1"; + byte[] key = keyStr.getBytes(UTF_8); + //Should be: + long h1 = 0xe3301a827e5cdfe3L; + long h2 = 0xbdbf05f8da0f0392L; + + checkHashByteBuf(key, h1, h2); } /** @@ -190,6 +243,21 @@ public void checkByteArrAllOnesZeros() { //byte[], test a ones byte and a zeros Assert.assertEquals(result[1], h2); } + @Test + public void checkByteBufAllOnesZeros() { //byte[], test a ones byte and a zeros byte + byte[] key = { + 0x54, 0x68, 0x65, 0x20, 0x71, 0x75, 0x69, 0x63, 0x6b, 0x20, 0x62, 0x72, 0x6f, 0x77, 0x6e, + 0x20, 0x66, 0x6f, 0x78, 0x20, 0x6a, 0x75, 0x6d, 0x70, 0x73, 0x20, 0x6f, 0x76, 0x65, + 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x7a, 0x79, 0x20, 0x64, 0x6f, 0x67, + (byte) 0xff, 0x64, 0x6f, 0x67, 0x00 + }; + + long h1 = 0xe88abda785929c9eL; + long h2 = 0x96b98587cacc83d6L; + + checkHashByteBuf(key, h1, h2); + } + /** * This test demonstrates that the hash of byte[], char[], int[], or long[] will produce the * same hash result if, and only if, all the arrays have the same exact length in bytes, and if @@ -204,6 +272,13 @@ public void checkCrossTypeHashConsistency() { println(org.apache.datasketches.Util.longToHexBytes(out1[0])); println(org.apache.datasketches.Util.longToHexBytes(out1[1])); + println("ByteBuffer"); + ByteBuffer bBuf = ByteBuffer.wrap(bArr); + out = hash(bBuf, 0L); + Assert.assertEquals(out, out1); + println(org.apache.datasketches.Util.longToHexBytes(out1[0])); + println(org.apache.datasketches.Util.longToHexBytes(out1[1])); + println("Chars"); char[] cArr = {0X0201, 0X0403, 0X0605, 0X0807, 0X0a09, 0X0c0b, 0X0e0d, 0X100f, 0X1211, 0X1413, 0X1615, 0X1817}; @@ -267,6 +342,42 @@ private static int[] stringToInts(String in) { return out; } + /** + * Tests {@link MurmurHash3#hash(ByteBuffer, long)} on the provided key. + * + * @param key byte array to hash + * @param h1 first half of expected hash + * @param h2 second half of expected hash + */ + private static void checkHashByteBuf(byte[] key, long h1, long h2) { + // Include dummy byte at start, end to make sure position, limit are respected. + ByteBuffer bigEndianBuf = ByteBuffer.allocate(key.length + 2).order(ByteOrder.BIG_ENDIAN); + bigEndianBuf.position(1); + bigEndianBuf.put(key); + bigEndianBuf.limit(1 + key.length); + bigEndianBuf.position(1); + + // Test with little endian too. + ByteBuffer littleEndianBuf = bigEndianBuf.duplicate().order(ByteOrder.LITTLE_ENDIAN); + + long[] result1 = MurmurHash3.hash(bigEndianBuf, 0); + long[] result2 = MurmurHash3.hash(littleEndianBuf, 0); + + // Position, limit, order should not be changed. + Assert.assertEquals(1, bigEndianBuf.position()); + Assert.assertEquals(1, littleEndianBuf.position()); + Assert.assertEquals(1 + key.length, bigEndianBuf.limit()); + Assert.assertEquals(1 + key.length, littleEndianBuf.limit()); + Assert.assertEquals(ByteOrder.BIG_ENDIAN, bigEndianBuf.order()); + Assert.assertEquals(ByteOrder.LITTLE_ENDIAN, littleEndianBuf.order()); + + // Check the actual hashes. + Assert.assertEquals(result1[0], h1); + Assert.assertEquals(result1[1], h2); + Assert.assertEquals(result2[0], h1); + Assert.assertEquals(result2[1], h2); + } + @Test public void printlnTest() { println("PRINTING: "+this.getClass().getName()); diff --git a/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java b/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java index e78e80785..ea8ffef60 100644 --- a/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java +++ b/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java @@ -26,6 +26,8 @@ import org.apache.datasketches.memory.WritableMemory; +import java.nio.ByteBuffer; + /** * @author Lee Rhodes * @@ -40,6 +42,10 @@ public void checkUpdateTypes() { sk.update(byteArr); sk.update(new byte[] {}); sk.update(new byte[] {0, 1, 2, 3}); + ByteBuffer byteBuf = null; + sk.update(byteBuf); + sk.update(ByteBuffer.wrap(new byte[] {})); + sk.update(ByteBuffer.wrap(new byte[] {0, 1, 2, 3})); char[] charArr = null; sk.update(charArr); sk.update(new char[] {}); @@ -66,6 +72,10 @@ public void checkUpdateTypes() { u.update(byteArr1); u.update(new byte[] {}); u.update(new byte[] {0, 1, 2, 3}); + ByteBuffer byteBuf1 = null; + u.update(byteBuf1); + u.update(ByteBuffer.wrap(new byte[] {})); + u.update(ByteBuffer.wrap(new byte[] {0, 1, 2, 3})); char[] charArr1 = null; u.update(charArr1); u.update(new char[] {});