Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions src/main/java/org/apache/datasketches/hash/MurmurHash3.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
package org.apache.datasketches.hash;

import java.io.Serializable;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;


/**
Expand Down Expand Up @@ -177,6 +179,57 @@ public static long[] hash(final char[] key, final long seed) {
return hashState.finalMix128(k1, k2, chars << 1); //convert to bytes
}

//--Hash of ByteBuffer------------------------------------------------
/**
* Returns a long array of size 2, which is a 128-bit hash of the input.
*
* @param buf The input byte buffer. Must be non-null and non-empty.
* @param seed A long valued seed.
* @return the hash.
*/
public static long[] hash(final ByteBuffer buf, final long seed) {
final HashState hashState = new HashState(seed, seed);
final ByteBuffer littleEndianBuf;

if (buf.order() == ByteOrder.LITTLE_ENDIAN) {
littleEndianBuf = buf;
} else {
littleEndianBuf = buf.duplicate().order(ByteOrder.LITTLE_ENDIAN);
}

final int bytes = littleEndianBuf.remaining(); //in bytes
final int offset = littleEndianBuf.position();

// Number of full 128-bit blocks of 16 bytes.
// Possible exclusion of a remainder of up to 15 bytes.
final int nblocks = bytes >>> 4; //bytes / 16

// Process the 128-bit blocks (the body) into the hash
for (int i = 0; i < nblocks; i++ ) { //16 bytes per block
final long k1 = getLong(littleEndianBuf, offset + (i << 4), 8); //0, 16, 32, ...
final long k2 = getLong(littleEndianBuf, offset + (i << 4) + 8, 8); //8, 24, 40, ...
hashState.blockMix128(k1, k2);
}

// Get the tail index, remainder length
final int tail = nblocks << 4; //16 bytes per block
final int rem = bytes - tail; // remainder bytes: 0,1,...,15

// Get the tail
final long k1;
final long k2;
if (rem > 8) { //k1 -> whole; k2 -> partial
k1 = getLong(littleEndianBuf, offset + tail, 8);
k2 = getLong(littleEndianBuf, offset + tail + 8, rem - 8);
}
else { //k1 -> whole, partial or 0; k2 == 0
k1 = (rem == 0) ? 0 : getLong(littleEndianBuf, offset + tail, rem);
k2 = 0;
}
// Mix the tail into the hash and return
return hashState.finalMix128(k1, k2, bytes);
}

//--Hash of byte[]----------------------------------------------------
/**
* Returns a long array of size 2, which is a 128-bit hash of the input.
Expand Down Expand Up @@ -308,6 +361,43 @@ private static long mixK2(long k2) {
}
}

//--Helper methods----------------------------------------------------
/**
* Gets a long from the given byte buffer starting at the given position index and continuing for
* remainder (rem) bytes. The buffer must be in little-endian order. The bytes are extracted in
* little-endian order. The buffer endianness and limit are not checked.
*
* @param buf The given input byte buffer.
* @param index Zero-based index from the start of the byte array.
* @param rem Remainder bytes. An integer in the range [1,8].
* @return long
*/
private static long getLong(final ByteBuffer buf, final int index, final int rem) {
long out = 0L;

switch (rem) {
case 8:
out = buf.getLong(index);
break;
case 7:
out ^= (buf.get(index + 6) & 0xFFL) << 48;
case 6:
out ^= (buf.get(index + 5) & 0xFFL) << 40;
case 5:
out ^= (buf.get(index + 4) & 0xFFL) << 32;
case 4:
out ^= (buf.get(index + 3) & 0xFFL) << 24;
case 3:
out ^= (buf.get(index + 2) & 0xFFL) << 16;
case 2:
out ^= (buf.get(index + 1) & 0xFFL) << 8;
case 1:
out ^= buf.get(index) & 0xFFL;
}

return out;
}

//--Helper methods----------------------------------------------------
/**
* Gets a long from the given byte array starting at the given byte array index and continuing for
Expand Down
19 changes: 19 additions & 0 deletions src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@

import org.apache.datasketches.memory.Memory;

import java.nio.ByteBuffer;

/**
* Although this class is package-private, it provides a single place to define and document
* the common public API for both HllSketch and Union.
Expand Down Expand Up @@ -327,6 +329,23 @@ public void update(final String datum) {
couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED)));
}

/**
* Present the given byte buffer as a potential unique item.
* Bytes are read from the current position of the buffer until its limit.
* If the byte buffer is null or has no bytes remaining, no update attempt is made and the method returns.
*
* This method will not modify the position, mark, limit, or byte order of the buffer.
*
* Little-endian order is preferred, but not required. This method may perform better if the provided byte
* buffer is in little-endian order.
*
* @param data The given byte buffer.
*/
public void update(final ByteBuffer data) {
if ((data == null) || (data.remaining() == 0)) { return; }
couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED)));
}

/**
* Present the given byte array as a potential unique item.
* If the byte array is null or empty no update attempt is made and the method returns.
Expand Down
111 changes: 111 additions & 0 deletions src/test/java/org/apache/datasketches/hash/MurmurHash3Test.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,18 @@

package org.apache.datasketches.hash;

import static java.nio.charset.StandardCharsets.UTF_16;
import static org.apache.datasketches.hash.MurmurHash3.hash;
import static java.nio.charset.StandardCharsets.UTF_8;

import org.testng.Assert;
import org.testng.annotations.Test;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.List;

/**
* Tests the MurmurHash3 against specific, known hash results given known
* inputs obtained from the public domain C++ version 150.
Expand All @@ -46,6 +52,18 @@ public void checkByteArrRemainderGT8() { //byte[], remainder > 8
Assert.assertEquals(result[1], h2);
}

@Test
public void checkByteBufRemainderGT8() { //byte buffer, remainder > 8
String keyStr = "The quick brown fox jumps over the lazy dog";
byte[] key = keyStr.getBytes(UTF_8);

//Should be:
long h1 = 0xe34bbc7bbc071b6cL;
long h2 = 0x7a433ca9c49a9347L;

checkHashByteBuf(key, h1, h2);
}

@Test
public void checkByteArrChange1bit() { //byte[], change one bit
String keyStr = "The quick brown fox jumps over the lazy eog";
Expand All @@ -58,6 +76,18 @@ public void checkByteArrChange1bit() { //byte[], change one bit
Assert.assertEquals(result[1], h2);
}

@Test
public void checkByteBufChange1bit() { //byte buffer, change one bit
String keyStr = "The quick brown fox jumps over the lazy eog";
byte[] key = keyStr.getBytes(UTF_8);

//Should be:
long h1 = 0x362108102c62d1c9L;
long h2 = 0x3285cd100292b305L;

checkHashByteBuf(key, h1, h2);
}

@Test
public void checkByteArrRemainderLt8() { //byte[], test a remainder < 8
String keyStr = "The quick brown fox jumps over the lazy dogdogdog";
Expand All @@ -70,6 +100,18 @@ public void checkByteArrRemainderLt8() { //byte[], test a remainder < 8
Assert.assertEquals(result[1], h2);
}

@Test
public void checkByteBufRemainderLt8() { //byte buffer, test a remainder < 8
String keyStr = "The quick brown fox jumps over the lazy dogdogdog";
byte[] key = keyStr.getBytes(UTF_8);

//Should be;
long h1 = 0x9c8205300e612fc4L;
long h2 = 0xcbc0af6136aa3df9L;

checkHashByteBuf(key, h1, h2);
}

@Test
public void checkByteArrReaminderEQ8() { //byte[], test a remainder = 8
String keyStr = "The quick brown fox jumps over the lazy1";
Expand All @@ -80,7 +122,18 @@ public void checkByteArrReaminderEQ8() { //byte[], test a remainder = 8
long h2 = 0xbdbf05f8da0f0392L;
Assert.assertEquals(result[0], h1);
Assert.assertEquals(result[1], h2);
}

@Test
public void checkByteBufReaminderEQ8() { //byte buffer, test a remainder = 8
String keyStr = "The quick brown fox jumps over the lazy1";
byte[] key = keyStr.getBytes(UTF_8);

//Should be:
long h1 = 0xe3301a827e5cdfe3L;
long h2 = 0xbdbf05f8da0f0392L;

checkHashByteBuf(key, h1, h2);
}

/**
Expand Down Expand Up @@ -190,6 +243,21 @@ public void checkByteArrAllOnesZeros() { //byte[], test a ones byte and a zeros
Assert.assertEquals(result[1], h2);
}

@Test
public void checkByteBufAllOnesZeros() { //byte[], test a ones byte and a zeros byte
byte[] key = {
0x54, 0x68, 0x65, 0x20, 0x71, 0x75, 0x69, 0x63, 0x6b, 0x20, 0x62, 0x72, 0x6f, 0x77, 0x6e,
0x20, 0x66, 0x6f, 0x78, 0x20, 0x6a, 0x75, 0x6d, 0x70, 0x73, 0x20, 0x6f, 0x76, 0x65,
0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x7a, 0x79, 0x20, 0x64, 0x6f, 0x67,
(byte) 0xff, 0x64, 0x6f, 0x67, 0x00
};

long h1 = 0xe88abda785929c9eL;
long h2 = 0x96b98587cacc83d6L;

checkHashByteBuf(key, h1, h2);
}

/**
* This test demonstrates that the hash of byte[], char[], int[], or long[] will produce the
* same hash result if, and only if, all the arrays have the same exact length in bytes, and if
Expand All @@ -204,6 +272,13 @@ public void checkCrossTypeHashConsistency() {
println(org.apache.datasketches.Util.longToHexBytes(out1[0]));
println(org.apache.datasketches.Util.longToHexBytes(out1[1]));

println("ByteBuffer");
ByteBuffer bBuf = ByteBuffer.wrap(bArr);
out = hash(bBuf, 0L);
Assert.assertEquals(out, out1);
println(org.apache.datasketches.Util.longToHexBytes(out1[0]));
println(org.apache.datasketches.Util.longToHexBytes(out1[1]));

println("Chars");
char[] cArr = {0X0201, 0X0403, 0X0605, 0X0807, 0X0a09, 0X0c0b, 0X0e0d, 0X100f,
0X1211, 0X1413, 0X1615, 0X1817};
Expand Down Expand Up @@ -267,6 +342,42 @@ private static int[] stringToInts(String in) {
return out;
}

/**
* Tests {@link MurmurHash3#hash(ByteBuffer, long)} on the provided key.
*
* @param key byte array to hash
* @param h1 first half of expected hash
* @param h2 second half of expected hash
*/
private static void checkHashByteBuf(byte[] key, long h1, long h2) {
// Include dummy byte at start, end to make sure position, limit are respected.
ByteBuffer bigEndianBuf = ByteBuffer.allocate(key.length + 2).order(ByteOrder.BIG_ENDIAN);
bigEndianBuf.position(1);
bigEndianBuf.put(key);
bigEndianBuf.limit(1 + key.length);
bigEndianBuf.position(1);

// Test with little endian too.
ByteBuffer littleEndianBuf = bigEndianBuf.duplicate().order(ByteOrder.LITTLE_ENDIAN);

long[] result1 = MurmurHash3.hash(bigEndianBuf, 0);
long[] result2 = MurmurHash3.hash(littleEndianBuf, 0);

// Position, limit, order should not be changed.
Assert.assertEquals(1, bigEndianBuf.position());
Assert.assertEquals(1, littleEndianBuf.position());
Assert.assertEquals(1 + key.length, bigEndianBuf.limit());
Assert.assertEquals(1 + key.length, littleEndianBuf.limit());
Assert.assertEquals(ByteOrder.BIG_ENDIAN, bigEndianBuf.order());
Assert.assertEquals(ByteOrder.LITTLE_ENDIAN, littleEndianBuf.order());

// Check the actual hashes.
Assert.assertEquals(result1[0], h1);
Assert.assertEquals(result1[1], h2);
Assert.assertEquals(result2[0], h1);
Assert.assertEquals(result2[1], h2);
}

@Test
public void printlnTest() {
println("PRINTING: "+this.getClass().getName());
Expand Down
10 changes: 10 additions & 0 deletions src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@

import org.apache.datasketches.memory.WritableMemory;

import java.nio.ByteBuffer;

/**
* @author Lee Rhodes
*
Expand All @@ -40,6 +42,10 @@ public void checkUpdateTypes() {
sk.update(byteArr);
sk.update(new byte[] {});
sk.update(new byte[] {0, 1, 2, 3});
ByteBuffer byteBuf = null;
sk.update(byteBuf);
sk.update(ByteBuffer.wrap(new byte[] {}));
sk.update(ByteBuffer.wrap(new byte[] {0, 1, 2, 3}));
char[] charArr = null;
sk.update(charArr);
sk.update(new char[] {});
Expand All @@ -66,6 +72,10 @@ public void checkUpdateTypes() {
u.update(byteArr1);
u.update(new byte[] {});
u.update(new byte[] {0, 1, 2, 3});
ByteBuffer byteBuf1 = null;
u.update(byteBuf1);
u.update(ByteBuffer.wrap(new byte[] {}));
u.update(ByteBuffer.wrap(new byte[] {0, 1, 2, 3}));
char[] charArr1 = null;
u.update(charArr1);
u.update(new char[] {});
Expand Down