diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java b/java/memory/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java new file mode 100644 index 00000000000..51d29ba79bf --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.memory.util.hash; + +import static io.netty.util.internal.PlatformDependent.getByte; +import static io.netty.util.internal.PlatformDependent.getInt; +import static io.netty.util.internal.PlatformDependent.getLong; + +import java.nio.ByteOrder; + +import io.netty.buffer.ArrowBuf; + +/** + * Utility for calculating the hash code for a consecutive memory region. + * This class provides the basic framework for efficiently calculating the hash code. + * It first splits the memory region into small segments with 8 bytes, 4 bytes and 1 byte, + * and calculates hash codes for them separately. It produces the final hash code by combining + * the hash codes and finalizing the resulting hash code. + * + *

+ * To compute the hash code, the user simply calls the hashCode methods with the starting + * address and length of the memory region. + *

+ *

+ * A default light-weight implementation of this class is given in {@link DirectHasher}. However, users can + * devise their own customized hasher by sub-classing this class and overriding the abstract methods. + * In particular: + *

  • + * {@link ArrowBufHasher#combineHashCode(int, int)} provides the method for combining hash + * codes for individual small segments. + *
  • + *
  • + * {@link ArrowBufHasher#finalizeHashCode(int)} provides the method for finalizing the hash code. + *
  • + *
  • + * {@link ArrowBufHasher#getByteHashCode(byte)} provides the method for calculating the hash code + * for 1-byte memory segment. + *
  • + *
  • + * {@link ArrowBufHasher#getIntHashCode(int)} provides the method for calculating the hash code + * for 4-byte memory segment. + *
  • + *
  • + * {@link ArrowBufHasher#getLongHashCode(long)} provides the method for calculating the hash code + * for 8-byte memory segment. + *
  • + *

    + */
+public abstract class ArrowBufHasher {
+
+  /**
+   * Whether the native byte order of the platform is little endian. When it is not, the
+   * multi-byte values read by {@link #hashCode(long, int)} are byte-reversed before hashing.
+   */
+  public static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;
+
+  /**
+   * Calculates the hash code for a memory region.
+   *
+   * The region is consumed as a run of 8-byte segments, then 4-byte segments, then single
+   * bytes. The hash of each segment is folded into a running value (starting from 0) with
+   * {@link #combineHashCode(int, int)}, and the result is passed through
+   * {@link #finalizeHashCode(int)}. This method itself performs no bounds checking.
+   * A non-positive length reads no memory and yields {@code finalizeHashCode(0)}.
+   *
+   * @param address start address of the memory region.
+   * @param length length of the memory region.
+   * @return the hash code.
+   */
+  public int hashCode(long address, int length) {
+    int hashValue = 0;
+    int index = 0;
+
+    // The loop bounds are written as "length - index >= n" instead of "index + n <= length":
+    // the latter overflows int once length is within n of Integer.MAX_VALUE, which would
+    // let the loops keep running past the end of the region.
+    while (length - index >= 8) {
+      long longValue = getLong(address + index);
+      if (!LITTLE_ENDIAN) {
+        // assume the buffer is in little endian
+        longValue = Long.reverseBytes(longValue);
+      }
+      int longHash = getLongHashCode(longValue);
+      hashValue = combineHashCode(hashValue, longHash);
+      index += 8;
+    }
+
+    while (length - index >= 4) {
+      int intValue = getInt(address + index);
+      if (!LITTLE_ENDIAN) {
+        // same little-endian assumption as for the 8-byte segments
+        intValue = Integer.reverseBytes(intValue);
+      }
+      int intHash = getIntHashCode(intValue);
+      hashValue = combineHashCode(hashValue, intHash);
+      index += 4;
+    }
+
+    // whatever is left is hashed one byte at a time
+    while (index < length) {
+      byte byteValue = getByte(address + index);
+      int byteHash = getByteHashCode(byteValue);
+      hashValue = combineHashCode(hashValue, byteHash);
+      index += 1;
+    }
+
+    return finalizeHashCode(hashValue);
+  }
+
+  /**
+   * Calculates the hash code for a memory region inside a buffer.
+   * The range is first validated with {@code buf.checkBytes(offset, offset + length)}; the
+   * hash is then computed over {@code length} bytes starting at
+   * {@code buf.memoryAddress() + offset}.
+   *
+   * @param buf the buffer for the memory region.
+   * @param offset offset within the buffer for the memory region.
+   * @param length length of the memory region.
+   * @return the hash code.
+   */
+  public int hashCode(ArrowBuf buf, int offset, int length) {
+    buf.checkBytes(offset, offset + length);
+    return hashCode(buf.memoryAddress() + offset, length);
+  }
+
+  /**
+   * Calculates the hash code by combining the existing hash code and a new hash code.
+   * Called once per segment, in the order the segments appear in the memory region.
+   *
+   * @param currentHashCode the existing hash code.
+   * @param newHashCode the new hash code.
+   * @return the combined hash code.
+   */
+  protected abstract int combineHashCode(int currentHashCode, int newHashCode);
+
+  /**
+   * Gets the hash code for a byte value.
+   *
+   * @param byteValue the byte value.
+   * @return the hash code.
+   */
+  protected abstract int getByteHashCode(byte byteValue);
+
+  /**
+   * Gets the hash code for an integer value.
+   *
+   * @param intValue the integer value.
+   * @return the hash code.
+   */
+  protected abstract int getIntHashCode(int intValue);
+
+  /**
+   * Gets the hash code for a long value.
+   *
+   * @param longValue the long value.
+   * @return the hash code.
+   */
+  protected abstract int getLongHashCode(long longValue);
+
+  /**
+   * Finalizes the hash code. Called once, after all segments have been combined.
+   *
+   * @param hashCode the current hash code.
+   * @return the finalized hash code.
+   */
+  protected abstract int finalizeHashCode(int hashCode);
+}
diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java b/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java
new file mode 100644
index 00000000000..18f39c3814f
--- /dev/null
+++ b/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.arrow.memory.util.hash; + +/** + * Calculate hash code by directly returning the integers. + * This is the default and the fastest way to get the hash code. + *

    + * Objects of this class are stateless, so they can be shared between threads. + *

    + */
+public class DirectHasher extends ArrowBufHasher {
+
+  /**
+   * The shared instance. The constructor is private, so this is the only instance
+   * created by this class; it is final so that it cannot be reassigned by callers.
+   */
+  public static final DirectHasher INSTANCE = new DirectHasher();
+
+  private static final int DEFAULT_SEED = 0;
+
+  private DirectHasher() {
+
+  }
+
+  /**
+   * Combines two hash codes as {@code currentHashCode * 37 + newHashCode}.
+   */
+  @Override
+  protected int combineHashCode(int currentHashCode, int newHashCode) {
+    return currentHashCode * 37 + newHashCode;
+  }
+
+  /**
+   * Returns the byte value itself, sign-extended to an int.
+   */
+  @Override
+  protected int getByteHashCode(byte byteValue) {
+    return (int) byteValue;
+  }
+
+  /**
+   * Returns the integer value itself.
+   */
+  @Override
+  protected int getIntHashCode(int intValue) {
+    return intValue;
+  }
+
+  /**
+   * Returns {@link Long#hashCode(long)} of the value.
+   */
+  @Override
+  protected int getLongHashCode(long longValue) {
+    return Long.hashCode(longValue);
+  }
+
+  /**
+   * Finalizes the hash code by running it through one Murmur-style mixing round,
+   * starting from {@code DEFAULT_SEED}.
+   *
+   * @param hashCode the combined hash code of all segments.
+   * @return the mixed hash code.
+   */
+  @Override
+  protected int finalizeHashCode(int hashCode) {
+    // finalize by the Murmur hashing algorithm
+    // details can be found in
+    // https://en.wikipedia.org/wiki/MurmurHash
+
+    int c1 = 0xcc9e2d51;
+    int c2 = 0x1b873593;
+    int r1 = 15;
+    int r2 = 13;
+    int m = 5;
+    int n = 0xe6546b64;
+
+    // Murmur mixes with bit rotations, not plain shifts. A plain shift discards the
+    // high-order bits and zero-fills the low-order ones: "<< 15" followed by "<< 13"
+    // would leave the low 28 bits of the result constant, i.e. only 16 possible hash codes.
+    int k = hashCode;
+    k = k * c1;
+    k = Integer.rotateLeft(k, r1);
+    k = k * c2;
+
+    int hash = DEFAULT_SEED;
+    hash = hash ^ k;
+    hash = Integer.rotateLeft(hash, r2);
+    hash = hash * m + n;
+
+    return hash;
+  }
+}
diff --git a/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufHasher.java b/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufHasher.java
new file mode 100644
index 00000000000..9fc9c500ec8
--- /dev/null
+++ b/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufHasher.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.memory.util;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+
+import org.apache.arrow.memory.util.hash.ArrowBufHasher;
+import org.apache.arrow.memory.util.hash.DirectHasher;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import io.netty.buffer.ArrowBuf;
+
+/**
+ * Test cases for {@link ArrowBufHasher} and its subclasses.
+ */
+public class TestArrowBufHasher {
+
+  /** Size in bytes of every buffer allocated by these tests. */
+  private static final int BUFFER_LENGTH = 1024;
+
+  /** {offset, length} pairs of the regions hashed by {@link #testDirectHasher()}. */
+  private static final int[][] REGIONS = {
+    {0, 100}, {1, 5}, {10, 17}, {33, 25}, {22, 22},
+    {123, 333}, {374, 1}, {11, 0}, {75, 25}, {0, 1024}
+  };
+
+  private BufferAllocator allocator;
+
+  @Before
+  public void prepare() {
+    allocator = new RootAllocator(1024 * 1024);
+  }
+
+  @After
+  public void shutdown() {
+    allocator.close();
+  }
+
+  /**
+   * Writes BUFFER_LENGTH / 4 floats (i / 10.0f for i = 0, 1, 2, ...) into the buffer,
+   * one every 4 bytes starting at index 0.
+   */
+  private static void fillWithFloats(ArrowBuf buf) {
+    for (int i = 0; i < BUFFER_LENGTH / 4; i++) {
+      buf.setFloat(i * 4, i / 10.0f);
+    }
+  }
+
+  /**
+   * Two buffers holding identical data must produce identical hash codes for the same region,
+   * including regions that are unaligned, a single byte long, or empty.
+   */
+  @Test
+  public void testDirectHasher() {
+    try (ArrowBuf buf1 = allocator.buffer(BUFFER_LENGTH);
+         ArrowBuf buf2 = allocator.buffer(BUFFER_LENGTH)) {
+      fillWithFloats(buf1);
+      fillWithFloats(buf2);
+
+      ArrowBufHasher hasher = DirectHasher.INSTANCE;
+
+      for (int[] region : REGIONS) {
+        int offset = region[0];
+        int length = region[1];
+        assertEquals("offset=" + offset + ", length=" + length,
+            hasher.hashCode(buf1, offset, length), hasher.hashCode(buf2, offset, length));
+      }
+    }
+  }
+
+  /**
+   * A negative length and regions reaching past the end of the buffer must be rejected.
+   */
+  @Test
+  public void testDirectHasherNegative() {
+    try (ArrowBuf buf = allocator.buffer(BUFFER_LENGTH)) {
+      fillWithFloats(buf);
+
+      ArrowBufHasher hasher = DirectHasher.INSTANCE;
+
+      // negative length
+      assertThrows(IllegalArgumentException.class, () -> {
+        hasher.hashCode(buf, 0, -1);
+      });
+
+      // length larger than the buffer
+      assertThrows(IndexOutOfBoundsException.class, () -> {
+        hasher.hashCode(buf, 0, 1028);
+      });
+
+      // offset + length past the end of the buffer
+      assertThrows(IndexOutOfBoundsException.class, () -> {
+        hasher.hashCode(buf, 500, 1000);
+      });
+    }
+  }
+}