-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-5898: [Java] Provide functionality to efficiently compute hash code for arbitrary memory segment #4844
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
155 changes: 155 additions & 0 deletions
155
java/memory/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,155 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.arrow.memory.util.hash; | ||
|
|
||
| import static io.netty.util.internal.PlatformDependent.getByte; | ||
| import static io.netty.util.internal.PlatformDependent.getInt; | ||
| import static io.netty.util.internal.PlatformDependent.getLong; | ||
|
|
||
| import java.nio.ByteOrder; | ||
|
|
||
| import io.netty.buffer.ArrowBuf; | ||
|
|
||
| /** | ||
| * Utility for calculating the hash code for a consecutive memory region. | ||
| * This class provides the basic framework for efficiently calculating the hash code. | ||
| * It first splits the memory region into small segments with 8 bytes, 4 bytes and 1 byte, | ||
| * and calculates hash codes for them separately. It produces the final hash code by combining | ||
| * the hash codes and finalizing the resulting hash code. | ||
| * | ||
| * <p> | ||
| * To compute the hash code, the user simply calls the hashCode methods with the starting | ||
| * address and length of the memory region. | ||
| * </p> | ||
| * <p> | ||
| * A default light-weight implementation of this class is given in {@link DirectHasher}. However, the users can | ||
| * devise their own customized hasher by sub-classing this method and overriding the abstract methods. | ||
| * In particular | ||
| * <li> | ||
| * {@link ArrowBufHasher#combineHashCode(int, int)} provides the method for combining hash | ||
| * codes for individual small segments. | ||
| * </li> | ||
| * <li> | ||
| * {@link ArrowBufHasher#finalizeHashCode(int)} provides the method for finalizing the hash code. | ||
| * </li> | ||
| * <li> | ||
| * {@link ArrowBufHasher#getByteHashCode(byte)} provides the method for calculating the hash code | ||
| * for 1-byte memory segment. | ||
| * </li> | ||
| * <li> | ||
| * {@link ArrowBufHasher#getIntHashCode(int)} provides the method for calculating the hash code | ||
| * for 4-byte memory segment. | ||
| * </li> | ||
| * <li> | ||
| * {@link ArrowBufHasher#getLongHashCode(long)} provides the method for calculating the hash code | ||
| * for 8-byte memory segment. | ||
| * </li> | ||
| * </p> | ||
| */ | ||
| public abstract class ArrowBufHasher { | ||
|
|
||
| public static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; | ||
|
|
||
| /** | ||
| * Calculates the hash code for a memory region. | ||
| * @param address start address of the memory region. | ||
| * @param length length of the memory region. | ||
| * @return the hash code. | ||
| */ | ||
| public int hashCode(long address, int length) { | ||
| int hashValue = 0; | ||
| int index = 0; | ||
| while (index + 8 <= length) { | ||
| long longValue = getLong(address + index); | ||
| if (!LITTLE_ENDIAN) { | ||
| // assume the buffer is in little endian | ||
| longValue = Long.reverseBytes(longValue); | ||
| } | ||
| int longHash = getLongHashCode(longValue); | ||
| hashValue = combineHashCode(hashValue, longHash); | ||
| index += 8; | ||
| } | ||
|
|
||
| while (index + 4 <= length) { | ||
| int intValue = getInt(address + index); | ||
| if (!LITTLE_ENDIAN) { | ||
| intValue = Integer.reverseBytes(intValue); | ||
| } | ||
| int intHash = getIntHashCode(intValue); | ||
| hashValue = combineHashCode(hashValue, intHash); | ||
| index += 4; | ||
| } | ||
|
|
||
| while (index < length) { | ||
| byte byteValue = getByte(address + index); | ||
| int byteHash = getByteHashCode(byteValue); | ||
| hashValue = combineHashCode(hashValue, byteHash); | ||
| index += 1; | ||
| } | ||
|
|
||
| return finalizeHashCode(hashValue); | ||
| } | ||
|
|
||
| /** | ||
| * Calculates the hash code for a memory region. | ||
| * @param buf the buffer for the memory region. | ||
| * @param offset offset within the buffer for the memory region. | ||
| * @param length length of the memory region. | ||
| * @return the hash code. | ||
| */ | ||
| public int hashCode(ArrowBuf buf, int offset, int length) { | ||
| buf.checkBytes(offset, offset + length); | ||
| return hashCode(buf.memoryAddress() + offset, length); | ||
| } | ||
|
|
||
| /** | ||
| * Calculates the hash code by combining the existing hash code and a new hash code. | ||
| * @param currentHashCode the existing hash code. | ||
| * @param newHashCode the new hash code. | ||
| * @return the combined hash code. | ||
| */ | ||
| protected abstract int combineHashCode(int currentHashCode, int newHashCode); | ||
|
|
||
| /** | ||
| * Gets the hash code for a byte value. | ||
| * @param byteValue the byte value. | ||
| * @return the hash code. | ||
| */ | ||
| protected abstract int getByteHashCode(byte byteValue); | ||
|
|
||
| /** | ||
| * Gets the hash code for a integer value. | ||
| * @param intValue the integer value. | ||
| * @return the hash code. | ||
| */ | ||
| protected abstract int getIntHashCode(int intValue); | ||
|
|
||
| /** | ||
| * Gets the hash code for a long value. | ||
| * @param longValue the long value. | ||
| * @return the hash code. | ||
| */ | ||
| protected abstract int getLongHashCode(long longValue); | ||
|
|
||
| /** | ||
| * Finalize the hash code. | ||
| * @param hashCode the current hash code. | ||
| * @return the finalized hash code. | ||
| */ | ||
| protected abstract int finalizeHashCode(int hashCode); | ||
| } | ||
82 changes: 82 additions & 0 deletions
82
java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,82 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.arrow.memory.util.hash; | ||
|
|
||
| /** | ||
| * Calculate hash code by directly returning the integers. | ||
| * This is the default and the fastest way to get the hash code. | ||
| * <p> | ||
| * Objects of class are stateless, so it can be shared between threads. | ||
| * </p> | ||
| */ | ||
| public class DirectHasher extends ArrowBufHasher { | ||
|
|
||
| public static DirectHasher INSTANCE = new DirectHasher(); | ||
|
|
||
| private static final int DEFAULT_SEED = 0; | ||
|
|
||
| private DirectHasher() { | ||
|
|
||
| } | ||
|
|
||
| @Override | ||
| protected int combineHashCode(int currentHashCode, int newHashCode) { | ||
| return currentHashCode * 37 + newHashCode; | ||
| } | ||
|
|
||
| @Override | ||
| protected int getByteHashCode(byte byteValue) { | ||
| return (int) byteValue; | ||
| } | ||
|
|
||
| @Override | ||
| protected int getIntHashCode(int intValue) { | ||
| return intValue; | ||
| } | ||
|
|
||
| @Override | ||
| protected int getLongHashCode(long longValue) { | ||
| return Long.hashCode(longValue); | ||
| } | ||
|
|
||
| @Override | ||
| protected int finalizeHashCode(int hashCode) { | ||
| // finalize by the Murmur hashing algorithm | ||
| // details can be found in | ||
| // https://en.wikipedia.org/wiki/MurmurHash | ||
|
|
||
| int c1 = 0xcc9e2d51; | ||
| int c2 = 0x1b873593; | ||
| int r1 = 15; | ||
| int r2 = 13; | ||
| int m = 5; | ||
| int n = 0xe6546b64; | ||
|
|
||
| int k = hashCode; | ||
| k = k * c1; | ||
| k = k << r1; | ||
| k = k * c2; | ||
|
|
||
| int hash = DEFAULT_SEED; | ||
| hash = hash ^ k; | ||
| hash = hash << r2; | ||
| hash = hash * m + n; | ||
|
|
||
| return hash; | ||
| } | ||
| } |
100 changes: 100 additions & 0 deletions
100
java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufHasher.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,100 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.arrow.memory.util; | ||
|
|
||
| import static org.junit.Assert.assertEquals; | ||
| import static org.junit.jupiter.api.Assertions.assertThrows; | ||
|
|
||
| import org.apache.arrow.memory.BufferAllocator; | ||
| import org.apache.arrow.memory.RootAllocator; | ||
|
|
||
| import org.apache.arrow.memory.util.hash.ArrowBufHasher; | ||
| import org.apache.arrow.memory.util.hash.DirectHasher; | ||
| import org.junit.After; | ||
| import org.junit.Before; | ||
| import org.junit.Test; | ||
|
|
||
| import io.netty.buffer.ArrowBuf; | ||
|
|
||
| /** | ||
| * Test cases for {@link ArrowBufHasher} and its subclasses. | ||
| */ | ||
| public class TestArrowBufHasher { | ||
|
|
||
| private final int BUFFER_LENGTH = 1024; | ||
|
|
||
| private BufferAllocator allocator; | ||
|
|
||
| @Before | ||
| public void prepare() { | ||
| allocator = new RootAllocator(1024 * 1024); | ||
| } | ||
|
|
||
| @After | ||
| public void shutdown() { | ||
| allocator.close(); | ||
| } | ||
|
|
||
| @Test | ||
| public void testDirectHasher() { | ||
| try (ArrowBuf buf1 = allocator.buffer(BUFFER_LENGTH); | ||
| ArrowBuf buf2 = allocator.buffer(BUFFER_LENGTH)) { | ||
| // prepare data | ||
| for (int i = 0; i < BUFFER_LENGTH / 4; i++) { | ||
| buf1.setFloat(i * 4, i / 10.0f); | ||
| buf2.setFloat(i * 4, i / 10.0f); | ||
| } | ||
|
|
||
| ArrowBufHasher hasher = DirectHasher.INSTANCE; | ||
|
|
||
| assertEquals(hasher.hashCode(buf1, 0, 100), hasher.hashCode(buf2, 0, 100)); | ||
| assertEquals(hasher.hashCode(buf1, 1, 5), hasher.hashCode(buf2, 1, 5)); | ||
| assertEquals(hasher.hashCode(buf1, 10, 17), hasher.hashCode(buf2, 10, 17)); | ||
| assertEquals(hasher.hashCode(buf1, 33, 25), hasher.hashCode(buf2, 33, 25)); | ||
| assertEquals(hasher.hashCode(buf1, 22, 22), hasher.hashCode(buf2, 22, 22)); | ||
| assertEquals(hasher.hashCode(buf1, 123, 333), hasher.hashCode(buf2, 123, 333)); | ||
| assertEquals(hasher.hashCode(buf1, 374, 1), hasher.hashCode(buf2, 374, 1)); | ||
| assertEquals(hasher.hashCode(buf1, 11, 0), hasher.hashCode(buf2, 11, 0)); | ||
| assertEquals(hasher.hashCode(buf1, 75, 25), hasher.hashCode(buf2, 75, 25)); | ||
| assertEquals(hasher.hashCode(buf1, 0, 1024), hasher.hashCode(buf2, 0, 1024)); | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| public void testDirectHasherNegative() { | ||
| try (ArrowBuf buf = allocator.buffer(BUFFER_LENGTH)) { | ||
| // prepare data | ||
| for (int i = 0; i < BUFFER_LENGTH / 4; i++) { | ||
| buf.setFloat(i * 4, i / 10.0f); | ||
| } | ||
|
|
||
| ArrowBufHasher hasher = DirectHasher.INSTANCE; | ||
| assertThrows(IllegalArgumentException.class, () -> { | ||
| hasher.hashCode(buf, 0, -1); | ||
| }); | ||
|
|
||
| assertThrows(IndexOutOfBoundsException.class, () -> { | ||
| hasher.hashCode(buf, 0, 1028); | ||
| }); | ||
|
|
||
| assertThrows(IndexOutOfBoundsException.class, () -> { | ||
| hasher.hashCode(buf, 500, 1000); | ||
| }); | ||
| } | ||
| } | ||
| } |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you document how consumers are expected to use this class?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure. Good suggestion.