From b1b6f78d0b04422b8af2ac01b89ce3367db36cdb Mon Sep 17 00:00:00 2001
From: liyafan82
Date: Tue, 23 Jul 2019 11:19:24 +0800
Subject: [PATCH] [ARROW-5898][Java] Provide functionality to efficiently
compute hash code for arbitrary memory segment
---
.../memory/util/hash/ArrowBufHasher.java | 155 ++++++++++++++++++
.../arrow/memory/util/hash/DirectHasher.java | 82 +++++++++
.../arrow/memory/util/TestArrowBufHasher.java | 100 +++++++++++
3 files changed, 337 insertions(+)
create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java
create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java
create mode 100644 java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufHasher.java
diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java b/java/memory/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java
new file mode 100644
index 00000000000..51d29ba79bf
--- /dev/null
+++ b/java/memory/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.memory.util.hash;
+
+import static io.netty.util.internal.PlatformDependent.getByte;
+import static io.netty.util.internal.PlatformDependent.getInt;
+import static io.netty.util.internal.PlatformDependent.getLong;
+
+import java.nio.ByteOrder;
+
+import io.netty.buffer.ArrowBuf;
+
+/**
+ * Utility for calculating the hash code for a consecutive memory region.
+ * This class provides the basic framework for efficiently calculating the hash code.
+ * It first splits the memory region into small segments with 8 bytes, 4 bytes and 1 byte,
+ * and calculates hash codes for them separately. It produces the final hash code by combining
+ * the hash codes and finalizing the resulting hash code.
+ *
+ *
+ * <p>To compute the hash code, the user simply calls the hashCode methods with the starting
+ * address and length of the memory region.
+ *
+ *
+ * <p>A default light-weight implementation of this class is given in {@link DirectHasher}. However, users can
+ * devise their own customized hasher by sub-classing this class and overriding the abstract methods.
+ * In particular:
+ * <ul>
+ * <li>{@link ArrowBufHasher#combineHashCode(int, int)} provides the method for combining hash
+ * codes for individual small segments.</li>
+ *
+ *
+ * <li>{@link ArrowBufHasher#finalizeHashCode(int)} provides the method for finalizing the hash code.</li>
+ *
+ *
+ * <li>{@link ArrowBufHasher#getByteHashCode(byte)} provides the method for calculating the hash code
+ * for 1-byte memory segment.</li>
+ *
+ *
+ * <li>{@link ArrowBufHasher#getIntHashCode(int)} provides the method for calculating the hash code
+ * for 4-byte memory segment.</li>
+ *
+ *
+ * <li>{@link ArrowBufHasher#getLongHashCode(long)} provides the method for calculating the hash code
+ * for 8-byte memory segment.</li>
+ * </ul>
+ *
+ */
+public abstract class ArrowBufHasher {
+
+  public static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;
+
+  /**
+   * Calculates the hash code for the {@code length} bytes of memory starting at {@code address}.
+   * @param address start address of the memory region.
+   * @param length length of the memory region in bytes; if it is not positive, no byte is hashed.
+   * @return the hash code.
+   */
+  public int hashCode(long address, int length) {
+    int hashValue = 0;
+    int index = 0;
+    // Loop on the remaining byte count; "index + 8 <= length" overflows when length is near 2^31.
+    while (length - index >= 8) {
+      long longValue = getLong(address + index);
+      if (!LITTLE_ENDIAN) {
+        // assume the buffer is in little endian
+        longValue = Long.reverseBytes(longValue);
+      }
+      int longHash = getLongHashCode(longValue);
+      hashValue = combineHashCode(hashValue, longHash);
+      index += 8;
+    }
+    // At most 7 bytes remain: consume one 4-byte word if there is room for it.
+    while (length - index >= 4) {
+      int intValue = getInt(address + index);
+      if (!LITTLE_ENDIAN) {
+        intValue = Integer.reverseBytes(intValue);
+      }
+      int intHash = getIntHashCode(intValue);
+      hashValue = combineHashCode(hashValue, intHash);
+      index += 4;
+    }
+    // Hash the trailing bytes (fewer than 4) one at a time.
+    while (index < length) {
+      int byteHash = getByteHashCode(getByte(address + index));
+      hashValue = combineHashCode(hashValue, byteHash);
+      index += 1;
+    }
+
+    return finalizeHashCode(hashValue);
+  }
+
+  /**
+   * Calculates the hash code for a region of a buffer, which is first checked with {@code buf.checkBytes}.
+   * @param buf the buffer for the memory region.
+   * @param offset offset within the buffer for the memory region.
+   * @param length length of the memory region.
+   * @return the hash code.
+   */
+  public int hashCode(ArrowBuf buf, int offset, int length) {
+    buf.checkBytes(offset, offset + length);
+    return hashCode(buf.memoryAddress() + offset, length);
+  }
+
+  /**
+   * Calculates the hash code by combining the existing hash code and a new hash code.
+   * @param currentHashCode the existing hash code.
+   * @param newHashCode the new hash code.
+   * @return the combined hash code.
+   */
+  protected abstract int combineHashCode(int currentHashCode, int newHashCode);
+
+  /**
+   * Gets the hash code for a byte value.
+   * @param byteValue the byte value.
+   * @return the hash code.
+   */
+  protected abstract int getByteHashCode(byte byteValue);
+
+  /**
+   * Gets the hash code for an integer value.
+   * @param intValue the integer value.
+   * @return the hash code.
+   */
+  protected abstract int getIntHashCode(int intValue);
+
+  /**
+   * Gets the hash code for a long value.
+   * @param longValue the long value.
+   * @return the hash code.
+   */
+  protected abstract int getLongHashCode(long longValue);
+
+  /**
+   * Finalizes the hash code after all segments have been combined.
+   * @param hashCode the current hash code.
+   * @return the finalized hash code.
+   */
+  protected abstract int finalizeHashCode(int hashCode);
+}
diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java b/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java
new file mode 100644
index 00000000000..18f39c3814f
--- /dev/null
+++ b/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.memory.util.hash;
+
+/**
+ * Calculate hash code by directly returning the integers.
+ * This is the default and the fastest way to get the hash code.
+ *
+ * Objects of this class are stateless, so they can be shared between threads.
+ *
+ */
+public class DirectHasher extends ArrowBufHasher {
+
+  public static final DirectHasher INSTANCE = new DirectHasher();
+
+  private static final int DEFAULT_SEED = 0;
+
+  private DirectHasher() {
+    // not instantiable from outside; use INSTANCE
+  }
+
+  @Override
+  protected int combineHashCode(int currentHashCode, int newHashCode) {
+    return currentHashCode * 37 + newHashCode;
+  }
+
+  @Override
+  protected int getByteHashCode(byte byteValue) {
+    return (int) byteValue;
+  }
+
+  @Override
+  protected int getIntHashCode(int intValue) {
+    return intValue;
+  }
+
+  @Override
+  protected int getLongHashCode(long longValue) {
+    return Long.hashCode(longValue);
+  }
+
+  @Override
+  protected int finalizeHashCode(int hashCode) {
+    // One mixing round of the 32-bit MurmurHash3 algorithm (https://en.wikipedia.org/wiki/MurmurHash).
+    // The algorithm rotates bits instead of shifting them: a rotation keeps all 32 bits, whereas left
+    // shifts by r1 and then r2 (28 bits in total) would let only 4 bits of the input reach the result.
+
+    int c1 = 0xcc9e2d51;
+    int c2 = 0x1b873593;
+    int r1 = 15;
+    int r2 = 13;
+    int m = 5;
+    int n = 0xe6546b64;
+    // scramble the combined hash code
+    int k = hashCode;
+    k = k * c1;
+    k = Integer.rotateLeft(k, r1);
+    k = k * c2;
+    // mix the scrambled value into the seed
+    int hash = DEFAULT_SEED;
+    hash = hash ^ k;
+    hash = Integer.rotateLeft(hash, r2);
+    hash = hash * m + n;
+
+    return hash;
+  }
+}
diff --git a/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufHasher.java b/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufHasher.java
new file mode 100644
index 00000000000..9fc9c500ec8
--- /dev/null
+++ b/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufHasher.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.memory.util;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+
+import org.apache.arrow.memory.util.hash.ArrowBufHasher;
+import org.apache.arrow.memory.util.hash.DirectHasher;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import io.netty.buffer.ArrowBuf;
+
+/**
+ * Test cases for {@link ArrowBufHasher} and its subclasses.
+ */
+public class TestArrowBufHasher {
+
+  private static final int BUFFER_LENGTH = 1024;
+
+  private BufferAllocator allocator;
+
+  @Before
+  public void prepare() {
+    allocator = new RootAllocator(1024 * 1024);
+  }
+
+  @After
+  public void shutdown() {
+    allocator.close();
+  }
+
+  // Fills the first BUFFER_LENGTH bytes of the buffer with the floats i / 10 (i = 0, 1, 2, ...).
+  private static void fillWithFloats(ArrowBuf buf) {
+    for (int i = 0; i < BUFFER_LENGTH / 4; i++) {
+      buf.setFloat(i * 4, i / 10.0f);
+    }
+  }
+
+  @Test
+  public void testDirectHasher() {
+    try (ArrowBuf buf1 = allocator.buffer(BUFFER_LENGTH);
+         ArrowBuf buf2 = allocator.buffer(BUFFER_LENGTH)) {
+      fillWithFloats(buf1);
+      fillWithFloats(buf2);
+      ArrowBufHasher hasher = DirectHasher.INSTANCE;
+      // identical content must hash identically for any (offset, length), aligned or not
+      assertEquals(hasher.hashCode(buf1, 0, 100), hasher.hashCode(buf2, 0, 100));
+      assertEquals(hasher.hashCode(buf1, 1, 5), hasher.hashCode(buf2, 1, 5));
+      assertEquals(hasher.hashCode(buf1, 10, 17), hasher.hashCode(buf2, 10, 17));
+      assertEquals(hasher.hashCode(buf1, 33, 25), hasher.hashCode(buf2, 33, 25));
+      assertEquals(hasher.hashCode(buf1, 22, 22), hasher.hashCode(buf2, 22, 22));
+      assertEquals(hasher.hashCode(buf1, 123, 333), hasher.hashCode(buf2, 123, 333));
+      assertEquals(hasher.hashCode(buf1, 374, 1), hasher.hashCode(buf2, 374, 1));
+      assertEquals(hasher.hashCode(buf1, 11, 0), hasher.hashCode(buf2, 11, 0));
+      assertEquals(hasher.hashCode(buf1, 75, 25), hasher.hashCode(buf2, 75, 25));
+      assertEquals(hasher.hashCode(buf1, 0, 1024), hasher.hashCode(buf2, 0, 1024));
+    }
+  }
+
+  @Test
+  public void testDirectHasherNegative() {
+    try (ArrowBuf buf = allocator.buffer(BUFFER_LENGTH)) {
+      fillWithFloats(buf);
+
+      ArrowBufHasher hasher = DirectHasher.INSTANCE;
+      assertThrows(IllegalArgumentException.class, () -> {
+        hasher.hashCode(buf, 0, -1);
+      });
+      // a region longer than the whole buffer
+      assertThrows(IndexOutOfBoundsException.class, () -> {
+        hasher.hashCode(buf, 0, 1028);
+      });
+      // a region that starts inside the buffer but runs past its end
+      assertThrows(IndexOutOfBoundsException.class, () -> {
+        hasher.hashCode(buf, 500, 1000);
+      });
+    }
+  }
+}