apache · liyafan82 · Jul 23, 2019 · emkornfield · Jul 20, 2019 · liyafan82
diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java b/java/memory/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.memory.util.hash;
+
+import static io.netty.util.internal.PlatformDependent.getByte;
+import static io.netty.util.internal.PlatformDependent.getInt;
+import static io.netty.util.internal.PlatformDependent.getLong;
+
+import java.nio.ByteOrder;
+
+import io.netty.buffer.ArrowBuf;
+
+/**
+ * Utility for calculating the hash code for a consecutive memory region.
+ * This class provides the basic framework for efficiently calculating the hash code.
+ * It first splits the memory region into small segments with 8 bytes, 4 bytes and 1 byte,
+ * and calculates hash codes for them separately. It produces the final hash code by combining
+ * the hash codes and finalizing the resulting hash code.
+ *
+ * <p>
+ *   To compute the hash code, the user simply calls the hashCode methods with the starting
+ *   address and length of the memory region.
+ * </p>
+ * <p>
+ *   A default light-weight implementation of this class is given in {@link DirectHasher}. However, users can
+ *   devise their own customized hasher by sub-classing this class and overriding the abstract methods.
+ *   In particular:
+ *   <li>
+ *     {@link ArrowBufHasher#combineHashCode(int, int)} provides the method for combining hash
+ *     codes for individual small segments.
+ *   </li>
+ *   <li>
+ *     {@link ArrowBufHasher#finalizeHashCode(int)} provides the method for finalizing the hash code.
+ *   </li>
+ *   <li>
+ *     {@link ArrowBufHasher#getByteHashCode(byte)} provides the method for calculating the hash code
+ *     for 1-byte memory segment.
+ *   </li>
+ *   <li>
+ *     {@link ArrowBufHasher#getIntHashCode(int)} provides the method for calculating the hash code
+ *     for 4-byte memory segment.
+ *   </li>
+ *   <li>
+ *     {@link ArrowBufHasher#getLongHashCode(long)} provides the method for calculating the hash code
+ *     for 8-byte memory segment.
+ *   </li>
+ * </p>
+ */
+public abstract class ArrowBufHasher {
+
+  public static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;
+
+  /**
+   * Calculates the hash code for a memory region. Bytes are read straight from the given
+   * address (this method performs no bounds checks) and words are treated as little endian.
+   * @param address start address of the memory region.
+   * @param length length of the memory region, in bytes.
+   * @return the hash code.
+   */
+  public int hashCode(long address, int length) {
+    int hashValue = 0;
+    int index = 0;
+    // Test the remaining byte count: "index + 8 <= length" overflows for lengths near Integer.MAX_VALUE.
+    while (length - index >= 8) {
+      long longValue = getLong(address + index);
+      if (!LITTLE_ENDIAN) {
+        // assume the buffer is in little endian
+        longValue = Long.reverseBytes(longValue);
+      }
+      hashValue = combineHashCode(hashValue, getLongHashCode(longValue));
+      index += 8;
+    }
+
+    // fewer than 8 bytes remain, so this loop runs at most once
+    while (length - index >= 4) {
+      int intValue = getInt(address + index);
+      if (!LITTLE_ENDIAN) {
+        intValue = Integer.reverseBytes(intValue);
+      }
+      hashValue = combineHashCode(hashValue, getIntHashCode(intValue));
+      index += 4;
+    }
+
+    // fewer than 4 bytes remain; hash them one at a time
+    while (index < length) {
+      hashValue = combineHashCode(hashValue, getByteHashCode(getByte(address + index)));
+      index += 1;
+    }
+
+    return finalizeHashCode(hashValue);
+  }
+
+  /**
+   * Calculates the hash code for a region of a buffer, which is first validated with {@code checkBytes}.
+   * @param buf the buffer for the memory region.
+   * @param offset offset within the buffer for the memory region.
+   * @param length length of the memory region.
+   * @return the hash code.
+   */
+  public int hashCode(ArrowBuf buf, int offset, int length) {
+    buf.checkBytes(offset, offset + length);
+    return hashCode(buf.memoryAddress() + offset, length);
+  }
+
+  /**
+   * Folds the hash code of the next segment into the running hash code.
+   * @param currentHashCode the running hash code (0 for the first segment).
+   * @param newHashCode the hash code of the next segment.
+   * @return the combined hash code.
+   */
+  protected abstract int combineHashCode(int currentHashCode, int newHashCode);
+
+  /**
+   * Gets the hash code for a byte value.
+   * @param byteValue the byte value.
+   * @return the hash code.
+   */
+  protected abstract int getByteHashCode(byte byteValue);
+
+  /**
+   * Gets the hash code for an integer value.
+   * @param intValue the integer value.
+   * @return the hash code.
+   */
+  protected abstract int getIntHashCode(int intValue);
+
+  /**
+   * Gets the hash code for a long value.
+   * @param longValue the long value.
+   * @return the hash code.
+   */
+  protected abstract int getLongHashCode(long longValue);
+
+  /**
+   * Finalizes the hash code; called once, after all segments have been combined.
+   * @param hashCode the combined hash code of all segments.
+   * @return the finalized hash code.
+   */
+  protected abstract int finalizeHashCode(int hashCode);
+}
diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java b/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.memory.util.hash;
+
+/**
+ * Calculate hash code by directly returning the integers.
+ * This is the default and the fastest way to get the hash code.
+ * <p>
+ *   Instances of this class are stateless, so they can be shared between threads.
+ * </p>
+ */
+public class DirectHasher extends ArrowBufHasher {
+
+  /** The shared instance; declared final so that it can never be reassigned. */
+  public static final DirectHasher INSTANCE = new DirectHasher();
+
+  private static final int DEFAULT_SEED = 0;
+
+  private DirectHasher() {
+    // instances are only obtainable through INSTANCE
+  }
+
+  /** Combines the hash codes as {@code currentHashCode * 37 + newHashCode}. */
+  @Override
+  protected int combineHashCode(int currentHashCode, int newHashCode) {
+    return currentHashCode * 37 + newHashCode;
+  }
+
+  /** Returns the byte value itself, sign-extended to an int. */
+  @Override
+  protected int getByteHashCode(byte byteValue) {
+    return (int) byteValue;
+  }
+
+  /** Returns the int value itself. */
+  @Override
+  protected int getIntHashCode(int intValue) {
+    return intValue;
+  }
+
+  /** Returns {@link Long#hashCode(long)} of the value. */
+  @Override
+  protected int getLongHashCode(long longValue) {
+    return Long.hashCode(longValue);
+  }
+
+  @Override
+  protected int finalizeHashCode(int hashCode) {
+    // finalize with one mixing round of the Murmur hashing algorithm; details can be found in
+    // https://en.wikipedia.org/wiki/MurmurHash
+    int c1 = 0xcc9e2d51;
+    int c2 = 0x1b873593;
+    int r1 = 15;
+    int r2 = 13;
+    int m = 5;
+    int n = 0xe6546b64;
+
+    // Murmur rotates (ROL) here; a plain "<<" would clear the low 28 bits of the mixed
+    // value, leaving only 16 distinct hash codes.
+    int k = hashCode * c1;
+    k = Integer.rotateLeft(k, r1);
+    k = k * c2;
+    int hash = DEFAULT_SEED ^ k;
+    hash = Integer.rotateLeft(hash, r2);
+    return hash * m + n;
+  }
+}
diff --git a/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufHasher.java b/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufHasher.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.memory.util;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+
+import org.apache.arrow.memory.util.hash.ArrowBufHasher;
+import org.apache.arrow.memory.util.hash.DirectHasher;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import io.netty.buffer.ArrowBuf;
+
+/**
+ * Test cases for {@link ArrowBufHasher} and its subclasses.
+ */
+public class TestArrowBufHasher {
+
+  private static final int BUFFER_LENGTH = 1024;
+
+  private BufferAllocator allocator;
+
+  @Before
+  public void prepare() {
+    allocator = new RootAllocator(1024 * 1024);
+  }
+
+  @After
+  public void shutdown() {
+    allocator.close();
+  }
+
+  @Test
+  public void testDirectHasher() {
+    try (ArrowBuf buf1 = allocator.buffer(BUFFER_LENGTH);
+         ArrowBuf buf2 = allocator.buffer(BUFFER_LENGTH)) {
+      // fill both buffers with identical data, so equal regions must hash equally
+      for (int i = 0; i < BUFFER_LENGTH / 4; i++) {
+        buf1.setFloat(i * 4, i / 10.0f);
+        buf2.setFloat(i * 4, i / 10.0f);
+      }
+
+      ArrowBufHasher hasher = DirectHasher.INSTANCE;
+
+      assertEquals(hasher.hashCode(buf1, 0, 100), hasher.hashCode(buf2, 0, 100));
+      assertEquals(hasher.hashCode(buf1, 1, 5), hasher.hashCode(buf2, 1, 5));
+      assertEquals(hasher.hashCode(buf1, 10, 17), hasher.hashCode(buf2, 10, 17));
+      assertEquals(hasher.hashCode(buf1, 33, 25), hasher.hashCode(buf2, 33, 25));
+      assertEquals(hasher.hashCode(buf1, 22, 22), hasher.hashCode(buf2, 22, 22));
+      assertEquals(hasher.hashCode(buf1, 123, 333), hasher.hashCode(buf2, 123, 333));
+      assertEquals(hasher.hashCode(buf1, 374, 1), hasher.hashCode(buf2, 374, 1));
+      assertEquals(hasher.hashCode(buf1, 11, 0), hasher.hashCode(buf2, 11, 0));
+      assertEquals(hasher.hashCode(buf1, 75, 25), hasher.hashCode(buf2, 75, 25));
+      assertEquals(hasher.hashCode(buf1, 0, BUFFER_LENGTH), hasher.hashCode(buf2, 0, BUFFER_LENGTH));
+    }
+  }
+
+  @Test
+  public void testDirectHasherNegative() {
+    try (ArrowBuf buf = allocator.buffer(BUFFER_LENGTH)) {
+      // prepare data
+      for (int i = 0; i < BUFFER_LENGTH / 4; i++) {
+        buf.setFloat(i * 4, i / 10.0f);
+      }
+
+      ArrowBufHasher hasher = DirectHasher.INSTANCE;
+      assertThrows(IllegalArgumentException.class, () -> {
+        hasher.hashCode(buf, 0, -1);
+      });
+      // a region longer than the requested buffer length
+      assertThrows(IndexOutOfBoundsException.class, () -> {
+        hasher.hashCode(buf, 0, BUFFER_LENGTH + 4);
+      });
+      // a region that starts inside the requested length but ends beyond it
+      assertThrows(IndexOutOfBoundsException.class, () -> {
+        hasher.hashCode(buf, 500, 1000);
+      });
+    }
+  }
+}