Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.arrow.memory.util.hash;

import static io.netty.util.internal.PlatformDependent.getByte;
import static io.netty.util.internal.PlatformDependent.getInt;
import static io.netty.util.internal.PlatformDependent.getLong;

import java.nio.ByteOrder;

import io.netty.buffer.ArrowBuf;

/**
* Utility for calculating the hash code for a consecutive memory region.
* This class provides the basic framework for efficiently calculating the hash code.
* It first splits the memory region into small segments with 8 bytes, 4 bytes and 1 byte,
* and calculates hash codes for them separately. It produces the final hash code by combining
* the hash codes and finalizing the resulting hash code.
*
* <p>
* To compute the hash code, the user simply calls the hashCode methods with the starting
* address and length of the memory region.
* </p>
* <p>
* A default light-weight implementation of this class is given in {@link DirectHasher}. However, users can
* devise their own customized hasher by sub-classing this class and overriding the abstract methods.
* In particular:
* </p>
* <ul>
* <li>
* {@link ArrowBufHasher#combineHashCode(int, int)} provides the method for combining hash
* codes for individual small segments.
* </li>
* <li>
* {@link ArrowBufHasher#finalizeHashCode(int)} provides the method for finalizing the hash code.
* </li>
* <li>
* {@link ArrowBufHasher#getByteHashCode(byte)} provides the method for calculating the hash code
* for a 1-byte memory segment.
* </li>
* <li>
* {@link ArrowBufHasher#getIntHashCode(int)} provides the method for calculating the hash code
* for a 4-byte memory segment.
* </li>
* <li>
* {@link ArrowBufHasher#getLongHashCode(long)} provides the method for calculating the hash code
* for an 8-byte memory segment.
* </li>
* </ul>
*/
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you document how consumers are expected to use this class?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. Good suggestion.

public abstract class ArrowBufHasher {

  /**
   * Whether the native byte order of the current platform is little endian.
   * When it is not, multi-byte values read from memory are byte-swapped before hashing.
   */
  public static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;

  /**
   * Calculates the hash code for a memory region.
   * The region is consumed as a run of 8-byte segments, followed by 4-byte segments,
   * followed by single bytes; the hash of each segment is folded into the running value
   * with {@link #combineHashCode(int, int)}, and the result is passed through
   * {@link #finalizeHashCode(int)}.
   * No bounds checking is performed on the raw address.
   *
   * @param address start address of the memory region.
   * @param length length of the memory region.
   * @return the hash code.
   */
  public int hashCode(long address, int length) {
    int hashValue = 0;
    int index = 0;

    // The loop guards compare the number of remaining bytes rather than "index + 8 <= length":
    // for a length close to Integer.MAX_VALUE the sum would overflow to a negative value,
    // keep the guard true, and make the loop read past the end of the region.
    while (length - index >= 8) {
      long longValue = getLong(address + index);
      if (!LITTLE_ENDIAN) {
        // assume the buffer is in little endian
        longValue = Long.reverseBytes(longValue);
      }
      int longHash = getLongHashCode(longValue);
      hashValue = combineHashCode(hashValue, longHash);
      index += 8;
    }

    while (length - index >= 4) {
      int intValue = getInt(address + index);
      if (!LITTLE_ENDIAN) {
        // same little-endian assumption as for the 8-byte segments
        intValue = Integer.reverseBytes(intValue);
      }
      int intHash = getIntHashCode(intValue);
      hashValue = combineHashCode(hashValue, intHash);
      index += 4;
    }

    // remaining tail of fewer than 4 bytes
    while (index < length) {
      byte byteValue = getByte(address + index);
      int byteHash = getByteHashCode(byteValue);
      hashValue = combineHashCode(hashValue, byteHash);
      index += 1;
    }

    return finalizeHashCode(hashValue);
  }

  /**
   * Calculates the hash code for a memory region.
   * The requested range is first validated through {@code buf.checkBytes(offset, offset + length)},
   * then hashed starting at the buffer's memory address plus {@code offset}.
   *
   * @param buf the buffer for the memory region.
   * @param offset offset within the buffer for the memory region.
   * @param length length of the memory region.
   * @return the hash code.
   */
  public int hashCode(ArrowBuf buf, int offset, int length) {
    buf.checkBytes(offset, offset + length);
    return hashCode(buf.memoryAddress() + offset, length);
  }

  /**
   * Calculates the hash code by combining the existing hash code and a new hash code.
   *
   * @param currentHashCode the existing hash code.
   * @param newHashCode the new hash code.
   * @return the combined hash code.
   */
  protected abstract int combineHashCode(int currentHashCode, int newHashCode);

  /**
   * Gets the hash code for a byte value.
   *
   * @param byteValue the byte value.
   * @return the hash code.
   */
  protected abstract int getByteHashCode(byte byteValue);

  /**
   * Gets the hash code for an integer value.
   *
   * @param intValue the integer value.
   * @return the hash code.
   */
  protected abstract int getIntHashCode(int intValue);

  /**
   * Gets the hash code for a long value.
   *
   * @param longValue the long value.
   * @return the hash code.
   */
  protected abstract int getLongHashCode(long longValue);

  /**
   * Finalizes the hash code.
   *
   * @param hashCode the current hash code.
   * @return the finalized hash code.
   */
  protected abstract int finalizeHashCode(int hashCode);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.arrow.memory.util.hash;

/**
 * Calculates hash codes by using the segment values themselves as their hash codes.
 * Byte and int segments are returned unchanged and long segments are reduced with
 * {@link Long#hashCode(long)}; the combined value is then scrambled with one mixing
 * round of the Murmur3 hashing algorithm.
 * <p>
 * Instances of this class hold no state, so the shared {@link #INSTANCE} can be used
 * from multiple threads.
 * </p>
 */
public class DirectHasher extends ArrowBufHasher {

  /** The shared singleton instance. */
  public static final DirectHasher INSTANCE = new DirectHasher();

  private static final int DEFAULT_SEED = 0;

  // Constants of the Murmur3 (32-bit) block-mixing step,
  // see https://en.wikipedia.org/wiki/MurmurHash
  private static final int C1 = 0xcc9e2d51;
  private static final int C2 = 0x1b873593;
  private static final int R1 = 15;
  private static final int R2 = 13;
  private static final int M = 5;
  private static final int N = 0xe6546b64;

  private DirectHasher() {
  }

  /**
   * Combines two hash codes as {@code currentHashCode * 37 + newHashCode}.
   *
   * @param currentHashCode the existing hash code.
   * @param newHashCode the new hash code.
   * @return the combined hash code.
   */
  @Override
  protected int combineHashCode(int currentHashCode, int newHashCode) {
    return currentHashCode * 37 + newHashCode;
  }

  /**
   * Returns the byte value, sign-extended to an int, as its hash code.
   *
   * @param byteValue the byte value.
   * @return the hash code.
   */
  @Override
  protected int getByteHashCode(byte byteValue) {
    return (int) byteValue;
  }

  /**
   * Returns the int value itself as its hash code.
   *
   * @param intValue the integer value.
   * @return the hash code.
   */
  @Override
  protected int getIntHashCode(int intValue) {
    return intValue;
  }

  /**
   * Returns {@link Long#hashCode(long)} of the value.
   *
   * @param longValue the long value.
   * @return the hash code.
   */
  @Override
  protected int getLongHashCode(long longValue) {
    return Long.hashCode(longValue);
  }

  /**
   * Finalizes the hash code with one mixing round of the Murmur3 algorithm.
   *
   * @param hashCode the current hash code.
   * @return the finalized hash code.
   */
  @Override
  protected int finalizeHashCode(int hashCode) {
    // Murmur3 uses bit rotations here. Plain left shifts would discard the high bits
    // and leave the low 28 bits of the result constant, collapsing all inputs into
    // at most 16 distinct hash codes.
    int k = hashCode;
    k = k * C1;
    k = Integer.rotateLeft(k, R1);
    k = k * C2;

    int hash = DEFAULT_SEED;
    hash = hash ^ k;
    hash = Integer.rotateLeft(hash, R2);
    hash = hash * M + N;

    return hash;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.arrow.memory.util;

import static org.junit.Assert.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;

import org.apache.arrow.memory.util.hash.ArrowBufHasher;
import org.apache.arrow.memory.util.hash.DirectHasher;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import io.netty.buffer.ArrowBuf;

/**
 * Test cases for {@link ArrowBufHasher} and its subclasses.
 */
public class TestArrowBufHasher {

  /** Size in bytes of every buffer allocated by the tests. */
  private static final int BUFFER_LENGTH = 1024;

  private BufferAllocator allocator;

  @Before
  public void prepare() {
    allocator = new RootAllocator(1024 * 1024);
  }

  @After
  public void shutdown() {
    allocator.close();
  }

  /**
   * Fills the first {@link #BUFFER_LENGTH} bytes of the buffer with the 4-byte floats
   * {@code 0 / 10.0f, 1 / 10.0f, 2 / 10.0f, ...}.
   */
  private static void fillWithFloats(ArrowBuf buf) {
    for (int i = 0; i < BUFFER_LENGTH / 4; i++) {
      buf.setFloat(i * 4, i / 10.0f);
    }
  }

  /**
   * Two buffers with identical content must produce identical hash codes
   * for a variety of aligned and unaligned (offset, length) regions.
   */
  @Test
  public void testDirectHasher() {
    try (ArrowBuf buf1 = allocator.buffer(BUFFER_LENGTH);
         ArrowBuf buf2 = allocator.buffer(BUFFER_LENGTH)) {
      // prepare data
      fillWithFloats(buf1);
      fillWithFloats(buf2);

      ArrowBufHasher hasher = DirectHasher.INSTANCE;

      assertEquals(hasher.hashCode(buf1, 0, 100), hasher.hashCode(buf2, 0, 100));
      assertEquals(hasher.hashCode(buf1, 1, 5), hasher.hashCode(buf2, 1, 5));
      assertEquals(hasher.hashCode(buf1, 10, 17), hasher.hashCode(buf2, 10, 17));
      assertEquals(hasher.hashCode(buf1, 33, 25), hasher.hashCode(buf2, 33, 25));
      assertEquals(hasher.hashCode(buf1, 22, 22), hasher.hashCode(buf2, 22, 22));
      assertEquals(hasher.hashCode(buf1, 123, 333), hasher.hashCode(buf2, 123, 333));
      assertEquals(hasher.hashCode(buf1, 374, 1), hasher.hashCode(buf2, 374, 1));
      assertEquals(hasher.hashCode(buf1, 11, 0), hasher.hashCode(buf2, 11, 0));
      assertEquals(hasher.hashCode(buf1, 75, 25), hasher.hashCode(buf2, 75, 25));
      assertEquals(hasher.hashCode(buf1, 0, 1024), hasher.hashCode(buf2, 0, 1024));
    }
  }

  /**
   * Invalid regions (negative length, or a range reaching past the end of the buffer)
   * must be rejected with an exception.
   */
  @Test
  public void testDirectHasherNegative() {
    try (ArrowBuf buf = allocator.buffer(BUFFER_LENGTH)) {
      // prepare data
      fillWithFloats(buf);

      ArrowBufHasher hasher = DirectHasher.INSTANCE;
      assertThrows(IllegalArgumentException.class, () -> {
        hasher.hashCode(buf, 0, -1);
      });

      assertThrows(IndexOutOfBoundsException.class, () -> {
        hasher.hashCode(buf, 0, 1028);
      });

      assertThrows(IndexOutOfBoundsException.class, () -> {
        hasher.hashCode(buf, 500, 1000);
      });
    }
  }
}