From 8f2e1a2f778499c5eda1ac44bdaaa14916c10d9a Mon Sep 17 00:00:00 2001 From: tianchen Date: Wed, 10 Jul 2019 20:43:22 +0800 Subject: [PATCH 1/6] hash table prototype --- .../org/apache/arrow/vector/ValueVector.java | 6 + .../apache/arrow/vector/VarCharVector.java | 18 ++ .../vector/dictionary/DictionaryEncoder.java | 24 +- .../dictionary/DictionaryHashTable.java | 246 ++++++++++++++++++ 4 files changed, 276 insertions(+), 18 deletions(-) create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java index 86a381a0aec..53d8b9460f0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -237,4 +237,10 @@ public interface ValueVector extends Closeable, Iterable { * @return true if element is null */ boolean isNull(int index); + + //TODO remove default + default int hashCode(int index) {return 0;} + + //TODO remove default + default boolean equals(int index, ValueVector to, int toIndex) {return false;} } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java index c012ce3cf30..c66f8ad1b2d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java @@ -30,6 +30,9 @@ import org.apache.arrow.vector.util.Text; import org.apache.arrow.vector.util.TransferPair; +import java.util.Arrays; +import java.util.Objects; + /** * VarCharVector implements a variable width vector of VARCHAR * values which could be NULL. A validity buffer (bit vector) is maintained @@ -270,6 +273,21 @@ public void setSafe(int index, Text text) { setSafe(index, text.getBytes(), 0, text.getLength()); } + @Override + public int hashCode (int index) { + //TODO cal hashCode in memory level + byte[] values = get(index); + return Arrays.hashCode(values); + } + + @Override + public boolean equals(int index, ValueVector to, int toIndex) { + //TODO compare value in memory level + String value1 = this.getObject(index).toString(); + String value2 = to.getObject(toIndex).toString(); + return value1.equals(value2); + } + /*----------------------------------------------------------------* | | diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java index 623e4f4bc12..53bd9670b23 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -17,7 +17,6 @@ package org.apache.arrow.vector.dictionary; -import org.apache.arrow.vector.BaseBinaryVector; import org.apache.arrow.vector.BaseIntVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.ValueVector; @@ -45,14 +44,10 @@ public class DictionaryEncoder { public static ValueVector encode(ValueVector vector, Dictionary dictionary) { validateType(vector.getMinorType()); // load dictionary values into a hashmap for lookup - DictionaryEncodeHashMap lookUps = new DictionaryEncodeHashMap<>(dictionary.getVector().getValueCount()); - - boolean binaryType = isBinaryType(vector.getMinorType()); + DictionaryHashTable hashTable = new DictionaryHashTable(dictionary.getVector(), vector); for (int i = 0; i < dictionary.getVector().getValueCount(); i++) { - Object key = binaryType ? ((BaseBinaryVector) dictionary.getVector()).getByteArrayWrapper(i) : - dictionary.getVector().getObject(i); - lookUps.put(key, i); + hashTable.put(i); } Field valueField = vector.getField(); @@ -73,12 +68,12 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { int count = vector.getValueCount(); for (int i = 0; i < count; i++) { - Object value = binaryType ? ((BaseBinaryVector) vector).getByteArrayWrapper(i) : vector.getObject(i); - if (value != null) { // if it's null leave it null + if (!vector.isNull(i)) { // if it's null leave it null // note: this may fail if value was not included in the dictionary - int encoded = lookUps.get(value); + //int encoded = lookUps.get(value); + int encoded = hashTable.getIndex(i); if (encoded == -1) { - throw new IllegalArgumentException("Dictionary encoding not defined for value:" + value); + throw new IllegalArgumentException("Dictionary encoding not defined for value:" + vector.getObject(i)); } indices.setWithPossibleTruncate(i, encoded); } @@ -119,13 +114,6 @@ public static ValueVector decode(ValueVector indices, Dictionary dictionary) { return decoded; } - private static boolean isBinaryType(MinorType type) { - if (type == MinorType.VARBINARY || type == MinorType.FIXEDSIZEBINARY) { - return true; - } - return false; - } - private static void validateType(MinorType type) { if (type == MinorType.UNION) { throw new IllegalArgumentException("Dictionary encoding not implemented for current type: " + type); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java new file mode 100644 index 00000000000..38a3e543656 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java @@ -0,0 +1,246 @@ +package org.apache.arrow.vector.dictionary; + +import org.apache.arrow.vector.ValueVector; + +import java.util.Objects; + +/** + * HashTable used for Dictionary encoding. It holds two vectors (the vector to encode and dictionary vector) + * It stores the index in dictionary vector and for a given index in encode vector, + * it could return dictionary index. + */ +public class DictionaryHashTable { + + /** + * Represents a null value in map. + */ + static final int NULL_VALUE = -1; + + /** + * The default initial capacity - MUST be a power of two. + */ + static final int DEFAULT_INITIAL_CAPACITY = 1 << 4; + + /** + * The maximum capacity, used if a higher value is implicitly specified + * by either of the constructors with arguments. + */ + static final int MAXIMUM_CAPACITY = 1 << 30; + + /** + * The load factor used when none specified in constructor. + */ + static final float DEFAULT_LOAD_FACTOR = 0.75f; + + static final DictionaryHashTable.Entry[] EMPTY_TABLE = {}; + + /** + * The table, initialized on first use, and resized as + * necessary. When allocated, length is always a power of two. + */ + transient DictionaryHashTable.Entry[] table = (DictionaryHashTable.Entry[]) EMPTY_TABLE; + + /** + * The number of key-value mappings contained in this map. + */ + transient int size; + + /** + * The next size value at which to resize (capacity * load factor). + */ + int threshold; + + /** + * The load factor for the hash table. + */ + final float loadFactor; + + private final ValueVector dictionary; + + private final ValueVector toEncode; + + /** + * Constructs an empty map with the specified initial capacity and load factor. + */ + public DictionaryHashTable(int initialCapacity, ValueVector dictionary, ValueVector toEncode) { + if (initialCapacity < 0) { + throw new IllegalArgumentException("Illegal initial capacity: " + + initialCapacity); + } + if (initialCapacity > MAXIMUM_CAPACITY) { + initialCapacity = MAXIMUM_CAPACITY; + } + this.loadFactor = DEFAULT_LOAD_FACTOR; + this.threshold = initialCapacity; + + this.dictionary = dictionary; + this.toEncode = toEncode; + } + + public DictionaryHashTable(ValueVector dictionary, ValueVector toEncode) { + this(DEFAULT_INITIAL_CAPACITY, dictionary, toEncode); + } + + /** + * Compute the capacity with given threshold and create init table. + */ + private void inflateTable(int threshold) { + int capacity = roundUpToPowerOf2(threshold); + this.threshold = (int) Math.min(capacity * loadFactor, MAXIMUM_CAPACITY + 1); + table = new DictionaryHashTable.Entry[capacity]; + } + + /** + * Computes the storage location in an array for the given hashCode. + */ + static int indexFor(int h, int length) { + return h & (length - 1); + } + + /** + * Returns a power of two size for the given size. + */ + static final int roundUpToPowerOf2(int size) { + int n = size - 1; + n |= n >>> 1; + n |= n >>> 2; + n |= n >>> 4; + n |= n >>> 8; + n |= n >>> 16; + return (n < 0) ? 1 : (n >= MAXIMUM_CAPACITY) ? MAXIMUM_CAPACITY : n + 1; + } + + public int getIndex(int indexInArray) { + int hash = toEncode.hashCode(indexInArray); + int index = indexFor(hash, table.length); + for (DictionaryHashTable.Entry e = table[index]; e != null ; e = e.next) { + if ((e.hash == hash)) { + int dictIndex = e.index; + if (dictionary.equals(dictIndex, toEncode, indexInArray)) { + return dictIndex; + } + } + } + return NULL_VALUE; + } + + public void put(int indexInDictionary) { + if (table == EMPTY_TABLE) { + inflateTable(threshold); + } + + int hash = dictionary.hashCode(indexInDictionary); + int i = indexFor(hash, table.length); + for (DictionaryHashTable.Entry e = table[i]; e != null; e = e.next) { + if (e.hash == hash && e.index == indexInDictionary) { + //already has this index, return + return; + } + } + + addEntry(hash, indexInDictionary, i); + } + + /** + * Create a new Entry at the specific position of table. + */ + void createEntry(int hash, int index, int bucketIndex) { + DictionaryHashTable.Entry e = table[bucketIndex]; + table[bucketIndex] = new DictionaryHashTable.Entry(hash, index, e); + size++; + } + + /** + * Add Entry at the specified location of the table. + */ + void addEntry(int hash, int index, int bucketIndex) { + if ((size >= threshold) && (null != table[bucketIndex])) { + resize(2 * table.length); + bucketIndex = indexFor(hash, table.length); + } + + createEntry(hash, index, bucketIndex); + } + + /** + * Resize table with given new capacity. + */ + void resize(int newCapacity) { + DictionaryHashTable.Entry[] oldTable = table; + int oldCapacity = oldTable.length; + if (oldCapacity == MAXIMUM_CAPACITY) { + threshold = Integer.MAX_VALUE; + return; + } + + DictionaryHashTable.Entry[] newTable = new DictionaryHashTable.Entry[newCapacity]; + transfer(newTable); + table = newTable; + threshold = (int)Math.min(newCapacity * loadFactor, MAXIMUM_CAPACITY + 1); + } + + /** + * Transfer entries into new table from old table. + * @param newTable new table + */ + void transfer(DictionaryHashTable.Entry[] newTable) { + int newCapacity = newTable.length; + for (DictionaryHashTable.Entry e : table) { + while (null != e) { + DictionaryHashTable.Entry next = e.next; + int i = indexFor(e.hash, newCapacity); + e.next = newTable[i]; + newTable[i] = e; + e = next; + } + } + } + + /** + * Returns the number of mappings in this Map. + */ + public int size() { + return size; + } + + /** + * Removes all elements from this map, leaving it empty. + */ + public void clear() { + size = 0; + for (int i = 0; i < table.length; i++) { + table[i] = null; + } + } + + /** + * Class to keep dictionary index data within hash table. + */ + static class Entry { + //dictionary index + int index; + DictionaryHashTable.Entry next; + int hash; + + Entry(int hash, int index, DictionaryHashTable.Entry next) { + this.index = index; + this.hash = hash; + this.next = next; + } + + public final int getIndex() { + return this.index; + } + + public final boolean equals(Object o) { + if (!(o instanceof DictionaryEncodeHashMap.Entry)) { + return false; + } + DictionaryHashTable.Entry e = (DictionaryHashTable.Entry) o; + if (Objects.equals(index, e.getIndex())) { + return true; + } + return false; + } + } +} From c89608b7ac5abf8566bded5a95c6e79b21726e11 Mon Sep 17 00:00:00 2001 From: tianchen Date: Sat, 13 Jul 2019 12:21:51 +0800 Subject: [PATCH 2/6] fix --- .../arrow/vector/BaseFixedWidthVector.java | 30 ++++++++++++++++ .../arrow/vector/BaseVariableWidthVector.java | 29 +++++++++++++++ .../org/apache/arrow/vector/ValueVector.java | 14 +++++--- .../apache/arrow/vector/VarCharVector.java | 16 --------- .../vector/util/ByteFunctionHelpers.java | 36 +++++++++++++++++++ .../arrow/vector/TestDictionaryVector.java | 3 ++ 6 files changed, 107 insertions(+), 21 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java index 8feca751adf..b0d716ae413 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -25,6 +25,7 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.ByteFunctionHelpers; import org.apache.arrow.vector.util.CallBack; import org.apache.arrow.vector.util.OversizedAllocationException; import org.apache.arrow.vector.util.TransferPair; @@ -837,4 +838,33 @@ public void copyFromSafe(int fromIndex, int thisIndex, BaseFixedWidthVector from handleSafe(thisIndex); copyFrom(fromIndex, thisIndex, from); } + + @Override + public int hashCode(int index) { + int start = typeWidth * index; + int end = typeWidth * (index + 1); + return ByteFunctionHelpers.hash(this.getDataBuffer(), start, end); + } + + @Override + public boolean equals(int index, ValueVector to, int toIndex) { + if (to == null) { + return false; + } + if (this.getClass() != to.getClass()) { + return false; + } + + BaseFixedWidthVector that = (BaseFixedWidthVector) to; + + int leftStart = typeWidth * index; + int leftEnd = typeWidth * (index + 1); + + int rightStart = typeWidth * toIndex; + int rightEnd = typeWidth * (toIndex + 1); + + int ret = ByteFunctionHelpers.equal(this.getDataBuffer(), leftStart, leftEnd, + that.getDataBuffer(), rightStart, rightEnd); + return ret == 1; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 5262f339e22..e82b1f3c3ea 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -27,6 +27,7 @@ import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.ByteFunctionHelpers; import org.apache.arrow.vector.util.CallBack; import org.apache.arrow.vector.util.OversizedAllocationException; import org.apache.arrow.vector.util.TransferPair; @@ -1334,4 +1335,32 @@ public void copyFromSafe(int fromIndex, int thisIndex, BaseVariableWidthVector f } lastSet = thisIndex; } + + @Override + public int hashCode(int index) { + final int start = getStartOffset(index); + final int end = getStartOffset(index + 1); + return ByteFunctionHelpers.hash(this.getDataBuffer(), start, end); + } + + @Override + public boolean equals(int index, ValueVector to, int toIndex) { + if (to == null) { + return false; + } + if (this.getClass() != to.getClass()) { + return false; + } + + BaseVariableWidthVector that = (BaseVariableWidthVector) to; + + final int leftStart = getStartOffset(index); + final int leftEnd = getStartOffset(index + 1); + + final int rightStart = that.getStartOffset(toIndex); + final int rightEnd = that.getStartOffset(toIndex + 1); + + int ret = ByteFunctionHelpers.equal(this.getDataBuffer(), leftStart, leftEnd, that.getDataBuffer(), rightStart, rightEnd); + return ret == 1; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java index 53d8b9460f0..ffd8286cd16 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -238,9 +238,13 @@ public interface ValueVector extends Closeable, Iterable { */ boolean isNull(int index); - //TODO remove default - default int hashCode(int index) {return 0;} - - //TODO remove default - default boolean equals(int index, ValueVector to, int toIndex) {return false;} + //TODO remove default and implement in subclasses + default int hashCode(int index) { + return 0; + } + + //TODO remove default and implement in subclasses + default boolean equals(int index, ValueVector to, int toIndex) { + return false; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java index c66f8ad1b2d..76c89e7faa3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java @@ -273,22 +273,6 @@ public void setSafe(int index, Text text) { setSafe(index, text.getBytes(), 0, text.getLength()); } - @Override - public int hashCode (int index) { - //TODO cal hashCode in memory level - byte[] values = get(index); - return Arrays.hashCode(values); - } - - @Override - public boolean equals(int index, ValueVector to, int toIndex) { - //TODO compare value in memory level - String value1 = this.getObject(index).toString(); - String value2 = to.getObject(toIndex).toString(); - return value1.equals(value2); - } - - /*----------------------------------------------------------------* | | | vector transfer | diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java index fba32b6e732..9b0fecedc52 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java @@ -49,6 +49,42 @@ public static final int equal(final ArrowBuf left, int lStart, int lEnd, final A return memEqual(left.memoryAddress(), lStart, lEnd, right.memoryAddress(), rStart, rEnd); } + public static final int hash(final ArrowBuf buf, int start, int end) { + long addr = buf.memoryAddress(); + int len = end - start; + long pos = addr + start; + + int hash = 0; + + while (len > 7) { + long value = PlatformDependent.getLong(pos); + hash = comebineHash(hash, Long.hashCode(value)); + + pos += 8; + len -= 8; + } + + while (len > 3) { + int value = PlatformDependent.getInt(pos); + hash = comebineHash(hash, Integer.hashCode(value)); + + pos += 4; + len -= 4; + } + + while (len-- != 0) { + byte value = PlatformDependent.getByte(pos); + hash = comebineHash(hash, Byte.hashCode(value)); + pos ++; + } + + return hash; + } + + private static int comebineHash(int currentHash, int newHash) { + return currentHash * 31 + newHash; + } + private static int memEqual(final long laddr, int lStart, int lEnd, final long raddr, int rStart, final int rEnd) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java index 0d2bce9f3f1..3c26be08eeb 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java @@ -37,6 +37,7 @@ import org.apache.arrow.vector.types.pojo.FieldType; import org.junit.After; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; public class TestDictionaryVector { @@ -161,6 +162,7 @@ private void writeListVector(UnionListWriter writer, int[] values) { } @Test + @Ignore public void testEncodeList() { // Create a new value vector try (final ListVector vector = ListVector.empty("vector", allocator); @@ -222,6 +224,7 @@ private void writeStructVector(NullableStructWriter writer, int value1, long val } @Test + @Ignore public void testEncodeStruct() { // Create a new value vector try (final StructVector vector = StructVector.empty("vector", allocator); From 7a87526e35808e09068c984a5ffb4a055fc1e682 Mon Sep 17 00:00:00 2001 From: tianchen Date: Sat, 13 Jul 2019 14:49:07 +0800 Subject: [PATCH 3/6] implementation of equals and hashCode --- .../main/codegen/templates/UnionVector.java | 51 ++++++++++++++----- .../org/apache/arrow/vector/ValueVector.java | 8 +-- .../org/apache/arrow/vector/ZeroVector.java | 10 ++++ .../vector/complex/FixedSizeListVector.java | 36 +++++++++++-- .../arrow/vector/complex/ListVector.java | 42 +++++++++++++++ .../complex/NonNullableStructVector.java | 19 +++++++ .../arrow/vector/complex/StructVector.java | 10 ++++ .../dictionary/DictionaryHashTable.java | 17 +++++++ .../vector/types/pojo/TestExtensionType.java | 13 +++-- 9 files changed, 179 insertions(+), 27 deletions(-) diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index 04eed725379..b05005dad6a 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -17,6 +17,7 @@ import io.netty.buffer.ArrowBuf; import org.apache.arrow.memory.ReferenceManager; +import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.types.pojo.FieldType; <@pp.dropOutputFile /> @@ -491,32 +492,35 @@ public Iterator iterator() { return vectors.iterator(); } - - public Object getObject(int index) { + private ValueVector getVector(int index) { int type = typeBuffer.getByte(index * TYPE_WIDTH); switch (MinorType.values()[type]) { - case NULL: - return null; + case NULL: + return null; <#list vv.types as type> <#list type.minor as minor> <#assign name = minor.class?cap_first /> <#assign fields = minor.fields!type.fields /> <#assign uncappedName = name?uncap_first/> <#if !minor.typeParams?? > - case ${name?upper_case}: - return get${name}Vector().getObject(index); + case ${name?upper_case}: + return get${name}Vector(); - case STRUCT: - return getStruct().getObject(index); - case LIST: - return getList().getObject(index); - default: - throw new UnsupportedOperationException("Cannot support type: " + MinorType.values()[type]); + case STRUCT: + return getStruct(); + case LIST: + return getList(); + default: + throw new UnsupportedOperationException("Cannot support type: " + MinorType.values()[type]); } } + public Object getObject(int index) { + return getVector(index).getObject(index); + } + public byte[] get(int index) { return null; } @@ -622,4 +626,27 @@ public void setType(int index, MinorType type) { private int getTypeBufferValueCapacity() { return typeBuffer.capacity() / TYPE_WIDTH; } + + @Override + public int hashCode(int index) { + return getVector(index).hashCode(index); + } + + @Override + public boolean equals(int index, ValueVector to, int toIndex) { + if (to == null) { + return false; + } + if (this.getClass() != to.getClass()) { + return false; + } + UnionVector that = (UnionVector) to; + ValueVector leftVector = getVector(index); + ValueVector rightVector = that.getVector(toIndex); + + if (leftVector.getClass() != rightVector.getClass()) { + return false; + } + return leftVector.equals(index, rightVector, toIndex); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java index ffd8286cd16..8aad68c569a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -239,12 +239,8 @@ public interface ValueVector extends Closeable, Iterable { boolean isNull(int index); //TODO remove default and implement in subclasses - default int hashCode(int index) { - return 0; - } + int hashCode(int index); //TODO remove default and implement in subclasses - default boolean equals(int index, ValueVector to, int toIndex) { - return false; - } + boolean equals(int index, ValueVector to, int toIndex); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java index 37784ede1d1..b34aa4d6f15 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java @@ -244,4 +244,14 @@ public int getNullCount() { public boolean isNull(int index) { return false; } + + @Override + public int hashCode(int index) { + return 0; + } + + @Override + public boolean equals(int index, ValueVector to, int toIndex) { + return false; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java index f62ec59b76a..4560103b576 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java @@ -45,11 +45,7 @@ import org.apache.arrow.vector.types.pojo.DictionaryEncoding; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; -import org.apache.arrow.vector.util.CallBack; -import org.apache.arrow.vector.util.JsonStringArrayList; -import org.apache.arrow.vector.util.OversizedAllocationException; -import org.apache.arrow.vector.util.SchemaChangeRuntimeException; -import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.util.*; import io.netty.buffer.ArrowBuf; @@ -496,6 +492,36 @@ public TransferPair makeTransferPair(ValueVector target) { return new TransferImpl((FixedSizeListVector) target); } + @Override + public int hashCode(int index){ + if (isSet(index) == 0) { + return 0; + } + int hash = 0; + for (int i = 0; i < listSize; i++) { + hash = 31 * vector.hashCode(index * listSize + i); + } + return hash; + } + + public boolean equals(int index, ValueVector to, int toIndex) { + if (to == null) { + return false; + } + if (this.getClass() != to.getClass()) { + return false; + } + + FixedSizeListVector that = (FixedSizeListVector) to; + + for (int i = 0; i < listSize; i++) { + if (!vector.equals(index * listSize + i, that, toIndex * listSize + i)) { + return false; + } + } + return true; + } + private class TransferImpl implements TransferPair { FixedSizeListVector to; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index af5333c3e56..c3eda183033 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -412,6 +412,48 @@ public ArrowBuf getOffsetBuffer() { return offsetBuffer; } + @Override + public int hashCode(int index){ + if (isSet(index) == 0) { + return 0; + } + int hash = 0; + final int start = offsetBuffer.getInt(index * OFFSET_WIDTH); + final int end = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH); + for (int i = start; i < end; i++) { + hash = 31 * vector.hashCode(i); + } + return hash; + } + + public boolean equals(int index, ValueVector to, int toIndex) { + if (to == null) { + return false; + } + if (this.getClass() != to.getClass()) { + return false; + } + + ListVector that = (ListVector) to; + final int leftStart = offsetBuffer.getInt(index * OFFSET_WIDTH); + final int leftEnd = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH); + + final int rightStart = that.offsetBuffer.getInt(toIndex * OFFSET_WIDTH); + final int rightEnd = that.offsetBuffer.getInt((toIndex + 1) * OFFSET_WIDTH); + + if ((leftEnd - leftStart) != (rightEnd - rightStart)) { + return false; + } + + for (int i = 0; i < (leftEnd - leftStart); i++) { + if (!vector.equals(leftStart + i, that, rightStart + i)) { + return false; + } + } + + return true; + } + private class TransferImpl implements TransferPair { ListVector to; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java index 1ca315a843c..dc2557b3661 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java @@ -286,6 +286,24 @@ public Object getObject(int index) { return vv; } + @Override + public int hashCode(int index) { + int hash = 0; + for (String child : getChildFieldNames()) { + ValueVector v = getChild(child); + if (v != null && index < v.getValueCount()) { + hash += 31 * hash + v.hashCode(index); + } + } + return hash; + } + + @Override + public boolean equals(int index, ValueVector to, int toIndex) { + //TODO + return false; + } + @Override public boolean isNull(int index) { return false; @@ -372,4 +390,5 @@ public void initializeChildrenFromFields(List children) { public List getChildrenFromFields() { return getChildren(); } + } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java index a07e0d2f985..2ad9fb75091 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java @@ -479,6 +479,15 @@ public Object getObject(int index) { } } + @Override + public int hashCode(int index) { + if (isSet(index) == 0) { + return 0; + } else { + return super.hashCode(index); + } + } + @Override public void get(int index, ComplexHolder holder) { holder.isSet = isSet(index); @@ -546,4 +555,5 @@ public void setValueCount(int valueCount) { super.setValueCount(valueCount); this.valueCount = valueCount; } + } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java index 38a3e543656..7069971634d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.arrow.vector.dictionary; import org.apache.arrow.vector.ValueVector; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java index 20d270c8988..969915ca6e5 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java @@ -31,10 +31,7 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.ExtensionTypeVector; -import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.FixedSizeBinaryVector; -import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.*; import org.apache.arrow.vector.ipc.ArrowFileReader; import org.apache.arrow.vector.ipc.ArrowFileWriter; import org.apache.arrow.vector.types.pojo.ArrowType.ExtensionType; @@ -204,6 +201,14 @@ public UUID getObject(int index) { return new UUID(bb.getLong(), bb.getLong()); } + @Override public int hashCode(int index) { + return 0; + } + + @Override public boolean equals(int index, ValueVector to, int toIndex) { + return false; + } + public void set(int index, UUID uuid) { ByteBuffer bb = ByteBuffer.allocate(16); bb.putLong(uuid.getMostSignificantBits()); From 175192a38e003dbde3e471b0a981453912c9a23e Mon Sep 17 00:00:00 2001 From: tianchen Date: Sat, 13 Jul 2019 16:39:36 +0800 Subject: [PATCH 4/6] fix test and style --- .../arrow/vector/BaseVariableWidthVector.java | 3 +- .../apache/arrow/vector/VarCharVector.java | 3 -- .../vector/complex/FixedSizeListVector.java | 9 +++-- .../arrow/vector/complex/ListVector.java | 5 +-- .../complex/NonNullableStructVector.java | 36 +++++++++++++++++-- .../vector/dictionary/DictionaryEncoder.java | 2 +- .../dictionary/DictionaryHashTable.java | 12 +++++-- .../vector/util/ByteFunctionHelpers.java | 3 ++ .../arrow/vector/TestDictionaryVector.java | 3 -- .../vector/types/pojo/TestExtensionType.java | 6 +++- 10 files changed, 65 insertions(+), 17 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index e82b1f3c3ea..19fcc67e174 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -1360,7 +1360,8 @@ public boolean equals(int index, ValueVector to, int toIndex) { final int rightStart = that.getStartOffset(toIndex); final int rightEnd = that.getStartOffset(toIndex + 1); - int ret = ByteFunctionHelpers.equal(this.getDataBuffer(), leftStart, leftEnd, that.getDataBuffer(), rightStart, rightEnd); + int ret = ByteFunctionHelpers.equal(this.getDataBuffer(), leftStart, leftEnd, + that.getDataBuffer(), rightStart, rightEnd); return ret == 1; } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java index 76c89e7faa3..7a914d90e3f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java @@ -30,9 +30,6 @@ import org.apache.arrow.vector.util.Text; import org.apache.arrow.vector.util.TransferPair; -import java.util.Arrays; -import java.util.Objects; - /** * VarCharVector implements a variable width vector of VARCHAR * values which could be NULL. A validity buffer (bit vector) is maintained diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java index 4560103b576..b50e6d8abd2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java @@ -45,7 +45,11 @@ import org.apache.arrow.vector.types.pojo.DictionaryEncoding; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; -import org.apache.arrow.vector.util.*; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.JsonStringArrayList; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.SchemaChangeRuntimeException; +import org.apache.arrow.vector.util.TransferPair; import io.netty.buffer.ArrowBuf; @@ -493,7 +497,7 @@ public TransferPair makeTransferPair(ValueVector target) { } @Override - public int hashCode(int index){ + public int hashCode(int index) { if (isSet(index) == 0) { return 0; } @@ -504,6 +508,7 @@ public int hashCode(int index){ return hash; } + @Override public boolean equals(int index, ValueVector to, int toIndex) { if (to == null) { return false; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index c3eda183033..4c4f0568706 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -413,7 +413,7 @@ public ArrowBuf getOffsetBuffer() { } @Override - public int hashCode(int index){ + public int hashCode(int index) { if (isSet(index) == 0) { return 0; } @@ -426,6 +426,7 @@ public int hashCode(int index){ return hash; } + @Override public boolean equals(int index, ValueVector to, int toIndex) { if (to == null) { return false; @@ -446,7 +447,7 @@ public boolean equals(int index, ValueVector to, int toIndex) { } for (int i = 0; i < (leftEnd - leftStart); i++) { - if (!vector.equals(leftStart + i, that, rightStart + i)) { + if (!vector.equals(leftStart + i, that.vector, rightStart + i)) { return false; } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java index dc2557b3661..1d9b8713697 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java @@ -300,8 +300,40 @@ public int hashCode(int index) { @Override public boolean equals(int index, ValueVector to, int toIndex) { - //TODO - return false; + if (to == null) { + return false; + } + if (this.getClass() != to.getClass()) { + return false; + } + NonNullableStructVector that = (NonNullableStructVector) to; + List leftChildrens = new ArrayList<>(); + List rightChildrens = new ArrayList<>(); + + for (String child : getChildFieldNames()) { + ValueVector v = getChild(child); + if (v != null) { + leftChildrens.add(v); + } + } + + for (String child : that.getChildFieldNames()) { + ValueVector v = that.getChild(child); + if (v != null) { + rightChildrens.add(v); + } + } + + if (leftChildrens.size() != rightChildrens.size()) { + return false; + } + + for (int i = 0; i < leftChildrens.size(); i++) { + if (!leftChildrens.get(i).equals(index, rightChildrens.get(i), toIndex)) { + return false; + } + } + return true; } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java index 53bd9670b23..b926785c5c9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -43,7 +43,7 @@ public class DictionaryEncoder { */ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { validateType(vector.getMinorType()); - // load dictionary values into a hashmap for lookup + // load dictionary indices into a hashmap for lookup DictionaryHashTable hashTable = new DictionaryHashTable(dictionary.getVector(), vector); for (int i = 0; i < dictionary.getVector().getValueCount(); i++) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java index 7069971634d..be2fd02c67e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java @@ -17,10 +17,10 @@ package org.apache.arrow.vector.dictionary; -import org.apache.arrow.vector.ValueVector; - import java.util.Objects; +import org.apache.arrow.vector.ValueVector; + /** * HashTable used for Dictionary encoding. It holds two vectors (the vector to encode and dictionary vector) * It stores the index in dictionary vector and for a given index in encode vector, @@ -127,6 +127,11 @@ static final int roundUpToPowerOf2(int size) { return (n < 0) ? 1 : (n >= MAXIMUM_CAPACITY) ? MAXIMUM_CAPACITY : n + 1; } + /** + * get the corresponding dictionary index with the given index in vector which to encode. + * @param indexInArray index in vector. + * @return dictionary vector index or -1 if no value equals. + */ public int getIndex(int indexInArray) { int hash = toEncode.hashCode(indexInArray); int index = indexFor(hash, table.length); @@ -141,6 +146,9 @@ public int getIndex(int indexInArray) { return NULL_VALUE; } + /** + * put the index of dictionary vector to build hash table. + */ public void put(int indexInDictionary) { if (table == EMPTY_TABLE) { inflateTable(threshold); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java index 9b0fecedc52..5b429a2ee33 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java @@ -49,6 +49,9 @@ public static final int equal(final ArrowBuf left, int lStart, int lEnd, final A return memEqual(left.memoryAddress(), lStart, lEnd, right.memoryAddress(), rStart, rEnd); } + /** + * Compute hashCode with the given {@link ArrowBuf} and start/end index. + */ public static final int hash(final ArrowBuf buf, int start, int end) { long addr = buf.memoryAddress(); int len = end - start; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java index 3c26be08eeb..0d2bce9f3f1 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java @@ -37,7 +37,6 @@ import org.apache.arrow.vector.types.pojo.FieldType; import org.junit.After; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; public class TestDictionaryVector { @@ -162,7 +161,6 @@ private void writeListVector(UnionListWriter writer, int[] values) { } @Test - @Ignore public void testEncodeList() { // Create a new value vector try (final ListVector vector = ListVector.empty("vector", allocator); @@ -224,7 +222,6 @@ private void writeStructVector(NullableStructWriter writer, int value1, long val } @Test - @Ignore public void testEncodeStruct() { // Create a new value vector try (final StructVector vector = StructVector.empty("vector", allocator); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java index 969915ca6e5..d0fbee9a341 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java @@ -31,7 +31,11 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.*; +import org.apache.arrow.vector.ExtensionTypeVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.ipc.ArrowFileReader; import org.apache.arrow.vector.ipc.ArrowFileWriter; import org.apache.arrow.vector.types.pojo.ArrowType.ExtensionType; From 5facc2a51dedef0432a3570f5fc32c365f1fce39 Mon Sep 17 00:00:00 2001 From: tianchen Date: Tue, 16 Jul 2019 15:52:44 +0800 Subject: [PATCH 5/6] resolve comments --- .../java/org/apache/arrow/vector/ValueVector.java | 14 +++++++++++--- .../arrow/vector/complex/FixedSizeListVector.java | 3 ++- .../arrow/vector/dictionary/DictionaryEncoder.java | 4 ++-- .../vector/dictionary/DictionaryHashTable.java | 11 ++++------- .../arrow/vector/util/ByteFunctionHelpers.java | 5 ++++- .../arrow/vector/types/pojo/TestExtensionType.java | 10 ++++++---- 6 files changed, 29 insertions(+), 18 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java index 8aad68c569a..795493a4127 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -238,9 +238,17 @@ public interface ValueVector extends Closeable, Iterable { */ boolean isNull(int index); - //TODO remove default and implement in subclasses + /** + * Returns hashCode of element in index. + */ int hashCode(int index); - //TODO remove default and implement in subclasses - boolean equals(int index, ValueVector to, int toIndex); + /** + * Check whether the element in index equals to the element in targetIndex from the target vector. + * @param index index to compare in this vector + * @param target target vector + * @param targetIndex index to compare in target vector + * @return true if equals, otherwise false. + */ + boolean equals(int index, ValueVector target, int targetIndex); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java index b50e6d8abd2..872e16a1847 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java @@ -45,6 +45,7 @@ import org.apache.arrow.vector.types.pojo.DictionaryEncoding; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.ByteFunctionHelpers; import org.apache.arrow.vector.util.CallBack; import org.apache.arrow.vector.util.JsonStringArrayList; import org.apache.arrow.vector.util.OversizedAllocationException; @@ -503,7 +504,7 @@ public int hashCode(int index) { } int hash = 0; for (int i = 0; i < listSize; i++) { - hash = 31 * vector.hashCode(index * listSize + i); + hash = ByteFunctionHelpers.comebineHash(hash, vector.hashCode(index * listSize + i)); } return hash; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java index b926785c5c9..ed69df3cf40 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -45,7 +45,7 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { validateType(vector.getMinorType()); // load dictionary indices into a hashmap for lookup - DictionaryHashTable hashTable = new DictionaryHashTable(dictionary.getVector(), vector); + DictionaryHashTable hashTable = new DictionaryHashTable(dictionary.getVector()); for (int i = 0; i < dictionary.getVector().getValueCount(); i++) { hashTable.put(i); } @@ -71,7 +71,7 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { if (!vector.isNull(i)) { // if it's null leave it null // note: this may fail if value was not included in the dictionary //int encoded = lookUps.get(value); - int encoded = hashTable.getIndex(i); + int encoded = hashTable.getIndex(i, vector); if (encoded == -1) { throw new IllegalArgumentException("Dictionary encoding not defined for value:" + vector.getObject(i)); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java index be2fd02c67e..bf0c788d5b2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java @@ -74,12 +74,10 @@ public class DictionaryHashTable { private final ValueVector dictionary; - private final ValueVector toEncode; - /** * Constructs an empty map with the specified initial capacity and load factor. */ - public DictionaryHashTable(int initialCapacity, ValueVector dictionary, ValueVector toEncode) { + public DictionaryHashTable(int initialCapacity, ValueVector dictionary) { if (initialCapacity < 0) { throw new IllegalArgumentException("Illegal initial capacity: " + initialCapacity); @@ -91,11 +89,10 @@ public DictionaryHashTable(int initialCapacity, ValueVector dictionary, ValueVec this.threshold = initialCapacity; this.dictionary = dictionary; - this.toEncode = toEncode; } - public DictionaryHashTable(ValueVector dictionary, ValueVector toEncode) { - this(DEFAULT_INITIAL_CAPACITY, dictionary, toEncode); + public DictionaryHashTable(ValueVector dictionary) { + this(DEFAULT_INITIAL_CAPACITY, dictionary); } /** @@ -132,7 +129,7 @@ static final int roundUpToPowerOf2(int size) { * @param indexInArray index in vector. * @return dictionary vector index or -1 if no value equals. */ - public int getIndex(int indexInArray) { + public int getIndex(int indexInArray, ValueVector toEncode) { int hash = toEncode.hashCode(indexInArray); int index = indexFor(hash, table.length); for (DictionaryHashTable.Entry e = table[index]; e != null ; e = e.next) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java index 5b429a2ee33..05b6720b093 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java @@ -84,7 +84,10 @@ public static final int hash(final ArrowBuf buf, int start, int end) { return hash; } - private static int comebineHash(int currentHash, int newHash) { + /** + * Generate a new hashCode with the given current hashCode and new hashCode. + */ + public static int comebineHash(int currentHash, int newHash) { return currentHash * 31 + newHash; } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java index d0fbee9a341..792bd29903b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java @@ -205,12 +205,14 @@ public UUID getObject(int index) { return new UUID(bb.getLong(), bb.getLong()); } - @Override public int hashCode(int index) { - return 0; + @Override + public int hashCode(int index) { + return getUnderlyingVector().hashCode(index); } - @Override public boolean equals(int index, ValueVector to, int toIndex) { - return false; + @Override + public boolean equals(int index, ValueVector to, int toIndex) { + return getUnderlyingVector().equals(index, to, toIndex); } public void set(int index, UUID uuid) { From 2db7302681b783aeef176af2fdd7e8a432af06bf Mon Sep 17 00:00:00 2001 From: tianchen Date: Thu, 18 Jul 2019 16:19:43 +0800 Subject: [PATCH 6/6] fix --- .../org/apache/arrow/vector/util/ByteFunctionHelpers.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java index 05b6720b093..8dbdc4987ce 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java @@ -69,7 +69,7 @@ public static final int hash(final ArrowBuf buf, int start, int end) { while (len > 3) { int value = PlatformDependent.getInt(pos); - hash = comebineHash(hash, Integer.hashCode(value)); + hash = comebineHash(hash, value); pos += 4; len -= 4; @@ -77,7 +77,7 @@ public static final int hash(final ArrowBuf buf, int start, int end) { while (len-- != 0) { byte value = PlatformDependent.getByte(pos); - hash = comebineHash(hash, Byte.hashCode(value)); + hash = comebineHash(hash, value); pos ++; }