From 98f4c559375077a04004d11bc64eb13bb78dc020 Mon Sep 17 00:00:00 2001 From: tianchen Date: Mon, 1 Jul 2019 11:40:49 +0800 Subject: [PATCH 1/5] init --- .../arrow/memory/DictionaryEncodeHashMap.java | 273 ++++++++++++++++++ .../DictionaryEncodeHashMapBenchmarks.java | 86 ++++++ 2 files changed, 359 insertions(+) create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java create mode 100644 java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java diff --git a/java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java b/java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java new file mode 100644 index 00000000000..85cc066d1e6 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.memory; + +import java.util.*; + +/** + * Created by niki.lj on 2019/6/29. 
+ */ +public class DictionaryEncodeHashMap { + + static final int DEFAULT_INITIAL_CAPACITY = 1 << 4; + + static final int MAXIMUM_CAPACITY = 1 << 30; + + static final float DEFAULT_LOAD_FACTOR = 0.75f; + + static final Entry[] EMPTY_TABLE = {}; + + transient Entry[] table = (Entry[]) EMPTY_TABLE; + + transient int size; + + int threshold; + + final float loadFactor; + + transient int modCount; + + public DictionaryEncodeHashMap(int initialCapacity, float loadFactor) { + if (initialCapacity < 0) + throw new IllegalArgumentException("Illegal initial capacity: " + + initialCapacity); + if (initialCapacity > MAXIMUM_CAPACITY) + initialCapacity = MAXIMUM_CAPACITY; + if (loadFactor <= 0 || Float.isNaN(loadFactor)) + throw new IllegalArgumentException("Illegal load factor: " + + loadFactor); + this.loadFactor = loadFactor; + this.threshold = initialCapacity; + } + + public DictionaryEncodeHashMap(int initialCapacity) { + this(initialCapacity, DEFAULT_LOAD_FACTOR); + } + + public DictionaryEncodeHashMap() { + this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR); + } + + private void inflateTable(int threshold){ + int capacity = roundUpToPowerOf2(threshold); + this.threshold = (int) Math.min(capacity * loadFactor, MAXIMUM_CAPACITY + 1); + table = new Entry[capacity]; + } + + private static void set(long value) { + System.out.println(value); + } + + static final int hash(Object key) { + int h; + return (key == null) ? 0 : (h = key.hashCode()) ^ (h >>> 16); + } + + static int indexFor(int h, int length) { + return h & (length-1); + } + + + static final int roundUpToPowerOf2(int size) { + int n = size - 1; + n |= n >>> 1; + n |= n >>> 2; + n |= n >>> 4; + n |= n >>> 8; + n |= n >>> 16; + return (n < 0) ? 1 : (n >= MAXIMUM_CAPACITY) ? 
MAXIMUM_CAPACITY : n + 1; + } + + void createEntry(int hash, K key, int value, int bucketIndex) { + Entry e = table[bucketIndex]; + table[bucketIndex] = new Entry<>(hash, key, value, e); + size++; + } + + public int get(K key) { + int hash = hash(key); + int index = indexFor(hash, table.length); + for (Entry e = table[index] ; e != null ; e = e.next) { + if ((e.hash == hash) && e.key.equals(key)) { + return e.value; + } + } + return -1; + } + + public int put(K key, int value) { + if (table == EMPTY_TABLE) { + inflateTable(threshold); + } + + if (key == null) { + return putForNullKey(value); + } + + int hash = hash(key); + int i = indexFor(hash, table.length); + for (Entry e = table[i]; e != null; e = e.next) { + Object k; + if (e.hash == hash && ((k = e.key) == key || key.equals(k))) { + int oldValue = e.value; + e.value = value; + return oldValue; + } + } + + modCount++; + addEntry(hash, key, value, i); + return -1; + } + + private int putForNullKey(int value) { + for (Entry e = table[0]; e != null; e = e.next) { + if (e.key == null) { + int oldValue = e.value; + e.value = value; + return oldValue; + } + } + modCount++; + addEntry(0, null, value, 0); + return -1; + } + + void addEntry(int hash, K key, int value, int bucketIndex) { + if ((size >= threshold) && (null != table[bucketIndex])) { + resize(2 * table.length); + hash = (null != key) ? 
hash(key) : 0; + bucketIndex = indexFor(hash, table.length); + } + + createEntry(hash, key, value, bucketIndex); + } + + void resize(int newCapacity) { + Entry[] oldTable = table; + int oldCapacity = oldTable.length; + if (oldCapacity == MAXIMUM_CAPACITY) { + threshold = Integer.MAX_VALUE; + return; + } + + Entry[] newTable = new Entry[newCapacity]; + transfer(newTable); + table = newTable; + threshold = (int)Math.min(newCapacity * loadFactor, MAXIMUM_CAPACITY + 1); + } + + void transfer(Entry[] newTable) { + int newCapacity = newTable.length; + for (Entry e : table) { + while(null != e) { + Entry next = e.next; + int i = indexFor(e.hash, newCapacity); + e.next = newTable[i]; + newTable[i] = e; + e = next; + } + } + } + + public int remove(K key) { + Entry e = removeEntryForKey(key); + return (e == null ? -1 : e.value); + } + + final Entry removeEntryForKey(Object key) { + if (size == 0) { + return null; + } + int hash = (key == null) ? 0 : hash(key); + int i = indexFor(hash, table.length); + Entry prev = table[i]; + Entry e = prev; + + while (e != null) { + Entry next = e.next; + Object k; + if (e.hash == hash && ((k = e.key) == key || (key != null && key.equals(k)))) { + modCount++; + size--; + if (prev == e) { + table[i] = next; + } else { + prev.next = next; + } + + return e; + } + prev = e; + e = next; + } + + return e; + } + + + static class Entry { + final K key; + int value; + Entry next; + int hash; + + Entry(int hash, K key, int value, Entry next) { + this.key = key; + this.value = value; + this.hash = hash; + this.next = next; + } + + public final K getKey() { + return key; + } + + public final int getValue() { + return value; + } + + public final int setValue(int newValue) { + int oldValue = value; + value = newValue; + return oldValue; + } + + public final boolean equals(Object o) { + if (!(o instanceof DictionaryEncodeHashMap.Entry)) { + return false; + } + Entry e = (Entry) o; + if (Objects.equals(key, e.getKey())) { + if (value == e.getValue()) { + 
return true; + } + } + return false; + } + + public final int hashCode() { + return Objects.hashCode(key) ^ Objects.hashCode(value); + } + + public final String toString() { + return getKey() + "=" + getValue(); + } + } + +} diff --git a/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java new file mode 100644 index 00000000000..ec1ea3f3fb2 --- /dev/null +++ b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.dictionary; + +import org.apache.arrow.memory.DictionaryEncodeHashMap; +import org.apache.arrow.vector.BaseValueVectorBenchmarks; +import org.junit.Test; +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +import java.util.HashMap; +import java.util.concurrent.TimeUnit; + +/** + * Benchmarks for {@link org.apache.arrow.memory.DictionaryEncodeHashMap}. + */ +@State(Scope.Benchmark) +public class DictionaryEncodeHashMapBenchmarks { + private static final int SIZE = 1000; + + private HashMap hashMap; + private DictionaryEncodeHashMap dictionaryEncodeHashMap; + + /** + * Setup benchmarks. + */ + @Setup + public void prepare() { + hashMap = new HashMap(); + dictionaryEncodeHashMap = new DictionaryEncodeHashMap(); + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + public int testHashMap() { + for (int i = 0; i < SIZE; i++) { + hashMap.put("test" + i, i); + } + for (int i = 0; i < SIZE; i++) { + hashMap.get("test" + i); + } + return 0; + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + public int testDictionaryEncodeHashMap() { + for (int i = 0; i < SIZE; i++) { + dictionaryEncodeHashMap.put("test" + i, i); + } + for (int i = 0; i < SIZE; i++) { + dictionaryEncodeHashMap.get("test" + i); + } + return 0; + } + + @Test + public void evaluate() throws RunnerException { + Options opt = new OptionsBuilder() + .include(DictionaryEncodeHashMapBenchmarks.class.getSimpleName()) + .forks(1) + .build(); + + new Runner(opt).run(); + } +} From 86eb350b3198def217ef620085fa3b27ac24db4f Mon Sep 17 00:00:00 2001 From: tianchen Date: Mon, 1 Jul 2019 14:53:34 +0800 Subject: [PATCH 2/5] add interface --- .../arrow/memory/DictionaryEncodeHashMap.java | 63 +++++++++++-------- 
.../org/apache/arrow/memory/ObjectIntMap.java | 30 +++++++++ .../DictionaryEncodeHashMapBenchmarks.java | 52 ++++++++++++--- 3 files changed, 109 insertions(+), 36 deletions(-) create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/ObjectIntMap.java diff --git a/java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java b/java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java index 85cc066d1e6..0f8acbade48 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java @@ -17,12 +17,15 @@ package org.apache.arrow.memory; -import java.util.*; +import java.util.Objects; /** - * Created by niki.lj on 2019/6/29. + * Implementation of the {@link ObjectIntMap} interface, used for DictionaryEncoder. + * Note that value in this map is always not less than 0, and -1 represents a null value. */ -public class DictionaryEncodeHashMap { +public class DictionaryEncodeHashMap implements ObjectIntMap { + + static final int NULL_VALUE = -1; static final int DEFAULT_INITIAL_CAPACITY = 1 << 4; @@ -40,8 +43,6 @@ public class DictionaryEncodeHashMap { final float loadFactor; - transient int modCount; - public DictionaryEncodeHashMap(int initialCapacity, float loadFactor) { if (initialCapacity < 0) throw new IllegalArgumentException("Illegal initial capacity: " + @@ -69,10 +70,6 @@ private void inflateTable(int threshold){ table = new Entry[capacity]; } - private static void set(long value) { - System.out.println(value); - } - static final int hash(Object key) { int h; return (key == null) ? 0 : (h = key.hashCode()) ^ (h >>> 16); @@ -93,12 +90,7 @@ static final int roundUpToPowerOf2(int size) { return (n < 0) ? 1 : (n >= MAXIMUM_CAPACITY) ? 
MAXIMUM_CAPACITY : n + 1; } - void createEntry(int hash, K key, int value, int bucketIndex) { - Entry e = table[bucketIndex]; - table[bucketIndex] = new Entry<>(hash, key, value, e); - size++; - } - + @Override public int get(K key) { int hash = hash(key); int index = indexFor(hash, table.length); @@ -107,9 +99,10 @@ public int get(K key) { return e.value; } } - return -1; + return NULL_VALUE; } + @Override public int put(K key, int value) { if (table == EMPTY_TABLE) { inflateTable(threshold); @@ -130,9 +123,21 @@ public int put(K key, int value) { } } - modCount++; addEntry(hash, key, value, i); - return -1; + return NULL_VALUE; + } + + + @Override + public int remove(K key) { + Entry e = removeEntryForKey(key); + return (e == null ? NULL_VALUE : e.value); + } + + void createEntry(int hash, K key, int value, int bucketIndex) { + Entry e = table[bucketIndex]; + table[bucketIndex] = new Entry<>(hash, key, value, e); + size++; } private int putForNullKey(int value) { @@ -143,9 +148,8 @@ private int putForNullKey(int value) { return oldValue; } } - modCount++; addEntry(0, null, value, 0); - return -1; + return NULL_VALUE; } void addEntry(int hash, K key, int value, int bucketIndex) { @@ -185,11 +189,6 @@ void transfer(Entry[] newTable) { } } - public int remove(K key) { - Entry e = removeEntryForKey(key); - return (e == null ? -1 : e.value); - } - final Entry removeEntryForKey(Object key) { if (size == 0) { return null; @@ -203,7 +202,6 @@ final Entry removeEntryForKey(Object key) { Entry next = e.next; Object k; if (e.hash == hash && ((k = e.key) == key || (key != null && key.equals(k)))) { - modCount++; size--; if (prev == e) { table[i] = next; @@ -220,6 +218,19 @@ final Entry removeEntryForKey(Object key) { return e; } + /** + * Returns the number of mappings in this Map. 
+ */ + public int size() { + return size; + } + + public void clear() { + size = 0; + for (int i = 0; i < table.length; i++) { + table[i] = null; + } + } static class Entry { final K key; diff --git a/java/memory/src/main/java/org/apache/arrow/memory/ObjectIntMap.java b/java/memory/src/main/java/org/apache/arrow/memory/ObjectIntMap.java new file mode 100644 index 00000000000..12c2e935e99 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/ObjectIntMap.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.memory; + +/** + * Specific hash map for int type value, reducing boxing/unboxing operations. 
+ */ +public interface ObjectIntMap { + + int put(K key, int value); + + int get(K key); + + int remove(K key); +} diff --git a/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java index ec1ea3f3fb2..cfb39ac0134 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java +++ b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java @@ -18,15 +18,23 @@ package org.apache.arrow.vector.dictionary; import org.apache.arrow.memory.DictionaryEncodeHashMap; -import org.apache.arrow.vector.BaseValueVectorBenchmarks; import org.junit.Test; -import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.runner.Runner; import org.openjdk.jmh.runner.RunnerException; import org.openjdk.jmh.runner.options.Options; import org.openjdk.jmh.runner.options.OptionsBuilder; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; +import java.util.Random; import java.util.concurrent.TimeUnit; /** @@ -36,40 +44,64 @@ public class DictionaryEncodeHashMapBenchmarks { private static final int SIZE = 1000; - private HashMap hashMap; - private DictionaryEncodeHashMap dictionaryEncodeHashMap; + private static final int KEY_LENGTH = 10; + + private List testData = new ArrayList<>(); + + private HashMap hashMap = new HashMap(); + private DictionaryEncodeHashMap dictionaryEncodeHashMap = new DictionaryEncodeHashMap(); /** * Setup benchmarks. 
*/ @Setup public void prepare() { - hashMap = new HashMap(); - dictionaryEncodeHashMap = new DictionaryEncodeHashMap(); + for (int i = 0; i < SIZE; i++) { + testData.add(getRandomString(KEY_LENGTH)); + } } + private String getRandomString(int length){ + String str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + Random random = new Random(); + StringBuffer sb = new StringBuffer(); + for(int i = 0; i < length; i++){ + int number = random.nextInt(62); + sb.append(str.charAt(number)); + } + return sb.toString(); + } + + /** + * Test set/get int values for {@link HashMap}. + * @return useless. To avoid DCE by JIT. + */ @Benchmark @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.NANOSECONDS) public int testHashMap() { for (int i = 0; i < SIZE; i++) { - hashMap.put("test" + i, i); + hashMap.put(testData.get(i), i); } for (int i = 0; i < SIZE; i++) { - hashMap.get("test" + i); + hashMap.get(testData.get(i)); } return 0; } + /** + * Test set/get int values for {@link DictionaryEncodeHashMap}. + * @return useless. To avoid DCE by JIT. 
+ */ @Benchmark @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.NANOSECONDS) public int testDictionaryEncodeHashMap() { for (int i = 0; i < SIZE; i++) { - dictionaryEncodeHashMap.put("test" + i, i); + dictionaryEncodeHashMap.put(testData.get(i), i); } for (int i = 0; i < SIZE; i++) { - dictionaryEncodeHashMap.get("test" + i); + dictionaryEncodeHashMap.get(testData.get(i)); } return 0; } From 10596ad87b34087697a26c7d7259b6014a62c93e Mon Sep 17 00:00:00 2001 From: tianchen Date: Mon, 1 Jul 2019 15:33:17 +0800 Subject: [PATCH 3/5] fix style --- .../arrow/memory/DictionaryEncodeHashMap.java | 26 ++++++++++++++----- .../org/apache/arrow/memory/ObjectIntMap.java | 1 + .../DictionaryEncodeHashMapBenchmarks.java | 16 ++++++------ 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java b/java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java index 0f8acbade48..bac32bad61f 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java @@ -22,6 +22,7 @@ /** * Implementation of the {@link ObjectIntMap} interface, used for DictionaryEncoder. * Note that value in this map is always not less than 0, and -1 represents a null value. + * @param key type. */ public class DictionaryEncodeHashMap implements ObjectIntMap { @@ -43,15 +44,21 @@ public class DictionaryEncodeHashMap implements ObjectIntMap { final float loadFactor; + /** + * Constructs an empty map with the specified initial capacity and load factor. 
+ */ public DictionaryEncodeHashMap(int initialCapacity, float loadFactor) { - if (initialCapacity < 0) + if (initialCapacity < 0) { throw new IllegalArgumentException("Illegal initial capacity: " + initialCapacity); - if (initialCapacity > MAXIMUM_CAPACITY) + } + if (initialCapacity > MAXIMUM_CAPACITY) { initialCapacity = MAXIMUM_CAPACITY; - if (loadFactor <= 0 || Float.isNaN(loadFactor)) + } + if (loadFactor <= 0 || Float.isNaN(loadFactor)) { throw new IllegalArgumentException("Illegal load factor: " + loadFactor); + } this.loadFactor = loadFactor; this.threshold = initialCapacity; } @@ -64,7 +71,7 @@ public DictionaryEncodeHashMap() { this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR); } - private void inflateTable(int threshold){ + private void inflateTable(int threshold) { int capacity = roundUpToPowerOf2(threshold); this.threshold = (int) Math.min(capacity * loadFactor, MAXIMUM_CAPACITY + 1); table = new Entry[capacity]; @@ -76,7 +83,7 @@ static final int hash(Object key) { } static int indexFor(int h, int length) { - return h & (length-1); + return h & (length - 1); } @@ -179,7 +186,7 @@ void resize(int newCapacity) { void transfer(Entry[] newTable) { int newCapacity = newTable.length; for (Entry e : table) { - while(null != e) { + while (null != e) { Entry next = e.next; int i = indexFor(e.hash, newCapacity); e.next = newTable[i]; @@ -225,6 +232,9 @@ public int size() { return size; } + /** + * Removes all elements from this map, leaving it empty. + */ public void clear() { size = 0; for (int i = 0; i < table.length; i++) { @@ -232,6 +242,10 @@ public void clear() { } } + /** + * Class to keep key-value data within hash map. + * @param key type. 
+ */ static class Entry { final K key; int value; diff --git a/java/memory/src/main/java/org/apache/arrow/memory/ObjectIntMap.java b/java/memory/src/main/java/org/apache/arrow/memory/ObjectIntMap.java index 12c2e935e99..78bdde25f39 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/ObjectIntMap.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/ObjectIntMap.java @@ -19,6 +19,7 @@ /** * Specific hash map for int type value, reducing boxing/unboxing operations. + * @param key type. */ public interface ObjectIntMap { diff --git a/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java index cfb39ac0134..c1a3c7e2adc 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java +++ b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java @@ -17,6 +17,12 @@ package org.apache.arrow.vector.dictionary; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Random; +import java.util.concurrent.TimeUnit; + import org.apache.arrow.memory.DictionaryEncodeHashMap; import org.junit.Test; import org.openjdk.jmh.annotations.Benchmark; @@ -31,12 +37,6 @@ import org.openjdk.jmh.runner.options.Options; import org.openjdk.jmh.runner.options.OptionsBuilder; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Random; -import java.util.concurrent.TimeUnit; - /** * Benchmarks for {@link org.apache.arrow.memory.DictionaryEncodeHashMap}. 
*/ @@ -61,11 +61,11 @@ public void prepare() { } } - private String getRandomString(int length){ + private String getRandomString(int length) { String str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; Random random = new Random(); StringBuffer sb = new StringBuffer(); - for(int i = 0; i < length; i++){ + for (int i = 0; i < length; i++) { int number = random.nextInt(62); sb.append(str.charAt(number)); } From f62003337294108e1ebda24d33babee5c1576383 Mon Sep 17 00:00:00 2001 From: tianchen Date: Tue, 2 Jul 2019 15:03:12 +0800 Subject: [PATCH 4/5] add javadoc and change package --- .../DictionaryEncodeHashMapBenchmarks.java | 3 +- .../dictionary}/DictionaryEncodeHashMap.java | 76 ++++++++++++++++++- .../vector/dictionary}/ObjectIntMap.java | 21 ++++- 3 files changed, 94 insertions(+), 6 deletions(-) rename java/{memory/src/main/java/org/apache/arrow/memory => vector/src/main/java/org/apache/arrow/vector/dictionary}/DictionaryEncodeHashMap.java (78%) rename java/{memory/src/main/java/org/apache/arrow/memory => vector/src/main/java/org/apache/arrow/vector/dictionary}/ObjectIntMap.java (53%) diff --git a/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java index c1a3c7e2adc..e97bff2e1dd 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java +++ b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java @@ -23,7 +23,6 @@ import java.util.Random; import java.util.concurrent.TimeUnit; -import org.apache.arrow.memory.DictionaryEncodeHashMap; import org.junit.Test; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; @@ -38,7 +37,7 @@ import org.openjdk.jmh.runner.options.OptionsBuilder; /** - * Benchmarks for {@link 
org.apache.arrow.memory.DictionaryEncodeHashMap}. + * Benchmarks for {@link DictionaryEncodeHashMap}. */ @State(Scope.Benchmark) public class DictionaryEncodeHashMapBenchmarks { diff --git a/java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMap.java similarity index 78% rename from java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java rename to java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMap.java index bac32bad61f..659a8d6b634 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/DictionaryEncodeHashMap.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMap.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.arrow.memory; +package org.apache.arrow.vector.dictionary; import java.util.Objects; @@ -26,22 +26,48 @@ */ public class DictionaryEncodeHashMap implements ObjectIntMap { + /** + * Represents a null value in map. + */ static final int NULL_VALUE = -1; + /** + * The default initial capacity - MUST be a power of two. + */ static final int DEFAULT_INITIAL_CAPACITY = 1 << 4; + /** + * The maximum capacity, used if a higher value is implicitly specified + * by either of the constructors with arguments. + */ static final int MAXIMUM_CAPACITY = 1 << 30; + /** + * The load factor used when none specified in constructor. + */ static final float DEFAULT_LOAD_FACTOR = 0.75f; static final Entry[] EMPTY_TABLE = {}; + /** + * The table, initialized on first use, and resized as + * necessary. When allocated, length is always a power of two. + */ transient Entry[] table = (Entry[]) EMPTY_TABLE; + /** + * The number of key-value mappings contained in this map. + */ transient int size; + /** + * The next size value at which to resize (capacity * load factor). + */ int threshold; + /** + * The load factor for the hash table. 
+ */ final float loadFactor; /** @@ -71,22 +97,33 @@ public DictionaryEncodeHashMap() { this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR); } + /** + * Compute the capacity with given threshold and create init table. + */ private void inflateTable(int threshold) { int capacity = roundUpToPowerOf2(threshold); this.threshold = (int) Math.min(capacity * loadFactor, MAXIMUM_CAPACITY + 1); table = new Entry[capacity]; } + /** + * Computes key.hashCode() and spreads (XORs) higher bits of hash to lower. + */ static final int hash(Object key) { int h; return (key == null) ? 0 : (h = key.hashCode()) ^ (h >>> 16); } + /** + * Computes the storage location in an array for the given hashCode. + */ static int indexFor(int h, int length) { return h & (length - 1); } - + /** + * Returns a power of two size for the given size. + */ static final int roundUpToPowerOf2(int size) { int n = size - 1; n |= n >>> 1; @@ -97,6 +134,10 @@ static final int roundUpToPowerOf2(int size) { return (n < 0) ? 1 : (n >= MAXIMUM_CAPACITY) ? MAXIMUM_CAPACITY : n + 1; } + /** + * Returns the value to which the specified key is mapped, + * or -1 if this map contains no mapping for the key. + */ @Override public int get(K key) { int hash = hash(key); @@ -109,6 +150,11 @@ public int get(K key) { return NULL_VALUE; } + /** + * Associates the specified value with the specified key in this map. + * If the map previously contained a mapping for the key, the old + * value is replaced. + */ @Override public int put(K key, int value) { if (table == EMPTY_TABLE) { @@ -134,19 +180,30 @@ public int put(K key, int value) { return NULL_VALUE; } - + /** + * Removes the mapping for the specified key from this map if present. + * @param key key whose mapping is to be removed from the map + * @return the previous value associated with key, or + * -1 if there was no mapping for key. + */ @Override public int remove(K key) { Entry e = removeEntryForKey(key); return (e == null ? 
NULL_VALUE : e.value); } + /** + * Create a new Entry at the specific position of table. + */ void createEntry(int hash, K key, int value, int bucketIndex) { Entry e = table[bucketIndex]; table[bucketIndex] = new Entry<>(hash, key, value, e); size++; } + /** + * Put value when key is null. + */ private int putForNullKey(int value) { for (Entry e = table[0]; e != null; e = e.next) { if (e.key == null) { @@ -159,6 +216,9 @@ private int putForNullKey(int value) { return NULL_VALUE; } + /** + * Add Entry at the specified location of the table. + */ void addEntry(int hash, K key, int value, int bucketIndex) { if ((size >= threshold) && (null != table[bucketIndex])) { resize(2 * table.length); @@ -169,6 +229,9 @@ void addEntry(int hash, K key, int value, int bucketIndex) { createEntry(hash, key, value, bucketIndex); } + /** + * Resize table with given new capacity. + */ void resize(int newCapacity) { Entry[] oldTable = table; int oldCapacity = oldTable.length; @@ -183,6 +246,10 @@ void resize(int newCapacity) { threshold = (int)Math.min(newCapacity * loadFactor, MAXIMUM_CAPACITY + 1); } + /** + * Transfer entries into new table from old table. + * @param newTable new table + */ void transfer(Entry[] newTable) { int newCapacity = newTable.length; for (Entry e : table) { @@ -196,6 +263,9 @@ void transfer(Entry[] newTable) { } } + /** + * Remove entry associated with the given key. 
+ */ final Entry removeEntryForKey(Object key) { if (size == 0) { return null; diff --git a/java/memory/src/main/java/org/apache/arrow/memory/ObjectIntMap.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ObjectIntMap.java similarity index 53% rename from java/memory/src/main/java/org/apache/arrow/memory/ObjectIntMap.java rename to java/vector/src/main/java/org/apache/arrow/vector/dictionary/ObjectIntMap.java index 78bdde25f39..582bb561cb7 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/ObjectIntMap.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ObjectIntMap.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.arrow.memory; +package org.apache.arrow.vector.dictionary; /** * Specific hash map for int type value, reducing boxing/unboxing operations. @@ -23,9 +23,28 @@ */ public interface ObjectIntMap { + /** + * Associates the specified value with the specified key in this map. + * If the map previously contained a mapping for the key, the old + * value is replaced. + * @param key key with which the specified value is to be associated + * @param value value to be associated with the specified key + * @return the previous value associated with key, or + * -1 if there was no mapping for key. + */ int put(K key, int value); + /** + * Returns the value to which the specified key is mapped, + * or -1 if this map contains no mapping for the key. + */ int get(K key); + /** + * Removes the mapping for the specified key from this map if present. + * @param key key whose mapping is to be removed from the map + * @return the previous value associated with key, or + * -1 if there was no mapping for key. 
+ */ int remove(K key); } From 38ee5a4afae6880d6cb90755fc0a61f8a5cbc163 Mon Sep 17 00:00:00 2001 From: tianchen Date: Tue, 2 Jul 2019 16:01:54 +0800 Subject: [PATCH 5/5] add UT --- .../vector/TestDictionaryEncodeHashMap.java | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryEncodeHashMap.java diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryEncodeHashMap.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryEncodeHashMap.java new file mode 100644 index 00000000000..5f4e710b8af --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryEncodeHashMap.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.arrow.vector;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.arrow.vector.dictionary.DictionaryEncodeHashMap;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Unit tests for the put, get, remove and size operations of {@link DictionaryEncodeHashMap}.
+ */
+public class TestDictionaryEncodeHashMap {
+
+  private static final int SIZE = 100;
+  private static final int KEY_LENGTH = 5;
+  private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyz";
+  private static final String TEST_KEY = "test_key";
+
+  // Fixed seed keeps the generated keys, and therefore any failure, reproducible.
+  private final Random random = new Random(42);
+  private final List<String> testData = new ArrayList<>();
+  private final DictionaryEncodeHashMap<String> map = new DictionaryEncodeHashMap<>();
+
+  /** Fills {@code testData} with SIZE distinct random keys. */
+  @Before
+  public void init() {
+    for (int i = 0; i < SIZE; i++) {
+      testData.add(generateUniqueKey(KEY_LENGTH));
+    }
+  }
+
+  @After
+  public void terminate() throws Exception {
+    testData.clear();
+  }
+
+  /** Returns a random lower-case key of the given length that is not yet in {@code testData}. */
+  private String generateUniqueKey(int length) {
+    String key;
+    do {
+      StringBuilder sb = new StringBuilder(length);
+      for (int i = 0; i < length; i++) {
+        sb.append(ALPHABET.charAt(random.nextInt(ALPHABET.length())));
+      }
+      key = sb.toString();
+    } while (testData.contains(key));
+    return key;
+  }
+
+  /** Inserts every generated key, mapped to its index in {@code testData}. */
+  private void putTestData() {
+    for (int i = 0; i < SIZE; i++) {
+      map.put(testData.get(i), i);
+    }
+  }
+
+  @Test
+  public void testPutAndGet() {
+    putTestData();
+    for (int i = 0; i < SIZE; i++) {
+      assertEquals(i, map.get(testData.get(i)));
+    }
+  }
+
+  @Test
+  public void testPutExistKey() {
+    putTestData();
+    // put returns the previous value, or -1 when the key was absent.
+    assertEquals(-1, map.put(TEST_KEY, 101));
+    assertEquals(101, map.get(TEST_KEY));
+    assertEquals(101, map.put(TEST_KEY, 102));
+    assertEquals(102, map.get(TEST_KEY));
+  }
+
+  @Test
+  public void testGetNonExistKey() {
+    putTestData();
+    // Generated keys contain only letters, so "test_key" was never inserted.
+    map.remove(TEST_KEY);
+    assertEquals(-1, map.get(TEST_KEY));
+  }
+
+  @Test
+  public void testRemove() {
+    putTestData();
+    map.put(TEST_KEY, 10000);
+    assertEquals(SIZE + 1, map.size());
+    assertEquals(10000, map.get(TEST_KEY));
+    assertEquals(10000, map.remove(TEST_KEY));
+    assertEquals(SIZE, map.size());
+    assertEquals(-1, map.get(TEST_KEY));
+  }
+
+  @Test
+  public void testSize() {
+    assertEquals(0, map.size());
+    putTestData();
+    assertEquals(SIZE, map.size());
+  }
+}