From baca69c822d486131988a75f32f1723645dbe34c Mon Sep 17 00:00:00 2001 From: Mohamed Zenadi Date: Thu, 17 Nov 2016 15:34:55 +0100 Subject: [PATCH 1/9] Add methods to count the number null values in the vector --- .../org/apache/arrow/vector/BaseValueVector.java | 5 +++++ .../java/org/apache/arrow/vector/BitVector.java | 15 +++++++++++++++ .../org/apache/arrow/vector/ValueVector.java | 5 +++++ .../java/org/apache/arrow/vector/ZeroVector.java | 5 +++++ .../apache/arrow/vector/complex/ListVector.java | 5 +++++ .../org/apache/arrow/vector/TestValueVector.java | 16 ++++++++++++++++ 6 files changed, 51 insertions(+) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java index 884cdf0910b..5d179bb0613 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java @@ -72,6 +72,11 @@ protected BaseAccessor() { } public boolean isNull(int index) { return false; } + + @Override + public int getNullCount() { + return 0; + } } public abstract static class BaseMutator implements ValueVector.Mutator { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index 26eeafd51d9..e2ab043e629 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -379,6 +379,21 @@ public final void get(int index, NullableBitHolder holder) { holder.isSet = 1; holder.value = get(index); } + + /** + * Get the number of bits set to 1 + * @return the number of bits set to 1 + */ + public final int getNullCount() { + int count = 0; + for (int i = 0; i < allocationSizeInBytes; ++i) { + byte byteValue = data.getByte(i); + // Java uses two's complement binary representation, hence 11111111_b which is -1 when converted to Int + // will have 32bits set to 1. Masking the MSB and then adding it back solves the issue. + count += Integer.bitCount(byteValue & 0x7F) - (byteValue >> 7); + } + return count; + } } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java index 5b24a41850d..ff7b94c34d8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -180,6 +180,11 @@ interface Accessor { * Returns true if the value at the given index is null, false otherwise. */ boolean isNull(int index); + + /** + * Returns the number of null values + */ + int getNullCount(); } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java index c2482adefec..e163b4fa939 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java @@ -69,6 +69,11 @@ public int getValueCount() { public boolean isNull(int index) { return true; } + + @Override + public int getNullCount() { + return 0; + } }; private final Mutator defaultMutator = new Mutator() { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 461bdbcda1b..074b0aa7e58 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -310,6 +310,11 @@ public Object getObject(int index) { public boolean isNull(int index) { return bits.getAccessor().get(index) == 0; } + + @Override + public int getNullCount() { + return bits.getAccessor().getNullCount(); + } } public class Mutator extends BaseRepeatedMutator { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 124452e96ee..533c3c139e8 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -301,6 +301,8 @@ public void testBitVector() { assertEquals(0, accessor.get(100)); assertEquals(1, accessor.get(1022)); + assertEquals(2, accessor.getNullCount()); + // test setting the same value twice m.set(0, 1); m.set(0, 1); @@ -315,8 +317,22 @@ public void testBitVector() { assertEquals(0, accessor.get(0)); assertEquals(1, accessor.get(1)); + // should not change + assertEquals(2, accessor.getNullCount()); + // Ensure unallocated space returns 0 assertEquals(0, accessor.get(3)); + + m.set(1, 0); + m.set(1022, 0); + + assertEquals(0, accessor.getNullCount()); + + for (int i = 0; i < 1024; ++i) { + m.set(i, 1); + } + + assertEquals(1024, accessor.getNullCount()); } } From f26425077b58f575cc0ed3a9de4a6ec7969bc32a Mon Sep 17 00:00:00 2001 From: Mohamed Zenadi Date: Thu, 17 Nov 2016 15:37:11 +0100 Subject: [PATCH 2/9] use getNullCount() rather than isNull --- .../java/org/apache/arrow/vector/VectorUnloader.java | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java index e2462180ffa..92d8cb045ae 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java @@ -60,15 +60,7 @@ public ArrowRecordBatch getRecordBatch() { private void appendNodes(FieldVector vector, List nodes, List buffers) { Accessor accessor = vector.getAccessor(); - int nullCount = 0; - // TODO: should not have to do that - // we can do that a lot more efficiently (for example with Long.bitCount(i)) - for (int i = 0; i < accessor.getValueCount(); i++) { - if (accessor.isNull(i)) { - nullCount ++; - } - } - nodes.add(new ArrowFieldNode(accessor.getValueCount(), nullCount)); + nodes.add(new ArrowFieldNode(accessor.getValueCount(), accessor.getNullCount())); List fieldBuffers = vector.getFieldBuffers(); List expectedBuffers = vector.getField().getTypeLayout().getVectorTypes(); if (fieldBuffers.size() != expectedBuffers.size()) { From b12a2a55e1e14b2a36ebc8ec82f6eff53970d486 Mon Sep 17 00:00:00 2001 From: Mohamed Zenadi Date: Thu, 17 Nov 2016 18:10:54 +0100 Subject: [PATCH 3/9] fix wrong value returned by the method --- .../src/main/java/org/apache/arrow/vector/BitVector.java | 3 ++- .../java/org/apache/arrow/vector/TestValueVector.java | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index e2ab043e629..83ab224e39f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -384,6 +384,7 @@ public final void get(int index, NullableBitHolder holder) { * Get the number of bits set to 1 * @return the number of bits set to 1 */ + @Override public final int getNullCount() { int count = 0; for (int i = 0; i < allocationSizeInBytes; ++i) { @@ -392,7 +393,7 @@ public final int getNullCount() { // will have 32bits set to 1. Masking the MSB and then adding it back solves the issue. count += Integer.bitCount(byteValue & 0x7F) - (byteValue >> 7); } - return count; + return (allocationSizeInBytes * 8) - count; } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 533c3c139e8..e98f1eba18c 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -301,7 +301,7 @@ public void testBitVector() { assertEquals(0, accessor.get(100)); assertEquals(1, accessor.get(1022)); - assertEquals(2, accessor.getNullCount()); + assertEquals(1022, accessor.getNullCount()); // test setting the same value twice m.set(0, 1); @@ -318,7 +318,7 @@ public void testBitVector() { assertEquals(1, accessor.get(1)); // should not change - assertEquals(2, accessor.getNullCount()); + assertEquals(1022, accessor.getNullCount()); // Ensure unallocated space returns 0 assertEquals(0, accessor.get(3)); @@ -326,13 +326,13 @@ public void testBitVector() { m.set(1, 0); m.set(1022, 0); - assertEquals(0, accessor.getNullCount()); + assertEquals(1024, accessor.getNullCount()); for (int i = 0; i < 1024; ++i) { m.set(i, 1); } - assertEquals(1024, accessor.getNullCount()); + assertEquals(0, accessor.getNullCount()); } } From 95667d3c7c9ce4d61a527c49401ba026a3152818 Mon Sep 17 00:00:00 2001 From: Mohamed Zenadi Date: Fri, 2 Dec 2016 20:00:34 +0100 Subject: [PATCH 4/9] fix the comment --- .../src/main/java/org/apache/arrow/vector/BitVector.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index 83ab224e39f..4605ae84a96 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -381,8 +381,8 @@ public final void get(int index, NullableBitHolder holder) { } /** - * Get the number of bits set to 1 - * @return the number of bits set to 1 + * Get the number nulls, this correspond to the number of bits set to 0 in the vector + * @return the number of bits set to 0 */ @Override public final int getNullCount() { From 0530c8501db5b8ae3e033b8f48b95b2a9df8d8b4 Mon Sep 17 00:00:00 2001 From: Mohamed Zenadi Date: Fri, 2 Dec 2016 20:18:41 +0100 Subject: [PATCH 5/9] test the null count byte by byte and the odd length case --- .../apache/arrow/vector/TestValueVector.java | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index e98f1eba18c..382a52f0717 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -323,12 +323,38 @@ public void testBitVector() { // Ensure unallocated space returns 0 assertEquals(0, accessor.get(3)); + // unset the previously set bits m.set(1, 0); m.set(1022, 0); - + // this should set all the array to 0 assertEquals(1024, accessor.getNullCount()); + // set all the array to 1 for (int i = 0; i < 1024; ++i) { + assertEquals(1024 - i, accessor.getNullCount()); + m.set(i, 1); + } + + assertEquals(0, accessor.getNullCount()); + + // re-allocate the vector with a new (odd) size, smaller by more than a byte than the original vector + // this allows to ensure that we use one byte less memory than the previous one + vector.allocateNew(1015); + + // ensure it has been zeroed + assertEquals(1016, accessor.getNullCount()); + + m.set(0, 1); + m.set(1015, 1); // ensure that the last item of the last byte is allocated + + assertEquals(1014, accessor.getNullCount()); + + vector.zeroVector(); + assertEquals(1016, accessor.getNullCount()); + + // set all the array to 1 + for (int i = 0; i < 1016; ++i) { + assertEquals(1016 - i, accessor.getNullCount()); m.set(i, 1); } From e858432bd4d640442ce5d0e5b9f94417b36ae7fd Mon Sep 17 00:00:00 2001 From: Mohamed Zenadi Date: Sat, 10 Dec 2016 22:05:39 +0100 Subject: [PATCH 6/9] use the valueCount as basis for counting nulls rather than allocated bytes --- .../org/apache/arrow/vector/BitVector.java | 6 +++++- .../apache/arrow/vector/TestValueVector.java | 18 ++++++++++-------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index 4605ae84a96..e037a86777f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -393,7 +393,11 @@ public final int getNullCount() { // will have 32bits set to 1. Masking the MSB and then adding it back solves the issue. count += Integer.bitCount(byteValue & 0x7F) - (byteValue >> 7); } - return (allocationSizeInBytes * 8) - count; + int nullCount = (allocationSizeInBytes * 8) - count; + // if the valueCount is not a multiple of 8, the bits on the right were counted as null bits + int remainder = valueCount % 8; + nullCount -= remainder == 0 ? 0 : 8 - remainder; + return nullCount; } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 382a52f0717..b33919b2790 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -288,6 +288,7 @@ public void testBitVector() { try (final BitVector vector = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { final BitVector.Mutator m = vector.getMutator(); vector.allocateNew(1024); + m.setValueCount(1024); // Put and set a few values m.set(0, 1); @@ -295,6 +296,8 @@ public void testBitVector() { m.set(100, 0); m.set(1022, 1); + m.setValueCount(1024); + final BitVector.Accessor accessor = vector.getAccessor(); assertEquals(1, accessor.get(0)); assertEquals(0, accessor.get(1)); @@ -337,24 +340,23 @@ public void testBitVector() { assertEquals(0, accessor.getNullCount()); - // re-allocate the vector with a new (odd) size, smaller by more than a byte than the original vector - // this allows to ensure that we use one byte less memory than the previous one vector.allocateNew(1015); + m.setValueCount(1015); // ensure it has been zeroed - assertEquals(1016, accessor.getNullCount()); + assertEquals(1015, accessor.getNullCount()); m.set(0, 1); - m.set(1015, 1); // ensure that the last item of the last byte is allocated + m.set(1014, 1); // ensure that the last item of the last byte is allocated - assertEquals(1014, accessor.getNullCount()); + assertEquals(1013, accessor.getNullCount()); vector.zeroVector(); - assertEquals(1016, accessor.getNullCount()); + assertEquals(1015, accessor.getNullCount()); // set all the array to 1 - for (int i = 0; i < 1016; ++i) { - assertEquals(1016 - i, accessor.getNullCount()); + for (int i = 0; i < 1015; ++i) { + assertEquals(1015 - i, accessor.getNullCount()); m.set(i, 1); } From ad3f24a64da049639919696c03fb4c57facd899b Mon Sep 17 00:00:00 2001 From: Mohamed Zenadi Date: Fri, 16 Dec 2016 00:10:47 +0100 Subject: [PATCH 7/9] the used size is not the same as the allocated size --- .../src/main/java/org/apache/arrow/vector/BitVector.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index e037a86777f..9beabcbe46b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -387,13 +387,15 @@ public final void get(int index, NullableBitHolder holder) { @Override public final int getNullCount() { int count = 0; - for (int i = 0; i < allocationSizeInBytes; ++i) { + int sizeInBytes = getSizeFromCount(valueCount); + + for (int i = 0; i < sizeInBytes; ++i) { byte byteValue = data.getByte(i); // Java uses two's complement binary representation, hence 11111111_b which is -1 when converted to Int // will have 32bits set to 1. Masking the MSB and then adding it back solves the issue. count += Integer.bitCount(byteValue & 0x7F) - (byteValue >> 7); } - int nullCount = (allocationSizeInBytes * 8) - count; + int nullCount = (sizeInBytes * 8) - count; // if the valueCount is not a multiple of 8, the bits on the right were counted as null bits int remainder = valueCount % 8; nullCount -= remainder == 0 ? 0 : 8 - remainder; From 9ff3355b548b3a2d7c042319d29897119b300ad7 Mon Sep 17 00:00:00 2001 From: Mohamed Zenadi Date: Fri, 16 Dec 2016 00:11:10 +0100 Subject: [PATCH 8/9] implement the base case of getNullCount() --- .../java/org/apache/arrow/vector/BaseValueVector.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java index 5d179bb0613..2a61403c0dc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java @@ -74,8 +74,15 @@ public boolean isNull(int index) { } @Override + // override this in case your implementation is faster, see BitVector public int getNullCount() { - return 0; + int nullCount = 0; + for (int i = 0; i < getValueCount(); i++) { + if (isNull(i)) { + nullCount ++; + } + } + return nullCount; } } From 27c0342f28a159b68ab32563f0d30497e131d9ae Mon Sep 17 00:00:00 2001 From: Mohamed Zenadi Date: Fri, 16 Dec 2016 00:27:01 +0100 Subject: [PATCH 9/9] implement missing getNullCount implementation for NullableMapVector --- .../org/apache/arrow/vector/complex/NullableMapVector.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java index f0ddf2727e9..5fa35307ab6 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java @@ -203,6 +203,11 @@ public void get(int index, ComplexHolder holder) { super.get(index, holder); } + @Override + public int getNullCount() { + return bits.getAccessor().getNullCount(); + } + @Override public boolean isNull(int index) { return isSet(index) == 0;