diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java
index 884cdf0910b..2a61403c0dc 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java
@@ -72,6 +72,24 @@ protected BaseAccessor() { }
     public boolean isNull(int index) {
       return false;
     }
+
+    /**
+     * Counts the null values by checking every index with {@link #isNull(int)}.
+     * Override this in case your implementation is faster, see BitVector.
+     *
+     * @return the number of null values
+     */
+    @Override
+    public int getNullCount() {
+      final int total = getValueCount();
+      int nullCount = 0;
+      for (int i = 0; i < total; i++) {
+        if (isNull(i)) {
+          nullCount++;
+        }
+      }
+      return nullCount;
+    }
   }
 
   public abstract static class BaseMutator implements ValueVector.Mutator {
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
index 26eeafd51d9..9beabcbe46b 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
@@ -379,6 +379,29 @@ public final void get(int index, NullableBitHolder holder) {
       holder.isSet = 1;
       holder.value = get(index);
     }
+
+    /**
+     * Get the number of nulls. This corresponds to the number of bits set to 0 among the
+     * first {@code valueCount} bits of the vector; padding bits in the last byte are ignored.
+     *
+     * @return the number of bits set to 0
+     */
+    @Override
+    public final int getNullCount() {
+      final int fullBytes = valueCount / 8;
+      final int remainder = valueCount % 8;
+      int setBits = 0;
+
+      for (int i = 0; i < fullBytes; ++i) {
+        // & 0xFF undoes the sign extension of negative bytes, which would add 24 spurious set bits
+        setBits += Integer.bitCount(data.getByte(i) & 0xFF);
+      }
+      if (remainder != 0) {
+        // only the low 'remainder' bits of the last byte hold values; mask out the padding bits
+        setBits += Integer.bitCount(data.getByte(fullBytes) & ((1 << remainder) - 1));
+      }
+      return valueCount - setBits;
+    }
   }
 
   /**
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
index 5b24a41850d..ff7b94c34d8 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
@@ -180,6 +180,11 @@ interface Accessor {
      * Returns true if the value at the given index is null, false otherwise.
      */
     boolean isNull(int index);
+
+    /**
+     * Returns the number of null values in the vector.
+     */
+    int getNullCount();
   }
 
   /**
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java
index e2462180ffa..92d8cb045ae 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java
@@ -60,15 +60,7 @@ public ArrowRecordBatch getRecordBatch() {
 
   private void appendNodes(FieldVector vector, List nodes, List buffers) {
     Accessor accessor = vector.getAccessor();
-    int nullCount = 0;
-    // TODO: should not have to do that
-    // we can do that a lot more efficiently (for example with Long.bitCount(i))
-    for (int i = 0; i < accessor.getValueCount(); i++) {
-      if (accessor.isNull(i)) {
-        nullCount ++;
-      }
-    }
-    nodes.add(new ArrowFieldNode(accessor.getValueCount(), nullCount));
+    nodes.add(new ArrowFieldNode(accessor.getValueCount(), accessor.getNullCount()));
     List fieldBuffers = vector.getFieldBuffers();
     List expectedBuffers = vector.getField().getTypeLayout().getVectorTypes();
     if (fieldBuffers.size() != expectedBuffers.size()) {
diff --git
a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java
index c2482adefec..e163b4fa939 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java
@@ -69,6 +69,11 @@ public int getValueCount() {
     public boolean isNull(int index) {
       return true;
     }
+
+    @Override
+    public int getNullCount() {
+      return 0; // NOTE(review): isNull() reports every index as null; 0 presumes getValueCount() is 0 - confirm
+    }
   };
 
   private final Mutator defaultMutator = new Mutator() {
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
index 461bdbcda1b..074b0aa7e58 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
@@ -310,6 +310,11 @@ public Object getObject(int index) {
     public boolean isNull(int index) {
       return bits.getAccessor().get(index) == 0;
     }
+
+    @Override
+    public int getNullCount() {
+      return bits.getAccessor().getNullCount(); // delegates to the same bits vector that isNull() reads
+    }
   }
 
   public class Mutator extends BaseRepeatedMutator {
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java
index f0ddf2727e9..5fa35307ab6 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java
@@ -203,6 +203,11 @@ public void get(int index, ComplexHolder holder) {
       super.get(index, holder);
     }
 
+    @Override
+    public int getNullCount() {
+      return bits.getAccessor().getNullCount(); // NOTE(review): presumably isSet() reads this same bits vector - confirm
+    }
+
     @Override
     public boolean isNull(int index) {
       return isSet(index) == 0;
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
index
124452e96ee..b33919b2790 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
@@ -288,6 +288,7 @@ public void testBitVector() {
     try (final BitVector vector = new BitVector(EMPTY_SCHEMA_PATH, allocator)) {
       final BitVector.Mutator m = vector.getMutator();
       vector.allocateNew(1024);
+      m.setValueCount(1024);
 
       // Put and set a few values
       m.set(0, 1);
@@ -295,12 +296,16 @@ public void testBitVector() {
       m.set(100, 0);
       m.set(1022, 1);
 
+      m.setValueCount(1024); // NOTE(review): the value count was already set to 1024 right after allocateNew
+
       final BitVector.Accessor accessor = vector.getAccessor();
       assertEquals(1, accessor.get(0));
       assertEquals(0, accessor.get(1));
       assertEquals(0, accessor.get(100));
       assertEquals(1, accessor.get(1022));
 
+      assertEquals(1022, accessor.getNullCount());
+
       // test setting the same value twice
       m.set(0, 1);
       m.set(0, 1);
@@ -315,8 +320,47 @@ public void testBitVector() {
       assertEquals(0, accessor.get(0));
       assertEquals(1, accessor.get(1));
 
+      // the null count should not change
+      assertEquals(1022, accessor.getNullCount());
+
       // Ensure unallocated space returns 0
       assertEquals(0, accessor.get(3));
+
+      // unset the previously set bits
+      m.set(1, 0);
+      m.set(1022, 0);
+      // every bit should now be 0, so all 1024 values count as null
+      assertEquals(1024, accessor.getNullCount());
+
+      // set the bits to 1 one at a time; the null count must drop by one at each step
+      for (int i = 0; i < 1024; ++i) {
+        assertEquals(1024 - i, accessor.getNullCount());
+        m.set(i, 1);
+      }
+
+      assertEquals(0, accessor.getNullCount());
+
+      vector.allocateNew(1015);
+      m.setValueCount(1015);
+
+      // ensure it has been zeroed: all 1015 values count as null
+      assertEquals(1015, accessor.getNullCount());
+
+      m.set(0, 1);
+      m.set(1014, 1); // 1014 is the last valid index; 1015 is not a multiple of 8, so the last byte is partial
+
+      assertEquals(1013, accessor.getNullCount());
+
+      vector.zeroVector();
+      assertEquals(1015, accessor.getNullCount());
+
+      // set the bits to 1 one at a time; the null count must drop by one at each step
+      for (int i = 0; i < 1015; ++i) {
+        assertEquals(1015 - i, accessor.getNullCount());
+        m.set(i, 1);
+      }
+
+      assertEquals(0, accessor.getNullCount());
     }
   }
 