From 4e87d88e877fcbeee0b1b2ff2960e8b4d7e73b55 Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Thu, 1 Dec 2016 11:56:42 -0800 Subject: [PATCH 1/2] ARROW-398: Java file format requires bitmaps of all 1's to be written when there are no nulls --- .../templates/NullableValueVectors.java | 2 +- .../main/codegen/templates/UnionVector.java | 2 +- .../arrow/vector/BaseDataValueVector.java | 7 ++- .../org/apache/arrow/vector/BitVector.java | 29 ++++++++++ .../org/apache/arrow/vector/BufferBacked.java | 4 +- .../org/apache/arrow/vector/ValueVector.java | 17 ------ .../org/apache/arrow/vector/VectorLoader.java | 1 - .../arrow/vector/complex/ListVector.java | 2 +- .../vector/complex/NullableMapVector.java | 2 +- .../arrow/vector/TestVectorUnloadLoad.java | 56 +++++++++++++++++++ .../vector/file/TestArrowReaderWriter.java | 4 -- 11 files changed, 96 insertions(+), 30 deletions(-) diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java index 48af7a2bafe..716fedcf866 100644 --- a/java/vector/src/main/codegen/templates/NullableValueVectors.java +++ b/java/vector/src/main/codegen/templates/NullableValueVectors.java @@ -144,7 +144,7 @@ public List getChildrenFromFields() { @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { - org.apache.arrow.vector.BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers); + org.apache.arrow.vector.BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers); bits.valueCount = fieldNode.getLength(); } diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index 5ca3f901484..9608b3c48eb 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -105,7 +105,7 @@ public List getChildrenFromFields() { @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { - BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers); + BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers); this.valueCount = fieldNode.getLength(); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java index c22258d4265..4c6d363f21c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java @@ -21,6 +21,7 @@ import java.util.List; import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.schema.ArrowFieldNode; import io.netty.buffer.ArrowBuf; @@ -29,13 +30,13 @@ public abstract class BaseDataValueVector extends BaseValueVector implements Buf protected final static byte[] emptyByteArray = new byte[]{}; // Nullable vectors use this - public static void load(List vectors, List buffers) { + public static void load(ArrowFieldNode fieldNode, List vectors, List buffers) { int expectedSize = vectors.size(); if (buffers.size() != expectedSize) { throw new IllegalArgumentException("Illegal buffer count, expected " + expectedSize + ", got: " + buffers.size()); } for (int i = 0; i < expectedSize; i++) { - vectors.get(i).load(buffers.get(i)); + vectors.get(i).load(fieldNode, buffers.get(i)); } } @@ -106,7 +107,7 @@ public ArrowBuf getBuffer() { } @Override - public void load(ArrowBuf data) { + public void load(ArrowFieldNode fieldNode, ArrowBuf data) { this.data.release(); this.data = data.retain(allocator); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index c12db5045c2..76ca86529c2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -22,6 +22,7 @@ import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.holders.BitHolder; import org.apache.arrow.vector.holders.NullableBitHolder; +import org.apache.arrow.vector.schema.ArrowFieldNode; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.OversizedAllocationException; @@ -48,6 +49,34 @@ public BitVector(String name, BufferAllocator allocator) { super(name, allocator); } + @Override + public void load(ArrowFieldNode fieldNode, ArrowBuf data) { + // When the vector is all nulls or all defined, the content of the buffer can be omitted + if (data.readableBytes() == 0 && fieldNode.getLength() != 0) { + data.release(); + allocateNew(fieldNode.getLength()); + int n = getSizeFromCount(fieldNode.getLength()); + if (fieldNode.getNullCount() == 0) { + // all defined + // create an all 1s buffer + for (int i = 0; i < n; ++i) { + this.data.setByte(i, 0xFF); + } + } else if (fieldNode.getNullCount() == fieldNode.getLength()) { + // all null + // create an all 0s buffer + for (int i = 0; i < n; ++i) { + this.data.setByte(i, 0x00); + } + } else { + throw new IllegalArgumentException("The buffer can be empty only if there's no data or it's all null or all defined"); + } + this.data.writerIndex(n); + } else { + super.load(fieldNode, data); + } + } + @Override public Field getField() { throw new UnsupportedOperationException("internal vector"); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java b/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java index d1c262d2265..3c8b3210d77 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java @@ -17,6 +17,8 @@ */ package org.apache.arrow.vector; +import org.apache.arrow.vector.schema.ArrowFieldNode; + import io.netty.buffer.ArrowBuf; /** @@ -24,7 +26,7 @@ */ public interface BufferBacked { - void load(ArrowBuf data); + void load(ArrowFieldNode fieldNode, ArrowBuf data); ArrowBuf unLoad(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java index ba7790e47ef..5b24a41850d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -130,13 +130,6 @@ public interface ValueVector extends Closeable, Iterable { */ FieldReader getReader(); - /** - * Get the metadata for this field. Used in serialization - * - * @return FieldMetadata for this field. - */ -// SerializedField getMetadata(); - /** * Returns the number of bytes that is used by this vector instance. */ @@ -166,16 +159,6 @@ public interface ValueVector extends Closeable, Iterable { */ ArrowBuf[] getBuffers(boolean clear); - /** - * Load the data provided in the buffer. Typically used when deserializing from the wire. - * - * @param metadata - * Metadata used to decode the incoming buffer. - * @param buffer - * The buffer that contains the ValueVector. - */ -// void load(SerializedField metadata, DrillBuf buffer); - /** * An abstraction that is used to read from this vector instance. */ diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java index c5d642ee0cc..757f061dd5a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java @@ -81,7 +81,6 @@ private void loadBuffers(FieldVector vector, Field field, Iterator buf try { vector.loadFieldBuffers(fieldNode, ownBuffers); } catch (RuntimeException e) { - e.printStackTrace(); throw new IllegalArgumentException("Could not load buffers for field " + field + " error message" + e.getMessage(), e); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index dd99c734f7f..e18f99f95d7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -93,7 +93,7 @@ public List getChildrenFromFields() { @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { - BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers); + BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java index 8e1bbfabdc9..f0ddf2727e9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java @@ -62,7 +62,7 @@ public NullableMapVector(String name, BufferAllocator allocator, CallBack callBa @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { - BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers); + BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers); this.valueCount = fieldNode.getLength(); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java index 78f69eedc1c..25a63d6fb2d 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java @@ -17,7 +17,13 @@ */ package org.apache.arrow.vector; +import static java.util.Arrays.asList; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; + import java.io.IOException; +import java.util.Collections; import java.util.List; import org.apache.arrow.memory.BufferAllocator; @@ -29,12 +35,17 @@ import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; import org.apache.arrow.vector.complex.writer.BigIntWriter; import org.apache.arrow.vector.complex.writer.IntWriter; +import org.apache.arrow.vector.schema.ArrowFieldNode; import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.AfterClass; import org.junit.Assert; import org.junit.Test; +import io.netty.buffer.ArrowBuf; + public class TestVectorUnloadLoad { static final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); @@ -88,6 +99,51 @@ public void testUnloadLoad() throws IOException { } } + /** + * The validity buffer can be empty if: + * - all values are defined + * - all values are null + * @throws IOException + */ + @Test + public void testLoadEmptyValidityBuffer() throws IOException { + Schema schema = new Schema(asList( + new Field("intDefined", true, new ArrowType.Int(32, true), Collections.emptyList()), + new Field("intNull", true, new ArrowType.Int(32, true), Collections.emptyList()) + )); + int count = 10; + ArrowBuf validity = allocator.getEmpty(); + ArrowBuf values = allocator.buffer(count * 4); // integers + for (int i = 0; i < count; i++) { + values.setInt(i * 4, i); + } + try ( + ArrowRecordBatch recordBatch = new ArrowRecordBatch(count, asList(new ArrowFieldNode(count, 0), new ArrowFieldNode(count, count)), asList(validity, values, validity, values)); + BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + VectorSchemaRoot newRoot = new VectorSchemaRoot(schema, finalVectorsAllocator); + ) { + + // load it + VectorLoader vectorLoader = new VectorLoader(newRoot); + + vectorLoader.load(recordBatch); + + FieldReader intDefinedReader = newRoot.getVector("intDefined").getReader(); + FieldReader intNullReader = newRoot.getVector("intNull").getReader(); + for (int i = 0; i < count; i++) { + intDefinedReader.setPosition(i); + intNullReader.setPosition(i); + Integer defined = intDefinedReader.readInteger(); + assertNotNull("#" + i, defined); + assertEquals("#" + i, i, defined.intValue()); + Integer nullVal = intNullReader.readInteger(); + assertNull("#" + i, nullVal); + } + } finally { + values.release(); + } + } + public static VectorUnloader newVectorUnloader(FieldVector root) { Schema schema = new Schema(root.getField().getChildren()); int valueCount = root.getAccessor().getValueCount(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java index f90329aca11..8ed89fa347b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java @@ -30,10 +30,6 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.file.ArrowBlock; -import org.apache.arrow.vector.file.ArrowFooter; -import org.apache.arrow.vector.file.ArrowReader; -import org.apache.arrow.vector.file.ArrowWriter; import org.apache.arrow.vector.schema.ArrowFieldNode; import org.apache.arrow.vector.schema.ArrowRecordBatch; import org.apache.arrow.vector.types.pojo.ArrowType; From c29da53206e6b2b3160a3c77abc4caed637f5185 Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Thu, 1 Dec 2016 12:45:08 -0800 Subject: [PATCH 2/2] fix extraneous bits --- .../org/apache/arrow/vector/BitVector.java | 19 ++++++---- .../arrow/vector/TestVectorUnloadLoad.java | 35 +++++++++++++------ 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index 76ca86529c2..7ce1236b2ec 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -54,20 +54,27 @@ public void load(ArrowFieldNode fieldNode, ArrowBuf data) { // When the vector is all nulls or all defined, the content of the buffer can be omitted if (data.readableBytes() == 0 && fieldNode.getLength() != 0) { data.release(); - allocateNew(fieldNode.getLength()); - int n = getSizeFromCount(fieldNode.getLength()); + int count = fieldNode.getLength(); + allocateNew(count); + int n = getSizeFromCount(count); if (fieldNode.getNullCount() == 0) { // all defined // create an all 1s buffer - for (int i = 0; i < n; ++i) { + // set full bytes + int fullBytesCount = count / 8; + for (int i = 0; i < fullBytesCount; ++i) { this.data.setByte(i, 0xFF); } + int remainder = count % 8; + // set remaining bits + if (remainder > 0) { + byte bitMask = (byte) (0xFFL >>> ((8 - remainder) & 7));; + this.data.setByte(fullBytesCount, bitMask); + } } else if (fieldNode.getNullCount() == fieldNode.getLength()) { // all null // create an all 0s buffer - for (int i = 0; i < n; ++i) { - this.data.setByte(i, 0x00); - } + zeroVector(); } else { throw new IllegalArgumentException("The buffer can be empty only if there's no data or it's all null or all defined"); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java index 25a63d6fb2d..9dfe8d840e4 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java @@ -19,8 +19,8 @@ import static java.util.Arrays.asList; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import java.io.IOException; import java.util.Collections; @@ -128,17 +128,30 @@ public void testLoadEmptyValidityBuffer() throws IOException { vectorLoader.load(recordBatch); - FieldReader intDefinedReader = newRoot.getVector("intDefined").getReader(); - FieldReader intNullReader = newRoot.getVector("intNull").getReader(); + NullableIntVector intDefinedVector = (NullableIntVector)newRoot.getVector("intDefined"); + NullableIntVector intNullVector = (NullableIntVector)newRoot.getVector("intNull"); for (int i = 0; i < count; i++) { - intDefinedReader.setPosition(i); - intNullReader.setPosition(i); - Integer defined = intDefinedReader.readInteger(); - assertNotNull("#" + i, defined); - assertEquals("#" + i, i, defined.intValue()); - Integer nullVal = intNullReader.readInteger(); - assertNull("#" + i, nullVal); + assertFalse("#" + i, intDefinedVector.getAccessor().isNull(i)); + assertEquals("#" + i, i, intDefinedVector.getAccessor().get(i)); + assertTrue("#" + i, intNullVector.getAccessor().isNull(i)); } + intDefinedVector.getMutator().setSafe(count + 10, 1234); + assertTrue(intDefinedVector.getAccessor().isNull(count + 1)); + // empty slots should still default to unset + intDefinedVector.getMutator().setSafe(count + 1, 789); + assertFalse(intDefinedVector.getAccessor().isNull(count + 1)); + assertEquals(789, intDefinedVector.getAccessor().get(count + 1)); + assertTrue(intDefinedVector.getAccessor().isNull(count)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 2)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 3)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 4)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 5)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 6)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 7)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 8)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 9)); + assertFalse(intDefinedVector.getAccessor().isNull(count + 10)); + assertEquals(1234, intDefinedVector.getAccessor().get(count + 10)); } finally { values.release(); }