diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 537c31e909f..39c4b43f1d3 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1548,7 +1548,6 @@ def _temp_path(): generate_unions_case() .skip_category('Go') - .skip_category('Java') # TODO(ARROW-1692) .skip_category('JS') .skip_category('Rust'), diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index fbbe90cbca7..3deb8d32af8 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -126,6 +126,8 @@ def _gold_tests(self, gold_dir): if f.name == name).skip except StopIteration: skip = set() + if name == 'union' and prefix == '0.17.1': + skip.add("Java") yield datagen.File(name, None, None, skip=skip, path=out_path) def _run_test_cases(self, producer, consumer, case_runner, diff --git a/java/vector/src/main/codegen/templates/DenseUnionVector.java b/java/vector/src/main/codegen/templates/DenseUnionVector.java index d785d39d726..e67445f1bb1 100644 --- a/java/vector/src/main/codegen/templates/DenseUnionVector.java +++ b/java/vector/src/main/codegen/templates/DenseUnionVector.java @@ -34,6 +34,7 @@ import org.apache.arrow.vector.types.UnionMode; import org.apache.arrow.vector.compare.RangeEqualsVisitor; import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.CallBack; import org.apache.arrow.vector.util.TransferPair; @@ -93,7 +94,6 @@ public class DenseUnionVector implements FieldVector { NonNullableStructVector internalStruct; private ArrowBuf typeBuffer; private ArrowBuf offsetBuffer; - private ArrowBuf validityBuffer; /** * The key is type Id, and the value is vector. @@ -104,6 +104,10 @@ public class DenseUnionVector implements FieldVector { * The index is the type id, and the value is the type field. */ private Field[] typeFields = new Field[Byte.MAX_VALUE + 1]; + /** + * The index is the index into the typeFields array, and the value is the logical field id. + */ + private byte[] typeMapFields = new byte[Byte.MAX_VALUE + 1]; /** * The next typd id to allocate. @@ -115,7 +119,6 @@ public class DenseUnionVector implements FieldVector { private final CallBack callBack; private long typeBufferAllocationSizeInBytes; private long offsetBufferAllocationSizeInBytes; - private long validityBufferAllocationSizeInBytes; private final FieldType fieldType; @@ -142,9 +145,6 @@ public DenseUnionVector(String name, BufferAllocator allocator, FieldType fieldT callBack, AbstractStructVector.ConflictPolicy.CONFLICT_REPLACE, false); - this.validityBuffer = allocator.getEmpty(); - this.validityBufferAllocationSizeInBytes = - DataSizeRoundingUtil.divideBy8Ceil(BaseValueVector.INITIAL_VALUE_ALLOCATION); this.typeBuffer = allocator.getEmpty(); this.callBack = callBack; this.typeBufferAllocationSizeInBytes = BaseValueVector.INITIAL_VALUE_ALLOCATION * TYPE_WIDTH; @@ -178,22 +178,17 @@ public List getChildrenFromFields() { @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { - if (ownBuffers.size() != 3) { + if (ownBuffers.size() != 2) { throw new IllegalArgumentException("Illegal buffer count for dense union with type " + getField().getFieldType() + ", expected " + 2 + ", got: " + ownBuffers.size()); } ArrowBuf buffer = ownBuffers.get(0); - validityBuffer.getReferenceManager().release(); - validityBuffer = buffer.getReferenceManager().retain(buffer, allocator); - validityBufferAllocationSizeInBytes = validityBuffer.capacity(); - - buffer = ownBuffers.get(1); typeBuffer.getReferenceManager().release(); typeBuffer = buffer.getReferenceManager().retain(buffer, allocator); typeBufferAllocationSizeInBytes = typeBuffer.capacity(); - buffer = ownBuffers.get(2); + buffer = ownBuffers.get(1); offsetBuffer.getReferenceManager().release(); offsetBuffer = buffer.getReferenceManager().retain(buffer, allocator); offsetBufferAllocationSizeInBytes = offsetBuffer.capacity(); @@ -203,9 +198,8 @@ public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers @Override public List getFieldBuffers() { - List result = new ArrayList<>(1); + List result = new ArrayList<>(2); setReaderAndWriterIndex(); - result.add(validityBuffer); result.add(typeBuffer); result.add(offsetBuffer); @@ -213,9 +207,6 @@ public List getFieldBuffers() { } private void setReaderAndWriterIndex() { - validityBuffer.readerIndex(0); - validityBuffer.writerIndex(DataSizeRoundingUtil.divideBy8Ceil(valueCount)); - typeBuffer.readerIndex(0); typeBuffer.writerIndex(valueCount * TYPE_WIDTH); @@ -243,7 +234,18 @@ public synchronized byte registerNewTypeId(Field field) { typeFields.length + " relative types. Please use union of union instead"); } byte typeId = nextTypeId; + if (fieldType != null) { + int[] typeIds = ((ArrowType.Union) fieldType.getType()).getTypeIds(); + if (typeIds != null) { + int thisTypeId = typeIds[nextTypeId]; + if (thisTypeId > Byte.MAX_VALUE) { + throw new IllegalStateException("Dense union vector types must be bytes. " + thisTypeId + " is too large"); + } + typeId = (byte) thisTypeId; + } + } typeFields[typeId] = field; + typeMapFields[nextTypeId] = typeId; this.nextTypeId += 1; return typeId; } @@ -268,11 +270,11 @@ public long getDataBufferAddress() { @Override public long getValidityBufferAddress() { - return validityBuffer.memoryAddress(); + throw new UnsupportedOperationException(); } @Override - public ArrowBuf getValidityBuffer() { return validityBuffer; } + public ArrowBuf getValidityBuffer() { throw new UnsupportedOperationException(); } @Override public ArrowBuf getOffsetBuffer() { return offsetBuffer; } @@ -283,7 +285,7 @@ public long getValidityBufferAddress() { public ArrowBuf getDataBuffer() { throw new UnsupportedOperationException(); } public StructVector getStruct(byte typeId) { - StructVector structVector = (StructVector) childVectors[typeId]; + StructVector structVector = typeId < 0 ? null : (StructVector) childVectors[typeId]; if (structVector == null) { int vectorCount = internalStruct.size(); structVector = addOrGet(typeId, MinorType.STRUCT, StructVector.class); @@ -307,7 +309,7 @@ public StructVector getStruct(byte typeId) { <#if !minor.typeParams?? || minor.class == "Decimal"> public ${name}Vector get${name}Vector(byte typeId<#if minor.class == "Decimal">, ArrowType arrowType) { - ValueVector vector = childVectors[typeId]; + ValueVector vector = typeId < 0 ? null : childVectors[typeId]; if (vector == null) { int vectorCount = internalStruct.size(); vector = addOrGet(typeId, MinorType.${name?upper_case}<#if minor.class == "Decimal">, arrowType, ${name}Vector.class); @@ -326,7 +328,7 @@ public StructVector getStruct(byte typeId) { public ListVector getList(byte typeId) { - ListVector listVector = (ListVector) childVectors[typeId]; + ListVector listVector = typeId < 0 ? null : (ListVector) childVectors[typeId]; if (listVector == null) { int vectorCount = internalStruct.size(); listVector = addOrGet(typeId, MinorType.LIST, ListVector.class); @@ -346,7 +348,7 @@ public byte getTypeId(int index) { } public ValueVector getVectorByType(byte typeId) { - return childVectors[typeId]; + return typeId < 0 ? null : childVectors[typeId]; } @Override @@ -383,7 +385,7 @@ public boolean allocateNewSafe() { private void allocateTypeBuffer() { typeBuffer = allocator.buffer(typeBufferAllocationSizeInBytes); typeBuffer.readerIndex(0); - typeBuffer.setZero(0, typeBuffer.capacity()); + setNegative(0, typeBuffer.capacity()); } private void allocateOffsetBuffer() { @@ -392,16 +394,10 @@ private void allocateOffsetBuffer() { offsetBuffer.setZero(0, offsetBuffer.capacity()); } - private void allocateValidityBuffer() { - validityBuffer = allocator.buffer(validityBufferAllocationSizeInBytes); - validityBuffer.readerIndex(0); - validityBuffer.setZero(0, validityBuffer.capacity()); - } @Override public void reAlloc() { internalStruct.reAlloc(); - reallocValidityBuffer(); reallocTypeBuffer(); reallocOffsetBuffer(); } @@ -410,32 +406,6 @@ public int getOffset(int index) { return offsetBuffer.getInt(index * OFFSET_WIDTH); } - private void reallocValidityBuffer() { - final long currentBufferCapacity = validityBuffer.capacity(); - long newAllocationSize = currentBufferCapacity * 2; - if (newAllocationSize == 0) { - if (validityBufferAllocationSizeInBytes > 0) { - newAllocationSize = validityBufferAllocationSizeInBytes; - } else { - newAllocationSize = DataSizeRoundingUtil.divideBy8Ceil(BaseValueVector.INITIAL_VALUE_ALLOCATION) * 2; - } - } - - newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); - assert newAllocationSize >= 1; - - if (newAllocationSize > BaseValueVector.MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Unable to expand the buffer"); - } - - final ArrowBuf newBuf = allocator.buffer((int)newAllocationSize); - newBuf.setBytes(0, validityBuffer, 0, currentBufferCapacity); - newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); - validityBuffer.getReferenceManager().release(1); - validityBuffer = newBuf; - validityBufferAllocationSizeInBytes = (int)newAllocationSize; - } - private void reallocTypeBuffer() { final long currentBufferCapacity = typeBuffer.capacity(); long newAllocationSize = currentBufferCapacity * 2; @@ -456,10 +426,10 @@ private void reallocTypeBuffer() { final ArrowBuf newBuf = allocator.buffer((int)newAllocationSize); newBuf.setBytes(0, typeBuffer, 0, currentBufferCapacity); - newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); typeBuffer.getReferenceManager().release(1); typeBuffer = newBuf; typeBufferAllocationSizeInBytes = (int)newAllocationSize; + setNegative(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); } private void reallocOffsetBuffer() { @@ -493,11 +463,7 @@ public void setInitialCapacity(int numRecords) { } @Override public int getValueCapacity() { - long capacity = getValidityBufferValueCapacity(); - long typeCapacity = getTypeBufferValueCapacity(); - if (typeCapacity < capacity) { - capacity = typeCapacity; - } + long capacity = getTypeBufferValueCapacity(); long offsetCapacity = getOffsetBufferValueCapacity(); if (offsetCapacity < capacity) { capacity = offsetCapacity; @@ -517,8 +483,6 @@ public void close() { @Override public void clear() { valueCount = 0; - validityBuffer.getReferenceManager().release(); - validityBuffer = allocator.getEmpty(); typeBuffer.getReferenceManager().release(); typeBuffer = allocator.getEmpty(); offsetBuffer.getReferenceManager().release(); @@ -529,8 +493,7 @@ public void clear() { @Override public void reset() { valueCount = 0; - validityBuffer.setZero(0, validityBuffer.capacity()); - typeBuffer.setZero(0, typeBuffer.capacity()); + setNegative(0, typeBuffer.capacity()); offsetBuffer.setZero(0, offsetBuffer.capacity()); internalStruct.reset(); } @@ -630,6 +593,7 @@ private void createTransferPairs() { ValueVector srcVec = internalStruct.getVectorById(i); ValueVector dstVec = to.internalStruct.getVectorById(i); to.typeFields[i] = typeFields[i]; + to.typeMapFields[i] = typeMapFields[i]; to.childVectors[i] = dstVec; internalTransferPairs[i] = srcVec.makeTransferPair(dstVec); } @@ -638,10 +602,8 @@ private void createTransferPairs() { @Override public void transfer() { to.clear(); - ReferenceManager refManager = validityBuffer.getReferenceManager(); - to.validityBuffer = refManager.transferOwnership(validityBuffer, to.allocator).getTransferredBuffer(); - refManager = typeBuffer.getReferenceManager(); + ReferenceManager refManager = typeBuffer.getReferenceManager(); to.typeBuffer = refManager.transferOwnership(typeBuffer, to.allocator).getTransferredBuffer(); refManager = offsetBuffer.getReferenceManager(); @@ -661,19 +623,6 @@ public void transfer() { public void splitAndTransfer(int startIndex, int length) { to.clear(); - // transfer validity buffer - while (to.getValidityBufferValueCapacity() < length) { - to.reallocValidityBuffer(); - } - for (int i = 0; i < length; i++) { - int validity = BitVectorHelper.get(validityBuffer, startIndex + i); - if (validity == 0) { - BitVectorHelper.unsetBit(to.validityBuffer, i); - } else { - BitVectorHelper.setBit(to.validityBuffer, i); - } - } - // transfer type buffer int startPoint = startIndex * TYPE_WIDTH; int sliceLength = length * TYPE_WIDTH; @@ -758,16 +707,12 @@ public ArrowBuf[] getBuffers(boolean clear) { List list = new java.util.ArrayList<>(); setReaderAndWriterIndex(); if (getBufferSize() != 0) { - list.add(validityBuffer); list.add(typeBuffer); list.add(offsetBuffer); list.addAll(java.util.Arrays.asList(internalStruct.getBuffers(clear))); } if (clear) { valueCount = 0; - validityBuffer.getReferenceManager().retain(); - validityBuffer.close(); - validityBuffer = allocator.getEmpty(); typeBuffer.getReferenceManager().retain(); typeBuffer.close(); typeBuffer = allocator.getEmpty(); @@ -790,13 +735,10 @@ private ValueVector getVector(int index) { } public Object getObject(int index) { - if (isNull(index)) { - return null; - } ValueVector vector = getVector(index); if (vector != null) { int offset = offsetBuffer.getInt(index * OFFSET_WIDTH); - return vector.getObject(offset); + return vector.isNull(offset) ? null : vector.getObject(offset); } return null; } @@ -811,13 +753,18 @@ public int getValueCount() { return valueCount; } + /** + * IMPORTANT: Union types always return non null as there is no validity buffer. + * + * To check validity correctly you must check the underlying vector. + */ public boolean isNull(int index) { - return BitVectorHelper.get(validityBuffer, index) == 0; + return false; } @Override public int getNullCount() { - return BitVectorHelper.getNullCount(validityBuffer, valueCount); + return 0; } public int isSet(int index) { @@ -829,7 +776,6 @@ public int isSet(int index) { public void setValueCount(int valueCount) { this.valueCount = valueCount; while (valueCount > getTypeBufferValueCapacity()) { - reallocValidityBuffer(); reallocTypeBuffer(); reallocOffsetBuffer(); } @@ -837,15 +783,15 @@ public void setValueCount(int valueCount) { } private void setChildVectorValueCounts() { - int [] counts = new int[nextTypeId]; + int [] counts = new int[Byte.MAX_VALUE + 1]; for (int i = 0; i < this.valueCount; i++) { - if (!isNull(i)) { - byte typeId = getTypeId(i); + byte typeId = getTypeId(i); + if (typeId != -1) { counts[typeId] += 1; } } for (int i = 0; i < nextTypeId; i++) { - childVectors[i].setValueCount(counts[i]); + childVectors[typeMapFields[i]].setValueCount(counts[typeMapFields[i]]); } } @@ -893,10 +839,6 @@ public void setSafe(int index, Nullable${name}Holder holder) { while (index >= getOffsetBufferValueCapacity()) { reallocOffsetBuffer(); } - while (index >= getValidityBufferValueCapacity()) { - reallocValidityBuffer(); - } - BitVectorHelper.setBit(validityBuffer, index); byte typeId = getTypeId(index); ${name}Vector vector = get${name}Vector(typeId<#if minor.class == "Decimal">, new ArrowType.Decimal(holder.precision, holder.scale)); int offset = vector.getValueCount(); @@ -923,10 +865,6 @@ private long getOffsetBufferValueCapacity() { return offsetBuffer.capacity() / OFFSET_WIDTH; } - private long getValidityBufferValueCapacity() { - return validityBuffer.capacity() * 8; - } - @Override public int hashCode(int index, ArrowBufHasher hasher) { if (isNull(index)) { @@ -950,4 +888,10 @@ public OUT accept(VectorVisitor visitor, IN value) { public String getName() { return name; } + + private void setNegative(long start, long end) { + for (long i = start;i < end; i++) { + typeBuffer.setByte(i, -1); + } + } } diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index b89870dcf06..26d1be8df44 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -20,16 +20,21 @@ import org.apache.arrow.memory.ReferenceManager; import org.apache.arrow.memory.util.CommonUtil; import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.DataSizeRoundingUtil; import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.complex.AbstractStructVector; import org.apache.arrow.vector.complex.NonNullableStructVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.UnionMode; import org.apache.arrow.vector.compare.RangeEqualsVisitor; import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.CallBack; @@ -50,6 +55,7 @@ import org.apache.arrow.vector.compare.VectorVisitor; import org.apache.arrow.vector.complex.impl.ComplexCopier; import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.util.DataSizeRoundingUtil; import org.apache.arrow.vector.util.ValueVectorUtility; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.memory.util.ArrowBufPointer; @@ -100,6 +106,7 @@ public class UnionVector implements FieldVector { private int typeBufferAllocationSizeInBytes; private final FieldType fieldType; + private final Field[] typeIds = new Field[Byte.MAX_VALUE + 1]; private static final byte TYPE_WIDTH = 1; private static final FieldType INTERNAL_STRUCT_TYPE = new FieldType(false /*nullable*/, @@ -143,6 +150,17 @@ public MinorType getMinorType() { @Override public void initializeChildrenFromFields(List children) { + int count = 0; + for (Field child: children) { + int typeId = Types.getMinorTypeForArrowType(child.getType()).ordinal(); + if (fieldType != null) { + int[] typeIds = ((ArrowType.Union)fieldType.getType()).getTypeIds(); + if (typeIds != null) { + typeId = typeIds[count++]; + } + } + typeIds[typeId] = child; + } internalStruct.initializeChildrenFromFields(children); } @@ -154,9 +172,8 @@ public List getChildrenFromFields() { @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { if (ownBuffers.size() != 1) { - throw new IllegalArgumentException("Illegal buffer count, expected " + 1 + ", got: " + ownBuffers.size()); + throw new IllegalArgumentException("Illegal buffer count, expected 1, got: " + ownBuffers.size()); } - ArrowBuf buffer = ownBuffers.get(0); typeBuffer.getReferenceManager().release(); typeBuffer = buffer.getReferenceManager().retain(buffer, allocator); @@ -192,16 +209,25 @@ private FieldType fieldType(MinorType type) { return FieldType.nullable(type.getType()); } - private T addOrGet(MinorType minorType, Class c) { - return internalStruct.addOrGet(fieldName(minorType), fieldType(minorType), c); + private T addOrGet(Types.MinorType minorType, Class c) { + return addOrGet(null, minorType, c); + } + + private T addOrGet(String name, Types.MinorType minorType, ArrowType arrowType, Class c) { + return internalStruct.addOrGet(name == null ? fieldName(minorType) : name, FieldType.nullable(arrowType), c); } - private T addOrGet(MinorType minorType, ArrowType arrowType, Class c) { - return internalStruct.addOrGet(fieldName(minorType), FieldType.nullable(arrowType), c); + private T addOrGet(String name, Types.MinorType minorType, Class c) { + return internalStruct.addOrGet(name == null ? fieldName(minorType) : name, fieldType(minorType), c); } + @Override public long getValidityBufferAddress() { + throw new UnsupportedOperationException(); + } + + public long getTypeBufferAddress() { return typeBuffer.memoryAddress(); } @@ -215,8 +241,12 @@ public long getOffsetBufferAddress() { throw new UnsupportedOperationException(); } + public ArrowBuf getTypeBuffer() { + return typeBuffer; + } + @Override - public ArrowBuf getValidityBuffer() { return typeBuffer; } + public ArrowBuf getValidityBuffer() { throw new UnsupportedOperationException(); } @Override public ArrowBuf getDataBuffer() { throw new UnsupportedOperationException(); } @@ -247,10 +277,14 @@ public StructVector getStruct() { private ${name}Vector ${uncappedName}Vector; - public ${name}Vector get${name}Vector(<#if minor.class == "Decimal">ArrowType arrowType) { + public ${name}Vector get${name}Vector(<#if minor.class == "Decimal"> ArrowType arrowType) { + return get${name}Vector(null<#if minor.class == "Decimal">, arrowType); + } + + public ${name}Vector get${name}Vector(String name<#if minor.class == "Decimal">, ArrowType arrowType) { if (${uncappedName}Vector == null) { int vectorCount = internalStruct.size(); - ${uncappedName}Vector = addOrGet(MinorType.${name?upper_case},<#if minor.class == "Decimal"> arrowType, ${name}Vector.class); + ${uncappedName}Vector = addOrGet(name, MinorType.${name?upper_case},<#if minor.class == "Decimal"> arrowType, ${name}Vector.class); if (internalStruct.size() > vectorCount) { ${uncappedName}Vector.allocateNew(); if (callBack != null) { @@ -481,7 +515,7 @@ public TransferImpl(UnionVector to) { @Override public void transfer() { to.clear(); - final ReferenceManager refManager = typeBuffer.getReferenceManager(); + ReferenceManager refManager = typeBuffer.getReferenceManager(); to.typeBuffer = refManager.transferOwnership(typeBuffer, to.allocator).getTransferredBuffer(); internalStructVectorTransferPair.transfer(); to.valueCount = valueCount; @@ -493,6 +527,7 @@ public void splitAndTransfer(int startIndex, int length) { Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount, "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); to.clear(); + internalStructVectorTransferPair.splitAndTransfer(startIndex, length); final int startPoint = startIndex * TYPE_WIDTH; final int sliceLength = length * TYPE_WIDTH; @@ -586,7 +621,16 @@ public ValueVector getVectorByType(int typeId) { } public ValueVector getVectorByType(int typeId, ArrowType arrowType) { - switch (MinorType.values()[typeId]) { + Field type = typeIds[typeId]; + Types.MinorType minorType; + String name = null; + if (type == null) { + minorType = Types.MinorType.values()[typeId]; + } else { + minorType = Types.getMinorTypeForArrowType(type.getType()); + name = type.getName(); + } + switch (minorType) { case NULL: return null; <#list vv.types as type> @@ -596,7 +640,7 @@ public ValueVector getVectorByType(int typeId, ArrowType arrowType) { <#assign uncappedName = name?uncap_first/> <#if !minor.typeParams?? || minor.class == "Decimal" > case ${name?upper_case}: - return get${name}Vector(<#if minor.class == "Decimal">arrowType); + return get${name}Vector(name<#if minor.class == "Decimal">, arrowType); @@ -612,7 +656,7 @@ public ValueVector getVectorByType(int typeId, ArrowType arrowType) { public Object getObject(int index) { ValueVector vector = getVector(index); if (vector != null) { - return vector.getObject(index); + return vector.isNull(index) ? null : vector.getObject(index); } return null; } @@ -634,23 +678,18 @@ public int getValueCount() { return valueCount; } + /** + * IMPORTANT: Union types always return non null as there is no validity buffer. + * + * To check validity correctly you must check the underlying vector. + */ public boolean isNull(int index) { - ValueVector vec = getVector(index); - if (vec == null) { - return true; - } - return vec.isNull(index); + return false; } @Override public int getNullCount() { - int nullCount = 0; - for (int i = 0; i < getValueCount(); i++) { - if (isNull(i)) { - nullCount++; - } - } - return nullCount; + return 0; } public int isSet(int index) { @@ -713,7 +752,7 @@ public void setSafe(int index, UnionHolder holder, ArrowType arrowType) { <#if !minor.typeParams?? || minor.class == "Decimal" > public void setSafe(int index, Nullable${name}Holder holder<#if minor.class == "Decimal">, ArrowType arrowType) { setType(index, MinorType.${name?upper_case}); - get${name}Vector(<#if minor.class == "Decimal">arrowType).setSafe(index, holder); + get${name}Vector(null<#if minor.class == "Decimal">, arrowType).setSafe(index, holder); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java index ff2a786b2f0..0bd64c06eab 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java @@ -33,7 +33,7 @@ public enum BufferType { DATA("DATA"), OFFSET("OFFSET"), VALIDITY("VALIDITY"), - TYPE("TYPE"); + TYPE("TYPE_ID"); private final String name; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java b/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java index 80d1f4f2ea4..db9a8acfd83 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java @@ -223,10 +223,7 @@ public int getNullCount() { @Override public boolean isNull(int index) { - if (index < valueCount) { - return true; - } - throw new IndexOutOfBoundsException(); + return true; } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java index 191597ad877..501ca98c0a4 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java @@ -71,8 +71,6 @@ public TypeLayout visit(Union type) { switch (type.getMode()) { case Dense: vectors = asList( - // TODO: validate this - BufferLayout.validityVector(), BufferLayout.typeBuffer(), BufferLayout.offsetBuffer() // offset to find the vector ); @@ -278,7 +276,7 @@ public Integer visit(Union type) { switch (type.getMode()) { case Dense: // TODO: validate this - return 3; + return 2; case Sparse: // type buffer return 1; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java index 54f47007fec..13935ef4f10 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java @@ -59,6 +59,7 @@ import org.apache.arrow.vector.dictionary.DictionaryProvider; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; import org.apache.arrow.vector.util.DecimalUtility; @@ -69,7 +70,9 @@ import com.fasterxml.jackson.core.JsonParseException; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.databind.MapperFeature; import com.fasterxml.jackson.databind.MappingJsonFactory; +import com.fasterxml.jackson.databind.ObjectMapper; /** * A reader for JSON files that translates them into vectors. This reader is used for integration tests. @@ -92,7 +95,10 @@ public class JsonFileReader implements AutoCloseable, DictionaryProvider { public JsonFileReader(File inputFile, BufferAllocator allocator) throws JsonParseException, IOException { super(); this.allocator = allocator; - MappingJsonFactory jsonFactory = new MappingJsonFactory(); + MappingJsonFactory jsonFactory = new MappingJsonFactory(new ObjectMapper() + //ignore case for enums + .configure(MapperFeature.ACCEPT_CASE_INSENSITIVE_ENUMS, true) + ); this.parser = jsonFactory.createParser(inputFile); // Allow reading NaN for floating point values this.parser.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, true); @@ -173,7 +179,6 @@ public boolean read(VectorSchemaRoot root) throws IOException { if (t == START_OBJECT) { { int count = readNextField("count", Integer.class); - root.setRowCount(count); nextFieldIs("columns"); readToken(START_ARRAY); { @@ -183,6 +188,7 @@ public boolean read(VectorSchemaRoot root) throws IOException { } } readToken(END_ARRAY); + root.setRowCount(count); } readToken(END_OBJECT); return true; @@ -707,7 +713,7 @@ private void readFromJsonIntoVector(Field field, FieldVector vector) throws Json BufferType bufferType = vectorTypes.get(v); nextFieldIs(bufferType.getName()); int innerBufferValueCount = valueCount; - if (bufferType.equals(OFFSET)) { + if (bufferType.equals(OFFSET) && !field.getType().getTypeID().equals(ArrowType.ArrowTypeID.Union)) { /* offset buffer has 1 additional value capacity */ innerBufferValueCount = valueCount + 1; } @@ -720,7 +726,10 @@ private void readFromJsonIntoVector(Field field, FieldVector vector) throws Json return; } - final int nullCount = BitVectorHelper.getNullCount(vectorBuffers[0], valueCount); + int nullCount = 0; + if (!(vector.getField().getFieldType().getType() instanceof ArrowType.Union)) { + nullCount = BitVectorHelper.getNullCount(vectorBuffers[0], valueCount); + } final ArrowFieldNode fieldNode = new ArrowFieldNode(valueCount, nullCount); vector.loadFieldBuffers(fieldNode, Arrays.asList(vectorBuffers)); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java index d0f0f51e38e..e210b002890 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java @@ -219,11 +219,18 @@ private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOE BufferType bufferType = vectorTypes.get(v); ArrowBuf vectorBuffer = vectorBuffers.get(v); generator.writeArrayFieldStart(bufferType.getName()); - final int bufferValueCount = (bufferType.equals(OFFSET)) ? valueCount + 1 : valueCount; + final int bufferValueCount = (bufferType.equals(OFFSET) && vector.getMinorType() != MinorType.DENSEUNION) ? + valueCount + 1 : valueCount; for (int i = 0; i < bufferValueCount; i++) { if (bufferType.equals(DATA) && (vector.getMinorType() == MinorType.VARCHAR || vector.getMinorType() == MinorType.VARBINARY)) { writeValueToGenerator(bufferType, vectorBuffer, vectorBuffers.get(v - 1), vector, i); + } else if (bufferType.equals(OFFSET) && vector.getValueCount() == 0 && + (vector.getMinorType() == MinorType.VARBINARY || vector.getMinorType() == MinorType.VARCHAR)) { + ArrowBuf vectorBufferTmp = vector.getAllocator().buffer(4); + vectorBufferTmp.setInt(0, 0); + writeValueToGenerator(bufferType, vectorBufferTmp, null, vector, i); + vectorBufferTmp.release(); } else { writeValueToGenerator(bufferType, vectorBuffer, null, vector, i); } @@ -353,7 +360,7 @@ private void writeValueToGenerator( case VARBINARY: { Preconditions.checkNotNull(offsetBuffer); String hexString = Hex.encodeHexString(BaseVariableWidthVector.get(buffer, - offsetBuffer, index)); + offsetBuffer, index)); generator.writeObject(hexString); break; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 0ec427e5a03..886478ce403 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -822,7 +822,14 @@ public MinorType visit(FixedSizeList type) { @Override public MinorType visit(Union type) { - return MinorType.UNION; + switch (type.getMode()) { + case Sparse: + return MinorType.UNION; + case Dense: + return MinorType.DENSEUNION; + default: + throw new IllegalArgumentException("only Dense or Sparse unions supported: " + type); + } } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java index 277f793d13f..741972b4ad2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java @@ -151,6 +151,8 @@ static boolean equals(ArrowType type, final Object o1, final Object o2) { } else if (type instanceof ArrowType.Binary || type instanceof ArrowType.LargeBinary || type instanceof ArrowType.FixedSizeBinary) { return Arrays.equals((byte[]) o1, (byte[]) o2); + } else if (o1 instanceof byte[] && o2 instanceof byte[]) { + return Arrays.equals((byte[]) o1, (byte[]) o2); } return Objects.equals(o1, o2); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java index b49f489e061..1e6a203b3e4 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -374,18 +374,18 @@ public ValueVector visit(UnionVector deltaVector, Void value) { } // append type buffers - PlatformDependent.copyMemory(deltaVector.getValidityBufferAddress(), - targetUnionVector.getValidityBufferAddress() + targetVector.getValueCount(), + PlatformDependent.copyMemory(deltaVector.getTypeBufferAddress(), + targetUnionVector.getTypeBufferAddress() + targetVector.getValueCount(), deltaVector.getValueCount()); // build the hash set for all types HashSet targetTypes = new HashSet<>(); for (int i = 0; i < targetUnionVector.getValueCount(); i++) { - targetTypes.add((int) targetUnionVector.getValidityBuffer().getByte(i)); + targetTypes.add(targetUnionVector.getTypeValue(i)); } HashSet deltaTypes = new HashSet<>(); for (int i = 0; i < deltaVector.getValueCount(); i++) { - deltaTypes.add((int) deltaVector.getValidityBuffer().getByte(i)); + deltaTypes.add(deltaVector.getTypeValue(i)); } // append child vectors @@ -432,11 +432,6 @@ public ValueVector visit(DenseUnionVector deltaVector, Void value) { targetDenseUnionVector.reAlloc(); } - // append validity buffers - BitVectorHelper.concatBits( - targetVector.getValidityBuffer(), targetVector.getValueCount(), - deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); - // append type buffers PlatformDependent.copyMemory(deltaVector.getTypeBuffer().memoryAddress(), targetDenseUnionVector.getTypeBuffer() .memoryAddress() + targetVector.getValueCount(), diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java index dcd60507fa8..d7fccd1ed15 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java @@ -89,12 +89,12 @@ public void testDenseUnionVector() throws Exception { assertEquals(false, unionVector.isNull(0)); assertEquals(100, unionVector.getObject(0)); - assertEquals(true, unionVector.isNull(1)); + assertNull(unionVector.getObject(1)); assertEquals(false, unionVector.isNull(2)); assertEquals(100, unionVector.getObject(2)); - assertEquals(true, unionVector.isNull(3)); + assertNull(unionVector.getObject(3)); } } @@ -136,12 +136,12 @@ public void testTransfer() throws Exception { assertFalse(destVector.isNull(1)); assertEquals(false, destVector.getObject(1)); - assertTrue(destVector.isNull(2)); + assertNull(destVector.getObject(2)); assertFalse(destVector.isNull(3)); assertEquals(10, destVector.getObject(3)); - assertTrue(destVector.isNull(4)); + assertNull(destVector.getObject(4)); assertFalse(destVector.isNull(5)); assertEquals(false, destVector.getObject(5)); @@ -385,7 +385,6 @@ public void testGetBufferAddress() throws Exception { List buffers = vector.getFieldBuffers(); - long bitAddress = vector.getValidityBufferAddress(); long offsetAddress = vector.getOffsetBufferAddress(); try { @@ -396,9 +395,8 @@ public void testGetBufferAddress() throws Exception { assertTrue(error); } - assertEquals(3, buffers.size()); - assertEquals(bitAddress, buffers.get(0).memoryAddress()); - assertEquals(offsetAddress, buffers.get(2).memoryAddress()); + assertEquals(2, buffers.size()); + assertEquals(offsetAddress, buffers.get(1).memoryAddress()); } } @@ -461,15 +459,12 @@ public void testMultipleStructs() { unionVector.setTypeId(0, typeId1); offsetBuf.setInt(0, 0); - BitVectorHelper.setBit(unionVector.getValidityBuffer(), 0); unionVector.setTypeId(1, typeId2); offsetBuf.setInt(DenseUnionVector.OFFSET_WIDTH, 0); - BitVectorHelper.setBit(unionVector.getValidityBuffer(), 1); unionVector.setTypeId(2, typeId1); offsetBuf.setInt(DenseUnionVector.OFFSET_WIDTH * 2, 1); - BitVectorHelper.setBit(unionVector.getValidityBuffer(), 2); unionVector.setValueCount(3); @@ -526,25 +521,19 @@ public void testMultipleVarChars() { // slot 0 points to child1 unionVector.setTypeId(0, typeId1); offsetBuf.setInt(0, 0); - BitVectorHelper.setBit(unionVector.getValidityBuffer(), 0); // slot 1 points to child2 unionVector.setTypeId(1, typeId2); offsetBuf.setInt(DenseUnionVector.OFFSET_WIDTH, 0); - BitVectorHelper.setBit(unionVector.getValidityBuffer(), 1); // slot 2 points to child2 unionVector.setTypeId(2, typeId2); offsetBuf.setInt(DenseUnionVector.OFFSET_WIDTH * 2, 1); - BitVectorHelper.setBit(unionVector.getValidityBuffer(), 2); - // slot 3 points to null - BitVectorHelper.unsetBit(unionVector.getValidityBuffer(), 3); // slot 4 points to child1 unionVector.setTypeId(4, typeId1); offsetBuf.setInt(DenseUnionVector.OFFSET_WIDTH * 4, 1); - BitVectorHelper.setBit(unionVector.getValidityBuffer(), 4); unionVector.setValueCount(5); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestTypeLayout.java b/java/vector/src/test/java/org/apache/arrow/vector/TestTypeLayout.java index de4b87272a8..18175276737 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestTypeLayout.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestTypeLayout.java @@ -34,7 +34,7 @@ public void testTypeBufferCount() { ArrowType type = new ArrowType.Int(8, true); assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); - type = new ArrowType.Union(UnionMode.Sparse, new int[3]); + type = new ArrowType.Union(UnionMode.Sparse, new int[2]); assertEquals(TypeLayout.getTypeBufferCount(type), TypeLayout.getTypeLayout(type).getBufferLayouts().size()); type = new ArrowType.Union(UnionMode.Dense, new int[1]); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java index e37ff01f9e1..15d81ab6799 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java @@ -19,6 +19,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import java.util.ArrayList; @@ -81,12 +82,12 @@ public void testUnionVector() throws Exception { assertEquals(false, unionVector.isNull(0)); assertEquals(100, unionVector.getObject(0)); - assertEquals(true, unionVector.isNull(1)); + assertNull(unionVector.getObject(1)); assertEquals(false, unionVector.isNull(2)); assertEquals(100, unionVector.getObject(2)); - assertEquals(true, unionVector.isNull(3)); + assertNull(unionVector.getObject(3)); } } @@ -126,12 +127,12 @@ public void testTransfer() throws Exception { assertFalse(destVector.isNull(1)); assertEquals(false, destVector.getObject(1)); - assertTrue(destVector.isNull(2)); + assertNull(destVector.getObject(2)); assertFalse(destVector.isNull(3)); assertEquals(10, destVector.getObject(3)); - assertTrue(destVector.isNull(4)); + assertNull(destVector.getObject(4)); assertFalse(destVector.isNull(5)); assertEquals(false, destVector.getObject(5)); @@ -365,7 +366,6 @@ public void testGetBufferAddress() throws Exception { List buffers = vector.getFieldBuffers(); - long bitAddress = vector.getValidityBufferAddress(); try { long offsetAddress = vector.getOffsetBufferAddress(); @@ -385,7 +385,6 @@ public void testGetBufferAddress() throws Exception { } assertEquals(1, buffers.size()); - assertEquals(bitAddress, buffers.get(0).memoryAddress()); } } @@ -407,7 +406,7 @@ public void testSetGetNull() { holder.isSet = 0; srcVector.setSafe(0, holder); - assertTrue(srcVector.isNull(0)); + assertNull(srcVector.getObject(0)); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index a4b9bffead9..c67fdefb24d 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -2798,36 +2798,36 @@ public void testEmptyBufBehavior() { } try (final UnionVector vector = UnionVector.empty("v", allocator)) { - assertEquals(1, vector.getValidityBuffer().refCnt()); - assertEquals(0, vector.getValidityBuffer().capacity()); + assertEquals(1, vector.getTypeBuffer().refCnt()); + assertEquals(0, vector.getTypeBuffer().capacity()); vector.setValueCount(10); vector.allocateNewSafe(); - assertEquals(1, vector.getValidityBuffer().refCnt()); - assertEquals(4096, vector.getValidityBuffer().capacity()); + assertEquals(1, vector.getTypeBuffer().refCnt()); + assertEquals(4096, vector.getTypeBuffer().capacity()); vector.close(); - assertEquals(1, vector.getValidityBuffer().refCnt()); - assertEquals(0, vector.getValidityBuffer().capacity()); + assertEquals(1, vector.getTypeBuffer().refCnt()); + assertEquals(0, vector.getTypeBuffer().capacity()); } try (final DenseUnionVector vector = DenseUnionVector.empty("v", allocator)) { - assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(1, vector.getTypeBuffer().refCnt()); assertEquals(1, vector.getOffsetBuffer().refCnt()); - assertEquals(0, vector.getValidityBuffer().capacity()); + assertEquals(0, vector.getTypeBuffer().capacity()); assertEquals(0, vector.getOffsetBuffer().capacity()); vector.setValueCount(valueCount); vector.allocateNew(); - assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(1, vector.getTypeBuffer().refCnt()); assertEquals(1, vector.getOffsetBuffer().refCnt()); - assertEquals(0, vector.getValidityBuffer().capacity()); + assertEquals(4096, vector.getTypeBuffer().capacity()); assertEquals(16384, vector.getOffsetBuffer().capacity()); vector.close(); - assertEquals(1, vector.getValidityBuffer().refCnt()); + assertEquals(1, vector.getTypeBuffer().refCnt()); assertEquals(1, vector.getOffsetBuffer().refCnt()); - assertEquals(0, vector.getValidityBuffer().capacity()); + assertEquals(0, vector.getTypeBuffer().capacity()); assertEquals(0, vector.getOffsetBuffer().capacity()); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java index c9214bdb019..9dce33122e8 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java @@ -19,7 +19,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertNull; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.DirtyRootAllocator; @@ -93,7 +93,7 @@ public void testPromoteToUnion() throws Exception { assertFalse("2 shouldn't be null", uv.isNull(2)); assertEquals(10, uv.getObject(2)); - assertTrue("3 should be null", uv.isNull(3)); + assertNull("3 should be null", uv.getObject(3)); assertFalse("4 shouldn't be null", uv.isNull(4)); assertEquals(100, uv.getObject(4));