diff --git a/integration/integration_test.py b/integration/integration_test.py index 79b098a614a..301017cac3d 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -463,6 +463,46 @@ def generate_column(self, size, name=None): return self.column_class(name, size, is_valid, values) +class FixedSizeBinaryType(PrimitiveType): + + def __init__(self, name, byte_width, nullable=True): + super(FixedSizeBinaryType, self).__init__(name, nullable=nullable) + self.byte_width = byte_width + + @property + def numpy_type(self): + return object + + @property + def column_class(self): + return FixedSizeBinaryColumn + + def _get_type(self): + return OrderedDict([('name', 'fixedsizebinary'), ('byteWidth', self.byte_width)]) + + def _get_type_layout(self): + return OrderedDict([ + ('vectors', + [OrderedDict([('type', 'VALIDITY'), + ('typeBitWidth', 1)]), + OrderedDict([('type', 'DATA'), + ('typeBitWidth', self.byte_width)])])]) + + def generate_column(self, size, name=None): + is_valid = self._make_is_valid(size) + values = [] + + for i in range(size): + draw = (np.random.randint(0, 255, size=self.byte_width) + .astype(np.uint8) + .tostring()) + values.append(draw) + + if name is None: + name = self.name + return self.column_class(name, size, is_valid, values) + + class StringType(BinaryType): @property @@ -525,6 +565,22 @@ def _get_buffers(self): ] +class FixedSizeBinaryColumn(PrimitiveColumn): + + def _encode_value(self, x): + return ''.join('{:02x}'.format(c).upper() for c in x) + + def _get_buffers(self): + data = [] + for i, v in enumerate(self.values): + data.append(self._encode_value(v)) + + return [ + ('VALIDITY', [int(x) for x in self.is_valid]), + ('DATA', data) + ] + + class StringColumn(BinaryColumn): def _encode_value(self, x): @@ -719,6 +775,9 @@ def get_field(name, type_, nullable=True): return BinaryType(name, nullable=nullable) elif type_ == 'utf8': return StringType(name, nullable=nullable) + elif type_.startswith('fixedsizebinary_'): + byte_width = int(type_.split('_')[1]) + return FixedSizeBinaryType(name, byte_width=byte_width, nullable=nullable) dtype = np.dtype(type_) @@ -751,7 +810,8 @@ def _generate_file(name, fields, batch_sizes, dictionaries=None): def generate_primitive_case(batch_sizes, name='primitive'): types = ['bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', - 'float32', 'float64', 'binary', 'utf8'] + 'float32', 'float64', 'binary', 'utf8', + 'fixedsizebinary_19', 'fixedsizebinary_120'] fields = [] diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index ce92c1333a5..63b193fc66a 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -57,6 +57,11 @@ fields: [], complex: false }, + { + name: "FixedSizeBinary", + fields: [{name: "byteWidth", type: int}], + complex: false + } { name: "Bool", fields: [], diff --git a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd index 970d887c760..f294a8e83a4 100644 --- a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd +++ b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd @@ -122,6 +122,21 @@ } ] }, + { + major: "Fixed", + width: -1, + javaType: "byte[]", + boxedType: "ArrowBuf", + minor: [ + { + class: "FixedSizeBinary", + typeParams: [ {name: "byteWidth", type: "int"} ], + arrowType: "org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary", + friendlyType: "byte[]", + fields: [{name: "buffer", type: "ArrowBuf"}], + } + ] + }, { major: "VarLen", width: 4, diff --git a/java/vector/src/main/codegen/templates/HolderReaderImpl.java b/java/vector/src/main/codegen/templates/HolderReaderImpl.java index c2aa83757b9..f1c10d1021b 100644 --- a/java/vector/src/main/codegen/templates/HolderReaderImpl.java +++ b/java/vector/src/main/codegen/templates/HolderReaderImpl.java @@ -128,6 +128,10 @@ public void read(Nullable${name}Holder h) { holder.buffer.getBytes(holder.start, bytes, 0, ${type.width}); ${friendlyType} value = new BigDecimal(new BigInteger(bytes), holder.scale); return value; + <#elseif minor.class == "FixedSizeBinary"> + byte[] value = new byte [holder.byteWidth]; + holder.buffer.getBytes(0, value, 0, holder.byteWidth); + return value; <#else> ${friendlyType} value = new ${friendlyType}(this.holder.value); return value; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java index 702db9f5281..cbc56fe3dde 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -42,7 +42,7 @@ */ public abstract class BaseFixedWidthVector extends BaseValueVector implements FixedWidthVector, FieldVector, VectorDefinitionSetter { - private final byte typeWidth; + private final int typeWidth; protected int valueAllocationSizeInBytes; protected int validityAllocationSizeInBytes; @@ -54,7 +54,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector protected int valueCount; public BaseFixedWidthVector(final String name, final BufferAllocator allocator, - FieldType fieldType, final byte typeWidth) { + FieldType fieldType, final int typeWidth) { super(name, allocator); this.typeWidth = typeWidth; field = new Field(name, fieldType, null); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index 3887da4a618..b7b7b990630 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -58,7 +58,7 @@ public BitVector(String name, BufferAllocator allocator) { * @param allocator allocator for memory management. */ public BitVector(String name, FieldType fieldType, BufferAllocator allocator) { - super(name, allocator, fieldType, (byte) 0); + super(name, allocator, fieldType, 0); reader = new BitReaderImpl(BitVector.this); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java index f6529d8e55b..0faab7b2b29 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java @@ -70,7 +70,7 @@ public static BufferLayout dataBuffer(int typeBitWidth) { case 128: return VALUES_128; default: - throw new IllegalArgumentException("only 8, 16, 32, or 64 bits supported"); + throw new IllegalArgumentException("only 8, 16, 32, 64, or 128 bits supported"); } } @@ -90,7 +90,7 @@ public static BufferLayout byteVector() { private final BufferType type; - private BufferLayout(BufferType type, int typeBitWidth) { + BufferLayout(BufferType type, int typeBitWidth) { super(); this.type = Preconditions.checkNotNull(type); this.typeBitWidth = (short) typeBitWidth; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java new file mode 100644 index 00000000000..232e6564ace --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java @@ -0,0 +1,404 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.FixedSizeBinaryReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.FixedSizeBinaryHolder; +import org.apache.arrow.vector.holders.NullableFixedSizeBinaryHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + + +/** + * FixedSizeBinaryVector implements a fixed width vector of + * binary values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class FixedSizeBinaryVector extends BaseFixedWidthVector { + private final int byteWidth; + private final FieldReader reader; + + /** + * Instantiate a FixedSizeBinaryVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + * @param byteWidth byte width of the binary values + */ + public FixedSizeBinaryVector(String name, BufferAllocator allocator, int byteWidth) { + this(name, FieldType.nullable(new FixedSizeBinary(byteWidth)), allocator); + } + + /** + * Instantiate a FixedSizeBinaryVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public FixedSizeBinaryVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, ((FixedSizeBinary) fieldType.getType()).getByteWidth()); + reader = new FixedSizeBinaryReaderImpl(FixedSizeBinaryVector.this); + byteWidth = ((FixedSizeBinary) fieldType.getType()).getByteWidth(); + } + + /** + * Get a reader that supports reading values from this vector + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.FIXEDSIZEBINARY; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public byte[] get(int index) { + assert index >= 0; + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + final byte[] dst = new byte[byteWidth]; + valueBuffer.getBytes(index * byteWidth, dst, 0, byteWidth); + return dst; + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + * @param holder nullable holder to carry the buffer + */ + public void get(int index, NullableFixedSizeBinaryHolder holder) { + assert index >= 0; + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.buffer = valueBuffer.slice(index * byteWidth, byteWidth); + } + + /** + * Same as {@link #get(int)}. + * + * @param index position of element + * @return element at given index + */ + @Override + public byte[] getObject(int index) { + assert index >= 0; + if (isSet(index) == 0) { + return null; + } else { + final byte[] dst = new byte[byteWidth]; + valueBuffer.getBytes(index * byteWidth, dst, 0, byteWidth); + return dst; + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, FixedSizeBinaryVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + from.valueBuffer.getBytes(fromIndex * byteWidth, valueBuffer, + thisIndex * byteWidth, byteWidth); + } + + /** + * Same as {@link #copyFrom(int, int, FixedSizeBinaryVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, FixedSizeBinaryVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + public int getByteWidth() { + return byteWidth; + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + public void set(int index, byte[] value) { + assert index >= 0; + assert byteWidth <= value.length; + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + valueBuffer.setBytes(index * byteWidth, value, 0, byteWidth); + } + + public void setSafe(int index, byte[] value) { + handleSafe(index); + set(index, value); + } + + public void set(int index, int isSet, byte[] value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + public void setSafe(int index, int isSet, byte[] value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param buffer ArrowBuf containing binary value. + */ + public void set(int index, ArrowBuf buffer) { + assert index >= 0; + assert byteWidth <= buffer.capacity(); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + valueBuffer.setBytes(index * byteWidth, buffer, 0, byteWidth); + } + + /** + * Same as {@link #set(int, ArrowBuf)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param buffer ArrowBuf containing binary value. + */ + public void setSafe(int index, ArrowBuf buffer) { + handleSafe(index); + set(index, buffer); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param buffer ArrowBuf containing binary value. + */ + public void set(int index, int isSet, ArrowBuf buffer) { + if (isSet > 0) { + set(index, buffer); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, ArrowBuf)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param buffer ArrowBuf containing binary value. + */ + public void setSafe(int index, int isSet, ArrowBuf buffer) { + handleSafe(index); + set(index, isSet, buffer); + } + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, FixedSizeBinaryHolder holder) { + assert holder.byteWidth == byteWidth; + set(index, holder.buffer); + } + + /** + * Same as {@link #set(int, FixedSizeBinaryHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void setSafe(int index, FixedSizeBinaryHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, NullableFixedSizeBinaryHolder holder) { + assert holder.byteWidth == byteWidth; + if (holder.isSet < 0) { + throw new IllegalArgumentException("holder has a negative isSet value"); + } else if (holder.isSet > 0) { + set(index, holder.buffer); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, NullableFixedSizeBinaryHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void setSafe(int index, NullableFixedSizeBinaryHolder holder) { + handleSafe(index); + set(index, holder); + } + + public void setNull(int index) { + handleSafe(index); + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + * This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static byte[] get(final ArrowBuf buffer, final int index, final int byteWidth) { + final byte[] dst = new byte[byteWidth]; + buffer.getBytes(index * byteWidth, dst, 0, byteWidth); + return dst; + } + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising of this and and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((FixedSizeBinaryVector) to); + } + + private class TransferImpl implements TransferPair { + FixedSizeBinaryVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new FixedSizeBinaryVector(ref, allocator, FixedSizeBinaryVector.this.byteWidth); + } + + public TransferImpl(FixedSizeBinaryVector to) { + this.to = to; + } + + @Override + public FixedSizeBinaryVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, FixedSizeBinaryVector.this); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java index d6f32b4b4b1..4c05b97b4ac 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java @@ -31,6 +31,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Bool; import org.apache.arrow.vector.types.pojo.ArrowType.Date; import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList; import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; import org.apache.arrow.vector.types.pojo.ArrowType.Int; @@ -136,6 +137,11 @@ public TypeLayout visit(Decimal type) { return newFixedWidthTypeLayout(BufferLayout.dataBuffer(128)); } + @Override + public TypeLayout visit(FixedSizeBinary type) { + return newFixedWidthTypeLayout(new BufferLayout(BufferType.DATA, type.getByteWidth() * 8)); + } + @Override public TypeLayout visit(Bool type) { return newFixedWidthTypeLayout(BufferLayout.booleanVector()); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java index 762a442c983..08df7b6d931 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -136,7 +136,7 @@ public static ValueVector decode(ValueVector indices, Dictionary dictionary) { private static void validateType(MinorType type) { // byte arrays don't work as keys in our dictionary map - we could wrap them with something to // implement equals and hashcode if we want that functionality - if (type == MinorType.VARBINARY || type == MinorType.LIST || type == MinorType.MAP || type == MinorType.UNION) { + if (type == MinorType.VARBINARY || type == MinorType.FIXEDSIZEBINARY || type == MinorType.LIST || type == MinorType.MAP || type == MinorType.UNION) { throw new IllegalArgumentException("Dictionary encoding for complex types not implemented: type " + type); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java index d0a9b9e18b8..8995716461e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java @@ -329,6 +329,26 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException } }; + BufferReader FIXEDSIZEBINARY = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + ArrayList values = Lists.newArrayList(); + for (int i = 0; i < count; i++) { + parser.nextToken(); + final byte[] value = decodeHexSafe(parser.readValueAs(String.class)); + values.add(value); + } + + int byteWidth = count > 0 ? values.get(0).length : 0; + ArrowBuf buf = allocator.buffer(byteWidth * count); + for (byte[] value : values) { + buf.writeBytes(value); + } + + return buf; + } + }; + BufferReader VARCHAR = new BufferReader() { @Override protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { @@ -428,6 +448,9 @@ private ArrowBuf readIntoBuffer(BufferAllocator allocator, BufferType bufferType case DECIMAL: reader = helper.DECIMAL; break; + case FIXEDSIZEBINARY: + reader = helper.FIXEDSIZEBINARY; + break; case VARCHAR: reader = helper.VARCHAR; break; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java index 6eb76a7a147..ad56d573b91 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java @@ -163,8 +163,7 @@ private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOE generator.writeObjectField("name", field.getName()); int valueCount = vector.getValueCount(); generator.writeObjectField("count", valueCount); - final int scale = (vector instanceof DecimalVector) ? - ((DecimalVector) vector).getScale() : 0; + for (int v = 0; v < vectorTypes.size(); v++) { BufferType bufferType = vectorTypes.get(v); ArrowBuf vectorBuffer = vectorBuffers.get(v); @@ -173,9 +172,9 @@ private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOE for (int i = 0; i < bufferValueCount; i++) { if (bufferType.equals(DATA) && (vector.getMinorType() == Types.MinorType.VARCHAR || vector.getMinorType() == Types.MinorType.VARBINARY)) { - writeValueToGenerator(bufferType, vectorBuffer, vectorBuffers.get(v-1), vector, i, scale); + writeValueToGenerator(bufferType, vectorBuffer, vectorBuffers.get(v-1), vector, i); } else { - writeValueToGenerator(bufferType, vectorBuffer, null, vector, i, scale); + writeValueToGenerator(bufferType, vectorBuffer, null, vector, i); } } generator.writeEndArray(); @@ -200,12 +199,12 @@ private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOE private void writeValueToGenerator(BufferType bufferType, ArrowBuf buffer, ArrowBuf offsetBuffer, FieldVector vector, - final int index, final int scale) throws IOException { + final int index) throws IOException { if (bufferType.equals(TYPE)) { generator.writeNumber(buffer.getByte(index * TinyIntVector.TYPE_WIDTH)); } else if (bufferType.equals(OFFSET)) { generator.writeNumber(buffer.getInt(index * BaseVariableWidthVector.OFFSET_WIDTH)); - } else if(bufferType.equals(VALIDITY)) { + } else if (bufferType.equals(VALIDITY)) { generator.writeNumber(vector.isNull(index) ? 0 : 1); } else if (bufferType.equals(DATA)) { switch (vector.getMinorType()) { @@ -279,6 +278,11 @@ private void writeValueToGenerator(BufferType bufferType, ArrowBuf buffer, generator.writeObject(hexString); break; } + case FIXEDSIZEBINARY: + int byteWidth = ((FixedSizeBinaryVector) vector).getByteWidth(); + String fixedSizeHexString = Hex.encodeHexString(FixedSizeBinaryVector.get(buffer, index, byteWidth)); + generator.writeObject(fixedSizeHexString); + break; case VARCHAR: { assert offsetBuffer != null; byte[] b = (BaseVariableWidthVector.get(buffer, offsetBuffer, index)); @@ -286,6 +290,7 @@ private void writeValueToGenerator(BufferType bufferType, ArrowBuf buffer, break; } case DECIMAL: { + int scale = ((DecimalVector) vector).getScale(); BigDecimal decimalValue = DecimalUtility.getBigDecimalFromArrowBuf(buffer, index, scale); // We write the unscaled value, because the scale is stored in the type metadata. generator.writeString(decimalValue.unscaledValue().toString()); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 3c5fd81d572..4adbefbeb5a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DateMilliVector; import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; @@ -65,6 +66,7 @@ import org.apache.arrow.vector.complex.impl.DateDayWriterImpl; import org.apache.arrow.vector.complex.impl.DateMilliWriterImpl; import org.apache.arrow.vector.complex.impl.DecimalWriterImpl; +import org.apache.arrow.vector.complex.impl.FixedSizeBinaryWriterImpl; import org.apache.arrow.vector.complex.impl.Float4WriterImpl; import org.apache.arrow.vector.complex.impl.Float8WriterImpl; import org.apache.arrow.vector.complex.impl.IntWriterImpl; @@ -100,6 +102,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Bool; import org.apache.arrow.vector.types.pojo.ArrowType.Date; import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList; import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; import org.apache.arrow.vector.types.pojo.ArrowType.Int; @@ -387,6 +390,17 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new DecimalWriterImpl((DecimalVector) vector); } }, + FIXEDSIZEBINARY(null) { + @Override + public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { + return new FixedSizeBinaryVector(name, fieldType, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new FixedSizeBinaryWriterImpl((FixedSizeBinaryVector) vector); + } + }, UINT1(new Int(8, false)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { @@ -607,6 +621,11 @@ public MinorType visit(Decimal type) { return MinorType.DECIMAL; } + @Override + public MinorType visit(FixedSizeBinary type) { + return MinorType.FIXEDSIZEBINARY; + } + @Override public MinorType visit(Date type) { switch (type.getUnit()) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java index c27e5e5c85c..c27634a7eda 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java @@ -140,7 +140,7 @@ static boolean equals(ArrowType type, final Object o1, final Object o2) { default: throw new UnsupportedOperationException("unsupported precision: " + fpType); } - } else if (type instanceof ArrowType.Binary) { + } else if (type instanceof ArrowType.Binary || type instanceof ArrowType.FixedSizeBinary) { return Arrays.equals((byte[]) o1, (byte[]) o2); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java new file mode 100644 index 00000000000..90529879172 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java @@ -0,0 +1,261 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.holders.FixedSizeBinaryHolder; +import org.apache.arrow.vector.holders.NullableFixedSizeBinaryHolder; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.*; + +public class TestFixedSizeBinaryVector { + private static final int numValues = 123; + private static final int typeWidth = 9; + private static final int smallDataSize = 6; + private static final int largeDataSize = 12; + + private static byte[][] values; + + static { + values = new byte[numValues][typeWidth]; + for (int i = 0; i < numValues; i++) { + for (int j = 0; j < typeWidth; j++) { + values[i][j] = ((byte) i); + } + } + } + + private ArrowBuf[] bufs = new ArrowBuf[numValues]; + private FixedSizeBinaryHolder[] holders = new FixedSizeBinaryHolder[numValues]; + private NullableFixedSizeBinaryHolder[] nullableHolders = new NullableFixedSizeBinaryHolder[numValues]; + + private static byte[] smallValue; + + static { + smallValue = new byte[smallDataSize]; + for (int i = 0; i < smallDataSize; i++) { + smallValue[i] = ((byte) i); + } + } + + private ArrowBuf smallBuf; + private FixedSizeBinaryHolder smallHolder; + private NullableFixedSizeBinaryHolder smallNullableHolder; + + private static byte[] largeValue; + + static { + largeValue = new byte[largeDataSize]; + for (int i = 0; i < largeDataSize; i++) { + largeValue[i] = ((byte) i); + } + } + + private ArrowBuf largeBuf; + private FixedSizeBinaryHolder largeHolder; + private NullableFixedSizeBinaryHolder largeNullableHolder; + + private BufferAllocator allocator; + private FixedSizeBinaryVector vector; + + private static void failWithException(String message) throws Exception { + throw new Exception(message); + } + + + @Before + public void init() throws Exception { + allocator = new DirtyRootAllocator(Integer.MAX_VALUE, (byte) 100); + vector = new FixedSizeBinaryVector("fixedSizeBinary", allocator, typeWidth); + vector.allocateNew(); + + for (int i = 0; i < numValues; i++) { + bufs[i] = allocator.buffer(typeWidth); + bufs[i].setBytes(0, values[i]); + + holders[i] = new FixedSizeBinaryHolder(); + holders[i].byteWidth = typeWidth; + holders[i].buffer = bufs[i]; + + nullableHolders[i] = new NullableFixedSizeBinaryHolder(); + nullableHolders[i].byteWidth = typeWidth; + nullableHolders[i].buffer = bufs[i]; + nullableHolders[i].isSet = 1; + } + + smallBuf = allocator.buffer(smallDataSize); + smallBuf.setBytes(0, smallValue); + + smallHolder = new FixedSizeBinaryHolder(); + smallHolder.byteWidth = smallDataSize; + smallHolder.buffer = smallBuf; + + smallNullableHolder = new NullableFixedSizeBinaryHolder(); + smallNullableHolder.byteWidth = smallDataSize; + smallNullableHolder.buffer = smallBuf; + + largeBuf = allocator.buffer(largeDataSize); + largeBuf.setBytes(0, largeValue); + + largeHolder = new FixedSizeBinaryHolder(); + largeHolder.byteWidth = typeWidth; + largeHolder.buffer = largeBuf; + + largeNullableHolder = new NullableFixedSizeBinaryHolder(); + largeNullableHolder.byteWidth = typeWidth; + largeNullableHolder.buffer = largeBuf; + } + + @After + public void terminate() throws Exception { + for (int i=0; i 0); + byte[] actual = new byte[typeWidth]; + nullableHolders[i].buffer.getBytes(0, actual, 0, typeWidth); + assertArrayEquals(values[i], actual); + } + } + + @Test + public void testSetWithInvalidInput() throws Exception { + String errorMsg = "input data needs to be at least " + typeWidth + " bytes"; + + // test small inputs, byteWidth matches but value or buffer is too small + try { + vector.set(0, smallValue); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.set(0, smallHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.set(0, smallNullableHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.set(0, smallBuf); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + // test large inputs, byteWidth matches but value or buffer is bigger than byteWidth + vector.set(0, largeValue); + vector.set(0, largeHolder); + vector.set(0, largeNullableHolder); + vector.set(0, largeBuf); + } + + @Test + public void setSetSafeWithInvalidInput() throws Exception { + String errorMsg = "input data needs to be at least " + typeWidth + " bytes"; + + // test small inputs, byteWidth matches but value or buffer is too small + try { + vector.setSafe(0, smallValue); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.setSafe(0, smallHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.setSafe(0, smallNullableHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.setSafe(0, smallBuf); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + // test large inputs, byteWidth matches but value or buffer is bigger than byteWidth + vector.setSafe(0, largeValue); + vector.setSafe(0, largeHolder); + vector.setSafe(0, largeNullableHolder); + vector.setSafe(0, largeBuf); + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index 29d39aabe6b..b52769e44ef 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -618,7 +618,7 @@ public void timeStampSecWriter() throws Exception { final LocalDateTime expectedSecDateTime = new LocalDateTime(2001, 2, 3, 4, 5, 6, 0); // write - MapVector parent = new MapVector("parent", allocator, null); + MapVector parent = MapVector.empty("parent", allocator); ComplexWriter writer = new ComplexWriterImpl("root", parent); MapWriter rootWriter = writer.rootAsMap(); @@ -718,7 +718,7 @@ public void timeStampMicroWriters() throws Exception { final LocalDateTime expectedMicroDateTime = new LocalDateTime(2001, 2, 3, 4, 5, 6, 123); // write - MapVector parent = new MapVector("parent", allocator, null); + MapVector parent = MapVector.empty("parent", allocator); ComplexWriter writer = new ComplexWriterImpl("root", parent); MapWriter rootWriter = writer.rootAsMap(); @@ -765,7 +765,7 @@ public void timeStampNanoWriters() throws Exception { final LocalDateTime expectedNanoDateTime = new LocalDateTime(2001, 2, 3, 4, 5, 6, 123); // write - MapVector parent = new MapVector("parent", allocator, null); + MapVector parent = MapVector.empty("parent", allocator); ComplexWriter writer = new ComplexWriterImpl("root", parent); MapWriter rootWriter = writer.rootAsMap(); @@ -806,6 +806,51 @@ public void timeStampNanoWriters() throws Exception { } } + @Test + public void fixedSizeBinaryWriters() throws Exception { + // test values + int numValues = 10; + int byteWidth = 9; + byte[][] values = new byte[numValues][byteWidth]; + for (int i = 0; i < numValues; i++) { + for (int j = 0; j < byteWidth; j++) { + values[i][j] = ((byte) i); + } + } + ArrowBuf[] bufs = new ArrowBuf[numValues]; + for (int i = 0; i < numValues; i++) { + bufs[i] = allocator.buffer(byteWidth); + bufs[i].setBytes(0, values[i]); + } + + // write + MapVector parent = MapVector.empty("parent", allocator); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + + String fieldName = "fixedSizeBinary"; + FixedSizeBinaryWriter fixedSizeBinaryWriter = rootWriter.fixedSizeBinary(fieldName, byteWidth); + for (int i = 0; i < numValues; i++) { + fixedSizeBinaryWriter.setPosition(i); + fixedSizeBinaryWriter.writeFixedSizeBinary(bufs[i]); + } + + // schema + List children = parent.getField().getChildren().get(0).getChildren(); + Assert.assertEquals(fieldName, children.get(0).getName()); + Assert.assertEquals(ArrowType.FixedSizeBinary.TYPE_TYPE, children.get(0).getType().getTypeID()); + + // read + MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); + + FieldReader fixedSizeBinaryReader = rootReader.reader(fieldName); + for (int i = 0; i < numValues; i++) { + fixedSizeBinaryReader.setPosition(i); + byte[] readValues = fixedSizeBinaryReader.readByteArray(); + Assert.assertArrayEquals(values[i], readValues); + } + } + @Test public void complexCopierWithList() { MapVector parent = MapVector.empty("parent", allocator); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java index 055c34e7010..3b809d00c2c 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java @@ -42,6 +42,7 @@ import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.VectorUnloader; import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.complex.NullableMapVector; import org.apache.arrow.vector.dictionary.DictionaryProvider.MapDictionaryProvider; import org.apache.arrow.vector.ipc.message.ArrowBlock; @@ -51,6 +52,7 @@ import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; import org.apache.arrow.vector.types.pojo.ArrowType.Int; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; @@ -549,6 +551,63 @@ public void testWriteReadNestedDictionary() throws IOException { } } + @Test + public void testWriteReadFixedSizeBinary() throws IOException { + File file = new File("target/mytest_fixed_size_binary.arrow"); + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + final int numValues = 10; + final int typeWidth = 11; + byte[][] byteValues = new byte[numValues][typeWidth]; + for (int i=0; i