From 071fb252c3ba56f451d8157b2591b9130c22f1c4 Mon Sep 17 00:00:00 2001 From: Jingyuan Wang Date: Thu, 18 Jan 2018 11:32:24 -0500 Subject: [PATCH 1/7] ARROW-633/634: Add FixedSizeBinary support in Java and integration tests --- integration/integration_test.py | 65 ++- .../src/main/codegen/data/ArrowTypes.tdd | 5 + .../main/codegen/data/ValueVectorTypes.tdd | 15 + .../codegen/templates/HolderReaderImpl.java | 4 + .../arrow/vector/BaseFixedWidthVector.java | 4 +- .../org/apache/arrow/vector/BufferLayout.java | 2 +- .../arrow/vector/FixedSizeBinaryVector.java | 401 ++++++++++++++++++ .../org/apache/arrow/vector/TypeLayout.java | 6 + .../vector/dictionary/DictionaryEncoder.java | 2 +- .../arrow/vector/ipc/JsonFileReader.java | 33 ++ .../arrow/vector/ipc/JsonFileWriter.java | 15 +- .../org/apache/arrow/vector/types/Types.java | 19 + .../apache/arrow/vector/util/Validator.java | 2 +- .../vector/TestFixedSizeBinaryVector.java | 261 ++++++++++++ .../complex/writer/TestComplexWriter.java | 51 ++- .../arrow/vector/ipc/TestArrowFile.java | 59 +++ .../arrow/vector/types/pojo/TestSchema.java | 4 +- 17 files changed, 934 insertions(+), 14 deletions(-) create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java diff --git a/integration/integration_test.py b/integration/integration_test.py index 79b098a614a..715892b5711 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -463,6 +463,49 @@ def generate_column(self, size, name=None): return self.column_class(name, size, is_valid, values) +class FixedSizeBinaryType(PrimitiveType): + + def __init__(self, name, byte_width, nullable=True): + PrimitiveType.__init__(self, name, nullable) + self.byte_width = byte_width + + @property + def numpy_type(self): + return object + + @property + def column_class(self): + return FixedSizeBinaryColumn + + def _get_type(self): + return OrderedDict([('name', 'fixedsizebinary'), ('byteWidth', self.byte_width)]) + + def _get_type_layout(self): + return OrderedDict([ + ('vectors', + [OrderedDict([('type', 'VALIDITY'), + ('typeBitWidth', 1)]), + OrderedDict([('type', 'DATA'), + ('typeBitWidth', self.byte_width)])])]) + + def generate_column(self, size, name=None): + is_valid = self._make_is_valid(size) + values = [] + + for i in range(size): + if is_valid[i]: + draw = (np.random.randint(0, 255, size=self.byte_width) + .astype(np.uint8) + .tostring()) + values.append(draw) + else: + values.append("") + + if name is None: + name = self.name + return self.column_class(name, size, is_valid, values) + + class StringType(BinaryType): @property @@ -525,6 +568,22 @@ def _get_buffers(self): ] +class FixedSizeBinaryColumn(PrimitiveColumn): + + def _encode_value(self, x): + return ''.join('{:02x}'.format(c).upper() for c in x) + + def _get_buffers(self): + data = [] + for i, v in enumerate(self.values): + data.append(self._encode_value(v if self.is_valid[i] else "")) + + return [ + ('VALIDITY', [int(x) for x in self.is_valid]), + ('DATA', data) + ] + + class StringColumn(BinaryColumn): def _encode_value(self, x): @@ -719,6 +778,9 @@ def get_field(name, type_, nullable=True): return BinaryType(name, nullable=nullable) elif type_ == 'utf8': return StringType(name, nullable=nullable) + elif type_.startswith('fixedsizebinary_'): + byte_width = int(type_.split('_')[1]) + return FixedSizeBinaryType(name, byte_width=byte_width, nullable=nullable) dtype = np.dtype(type_) @@ -751,7 +813,8 @@ def _generate_file(name, fields, batch_sizes, dictionaries=None): def generate_primitive_case(batch_sizes, name='primitive'): types = ['bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', - 'float32', 'float64', 'binary', 'utf8'] + 'float32', 'float64', 'binary', 'utf8', + 'fixedsizebinary_19', 'fixedsizebinary_120'] fields = [] diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index ce92c1333a5..63b193fc66a 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -57,6 +57,11 @@ fields: [], complex: false }, + { + name: "FixedSizeBinary", + fields: [{name: "byteWidth", type: int}], + complex: false + } { name: "Bool", fields: [], diff --git a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd index 970d887c760..f294a8e83a4 100644 --- a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd +++ b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd @@ -122,6 +122,21 @@ } ] }, + { + major: "Fixed", + width: -1, + javaType: "byte[]", + boxedType: "ArrowBuf", + minor: [ + { + class: "FixedSizeBinary", + typeParams: [ {name: "byteWidth", type: "int"} ], + arrowType: "org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary", + friendlyType: "byte[]", + fields: [{name: "buffer", type: "ArrowBuf"}], + } + ] + }, { major: "VarLen", width: 4, diff --git a/java/vector/src/main/codegen/templates/HolderReaderImpl.java b/java/vector/src/main/codegen/templates/HolderReaderImpl.java index c2aa83757b9..f1c10d1021b 100644 --- a/java/vector/src/main/codegen/templates/HolderReaderImpl.java +++ b/java/vector/src/main/codegen/templates/HolderReaderImpl.java @@ -128,6 +128,10 @@ public void read(Nullable${name}Holder h) { holder.buffer.getBytes(holder.start, bytes, 0, ${type.width}); ${friendlyType} value = new BigDecimal(new BigInteger(bytes), holder.scale); return value; + <#elseif minor.class == "FixedSizeBinary"> + byte[] value = new byte [holder.byteWidth]; + holder.buffer.getBytes(0, value, 0, holder.byteWidth); + return value; <#else> ${friendlyType} value = new ${friendlyType}(this.holder.value); return value; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java index 702db9f5281..cbc56fe3dde 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -42,7 +42,7 @@ */ public abstract class BaseFixedWidthVector extends BaseValueVector implements FixedWidthVector, FieldVector, VectorDefinitionSetter { - private final byte typeWidth; + private final int typeWidth; protected int valueAllocationSizeInBytes; protected int validityAllocationSizeInBytes; @@ -54,7 +54,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector protected int valueCount; public BaseFixedWidthVector(final String name, final BufferAllocator allocator, - FieldType fieldType, final byte typeWidth) { + FieldType fieldType, final int typeWidth) { super(name, allocator); this.typeWidth = typeWidth; field = new Field(name, fieldType, null); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java index f6529d8e55b..54d6cd194f0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java @@ -70,7 +70,7 @@ public static BufferLayout dataBuffer(int typeBitWidth) { case 128: return VALUES_128; default: - throw new IllegalArgumentException("only 8, 16, 32, or 64 bits supported"); + return new BufferLayout(BufferType.DATA, typeBitWidth); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java new file mode 100644 index 00000000000..f87372e1e6b --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java @@ -0,0 +1,401 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.FixedSizeBinaryReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.FixedSizeBinaryHolder; +import org.apache.arrow.vector.holders.NullableFixedSizeBinaryHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + + +/** + * FixedSizeBinaryVector implements a fixed width vector of + * binary values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class FixedSizeBinaryVector extends BaseFixedWidthVector { + private final int byteWidth; + private final FieldReader reader; + + /** + * Instantiate a FixedSizeBinaryVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + * @param byteWidth byte width of the binary values + */ + public FixedSizeBinaryVector(String name, BufferAllocator allocator, int byteWidth) { + this(name, FieldType.nullable(new FixedSizeBinary(byteWidth)), allocator); + } + + /** + * Instantiate a FixedSizeBinaryVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public FixedSizeBinaryVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, ((FixedSizeBinary) fieldType.getType()).getByteWidth()); + reader = new FixedSizeBinaryReaderImpl(FixedSizeBinaryVector.this); + byteWidth = ((FixedSizeBinary) fieldType.getType()).getByteWidth(); + } + + /** + * Get a reader that supports reading values from this vector + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.FIXEDSIZEBINARY; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public byte[] get(int index) { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + assert index >= 0; + final byte[] dst = new byte[byteWidth]; + valueBuffer.getBytes(index * byteWidth, dst, 0, byteWidth); + return dst; + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + * @param holder nullable holder to carry the buffer + */ + public void get(int index, NullableFixedSizeBinaryHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.buffer = valueBuffer.slice(index * byteWidth, byteWidth); + } + + /** + * Same as {@link #get(int)}. + * + * @param index position of element + * @return element at given index + */ + @Override + public byte[] getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return get(index); + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, FixedSizeBinaryVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + from.valueBuffer.getBytes(fromIndex * byteWidth, valueBuffer, + thisIndex * byteWidth, byteWidth); + } + + /** + * Same as {@link #copyFrom(int, int, FixedSizeBinaryVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, FixedSizeBinaryVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + public int getByteWidth() { + return byteWidth; + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + public void set(int index, byte[] value) { + assert index >= 0; + assert byteWidth <= value.length; + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + valueBuffer.setBytes(index * byteWidth, value, 0, byteWidth); + } + + public void setSafe(int index, byte[] value) { + handleSafe(index); + set(index, value); + } + + public void set(int index, int isSet, byte[] value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + public void setSafe(int index, int isSet, byte[] value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param buffer ArrowBuf containing binary value. + */ + public void set(int index, ArrowBuf buffer) { + assert index >= 0; + assert byteWidth <= buffer.capacity(); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + valueBuffer.setBytes(index * byteWidth, buffer, 0, byteWidth); + } + + /** + * Same as {@link #set(int, ArrowBuf)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param buffer ArrowBuf containing binary value. + */ + public void setSafe(int index, ArrowBuf buffer) { + handleSafe(index); + set(index, buffer); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param buffer ArrowBuf containing binary value. + */ + public void set(int index, int isSet, ArrowBuf buffer) { + if (isSet > 0) { + set(index, buffer); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, ArrowBuf)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param buffer ArrowBuf containing binary value. + */ + public void setSafe(int index, int isSet, ArrowBuf buffer) { + handleSafe(index); + set(index, isSet, buffer); + } + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, FixedSizeBinaryHolder holder) { + assert holder.byteWidth == byteWidth; + set(index, holder.buffer); + } + + /** + * Same as {@link #set(int, FixedSizeBinaryHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void setSafe(int index, FixedSizeBinaryHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, NullableFixedSizeBinaryHolder holder) { + assert holder.byteWidth == byteWidth; + if (holder.isSet < 0) { + throw new IllegalArgumentException("holder has a negative isSet value"); + } else if (holder.isSet > 0) { + set(index, holder.buffer); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, NullableFixedSizeBinaryHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void setSafe(int index, NullableFixedSizeBinaryHolder holder) { + handleSafe(index); + set(index, holder); + } + + public void setNull(int index) { + handleSafe(index); + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + * This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static byte[] get(final ArrowBuf buffer, final int index, final int byteWidth) { + final byte[] dst = new byte[byteWidth]; + buffer.getBytes(index * byteWidth, dst, 0, byteWidth); + return dst; + } + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising of this and and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((FixedSizeBinaryVector) to); + } + + private class TransferImpl implements TransferPair { + FixedSizeBinaryVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new FixedSizeBinaryVector(ref, allocator, FixedSizeBinaryVector.this.byteWidth); + } + + public TransferImpl(FixedSizeBinaryVector to) { + this.to = to; + } + + @Override + public FixedSizeBinaryVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, FixedSizeBinaryVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java index d6f32b4b4b1..12bfcaf7b9f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java @@ -31,6 +31,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Bool; import org.apache.arrow.vector.types.pojo.ArrowType.Date; import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList; import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; import org.apache.arrow.vector.types.pojo.ArrowType.Int; @@ -136,6 +137,11 @@ public TypeLayout visit(Decimal type) { return newFixedWidthTypeLayout(BufferLayout.dataBuffer(128)); } + @Override + public TypeLayout visit(FixedSizeBinary type) { + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(type.getByteWidth() * 8)); + } + @Override public TypeLayout visit(Bool type) { return newFixedWidthTypeLayout(BufferLayout.booleanVector()); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java index 762a442c983..08df7b6d931 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -136,7 +136,7 @@ public static ValueVector decode(ValueVector indices, Dictionary dictionary) { private static void validateType(MinorType type) { // byte arrays don't work as keys in our dictionary map - we could wrap them with something to // implement equals and hashcode if we want that functionality - if (type == MinorType.VARBINARY || type == MinorType.LIST || type == MinorType.MAP || type == MinorType.UNION) { + if (type == MinorType.VARBINARY || type == MinorType.FIXEDSIZEBINARY || type == MinorType.LIST || type == MinorType.MAP || type == MinorType.UNION) { throw new IllegalArgumentException("Dictionary encoding for complex types not implemented: type " + type); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java index d0a9b9e18b8..ab7e0a6782f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java @@ -329,6 +329,36 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException } }; + BufferReader FIXEDSIZEBINARY = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + ArrayList values = Lists.newArrayList(); + int byteWidth = 0; + for (int i = 0; i < count; i++) { + parser.nextToken(); + final byte[] value = decodeHexSafe(parser.readValueAs(String.class)); + values.add(value); + if (value.length > 0) { + if (byteWidth == 0) { + byteWidth = value.length; + } else if (byteWidth != value.length) { + throw new IOException("mismatch byte width (" + value.length + ") at index " + i + ", expecting " + byteWidth); + } + } + } + if (count > 0 && byteWidth == 0) { + throw new IOException("could not determine the byte width of the vector because all elements are null"); + } + + ArrowBuf buf = allocator.buffer(byteWidth * count); + for (byte[] value : values) { + buf.writeBytes(value.length == 0? new byte[byteWidth] : value); + } + + return buf; + } + }; + BufferReader VARCHAR = new BufferReader() { @Override protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { @@ -428,6 +458,9 @@ private ArrowBuf readIntoBuffer(BufferAllocator allocator, BufferType bufferType case DECIMAL: reader = helper.DECIMAL; break; + case FIXEDSIZEBINARY: + reader = helper.FIXEDSIZEBINARY; + break; case VARCHAR: reader = helper.VARCHAR; break; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java index 6eb76a7a147..86fddd7d96f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java @@ -165,6 +165,9 @@ private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOE generator.writeObjectField("count", valueCount); final int scale = (vector instanceof DecimalVector) ? ((DecimalVector) vector).getScale() : 0; + final int byteWidth = (vector instanceof FixedSizeBinaryVector) ? + ((FixedSizeBinaryVector) vector).getByteWidth() : -1; + for (int v = 0; v < vectorTypes.size(); v++) { BufferType bufferType = vectorTypes.get(v); ArrowBuf vectorBuffer = vectorBuffers.get(v); @@ -173,9 +176,9 @@ private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOE for (int i = 0; i < bufferValueCount; i++) { if (bufferType.equals(DATA) && (vector.getMinorType() == Types.MinorType.VARCHAR || vector.getMinorType() == Types.MinorType.VARBINARY)) { - writeValueToGenerator(bufferType, vectorBuffer, vectorBuffers.get(v-1), vector, i, scale); + writeValueToGenerator(bufferType, vectorBuffer, vectorBuffers.get(v-1), vector, i, scale, byteWidth); } else { - writeValueToGenerator(bufferType, vectorBuffer, null, vector, i, scale); + writeValueToGenerator(bufferType, vectorBuffer, null, vector, i, scale, byteWidth); } } generator.writeEndArray(); @@ -200,12 +203,12 @@ private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOE private void writeValueToGenerator(BufferType bufferType, ArrowBuf buffer, ArrowBuf offsetBuffer, FieldVector vector, - final int index, final int scale) throws IOException { + final int index, final int scale, int byteWidth) throws IOException { if (bufferType.equals(TYPE)) { generator.writeNumber(buffer.getByte(index * TinyIntVector.TYPE_WIDTH)); } else if (bufferType.equals(OFFSET)) { generator.writeNumber(buffer.getInt(index * BaseVariableWidthVector.OFFSET_WIDTH)); - } else if(bufferType.equals(VALIDITY)) { + } else if (bufferType.equals(VALIDITY)) { generator.writeNumber(vector.isNull(index) ? 0 : 1); } else if (bufferType.equals(DATA)) { switch (vector.getMinorType()) { @@ -279,6 +282,10 @@ private void writeValueToGenerator(BufferType bufferType, ArrowBuf buffer, generator.writeObject(hexString); break; } + case FIXEDSIZEBINARY: + String fixedSizeHexString = Hex.encodeHexString(FixedSizeBinaryVector.get(buffer, index, byteWidth)); + generator.writeObject(fixedSizeHexString); + break; case VARCHAR: { assert offsetBuffer != null; byte[] b = (BaseVariableWidthVector.get(buffer, offsetBuffer, index)); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 3c5fd81d572..4adbefbeb5a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DateMilliVector; import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; @@ -65,6 +66,7 @@ import org.apache.arrow.vector.complex.impl.DateDayWriterImpl; import org.apache.arrow.vector.complex.impl.DateMilliWriterImpl; import org.apache.arrow.vector.complex.impl.DecimalWriterImpl; +import org.apache.arrow.vector.complex.impl.FixedSizeBinaryWriterImpl; import org.apache.arrow.vector.complex.impl.Float4WriterImpl; import org.apache.arrow.vector.complex.impl.Float8WriterImpl; import org.apache.arrow.vector.complex.impl.IntWriterImpl; @@ -100,6 +102,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Bool; import org.apache.arrow.vector.types.pojo.ArrowType.Date; import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList; import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; import org.apache.arrow.vector.types.pojo.ArrowType.Int; @@ -387,6 +390,17 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new DecimalWriterImpl((DecimalVector) vector); } }, + FIXEDSIZEBINARY(null) { + @Override + public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { + return new FixedSizeBinaryVector(name, fieldType, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new FixedSizeBinaryWriterImpl((FixedSizeBinaryVector) vector); + } + }, UINT1(new Int(8, false)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { @@ -607,6 +621,11 @@ public MinorType visit(Decimal type) { return MinorType.DECIMAL; } + @Override + public MinorType visit(FixedSizeBinary type) { + return MinorType.FIXEDSIZEBINARY; + } + @Override public MinorType visit(Date type) { switch (type.getUnit()) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java index c27e5e5c85c..c27634a7eda 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java @@ -140,7 +140,7 @@ static boolean equals(ArrowType type, final Object o1, final Object o2) { default: throw new UnsupportedOperationException("unsupported precision: " + fpType); } - } else if (type instanceof ArrowType.Binary) { + } else if (type instanceof ArrowType.Binary || type instanceof ArrowType.FixedSizeBinary) { return Arrays.equals((byte[]) o1, (byte[]) o2); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java new file mode 100644 index 00000000000..90529879172 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java @@ -0,0 +1,261 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.holders.FixedSizeBinaryHolder; +import org.apache.arrow.vector.holders.NullableFixedSizeBinaryHolder; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.*; + +public class TestFixedSizeBinaryVector { + private static final int numValues = 123; + private static final int typeWidth = 9; + private static final int smallDataSize = 6; + private static final int largeDataSize = 12; + + private static byte[][] values; + + static { + values = new byte[numValues][typeWidth]; + for (int i = 0; i < numValues; i++) { + for (int j = 0; j < typeWidth; j++) { + values[i][j] = ((byte) i); + } + } + } + + private ArrowBuf[] bufs = new ArrowBuf[numValues]; + private FixedSizeBinaryHolder[] holders = new FixedSizeBinaryHolder[numValues]; + private NullableFixedSizeBinaryHolder[] nullableHolders = new NullableFixedSizeBinaryHolder[numValues]; + + private static byte[] smallValue; + + static { + smallValue = new byte[smallDataSize]; + for (int i = 0; i < smallDataSize; i++) { + smallValue[i] = ((byte) i); + } + } + + private ArrowBuf smallBuf; + private FixedSizeBinaryHolder smallHolder; + private NullableFixedSizeBinaryHolder smallNullableHolder; + + private static byte[] largeValue; + + static { + largeValue = new byte[largeDataSize]; + for (int i = 0; i < largeDataSize; i++) { + largeValue[i] = ((byte) i); + } + } + + private ArrowBuf largeBuf; + private FixedSizeBinaryHolder largeHolder; + private NullableFixedSizeBinaryHolder largeNullableHolder; + + private BufferAllocator allocator; + private FixedSizeBinaryVector vector; + + private static void failWithException(String message) throws Exception { + throw new Exception(message); + } + + + @Before + public void init() throws Exception { + allocator = new DirtyRootAllocator(Integer.MAX_VALUE, (byte) 100); + vector = new FixedSizeBinaryVector("fixedSizeBinary", allocator, typeWidth); + vector.allocateNew(); + + for (int i = 0; i < numValues; i++) { + bufs[i] = allocator.buffer(typeWidth); + bufs[i].setBytes(0, values[i]); + + holders[i] = new FixedSizeBinaryHolder(); + holders[i].byteWidth = typeWidth; + holders[i].buffer = bufs[i]; + + nullableHolders[i] = new NullableFixedSizeBinaryHolder(); + nullableHolders[i].byteWidth = typeWidth; + nullableHolders[i].buffer = bufs[i]; + nullableHolders[i].isSet = 1; + } + + smallBuf = allocator.buffer(smallDataSize); + smallBuf.setBytes(0, smallValue); + + smallHolder = new FixedSizeBinaryHolder(); + smallHolder.byteWidth = smallDataSize; + smallHolder.buffer = smallBuf; + + smallNullableHolder = new NullableFixedSizeBinaryHolder(); + smallNullableHolder.byteWidth = smallDataSize; + smallNullableHolder.buffer = smallBuf; + + largeBuf = allocator.buffer(largeDataSize); + largeBuf.setBytes(0, largeValue); + + largeHolder = new FixedSizeBinaryHolder(); + largeHolder.byteWidth = typeWidth; + largeHolder.buffer = largeBuf; + + largeNullableHolder = new NullableFixedSizeBinaryHolder(); + largeNullableHolder.byteWidth = typeWidth; + largeNullableHolder.buffer = largeBuf; + } + + @After + public void terminate() throws Exception { + for (int i=0; i 0); + byte[] actual = new byte[typeWidth]; + nullableHolders[i].buffer.getBytes(0, actual, 0, typeWidth); + assertArrayEquals(values[i], actual); + } + } + + @Test + public void testSetWithInvalidInput() throws Exception { + String errorMsg = "input data needs to be at least " + typeWidth + " bytes"; + + // test small inputs, byteWidth matches but value or buffer is too small + try { + vector.set(0, smallValue); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.set(0, smallHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.set(0, smallNullableHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.set(0, smallBuf); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + // test large inputs, byteWidth matches but value or buffer is bigger than byteWidth + vector.set(0, largeValue); + vector.set(0, largeHolder); + vector.set(0, largeNullableHolder); + vector.set(0, largeBuf); + } + + @Test + public void setSetSafeWithInvalidInput() throws Exception { + String errorMsg = "input data needs to be at least " + typeWidth + " bytes"; + + // test small inputs, byteWidth matches but value or buffer is too small + try { + vector.setSafe(0, smallValue); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.setSafe(0, smallHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.setSafe(0, smallNullableHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + vector.setSafe(0, smallBuf); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + // test large inputs, byteWidth matches but value or buffer is bigger than byteWidth + vector.setSafe(0, largeValue); + vector.setSafe(0, largeHolder); + vector.setSafe(0, largeNullableHolder); + vector.setSafe(0, largeBuf); + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index 29d39aabe6b..b52769e44ef 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -618,7 +618,7 @@ public void timeStampSecWriter() throws Exception { final LocalDateTime expectedSecDateTime = new LocalDateTime(2001, 2, 3, 4, 5, 6, 0); // write - MapVector parent = new MapVector("parent", allocator, null); + MapVector parent = MapVector.empty("parent", allocator); ComplexWriter writer = new ComplexWriterImpl("root", parent); MapWriter rootWriter = writer.rootAsMap(); @@ -718,7 +718,7 @@ public void timeStampMicroWriters() throws Exception { final LocalDateTime expectedMicroDateTime = new LocalDateTime(2001, 2, 3, 4, 5, 6, 123); // write - MapVector parent = new MapVector("parent", allocator, null); + MapVector parent = MapVector.empty("parent", allocator); ComplexWriter writer = new ComplexWriterImpl("root", parent); MapWriter rootWriter = writer.rootAsMap(); @@ -765,7 +765,7 @@ public void timeStampNanoWriters() throws Exception { final LocalDateTime expectedNanoDateTime = new LocalDateTime(2001, 2, 3, 4, 5, 6, 123); // write - MapVector parent = new MapVector("parent", allocator, null); + MapVector parent = MapVector.empty("parent", allocator); ComplexWriter writer = new ComplexWriterImpl("root", parent); MapWriter rootWriter = writer.rootAsMap(); @@ -806,6 +806,51 @@ public void timeStampNanoWriters() throws Exception { } } + @Test + public void fixedSizeBinaryWriters() throws Exception { + // test values + int numValues = 10; + int byteWidth = 9; + byte[][] values = new byte[numValues][byteWidth]; + for (int i = 0; i < numValues; i++) { + for (int j = 0; j < byteWidth; j++) { + values[i][j] = ((byte) i); + } + } + ArrowBuf[] bufs = new ArrowBuf[numValues]; + for (int i = 0; i < numValues; i++) { + bufs[i] = allocator.buffer(byteWidth); + bufs[i].setBytes(0, values[i]); + } + + // write + MapVector parent = MapVector.empty("parent", allocator); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + + String fieldName = "fixedSizeBinary"; + FixedSizeBinaryWriter fixedSizeBinaryWriter = rootWriter.fixedSizeBinary(fieldName, byteWidth); + for (int i = 0; i < numValues; i++) { + fixedSizeBinaryWriter.setPosition(i); + fixedSizeBinaryWriter.writeFixedSizeBinary(bufs[i]); + } + + // schema + List children = parent.getField().getChildren().get(0).getChildren(); + Assert.assertEquals(fieldName, children.get(0).getName()); + Assert.assertEquals(ArrowType.FixedSizeBinary.TYPE_TYPE, children.get(0).getType().getTypeID()); + + // read + MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); + + FieldReader fixedSizeBinaryReader = rootReader.reader(fieldName); + for (int i = 0; i < numValues; i++) { + fixedSizeBinaryReader.setPosition(i); + byte[] readValues = fixedSizeBinaryReader.readByteArray(); + Assert.assertArrayEquals(values[i], readValues); + } + } + @Test public void complexCopierWithList() { MapVector parent = MapVector.empty("parent", allocator); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java index 055c34e7010..3b809d00c2c 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java @@ -42,6 +42,7 @@ import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.VectorUnloader; import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.complex.NullableMapVector; import org.apache.arrow.vector.dictionary.DictionaryProvider.MapDictionaryProvider; import org.apache.arrow.vector.ipc.message.ArrowBlock; @@ -51,6 +52,7 @@ import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; import org.apache.arrow.vector.types.pojo.ArrowType.Int; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; @@ -549,6 +551,63 @@ public void testWriteReadNestedDictionary() throws IOException { } } + @Test + public void testWriteReadFixedSizeBinary() throws IOException { + File file = new File("target/mytest_fixed_size_binary.arrow"); + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + final int numValues = 10; + final int typeWidth = 11; + byte[][] byteValues = new byte[numValues][typeWidth]; + for (int i=0; i Date: Fri, 19 Jan 2018 14:07:00 -0500 Subject: [PATCH 2/7] expand get() method inside getObject() method to remove duplicate check of isSet() --- .../org/apache/arrow/vector/FixedSizeBinaryVector.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java index f87372e1e6b..a45ca89f485 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java @@ -93,7 +93,6 @@ public Types.MinorType getMinorType() { * * ******************************************************************/ - /** * Get the element at the given index from the vector. * @@ -101,10 +100,10 @@ public Types.MinorType getMinorType() { * @return element at given index */ public byte[] get(int index) { + assert index >= 0; if (isSet(index) == 0) { throw new IllegalStateException("Value at index is null"); } - assert index >= 0; final byte[] dst = new byte[byteWidth]; valueBuffer.getBytes(index * byteWidth, dst, 0, byteWidth); return dst; @@ -119,6 +118,7 @@ public byte[] get(int index) { * @param holder nullable holder to carry the buffer */ public void get(int index, NullableFixedSizeBinaryHolder holder) { + assert index >= 0; if (isSet(index) == 0) { holder.isSet = 0; return; @@ -135,10 +135,13 @@ public void get(int index, NullableFixedSizeBinaryHolder holder) { */ @Override public byte[] getObject(int index) { + assert index >= 0; if (isSet(index) == 0) { return null; } else { - return get(index); + final byte[] dst = new byte[byteWidth]; + valueBuffer.getBytes(index * byteWidth, dst, 0, byteWidth); + return dst; } } From a8a95530bb5b4f32fec9d060e9820a6d27b2b9a0 Mon Sep 17 00:00:00 2001 From: Jingyuan Wang Date: Fri, 19 Jan 2018 14:52:37 -0500 Subject: [PATCH 3/7] Pass byteWidth info for FixedSizeBinary type in JsonFileReader --- .../arrow/vector/ipc/JsonFileReader.java | 51 +++++++++---------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java index ab7e0a6782f..50928811f07 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java @@ -192,11 +192,11 @@ public VectorSchemaRoot read() throws IOException { } private abstract class BufferReader { - abstract protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException; + abstract protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException; - ArrowBuf readBuffer(BufferAllocator allocator, int count) throws IOException { + ArrowBuf readBuffer(BufferAllocator allocator, int count, int byteWidth) throws IOException { readToken(START_ARRAY); - ArrowBuf buf = read(allocator, count); + ArrowBuf buf = read(allocator, count, byteWidth); readToken(END_ARRAY); return buf; } @@ -205,7 +205,7 @@ ArrowBuf readBuffer(BufferAllocator allocator, int count) throws IOException { private class BufferHelper { BufferReader BIT = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { final int bufferSize = BitVectorHelper.getValidityBufferSize(count); ArrowBuf buf = allocator.buffer(bufferSize); @@ -224,7 +224,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException BufferReader INT1 = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { final int size = count * TinyIntVector.TYPE_WIDTH; ArrowBuf buf = allocator.buffer(size); @@ -239,7 +239,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException BufferReader INT2 = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { final int size = count * SmallIntVector.TYPE_WIDTH; ArrowBuf buf = allocator.buffer(size); @@ -254,7 +254,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException BufferReader INT4 = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { final int size = count * IntVector.TYPE_WIDTH; ArrowBuf buf = allocator.buffer(size); @@ -269,7 +269,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException BufferReader INT8 = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { final int size = count * BigIntVector.TYPE_WIDTH; ArrowBuf buf = allocator.buffer(size); @@ -284,7 +284,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException BufferReader FLOAT4 = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { final int size = count * Float4Vector.TYPE_WIDTH; ArrowBuf buf = allocator.buffer(size); @@ -299,7 +299,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException BufferReader FLOAT8 = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { final int size = count * Float8Vector.TYPE_WIDTH; ArrowBuf buf = allocator.buffer(size); @@ -314,7 +314,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException BufferReader DECIMAL = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { final int size = count * DecimalVector.TYPE_WIDTH; ArrowBuf buf = allocator.buffer(size); @@ -331,23 +331,16 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException BufferReader FIXEDSIZEBINARY = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { ArrayList values = Lists.newArrayList(); - int byteWidth = 0; for (int i = 0; i < count; i++) { parser.nextToken(); final byte[] value = decodeHexSafe(parser.readValueAs(String.class)); - values.add(value); - if (value.length > 0) { - if (byteWidth == 0) { - byteWidth = value.length; - } else if (byteWidth != value.length) { - throw new IOException("mismatch byte width (" + value.length + ") at index " + i + ", expecting " + byteWidth); - } + if (value.length > 0 && value.length != byteWidth) { + throw new RuntimeException("mismatch byte width (" + value.length + ") at index " + i + ", expecting " + + byteWidth); } - } - if (count > 0 && byteWidth == 0) { - throw new IOException("could not determine the byte width of the vector because all elements are null"); + values.add(value); } ArrowBuf buf = allocator.buffer(byteWidth * count); @@ -361,7 +354,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException BufferReader VARCHAR = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { ArrayList values = Lists.newArrayList(); int bufferSize = 0; for (int i = 0; i < count; i++) { @@ -384,7 +377,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException BufferReader VARBINARY = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { ArrayList values = Lists.newArrayList(); int bufferSize = 0; for (int i = 0; i < count; i++) { @@ -407,7 +400,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException } private ArrowBuf readIntoBuffer(BufferAllocator allocator, BufferType bufferType, - Types.MinorType type, int count) throws IOException { + Types.MinorType type, int count, int byteWidth) throws IOException { ArrowBuf buf; BufferHelper helper = new BufferHelper(); @@ -498,7 +491,7 @@ private ArrowBuf readIntoBuffer(BufferAllocator allocator, BufferType bufferType throw new InvalidArrowFileException("Unrecognized buffer type " + bufferType); } - buf = reader.readBuffer(allocator, count); + buf = reader.readBuffer(allocator, count, byteWidth); assert buf != null; return buf; @@ -546,7 +539,9 @@ private void readFromJsonIntoVector(Field field, FieldVector vector) throws Json innerBufferValueCount = valueCount + 1; } - vectorBuffers[v] = readIntoBuffer(allocator, bufferType, vector.getMinorType(), innerBufferValueCount); + /* byteWidth is only necessary for FixedSizeBinary type, so it by default set to -1 for other types */ + int byteWidth = vector instanceof FixedSizeBinaryVector? ((FixedSizeBinaryVector) vector).getByteWidth() : -1; + vectorBuffers[v] = readIntoBuffer(allocator, bufferType, vector.getMinorType(), innerBufferValueCount, byteWidth); } final int nullCount = BitVectorHelper.getNullCount(vectorBuffers[0], valueCount); From 0c0015dc519b464926ab3237d86304056ce2ccad Mon Sep 17 00:00:00 2001 From: Jingyuan Wang Date: Fri, 19 Jan 2018 14:54:01 -0500 Subject: [PATCH 4/7] add a new line --- .../java/org/apache/arrow/vector/FixedSizeBinaryVector.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java index a45ca89f485..232e6564ace 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java @@ -401,4 +401,4 @@ public void copyValueSafe(int fromIndex, int toIndex) { to.copyFromSafe(fromIndex, toIndex, FixedSizeBinaryVector.this); } } -} \ No newline at end of file +} From 4fbb67dc5894695f9af1977c769cccd45a2b2cf1 Mon Sep 17 00:00:00 2001 From: jingyuan Date: Tue, 23 Jan 2018 17:01:23 -0500 Subject: [PATCH 5/7] put arbitrary values for FixSizeBinary nulls --- integration/integration_test.py | 13 +++--- .../arrow/vector/ipc/JsonFileReader.java | 43 ++++++++----------- 2 files changed, 24 insertions(+), 32 deletions(-) diff --git a/integration/integration_test.py b/integration/integration_test.py index 715892b5711..6dddf71d96d 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -493,13 +493,10 @@ def generate_column(self, size, name=None): values = [] for i in range(size): - if is_valid[i]: - draw = (np.random.randint(0, 255, size=self.byte_width) - .astype(np.uint8) - .tostring()) - values.append(draw) - else: - values.append("") + draw = (np.random.randint(0, 255, size=self.byte_width) + .astype(np.uint8) + .tostring()) + values.append(draw) if name is None: name = self.name @@ -576,7 +573,7 @@ def _encode_value(self, x): def _get_buffers(self): data = [] for i, v in enumerate(self.values): - data.append(self._encode_value(v if self.is_valid[i] else "")) + data.append(self._encode_value(v)) return [ ('VALIDITY', [int(x) for x in self.is_valid]), diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java index 50928811f07..dc0d0724e55 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java @@ -192,11 +192,11 @@ public VectorSchemaRoot read() throws IOException { } private abstract class BufferReader { - abstract protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException; + abstract protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException; - ArrowBuf readBuffer(BufferAllocator allocator, int count, int byteWidth) throws IOException { + ArrowBuf readBuffer(BufferAllocator allocator, int count) throws IOException { readToken(START_ARRAY); - ArrowBuf buf = read(allocator, count, byteWidth); + ArrowBuf buf = read(allocator, count); readToken(END_ARRAY); return buf; } @@ -205,7 +205,7 @@ ArrowBuf readBuffer(BufferAllocator allocator, int count, int byteWidth) throws private class BufferHelper { BufferReader BIT = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { final int bufferSize = BitVectorHelper.getValidityBufferSize(count); ArrowBuf buf = allocator.buffer(bufferSize); @@ -224,7 +224,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) thr BufferReader INT1 = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { final int size = count * TinyIntVector.TYPE_WIDTH; ArrowBuf buf = allocator.buffer(size); @@ -239,7 +239,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) thr BufferReader INT2 = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { final int size = count * SmallIntVector.TYPE_WIDTH; ArrowBuf buf = allocator.buffer(size); @@ -254,7 +254,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) thr BufferReader INT4 = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { final int size = count * IntVector.TYPE_WIDTH; ArrowBuf buf = allocator.buffer(size); @@ -269,7 +269,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) thr BufferReader INT8 = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { final int size = count * BigIntVector.TYPE_WIDTH; ArrowBuf buf = allocator.buffer(size); @@ -284,7 +284,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) thr BufferReader FLOAT4 = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { final int size = count * Float4Vector.TYPE_WIDTH; ArrowBuf buf = allocator.buffer(size); @@ -299,7 +299,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) thr BufferReader FLOAT8 = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { final int size = count * Float8Vector.TYPE_WIDTH; ArrowBuf buf = allocator.buffer(size); @@ -314,7 +314,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) thr BufferReader DECIMAL = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { final int size = count * DecimalVector.TYPE_WIDTH; ArrowBuf buf = allocator.buffer(size); @@ -331,21 +331,18 @@ protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) thr BufferReader FIXEDSIZEBINARY = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { ArrayList values = Lists.newArrayList(); for (int i = 0; i < count; i++) { parser.nextToken(); final byte[] value = decodeHexSafe(parser.readValueAs(String.class)); - if (value.length > 0 && value.length != byteWidth) { - throw new RuntimeException("mismatch byte width (" + value.length + ") at index " + i + ", expecting " + - byteWidth); - } values.add(value); } + int byteWidth = count > 0? values.get(0).length : 0; ArrowBuf buf = allocator.buffer(byteWidth * count); for (byte[] value : values) { - buf.writeBytes(value.length == 0? new byte[byteWidth] : value); + buf.writeBytes(value); } return buf; @@ -354,7 +351,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) thr BufferReader VARCHAR = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { ArrayList values = Lists.newArrayList(); int bufferSize = 0; for (int i = 0; i < count; i++) { @@ -377,7 +374,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) thr BufferReader VARBINARY = new BufferReader() { @Override - protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) throws IOException { + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { ArrayList values = Lists.newArrayList(); int bufferSize = 0; for (int i = 0; i < count; i++) { @@ -400,7 +397,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count, int byteWidth) thr } private ArrowBuf readIntoBuffer(BufferAllocator allocator, BufferType bufferType, - Types.MinorType type, int count, int byteWidth) throws IOException { + Types.MinorType type, int count) throws IOException { ArrowBuf buf; BufferHelper helper = new BufferHelper(); @@ -491,7 +488,7 @@ private ArrowBuf readIntoBuffer(BufferAllocator allocator, BufferType bufferType throw new InvalidArrowFileException("Unrecognized buffer type " + bufferType); } - buf = reader.readBuffer(allocator, count, byteWidth); + buf = reader.readBuffer(allocator, count); assert buf != null; return buf; @@ -539,9 +536,7 @@ private void readFromJsonIntoVector(Field field, FieldVector vector) throws Json innerBufferValueCount = valueCount + 1; } - /* byteWidth is only necessary for FixedSizeBinary type, so it by default set to -1 for other types */ - int byteWidth = vector instanceof FixedSizeBinaryVector? ((FixedSizeBinaryVector) vector).getByteWidth() : -1; - vectorBuffers[v] = readIntoBuffer(allocator, bufferType, vector.getMinorType(), innerBufferValueCount, byteWidth); + vectorBuffers[v] = readIntoBuffer(allocator, bufferType, vector.getMinorType(), innerBufferValueCount); } final int nullCount = BitVectorHelper.getNullCount(vectorBuffers[0], valueCount); From 873c5b2ee62367308748891c586f0f52ea78431e Mon Sep 17 00:00:00 2001 From: jingyuan Date: Wed, 24 Jan 2018 15:00:33 -0500 Subject: [PATCH 6/7] do not pass scale or byteWidth into writeValueToGenerator in JsonFileWriter.java --- .../org/apache/arrow/vector/ipc/JsonFileWriter.java | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java index 86fddd7d96f..ad56d573b91 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java @@ -163,10 +163,6 @@ private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOE generator.writeObjectField("name", field.getName()); int valueCount = vector.getValueCount(); generator.writeObjectField("count", valueCount); - final int scale = (vector instanceof DecimalVector) ? - ((DecimalVector) vector).getScale() : 0; - final int byteWidth = (vector instanceof FixedSizeBinaryVector) ? - ((FixedSizeBinaryVector) vector).getByteWidth() : -1; for (int v = 0; v < vectorTypes.size(); v++) { BufferType bufferType = vectorTypes.get(v); @@ -176,9 +172,9 @@ private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOE for (int i = 0; i < bufferValueCount; i++) { if (bufferType.equals(DATA) && (vector.getMinorType() == Types.MinorType.VARCHAR || vector.getMinorType() == Types.MinorType.VARBINARY)) { - writeValueToGenerator(bufferType, vectorBuffer, vectorBuffers.get(v-1), vector, i, scale, byteWidth); + writeValueToGenerator(bufferType, vectorBuffer, vectorBuffers.get(v-1), vector, i); } else { - writeValueToGenerator(bufferType, vectorBuffer, null, vector, i, scale, byteWidth); + writeValueToGenerator(bufferType, vectorBuffer, null, vector, i); } } generator.writeEndArray(); @@ -203,7 +199,7 @@ private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOE private void writeValueToGenerator(BufferType bufferType, ArrowBuf buffer, ArrowBuf offsetBuffer, FieldVector vector, - final int index, final int scale, int byteWidth) throws IOException { + final int index) throws IOException { if (bufferType.equals(TYPE)) { generator.writeNumber(buffer.getByte(index * TinyIntVector.TYPE_WIDTH)); } else if (bufferType.equals(OFFSET)) { @@ -283,6 +279,7 @@ private void writeValueToGenerator(BufferType bufferType, ArrowBuf buffer, break; } case FIXEDSIZEBINARY: + int byteWidth = ((FixedSizeBinaryVector) vector).getByteWidth(); String fixedSizeHexString = Hex.encodeHexString(FixedSizeBinaryVector.get(buffer, index, byteWidth)); generator.writeObject(fixedSizeHexString); break; @@ -293,6 +290,7 @@ private void writeValueToGenerator(BufferType bufferType, ArrowBuf buffer, break; } case DECIMAL: { + int scale = ((DecimalVector) vector).getScale(); BigDecimal decimalValue = DecimalUtility.getBigDecimalFromArrowBuf(buffer, index, scale); // We write the unscaled value, because the scale is stored in the type metadata. generator.writeString(decimalValue.unscaledValue().toString()); From c994a19f4e61079e9e9fc91717be036c62a036f6 Mon Sep 17 00:00:00 2001 From: jingyuan Date: Wed, 31 Jan 2018 12:53:34 -0500 Subject: [PATCH 7/7] create data buffer layout for FixedSizeBinary directly instead of using dataBuffer() method and some other changes based on PR comments --- integration/integration_test.py | 2 +- .../src/main/java/org/apache/arrow/vector/BitVector.java | 2 +- .../src/main/java/org/apache/arrow/vector/BufferLayout.java | 4 ++-- .../src/main/java/org/apache/arrow/vector/TypeLayout.java | 2 +- .../main/java/org/apache/arrow/vector/ipc/JsonFileReader.java | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/integration/integration_test.py b/integration/integration_test.py index 6dddf71d96d..301017cac3d 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -466,7 +466,7 @@ def generate_column(self, size, name=None): class FixedSizeBinaryType(PrimitiveType): def __init__(self, name, byte_width, nullable=True): - PrimitiveType.__init__(self, name, nullable) + super(FixedSizeBinaryType, self).__init__(name, nullable=nullable) self.byte_width = byte_width @property diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index 3887da4a618..b7b7b990630 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -58,7 +58,7 @@ public BitVector(String name, BufferAllocator allocator) { * @param allocator allocator for memory management. */ public BitVector(String name, FieldType fieldType, BufferAllocator allocator) { - super(name, allocator, fieldType, (byte) 0); + super(name, allocator, fieldType, 0); reader = new BitReaderImpl(BitVector.this); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java index 54d6cd194f0..0faab7b2b29 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java @@ -70,7 +70,7 @@ public static BufferLayout dataBuffer(int typeBitWidth) { case 128: return VALUES_128; default: - return new BufferLayout(BufferType.DATA, typeBitWidth); + throw new IllegalArgumentException("only 8, 16, 32, 64, or 128 bits supported"); } } @@ -90,7 +90,7 @@ public static BufferLayout byteVector() { private final BufferType type; - private BufferLayout(BufferType type, int typeBitWidth) { + BufferLayout(BufferType type, int typeBitWidth) { super(); this.type = Preconditions.checkNotNull(type); this.typeBitWidth = (short) typeBitWidth; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java index 12bfcaf7b9f..4c05b97b4ac 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java @@ -139,7 +139,7 @@ public TypeLayout visit(Decimal type) { @Override public TypeLayout visit(FixedSizeBinary type) { - return newFixedWidthTypeLayout(BufferLayout.dataBuffer(type.getByteWidth() * 8)); + return newFixedWidthTypeLayout(new BufferLayout(BufferType.DATA, type.getByteWidth() * 8)); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java index dc0d0724e55..8995716461e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java @@ -339,7 +339,7 @@ protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException values.add(value); } - int byteWidth = count > 0? values.get(0).length : 0; + int byteWidth = count > 0 ? values.get(0).length : 0; ArrowBuf buf = allocator.buffer(byteWidth * count); for (byte[] value : values) { buf.writeBytes(value);