From d1564d706d333b51f20bf64e3f0bb0c3d7aee169 Mon Sep 17 00:00:00 2001 From: Jingyuan Wang Date: Tue, 29 Aug 2017 15:28:53 -0400 Subject: [PATCH 1/3] initial draft of adding FixedSizeBinary support in Java and integration tests --- integration/integration_test.py | 76 +++++++- .../src/main/codegen/data/ArrowTypes.tdd | 5 + .../main/codegen/data/ValueVectorTypes.tdd | 15 ++ .../codegen/templates/FixedValueVectors.java | 168 +++++++++++++----- .../codegen/templates/HolderReaderImpl.java | 4 + .../templates/NullableValueVectors.java | 4 +- .../vector/dictionary/DictionaryEncoder.java | 2 +- .../vector/file/json/JsonFileReader.java | 4 + .../vector/file/json/JsonFileWriter.java | 5 + .../arrow/vector/schema/TypeLayout.java | 5 + .../arrow/vector/schema/VectorLayout.java | 2 +- .../org/apache/arrow/vector/types/Types.java | 19 ++ .../apache/arrow/vector/util/Validator.java | 2 +- .../arrow/vector/file/TestArrowFile.java | 59 ++++++ .../arrow/vector/types/pojo/TestSchema.java | 4 +- 15 files changed, 316 insertions(+), 58 deletions(-) diff --git a/integration/integration_test.py b/integration/integration_test.py index 46539484488..c8dbf44204d 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -374,6 +374,49 @@ def generate_column(self, size, name=None): return self.column_class(name, size, is_valid, values) +class FixedSizeBinaryType(PrimitiveType): + + def __init__(self, name, byte_width, nullable=True): + PrimitiveType.__init__(self, name, nullable) + self.byte_width = byte_width + + @property + def numpy_type(self): + return object + + @property + def column_class(self): + return FixedSizeBinaryColumn + + def _get_type(self): + return OrderedDict([('name', 'fixedsizebinary'), ('byteWidth', self.byte_width)]) + + def _get_type_layout(self): + return OrderedDict([ + ('vectors', + [OrderedDict([('type', 'VALIDITY'), + ('typeBitWidth', 1)]), + OrderedDict([('type', 'DATA'), + ('typeBitWidth', self.byte_width)])])]) + + def generate_column(self, size, name=None): + is_valid = self._make_is_valid(size) + values = [] + + for i in range(size): + if is_valid[i]: + draw = (np.random.randint(0, 255, size=self.byte_width) + .astype(np.uint8) + .tostring()) + values.append(draw) + else: + values.append("") + + if name is None: + name = self.name + return self.column_class(name, size, is_valid, values) + + class StringType(BinaryType): @property @@ -436,6 +479,22 @@ def _get_buffers(self): ] +class FixedSizeBinaryColumn(PrimitiveColumn): + + def _encode_value(self, x): + return ''.join('{:02x}'.format(c).upper() for c in x) + + def _get_buffers(self): + data = [] + for i, v in enumerate(self.values): + data.append(self._encode_value(v if self.is_valid[i] else "")) + + return [ + ('VALIDITY', [int(x) for x in self.is_valid]), + ('DATA', data) + ] + + class StringColumn(BinaryColumn): def _encode_value(self, x): @@ -648,6 +707,9 @@ def get_field(name, type_, nullable=True): return BinaryType(name, nullable=nullable) elif type_ == 'utf8': return StringType(name, nullable=nullable) + elif type_.startswith('fixedsizebinary_'): + byte_width = int(type_.split('_')[1]) + return FixedSizeBinaryType(name, byte_width=byte_width, nullable=nullable) dtype = np.dtype(type_) @@ -677,10 +739,11 @@ def _generate_file(name, fields, batch_sizes, dictionaries=None): return JsonFile(name, schema, batches, dictionaries) -def generate_primitive_case(batch_sizes): +def generate_primitive_case(batch_sizes, name_suffix=None): types = ['bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', - 'float32', 'float64', 'binary', 'utf8'] + 'float32', 'float64', 'binary', 'utf8', + 'fixedsizebinary_19', 'fixedsizebinary_120'] fields = [] @@ -688,7 +751,10 @@ def generate_primitive_case(batch_sizes): fields.append(get_field(type_ + "_nullable", type_, True)) fields.append(get_field(type_ + "_nonnullable", type_, False)) - return _generate_file("primitive", fields, batch_sizes) + name = "primitive" + if name_suffix is not None: + name += name_suffix + return _generate_file(name, fields, batch_sizes) def generate_datetime_case(): @@ -754,8 +820,8 @@ def _temp_path(): return file_objs = [ - generate_primitive_case([7, 10]), - generate_primitive_case([0, 0, 0]), + generate_primitive_case([7, 10], "_0"), + generate_primitive_case([0, 0, 0], "_1"), generate_datetime_case(), generate_nested_case(), generate_dictionary_case() diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index ce92c1333a5..63b193fc66a 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -57,6 +57,11 @@ fields: [], complex: false }, + { + name: "FixedSizeBinary", + fields: [{name: "byteWidth", type: int}], + complex: false + } { name: "Bool", fields: [], diff --git a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd index 970d887c760..ab5a213d297 100644 --- a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd +++ b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd @@ -122,6 +122,21 @@ } ] }, + { + major: "Fixed", + width: -1, + javaType: "byte[]", + boxedType: "ArrowBuf", + minor: [ + { + class: "FixedSizeBinary", + typeParams: [ {name: "byteWidth", type: "int"} ], + arrowType: "org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary", + friendlyType: "byte[]", + fields: [{name: "index", type: "int", include: false}, {name: "buffer", type: "ArrowBuf"}], + } + ] + }, { major: "VarLen", width: 4, diff --git a/java/vector/src/main/codegen/templates/FixedValueVectors.java b/java/vector/src/main/codegen/templates/FixedValueVectors.java index 9747d421c41..5b024371abb 100644 --- a/java/vector/src/main/codegen/templates/FixedValueVectors.java +++ b/java/vector/src/main/codegen/templates/FixedValueVectors.java @@ -39,7 +39,9 @@ * ${minor.class} implements a vector of fixed width values. Elements in the vector are accessed * by position, starting from the logical start of the vector. Values should be pushed onto the * vector sequentially, but may be randomly accessed. +<#if (type.width > 0) > * The width of each element is ${type.width} byte(s) + * The equivalent Java primitive is '${minor.javaType!type.javaType}' * * NB: this class is automatically generated from ${.template_name} and ValueVectorTypes.tdd using FreeMarker. @@ -47,12 +49,16 @@ public final class ${className} extends BaseDataValueVector implements FixedWidthVector{ private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(${className}.class); + <#if (type.width < 0)> + public final int TYPE_WIDTH; + <#else> public static final int TYPE_WIDTH = ${type.width}; + private final Accessor accessor = new Accessor(); private final Mutator mutator = new Mutator(); - private int allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * ${type.width}; + private int allocationSizeInBytes; private int allocationMonitor = 0; <#if minor.typeParams??> @@ -63,13 +69,18 @@ public final class ${className} extends BaseDataValueVector implements FixedWidt public ${className}(String name, BufferAllocator allocator<#list typeParams as typeParam>, ${typeParam.type} ${typeParam.name}) { super(name, allocator); + <#if minor.class == "FixedSizeBinary"> + TYPE_WIDTH = byteWidth; + <#list typeParams as typeParam> this.${typeParam.name} = ${typeParam.name}; + allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * TYPE_WIDTH; } <#else> public ${className}(String name, BufferAllocator allocator) { super(name, allocator); + allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * TYPE_WIDTH; } @@ -93,7 +104,7 @@ public int getBufferSizeFor(final int valueCount) { if (valueCount == 0) { return 0; } - return valueCount * ${type.width}; + return valueCount * TYPE_WIDTH; } @Override @@ -116,7 +127,7 @@ public ArrowBuf getOffsetBuffer() { @Override public int getValueCapacity(){ - return (int) (data.capacity() *1.0 / ${type.width}); + return (int) (data.capacity() *1.0 / TYPE_WIDTH); } @Override @@ -135,7 +146,7 @@ int getAllocationSize() { @Override public void setInitialCapacity(final int valueCount) { - final long size = 1L * valueCount * ${type.width}; + final long size = 1L * valueCount * TYPE_WIDTH; if (size > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Requested amount of memory is more than max allowed allocation size"); } @@ -179,12 +190,12 @@ public boolean allocateNewSafe() { */ @Override public void allocateNew(final int valueCount) { - allocateBytes(valueCount * ${type.width}); + allocateBytes(valueCount * TYPE_WIDTH); } @Override public void reset() { - allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * ${type.width}; + allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * TYPE_WIDTH; allocationMonitor = 0; zeroVector(); super.reset(); @@ -254,8 +265,8 @@ public void transferTo(${className} target){ } public void splitAndTransferTo(int startIndex, int length, ${className} target) { - final int startPoint = startIndex * ${type.width}; - final int sliceLength = length * ${type.width}; + final int startPoint = startIndex * TYPE_WIDTH; + final int sliceLength = length * TYPE_WIDTH; target.clear(); target.data = data.slice(startPoint, sliceLength).transferOwnership(target.allocator).buffer; target.data.writerIndex(sliceLength); @@ -294,11 +305,11 @@ public void copyValueSafe(int fromIndex, int toIndex) { } public void copyFrom(int fromIndex, int thisIndex, ${className} from){ - <#if (type.width > 8 || minor.class == "IntervalDay")> - from.data.getBytes(fromIndex * ${type.width}, data, thisIndex * ${type.width}, ${type.width}); - <#else> <#-- type.width <= 8 --> - data.set${(minor.javaType!type.javaType)?cap_first}(thisIndex * ${type.width}, - from.data.get${(minor.javaType!type.javaType)?cap_first}(fromIndex * ${type.width}) + <#if (type.width < 0 || type.width > 8 || minor.class == "IntervalDay")> + from.data.getBytes(fromIndex * TYPE_WIDTH, data, thisIndex * TYPE_WIDTH, TYPE_WIDTH); + <#else> <#-- type.width >= 0 && type.width <= 8 --> + data.set${(minor.javaType!type.javaType)?cap_first}(thisIndex * TYPE_WIDTH, + from.data.get${(minor.javaType!type.javaType)?cap_first}(fromIndex * TYPE_WIDTH) ); <#-- type.width --> } @@ -324,7 +335,7 @@ private void incrementAllocationMonitor() { public final class Accessor extends BaseDataValueVector.BaseAccessor { @Override public int getValueCount() { - return data.writerIndex() / ${type.width}; + return data.writerIndex() / TYPE_WIDTH; } @Override @@ -332,20 +343,27 @@ public boolean isNull(int index){ return false; } - <#if (type.width > 8 || minor.class == "IntervalDay")> + <#if (type.width < 0 || type.width > 8 || minor.class == "IntervalDay")> public ${minor.javaType!type.javaType} get(int index) { - return data.slice(index * ${type.width}, ${type.width}); + <#if (minor.class == "FixedSizeBinary")> + assert index >= 0; + final byte[] dst = new byte[TYPE_WIDTH]; + data.getBytes(index * TYPE_WIDTH, dst, 0, TYPE_WIDTH); + return dst; + <#else> + return data.slice(index * TYPE_WIDTH, TYPE_WIDTH); + } <#if (minor.class == "IntervalDay")> public void get(int index, ${minor.class}Holder holder){ - final int offsetIndex = index * ${type.width}; + final int offsetIndex = index * TYPE_WIDTH; holder.days = data.getInt(offsetIndex); holder.milliseconds = data.getInt(offsetIndex + ${minor.millisecondsOffset}); } public void get(int index, Nullable${minor.class}Holder holder){ - final int offsetIndex = index * ${type.width}; + final int offsetIndex = index * TYPE_WIDTH; holder.isSet = 1; holder.days = data.getInt(offsetIndex); holder.milliseconds = data.getInt(offsetIndex + ${minor.millisecondsOffset}); @@ -353,7 +371,7 @@ public void get(int index, Nullable${minor.class}Holder holder){ @Override public ${friendlyType} getObject(int index) { - final int offsetIndex = index * ${type.width}; + final int offsetIndex = index * TYPE_WIDTH; final int millis = data.getInt(offsetIndex + ${minor.millisecondsOffset}); final int days = data.getInt(offsetIndex); final Period p = new Period(); @@ -361,7 +379,7 @@ public void get(int index, Nullable${minor.class}Holder holder){ } public StringBuilder getAsStringBuilder(int index) { - final int offsetIndex = index * ${type.width}; + final int offsetIndex = index * TYPE_WIDTH; int millis = data.getInt(offsetIndex + ${minor.millisecondsOffset}); final int days = data.getInt(offsetIndex); @@ -387,7 +405,7 @@ public StringBuilder getAsStringBuilder(int index) { <#elseif minor.class == "Decimal"> public void get(int index, ${minor.class}Holder holder) { - holder.start = index * ${type.width}; + holder.start = index * TYPE_WIDTH; holder.buffer = data; holder.scale = scale; holder.precision = precision; @@ -395,7 +413,7 @@ public void get(int index, ${minor.class}Holder holder) { public void get(int index, Nullable${minor.class}Holder holder) { holder.isSet = 1; - holder.start = index * ${type.width}; + holder.start = index * TYPE_WIDTH; holder.buffer = data; holder.scale = scale; holder.precision = precision; @@ -403,35 +421,52 @@ public void get(int index, Nullable${minor.class}Holder holder) { @Override public ${friendlyType} getObject(int index) { - return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromArrowBuf(data, ${type.width} * index, scale); + return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromArrowBuf(data, TYPE_WIDTH * index, scale); + } + + <#elseif minor.class == "FixedSizeBinary"> + public void get(int index, ${minor.class}Holder holder) { + holder.index = index; + holder.buffer = data; + } + + public void get(int index, Nullable${minor.class}Holder holder) { + holder.isSet = 1; + holder.index = index; + holder.buffer = data; + } + + @Override + public ${friendlyType} getObject(int index) { + return get(index); } <#else> public void get(int index, ${minor.class}Holder holder){ holder.buffer = data; - holder.start = index * ${type.width}; + holder.start = index * TYPE_WIDTH; } public void get(int index, Nullable${minor.class}Holder holder){ holder.isSet = 1; holder.buffer = data; - holder.start = index * ${type.width}; + holder.start = index * TYPE_WIDTH; } @Override public ${friendlyType} getObject(int index) { - return data.slice(index * ${type.width}, ${type.width}) + return data.slice(index * TYPE_WIDTH, TYPE_WIDTH); } - <#else> <#-- type.width <= 8 --> + <#else> <#-- type.width >= 0 && type.width <= 8 --> public ${minor.javaType!type.javaType} get(int index) { - return data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); + return data.get${(minor.javaType!type.javaType)?cap_first}(index * TYPE_WIDTH); } <#if type.width == 4> public long getTwoAsLong(int index) { - return data.getLong(index * ${type.width}); + return data.getLong(index * TYPE_WIDTH); } @@ -514,12 +549,12 @@ public StringBuilder getAsStringBuilder(int index) { public void get(int index, ${minor.class}Holder holder){ - holder.value = data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); + holder.value = data.get${(minor.javaType!type.javaType)?cap_first}(index * TYPE_WIDTH); } public void get(int index, Nullable${minor.class}Holder holder){ holder.isSet = 1; - holder.value = data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); + holder.value = data.get${(minor.javaType!type.javaType)?cap_first}(index * TYPE_WIDTH); } <#-- type.width --> @@ -529,7 +564,9 @@ public void get(int index, Nullable${minor.class}Holder holder){ * ${minor.class}.Mutator implements a mutable vector of fixed width values. Elements in the * vector are accessed by position from the logical start of the vector. Values should be pushed * onto the vector sequentially, but may be randomly accessed. + <#if (type.width > 0)> * The width of each element is ${type.width} byte(s) + * The equivalent Java primitive is '${minor.javaType!type.javaType}' * * NB: this class is automatically generated from FixedValueVectorTypes.tdd using FreeMarker. @@ -545,21 +582,24 @@ public final class Mutator extends BaseDataValueVector.BaseMutator { * @param index position of the bit to set * @param value value to set */ - <#if (type.width > 8) || minor.class == "IntervalDay"> - public void set(int index, <#if (type.width > 4)>${minor.javaType!type.javaType}<#else>int value) { - data.setBytes(index * ${type.width}, value, 0, ${type.width}); + <#if (type.width < 0 || type.width > 8 || minor.class == "IntervalDay")> + public void set(int index, <#if (type.width < 0 || type.width > 4)>${minor.javaType!type.javaType}<#else>int value) { + <#if minor.class == "FixedSizeBinary"> + assert TYPE_WIDTH <= value.length; + + data.setBytes(index * TYPE_WIDTH, value, 0, TYPE_WIDTH); } - public void setSafe(int index, <#if (type.width > 4)>${minor.javaType!type.javaType}<#else>int value) { + public void setSafe(int index, <#if (type.width < 0 || type.width > 4)>${minor.javaType!type.javaType}<#else>int value) { while(index >= getValueCapacity()) { reAlloc(); } - data.setBytes(index * ${type.width}, value, 0, ${type.width}); + set(index, value); } <#if (minor.class == "IntervalDay")> public void set(int index, int days, int milliseconds){ - final int offsetIndex = index * ${type.width}; + final int offsetIndex = index * TYPE_WIDTH; data.setInt(offsetIndex, days); data.setInt((offsetIndex + ${minor.millisecondsOffset}), milliseconds); } @@ -611,7 +651,41 @@ public void setSafe(int index, int start, ArrowBuf buffer){ } public void set(int index, int start, ArrowBuf buffer){ - data.setBytes(index * ${type.width}, buffer, start, ${type.width}); + data.setBytes(index * TYPE_WIDTH, buffer, start, TYPE_WIDTH); + } + + <#elseif minor.class == "FixedSizeBinary"> + public void set(int index, ${minor.class}Holder holder){ + assert index == holder.index; + set(index, holder.buffer); + } + + void set(int index, Nullable${minor.class}Holder holder){ + assert index == holder.index; + set(index, holder.buffer); + } + + public void setSafe(int index, Nullable${minor.class}Holder holder){ + assert index == holder.index; + setSafe(index, holder.buffer); + } + + public void setSafe(int index, ${minor.class}Holder holder){ + assert index == holder.index; + setSafe(index, holder.buffer); + } + + public void setSafe(int index, ArrowBuf buffer){ + while(index >= getValueCapacity()) { + reAlloc(); + } + set(index, buffer); + } + + public void set(int index, ArrowBuf buffer){ + assert TYPE_WIDTH <= buffer.capacity(); + int start = index * TYPE_WIDTH; + buffer.getBytes(0, data, start, TYPE_WIDTH); } <#else> @@ -624,7 +698,7 @@ public void set(int index, Nullable${minor.class}Holder holder){ } public void set(int index, int start, ArrowBuf buffer){ - data.setBytes(index * ${type.width}, buffer, start, ${type.width}); + data.setBytes(index * TYPE_WIDTH, buffer, start, TYPE_WIDTH); } public void setSafe(int index, ${minor.class}Holder holder){ @@ -643,7 +717,7 @@ public void setSafe(int index, int start, ArrowBuf buffer){ } public void set(int index, Nullable${minor.class}Holder holder){ - data.setBytes(index * ${type.width}, holder.buffer, holder.start, ${type.width}); + data.setBytes(index * TYPE_WIDTH, holder.buffer, holder.start, TYPE_WIDTH); } @@ -654,15 +728,15 @@ public void generateTestData(int count) { final int valueCount = getAccessor().getValueCount(); for(int i = 0; i < valueCount; i++, even = !even) { final byte b = even ? Byte.MIN_VALUE : Byte.MAX_VALUE; - for(int w = 0; w < ${type.width}; w++){ + for(int w = 0; w < TYPE_WIDTH; w++){ data.setByte(i + w, b); } } } - <#else> <#-- type.width <= 8 --> + <#else> <#-- type.width >= 0 && type.width <= 8 --> public void set(int index, <#if (type.width >= 4)>${minor.javaType!type.javaType}<#else>int value) { - data.set${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}, value); + data.set${(minor.javaType!type.javaType)?cap_first}(index * TYPE_WIDTH, value); } public void setSafe(int index, <#if (type.width >= 4)>${minor.javaType!type.javaType}<#else>int value) { @@ -673,7 +747,7 @@ public void setSafe(int index, <#if (type.width >= 4)>${minor.javaType!type.java } protected void set(int index, ${minor.class}Holder holder){ - data.set${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}, holder.value); + data.set${(minor.javaType!type.javaType)?cap_first}(index * TYPE_WIDTH, holder.value); } public void setSafe(int index, ${minor.class}Holder holder){ @@ -684,7 +758,7 @@ public void setSafe(int index, ${minor.class}Holder holder){ } protected void set(int index, Nullable${minor.class}Holder holder){ - data.set${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}, holder.value); + data.set${(minor.javaType!type.javaType)?cap_first}(index * TYPE_WIDTH, holder.value); } public void setSafe(int index, Nullable${minor.class}Holder holder){ @@ -726,7 +800,7 @@ public void generateTestDataAlt(int size) { @Override public void setValueCount(int valueCount) { final int currentValueCapacity = getValueCapacity(); - final int idx = (${type.width} * valueCount); + final int idx = (TYPE_WIDTH * valueCount); while(valueCount > getValueCapacity()) { reAlloc(); } @@ -736,7 +810,7 @@ public void setValueCount(int valueCount) { allocationMonitor = 0; } VectorTrimmer.trim(data, idx); - data.writerIndex(valueCount * ${type.width}); + data.writerIndex(valueCount * TYPE_WIDTH); } } } diff --git a/java/vector/src/main/codegen/templates/HolderReaderImpl.java b/java/vector/src/main/codegen/templates/HolderReaderImpl.java index c2aa83757b9..6b1a79383ad 100644 --- a/java/vector/src/main/codegen/templates/HolderReaderImpl.java +++ b/java/vector/src/main/codegen/templates/HolderReaderImpl.java @@ -128,6 +128,10 @@ public void read(Nullable${name}Holder h) { holder.buffer.getBytes(holder.start, bytes, 0, ${type.width}); ${friendlyType} value = new BigDecimal(new BigInteger(bytes), holder.scale); return value; + <#elseif minor.class == "FixedSizeBinary"> + byte[] value = new byte [holder.byteWidth]; + holder.buffer.getBytes(holder.index * holder.byteWidth, value, 0, holder.byteWidth); + return value; <#else> ${friendlyType} value = new ${friendlyType}(this.holder.value); return value; diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java index a4313332563..e6d4ed4f32f 100644 --- a/java/vector/src/main/codegen/templates/NullableValueVectors.java +++ b/java/vector/src/main/codegen/templates/NullableValueVectors.java @@ -572,7 +572,7 @@ public void setIndexDefined(int index){ * @param index position of the bit to set * @param value array of bytes (or int if smaller than 4 bytes) to write */ - public void set(int index, <#if type.major == "VarLen">byte[]<#elseif (type.width < 4)>int<#else>${minor.javaType!type.javaType} value) { + public void set(int index, <#if type.major == "VarLen">byte[]<#elseif (type.width >= 0 && type.width < 4)>int<#else>${minor.javaType!type.javaType} value) { setCount++; final ${valuesName}.Mutator valuesMutator = values.getMutator(); final BitVector.Mutator bitsMutator = bits.getMutator(); @@ -673,7 +673,7 @@ public boolean isSafe(int outIndex) { } <#assign fields = minor.fields!type.fields /> - public void set(int index, int isSet<#list fields as field>, ${field.type} ${field.name}Field ){ + public void set(int index, int isSet<#list fields as field><#if field.include!true >, ${field.type} ${field.name}Field ){ final ${valuesName}.Mutator valuesMutator = values.getMutator(); <#if type.major == "VarLen"> for (int i = lastSet + 1; i < index; i++) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java index 7e20794cbbe..21c2011bef0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -140,7 +140,7 @@ public static ValueVector decode(ValueVector indices, Dictionary dictionary) { private static void validateType(MinorType type) { // byte arrays don't work as keys in our dictionary map - we could wrap them with something to // implement equals and hashcode if we want that functionality - if (type == MinorType.VARBINARY || type == MinorType.LIST || type == MinorType.MAP || type == MinorType.UNION) { + if (type == MinorType.VARBINARY || type == MinorType.FIXEDSIZEBINARY || type == MinorType.LIST || type == MinorType.MAP || type == MinorType.UNION) { throw new IllegalArgumentException("Dictionary encoding for complex types not implemented: type " + type); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java index 484a82fdaab..52a2a73772b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java @@ -64,6 +64,7 @@ import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ValueVector.Mutator; import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.complex.NullableMapVector; @@ -315,6 +316,9 @@ private void setValueFromParser(ValueVector valueVector, int i) throws IOExcepti case VARBINARY: ((VarBinaryVector) valueVector).getMutator().setSafe(i, decodeHexSafe(parser.readValueAs(String.class))); break; + case FIXEDSIZEBINARY: + ((FixedSizeBinaryVector) valueVector).getMutator().setSafe(i, decodeHexSafe(parser.readValueAs(String.class))); + break; case VARCHAR: ((VarCharVector) valueVector).getMutator().setSafe(i, parser.readValueAs(String.class).getBytes(UTF_8)); break; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java index a2229cef231..1279fb914a3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java @@ -42,6 +42,7 @@ import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ValueVector.Accessor; import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.dictionary.Dictionary; import org.apache.arrow.vector.dictionary.DictionaryProvider; @@ -237,6 +238,10 @@ private void writeValueToGenerator(ValueVector valueVector, int i) throws IOExce String hexString = Hex.encodeHexString(((VarBinaryVector) valueVector).getAccessor().get(i)); generator.writeObject(hexString); break; + case FIXEDSIZEBINARY: + String fixedSizeHexString = Hex.encodeHexString(((FixedSizeBinaryVector) valueVector).getAccessor().get(i)); + generator.writeObject(fixedSizeHexString); + break; default: // TODO: each type Accessor accessor = valueVector.getAccessor(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java index 29407bf1ab4..327557858c6 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java @@ -155,6 +155,11 @@ public TypeLayout visit(Binary type) { return newVariableWidthTypeLayout(); } + @Override + public TypeLayout visit(ArrowType.FixedSizeBinary type) { + return newFixedWidthTypeLayout(dataVector(type.getByteWidth() * 8)); + } + @Override public TypeLayout visit(Utf8 type) { return newVariableWidthTypeLayout(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java index 0871baf38ed..56cc8c74fc4 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java @@ -58,7 +58,7 @@ public static VectorLayout dataVector(int typeBitWidth) { case 64: return VALUES_64; default: - throw new IllegalArgumentException("only 8, 16, 32, or 64 bits supported"); + return new VectorLayout(DATA, typeBitWidth); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index c57dd6dafe9..022a8fc3ba2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -53,6 +53,7 @@ import org.apache.arrow.vector.NullableUInt4Vector; import org.apache.arrow.vector.NullableUInt8Vector; import org.apache.arrow.vector.NullableVarBinaryVector; +import org.apache.arrow.vector.NullableFixedSizeBinaryVector; import org.apache.arrow.vector.NullableVarCharVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ZeroVector; @@ -92,11 +93,13 @@ import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.complex.impl.UnionWriter; import org.apache.arrow.vector.complex.impl.VarBinaryWriterImpl; +import org.apache.arrow.vector.complex.impl.FixedSizeBinaryWriterImpl; import org.apache.arrow.vector.complex.impl.VarCharWriterImpl; import org.apache.arrow.vector.complex.writer.FieldWriter; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor; import org.apache.arrow.vector.types.pojo.ArrowType.Binary; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; import org.apache.arrow.vector.types.pojo.ArrowType.Bool; import org.apache.arrow.vector.types.pojo.ArrowType.Date; import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; @@ -376,6 +379,17 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new VarBinaryWriterImpl((NullableVarBinaryVector) vector); } }, + FIXEDSIZEBINARY(null) { + @Override + public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { + return new NullableFixedSizeBinaryVector(name, fieldType, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new FixedSizeBinaryWriterImpl((NullableFixedSizeBinaryVector) vector); + } + }, DECIMAL(null) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { @@ -597,6 +611,11 @@ public MinorType visit(Binary type) { return MinorType.VARBINARY; } + @Override + public MinorType visit(FixedSizeBinary type) { + return MinorType.FIXEDSIZEBINARY; + } + @Override public MinorType visit(Bool type) { return MinorType.BIT; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java index 5851bd5fa5d..9b04b0bf62e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java @@ -140,7 +140,7 @@ static boolean equals(ArrowType type, final Object o1, final Object o2) { default: throw new UnsupportedOperationException("unsupported precision: " + fpType); } - } else if (type instanceof ArrowType.Binary) { + } else if (type instanceof ArrowType.Binary || type instanceof ArrowType.FixedSizeBinary) { return Arrays.equals((byte[]) o1, (byte[]) o2); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java index c483ba7de91..5a6897fcff4 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java @@ -39,6 +39,7 @@ import org.apache.arrow.vector.NullableFloat4Vector; import org.apache.arrow.vector.NullableIntVector; import org.apache.arrow.vector.NullableTinyIntVector; +import org.apache.arrow.vector.NullableFixedSizeBinaryVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.MapVector; @@ -54,6 +55,7 @@ import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; import org.apache.arrow.vector.types.pojo.ArrowType.Int; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; @@ -563,6 +565,63 @@ public void testWriteReadNestedDictionary() throws IOException { } } + @Test + public void testWriteReadFixedSizeBinary() throws IOException { + File file = new File("target/mytest_fixed_size_binary.arrow"); + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + final int numValues = 10; + final int typeWidth = 11; + byte[][] byteValues = new byte[numValues][typeWidth]; + for (int i=0; i Date: Tue, 29 Aug 2017 15:29:33 -0400 Subject: [PATCH 2/3] add unit test for FixedSizeBinaryVector --- .../vector/TestFixedSizeBinaryVector.java | 262 ++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java new file mode 100644 index 00000000000..84cf7eba7c2 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java @@ -0,0 +1,262 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.holders.FixedSizeBinaryHolder; +import org.apache.arrow.vector.holders.NullableFixedSizeBinaryHolder; +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.assertArrayEquals; + +public class TestFixedSizeBinaryVector { + private static final int numValues = 123; + private static final int typeWidth = 9; + private static final int smallTypeWidth = 6; + private static final int largeTypeWidth = 12; + + private static byte[][] values; + static { + values = new byte[numValues][typeWidth]; + for (int i = 0; i < numValues; i++) { + for (int j = 0; j < typeWidth; j++) { + values[i][j] = ((byte) i); + } + } + } + + private ArrowBuf[] bufs = new ArrowBuf[numValues]; + private FixedSizeBinaryHolder[] holders = new FixedSizeBinaryHolder[numValues]; + private NullableFixedSizeBinaryHolder[] nullableHolders = new NullableFixedSizeBinaryHolder[numValues]; + + private static byte[] smallValue; + static { + smallValue = new byte[smallTypeWidth]; + for (int i = 0; i < smallTypeWidth; i++) { + smallValue[i] = ((byte) i); + } + } + + private ArrowBuf smallBuf; + private FixedSizeBinaryHolder smallHolder; + private NullableFixedSizeBinaryHolder smallNullableHolder; + + private static byte[] largeValue; + static { + largeValue = new byte[largeTypeWidth]; + for (int i = 0; i < largeTypeWidth; i++) { + largeValue[i] = ((byte) i); + } + } + + private ArrowBuf largeBuf; + private FixedSizeBinaryHolder largeHolder; + private NullableFixedSizeBinaryHolder largeNullableHolder; + + private FixedSizeBinaryVector.Mutator mutator; + private FixedSizeBinaryVector.Accessor accessor; + + private static void failWithException(String message) throws Exception { + throw new Exception(message); + } + + @Before + public void setUp() throws Exception { + BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + FixedSizeBinaryVector fixedSizeBinaryVector = new FixedSizeBinaryVector("fixedSizeBinary", allocator, typeWidth); + fixedSizeBinaryVector.allocateNew(); + mutator = fixedSizeBinaryVector.getMutator(); + accessor = fixedSizeBinaryVector.getAccessor(); + + for (int i = 0; i < numValues; i++) { + bufs[i] = allocator.buffer(typeWidth); + bufs[i].setBytes(0, values[i]); + + holders[i] = new FixedSizeBinaryHolder(); + holders[i].byteWidth = typeWidth; + holders[i].index = i; + holders[i].buffer = bufs[i]; + + nullableHolders[i] = new NullableFixedSizeBinaryHolder(); + nullableHolders[i].byteWidth = typeWidth; + nullableHolders[i].index = i; + nullableHolders[i].buffer = bufs[i]; + } + + smallBuf = allocator.buffer(smallTypeWidth); + smallBuf.setBytes(0, smallValue); + + smallHolder = new FixedSizeBinaryHolder(); + smallHolder.byteWidth = smallTypeWidth; + smallHolder.index = 0; + smallHolder.buffer = smallBuf; + + smallNullableHolder = new NullableFixedSizeBinaryHolder(); + smallNullableHolder.byteWidth = smallTypeWidth; + smallNullableHolder.index = 0; + smallNullableHolder.buffer = smallBuf; + + largeBuf = allocator.buffer(largeTypeWidth); + largeBuf.setBytes(0, largeValue); + + largeHolder = new FixedSizeBinaryHolder(); + largeHolder.byteWidth = largeTypeWidth; + largeHolder.index = 0; + largeHolder.buffer = largeBuf; + + largeNullableHolder = new NullableFixedSizeBinaryHolder(); + largeNullableHolder.byteWidth = largeTypeWidth; + largeNullableHolder.index = 0; + largeNullableHolder.buffer = largeBuf; + } + + @Test + public void testSetUsingByteArray() { + for (int i = 0; i < numValues; i++) { + mutator.set(i, values[i]); + } + mutator.setValueCount(numValues); + for (int i = 0; i < numValues; i++) { + assertArrayEquals(values[i], accessor.getObject(i)); + } + } + + @Test + public void testSetUsingHolder() { + for (int i = 0; i < numValues; i++) { + mutator.set(i, holders[i]); + } + mutator.setValueCount(numValues); + for (int i = 0; i < numValues; i++) { + assertArrayEquals(values[i], accessor.getObject(i)); + } + } + + @Test + public void testSetUsingNullableHolder() { + for (int i = 0; i < numValues; i++) { + mutator.set(i, nullableHolders[i]); + } + mutator.setValueCount(numValues); + for (int i = 0; i < numValues; i++) { + assertArrayEquals(values[i], accessor.getObject(i)); + } + } + + @Test + public void testMutatorSetWithInvalidInput() throws Exception { + String errorMsg = "input data needs to be at least " + typeWidth + " bytes"; + + // test small inputs + try { + mutator.set(0, smallValue); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + mutator.set(0, smallHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + mutator.set(0, smallNullableHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + mutator.set(0, smallBuf); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + // test large inputs + mutator.set(0, largeValue); + mutator.set(0, largeHolder); + mutator.set(0, largeNullableHolder); + mutator.set(0, largeBuf); + + // test holders with wrong indices + try { + mutator.set(0, holders[3]); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + mutator.set(0, nullableHolders[3]); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + } + + @Test + public void setMutatorSetSafeWithInvalidInput() throws Exception { + String errorMsg = "input data needs to be at least " + typeWidth + " bytes"; + + // test small inputs + try { + mutator.setSafe(0, smallValue); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + mutator.setSafe(0, smallHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + mutator.setSafe(0, smallNullableHolder); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + mutator.setSafe(0, smallBuf); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + // test large inputs + mutator.setSafe(0, largeValue); + mutator.setSafe(0, largeHolder); + mutator.setSafe(0, largeNullableHolder); + mutator.setSafe(0, largeBuf); + + // test holders with wrong indices + try { + mutator.setSafe(0, holders[3]); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + + try { + mutator.setSafe(0, nullableHolders[3]); + failWithException(errorMsg); + } catch (AssertionError ignore) { + } + } +} From d1b2844110a043af8c9802fb1d2f35e3efd9825a Mon Sep 17 00:00:00 2001 From: Jingyuan Wang Date: Tue, 29 Aug 2017 15:47:28 -0400 Subject: [PATCH 3/3] add unit test for FixedSizeBinaryReader/Writer --- .../complex/writer/TestComplexWriter.java | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index f81cd557a9d..80a6a083372 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -797,6 +797,51 @@ public void timeStampNanoWriters() throws Exception { } } + @Test + public void fixedSizeBinaryWriters() throws Exception { + // test values + int numValues = 10; + int byteWidth = 9; + byte[][] values = new byte[numValues][byteWidth]; + for (int i = 0; i < numValues; i++) { + for (int j = 0; j < byteWidth; j++) { + values[i][j] = ((byte) i); + } + } + ArrowBuf[] bufs = new ArrowBuf[numValues]; + for (int i = 0; i < numValues; i++) { + bufs[i] = allocator.buffer(byteWidth); + bufs[i].setBytes(0, values[i]); + } + + // write + MapVector parent = new MapVector("parent", allocator, null); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + + String fieldName = "fixedSizeBinary"; + FixedSizeBinaryWriter fixedSizeBinaryWriter = rootWriter.fixedSizeBinary(fieldName, byteWidth); + for (int i = 0; i < numValues; i++) { + fixedSizeBinaryWriter.setPosition(i); + fixedSizeBinaryWriter.writeFixedSizeBinary(i, bufs[i]); + } + + // schema + List children = parent.getField().getChildren().get(0).getChildren(); + Assert.assertEquals(fieldName, children.get(0).getName()); + Assert.assertEquals(ArrowType.FixedSizeBinary.TYPE_TYPE, children.get(0).getType().getTypeID()); + + // read + MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); + + FieldReader fixedSizeBinaryReader = rootReader.reader(fieldName); + for (int i = 0; i < numValues; i++) { + fixedSizeBinaryReader.setPosition(i); + byte[] readValues = fixedSizeBinaryReader.readByteArray(); + Assert.assertArrayEquals(values[i], readValues); + } + } + @Test public void complexCopierWithList() { MapVector parent = MapVector.empty("parent", allocator);