diff --git a/integration/integration_test.py b/integration/integration_test.py index 46539484488..c8dbf44204d 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -374,6 +374,49 @@ def generate_column(self, size, name=None): return self.column_class(name, size, is_valid, values) +class FixedSizeBinaryType(PrimitiveType): + + def __init__(self, name, byte_width, nullable=True): + PrimitiveType.__init__(self, name, nullable) + self.byte_width = byte_width + + @property + def numpy_type(self): + return object + + @property + def column_class(self): + return FixedSizeBinaryColumn + + def _get_type(self): + return OrderedDict([('name', 'fixedsizebinary'), ('byteWidth', self.byte_width)]) + + def _get_type_layout(self): + return OrderedDict([ + ('vectors', + [OrderedDict([('type', 'VALIDITY'), + ('typeBitWidth', 1)]), + OrderedDict([('type', 'DATA'), + ('typeBitWidth', self.byte_width)])])]) + + def generate_column(self, size, name=None): + is_valid = self._make_is_valid(size) + values = [] + + for i in range(size): + if is_valid[i]: + draw = (np.random.randint(0, 255, size=self.byte_width) + .astype(np.uint8) + .tostring()) + values.append(draw) + else: + values.append("") + + if name is None: + name = self.name + return self.column_class(name, size, is_valid, values) + + class StringType(BinaryType): @property @@ -436,6 +479,22 @@ def _get_buffers(self): ] +class FixedSizeBinaryColumn(PrimitiveColumn): + + def _encode_value(self, x): + return ''.join('{:02x}'.format(c).upper() for c in x) + + def _get_buffers(self): + data = [] + for i, v in enumerate(self.values): + data.append(self._encode_value(v if self.is_valid[i] else "")) + + return [ + ('VALIDITY', [int(x) for x in self.is_valid]), + ('DATA', data) + ] + + class StringColumn(BinaryColumn): def _encode_value(self, x): @@ -648,6 +707,9 @@ def get_field(name, type_, nullable=True): return BinaryType(name, nullable=nullable) elif type_ == 'utf8': return StringType(name, nullable=nullable) + elif type_.startswith('fixedsizebinary_'): + byte_width = int(type_.split('_')[1]) + return FixedSizeBinaryType(name, byte_width=byte_width, nullable=nullable) dtype = np.dtype(type_) @@ -677,10 +739,11 @@ def _generate_file(name, fields, batch_sizes, dictionaries=None): return JsonFile(name, schema, batches, dictionaries) -def generate_primitive_case(batch_sizes): +def generate_primitive_case(batch_sizes, name_suffix=None): types = ['bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', - 'float32', 'float64', 'binary', 'utf8'] + 'float32', 'float64', 'binary', 'utf8', + 'fixedsizebinary_19', 'fixedsizebinary_120'] fields = [] @@ -688,7 +751,10 @@ def generate_primitive_case(batch_sizes): fields.append(get_field(type_ + "_nullable", type_, True)) fields.append(get_field(type_ + "_nonnullable", type_, False)) - return _generate_file("primitive", fields, batch_sizes) + name = "primitive" + if name_suffix is not None: + name += name_suffix + return _generate_file(name, fields, batch_sizes) def generate_datetime_case(): @@ -754,8 +820,8 @@ def _temp_path(): return file_objs = [ - generate_primitive_case([7, 10]), - generate_primitive_case([0, 0, 0]), + generate_primitive_case([7, 10], "_0"), + generate_primitive_case([0, 0, 0], "_1"), generate_datetime_case(), generate_nested_case(), generate_dictionary_case() diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index ce92c1333a5..63b193fc66a 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -57,6 +57,11 @@ fields: [], complex: false }, + { + name: "FixedSizeBinary", + fields: [{name: "byteWidth", type: int}], + complex: false + } { name: "Bool", fields: [], diff --git a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd index 970d887c760..ab5a213d297 100644 --- a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd +++ b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd @@ -122,6 +122,21 @@ } ] }, + { + major: "Fixed", + width: -1, + javaType: "byte[]", + boxedType: "ArrowBuf", + minor: [ + { + class: "FixedSizeBinary", + typeParams: [ {name: "byteWidth", type: "int"} ], + arrowType: "org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary", + friendlyType: "byte[]", + fields: [{name: "index", type: "int", include: false}, {name: "buffer", type: "ArrowBuf"}], + } + ] + }, { major: "VarLen", width: 4, diff --git a/java/vector/src/main/codegen/templates/FixedValueVectors.java b/java/vector/src/main/codegen/templates/FixedValueVectors.java index 9747d421c41..5b024371abb 100644 --- a/java/vector/src/main/codegen/templates/FixedValueVectors.java +++ b/java/vector/src/main/codegen/templates/FixedValueVectors.java @@ -39,7 +39,9 @@ * ${minor.class} implements a vector of fixed width values. Elements in the vector are accessed * by position, starting from the logical start of the vector. Values should be pushed onto the * vector sequentially, but may be randomly accessed. +<#if (type.width > 0) > * The width of each element is ${type.width} byte(s) +#if> * The equivalent Java primitive is '${minor.javaType!type.javaType}' * * NB: this class is automatically generated from ${.template_name} and ValueVectorTypes.tdd using FreeMarker. @@ -47,12 +49,16 @@ public final class ${className} extends BaseDataValueVector implements FixedWidthVector{ private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(${className}.class); + <#if (type.width < 0)> + public final int TYPE_WIDTH; + <#else> public static final int TYPE_WIDTH = ${type.width}; + #if> private final Accessor accessor = new Accessor(); private final Mutator mutator = new Mutator(); - private int allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * ${type.width}; + private int allocationSizeInBytes; private int allocationMonitor = 0; <#if minor.typeParams??> @@ -63,13 +69,18 @@ public final class ${className} extends BaseDataValueVector implements FixedWidt public ${className}(String name, BufferAllocator allocator<#list typeParams as typeParam>, ${typeParam.type} ${typeParam.name}#list>) { super(name, allocator); + <#if minor.class == "FixedSizeBinary"> + TYPE_WIDTH = byteWidth; + #if> <#list typeParams as typeParam> this.${typeParam.name} = ${typeParam.name}; #list> + allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * TYPE_WIDTH; } <#else> public ${className}(String name, BufferAllocator allocator) { super(name, allocator); + allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * TYPE_WIDTH; } #if> @@ -93,7 +104,7 @@ public int getBufferSizeFor(final int valueCount) { if (valueCount == 0) { return 0; } - return valueCount * ${type.width}; + return valueCount * TYPE_WIDTH; } @Override @@ -116,7 +127,7 @@ public ArrowBuf getOffsetBuffer() { @Override public int getValueCapacity(){ - return (int) (data.capacity() *1.0 / ${type.width}); + return (int) (data.capacity() *1.0 / TYPE_WIDTH); } @Override @@ -135,7 +146,7 @@ int getAllocationSize() { @Override public void setInitialCapacity(final int valueCount) { - final long size = 1L * valueCount * ${type.width}; + final long size = 1L * valueCount * TYPE_WIDTH; if (size > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Requested amount of memory is more than max allowed allocation size"); } @@ -179,12 +190,12 @@ public boolean allocateNewSafe() { */ @Override public void allocateNew(final int valueCount) { - allocateBytes(valueCount * ${type.width}); + allocateBytes(valueCount * TYPE_WIDTH); } @Override public void reset() { - allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * ${type.width}; + allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * TYPE_WIDTH; allocationMonitor = 0; zeroVector(); super.reset(); @@ -254,8 +265,8 @@ public void transferTo(${className} target){ } public void splitAndTransferTo(int startIndex, int length, ${className} target) { - final int startPoint = startIndex * ${type.width}; - final int sliceLength = length * ${type.width}; + final int startPoint = startIndex * TYPE_WIDTH; + final int sliceLength = length * TYPE_WIDTH; target.clear(); target.data = data.slice(startPoint, sliceLength).transferOwnership(target.allocator).buffer; target.data.writerIndex(sliceLength); @@ -294,11 +305,11 @@ public void copyValueSafe(int fromIndex, int toIndex) { } public void copyFrom(int fromIndex, int thisIndex, ${className} from){ - <#if (type.width > 8 || minor.class == "IntervalDay")> - from.data.getBytes(fromIndex * ${type.width}, data, thisIndex * ${type.width}, ${type.width}); - <#else> <#-- type.width <= 8 --> - data.set${(minor.javaType!type.javaType)?cap_first}(thisIndex * ${type.width}, - from.data.get${(minor.javaType!type.javaType)?cap_first}(fromIndex * ${type.width}) + <#if (type.width < 0 || type.width > 8 || minor.class == "IntervalDay")> + from.data.getBytes(fromIndex * TYPE_WIDTH, data, thisIndex * TYPE_WIDTH, TYPE_WIDTH); + <#else> <#-- type.width >= 0 && type.width <= 8 --> + data.set${(minor.javaType!type.javaType)?cap_first}(thisIndex * TYPE_WIDTH, + from.data.get${(minor.javaType!type.javaType)?cap_first}(fromIndex * TYPE_WIDTH) ); #if> <#-- type.width --> } @@ -324,7 +335,7 @@ private void incrementAllocationMonitor() { public final class Accessor extends BaseDataValueVector.BaseAccessor { @Override public int getValueCount() { - return data.writerIndex() / ${type.width}; + return data.writerIndex() / TYPE_WIDTH; } @Override @@ -332,20 +343,27 @@ public boolean isNull(int index){ return false; } - <#if (type.width > 8 || minor.class == "IntervalDay")> + <#if (type.width < 0 || type.width > 8 || minor.class == "IntervalDay")> public ${minor.javaType!type.javaType} get(int index) { - return data.slice(index * ${type.width}, ${type.width}); + <#if (minor.class == "FixedSizeBinary")> + assert index >= 0; + final byte[] dst = new byte[TYPE_WIDTH]; + data.getBytes(index * TYPE_WIDTH, dst, 0, TYPE_WIDTH); + return dst; + <#else> + return data.slice(index * TYPE_WIDTH, TYPE_WIDTH); + #if> } <#if (minor.class == "IntervalDay")> public void get(int index, ${minor.class}Holder holder){ - final int offsetIndex = index * ${type.width}; + final int offsetIndex = index * TYPE_WIDTH; holder.days = data.getInt(offsetIndex); holder.milliseconds = data.getInt(offsetIndex + ${minor.millisecondsOffset}); } public void get(int index, Nullable${minor.class}Holder holder){ - final int offsetIndex = index * ${type.width}; + final int offsetIndex = index * TYPE_WIDTH; holder.isSet = 1; holder.days = data.getInt(offsetIndex); holder.milliseconds = data.getInt(offsetIndex + ${minor.millisecondsOffset}); @@ -353,7 +371,7 @@ public void get(int index, Nullable${minor.class}Holder holder){ @Override public ${friendlyType} getObject(int index) { - final int offsetIndex = index * ${type.width}; + final int offsetIndex = index * TYPE_WIDTH; final int millis = data.getInt(offsetIndex + ${minor.millisecondsOffset}); final int days = data.getInt(offsetIndex); final Period p = new Period(); @@ -361,7 +379,7 @@ public void get(int index, Nullable${minor.class}Holder holder){ } public StringBuilder getAsStringBuilder(int index) { - final int offsetIndex = index * ${type.width}; + final int offsetIndex = index * TYPE_WIDTH; int millis = data.getInt(offsetIndex + ${minor.millisecondsOffset}); final int days = data.getInt(offsetIndex); @@ -387,7 +405,7 @@ public StringBuilder getAsStringBuilder(int index) { <#elseif minor.class == "Decimal"> public void get(int index, ${minor.class}Holder holder) { - holder.start = index * ${type.width}; + holder.start = index * TYPE_WIDTH; holder.buffer = data; holder.scale = scale; holder.precision = precision; @@ -395,7 +413,7 @@ public void get(int index, ${minor.class}Holder holder) { public void get(int index, Nullable${minor.class}Holder holder) { holder.isSet = 1; - holder.start = index * ${type.width}; + holder.start = index * TYPE_WIDTH; holder.buffer = data; holder.scale = scale; holder.precision = precision; @@ -403,35 +421,52 @@ public void get(int index, Nullable${minor.class}Holder holder) { @Override public ${friendlyType} getObject(int index) { - return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromArrowBuf(data, ${type.width} * index, scale); + return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromArrowBuf(data, TYPE_WIDTH * index, scale); + } + + <#elseif minor.class == "FixedSizeBinary"> + public void get(int index, ${minor.class}Holder holder) { + holder.index = index; + holder.buffer = data; + } + + public void get(int index, Nullable${minor.class}Holder holder) { + holder.isSet = 1; + holder.index = index; + holder.buffer = data; + } + + @Override + public ${friendlyType} getObject(int index) { + return get(index); } <#else> public void get(int index, ${minor.class}Holder holder){ holder.buffer = data; - holder.start = index * ${type.width}; + holder.start = index * TYPE_WIDTH; } public void get(int index, Nullable${minor.class}Holder holder){ holder.isSet = 1; holder.buffer = data; - holder.start = index * ${type.width}; + holder.start = index * TYPE_WIDTH; } @Override public ${friendlyType} getObject(int index) { - return data.slice(index * ${type.width}, ${type.width}) + return data.slice(index * TYPE_WIDTH, TYPE_WIDTH); } #if> - <#else> <#-- type.width <= 8 --> + <#else> <#-- type.width >= 0 && type.width <= 8 --> public ${minor.javaType!type.javaType} get(int index) { - return data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); + return data.get${(minor.javaType!type.javaType)?cap_first}(index * TYPE_WIDTH); } <#if type.width == 4> public long getTwoAsLong(int index) { - return data.getLong(index * ${type.width}); + return data.getLong(index * TYPE_WIDTH); } #if> @@ -514,12 +549,12 @@ public StringBuilder getAsStringBuilder(int index) { #if> public void get(int index, ${minor.class}Holder holder){ - holder.value = data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); + holder.value = data.get${(minor.javaType!type.javaType)?cap_first}(index * TYPE_WIDTH); } public void get(int index, Nullable${minor.class}Holder holder){ holder.isSet = 1; - holder.value = data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); + holder.value = data.get${(minor.javaType!type.javaType)?cap_first}(index * TYPE_WIDTH); } #if> <#-- type.width --> @@ -529,7 +564,9 @@ public void get(int index, Nullable${minor.class}Holder holder){ * ${minor.class}.Mutator implements a mutable vector of fixed width values. Elements in the * vector are accessed by position from the logical start of the vector. Values should be pushed * onto the vector sequentially, but may be randomly accessed. + <#if (type.width > 0)> * The width of each element is ${type.width} byte(s) + #if> * The equivalent Java primitive is '${minor.javaType!type.javaType}' * * NB: this class is automatically generated from FixedValueVectorTypes.tdd using FreeMarker. @@ -545,21 +582,24 @@ public final class Mutator extends BaseDataValueVector.BaseMutator { * @param index position of the bit to set * @param value value to set */ - <#if (type.width > 8) || minor.class == "IntervalDay"> - public void set(int index, <#if (type.width > 4)>${minor.javaType!type.javaType}<#else>int#if> value) { - data.setBytes(index * ${type.width}, value, 0, ${type.width}); + <#if (type.width < 0 || type.width > 8 || minor.class == "IntervalDay")> + public void set(int index, <#if (type.width < 0 || type.width > 4)>${minor.javaType!type.javaType}<#else>int#if> value) { + <#if minor.class == "FixedSizeBinary"> + assert TYPE_WIDTH <= value.length; + #if> + data.setBytes(index * TYPE_WIDTH, value, 0, TYPE_WIDTH); } - public void setSafe(int index, <#if (type.width > 4)>${minor.javaType!type.javaType}<#else>int#if> value) { + public void setSafe(int index, <#if (type.width < 0 || type.width > 4)>${minor.javaType!type.javaType}<#else>int#if> value) { while(index >= getValueCapacity()) { reAlloc(); } - data.setBytes(index * ${type.width}, value, 0, ${type.width}); + set(index, value); } <#if (minor.class == "IntervalDay")> public void set(int index, int days, int milliseconds){ - final int offsetIndex = index * ${type.width}; + final int offsetIndex = index * TYPE_WIDTH; data.setInt(offsetIndex, days); data.setInt((offsetIndex + ${minor.millisecondsOffset}), milliseconds); } @@ -611,7 +651,41 @@ public void setSafe(int index, int start, ArrowBuf buffer){ } public void set(int index, int start, ArrowBuf buffer){ - data.setBytes(index * ${type.width}, buffer, start, ${type.width}); + data.setBytes(index * TYPE_WIDTH, buffer, start, TYPE_WIDTH); + } + + <#elseif minor.class == "FixedSizeBinary"> + public void set(int index, ${minor.class}Holder holder){ + assert index == holder.index; + set(index, holder.buffer); + } + + void set(int index, Nullable${minor.class}Holder holder){ + assert index == holder.index; + set(index, holder.buffer); + } + + public void setSafe(int index, Nullable${minor.class}Holder holder){ + assert index == holder.index; + setSafe(index, holder.buffer); + } + + public void setSafe(int index, ${minor.class}Holder holder){ + assert index == holder.index; + setSafe(index, holder.buffer); + } + + public void setSafe(int index, ArrowBuf buffer){ + while(index >= getValueCapacity()) { + reAlloc(); + } + set(index, buffer); + } + + public void set(int index, ArrowBuf buffer){ + assert TYPE_WIDTH <= buffer.capacity(); + int start = index * TYPE_WIDTH; + buffer.getBytes(0, data, start, TYPE_WIDTH); } <#else> @@ -624,7 +698,7 @@ public void set(int index, Nullable${minor.class}Holder holder){ } public void set(int index, int start, ArrowBuf buffer){ - data.setBytes(index * ${type.width}, buffer, start, ${type.width}); + data.setBytes(index * TYPE_WIDTH, buffer, start, TYPE_WIDTH); } public void setSafe(int index, ${minor.class}Holder holder){ @@ -643,7 +717,7 @@ public void setSafe(int index, int start, ArrowBuf buffer){ } public void set(int index, Nullable${minor.class}Holder holder){ - data.setBytes(index * ${type.width}, holder.buffer, holder.start, ${type.width}); + data.setBytes(index * TYPE_WIDTH, holder.buffer, holder.start, TYPE_WIDTH); } #if> @@ -654,15 +728,15 @@ public void generateTestData(int count) { final int valueCount = getAccessor().getValueCount(); for(int i = 0; i < valueCount; i++, even = !even) { final byte b = even ? Byte.MIN_VALUE : Byte.MAX_VALUE; - for(int w = 0; w < ${type.width}; w++){ + for(int w = 0; w < TYPE_WIDTH; w++){ data.setByte(i + w, b); } } } - <#else> <#-- type.width <= 8 --> + <#else> <#-- type.width >= 0 && type.width <= 8 --> public void set(int index, <#if (type.width >= 4)>${minor.javaType!type.javaType}<#else>int#if> value) { - data.set${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}, value); + data.set${(minor.javaType!type.javaType)?cap_first}(index * TYPE_WIDTH, value); } public void setSafe(int index, <#if (type.width >= 4)>${minor.javaType!type.javaType}<#else>int#if> value) { @@ -673,7 +747,7 @@ public void setSafe(int index, <#if (type.width >= 4)>${minor.javaType!type.java } protected void set(int index, ${minor.class}Holder holder){ - data.set${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}, holder.value); + data.set${(minor.javaType!type.javaType)?cap_first}(index * TYPE_WIDTH, holder.value); } public void setSafe(int index, ${minor.class}Holder holder){ @@ -684,7 +758,7 @@ public void setSafe(int index, ${minor.class}Holder holder){ } protected void set(int index, Nullable${minor.class}Holder holder){ - data.set${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}, holder.value); + data.set${(minor.javaType!type.javaType)?cap_first}(index * TYPE_WIDTH, holder.value); } public void setSafe(int index, Nullable${minor.class}Holder holder){ @@ -726,7 +800,7 @@ public void generateTestDataAlt(int size) { @Override public void setValueCount(int valueCount) { final int currentValueCapacity = getValueCapacity(); - final int idx = (${type.width} * valueCount); + final int idx = (TYPE_WIDTH * valueCount); while(valueCount > getValueCapacity()) { reAlloc(); } @@ -736,7 +810,7 @@ public void setValueCount(int valueCount) { allocationMonitor = 0; } VectorTrimmer.trim(data, idx); - data.writerIndex(valueCount * ${type.width}); + data.writerIndex(valueCount * TYPE_WIDTH); } } } diff --git a/java/vector/src/main/codegen/templates/HolderReaderImpl.java b/java/vector/src/main/codegen/templates/HolderReaderImpl.java index c2aa83757b9..6b1a79383ad 100644 --- a/java/vector/src/main/codegen/templates/HolderReaderImpl.java +++ b/java/vector/src/main/codegen/templates/HolderReaderImpl.java @@ -128,6 +128,10 @@ public void read(Nullable${name}Holder h) { holder.buffer.getBytes(holder.start, bytes, 0, ${type.width}); ${friendlyType} value = new BigDecimal(new BigInteger(bytes), holder.scale); return value; + <#elseif minor.class == "FixedSizeBinary"> + byte[] value = new byte [holder.byteWidth]; + holder.buffer.getBytes(holder.index * holder.byteWidth, value, 0, holder.byteWidth); + return value; <#else> ${friendlyType} value = new ${friendlyType}(this.holder.value); return value; diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java index a4313332563..e6d4ed4f32f 100644 --- a/java/vector/src/main/codegen/templates/NullableValueVectors.java +++ b/java/vector/src/main/codegen/templates/NullableValueVectors.java @@ -572,7 +572,7 @@ public void setIndexDefined(int index){ * @param index position of the bit to set * @param value array of bytes (or int if smaller than 4 bytes) to write */ - public void set(int index, <#if type.major == "VarLen">byte[]<#elseif (type.width < 4)>int<#else>${minor.javaType!type.javaType}#if> value) { + public void set(int index, <#if type.major == "VarLen">byte[]<#elseif (type.width >= 0 && type.width < 4)>int<#else>${minor.javaType!type.javaType}#if> value) { setCount++; final ${valuesName}.Mutator valuesMutator = values.getMutator(); final BitVector.Mutator bitsMutator = bits.getMutator(); @@ -673,7 +673,7 @@ public boolean isSafe(int outIndex) { } <#assign fields = minor.fields!type.fields /> - public void set(int index, int isSet<#list fields as field>, ${field.type} ${field.name}Field#list> ){ + public void set(int index, int isSet<#list fields as field><#if field.include!true >, ${field.type} ${field.name}Field#if>#list> ){ final ${valuesName}.Mutator valuesMutator = values.getMutator(); <#if type.major == "VarLen"> for (int i = lastSet + 1; i < index; i++) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java index 7e20794cbbe..21c2011bef0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -140,7 +140,7 @@ public static ValueVector decode(ValueVector indices, Dictionary dictionary) { private static void validateType(MinorType type) { // byte arrays don't work as keys in our dictionary map - we could wrap them with something to // implement equals and hashcode if we want that functionality - if (type == MinorType.VARBINARY || type == MinorType.LIST || type == MinorType.MAP || type == MinorType.UNION) { + if (type == MinorType.VARBINARY || type == MinorType.FIXEDSIZEBINARY || type == MinorType.LIST || type == MinorType.MAP || type == MinorType.UNION) { throw new IllegalArgumentException("Dictionary encoding for complex types not implemented: type " + type); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java index 484a82fdaab..52a2a73772b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java @@ -64,6 +64,7 @@ import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ValueVector.Mutator; import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.complex.NullableMapVector; @@ -315,6 +316,9 @@ private void setValueFromParser(ValueVector valueVector, int i) throws IOExcepti case VARBINARY: ((VarBinaryVector) valueVector).getMutator().setSafe(i, decodeHexSafe(parser.readValueAs(String.class))); break; + case FIXEDSIZEBINARY: + ((FixedSizeBinaryVector) valueVector).getMutator().setSafe(i, decodeHexSafe(parser.readValueAs(String.class))); + break; case VARCHAR: ((VarCharVector) valueVector).getMutator().setSafe(i, parser.readValueAs(String.class).getBytes(UTF_8)); break; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java index a2229cef231..1279fb914a3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java @@ -42,6 +42,7 @@ import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ValueVector.Accessor; import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.dictionary.Dictionary; import org.apache.arrow.vector.dictionary.DictionaryProvider; @@ -237,6 +238,10 @@ private void writeValueToGenerator(ValueVector valueVector, int i) throws IOExce String hexString = Hex.encodeHexString(((VarBinaryVector) valueVector).getAccessor().get(i)); generator.writeObject(hexString); break; + case FIXEDSIZEBINARY: + String fixedSizeHexString = Hex.encodeHexString(((FixedSizeBinaryVector) valueVector).getAccessor().get(i)); + generator.writeObject(fixedSizeHexString); + break; default: // TODO: each type Accessor accessor = valueVector.getAccessor(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java index 29407bf1ab4..327557858c6 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java @@ -155,6 +155,11 @@ public TypeLayout visit(Binary type) { return newVariableWidthTypeLayout(); } + @Override + public TypeLayout visit(ArrowType.FixedSizeBinary type) { + return newFixedWidthTypeLayout(dataVector(type.getByteWidth() * 8)); + } + @Override public TypeLayout visit(Utf8 type) { return newVariableWidthTypeLayout(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java index 0871baf38ed..56cc8c74fc4 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java @@ -58,7 +58,7 @@ public static VectorLayout dataVector(int typeBitWidth) { case 64: return VALUES_64; default: - throw new IllegalArgumentException("only 8, 16, 32, or 64 bits supported"); + return new VectorLayout(DATA, typeBitWidth); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index c57dd6dafe9..022a8fc3ba2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -53,6 +53,7 @@ import org.apache.arrow.vector.NullableUInt4Vector; import org.apache.arrow.vector.NullableUInt8Vector; import org.apache.arrow.vector.NullableVarBinaryVector; +import org.apache.arrow.vector.NullableFixedSizeBinaryVector; import org.apache.arrow.vector.NullableVarCharVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ZeroVector; @@ -92,11 +93,13 @@ import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.complex.impl.UnionWriter; import org.apache.arrow.vector.complex.impl.VarBinaryWriterImpl; +import org.apache.arrow.vector.complex.impl.FixedSizeBinaryWriterImpl; import org.apache.arrow.vector.complex.impl.VarCharWriterImpl; import org.apache.arrow.vector.complex.writer.FieldWriter; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor; import org.apache.arrow.vector.types.pojo.ArrowType.Binary; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeBinary; import org.apache.arrow.vector.types.pojo.ArrowType.Bool; import org.apache.arrow.vector.types.pojo.ArrowType.Date; import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; @@ -376,6 +379,17 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new VarBinaryWriterImpl((NullableVarBinaryVector) vector); } }, + FIXEDSIZEBINARY(null) { + @Override + public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { + return new NullableFixedSizeBinaryVector(name, fieldType, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new FixedSizeBinaryWriterImpl((NullableFixedSizeBinaryVector) vector); + } + }, DECIMAL(null) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { @@ -597,6 +611,11 @@ public MinorType visit(Binary type) { return MinorType.VARBINARY; } + @Override + public MinorType visit(FixedSizeBinary type) { + return MinorType.FIXEDSIZEBINARY; + } + @Override public MinorType visit(Bool type) { return MinorType.BIT; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java index 5851bd5fa5d..9b04b0bf62e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java @@ -140,7 +140,7 @@ static boolean equals(ArrowType type, final Object o1, final Object o2) { default: throw new UnsupportedOperationException("unsupported precision: " + fpType); } - } else if (type instanceof ArrowType.Binary) { + } else if (type instanceof ArrowType.Binary || type instanceof ArrowType.FixedSizeBinary) { return Arrays.equals((byte[]) o1, (byte[]) o2); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java new file mode 100644 index 00000000000..84cf7eba7c2 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java @@ -0,0 +1,262 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import io.netty.buffer.ArrowBuf;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.holders.FixedSizeBinaryHolder;
+import org.apache.arrow.vector.holders.NullableFixedSizeBinaryHolder;
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.assertArrayEquals;
+
+public class TestFixedSizeBinaryVector {
+ private static final int numValues = 123;
+ private static final int typeWidth = 9;
+ private static final int smallTypeWidth = 6;
+ private static final int largeTypeWidth = 12;
+
+ private static byte[][] values;
+ static {
+ values = new byte[numValues][typeWidth];
+ for (int i = 0; i < numValues; i++) {
+ for (int j = 0; j < typeWidth; j++) {
+ values[i][j] = ((byte) i);
+ }
+ }
+ }
+
+ private ArrowBuf[] bufs = new ArrowBuf[numValues];
+ private FixedSizeBinaryHolder[] holders = new FixedSizeBinaryHolder[numValues];
+ private NullableFixedSizeBinaryHolder[] nullableHolders = new NullableFixedSizeBinaryHolder[numValues];
+
+ private static byte[] smallValue;
+ static {
+ smallValue = new byte[smallTypeWidth];
+ for (int i = 0; i < smallTypeWidth; i++) {
+ smallValue[i] = ((byte) i);
+ }
+ }
+
+ private ArrowBuf smallBuf;
+ private FixedSizeBinaryHolder smallHolder;
+ private NullableFixedSizeBinaryHolder smallNullableHolder;
+
+ private static byte[] largeValue;
+ static {
+ largeValue = new byte[largeTypeWidth];
+ for (int i = 0; i < largeTypeWidth; i++) {
+ largeValue[i] = ((byte) i);
+ }
+ }
+
+ private ArrowBuf largeBuf;
+ private FixedSizeBinaryHolder largeHolder;
+ private NullableFixedSizeBinaryHolder largeNullableHolder;
+
+ private FixedSizeBinaryVector.Mutator mutator;
+ private FixedSizeBinaryVector.Accessor accessor;
+
+ private static void failWithException(String message) throws Exception {
+ throw new Exception(message);
+ }
+
+ @Before
+ public void setUp() throws Exception {
+ BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE);
+ FixedSizeBinaryVector fixedSizeBinaryVector = new FixedSizeBinaryVector("fixedSizeBinary", allocator, typeWidth);
+ fixedSizeBinaryVector.allocateNew();
+ mutator = fixedSizeBinaryVector.getMutator();
+ accessor = fixedSizeBinaryVector.getAccessor();
+
+ for (int i = 0; i < numValues; i++) {
+ bufs[i] = allocator.buffer(typeWidth);
+ bufs[i].setBytes(0, values[i]);
+
+ holders[i] = new FixedSizeBinaryHolder();
+ holders[i].byteWidth = typeWidth;
+ holders[i].index = i;
+ holders[i].buffer = bufs[i];
+
+ nullableHolders[i] = new NullableFixedSizeBinaryHolder();
+ nullableHolders[i].byteWidth = typeWidth;
+ nullableHolders[i].index = i;
+ nullableHolders[i].buffer = bufs[i];
+ }
+
+ smallBuf = allocator.buffer(smallTypeWidth);
+ smallBuf.setBytes(0, smallValue);
+
+ smallHolder = new FixedSizeBinaryHolder();
+ smallHolder.byteWidth = smallTypeWidth;
+ smallHolder.index = 0;
+ smallHolder.buffer = smallBuf;
+
+ smallNullableHolder = new NullableFixedSizeBinaryHolder();
+ smallNullableHolder.byteWidth = smallTypeWidth;
+ smallNullableHolder.index = 0;
+ smallNullableHolder.buffer = smallBuf;
+
+ largeBuf = allocator.buffer(largeTypeWidth);
+ largeBuf.setBytes(0, largeValue);
+
+ largeHolder = new FixedSizeBinaryHolder();
+ largeHolder.byteWidth = largeTypeWidth;
+ largeHolder.index = 0;
+ largeHolder.buffer = largeBuf;
+
+ largeNullableHolder = new NullableFixedSizeBinaryHolder();
+ largeNullableHolder.byteWidth = largeTypeWidth;
+ largeNullableHolder.index = 0;
+ largeNullableHolder.buffer = largeBuf;
+ }
+
+ @Test
+ public void testSetUsingByteArray() {
+ for (int i = 0; i < numValues; i++) {
+ mutator.set(i, values[i]);
+ }
+ mutator.setValueCount(numValues);
+ for (int i = 0; i < numValues; i++) {
+ assertArrayEquals(values[i], accessor.getObject(i));
+ }
+ }
+
+ @Test
+ public void testSetUsingHolder() {
+ for (int i = 0; i < numValues; i++) {
+ mutator.set(i, holders[i]);
+ }
+ mutator.setValueCount(numValues);
+ for (int i = 0; i < numValues; i++) {
+ assertArrayEquals(values[i], accessor.getObject(i));
+ }
+ }
+
+ @Test
+ public void testSetUsingNullableHolder() {
+ for (int i = 0; i < numValues; i++) {
+ mutator.set(i, nullableHolders[i]);
+ }
+ mutator.setValueCount(numValues);
+ for (int i = 0; i < numValues; i++) {
+ assertArrayEquals(values[i], accessor.getObject(i));
+ }
+ }
+
+ @Test
+ public void testMutatorSetWithInvalidInput() throws Exception {
+ String errorMsg = "input data needs to be at least " + typeWidth + " bytes";
+
+ // test small inputs
+ try {
+ mutator.set(0, smallValue);
+ failWithException(errorMsg);
+ } catch (AssertionError ignore) {
+ }
+
+ try {
+ mutator.set(0, smallHolder);
+ failWithException(errorMsg);
+ } catch (AssertionError ignore) {
+ }
+
+ try {
+ mutator.set(0, smallNullableHolder);
+ failWithException(errorMsg);
+ } catch (AssertionError ignore) {
+ }
+
+ try {
+ mutator.set(0, smallBuf);
+ failWithException(errorMsg);
+ } catch (AssertionError ignore) {
+ }
+
+ // test large inputs
+ mutator.set(0, largeValue);
+ mutator.set(0, largeHolder);
+ mutator.set(0, largeNullableHolder);
+ mutator.set(0, largeBuf);
+
+ // test holders with wrong indices
+ try {
+ mutator.set(0, holders[3]);
+ failWithException(errorMsg);
+ } catch (AssertionError ignore) {
+ }
+
+ try {
+ mutator.set(0, nullableHolders[3]);
+ failWithException(errorMsg);
+ } catch (AssertionError ignore) {
+ }
+ }
+
+ @Test
+ public void setMutatorSetSafeWithInvalidInput() throws Exception {
+ String errorMsg = "input data needs to be at least " + typeWidth + " bytes";
+
+ // test small inputs
+ try {
+ mutator.setSafe(0, smallValue);
+ failWithException(errorMsg);
+ } catch (AssertionError ignore) {
+ }
+
+ try {
+ mutator.setSafe(0, smallHolder);
+ failWithException(errorMsg);
+ } catch (AssertionError ignore) {
+ }
+
+ try {
+ mutator.setSafe(0, smallNullableHolder);
+ failWithException(errorMsg);
+ } catch (AssertionError ignore) {
+ }
+
+ try {
+ mutator.setSafe(0, smallBuf);
+ failWithException(errorMsg);
+ } catch (AssertionError ignore) {
+ }
+
+ // test large inputs
+ mutator.setSafe(0, largeValue);
+ mutator.setSafe(0, largeHolder);
+ mutator.setSafe(0, largeNullableHolder);
+ mutator.setSafe(0, largeBuf);
+
+ // test holders with wrong indices
+ try {
+ mutator.setSafe(0, holders[3]);
+ failWithException(errorMsg);
+ } catch (AssertionError ignore) {
+ }
+
+ try {
+ mutator.setSafe(0, nullableHolders[3]);
+ failWithException(errorMsg);
+ } catch (AssertionError ignore) {
+ }
+ }
+}
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java
index f81cd557a9d..80a6a083372 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java
@@ -797,6 +797,51 @@ public void timeStampNanoWriters() throws Exception {
}
}
+ @Test
+ public void fixedSizeBinaryWriters() throws Exception {
+ // test values
+ int numValues = 10;
+ int byteWidth = 9;
+ byte[][] values = new byte[numValues][byteWidth];
+ for (int i = 0; i < numValues; i++) {
+ for (int j = 0; j < byteWidth; j++) {
+ values[i][j] = ((byte) i);
+ }
+ }
+ ArrowBuf[] bufs = new ArrowBuf[numValues];
+ for (int i = 0; i < numValues; i++) {
+ bufs[i] = allocator.buffer(byteWidth);
+ bufs[i].setBytes(0, values[i]);
+ }
+
+ // write
+ MapVector parent = new MapVector("parent", allocator, null);
+ ComplexWriter writer = new ComplexWriterImpl("root", parent);
+ MapWriter rootWriter = writer.rootAsMap();
+
+ String fieldName = "fixedSizeBinary";
+ FixedSizeBinaryWriter fixedSizeBinaryWriter = rootWriter.fixedSizeBinary(fieldName, byteWidth);
+ for (int i = 0; i < numValues; i++) {
+ fixedSizeBinaryWriter.setPosition(i);
+ fixedSizeBinaryWriter.writeFixedSizeBinary(i, bufs[i]);
+ }
+
+ // schema
+ List