diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java index dc708724043d..7d50676688e0 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java @@ -148,6 +148,11 @@ public ColumnBinder visit(ArrowType.Utf8 type) { new VarCharBinder<>(varChar, jdbcType); } + @Override + public ColumnBinder visit(ArrowType.Utf8View type) { + throw new UnsupportedOperationException("Column binder implemented for type " + type + " is not supported"); + } + @Override public ColumnBinder visit(ArrowType.LargeUtf8 type) { LargeVarCharVector varChar = (LargeVarCharVector) vector; @@ -162,6 +167,11 @@ public ColumnBinder visit(ArrowType.Binary type) { new VarBinaryBinder<>(varBinary, jdbcType); } + @Override + public ColumnBinder visit(ArrowType.BinaryView type) { + throw new UnsupportedOperationException("Column binder implemented for type " + type + " is not supported"); + } + @Override public ColumnBinder visit(ArrowType.LargeBinary type) { LargeVarBinaryVector varBinary = (LargeVarBinaryVector) vector; diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index cd2a464f4fa1..bc6139cc84c5 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -209,6 +209,11 @@ public List visit(ArrowType.Utf8 type) { } } + @Override + public List visit(ArrowType.Utf8View type) { + throw new UnsupportedOperationException("Importing buffers for view type: " + type + " not supported"); + } + @Override public List visit(ArrowType.LargeUtf8 type) { try (ArrowBuf offsets = 
importOffsets(type, LargeVarCharVector.OFFSET_WIDTH)) { @@ -237,6 +242,11 @@ public List visit(ArrowType.Binary type) { } } + @Override + public List visit(ArrowType.BinaryView type) { + throw new UnsupportedOperationException("Importing buffers for view type: " + type + " not supported"); + } + @Override public List visit(ArrowType.LargeBinary type) { try (ArrowBuf offsets = importOffsets(type, LargeVarBinaryVector.OFFSET_WIDTH)) { diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/BinaryViewAvaticaParameterConverter.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/BinaryViewAvaticaParameterConverter.java new file mode 100644 index 000000000000..dfd472701429 --- /dev/null +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/BinaryViewAvaticaParameterConverter.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.driver.jdbc.converter.impl; + +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.calcite.avatica.AvaticaParameter; +import org.apache.calcite.avatica.remote.TypedValue; + +/** AvaticaParameterConverter for BinaryView Arrow types. */ +public class BinaryViewAvaticaParameterConverter extends BaseAvaticaParameterConverter { + + public BinaryViewAvaticaParameterConverter(ArrowType.BinaryView type) { + + } + + @Override + public boolean bindParameter(FieldVector vector, TypedValue typedValue, int index) { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + public AvaticaParameter createParameter(Field field) { + return createParameter(field, false); + } +} diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/Utf8ViewAvaticaParameterConverter.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/Utf8ViewAvaticaParameterConverter.java new file mode 100644 index 000000000000..2c826aefb9c1 --- /dev/null +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/Utf8ViewAvaticaParameterConverter.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.driver.jdbc.converter.impl; + +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.calcite.avatica.AvaticaParameter; +import org.apache.calcite.avatica.remote.TypedValue; + +/** + * AvaticaParameterConverter for Utf8View Arrow types. + */ +public class Utf8ViewAvaticaParameterConverter extends BaseAvaticaParameterConverter { + + public Utf8ViewAvaticaParameterConverter(ArrowType.Utf8View type) { + } + + @Override + public boolean bindParameter(FieldVector vector, TypedValue typedValue, int index) { + throw new UnsupportedOperationException("Utf8View not supported"); + } + + @Override + public AvaticaParameter createParameter(Field field) { + return createParameter(field, false); + } +} diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java index b2bd8e745ecc..fd9127c22691 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java @@ -190,6 +190,11 @@ public Boolean visit(ArrowType.Utf8 type) { return new Utf8AvaticaParameterConverter(type).bindParameter(vector, typedValue, index); } + @Override + public Boolean visit(ArrowType.Utf8View type) { + throw 
new UnsupportedOperationException("Utf8View is unsupported"); + } + @Override public Boolean visit(ArrowType.LargeUtf8 type) { return new LargeUtf8AvaticaParameterConverter(type).bindParameter(vector, typedValue, index); @@ -200,6 +205,11 @@ public Boolean visit(ArrowType.Binary type) { return new BinaryAvaticaParameterConverter(type).bindParameter(vector, typedValue, index); } + @Override + public Boolean visit(ArrowType.BinaryView type) { + throw new UnsupportedOperationException("BinaryView is unsupported"); + } + @Override public Boolean visit(ArrowType.LargeBinary type) { return new LargeBinaryAvaticaParameterConverter(type).bindParameter(vector, typedValue, index); diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java index 843fe0cb89d9..93b5faaef32c 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java @@ -23,6 +23,7 @@ import java.util.stream.Stream; import org.apache.arrow.driver.jdbc.converter.impl.BinaryAvaticaParameterConverter; +import org.apache.arrow.driver.jdbc.converter.impl.BinaryViewAvaticaParameterConverter; import org.apache.arrow.driver.jdbc.converter.impl.BoolAvaticaParameterConverter; import org.apache.arrow.driver.jdbc.converter.impl.DateAvaticaParameterConverter; import org.apache.arrow.driver.jdbc.converter.impl.DecimalAvaticaParameterConverter; @@ -43,6 +44,7 @@ import org.apache.arrow.driver.jdbc.converter.impl.TimestampAvaticaParameterConverter; import org.apache.arrow.driver.jdbc.converter.impl.UnionAvaticaParameterConverter; import org.apache.arrow.driver.jdbc.converter.impl.Utf8AvaticaParameterConverter; +import org.apache.arrow.driver.jdbc.converter.impl.Utf8ViewAvaticaParameterConverter; import 
org.apache.arrow.flight.sql.FlightSqlColumnMetadata; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; @@ -208,6 +210,11 @@ public AvaticaParameter visit(ArrowType.Utf8 type) { return new Utf8AvaticaParameterConverter(type).createParameter(field); } + @Override + public AvaticaParameter visit(ArrowType.Utf8View type) { + return new Utf8ViewAvaticaParameterConverter(type).createParameter(field); + } + @Override public AvaticaParameter visit(ArrowType.LargeUtf8 type) { return new LargeUtf8AvaticaParameterConverter(type).createParameter(field); @@ -218,6 +225,11 @@ public AvaticaParameter visit(ArrowType.Binary type) { return new BinaryAvaticaParameterConverter(type).createParameter(field); } + @Override + public AvaticaParameter visit(ArrowType.BinaryView type) { + return new BinaryViewAvaticaParameterConverter(type).createParameter(field); + } + @Override public AvaticaParameter visit(ArrowType.LargeBinary type) { return new LargeBinaryAvaticaParameterConverter(type).createParameter(field); diff --git a/java/memory/memory-core/src/main/java/module-info.java b/java/memory/memory-core/src/main/java/module-info.java index 34ba34e80bc6..5024b7f45769 100644 --- a/java/memory/memory-core/src/main/java/module-info.java +++ b/java/memory/memory-core/src/main/java/module-info.java @@ -25,4 +25,5 @@ requires jsr305; requires org.immutables.value; requires org.slf4j; + requires org.checkerframework.checker.qual; } diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReusableBuffer.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReusableBuffer.java index 3530b819aadf..9e37c286ad83 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReusableBuffer.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReusableBuffer.java @@ -44,4 +44,6 @@ public interface ReusableBuffer { * @param len the number of bytes of the new data */ void set(ArrowBuf 
srcBytes, long start, long len); + + void set(byte[] srcBytes, long start, long len); } diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index 3cf9a968791a..9fe40f2319bf 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -65,6 +65,11 @@ fields: [], complex: false }, + { + name: "Utf8View", + fields: [], + complex: false + }, { name: "LargeUtf8", fields: [], @@ -75,6 +80,11 @@ fields: [], complex: false }, + { + name: "BinaryView", + fields: [], + complex: false + }, { name: "LargeBinary", fields: [], diff --git a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd index 6c2a96771245..ad1f1b93bb3a 100644 --- a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd +++ b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd @@ -189,7 +189,9 @@ fields: [{name: "start", type: "int"}, {name: "end", type: "int"}, {name: "buffer", type: "ArrowBuf"}], minor: [ { class: "VarBinary" , friendlyType: "byte[]" }, - { class: "VarChar" , friendlyType: "Text" } + { class: "VarChar" , friendlyType: "Text" }, + { class: "ViewVarBinary" , friendlyType: "byte[]" }, + { class: "ViewVarChar" , friendlyType: "Text" } ] }, { diff --git a/java/vector/src/main/codegen/templates/HolderReaderImpl.java b/java/vector/src/main/codegen/templates/HolderReaderImpl.java index 8394aaad4175..1151ea5d39dd 100644 --- a/java/vector/src/main/codegen/templates/HolderReaderImpl.java +++ b/java/vector/src/main/codegen/templates/HolderReaderImpl.java @@ -109,9 +109,9 @@ public void read(Nullable${name}Holder h) { byte[] value = new byte [length]; holder.buffer.getBytes(holder.start, value, 0, length); - <#if minor.class == "VarBinary" || minor.class == "LargeVarBinary"> + <#if minor.class == "VarBinary" || minor.class == "LargeVarBinary" || minor.class == "ViewVarBinary"> return value; - <#elseif minor.class == 
"VarChar" || minor.class == "LargeVarChar"> + <#elseif minor.class == "VarChar" || minor.class == "LargeVarChar" || minor.class == "ViewVarChar"> Text text = new Text(); text.set(value); return text; diff --git a/java/vector/src/main/codegen/templates/UnionReader.java b/java/vector/src/main/codegen/templates/UnionReader.java index 822d4822987f..956bc91e9185 100644 --- a/java/vector/src/main/codegen/templates/UnionReader.java +++ b/java/vector/src/main/codegen/templates/UnionReader.java @@ -39,7 +39,7 @@ @SuppressWarnings("unused") public class UnionReader extends AbstractFieldReader { - private static final int NUM_SUPPORTED_TYPES = 46; + private static final int NUM_SUPPORTED_TYPES = 48; private BaseReader[] readers = new BaseReader[NUM_SUPPORTED_TYPES]; public UnionVector data; diff --git a/java/vector/src/main/codegen/templates/ValueHolders.java b/java/vector/src/main/codegen/templates/ValueHolders.java index 973efd870a66..2a2bbe81b2e7 100644 --- a/java/vector/src/main/codegen/templates/ValueHolders.java +++ b/java/vector/src/main/codegen/templates/ValueHolders.java @@ -27,7 +27,6 @@ package org.apache.arrow.vector.holders; <#include "/@includes/vv_imports.ftl" /> - /** * Source code generated using FreeMarker template ${.template_name} */ @@ -40,11 +39,12 @@ public final class ${className} implements ValueHolder{ /** The last index (exclusive) into the Vector. **/ public int end; - + /** The Vector holding the actual values. 
**/ public ${minor.class}Vector vector; - + <#else> + public static final int WIDTH = ${type.width}; <#if mode.name == "Optional">public int isSet; @@ -70,10 +70,6 @@ public String toString(){ throw new UnsupportedOperationException(); } - - - - } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java b/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java index b41dbb245e8a..5f9decbae4ea 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java @@ -20,7 +20,7 @@ import org.apache.arrow.util.Preconditions; /** - * Tuple class containing a vector and whether is was created. + * Tuple class containing a vector and whether it was created. * * @param The type of vector the result is for. */ diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java index 2ef6e4bd8b37..6365493051b9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -42,8 +42,7 @@ /** * BaseLargeVariableWidthVector is a base class providing functionality for large strings/large bytes types. 
*/ -public abstract class BaseLargeVariableWidthVector extends BaseValueVector - implements VariableWidthVector, FieldVector, VectorDefinitionSetter { +public abstract class BaseLargeVariableWidthVector extends BaseValueVector implements VariableWidthFieldVector { private static final int DEFAULT_RECORD_BYTE_COUNT = 12; private static final int INITIAL_BYTE_COUNT = INITIAL_VALUE_ALLOCATION * DEFAULT_RECORD_BYTE_COUNT; private int lastValueCapacity; @@ -942,6 +941,7 @@ public void setValueCount(int valueCount) { * * @param index target index */ + @Override public void fillEmpties(int index) { handleSafe(index, emptyByteArray.length); fillHoles(index); @@ -955,6 +955,7 @@ public void fillEmpties(int index) { * * @param value desired index of last non-null element. */ + @Override public void setLastSet(int value) { lastSet = value; } @@ -964,6 +965,7 @@ public void setLastSet(int value) { * * @return index of the last non-null element */ + @Override public int getLastSet() { return lastSet; } @@ -1003,6 +1005,7 @@ public void setValueLengthSafe(int index, int length) { * @param index position of element to get * @return greater than 0 length for non-null element, 0 otherwise */ + @Override public int getValueLength(int index) { assert index >= 0; if (isSet(index) == 0) { @@ -1021,6 +1024,7 @@ public int getValueLength(int index) { * @param index position of the element to set * @param value array of bytes to write */ + @Override public void set(int index, byte[] value) { assert index >= 0; fillHoles(index); @@ -1037,6 +1041,7 @@ public void set(int index, byte[] value) { * @param index position of the element to set * @param value array of bytes to write */ + @Override public void setSafe(int index, byte[] value) { assert index >= 0; handleSafe(index, value.length); @@ -1055,6 +1060,7 @@ public void setSafe(int index, byte[] value) { * @param start start index in array of bytes * @param length length of data in array of bytes */ + @Override public void set(int index, 
byte[] value, int start, int length) { assert index >= 0; fillHoles(index); @@ -1091,6 +1097,7 @@ public void setSafe(int index, byte[] value, int start, int length) { * @param start start index in ByteBuffer * @param length length of data in ByteBuffer */ + @Override public void set(int index, ByteBuffer value, int start, int length) { assert index >= 0; fillHoles(index); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index d533629cdd44..0412b9600b77 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -42,8 +42,7 @@ /** * BaseVariableWidthVector is a base class providing functionality for strings/bytes types. */ -public abstract class BaseVariableWidthVector extends BaseValueVector - implements VariableWidthVector, FieldVector, VectorDefinitionSetter { +public abstract class BaseVariableWidthVector extends BaseValueVector implements VariableWidthFieldVector { private static final int DEFAULT_RECORD_BYTE_COUNT = 8; private static final int INITIAL_BYTE_COUNT = INITIAL_VALUE_ALLOCATION * DEFAULT_RECORD_BYTE_COUNT; private static final int MAX_BUFFER_SIZE = (int) Math.min(MAX_ALLOCATION_SIZE, Integer.MAX_VALUE); @@ -993,6 +992,7 @@ public void setValueCount(int valueCount) { * * @param index target index */ + @Override public void fillEmpties(int index) { handleSafe(index, emptyByteArray.length); fillHoles(index); @@ -1006,6 +1006,7 @@ public void fillEmpties(int index) { * * @param value desired index of last non-null element. 
*/ + @Override public void setLastSet(int value) { lastSet = value; } @@ -1015,6 +1016,7 @@ public void setLastSet(int value) { * * @return index of the last non-null element */ + @Override public int getLastSet() { return lastSet; } @@ -1050,6 +1052,7 @@ public void setIndexDefined(int index) { * @param index position of the element to set * @param length length of the element */ + @Override public void setValueLengthSafe(int index, int length) { assert index >= 0; handleSafe(index, length); @@ -1065,6 +1068,7 @@ public void setValueLengthSafe(int index, int length) { * @param index position of element to get * @return greater than 0 length for non-null element, 0 otherwise */ + @Override public int getValueLength(int index) { assert index >= 0; if (isSet(index) == 0) { @@ -1083,6 +1087,7 @@ public int getValueLength(int index) { * @param index position of the element to set * @param value array of bytes to write */ + @Override public void set(int index, byte[] value) { assert index >= 0; fillHoles(index); @@ -1099,6 +1104,7 @@ public void set(int index, byte[] value) { * @param index position of the element to set * @param value array of bytes to write */ + @Override public void setSafe(int index, byte[] value) { assert index >= 0; handleSafe(index, value.length); @@ -1153,6 +1159,7 @@ public void setSafe(int index, byte[] value, int start, int length) { * @param start start index in ByteBuffer * @param length length of data in ByteBuffer */ + @Override public void set(int index, ByteBuffer value, int start, int length) { assert index >= 0; fillHoles(index); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java new file mode 100644 index 000000000000..2f80775a48f5 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java @@ -0,0 +1,1451 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; +import static org.apache.arrow.vector.util.DataSizeRoundingUtil.roundUpToMultipleOf16; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.ReusableBuffer; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; + +/** + * BaseVariableWidthViewVector is a base class providing functionality for strings/bytes types in view format. 
+ * + */ +public abstract class BaseVariableWidthViewVector extends BaseValueVector implements VariableWidthFieldVector { + // A single element of a view comprises 16 bytes + protected static final int ELEMENT_SIZE = 16; + public static final int INITIAL_VIEW_VALUE_ALLOCATION = 4096; + private static final int INITIAL_BYTE_COUNT = INITIAL_VIEW_VALUE_ALLOCATION * ELEMENT_SIZE; + private static final int MAX_BUFFER_SIZE = (int) Math.min(MAX_ALLOCATION_SIZE, Integer.MAX_VALUE); + private int lastValueCapacity; + private long lastValueAllocationSizeInBytes; + + /* + * Variable Width View Vector comprises the following format + * + * Short strings, length <= 12 + * | Bytes 0-3 | Bytes 4-15 | + * |------------|---------------------------------------| + * | length | data (padded with 0) | + * |------------|---------------------------------------| + * + * Long strings, length > 12 + * | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | + * |------------|------------|------------|-------------| + * | length | prefix | buf.index | offset | + * |------------|------------|------------|-------------| + * + * */ + // Maximum length in bytes of a value that is stored inline in its view + protected static final int INLINE_SIZE = 12; + // The first 4 bytes of view are allocated for length + protected static final int LENGTH_WIDTH = 4; + // The second 4 bytes of view are allocated for the prefix + protected static final int PREFIX_WIDTH = 4; + // The third 4 bytes of view are allocated for buffer index + protected static final int BUF_INDEX_WIDTH = 4; + protected static final byte[] EMPTY_BYTE_ARRAY = new byte[]{}; + protected ArrowBuf validityBuffer; + // The view buffer is used to store the variable width view elements + protected ArrowBuf viewBuffer; + // The external buffers which store the long strings + protected List dataBuffers; + protected int initialDataBufferSize; + protected int valueCount; + protected int lastSet; + protected final Field field; + + + /** + * Constructs a new instance. 
+ * + * @param field The field materialized by this vector + * @param allocator The allocator to use for creating/resizing buffers + */ + public BaseVariableWidthViewVector(Field field, final BufferAllocator allocator) { + super(allocator); + this.field = field; + lastValueAllocationSizeInBytes = INITIAL_BYTE_COUNT; + lastValueCapacity = INITIAL_VIEW_VALUE_ALLOCATION; + valueCount = 0; + lastSet = -1; + validityBuffer = allocator.getEmpty(); + viewBuffer = allocator.getEmpty(); + dataBuffers = new ArrayList<>(); + } + + @Override + public String getName() { + return field.getName(); + } + + /* TODO: + * see if getNullCount() can be made faster -- O(1) + */ + + /* TODO: + * Once the entire hierarchy has been refactored, move common functions + * like getNullCount(), splitAndTransferValidityBuffer to top level + * base class BaseValueVector. + * + * Along with this, some class members (validityBuffer) can also be + * abstracted out to top level base class. + * + * Right now BaseValueVector is the top level base class for other + * vector types in ValueVector hierarchy (non-nullable) and those + * vectors have not yet been refactored/removed so moving things to + * the top class as of now is not a good idea. + */ + + /* TODO: + * Implement TransferPair functionality + * https://github.com/apache/arrow/issues/40932 + * + */ + + /** + * Get buffer that manages the validity (NULL or NON-NULL nature) of + * elements in the vector. Consider it as a buffer for internal bit vector + * data structure. + * + * @return buffer + */ + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + /** + * Get the buffer that stores the data for elements in the vector. + * + * @return buffer + */ + @Override + public ArrowBuf getDataBuffer() { + return viewBuffer; + } + + /** + * BaseVariableWidthViewVector doesn't support offset buffer. 
+ * + * @return throws UnsupportedOperationException + */ + @Override + public ArrowBuf getOffsetBuffer() { + throw new UnsupportedOperationException("Offset buffer is not supported in BaseVariableWidthViewVector"); + } + + /** + * BaseVariableWidthViewVector doesn't support offset buffer. + * + * @return throws UnsupportedOperationException + */ + @Override + public long getOffsetBufferAddress() { + throw new UnsupportedOperationException("Offset buffer is not supported in BaseVariableWidthViewVector"); + } + + /** + * Get the memory address of buffer that manages the validity + * (NULL or NON-NULL nature) of elements in the vector. + * + * @return starting address of the buffer + */ + @Override + public long getValidityBufferAddress() { + return validityBuffer.memoryAddress(); + } + + /** + * Get the memory address of buffer that stores the data for elements + * in the vector. + * + * @return starting address of the buffer + */ + @Override + public long getDataBufferAddress() { + return viewBuffer.memoryAddress(); + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector. + * + * @param valueCount desired number of elements in the vector + */ + @Override + public void setInitialCapacity(int valueCount) { + final long size = (long) valueCount * ELEMENT_SIZE; + checkDataBufferSize(size); + lastValueAllocationSizeInBytes = (int) size; + lastValueCapacity = valueCount; + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector. 
+ * + * @param valueCount desired number of elements in the vector + * @param density average number of bytes per variable width view element + */ + @Override + public void setInitialCapacity(int valueCount, double density) { + final long size = (long) valueCount * ELEMENT_SIZE; + initialDataBufferSize = (int) (valueCount * density); + checkDataBufferSize(size); + lastValueAllocationSizeInBytes = (int) size; + lastValueCapacity = valueCount; + } + + /** + * Get the density of this vector. + * @return density, or 0 if the vector holds no values + */ + public double getDensity() { + if (valueCount == 0) { + return 0.0D; + } + final double totalListSize = getTotalValueLengthUpToIndex(valueCount); + return totalListSize / valueCount; + } + + /** + * Get the current capacity which does not exceed either validity buffer or value buffer. + * Note: the value buffer capacity is the view buffer capacity divided by {@code ELEMENT_SIZE}. + * + * @return number of elements that vector can hold. + */ + @Override + public int getValueCapacity() { + final int validityCapacity = getValidityBufferValueCapacity(); + final int valueBufferCapacity = Math.max(capAtMaxInt(viewBuffer.capacity() / ELEMENT_SIZE), 0); + return Math.min(valueBufferCapacity, validityCapacity); + } + + private int getValidityBufferValueCapacity() { + return capAtMaxInt(validityBuffer.capacity() * 8); + } + + /** + * Zero out the validity and view buffers and release all data buffers. + */ + public void zeroVector() { + initValidityBuffer(); + viewBuffer.setZero(0, viewBuffer.capacity()); + clearDataBuffers(); + } + + /* zero out the validity buffer */ + private void initValidityBuffer() { + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + /** + * Reset the vector to initial state. + * Note that the validity and view buffers stay allocated, but the data buffers are released. + */ + @Override + public void reset() { + zeroVector(); + lastSet = -1; + valueCount = 0; + } + + /** + * Close the vector and release the associated buffers. 
+ */ + @Override + public void close() { + clear(); + } + + /** + * Same as {@link #close()}. + */ + @Override + public void clear() { + validityBuffer = releaseBuffer(validityBuffer); + viewBuffer = releaseBuffer(viewBuffer); + clearDataBuffers(); + lastSet = -1; + valueCount = 0; + } + + /** + * Release the data buffers and clear the list. + */ + public void clearDataBuffers() { + for (ArrowBuf buffer : dataBuffers) { + releaseBuffer(buffer); + } + dataBuffers.clear(); + } + + /** + * Get the inner vectors. + * + * @deprecated This API will be removed as the current implementations no longer support inner vectors. + * + * @return the inner vectors for this field as defined by the TypeLayout + */ + @Deprecated + @Override + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. Use getFieldBuffers"); + } + + /** + * Initialize the children in schema for this Field. This operation is a + * NO-OP for scalar types since they don't have any children. + * @param children the schema + * @throws IllegalArgumentException if children is a non-empty list for scalar types. + */ + @Override + public void initializeChildrenFromFields(List children) { + if (!children.isEmpty()) { + throw new IllegalArgumentException("primitive type vector cannot have children"); + } + } + + /** + * Get the inner child vectors. + * @return list of child vectors for complex types, empty list for scalar vector types + */ + @Override + public List getChildrenFromFields() { + return Collections.emptyList(); + } + + + /** + * Load the buffers of this vector with provided source buffers. + * The caller manages the source buffers and populates them before invoking + * this method. 
+ * @param fieldNode the fieldNode indicating the value count + * @param ownBuffers the buffers for this Field (own buffers only, children not included) + */ + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + // TODO: https://github.com/apache/arrow/issues/40931 + throw new UnsupportedOperationException("loadFieldBuffers is not supported for BaseVariableWidthViewVector"); + } + + /** + * Get the buffers belonging to this vector. + * @return the inner buffers. + */ + @Override + public List getFieldBuffers() { + List result = new ArrayList<>(2 + dataBuffers.size()); + setReaderAndWriterIndex(); + result.add(validityBuffer); + result.add(viewBuffer); + // append data buffers + result.addAll(dataBuffers); + + return result; + } + + /** + * Set the reader and writer indexes for the inner buffers. + */ + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + viewBuffer.readerIndex(0); + if (valueCount == 0) { + validityBuffer.writerIndex(0); + viewBuffer.writerIndex(0); + } else { + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + viewBuffer.writerIndex(valueCount * ELEMENT_SIZE); + } + } + + /** + * Same as {@link #allocateNewSafe()}. + */ + @Override + public void allocateNew() { + allocateNew(lastValueAllocationSizeInBytes, lastValueCapacity); + } + + /** + * Allocate memory for the vector. We internally use a default value count + * of 4096 to allocate memory for at least these many elements in the + * vector. See {@link #allocateNew(long, int)} for allocating memory for specific + * number of elements in the vector. + * + * @return false if memory allocation fails, true otherwise. 
+ */ + @Override + public boolean allocateNewSafe() { + try { + allocateNew(lastValueAllocationSizeInBytes, lastValueCapacity); + return true; + } catch (Exception e) { + return false; + } + } + + /** + * Allocate memory for the vector to support storing at least the provided number of + * elements in the vector. This method must be called prior to using the ValueVector. + * + * @param totalBytes desired total memory capacity + * @param valueCount the desired number of elements in the vector + * @throws OutOfMemoryException if memory allocation fails + */ + @Override + public void allocateNew(long totalBytes, int valueCount) { + assert totalBytes >= 0; + + checkDataBufferSize(totalBytes); + + /* we are doing a new allocation -- release the current buffers */ + clear(); + + try { + allocateBytes(totalBytes, valueCount); + } catch (Exception e) { + clear(); + throw e; + } + } + + @Override + public void allocateNew(int valueCount) { + allocateNew(lastValueAllocationSizeInBytes, valueCount); + } + + /* Check if the data buffer size is within bounds. */ + private void checkDataBufferSize(long size) { + if (size > MAX_BUFFER_SIZE || size < 0) { + throw new OversizedAllocationException("Memory required for vector " + + "is (" + size + "), which is overflow or more than max allowed (" + MAX_BUFFER_SIZE + "). " + + "You could consider using LargeVarCharVector/LargeVarBinaryVector for large strings/large bytes types"); + } + } + + /* allocate the inner buffers */ + private void allocateBytes(final long valueBufferSize, final int valueCount) { + /* allocate data buffer */ + viewBuffer = allocator.buffer(valueBufferSize); + viewBuffer.readerIndex(0); + + validityBuffer = allocator.buffer((valueCount + 7) / 8); + initValidityBuffer(); + + lastValueCapacity = getValueCapacity(); + lastValueAllocationSizeInBytes = capAtMaxInt(viewBuffer.capacity()); + } + + /** + * Resize the vector to increase the capacity. The internal behavior is to + * double the current value capacity. 
+ */ + @Override + public void reAlloc() { + reallocViewBuffer(); + reallocViewDataBuffer(); + reallocValidityBuffer(); + } + + /** + * Reallocate the view buffer. View Buffer stores the views for + * VIEWVARCHAR or VIEWVARBINARY elements in the vector. The behavior is to double + * the size of buffer. + * @throws OversizedAllocationException if the desired new size is more than + * max allowed + * @throws OutOfMemoryException if the internal memory allocation fails + */ + public void reallocViewBuffer() { + long currentViewBufferCapacity = viewBuffer.capacity(); + + long newAllocationSize = currentViewBufferCapacity * 2; + if (newAllocationSize == 0) { + if (lastValueAllocationSizeInBytes > 0) { + newAllocationSize = lastValueAllocationSizeInBytes; + } else { + newAllocationSize = INITIAL_BYTE_COUNT * 2L; + } + } + + reallocViewBuffer(newAllocationSize); + } + + /** + * Reallocate the data buffer associated with view buffer. + */ + public void reallocViewDataBuffer() { + long currentDataBufferCapacity = 0; + if (!dataBuffers.isEmpty()) { + currentDataBufferCapacity = dataBuffers.get(dataBuffers.size() - 1).capacity(); + } + + long newAllocationSize = currentDataBufferCapacity * 2; + if (newAllocationSize == 0) { + if (lastValueAllocationSizeInBytes > 0) { + newAllocationSize = lastValueAllocationSizeInBytes; + } else { + newAllocationSize = INITIAL_BYTE_COUNT * 2L; + } + } + + reallocViewDataBuffer(newAllocationSize); + } + + /** + * Reallocate the view buffer to given size. View Buffer stores the views for + * VIEWVARCHAR or VIEWVARBINARY elements in the vector. The actual allocated size may be larger + * than the request one because it will round up the provided value to the nearest + * power of two. 
+ * + * @param desiredAllocSize the desired new allocation size + * @throws OversizedAllocationException if the desired new size is more than + * max allowed + * @throws OutOfMemoryException if the internal memory allocation fails + */ + public void reallocViewBuffer(long desiredAllocSize) { + if (desiredAllocSize == 0) { + return; + } + long newAllocationSize = CommonUtil.nextPowerOfTwo(desiredAllocSize); + assert newAllocationSize >= 1; + + checkDataBufferSize(newAllocationSize); + // for each set operation, we have to allocate 16 bytes + // here we are adjusting the desired allocation-based allocation size + // to align with the 16bytes requirement. + newAllocationSize = roundUpToMultipleOf16(newAllocationSize); + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + newBuf.setBytes(0, viewBuffer, 0, viewBuffer.capacity()); + + viewBuffer.getReferenceManager().release(); + viewBuffer = newBuf; + lastValueAllocationSizeInBytes = viewBuffer.capacity(); + } + + /** + * Reallocate the data buffer for views. + * + * @param desiredAllocSize allocation size in bytes + */ + public void reallocViewDataBuffer(long desiredAllocSize) { + if (desiredAllocSize == 0) { + return; + } + + if (dataBuffers.isEmpty()) { + return; + } + + ArrowBuf currentBuf = dataBuffers.get(dataBuffers.size() - 1); + if (currentBuf.capacity() - currentBuf.writerIndex() >= desiredAllocSize) { + return; + } + + final long newAllocationSize = CommonUtil.nextPowerOfTwo(desiredAllocSize); + assert newAllocationSize >= 1; + + checkDataBufferSize(newAllocationSize); + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + dataBuffers.add(newBuf); + } + + /** + * Reallocate Validity buffer. 
+ */ + public void reallocValidityBuffer() { + int targetValidityCount = capAtMaxInt((validityBuffer.capacity() * 8) * 2); + if (targetValidityCount == 0) { + if (lastValueCapacity > 0) { + targetValidityCount = lastValueCapacity; + } else { + targetValidityCount = 2 * INITIAL_VALUE_ALLOCATION; + } + } + + long validityBufferSize = computeValidityBufferSize(targetValidityCount); + + final ArrowBuf newValidityBuffer = allocator.buffer(validityBufferSize); + newValidityBuffer.setBytes(0, validityBuffer, 0, validityBuffer.capacity()); + newValidityBuffer.setZero(validityBuffer.capacity(), newValidityBuffer.capacity() - validityBuffer.capacity()); + validityBuffer.getReferenceManager().release(); + validityBuffer = newValidityBuffer; + + lastValueCapacity = getValueCapacity(); + } + + private long computeValidityBufferSize(int valueCount) { + return (valueCount + 7) / 8; + } + + /** + * Get the size (number of bytes) of underlying view buffer. + * @return number of bytes in the view buffer + */ + @Override + public int getByteCapacity() { + return capAtMaxInt(viewBuffer.capacity()); + } + + @Override + public int sizeOfValueBuffer() { + throw new UnsupportedOperationException("sizeOfValueBuffer is not supported for BaseVariableWidthViewVector"); + } + + /** + * Get the size (number of bytes) of underlying elements in the view buffer. + * @return number of bytes used by data in the view buffer + */ + public int sizeOfViewBufferElements() { + if (valueCount == 0) { + return 0; + } + int totalSize = 0; + for (int i = 0; i < valueCount; i++) { + totalSize += getValueLength(i); + } + return totalSize; + } + + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector. + * @return size of underlying buffers. + */ + @Override + public int getBufferSize() { + return getBufferSizeFor(this.valueCount); + } + + /** + * Get the potential buffer size for a particular number of records. 
+ * @param valueCount desired number of elements in the vector + * @return estimated size of underlying buffers if the vector holds + * a given number of elements + */ + @Override + public int getBufferSizeFor(final int valueCount) { + if (valueCount == 0) { + return 0; + } + + final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); + final int viewBufferSize = valueCount * ELEMENT_SIZE; + final int dataBufferSize = getDataBufferSize(); + return validityBufferSize + viewBufferSize + dataBufferSize; + } + + private int getDataBufferSize() { + int dataBufferSize = 0; + for (ArrowBuf buf : dataBuffers) { + dataBufferSize += (int) buf.writerIndex(); + } + return dataBufferSize; + } + + /** + * Get information about how this field is materialized. + * @return the field corresponding to this vector + */ + @Override + public Field getField() { + return field; + } + + /** + * Return the underlying buffers associated with this vector. Note that this doesn't + * impact the reference counts for this buffer, so it only should be used for in-context + * access. Also note that this buffer changes regularly, thus + * external classes shouldn't hold a reference to it (unless they change it). + *

+ * Note: This method only returns validityBuffer and valueBuffer. + * But it doesn't return the data buffers. + *

+ * TODO: Implement a strategy to retrieve the data buffers. + * data buffer retrieval. + * + * @param clear Whether to clear vector before returning, the buffers will still be refcounted + * but the returned array will be the only reference to them + * @return The underlying {@link ArrowBuf buffers} that is used by this + * vector instance. + */ + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final ArrowBuf[] buffers; + setReaderAndWriterIndex(); + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + buffers = new ArrowBuf[2]; + buffers[0] = validityBuffer; + buffers[1] = viewBuffer; + } + if (clear) { + for (final ArrowBuf buffer : buffers) { + buffer.getReferenceManager().retain(); + } + clear(); + } + return buffers; + } + + /** + * Validate the scalar values held by this vector. + */ + public void validateScalars() { + // No validation by default. + } + + /** + * Construct a transfer pair of this vector and another vector of the same type. + * @param field The field materialized by this vector. + * @param allocator allocator for the target vector + * @param callBack not used + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(Field field, BufferAllocator allocator, CallBack callBack) { + return getTransferPair(field, allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @param callBack not used + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return getTransferPair(ref, allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of the same type. 
+ * @param allocator allocator for the target vector + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return getTransferPair(getName(), allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return TransferPair + */ + @Override + public abstract TransferPair getTransferPair(String ref, BufferAllocator allocator); + + /** + * Construct a transfer pair of this vector and another vector of the same type. + * @param field The field materialized by this vector. + * @param allocator allocator for the target vector + * @return TransferPair + */ + @Override + public abstract TransferPair getTransferPair(Field field, BufferAllocator allocator); + + /** + * Transfer this vector's data to another vector. + * The memory associated with this vector is transferred to the allocator of target vector + * for accounting and management purposes. + * @param target destination vector for transfer + */ + public void transferTo(BaseVariableWidthViewVector target) { + throw new UnsupportedOperationException("trasferTo function not supported!"); + } + + /** + * Slice this vector at desired index and length and transfer the + * corresponding data to the target vector. + * @param startIndex start position of the split in source vector. + * @param length length of the split. + * @param target destination vector + */ + public void splitAndTransferTo(int startIndex, int length, + BaseVariableWidthViewVector target) { + throw new UnsupportedOperationException("splitAndTransferTo function not supported!"); + } + + /*----------------------------------------------------------------* + | | + | common getters and setters | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the number of elements that are null in the vector. 
+ * + * @return the number of null elements. + */ + @Override + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } + + /** + * Check if the given index is within the current value capacity + * of the vector. + * + * @param index position to check + * @return true if the index is within the current value capacity + */ + public boolean isSafe(int index) { + return index < getValueCapacity(); + } + + /** + * Check if an element at given index is null. + * + * @param index position of an element + * @return true if an element at given index is null + */ + @Override + public boolean isNull(int index) { + return (isSet(index) == 0); + } + + /** + * Same as {@link #isNull(int)}. + * + * @param index position of an element + * @return 1 if element at given index is not null, 0 otherwise + */ + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return (b >> bitIndex) & 0x01; + } + + /** + * Get the value count of vector. This will always be zero unless + * setValueCount(int) has been called prior to calling this. + * + * @return valueCount for the vector + */ + @Override + public int getValueCount() { + return valueCount; + } + + /** + * Sets the value count for the vector. + * + * @param valueCount value count + */ + @Override + public void setValueCount(int valueCount) { + assert valueCount >= 0; + this.valueCount = valueCount; + while (valueCount > getValueCapacity()) { + reallocViewBuffer(); + reallocValidityBuffer(); + } + lastSet = valueCount - 1; + setReaderAndWriterIndex(); + } + + /** + * Create holes in the vector upto the given index (exclusive). + * Holes will be created from the current last-set position in + * the vector. 
+ * + * @param index target index + */ + @Override + public void fillEmpties(int index) { + handleSafe(index, EMPTY_BYTE_ARRAY.length); + lastSet = index - 1; + } + + /** + * Set the index of the last non-null element in the vector. + * It is important to call this method with appropriate value + * before calling {@link #setValueCount(int)}. + * + * @param value desired index of last non-null element. + */ + @Override + public void setLastSet(int value) { + lastSet = value; + } + + /** + * Get the index of the last non-null element in the vector. + * + * @return index of the last non-null element + */ + @Override + public int getLastSet() { + return lastSet; + } + + /** + * Mark the particular position in the vector as non-null. + * + * @param index position of the element. + */ + @Override + public void setIndexDefined(int index) { + // We need to check and reallocate the validity buffer + while (index >= getValueCapacity()) { + reallocValidityBuffer(); + } + BitVectorHelper.setBit(validityBuffer, index); + } + + /** + * Sets the value length for an element. + * + * @param index position of the element to set + * @param length length of the element + */ + @Override + public void setValueLengthSafe(int index, int length) { + assert index >= 0; + handleSafe(index, length); + lastSet = index; + } + + /** + * Get the variable length element at specified index as Text. + * + * @param index position of an element to get + * @return greater than length 0 for a non-null element, 0 otherwise + */ + @Override + public int getValueLength(int index) { + assert index >= 0; + if (index < 0 || index >= viewBuffer.capacity() / ELEMENT_SIZE) { + throw new IndexOutOfBoundsException("Index out of bounds: " + index); + } + if (isSet(index) == 0) { + return 0; + } + return viewBuffer.getInt(((long) index * ELEMENT_SIZE)); + } + + /** + * Set the variable length element at the specified index to the supplied + * byte array. 
This is same as using {@link #set(int, byte[], int, int)} + * with start as Zero and length as #value.length + * + * @param index position of the element to set + * @param value array of bytes to write + */ + public void set(int index, byte[] value) { + assert index >= 0; + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, 0, value.length); + lastSet = index; + } + + /** + * Same as {@link #set(int, byte[])} except that it handles the + * case where index and length of a new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param value array of bytes to write + */ + @Override + public void setSafe(int index, byte[] value) { + assert index >= 0; + // check if the current index can be populated + handleSafe(index, value.length); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, 0, value.length); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the supplied + * byte array. + * + * @param index position of the element to set + * @param value array of bytes to write + * @param start start index in an array of bytes + * @param length length of data in an array of bytes + */ + public void set(int index, byte[] value, int start, int length) { + assert index >= 0; + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, start, length); + lastSet = index; + } + + /** + * Same as {@link #set(int, byte[], int, int)} except that it handles the + * case where index and length of a new element are beyond the existing + * capacity of the vector. 
+ * + * @param index position of the element to set + * @param value array of bytes to write + * @param start start index in an array of bytes + * @param length length of data in an array of bytes + */ + public void setSafe(int index, byte[] value, int start, int length) { + assert index >= 0; + handleSafe(index, length); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, start, length); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the + * content in supplied ByteBuffer. + * + * @param index position of the element to set + * @param value ByteBuffer with data + * @param start start index in ByteBuffer + * @param length length of data in ByteBuffer + */ + public void set(int index, ByteBuffer value, int start, int length) { + assert index >= 0; + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value.array(), start, length); + lastSet = index; + } + + /** + * Same as {@link #set(int, ByteBuffer, int, int)} except that it handles the + * case where index and length of a new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param value ByteBuffer with data + * @param start start index in ByteBuffer + * @param length length of data in ByteBuffer + */ + public void setSafe(int index, ByteBuffer value, int start, int length) { + assert index >= 0; + handleSafe(index, length); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value.array(), start, length); + lastSet = index; + } + + /** + * Set the element at the given index to null. + * + * @param index position of an element + */ + @Override + public void setNull(int index) { + // We need to check and reallocate the validity buffer + while (index >= getValueCapacity()) { + reallocValidityBuffer(); + } + BitVectorHelper.unsetBit(validityBuffer, index); + } + + /** + * Store the given value at a particular position in the vector. 
isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet Zero for NULL value, 1 otherwise + * @param start start position of data in buffer + * @param end end position of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void set(int index, int isSet, int start, int end, ArrowBuf buffer) { + assert index >= 0; + final int dataLength = end - start; + BitVectorHelper.setValidityBit(validityBuffer, index, isSet); + setBytes(index, buffer, start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, int, int, int, ArrowBuf)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet Zero for NULL value, 1 otherwise + * @param start start position of data in buffer + * @param end end position of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void setSafe(int index, int isSet, int start, int end, ArrowBuf buffer) { + assert index >= 0; + final int dataLength = end - start; + handleSafe(index, dataLength); + BitVectorHelper.setValidityBit(validityBuffer, index, isSet); + setBytes(index, buffer, start, dataLength); + lastSet = index; + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. 
+ * @param index position of the new value + * @param start start position of data in buffer + * @param length length of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void set(int index, int start, int length, ArrowBuf buffer) { + assert index >= 0; + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, buffer, start, length); + lastSet = index; + } + + /** + * Same as {@link #set(int, int, int, int, ArrowBuf)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param start start position of data in buffer + * @param length length of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void setSafe(int index, int start, int length, ArrowBuf buffer) { + assert index >= 0; + handleSafe(index, length); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, buffer, start, length); + lastSet = index; + } + + + /*----------------------------------------------------------------* + | | + | helper methods for setters | + | | + *----------------------------------------------------------------*/ + + + protected ArrowBuf allocateOrGetLastDataBuffer(int length) { + long dataBufferSize; + if (initialDataBufferSize > 0) { + dataBufferSize = Math.max(initialDataBufferSize, length); + } else { + dataBufferSize = Math.max(lastValueAllocationSizeInBytes, length); + } + + if (dataBuffers.isEmpty() || dataBuffers.get(dataBuffers.size() - 1).capacity() - + dataBuffers.get(dataBuffers.size() - 1).writerIndex() < length) { + ArrowBuf newBuf = allocator.buffer(dataBufferSize); + dataBuffers.add(newBuf); + } + + return dataBuffers.get(dataBuffers.size() - 1); + } + + /** + * This method is used to create a view buffer for a variable width vector. + * It handles both inline and data buffers. + *

+ * If the length of the value is less than or equal to {@link #INLINE_SIZE}, the value is stored in the valueBuffer + * directly as an inline buffer. + * The valueBuffer stores the length of the value followed by the value itself. + * If the length of the value is greater than {@link #INLINE_SIZE}, a new buffer is allocated and added to dataBuffers + * to hold the value. + * The viewBuffer in this case stores the length of the value, a prefix of the value, the index of the + * new buffer in dataBuffers, and the offset of the value in the new buffer. + * + * @param index The index at which the new value will be inserted. + * @param value The byte array that contains the data to be inserted. + * @param start The start index in the byte array from where the data for the new value begins. + * @param length The length of the data in the byte array that belongs to the new value. + */ + protected final void setBytes(int index, byte[] value, int start, int length) { + int writePosition = index * ELEMENT_SIZE; + + // to clear the memory segment of view being written to + // this is helpful in case of overwriting the value + viewBuffer.setZero(writePosition, ELEMENT_SIZE); + + if (value.length <= INLINE_SIZE) { + // allocate inline buffer + // set length + viewBuffer.setInt(writePosition, length); + writePosition += LENGTH_WIDTH; + // set data + viewBuffer.setBytes(writePosition, value, start, length); + } else { + // allocate data buffer + ArrowBuf currentBuf = allocateOrGetLastDataBuffer(length); + + // set length + viewBuffer.setInt(writePosition, length); + writePosition += LENGTH_WIDTH; + // set prefix + viewBuffer.setBytes(writePosition, value, start, PREFIX_WIDTH); + writePosition += PREFIX_WIDTH; + // set buf id + viewBuffer.setInt(writePosition, dataBuffers.size() - 1); + writePosition += BUF_INDEX_WIDTH; + // set offset + viewBuffer.setInt(writePosition, (int) currentBuf.writerIndex()); + + currentBuf.setBytes(currentBuf.writerIndex(), value, start, length); + 
currentBuf.writerIndex(currentBuf.writerIndex() + length); + } + } + + /** + * This method is used to create a view buffer for a variable width vector. + * Similar to {@link #setBytes(int index, byte[] value, int start, int length)} + * + * @param index The index at which the new value will be inserted. + * @param valueBuf The byte array that contains the data to be inserted. + * @param start The start index in the byte array from where the data for the new value begins. + * @param length The length of the data in the byte array that belongs to the new value. + */ + protected final void setBytes(int index, ArrowBuf valueBuf, int start, int length) { + int writePosition = index * ELEMENT_SIZE; + + // to clear the memory segment of view being written to + // this is helpful in case of overwriting the value + viewBuffer.setZero(writePosition, ELEMENT_SIZE); + + if (length <= INLINE_SIZE) { + // allocate inline buffer + // set length + viewBuffer.setInt(writePosition, length); + writePosition += LENGTH_WIDTH; + // set data + viewBuffer.setBytes(writePosition, valueBuf, start, length); + } else { + // allocate data buffer + ArrowBuf currentBuf = allocateOrGetLastDataBuffer(length); + + // set length + viewBuffer.setInt(writePosition, length); + writePosition += LENGTH_WIDTH; + // set prefix + viewBuffer.setBytes(writePosition, valueBuf, start, PREFIX_WIDTH); + writePosition += PREFIX_WIDTH; + // set buf id + viewBuffer.setInt(writePosition, dataBuffers.size() - 1); + writePosition += BUF_INDEX_WIDTH; + // set offset + viewBuffer.setInt(writePosition, (int) currentBuf.writerIndex()); + + currentBuf.setBytes(currentBuf.writerIndex(), valueBuf, start, length); + currentBuf.writerIndex(currentBuf.writerIndex() + length); + } + } + + /** + * Get the total length of the elements up to the given index. + * @param index The index of the element in the vector. + * @return The total length up to the element at the given index. 
+   */
+  public final int getTotalValueLengthUpToIndex(int index) {
+    int totalLength = 0;
+    // NOTE(review): this bound visits elements [0, index - 2]; confirm that skipping index - 1 is intended.
+    for (int i = 0; i < index - 1; i++) {
+      totalLength += getValueLength(i);
+    }
+    return totalLength;
+  }
+
+  /**
+   * Grows the view buffer and the validity buffer, if needed, so that a value can be written at {@code index}.
+   *
+   * @param index position of the element about to be written
+   * @param dataLength data length used to compute the target capacity of the view buffer
+   */
+  protected final void handleSafe(int index, int dataLength) {
+    final long lastSetCapacity = lastSet < 0 ? 0 : (long) index * ELEMENT_SIZE;
+    final long targetCapacity = roundUpToMultipleOf16(lastSetCapacity + dataLength);
+    // for views, we need each buffer with 16 byte alignment, so we need to check the last written index
+    // in the viewBuffer and allocate a new buffer which has 16 byte alignment for adding new values.
+    long writePosition = (long) index * ELEMENT_SIZE;
+    if (viewBuffer.capacity() <= writePosition || viewBuffer.capacity() < targetCapacity) {
+      /*
+       * Every time we want to increase the capacity of the viewBuffer, we need to make sure that the new capacity
+       * meets 16 byte alignment.
+       * If the targetCapacity is larger than the writePosition, we may not necessarily
+       * want to allocate the targetCapacity to viewBuffer since when it is >={@link #INLINE_SIZE} either way
+       * we are writing to the dataBuffer.
+       */
+      reallocViewBuffer(Math.max(writePosition, targetCapacity));
+    }
+
+    // the validity buffer must also cover the index being written
+    while (index >= getValueCapacity()) {
+      reallocValidityBuffer();
+    }
+  }
+
+  /**
+   * Copy a cell value from a particular index in source vector to a particular position in this
+   * vector.
+   * TODO: Improve functionality to support copying views.
+   * Enhance CopyFrom
+   *
+   * @param fromIndex position to copy from in source vector
+   * @param thisIndex position to copy to in this vector
+   * @param from source vector
+   * @throws UnsupportedOperationException always, copying is not implemented yet
+   */
+  @Override
+  public void copyFrom(int fromIndex, int thisIndex, ValueVector from) {
+    throw new UnsupportedOperationException("copyFrom is not supported for VariableWidthVector");
+  }
+
+  /**
+   * Same as {@link #copyFrom(int, int, ValueVector)} except that it handles the case when the
+   * capacity of the vector needs to be expanded before copy.
+   * TODO: Improve functionality to support copying views.
+   * Enhance CopyFrom
+   * @param fromIndex position to copy from in source vector
+   * @param thisIndex position to copy to in this vector
+   * @param from source vector
+   * @throws UnsupportedOperationException always, copying is not implemented yet
+   */
+  @Override
+  public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) {
+    throw new UnsupportedOperationException("copyFromSafe is not supported for VariableWidthVector");
+  }
+
+  @Override
+  public ArrowBufPointer getDataPointer(int index) {
+    return getDataPointer(index, new ArrowBufPointer());
+  }
+
+  /**
+   * Points {@code reuse} at the bytes of the element at {@code index}: inside the viewBuffer for values of
+   * up to {@link #INLINE_SIZE} bytes, otherwise inside the data buffer referenced by the view.
+   *
+   * @param index position of the element
+   * @param reuse pointer to populate; it is set to a null buffer when the element is null
+   * @return the populated {@code reuse} pointer
+   */
+  @Override
+  public ArrowBufPointer getDataPointer(int index, ArrowBufPointer reuse) {
+    if (isNull(index)) {
+      reuse.set(null, 0, 0);
+    } else {
+      int length = getValueLength(index);
+      // same threshold as getData: values of exactly INLINE_SIZE bytes are still stored inline
+      if (length <= INLINE_SIZE) {
+        long start = (long) index * ELEMENT_SIZE + LENGTH_WIDTH;
+        reuse.set(viewBuffer, start, length);
+      } else {
+        // long values: the view stores the data buffer index followed by the offset into that buffer
+        final long bufIndexPosition = ((long) index * ELEMENT_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH;
+        ArrowBuf dataBuf = dataBuffers.get(viewBuffer.getInt(bufIndexPosition));
+        reuse.set(dataBuf, viewBuffer.getInt(bufIndexPosition + BUF_INDEX_WIDTH), length);
+      }
+
+    }
+    return reuse;
+  }
+
+  @Override
+  public int hashCode(int index) {
+    return hashCode(index, null);
+  }
+
+  /**
+   * Hashes the bytes of the element at {@code index} with the supplied hasher.
+   *
+   * @param index position of the element
+   * @param hasher hasher passed on to {@link ByteFunctionHelpers}
+   * @return {@link ArrowBufPointer#NULL_HASH_CODE} for a null element, otherwise the hash of the element bytes
+   */
+  @Override
+  public int hashCode(int index, ArrowBufHasher hasher) {
+    if (isNull(index)) {
+      return ArrowBufPointer.NULL_HASH_CODE;
+    }
+    final int length = getValueLength(index);
+    // same threshold as getData: values of exactly INLINE_SIZE bytes are still stored inline
+    if (length <= INLINE_SIZE) {
+      long start = (long) index * ELEMENT_SIZE + LENGTH_WIDTH;
+      return ByteFunctionHelpers.hash(hasher, viewBuffer, start, start + length);
+    } else {
+      final int bufIndex =
+          viewBuffer.getInt(((long) index * ELEMENT_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH);
+      final int dataOffset =
+          viewBuffer.getInt(
+              ((long) index * ELEMENT_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH + BUF_INDEX_WIDTH);
+      ArrowBuf dataBuf = dataBuffers.get(bufIndex);
+      return ByteFunctionHelpers.hash(hasher, dataBuf, dataOffset, dataOffset + length);
+    }
+  }
+
+  /**
+   * Retrieves the data of a variable-width element at a given index in the vector.
+   *
+   * <p>
+   * If the length of the data is greater than {@link #INLINE_SIZE}, the data is stored in one of the data buffers.
+   * The method retrieves the buffer index and data offset from the viewBuffer, and then retrieves the data from the
+   * corresponding buffer in the dataBuffers list.
+   * <p>
+   * If the length of the data is less than or equal to {@link #INLINE_SIZE}, the data is stored directly in the
+   * viewBuffer.
+   * The method retrieves the data directly from the viewBuffer.
+   *
+   * @param index position of the element in the vector
+   * @return byte array containing the data of the element
+   */
+  protected byte[] getData(int index) {
+    final int dataLength = getValueLength(index);
+    byte[] result = new byte[dataLength];
+    if (dataLength > INLINE_SIZE) {
+      // data is in the data buffer
+      // get buffer index
+      final int bufferIndex =
+          viewBuffer.getInt(((long) index * ELEMENT_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH);
+      // get data offset
+      final int dataOffset =
+          viewBuffer.getInt(
+              ((long) index * ELEMENT_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH + BUF_INDEX_WIDTH);
+      dataBuffers.get(bufferIndex).getBytes(dataOffset, result, 0, dataLength);
+    } else {
+      // data is in the view buffer, right after the length field
+      viewBuffer.getBytes(
+          (long) index * ELEMENT_SIZE + LENGTH_WIDTH, result, 0, dataLength);
+    }
+    return result;
+  }
+
+  /**
+   * Same lookup as {@link #getData(int)}, but hands the located bytes to {@code buffer} instead of
+   * returning a new array.
+   *
+   * @param index position of the element in the vector
+   * @param buffer reusable buffer that receives the element data
+   */
+  protected void getData(int index, ReusableBuffer<?> buffer) {
+    final int dataLength = getValueLength(index);
+    if (dataLength > INLINE_SIZE) {
+      // data is in the data buffer
+      // get buffer index
+      final int bufferIndex =
+          viewBuffer.getInt(((long) index * ELEMENT_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH);
+      // get data offset
+      final int dataOffset =
+          viewBuffer.getInt(
+              ((long) index * ELEMENT_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH + BUF_INDEX_WIDTH);
+      ArrowBuf dataBuf = dataBuffers.get(bufferIndex);
+      buffer.set(dataBuf, dataOffset, dataLength);
+    } else {
+      // data is in the view buffer, right after the length field
+      buffer.set(viewBuffer, ((long) index * ELEMENT_SIZE) + LENGTH_WIDTH, dataLength);
+    }
+  }
+
+  @Override
+  public <OUT, IN> OUT accept(VectorVisitor<OUT, IN> visitor, IN value) {
+    return visitor.visit(this, value);
+  }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java
index 8560ba3a68b0..25c83260ef3e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java @@ -119,6 +119,7 @@ public byte[] get(int index) { * @param index position of element. * @param buffer the buffer to write into. */ + @Override public void read(int index, ReusableBuffer buffer) { final long startOffset = getStartOffset(index); final long dataLength = getEndOffset(index) - startOffset; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java index df424c87488a..bc3a1e09aaa7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java @@ -103,6 +103,7 @@ public Types.MinorType getMinorType() { * @param index position of element to get * @return array of bytes for non-null element, null otherwise */ + @Override public byte[] get(int index) { assert index >= 0; if (isSet(index) == 0) { @@ -140,6 +141,7 @@ public Text getObject(int index) { * @param index position of element. * @param buffer the buffer to write into. */ + @Override public void read(int index, ReusableBuffer buffer) { final long startOffset = getStartOffset(index); final long dataLength = getEndOffset(index) - startOffset; @@ -298,7 +300,7 @@ public void validateScalars() { *----------------------------------------------------------------*/ /** - * Construct a TransferPair comprising of this and a target vector of + * Construct a TransferPair comprising this and a target vector of * the same type. 
* * @param ref name of the target vector diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java index ae465418cf2f..18032528c86d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java @@ -46,6 +46,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; +import org.apache.arrow.vector.types.pojo.ArrowType.Utf8View; /** * The buffer layout of vectors for a given type. @@ -173,11 +174,23 @@ public TypeLayout visit(Binary type) { return newVariableWidthTypeLayout(); } + @Override + public TypeLayout visit(ArrowType.BinaryView type) { + // TODO: https://github.com/apache/arrow/issues/40934 + throw new UnsupportedOperationException("BinaryView not supported"); + } + @Override public TypeLayout visit(Utf8 type) { return newVariableWidthTypeLayout(); } + @Override + public TypeLayout visit(Utf8View type) { + // TODO: https://github.com/apache/arrow/issues/40934 + throw new UnsupportedOperationException("Utf8View not supported"); + } + @Override public TypeLayout visit(LargeUtf8 type) { return newLargeVariableWidthTypeLayout(); @@ -347,11 +360,23 @@ public Integer visit(Binary type) { return VARIABLE_WIDTH_BUFFER_COUNT; } + @Override + public Integer visit(ArrowType.BinaryView type) { + // TODO: https://github.com/apache/arrow/issues/40935 + return VARIABLE_WIDTH_BUFFER_COUNT; + } + @Override public Integer visit(Utf8 type) { return VARIABLE_WIDTH_BUFFER_COUNT; } + @Override + public Integer visit(Utf8View type) { + // TODO: https://github.com/apache/arrow/issues/40935 + return VARIABLE_WIDTH_BUFFER_COUNT; + } + @Override public Integer visit(LargeUtf8 type) { return VARIABLE_WIDTH_BUFFER_COUNT; diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java index ab67ebad965a..82d4feda9a99 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java @@ -120,6 +120,7 @@ public byte[] get(int index) { * @param index position of element. * @param buffer the buffer to write into. */ + @Override public void read(int index, ReusableBuffer buffer) { final int startOffset = getStartOffset(index); final int dataLength = getEndOffset(index) - startOffset; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java index c6d5a7090bc6..fde9459e6008 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java @@ -137,6 +137,7 @@ public Text getObject(int index) { * @param index position of element. * @param buffer the buffer to write into. */ + @Override public void read(int index, ReusableBuffer buffer) { final int startOffset = getStartOffset(index); final int dataLength = getEndOffset(index) - startOffset; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthFieldVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthFieldVector.java new file mode 100644 index 000000000000..58b6940a81a1 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthFieldVector.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import java.nio.ByteBuffer;
+
+import org.apache.arrow.memory.ReusableBuffer;
+
+/**
+ * A base interface for common functionalities in variable width vectors.
+ */
+public interface VariableWidthFieldVector extends VariableWidthVector, FieldVector, VectorDefinitionSetter {
+
+  /**
+   * Set the variable length element at the specified index to the supplied byte array.
+   *
+   * @param index position of the element to set
+   * @param value array of bytes with data
+   */
+  void set(int index, byte[] value);
+
+  /**
+   * Set the variable length element at the specified index to the supplied byte array.
+   *
+   * @param index position of the element to set
+   * @param value array of bytes with data
+   * @param start start position in the array
+   * @param length length of the data to write
+   */
+  void set(int index, byte[] value, int start, int length);
+
+  /**
+   * Set the variable length element at the specified index to the supplied ByteBuffer.
+   *
+   * @param index position of the element to set
+   * @param value ByteBuffer with data
+   * @param start start position in the ByteBuffer
+   * @param length length of the data to write
+   */
+  void set(int index, ByteBuffer value, int start, int length);
+
+  /**
+   * Set the variable length element at the specified index to the supplied byte array, and it
+   * handles the case where index and length of a new element are beyond the existing capacity of the
+   * vector.
+   *
+   * @param index position of the element to set
+   * @param value array of bytes to write
+   */
+  void setSafe(int index, byte[] value);
+
+  /**
+   * Set the variable length element at the specified index to the supplied byte array, and it
+   * handles the case where index and length of a new element are beyond the existing capacity.
+   *
+   * @param index position of the element to set
+   * @param value array of bytes with data
+   * @param start start position in the array
+   * @param length length of the data to write
+   */
+  void setSafe(int index, byte[] value, int start, int length);
+
+  /**
+   * Set the variable length element at the specified index to the supplied ByteBuffer, and it
+   * handles the case where index and length of a new element are beyond the existing capacity.
+   *
+   * @param index position of the element to set
+   * @param value ByteBuffer with data
+   * @param start start position in the ByteBuffer
+   * @param length length of the data to write
+   */
+  void setSafe(int index, ByteBuffer value, int start, int length);
+
+  /**
+   * Get the variable length element at the specified index.
+   *
+   * @param index position of the element to get
+   * @return byte array with the data
+   */
+  byte[] get(int index);
+
+  /**
+   * Get the variable length element at the specified index using a ReusableBuffer.
+   *
+   * @param index position of the element to get
+   * @param buffer ReusableBuffer to write the data to
+   */
+  void read(int index, ReusableBuffer<?> buffer);
+
+  /**
+   * Get the index of the last non-null element in the vector.
+   *
+   * @return index of the last non-null element
+   */
+  int getLastSet();
+
+  /**
+   * Set the index of the last non-null element in the vector.
+   *
+   * @param value desired index of last non-null element
+   */
+  void setLastSet(int value);
+
+  /**
+   * Get the length of the variable length element at the specified index.
+   *
+   * @param index position of the element whose length is requested
+   * @return length greater than 0 for a non-null, non-empty element, 0 otherwise
+   */
+  int getValueLength(int index);
+
+  /**
+   * Create holes in the vector up to the given index (exclusive).
+   * Holes will be created from the current last-set position in
+   * the vector.
+   *
+   * @param index target index
+   */
+  void fillEmpties(int index);
+
+  /**
+   * Sets the value length for an element.
+   *
+   * @param index position of the element to set
+   * @param length length of the element
+   */
+  void setValueLengthSafe(int index, int length);
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ViewVarBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ViewVarBinaryVector.java
new file mode 100644
index 000000000000..393df96b2969
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ViewVarBinaryVector.java
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.ReusableBuffer;
+import org.apache.arrow.vector.complex.impl.ViewVarBinaryReaderImpl;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.holders.NullableViewVarBinaryHolder;
+import org.apache.arrow.vector.holders.ViewVarBinaryHolder;
+import org.apache.arrow.vector.types.Types.MinorType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.util.TransferPair;
+
+/**
+ * A variable width view vector holding binary values, any of which may be NULL. Nullness of each
+ * element is tracked by a validity buffer (bit vector).
+ */
+public final class ViewVarBinaryVector extends BaseVariableWidthViewVector {
+
+  /**
+   * Create a nullable ViewVarBinaryVector. No memory is allocated for the data of the vector.
+   *
+   * @param name name given to the vector
+   * @param allocator allocator used for memory management.
+   */
+  public ViewVarBinaryVector(String name, BufferAllocator allocator) {
+    this(name, FieldType.nullable(MinorType.VIEWVARBINARY.getType()), allocator);
+  }
+
+  /**
+   * Create a ViewVarBinaryVector. No memory is allocated for the data of the vector.
+   *
+   * @param name name given to the vector
+   * @param fieldType type of the Field this vector materializes
+   * @param allocator allocator used for memory management.
+   */
+  public ViewVarBinaryVector(String name, FieldType fieldType, BufferAllocator allocator) {
+    this(new Field(name, fieldType, null), allocator);
+  }
+
+  /**
+   * Create a ViewVarBinaryVector. No memory is allocated for the data of the vector.
+   *
+   * @param field the field this vector materializes
+   * @param allocator allocator used for memory management.
+   */
+  public ViewVarBinaryVector(Field field, BufferAllocator allocator) {
+    super(field, allocator);
+  }
+
+  @Override
+  protected FieldReader getReaderImpl() {
+    return new ViewVarBinaryReaderImpl(this);
+  }
+
+  /**
+   * Report the minor type of this vector, i.e. the type its values belong to.
+   *
+   * @return {@link org.apache.arrow.vector.types.Types.MinorType}
+   */
+  @Override
+  public MinorType getMinorType() {
+    return MinorType.VIEWVARBINARY;
+  }
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value retrieval methods                        |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+  /**
+   * Fetch the element stored at the given index as a byte array.
+   *
+   * @param index position of the element to fetch
+   * @return the bytes of a non-null element, null otherwise
+   */
+  public byte[] get(int index) {
+    assert index >= 0;
+    if (!NULL_CHECKING_ENABLED || isSet(index) != 0) {
+      return getData(index);
+    }
+    return null;
+  }
+
+  /**
+   * Hand the value stored at the given position to the supplied output buffer. Nullity is not
+   * checked here; the caller has to check it beforehand.
+   *
+   * @param index position of the element to read.
+   * @param buffer the buffer that receives the value.
+   */
+  @Override
+  public void read(int index, ReusableBuffer<?> buffer) {
+    getData(index, buffer);
+  }
+
+  /**
+   * Fetch the element stored at the given index; delegates to {@link #get(int)}.
+   *
+   * @param index position of the element to fetch
+   * @return the bytes of a non-null element, null otherwise
+   */
+  @Override
+  public byte[] getObject(int index) {
+    return get(index);
+  }
+
+  /**
+   * Populate the provided holder with the element at the given index. Not implemented yet.
+   *
+   * @param index position of the element to fetch
+   * @param holder data holder this function is meant to populate
+   */
+  public void get(int index, NullableViewVarBinaryHolder holder) {
+    // TODO: https://github.com/apache/arrow/issues/40936
+    throw new UnsupportedOperationException("Unsupported operation");
+  }
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value setter methods                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+  /**
+   * Store the data buffer carried by the holder as the element at the given index.
+   * Not implemented yet.
+   *
+   * @param index position of the element to store
+   * @param holder holder carrying the data buffer.
+   */
+  public void set(int index, ViewVarBinaryHolder holder) {
+    // TODO: https://github.com/apache/arrow/issues/40936
+    throw new UnsupportedOperationException("Unsupported operation");
+  }
+
+  /**
+   * Variant of {@link #set(int, ViewVarBinaryHolder)} intended to also cope with an index and
+   * element length that exceed the current capacity of the vector. Not implemented yet.
+   *
+   * @param index position of the element to store
+   * @param holder holder carrying the data buffer.
+   */
+  public void setSafe(int index, ViewVarBinaryHolder holder) {
+    // TODO: https://github.com/apache/arrow/issues/40936
+    throw new UnsupportedOperationException("Unsupported operation");
+  }
+
+  /**
+   * Store the data buffer carried by the nullable holder as the element at the given index.
+   * Not implemented yet.
+   *
+   * @param index position of the element to store
+   * @param holder holder carrying the data buffer.
+   */
+  public void set(int index, NullableViewVarBinaryHolder holder) {
+    // TODO: https://github.com/apache/arrow/issues/40936
+    throw new UnsupportedOperationException("Unsupported operation");
+  }
+
+  /**
+   * Variant of {@link #set(int, NullableViewVarBinaryHolder)} intended to also cope with an index
+   * and element length that exceed the current capacity of the vector. Not implemented yet.
+   *
+   * @param index position of the element to store
+   * @param holder holder carrying the data buffer.
+   */
+  public void setSafe(int index, NullableViewVarBinaryHolder holder) {
+    // TODO: https://github.com/apache/arrow/issues/40936
+    throw new UnsupportedOperationException("Unsupported operation");
+  }
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |                      vector transfer                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+  /**
+   * Build a TransferPair made of this vector and a target vector of the same type. Not implemented yet.
+   *
+   * @param ref name of the target vector
+   * @param allocator allocator for the target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
+    // TODO: https://github.com/apache/arrow/issues/40932
+    throw new UnsupportedOperationException("Unsupported operation");
+  }
+
+  @Override
+  public TransferPair getTransferPair(Field field, BufferAllocator allocator) {
+    // TODO: https://github.com/apache/arrow/issues/40932
+    throw new UnsupportedOperationException("Unsupported operation");
+  }
+
+  /**
+   * Build a TransferPair towards the given target vector of the same type. Not implemented yet.
+   *
+   * @param to target vector
+   * @return {@link TransferPair}
+   */
+  @Override
+  public TransferPair makeTransferPair(ValueVector to) {
+    // TODO: https://github.com/apache/arrow/issues/40932
+    throw new UnsupportedOperationException("Unsupported operation");
+  }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ViewVarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ViewVarCharVector.java
new file mode 100644
index 000000000000..010df02e0bce
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ViewVarCharVector.java
@@ -0,0 +1,291 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.ReusableBuffer;
+import org.apache.arrow.vector.complex.impl.ViewVarCharReaderImpl;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.holders.NullableViewVarCharHolder;
+import org.apache.arrow.vector.holders.ViewVarCharHolder;
+import org.apache.arrow.vector.types.Types.MinorType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.util.Text;
+import org.apache.arrow.vector.util.TransferPair;
+import org.apache.arrow.vector.validate.ValidateUtil;
+
+/**
+ * ViewVarCharVector implements a view of a variable width vector of VARCHAR
+ * values which could be NULL. A validity buffer (bit vector) is maintained
+ * to track which elements in the vector are null. A viewBuffer keeps track
+ * of all values in the vector, and an external data buffer is kept to keep longer
+ * strings (>12).
+ */
+public final class ViewVarCharVector extends BaseVariableWidthViewVector {
+
+  /**
+   * Instantiate a nullable ViewVarCharVector of type {@link MinorType#VIEWVARCHAR}. This doesn't
+   * allocate any memory for the data in vector.
+   * @param name name of the vector
+   * @param allocator allocator for memory management.
+   */
+  public ViewVarCharVector(String name, BufferAllocator allocator) {
+    this(name, FieldType.nullable(MinorType.VIEWVARCHAR.getType()), allocator);
+  }
+
+  /**
+   * Instantiate a ViewVarCharVector. This doesn't allocate any memory for
+   * the data in vector.
+   * @param name name of the vector
+   * @param fieldType type of Field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public ViewVarCharVector(String name, FieldType fieldType, BufferAllocator allocator) {
+    this(new Field(name, fieldType, null), allocator);
+  }
+
+  /**
+   * Instantiate a ViewVarCharVector. This doesn't allocate any memory for
+   * the data in vector.
+   *
+   * @param field field materialized by this vector
+   * @param allocator allocator for memory management.
+   */
+  public ViewVarCharVector(Field field, BufferAllocator allocator) {
+    super(field, allocator);
+  }
+
+  @Override
+  protected FieldReader getReaderImpl() {
+    return new ViewVarCharReaderImpl(ViewVarCharVector.this);
+  }
+
+  /**
+   * Get a minor type for this vector. The vector holds values belonging
+   * to a particular type.
+   * @return {@link org.apache.arrow.vector.types.Types.MinorType}
+   */
+  @Override
+  public MinorType getMinorType() {
+    return MinorType.VIEWVARCHAR;
+  }
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value retrieval methods                        |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+  /**
+   * Get the variable length element at specified index as a byte array.
+   *
+   * @param index position of an element to get
+   * @return array of bytes for a non-null element, null otherwise
+   */
+  public byte[] get(int index) {
+    assert index >= 0;
+    if (NULL_CHECKING_ENABLED && isSet(index) == 0) {
+      return null;
+    }
+    return getData(index);
+  }
+
+  /**
+   * Get the variable length element at specified index as Text.
+   *
+   * @param index position of an element to get
+   * @return Text object for a non-null element, null otherwise
+   */
+  @Override
+  public Text getObject(int index) {
+    assert index >= 0;
+    if (NULL_CHECKING_ENABLED && isSet(index) == 0) {
+      return null;
+    }
+
+    final Text result = new Text();
+    read(index, result);
+    return result;
+  }
+
+  /**
+   * Read the value at the given position to the given output buffer.
+   * The caller is responsible for checking for nullity first.
+   *
+   * @param index position of an element.
+   * @param buffer the buffer to write into.
+   */
+  @Override
+  public void read(int index, ReusableBuffer<?> buffer) {
+    getData(index, buffer);
+  }
+
+  /**
+   * Get the variable length element at specified index and sets the state
+   * in provided holder. Not implemented yet: always throws UnsupportedOperationException.
+   *
+   * @param index position of an element to get
+   * @param holder data holder to be populated by this function
+   */
+  public void get(int index, NullableViewVarCharHolder holder) {
+    // TODO: https://github.com/apache/arrow/issues/40937
+    throw new UnsupportedOperationException("NullableViewVarCharHolder get operation not supported");
+  }
+
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |          vector value setter methods                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+
+  /**
+   * Set the variable length element at the specified index to the data
+   * buffer supplied in the holder. Not implemented yet.
+   *
+   * @param index position of the element to set
+   * @param holder holder that carries data buffer.
+   */
+  public void set(int index, ViewVarCharHolder holder) {
+    // TODO: https://github.com/apache/arrow/issues/40937
+    throw new UnsupportedOperationException("ViewVarCharHolder set operation not supported");
+  }
+
+  /**
+   * Same as {@link #set(int, ViewVarCharHolder)} except that it handles the
+   * case where index and length of a new element are beyond the existing
+   * capacity of the vector. Not implemented yet.
+   *
+   * @param index position of the element to set
+   * @param holder holder that carries data buffer.
+   */
+  public void setSafe(int index, ViewVarCharHolder holder) {
+    // TODO: https://github.com/apache/arrow/issues/40937
+    throw new UnsupportedOperationException("ViewVarCharHolder setSafe operation not supported");
+  }
+
+  /**
+   * Set the variable length element at the specified index to the data
+   * buffer supplied in the holder. Not implemented yet.
+   *
+   * @param index position of the element to set
+   * @param holder holder that carries data buffer.
+   */
+  public void set(int index, NullableViewVarCharHolder holder) {
+    // TODO: https://github.com/apache/arrow/issues/40937
+    throw new UnsupportedOperationException("NullableViewVarCharHolder set operation not supported");
+  }
+
+  /**
+   * Same as {@link #set(int, NullableViewVarCharHolder)} except that it handles the
+   * case where index and length of a new element are beyond the existing
+   * capacity of the vector. Not implemented yet.
+   *
+   * @param index position of the element to set
+   * @param holder holder that carries data buffer.
+   */
+  public void setSafe(int index, NullableViewVarCharHolder holder) {
+    // TODO: https://github.com/apache/arrow/issues/40937
+    throw new UnsupportedOperationException("NullableViewVarCharHolder setSafe operation not supported");
+  }
+
+  /**
+   * Set the variable length element at the specified index to the
+   * content in supplied Text.
+   *
+   * @param index position of the element to set
+   * @param text Text object with data
+   */
+  public void set(int index, Text text) {
+    set(index, text.getBytes(), 0, (int) text.getLength());
+  }
+
+  /**
+   * Same as {@link #set(int, Text)} except that it handles the
+   * case where index and length of a new element are beyond the existing
+   * capacity of the vector.
+   *
+   * @param index position of the element to set.
+   * @param text Text object with data
+   */
+  public void setSafe(int index, Text text) {
+    setSafe(index, text.getBytes(), 0, (int) text.getLength());
+  }
+
+  @Override
+  public void validateScalars() {
+    for (int i = 0; i < getValueCount(); ++i) {
+      byte[] value = get(i);
+      if (value != null) {
+        ValidateUtil.validateOrThrow(Text.validateUTF8NoThrow(value),
+            "Non-UTF-8 data in ViewVarCharVector at position " + i + ".");
+      }
+    }
+  }
+
+  /*----------------------------------------------------------------*
+   |                                                                |
+   |                      vector transfer                           |
+   |                                                                |
+   *----------------------------------------------------------------*/
+
+  /**
+   * Construct a TransferPair comprising this and a target vector of the same type.
+   *
+   * @param ref name of the target vector
+   * @param allocator allocator for the target vector
+   * @return {@link TransferPair} (UnsupportedOperationException)
+   */
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
+    // TODO: https://github.com/apache/arrow/issues/40932
+    throw new UnsupportedOperationException(
+        "ViewVarCharVector does not support getTransferPair(String, BufferAllocator)");
+  }
+
+  /**
+   * Construct a TransferPair with a desired target vector of the same type.
+   *
+   * @param field The field materialized by this vector.
+   * @param allocator allocator for the target vector
+   * @return {@link TransferPair} (UnsupportedOperationException)
+   */
+  @Override
+  public TransferPair getTransferPair(Field field, BufferAllocator allocator) {
+    // TODO: https://github.com/apache/arrow/issues/40932
+    throw new UnsupportedOperationException(
+        "ViewVarCharVector does not support getTransferPair(Field, BufferAllocator)");
+  }
+
+  /**
+   * Construct a TransferPair with a desired target vector of the same type.
+   *
+   * @param target the target for the transfer
+   * @return {@link TransferPair} (UnsupportedOperationException)
+   */
+  @Override
+  public TransferPair makeTransferPair(ValueVector target) {
+    // TODO: https://github.com/apache/arrow/issues/40932
+    throw new UnsupportedOperationException(
+        "ViewVarCharVector does not support makeTransferPair(ValueVector)");
+  }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java
index 5323ddda838c..56220d270fa9 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java
@@ -27,6 +27,7 @@
 import org.apache.arrow.vector.BaseFixedWidthVector;
 import org.apache.arrow.vector.BaseLargeVariableWidthVector;
 import org.apache.arrow.vector.BaseVariableWidthVector;
+import org.apache.arrow.vector.BaseVariableWidthViewVector;
 import org.apache.arrow.vector.BitVector;
 import org.apache.arrow.vector.ExtensionTypeVector;
 import org.apache.arrow.vector.NullVector;
@@ -162,6 +163,11 @@ public Boolean visit(BaseLargeVariableWidthVector left, Range range) {
     return compareBaseLargeVariableWidthVectors(range);
   }
 
+  @Override
+  public Boolean visit(BaseVariableWidthViewVector left, Range range) {
+    throw new UnsupportedOperationException("View vectors are not supported.");
+  }
+
   @Override
   public Boolean visit(ListVector left, Range range) {
     if (!validate(left)) {
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java
index 443ee1f96e27..9bbe5c1b8997 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java
@@ -23,6 +23,7 @@
 import
org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; @@ -85,6 +86,11 @@ public Boolean visit(BaseLargeVariableWidthVector left, Void value) { return compareField(left.getField(), right.getField()); } + @Override + public Boolean visit(BaseVariableWidthViewVector left, Void value) { + throw new UnsupportedOperationException("View vectors are not supported."); + } + @Override public Boolean visit(ListVector left, Void value) { return compareField(left.getField(), right.getField()); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java index aee090706b3c..de88f25e6753 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java @@ -20,6 +20,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.complex.DenseUnionVector; @@ -42,6 +43,8 @@ public interface VectorVisitor { OUT visit(BaseLargeVariableWidthVector left, IN value); + OUT visit(BaseVariableWidthViewVector left, IN value); + OUT visit(ListVector left, IN value); OUT visit(FixedSizeListVector left, IN value); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 
0b0e0d66a98f..89d8441d42aa 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -65,6 +65,8 @@ import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.ViewVarBinaryVector; +import org.apache.arrow.vector.ViewVarCharVector; import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; @@ -114,10 +116,13 @@ import org.apache.arrow.vector.complex.impl.UnionWriter; import org.apache.arrow.vector.complex.impl.VarBinaryWriterImpl; import org.apache.arrow.vector.complex.impl.VarCharWriterImpl; +import org.apache.arrow.vector.complex.impl.ViewVarBinaryWriterImpl; +import org.apache.arrow.vector.complex.impl.ViewVarCharWriterImpl; import org.apache.arrow.vector.complex.writer.FieldWriter; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor; import org.apache.arrow.vector.types.pojo.ArrowType.Binary; +import org.apache.arrow.vector.types.pojo.ArrowType.BinaryView; import org.apache.arrow.vector.types.pojo.ArrowType.Bool; import org.apache.arrow.vector.types.pojo.ArrowType.Date; import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; @@ -138,6 +143,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; +import org.apache.arrow.vector.types.pojo.ArrowType.Utf8View; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.CallBack; @@ -504,6 +510,20 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new VarCharWriterImpl((VarCharVector) vector); } }, + 
VIEWVARCHAR(Utf8View.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new ViewVarCharVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new ViewVarCharWriterImpl((ViewVarCharVector) vector); + } + }, LARGEVARCHAR(LargeUtf8.INSTANCE) { @Override public FieldVector getNewVector( @@ -546,6 +566,20 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new VarBinaryWriterImpl((VarBinaryVector) vector); } }, + VIEWVARBINARY(BinaryView.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new ViewVarBinaryVector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new ViewVarBinaryWriterImpl((ViewVarBinaryVector) vector); + } + }, DECIMAL(null) { @Override public FieldVector getNewVector( @@ -923,6 +957,11 @@ public MinorType visit(Utf8 type) { return MinorType.VARCHAR; } + @Override + public MinorType visit(Utf8View type) { + return MinorType.VIEWVARCHAR; + } + @Override public Types.MinorType visit(LargeUtf8 type) { return MinorType.LARGEVARCHAR; @@ -933,6 +972,11 @@ public MinorType visit(Binary type) { return MinorType.VARBINARY; } + @Override + public MinorType visit(BinaryView type) { + return MinorType.VIEWVARBINARY; + } + @Override public MinorType visit(LargeBinary type) { return MinorType.LARGEVARBINARY; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DataSizeRoundingUtil.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DataSizeRoundingUtil.java index 3af2c9837407..5f5993397513 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DataSizeRoundingUtil.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DataSizeRoundingUtil.java @@ -93,6 +93,10 @@ public static long divideBy8Ceil(long 
input) { return (input + 7) >>> (long) DIVIDE_BY_8_SHIFT_BITS; } + public static long roundUpToMultipleOf16(long num) { + return (num + 15) & 0xFFFFFFFFFFFFFFF0L; + } + private DataSizeRoundingUtil() { } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ReusableByteArray.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ReusableByteArray.java index d938cd833a41..10a195e1e7b0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ReusableByteArray.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ReusableByteArray.java @@ -64,6 +64,13 @@ public void set(ArrowBuf srcBytes, long start, long len) { length = (int) len; } + @Override + public void set(byte[] srcBytes, long start, long len) { + setCapacity((int) len, false); + System.arraycopy(srcBytes, (int) start, bytes, 0, (int) len); + length = (int) len; + } + @Override public boolean equals(Object o) { if (o == this) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java index 95e35ce6938c..ea631c59ce2f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java @@ -235,9 +235,7 @@ public void set(Text other) { * @param len the number of bytes of the new string */ public void set(byte[] utf8, int start, int len) { - setCapacity(len, false); - System.arraycopy(utf8, start, bytes, 0, len); - this.length = len; + super.set(utf8, start, len); } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java index 068717c7acbc..def8ef96877e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -26,6 +26,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import 
org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.ExtensionTypeVector; @@ -205,6 +206,11 @@ public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) { return targetVector; } + @Override + public ValueVector visit(BaseVariableWidthViewVector left, Void value) { + throw new UnsupportedOperationException("View vectors are not supported."); + } + @Override public ValueVector visit(ListVector deltaVector, Void value) { Preconditions.checkArgument(typeVisitor.equals(deltaVector), diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java index d4abaa1945b9..0a67db0455b4 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java @@ -23,6 +23,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.FieldVector; @@ -133,6 +134,11 @@ public Void visit(BaseLargeVariableWidthVector vector, Void value) { return null; } + @Override + public Void visit(BaseVariableWidthViewVector vector, Void value) { + throw new UnsupportedOperationException("View vectors are not supported."); + } + @Override public Void visit(ListVector vector, Void value) { int valueCount = vector.getValueCount(); diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java index 6d33be7a0dba..ddcb658c1a95 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java @@ -23,6 +23,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; @@ -103,6 +104,11 @@ public Void visit(BaseLargeVariableWidthVector vector, Void value) { return null; } + @Override + public Void visit(BaseVariableWidthViewVector vector, Void value) { + throw new UnsupportedOperationException("View vectors are not supported."); + } + @Override public Void visit(ListVector vector, Void value) { validateOffsetBuffer(vector, vector.getValueCount()); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java index 3d1c5a4f27f7..bbdabdb1226a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java @@ -22,6 +22,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.DateDayVector; 
@@ -308,6 +309,11 @@ public Void visit(BaseLargeVariableWidthVector vector, Void value) { return null; } + @Override + public Void visit(BaseVariableWidthViewVector vector, Void value) { + throw new UnsupportedOperationException("View vectors are not supported."); + } + @Override public Void visit(ListVector vector, Void value) { validateVectorCommon(vector, ArrowType.List.class); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java index 7e99b1f90fb6..786a1142a2b0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java @@ -23,6 +23,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.NullVector; @@ -98,6 +99,11 @@ public Void visit(BaseLargeVariableWidthVector left, Void value) { return null; } + @Override + public Void visit(BaseVariableWidthViewVector left, Void value) { + throw new UnsupportedOperationException("View vectors are not supported."); + } + @Override public Void visit(ListVector vector, Void value) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java b/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java index 7e64dd386463..be83e573c7c4 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java @@ -29,11 +29,21 @@ public static VarCharVector newVarCharVector(String name, BufferAllocator alloca FieldType.nullable(new 
ArrowType.Utf8()).createNewSingleVector(name, allocator, null); } + public static ViewVarCharVector newViewVarCharVector(String name, BufferAllocator allocator) { + return (ViewVarCharVector) + FieldType.nullable(new ArrowType.Utf8View()).createNewSingleVector(name, allocator, null); + } + public static VarBinaryVector newVarBinaryVector(String name, BufferAllocator allocator) { return (VarBinaryVector) FieldType.nullable(new ArrowType.Binary()).createNewSingleVector(name, allocator, null); } + public static ViewVarBinaryVector newViewVarBinaryVector(String name, BufferAllocator allocator) { + return (ViewVarBinaryVector) + FieldType.nullable(new ArrowType.BinaryView()).createNewSingleVector(name, allocator, null); + } + public static T newVector(Class c, String name, ArrowType type, BufferAllocator allocator) { return c.cast(FieldType.nullable(type).createNewSingleVector(name, allocator, null)); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVarCharViewVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVarCharViewVector.java new file mode 100644 index 000000000000..efb5afac91b1 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVarCharViewVector.java @@ -0,0 +1,1462 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.TestUtils.newVector; +import static org.apache.arrow.vector.TestUtils.newViewVarBinaryVector; +import static org.apache.arrow.vector.TestUtils.newViewVarCharVector; +import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.Random; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.util.ReusableByteArray; +import org.apache.arrow.vector.util.Text; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + + +public class TestVarCharViewVector { + + // short string (length <= 12) + private static final byte[] STR0 = "0123456".getBytes(StandardCharsets.UTF_8); + // short string (length <= 12) + private static final byte[] STR1 = "012345678912".getBytes(StandardCharsets.UTF_8); + // long string (length > 12) + private static final byte[] STR2 = 
"0123456789123".getBytes(StandardCharsets.UTF_8); + // long string (length > 12) + private static final byte[] STR3 = "01234567891234567".getBytes(StandardCharsets.UTF_8); + // short string (length <= 12) + private static final byte[] STR4 = "01234567".getBytes(StandardCharsets.UTF_8); + // short string (length <= 12) + private static final byte[] STR5 = "A1234A".getBytes(StandardCharsets.UTF_8); + // short string (length <= 12) + private static final byte[] STR6 = "B1234567B".getBytes(StandardCharsets.UTF_8); + // long string (length > 12) + private static final byte[] STR7 = "K01234567891234567K".getBytes(StandardCharsets.UTF_8); + // long string (length > 12) + private static final byte[] STR8 = "M012345678912345678M".getBytes(StandardCharsets.UTF_8); + private static final String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @BeforeEach + public void prepare() { + allocator = new RootAllocator(Integer.MAX_VALUE); + } + + @AfterEach + public void shutdown() { + allocator.close(); + } + + public static void setBytes(int index, byte[] bytes, ViewVarCharVector vector) { + BitVectorHelper.setBit(vector.validityBuffer, index); + vector.setBytes(index, bytes, 0, bytes.length); + } + + @Test + public void testInlineAllocation() { + try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { + viewVarCharVector.allocateNew(48, 3); + final int valueCount = 3; + viewVarCharVector.set(0, STR0); + viewVarCharVector.set(1, STR1); + viewVarCharVector.set(2, STR4); + viewVarCharVector.setValueCount(valueCount); + + byte[] view1 = viewVarCharVector.get(0); + byte[] view2 = viewVarCharVector.get(1); + byte[] view3 = viewVarCharVector.get(2); + + assertNotNull(view1); + assertNotNull(view2); + assertNotNull(view3); + + String str1 = new String(STR0, StandardCharsets.UTF_8); + String str2 = new String(STR1, StandardCharsets.UTF_8); + String str3 = new String(STR4, StandardCharsets.UTF_8); + + assertEquals(new String(view1, 
StandardCharsets.UTF_8), str1); + assertEquals(new String(view2, StandardCharsets.UTF_8), str2); + assertEquals(new String(view3, StandardCharsets.UTF_8), str3); + + assertTrue(viewVarCharVector.dataBuffers.isEmpty()); + + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(0)).getBuffer(), + StandardCharsets.UTF_8), str1); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(1)).getBuffer(), + StandardCharsets.UTF_8), str2); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(2)).getBuffer(), + StandardCharsets.UTF_8), str3); + } + } + + @Test + public void testDataBufferBasedAllocationInSameBuffer() { + try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { + viewVarCharVector.allocateNew(48, 4); + final int valueCount = 4; + String str4 = generateRandomString(34); + viewVarCharVector.set(0, STR1); + viewVarCharVector.set(1, STR2); + viewVarCharVector.set(2, STR3); + viewVarCharVector.set(3, str4.getBytes(StandardCharsets.UTF_8)); + viewVarCharVector.setValueCount(valueCount); + + byte[] view1 = viewVarCharVector.get(0); + byte[] view2 = viewVarCharVector.get(1); + byte[] view3 = viewVarCharVector.get(2); + byte[] view4 = viewVarCharVector.get(3); + + assertNotNull(view1); + assertNotNull(view2); + assertNotNull(view3); + assertNotNull(view4); + + String str1 = new String(STR1, StandardCharsets.UTF_8); + String str2 = new String(STR2, StandardCharsets.UTF_8); + String str3 = new String(STR3, StandardCharsets.UTF_8); + + assertEquals(new String(view1, StandardCharsets.UTF_8), str1); + assertEquals(new String(view2, StandardCharsets.UTF_8), str2); + assertEquals(new String(view3, StandardCharsets.UTF_8), str3); + assertEquals(new String(view4, StandardCharsets.UTF_8), str4); + + assertEquals(1, viewVarCharVector.dataBuffers.size()); + + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(0)).getBuffer(), + StandardCharsets.UTF_8), 
str1); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(1)).getBuffer(), + StandardCharsets.UTF_8), str2); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(2)).getBuffer(), + StandardCharsets.UTF_8), str3); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(3)).getBuffer(), + StandardCharsets.UTF_8), str4); + } + } + + @Test + public void testDataBufferBasedAllocationInOtherBuffer() { + try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { + viewVarCharVector.allocateNew(48, 4); + final int valueCount = 4; + String str4 = generateRandomString(35); + viewVarCharVector.set(0, STR1); + viewVarCharVector.set(1, STR2); + viewVarCharVector.set(2, STR3); + viewVarCharVector.set(3, str4.getBytes(StandardCharsets.UTF_8)); + viewVarCharVector.setValueCount(valueCount); + + byte[] view1 = viewVarCharVector.get(0); + byte[] view2 = viewVarCharVector.get(1); + byte[] view3 = viewVarCharVector.get(2); + byte[] view4 = viewVarCharVector.get(3); + + assertNotNull(view1); + assertNotNull(view2); + assertNotNull(view3); + assertNotNull(view4); + + String str1 = new String(STR1, StandardCharsets.UTF_8); + String str2 = new String(STR2, StandardCharsets.UTF_8); + String str3 = new String(STR3, StandardCharsets.UTF_8); + + assertEquals(new String(view1, StandardCharsets.UTF_8), str1); + assertEquals(new String(view2, StandardCharsets.UTF_8), str2); + assertEquals(new String(view3, StandardCharsets.UTF_8), str3); + assertEquals(new String(view4, StandardCharsets.UTF_8), str4); + + assertEquals(2, viewVarCharVector.dataBuffers.size()); + + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(0)).getBuffer(), + StandardCharsets.UTF_8), str1); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(1)).getBuffer(), + StandardCharsets.UTF_8), str2); + assertEquals(new 
String(Objects.requireNonNull(viewVarCharVector.getObject(2)).getBuffer(), + StandardCharsets.UTF_8), str3); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(3)).getBuffer(), + StandardCharsets.UTF_8), str4); + } + } + + @Test + public void testMixedAllocation() { + try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { + viewVarCharVector.allocateNew(128, 6); + final int valueCount = 6; + String str4 = generateRandomString(35); + String str6 = generateRandomString(40); + viewVarCharVector.set(0, STR1); + viewVarCharVector.set(1, STR2); + viewVarCharVector.set(2, STR3); + viewVarCharVector.set(3, str4.getBytes(StandardCharsets.UTF_8)); + viewVarCharVector.set(4, STR1); + viewVarCharVector.set(5, str6.getBytes(StandardCharsets.UTF_8)); + viewVarCharVector.setValueCount(valueCount); + + byte[] view1 = viewVarCharVector.get(0); + byte[] view2 = viewVarCharVector.get(1); + byte[] view3 = viewVarCharVector.get(2); + byte[] view4 = viewVarCharVector.get(3); + byte[] view5 = viewVarCharVector.get(4); + byte[] view6 = viewVarCharVector.get(5); + + assertNotNull(view1); + assertNotNull(view2); + assertNotNull(view3); + assertNotNull(view4); + assertNotNull(view5); + assertNotNull(view6); + + String str1 = new String(STR1, StandardCharsets.UTF_8); + String str2 = new String(STR2, StandardCharsets.UTF_8); + String str3 = new String(STR3, StandardCharsets.UTF_8); + + assertEquals(new String(view1, StandardCharsets.UTF_8), str1); + assertEquals(new String(view2, StandardCharsets.UTF_8), str2); + assertEquals(new String(view3, StandardCharsets.UTF_8), str3); + assertEquals(new String(view4, StandardCharsets.UTF_8), str4); + assertEquals(new String(view5, StandardCharsets.UTF_8), str1); + assertEquals(new String(view6, StandardCharsets.UTF_8), str6); + + assertEquals(1, viewVarCharVector.dataBuffers.size()); + + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(0)).getBuffer(), + 
StandardCharsets.UTF_8), str1); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(1)).getBuffer(), + StandardCharsets.UTF_8), str2); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(2)).getBuffer(), + StandardCharsets.UTF_8), str3); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(3)).getBuffer(), + StandardCharsets.UTF_8), str4); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(4)).getBuffer(), + StandardCharsets.UTF_8), str1); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(5)).getBuffer(), + StandardCharsets.UTF_8), str6); + } + } + + @Test + public void testAllocationIndexOutOfBounds() { + assertThrows(IndexOutOfBoundsException.class, () -> { + try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { + viewVarCharVector.allocateNew(32, 3); + final int valueCount = 3; + viewVarCharVector.set(0, STR1); + viewVarCharVector.set(1, STR2); + viewVarCharVector.set(2, STR2); + viewVarCharVector.setValueCount(valueCount); + } + }); + } + + @Test + public void testSizeOfViewBufferElements() { + try (final ViewVarCharVector vector = new ViewVarCharVector(EMPTY_SCHEMA_PATH, allocator)) { + int valueCount = 100; + int currentSize = 0; + vector.setInitialCapacity(valueCount); + vector.allocateNew(); + vector.setValueCount(valueCount); + for (int i = 0; i < valueCount; i++) { + currentSize += i; + vector.setSafe(i, new byte[i]); + } + assertEquals(currentSize, vector.sizeOfViewBufferElements()); + } + } + + @Test + public void testNullableVarType1() { + + // Create a new ViewVarCharVector to hold the sample strings. 
+ try (final ViewVarCharVector vector = newViewVarCharVector(EMPTY_SCHEMA_PATH, allocator)) { + vector.allocateNew(1024 * 10, 1024); + + vector.set(0, STR1); + vector.set(1, STR2); + vector.set(2, STR3); + vector.setSafe(3, STR3, 1, STR3.length - 1); + vector.setSafe(4, STR3, 2, STR3.length - 2); + ByteBuffer str3ByteBuffer = ByteBuffer.wrap(STR3); + vector.setSafe(5, str3ByteBuffer, 1, STR3.length - 1); + vector.setSafe(6, str3ByteBuffer, 2, STR3.length - 2); + + // Set with convenience function + Text txt = new Text("foo"); + vector.setSafe(7, txt.getBytes(), 0, (int) txt.getLength()); + + // Check the sample strings. + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(3)); + assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(4)); + assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(5)); + assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(6)); + + // Check returning a Text object + assertEquals(txt, vector.getObject(7)); + + // Ensure a slot that was never set reads back as null. 
+ assertNull(vector.get(8)); + } + } + + @Test + public void testGetTextRepeatedly() { + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + ValueVectorDataPopulator.setVector(vector, STR1, STR2); + vector.setValueCount(2); + + /* check the vector output */ + Text text = new Text(); + vector.read(0, text); + assertArrayEquals(STR1, text.getBytes()); + vector.read(1, text); + assertArrayEquals(STR2, text.getBytes()); + } + } + + @Test + public void testNullableVarType2() { + try (final ViewVarBinaryVector vector = newViewVarBinaryVector(EMPTY_SCHEMA_PATH, allocator)) { + vector.allocateNew(1024 * 10, 1024); + vector.set(0, STR1); + vector.set(1, STR2); + vector.set(2, STR3); + vector.setSafe(3, STR3, 1, STR3.length - 1); + vector.setSafe(4, STR3, 2, STR3.length - 2); + ByteBuffer str3ByteBuffer = ByteBuffer.wrap(STR3); + vector.setSafe(5, str3ByteBuffer, 1, STR3.length - 1); + vector.setSafe(6, str3ByteBuffer, 2, STR3.length - 2); + + // Check the sample strings. + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(3)); + assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(4)); + assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(5)); + assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(6)); + + // Ensure a slot that was never set reads back as null. 
+ assertNull(vector.get(7)); + } + } + + @Test + public void testGetBytesRepeatedly() { + try (ViewVarBinaryVector vector = new ViewVarBinaryVector("", allocator)) { + vector.allocateNew(5, 1); + + final String str = "hello world!!!"; + final String str2 = "foo"; + vector.setSafe(0, str.getBytes(StandardCharsets.UTF_8)); + vector.setSafe(1, str2.getBytes(StandardCharsets.UTF_8)); + + // verify results + ReusableByteArray reusableByteArray = new ReusableByteArray(); + vector.read(0, reusableByteArray); + assertArrayEquals( + str.getBytes(StandardCharsets.UTF_8), + Arrays.copyOfRange( + reusableByteArray.getBuffer(), 0, (int) reusableByteArray.getLength())); + byte[] oldBuffer = reusableByteArray.getBuffer(); + + vector.read(1, reusableByteArray); + assertArrayEquals( + str2.getBytes(StandardCharsets.UTF_8), + Arrays.copyOfRange( + reusableByteArray.getBuffer(), 0, (int) reusableByteArray.getLength())); + + // There should not have been any reallocation since the newer value is smaller in length. 
+ assertSame(oldBuffer, reusableByteArray.getBuffer()); + } + } + + @Test + public void testReAllocVariableWidthViewVector() { + try (final ViewVarCharVector vector = newVector(ViewVarCharVector.class, EMPTY_SCHEMA_PATH, + Types.MinorType.VIEWVARCHAR, allocator)) { + final int capacityLimit = 4095; + final int overLimitIndex = 200; + vector.setInitialCapacity(capacityLimit); + vector.allocateNew(); + + int initialCapacity = vector.getValueCapacity(); + assertTrue(initialCapacity >= capacityLimit); + + /* Put values in indexes that fall within the initial allocation */ + vector.setSafe(0, STR1, 0, STR1.length); + vector.setSafe(initialCapacity - 1, STR2, 0, STR2.length); + + /* the set calls above should NOT have triggered a realloc */ + assertEquals(initialCapacity, vector.getValueCapacity()); + + /* Now try to put values in space that falls beyond the initial allocation */ + vector.setSafe(initialCapacity + overLimitIndex, STR3, 0, STR3.length); + + /* Check valueCapacity is more than initial allocation */ + assertTrue(initialCapacity * 2 <= vector.getValueCapacity()); + + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(initialCapacity - 1)); + assertArrayEquals(STR3, vector.get(initialCapacity + overLimitIndex)); + + // Set the valueCount to be more than valueCapacity of current allocation. This is possible for ValueVectors + // as we don't call setSafe for null values, but we do call setValueCount when the current batch is processed. 
+ vector.setValueCount(vector.getValueCapacity() + overLimitIndex); + } + } + + @Test + public void testSetSafeWithArrowBufNoExcessAllocs() { + final int numValues = BaseVariableWidthViewVector.INITIAL_VALUE_ALLOCATION * 2; + final byte[] valueBytes = "hello world!!!".getBytes(StandardCharsets.UTF_8); + final int valueBytesLength = valueBytes.length; + final int isSet = 1; + try (final ViewVarCharVector fromVector = + newVector( + ViewVarCharVector.class, + EMPTY_SCHEMA_PATH, + Types.MinorType.VIEWVARCHAR, + allocator); + final ViewVarCharVector toVector = + newVector( + ViewVarCharVector.class, + EMPTY_SCHEMA_PATH, + Types.MinorType.VIEWVARCHAR, + allocator)) { + /* + * Populate the `fromVector` with `numValues` with byte-arrays, each of size `valueBytesLength`. + */ + fromVector.setInitialCapacity(numValues); + fromVector.allocateNew(); + for (int i = 0; i < numValues; ++i) { + fromVector.setSafe(i, valueBytes, 0 /*start*/, valueBytesLength); + } + fromVector.setValueCount(numValues); + ArrowBuf fromDataBuffer = fromVector.getDataBuffer(); + assertTrue(numValues * valueBytesLength <= fromDataBuffer.capacity()); + + /* + * Copy the entries one-by-one from 'fromVector' to 'toVector', but use the setSafe with + * ArrowBuf API (instead of setSafe with byte-array). + */ + toVector.setInitialCapacity(numValues); + toVector.allocateNew(); + for (int i = 0; i < numValues; i++) { + int start = fromVector.getTotalValueLengthUpToIndex(i); + // across variable + // width implementations + int end = fromVector.getTotalValueLengthUpToIndex(i + 1); + toVector.setSafe(i, isSet, start, end, fromDataBuffer); + } + + /* + * Since the 'fromVector' and 'toVector' have the same initial capacity, and were populated + * with the same varchar elements, the allocations and hence, the final capacity should be + * the same. 
+ */ + assertEquals(fromDataBuffer.capacity(), toVector.getDataBuffer().capacity()); + } + } + + @Test + public void testSetLastSetUsage() { + try (final ViewVarCharVector vector = new ViewVarCharVector("myvector", allocator)) { + vector.allocateNew(1024 * 10, 1024); + + setBytes(0, STR1, vector); + setBytes(1, STR2, vector); + setBytes(2, STR3, vector); + setBytes(3, STR4, vector); + + /* Check current lastSet */ + assertEquals(-1, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + + /* + * If we don't call setLastSet(3) before setValueCount(), then the latter will corrupt + * the value vector by filling all positions [0, valueCount-1] with empty byte arrays. + * Run the test with the next line commented out, and we should see incorrect vector output. + */ + vector.setLastSet(3); + vector.setValueCount(20); + + /* Check current lastSet */ + assertEquals(19, vector.getLastSet()); + + /* Check the vector output again */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + + assertEquals(0, vector.getValueLength(4)); + assertEquals(0, vector.getValueLength(5)); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + assertEquals(0, vector.getValueLength(10)); + assertEquals(0, vector.getValueLength(11)); + assertEquals(0, vector.getValueLength(12)); + assertEquals(0, vector.getValueLength(13)); + assertEquals(0, vector.getValueLength(14)); + assertEquals(0, vector.getValueLength(15)); + assertEquals(0, vector.getValueLength(16)); + assertEquals(0, vector.getValueLength(17)); + assertEquals(0, vector.getValueLength(18)); + assertEquals(0, 
vector.getValueLength(19)); + } + } + + @Test + public void testFillEmptiesUsage() { + try (final ViewVarCharVector vector = new ViewVarCharVector("myvector", allocator)) { + vector.allocateNew(1024 * 10, 1024); + + setBytes(0, STR1, vector); + setBytes(1, STR2, vector); + setBytes(2, STR3, vector); + setBytes(3, STR4, vector); + + /* Check current lastSet */ + assertEquals(-1, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + + vector.setLastSet(3); + /* fill empty byte arrays from index [4, 9] */ + vector.fillEmpties(10); + + /* Check current lastSet */ + assertEquals(9, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertEquals(0, vector.getValueLength(4)); + assertEquals(0, vector.getValueLength(5)); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + + setBytes(10, STR1, vector); + setBytes(11, STR2, vector); + + vector.setLastSet(11); + /* fill empty byte arrays from index [12, 14] */ + vector.setValueCount(15); + + /* Check current lastSet */ + assertEquals(14, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertEquals(0, vector.getValueLength(4)); + assertEquals(0, vector.getValueLength(5)); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + assertArrayEquals(STR1, 
vector.get(10)); + assertArrayEquals(STR2, vector.get(11)); + assertEquals(0, vector.getValueLength(12)); + assertEquals(0, vector.getValueLength(13)); + assertEquals(0, vector.getValueLength(14)); + } + } + + @Test + public void testGetBufferAddress1() { + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + + setVector(vector, STR1, STR2, STR3, STR4); + vector.setValueCount(15); + + /* check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + + List buffers = vector.getFieldBuffers(); + long bitAddress = vector.getValidityBufferAddress(); + long dataAddress = vector.getDataBufferAddress(); + + assertEquals(3, buffers.size()); + assertEquals(bitAddress, buffers.get(0).memoryAddress()); + assertEquals(dataAddress, buffers.get(1).memoryAddress()); + } + } + + @Test + public void testSetInitialCapacityInViews() { + try (final ViewVarCharVector vector = new ViewVarCharVector(EMPTY_SCHEMA_PATH, allocator)) { + + /* use the default 16 data bytes on average per element */ + final int viewSize = BaseVariableWidthViewVector.ELEMENT_SIZE; + int defaultCapacity = BaseVariableWidthViewVector.INITIAL_VIEW_VALUE_ALLOCATION / viewSize; + vector.setInitialCapacity(defaultCapacity); + vector.allocateNew(); + assertEquals(defaultCapacity, vector.getValueCapacity()); + assertEquals(CommonUtil.nextPowerOfTwo(defaultCapacity * viewSize), vector.getDataBuffer().capacity()); + + double density = 4.0; + final int valueCount = 5; + vector.setInitialCapacity(valueCount, density); + vector.allocateNew(); + assertEquals(8, vector.getValueCapacity()); + assertEquals(128, vector.getDataBuffer().capacity()); + int initialDataBufferSize = (int) (valueCount * density); + // making sure a databuffer is allocated + vector.set(4, "01234567890123456".getBytes(StandardCharsets.UTF_8)); + assertEquals(vector.dataBuffers.size(), 
1); + ArrowBuf dataBuf = vector.dataBuffers.get(0); + try (ArrowBuf tempBuf = vector.allocator.buffer(initialDataBufferSize)) { + // replicating a new buffer allocation process when a new buffer is added to the + // data buffer when inserting an element with length > 12 + assertEquals(tempBuf.capacity(), dataBuf.capacity()); + } + } + } + + @Test + public void testGetPointerVariableWidthViews() { + final String[] sampleData = new String[]{ + "abc", "1234567890123", "def", null, "hello world java", "aaaaa", "world", "2019", null, "0717"}; + + try (ViewVarCharVector vec1 = new ViewVarCharVector("vec1", allocator); + ViewVarCharVector vec2 = new ViewVarCharVector("vec2", allocator)) { + + vec1.allocateNew((long) sampleData.length * 16, sampleData.length); + vec2.allocateNew((long) sampleData.length * 16, sampleData.length); + + // populate both vectors with identical values, nulls included + for (int i = 0; i < sampleData.length; i++) { + String str = sampleData[i]; + if (str != null) { + vec1.set(i, sampleData[i].getBytes(StandardCharsets.UTF_8)); + vec2.set(i, sampleData[i].getBytes(StandardCharsets.UTF_8)); + } else { + vec1.setNull(i); + + vec2.setNull(i); + } + } + + ArrowBufPointer ptr1 = new ArrowBufPointer(); + ArrowBufPointer ptr2 = new ArrowBufPointer(); + + // pointers to the same logical value held by two separate vectors must compare equal + for (int i = 0; i < sampleData.length; i++) { + vec1.getDataPointer(i, ptr1); + vec2.getDataPointer(i, ptr2); + + assertTrue(ptr1.equals(ptr2)); + // equals() must be symmetric, so also compare in the opposite direction + assertTrue(ptr2.equals(ptr1)); + } + } + } + + @Test + public void testGetNullFromVariableWidthViewVector() { + try (final ViewVarCharVector varCharViewVector = new ViewVarCharVector("viewvarcharvec", allocator); + final ViewVarBinaryVector varBinaryViewVector = new ViewVarBinaryVector("viewvarbinary", allocator)) { + varCharViewVector.allocateNew(16, 1); + varBinaryViewVector.allocateNew(16, 1); + + varCharViewVector.setNull(0); + varBinaryViewVector.setNull(0); + + assertNull(varCharViewVector.get(0)); + assertNull(varBinaryViewVector.get(0)); + } + } + + @Test + public void testVariableWidthViewVectorNullHashCode() { + 
try (ViewVarCharVector viewVarChar = new ViewVarCharVector("view var char vector", allocator)) { + viewVarChar.allocateNew(100, 1); + viewVarChar.setValueCount(1); + + viewVarChar.set(0, "abc".getBytes(StandardCharsets.UTF_8)); + viewVarChar.setNull(0); + + assertEquals(0, viewVarChar.hashCode(0)); + } + } + + @Test + public void testUnloadVariableWidthViewVector() { + try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("view var char", allocator)) { + viewVarCharVector.allocateNew(16, 2); + viewVarCharVector.setValueCount(2); + viewVarCharVector.set(0, "abcd".getBytes(StandardCharsets.UTF_8)); + + List bufs = viewVarCharVector.getFieldBuffers(); + assertEquals(2, bufs.size()); + + ArrowBuf viewBuf = bufs.get(1); + + assertEquals(32, viewBuf.writerIndex()); + final String longString = "012345678901234"; + viewVarCharVector.set(1, longString.getBytes(StandardCharsets.UTF_8)); + + bufs = viewVarCharVector.getFieldBuffers(); + assertEquals(3, bufs.size()); + + ArrowBuf referenceBuf = bufs.get(2); + assertEquals(longString.length(), referenceBuf.writerIndex()); + } + } + + @Test + public void testUnSupportedOffSet() { + // offset is not a feature required in ViewVarCharVector + assertThrows(UnsupportedOperationException.class, () -> { + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + + setVector(vector, STR1, STR2); + vector.setValueCount(2); + + /* check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + + vector.getOffsetBuffer(); + } + }); + } + + private void validateViewBuffer(int index, ViewVarCharVector vector, byte[] expectedData, + int expectedBufId, int expectedOffSet) { + final ArrowBuf viewBuffer = vector.viewBuffer; + int writePosition = index * BaseVariableWidthViewVector.ELEMENT_SIZE; + final int prefixBufWidth = BaseVariableWidthViewVector.PREFIX_WIDTH; + final int lengthBufWidth = BaseVariableWidthViewVector.LENGTH_WIDTH; + int length 
= viewBuffer.getInt(writePosition); + + // validate length of the view + assertEquals(expectedData.length, length); + + byte[] prefixBytes = new byte[prefixBufWidth]; + viewBuffer.getBytes(writePosition + lengthBufWidth, prefixBytes); + + // validate the prefix + // values shorter than the prefix width fill only part of it; the remaining expected bytes + // stay zero (assumes the view zero-pads short values - TODO confirm) + byte[] expectedPrefixBytes = new byte[prefixBufWidth]; + System.arraycopy(expectedData, 0, expectedPrefixBytes, 0, Math.min(expectedData.length, prefixBufWidth)); + assertArrayEquals(expectedPrefixBytes, prefixBytes); + + // only values longer than 12 bytes carry a buffer index and an offset in the view + if (length > 12) { + // validate bufId + int bufId = viewBuffer.getInt(writePosition + lengthBufWidth + prefixBufWidth); + assertEquals(expectedBufId, bufId); + // validate offset + int offset = viewBuffer.getInt(writePosition + + lengthBufWidth + + prefixBufWidth + + BaseVariableWidthViewVector.BUF_INDEX_WIDTH); + assertEquals(expectedOffSet, offset); + } + // validate retrieved data + assertArrayEquals(expectedData, vector.get(index)); + } + + @Test + public void testOverwriteShortFromLongString() { + /*NA: not applicable */ + // Overwriting at the beginning of the buffer. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 1); + // set short string + vector.set(0, STR0); + vector.setValueCount(1); + assertEquals(0, vector.dataBuffers.size()); + assertArrayEquals(STR0, vector.get(0)); + + validateViewBuffer(0, vector, STR0, /*NA*/-1, /*NA*/-1); + + // set long string + vector.set(0, STR3); + vector.setValueCount(1); + assertEquals(1, vector.dataBuffers.size()); + assertArrayEquals(STR3, vector.get(0)); + + validateViewBuffer(0, vector, STR3, 0, 0); + } + + // Overwriting in the middle of the buffer when existing buffers are all shorts. 
+ try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(48, 3); + // set short string 1 + vector.set(0, STR0); + // set short string 2 + vector.set(1, STR5); + // set short string 3 + vector.set(2, STR6); + vector.setValueCount(3); + + // overwrite index 1 with a long string + vector.set(1, STR7); + vector.setValueCount(3); + + validateViewBuffer(0, vector, STR0, /*NA*/-1, /*NA*/-1); + validateViewBuffer(1, vector, STR7, 0, 0); + validateViewBuffer(2, vector, STR6, /*NA*/-1, /*NA*/-1); + } + + // Overwriting in the middle of the buffer with a mix of short and long strings. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(80, 5); + // set short string 1 + vector.set(0, STR0); + // set long string 1 + vector.set(1, STR3); + // set short string 2 + vector.set(2, STR5); + // set short string 3 + vector.set(3, STR6); + // set long string 2 + vector.set(4, STR7); + vector.setValueCount(5); + + // overwrite index 2 with a long string + vector.set(2, STR8); + vector.setValueCount(5); + + validateViewBuffer(0, vector, STR0, /*NA*/-1, /*NA*/-1); + validateViewBuffer(1, vector, STR3, 0, 0); + // Since we did overwrite index 2 with STR8, and as we are using append-only approach, + // it will be appended to the data buffer. + // Thus, it will be stored in the dataBuffer in order i.e. [STR3, STR7, STR8]. + validateViewBuffer(2, vector, STR8, 0, STR3.length + STR7.length); + validateViewBuffer(3, vector, STR6, /*NA*/-1, /*NA*/-1); + validateViewBuffer(4, vector, STR7, 0, STR3.length); + } + + // Overwriting in the middle of the buffer with a mix of short and long strings. + // Here the short string is overwritten with a long string, and its length is larger than + // the remaining capacity of the existing data buffer. + // This would allocate a new buffer in the data buffers. 
+ try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(80, 5); + // set short string 1 + vector.set(0, STR0); + // set long string 1 + vector.set(1, STR3); + // set short string 2 + vector.set(2, STR5); + // set short string 3 + vector.set(3, STR6); + // set long string 2 + vector.set(4, STR7); + + vector.setValueCount(5); + + // overwrite index 2 with a long string + String longString = generateRandomString(128); + byte[] longStringBytes = longString.getBytes(StandardCharsets.UTF_8); + // since the append-only approach is used and the remaining capacity + // is not enough to store the new string; a new buffer will be allocated. + final ArrowBuf currentDataBuf = vector.dataBuffers.get(0); + final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex(); + assertTrue(remainingCapacity < longStringBytes.length); + vector.set(2, longStringBytes); + vector.setValueCount(5); + + validateViewBuffer(0, vector, STR0, /*NA*/-1, /*NA*/-1); + validateViewBuffer(1, vector, STR3, 0, 0); + // overwritten long string will be stored in the new data buffer. + validateViewBuffer(2, vector, longStringBytes, 1, 0); + validateViewBuffer(3, vector, STR6, /*NA*/-1, /*NA*/-1); + validateViewBuffer(4, vector, STR7, 0, STR3.length); + } + } + + @Test + public void testOverwriteLongFromShortString() { + // Overwriting at the beginning of the buffer. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 1); + // set long string + vector.set(0, STR3); + vector.setValueCount(1); + // overwrite it with a short string + vector.set(0, STR0); + vector.setValueCount(1); + + validateViewBuffer(0, vector, STR0, /*NA*/-1, /*NA*/-1); + } + + // Overwriting in the middle of the buffer when existing buffers are all longs. 
+ try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(48, 3); + // set long string 1 + vector.set(0, STR3); + // set long string 2 + vector.set(1, STR8); + // set long string 3 + vector.set(2, STR7); + vector.setValueCount(3); + + // overwrite index 1 with a short string + vector.set(1, STR6); + vector.setValueCount(3); + + validateViewBuffer(0, vector, STR3, 0, 0); + validateViewBuffer(1, vector, STR6, /*NA*/-1, /*NA*/-1); + // since the append-only approach is used, + // STR8 will still be in the first data buffer in dataBuffers. + validateViewBuffer(2, vector, STR7, 0, STR3.length + STR8.length); + } + + // Overwriting in the middle of the buffer with a mix of short and long strings. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(80, 5); + // set long string 1 + vector.set(0, STR3); + // set short string 1 + vector.set(1, STR5); + // set long string 2 + vector.set(2, STR7); + // set long string 3 + vector.set(3, STR8); + // set short string 2 + vector.set(4, STR6); + vector.setValueCount(5); + + // overwrite index 2 with a short string + vector.set(2, STR0); + vector.setValueCount(5); + + validateViewBuffer(0, vector, STR3, 0, 0); + validateViewBuffer(1, vector, STR5, /*NA*/-1, /*NA*/-1); + validateViewBuffer(2, vector, STR0, /*NA*/-1, /*NA*/-1); + // since the append-only approach is used, + // STR7 will still be in the first data buffer in dataBuffers. + validateViewBuffer(3, vector, STR8, 0, STR3.length + STR7.length); + validateViewBuffer(4, vector, STR6, /*NA*/-1, /*NA*/-1); + } + } + + @Test + public void testOverwriteLongFromAShorterLongString() { + // Overwriting at the beginning of the buffer. 
+ try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 1); + // set long string + vector.set(0, STR7); + vector.setValueCount(1); + // set shorter long string, since append-only approach is used and the remaining capacity + // is not enough to store the new string; a new buffer will be allocated. + final ArrowBuf currentDataBuf = vector.dataBuffers.get(0); + final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex(); + assertTrue(remainingCapacity < STR3.length); + // set shorter long string + vector.set(0, STR3); + vector.setValueCount(1); + + validateViewBuffer(0, vector, STR3, 1, 0); + } + + // Overwriting in the middle of the buffer when existing buffers are all longs. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + // extra memory is allocated + vector.allocateNew(128, 3); + // set long string 1 + vector.set(0, STR3); + // set long string 2 + vector.set(1, STR8); + // set long string 3 + vector.set(2, STR7); + vector.setValueCount(3); + + // overwrite index 1 with a shorter long string + // Since append-only approach is used + // and the remaining capacity is enough to store in the same data buffer.; + final ArrowBuf currentDataBuf = vector.dataBuffers.get(0); + final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex(); + assertTrue(remainingCapacity > STR2.length); + vector.set(1, STR2); + vector.setValueCount(3); + + validateViewBuffer(0, vector, STR3, 0, 0); + // since the append-only approach is used, + // STR8 will still be in the first data buffer in dataBuffers. + validateViewBuffer(1, vector, STR2, 0, STR3.length + STR8.length + STR7.length); + validateViewBuffer(2, vector, STR7, 0, STR3.length + STR8.length); + } + + // Overwriting in the middle of the buffer with a mix of short and long strings. 
+ try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(128, 5); + // set long string 1 + vector.set(0, STR3); + // set short string 1 + vector.set(1, STR5); + // set long string 2 + vector.set(2, STR7); + // set long string 3 + vector.set(3, STR8); + // set short string 2 + vector.set(4, STR6); + vector.setValueCount(5); + + // overwrite index 2 with a shorter long string + // Since append-only approach is used + // and the remaining capacity is enough to store in the same data buffer.; + final ArrowBuf currentDataBuf = vector.dataBuffers.get(0); + final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex(); + assertTrue(remainingCapacity > STR2.length); + vector.set(2, STR2); + vector.setValueCount(5); + + validateViewBuffer(0, vector, STR3, 0, 0); + validateViewBuffer(1, vector, STR5, /*NA*/-1, /*NA*/-1); + // since the append-only approach is used, + // STR7 will still be in the first data buffer in dataBuffers. + validateViewBuffer(2, vector, STR2, 0, STR3.length + + STR7.length + STR8.length); + validateViewBuffer(3, vector, STR8, 0, STR3.length + STR7.length); + validateViewBuffer(4, vector, STR6, /*NA*/-1, /*NA*/-1); + } + } + + @Test + public void testOverwriteLongFromALongerLongString() { + // Overwriting at the beginning of the buffer. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 1); + // set long string + vector.set(0, STR3); + vector.setValueCount(1); + // set longer long string, since append-only approach is used and the remaining capacity + // is not enough to store the new string; a new buffer will be allocated. 
+ final ArrowBuf currentDataBuf = vector.dataBuffers.get(0); + final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex(); + assertTrue(remainingCapacity < STR7.length); + // set longer long string + vector.set(0, STR7); + vector.setValueCount(1); + + validateViewBuffer(0, vector, STR7, 1, 0); + } + + // Overwriting in the middle of the buffer when existing buffers are all longs. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + // extra memory is allocated + vector.allocateNew(48, 3); + // set long string 1 + vector.set(0, STR3); + // set long string 2 + vector.set(1, STR8); + // set long string 3 + vector.set(2, STR7); + vector.setValueCount(3); + + // overwrite index 1 with a longer long string + // the remaining capacity is not enough to store in the same data buffer + // since a new buffer is added to the dataBuffers + final ArrowBuf currentDataBuf = vector.dataBuffers.get(0); + final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex(); + String longerString = generateRandomString(35); + byte[] longerStringBytes = longerString.getBytes(StandardCharsets.UTF_8); + assertTrue(remainingCapacity < longerStringBytes.length); + + vector.set(1, longerStringBytes); + vector.setValueCount(3); + + validateViewBuffer(0, vector, STR3, 0, 0); + validateViewBuffer(1, vector, longerStringBytes, 1, 0); + // since the append-only approach is used, + // STR8 will still be in the first data buffer in dataBuffers. + validateViewBuffer(2, vector, STR7, 0, STR3.length + STR8.length); + } + + // Overwriting in the middle of the buffer with a mix of short and long strings. 
+ try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(128, 5); + // set long string 1 + vector.set(0, STR3); + // set short string 1 + vector.set(1, STR5); + // set long string 2 + vector.set(2, STR7); + // set long string 3 + vector.set(3, STR2); + // set short string 2 + vector.set(4, STR6); + vector.setValueCount(5); + + // overwrite index 2 with a longer long string + // the remaining capacity is enough to store in the same data buffer + final ArrowBuf currentDataBuf = vector.dataBuffers.get(0); + final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex(); + String longerString = generateRandomString(24); + byte[] longerStringBytes = longerString.getBytes(StandardCharsets.UTF_8); + assertTrue(remainingCapacity > longerStringBytes.length); + + vector.set(2, longerStringBytes); + vector.setValueCount(5); + + validateViewBuffer(0, vector, STR3, 0, 0); + validateViewBuffer(1, vector, STR5, /*NA*/-1, /*NA*/-1); + // since the append-only approach is used, + // STR7 will still be in the first data buffer in dataBuffers. + validateViewBuffer(2, vector, longerStringBytes, 0, STR3.length + STR7.length + STR2.length); + validateViewBuffer(3, vector, STR2, 0, STR3.length + STR7.length); + validateViewBuffer(4, vector, STR6, /*NA*/-1, /*NA*/-1); + } + } + + @Test + public void testSafeOverwriteShortFromLongString() { + /*NA: not applicable */ + // Overwriting at the beginning of the buffer. 
+ try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 1); + // set short string + vector.setSafe(0, STR0); + vector.setValueCount(1); + assertEquals(0, vector.dataBuffers.size()); + assertArrayEquals(STR0, vector.get(0)); + + // set long string + vector.setSafe(0, STR3); + vector.setValueCount(1); + assertEquals(1, vector.dataBuffers.size()); + assertArrayEquals(STR3, vector.get(0)); + + } + + // Overwriting in the middle of the buffer when existing buffers are all shorts. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 3); + // set short string 1 + vector.setSafe(0, STR0); + // set short string 2 + vector.setSafe(1, STR5); + // set short string 3 + vector.setSafe(2, STR6); + vector.setValueCount(3); + + // overwrite index 1 with a long string + vector.setSafe(1, STR7); + vector.setValueCount(3); + + assertArrayEquals(STR0, vector.get(0)); + assertArrayEquals(STR7, vector.get(1)); + assertArrayEquals(STR6, vector.get(2)); + } + + // Overwriting in the middle of the buffer with a mix of short and long strings. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 5); + // set short string 1 + vector.setSafe(0, STR0); + // set long string 1 + vector.setSafe(1, STR3); + // set short string 2 + vector.setSafe(2, STR5); + // set short string 3 + vector.setSafe(3, STR6); + // set long string 2 + vector.setSafe(4, STR7); + vector.setValueCount(5); + + // overwrite index 2 with a long string + vector.setSafe(2, STR8); + vector.setValueCount(5); + + assertArrayEquals(STR0, vector.get(0)); + assertArrayEquals(STR3, vector.get(1)); + assertArrayEquals(STR8, vector.get(2)); + assertArrayEquals(STR6, vector.get(3)); + assertArrayEquals(STR7, vector.get(4)); + } + + // Overwriting in the middle of the buffer with a mix of short and long strings. 
+ try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 5); + // set short string 1 + vector.setSafe(0, STR0); + // set long string 1 + vector.setSafe(1, STR3); + // set short string 2 + vector.setSafe(2, STR5); + // set short string 3 + vector.setSafe(3, STR6); + // set long string 2 + vector.setSafe(4, STR7); + + vector.setValueCount(5); + + // overwrite index 2 with a long string + String longString = generateRandomString(128); + byte[] longStringBytes = longString.getBytes(StandardCharsets.UTF_8); + + vector.setSafe(2, longStringBytes); + vector.setValueCount(5); + + assertArrayEquals(STR0, vector.get(0)); + assertArrayEquals(STR3, vector.get(1)); + assertArrayEquals(longStringBytes, vector.get(2)); + assertArrayEquals(STR6, vector.get(3)); + assertArrayEquals(STR7, vector.get(4)); + } + } + + @Test + public void testSafeOverwriteLongFromShortString() { + // Overwriting at the beginning of the buffer. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 1); + // set long string + vector.setSafe(0, STR3); + vector.setValueCount(1); + // overwrite it with a short string + vector.setSafe(0, STR0); + vector.setValueCount(1); + + assertArrayEquals(STR0, vector.get(0)); + } + + // Overwriting in the middle of the buffer when existing buffers are all longs. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 3); + // set long string 1 + vector.setSafe(0, STR3); + // set long string 2 + vector.setSafe(1, STR8); + // set long string 3 + vector.setSafe(2, STR7); + vector.setValueCount(3); + + // overwrite index 1 with a short string + vector.setSafe(1, STR6); + vector.setValueCount(3); + + assertArrayEquals(STR3, vector.get(0)); + assertArrayEquals(STR6, vector.get(1)); + assertArrayEquals(STR7, vector.get(2)); + } + + // Overwriting in the middle of the buffer with a mix of short and long strings. 
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      vector.allocateNew(16, 5);
+      // set long string 1
+      vector.setSafe(0, STR3);
+      // set short string 1
+      vector.setSafe(1, STR5);
+      // set long string 2
+      vector.setSafe(2, STR7);
+      // set long string 3
+      vector.setSafe(3, STR8);
+      // set short string 2
+      vector.setSafe(4, STR6);
+      vector.setValueCount(5);
+
+      // overwrite index 2 with a short string
+      vector.setSafe(2, STR0);
+      vector.setValueCount(5);
+
+      assertArrayEquals(STR3, vector.get(0));
+      assertArrayEquals(STR5, vector.get(1));
+      assertArrayEquals(STR0, vector.get(2));
+      assertArrayEquals(STR8, vector.get(3));
+      assertArrayEquals(STR6, vector.get(4));
+    }
+  }
+
+  /**
+   * Checks that {@code setSafe} can replace a value already stored at an index with one of the
+   * constants labelled "shorter long" below (STR3 or STR2), and that every slot, overwritten or
+   * not, still reads back the expected bytes.
+   *
+   * <p>Three layouts are exercised: a single-value vector, a three-value vector whose entries are
+   * all labelled long, and a five-value vector mixing entries labelled short and long.
+   *
+   * <p>NOTE(review): "short"/"long" presumably means at most / more than 12 bytes (the inline limit
+   * of a view vector) - confirm against the STR* constants, which are defined outside this hunk.
+   */
+  @Test
+  public void testSafeOverwriteLongFromAShorterLongString() {
+    // Overwriting at the beginning of the buffer.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      // NOTE(review): arguments are presumably (data bytes, value count) - confirm
+      vector.allocateNew(16, 1);
+      // set long string
+      vector.setSafe(0, STR7);
+      vector.setValueCount(1);
+      // set shorter long string
+      vector.setSafe(0, STR3);
+      vector.setValueCount(1);
+
+      // only the replacement value must be visible at index 0
+      assertArrayEquals(STR3, vector.get(0));
+    }
+
+    // Overwriting in the middle of the buffer when existing buffers are all longs.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      // extra memory is allocated
+      vector.allocateNew(16, 3);
+      // set long string 1
+      vector.setSafe(0, STR3);
+      // set long string 2
+      vector.setSafe(1, STR8);
+      // set long string 3
+      vector.setSafe(2, STR7);
+      vector.setValueCount(3);
+
+      // overwrite index 1 with a shorter long string
+      vector.setSafe(1, STR2);
+      vector.setValueCount(3);
+
+      // neighbours at index 0 and 2 must be unaffected by the overwrite
+      assertArrayEquals(STR3, vector.get(0));
+      assertArrayEquals(STR2, vector.get(1));
+      assertArrayEquals(STR7, vector.get(2));
+    }
+
+    // Overwriting in the middle of the buffer with a mix of short and long strings.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      vector.allocateNew(16, 5);
+      // set long string 1
+      vector.setSafe(0, STR3);
+      // set short string 1
+      vector.setSafe(1, STR5);
+      // set long string 2
+      vector.setSafe(2, STR7);
+      // set long string 3
+      vector.setSafe(3, STR8);
+      // set short string 2
+      vector.setSafe(4, STR6);
+      vector.setValueCount(5);
+
+      // overwrite index 2 with a shorter long string
+      vector.setSafe(2, STR2);
+      vector.setValueCount(5);
+
+      // all five slots are re-read; only index 2 should have changed
+      assertArrayEquals(STR3, vector.get(0));
+      assertArrayEquals(STR5, vector.get(1));
+      assertArrayEquals(STR2, vector.get(2));
+      assertArrayEquals(STR8, vector.get(3));
+      assertArrayEquals(STR6, vector.get(4));
+    }
+  }
+
+  /**
+   * Checks that {@code setSafe} can replace a value already stored at an index with a replacement
+   * labelled "longer" below, and that every slot still reads back the expected bytes.
+   *
+   * <p>The first case replaces STR3 with STR7. The other two replace the stored value with a
+   * freshly generated digit string (35 and 24 characters respectively), so the replacement bytes
+   * differ from run to run.
+   *
+   * <p>NOTE(review): the generated lengths are assumed to exceed those of STR8 / STR7, which are
+   * defined outside this hunk - confirm.
+   */
+  @Test
+  public void testSafeOverwriteLongFromALongerLongString() {
+    // Overwriting at the beginning of the buffer.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      vector.allocateNew(16, 1);
+      // set long string
+      vector.setSafe(0, STR3);
+      vector.setValueCount(1);
+      // set longer long string
+      vector.setSafe(0, STR7);
+      vector.setValueCount(1);
+
+      assertArrayEquals(STR7, vector.get(0));
+    }
+
+    // Overwriting in the middle of the buffer when existing buffers are all longs.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      // extra memory is allocated
+      vector.allocateNew(16, 3);
+      // set long string 1
+      vector.setSafe(0, STR3);
+      // set long string 2
+      vector.setSafe(1, STR8);
+      // set long string 3
+      vector.setSafe(2, STR7);
+      vector.setValueCount(3);
+
+      // replacement for index 1: 35 random digits (ASCII, so also 35 UTF-8 bytes)
+      String longerString = generateRandomString(35);
+      byte[] longerStringBytes = longerString.getBytes(StandardCharsets.UTF_8);
+
+      vector.setSafe(1, longerStringBytes);
+      vector.setValueCount(3);
+
+      // neighbours at index 0 and 2 must be unaffected by the overwrite
+      assertArrayEquals(STR3, vector.get(0));
+      assertArrayEquals(longerStringBytes, vector.get(1));
+      assertArrayEquals(STR7, vector.get(2));
+    }
+
+    // Overwriting in the middle of the buffer with a mix of short and long strings.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      vector.allocateNew(16, 5);
+      // set long string 1
+      vector.setSafe(0, STR3);
+      // set short string 1
+      vector.setSafe(1, STR5);
+      // set long string 2
+      vector.setSafe(2, STR7);
+      // set long string 3 (STR2 here, where the other mixed-layout cases use STR8)
+      vector.setSafe(3, STR2);
+      // set short string 2
+      vector.setSafe(4, STR6);
+      vector.setValueCount(5);
+
+      // replacement for index 2: 24 random digits (ASCII, so also 24 UTF-8 bytes)
+      String longerString = generateRandomString(24);
+      byte[] longerStringBytes = longerString.getBytes(StandardCharsets.UTF_8);
+
+      vector.setSafe(2, longerStringBytes);
+      vector.setValueCount(5);
+
+      // all five slots are re-read; only index 2 should have changed
+      assertArrayEquals(STR3, vector.get(0));
+      assertArrayEquals(STR5, vector.get(1));
+      assertArrayEquals(longerStringBytes, vector.get(2));
+      assertArrayEquals(STR2, vector.get(3));
+      assertArrayEquals(STR6, vector.get(4));
+
+    }
+  }
+
+  /**
+   * Builds a string of {@code length} decimal digits drawn from a new, unseeded {@link Random}, so
+   * the content is not reproducible between calls or runs.
+   *
+   * @param length number of digit characters to generate
+   * @return the generated digit string; empty when {@code length} is 0
+   */
+  private String generateRandomString(int length) {
+    Random random = new Random();
+    StringBuilder sb = new StringBuilder(length);
+    for (int i = 0; i < length; i++) {
+      sb.append(random.nextInt(10)); // appends one decimal digit, 0-9
+    }
+    return sb.toString();
+  }
+}
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java
b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java
index 9bfcb3c635d8..45e6e630792a 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java
@@ -61,6 +61,7 @@
 import org.apache.arrow.vector.UInt8Vector;
 import org.apache.arrow.vector.VarBinaryVector;
 import org.apache.arrow.vector.VarCharVector;
+import org.apache.arrow.vector.VariableWidthFieldVector;
 import org.apache.arrow.vector.complex.BaseRepeatedValueVector;
 import org.apache.arrow.vector.complex.FixedSizeListVector;
 import org.apache.arrow.vector.complex.LargeListVector;
@@ -586,6 +587,27 @@ public static void setVector(VarCharVector vector, byte[]... values) {
     vector.setValueCount(length);
   }
 
+  /**
+   * Populate values for any VariableWidthFieldVector.
+   *
+   * <p>Null entries in {@code values} are skipped, so nothing is written to those slots. Values
+   * are written with {@code setSafe} so that inputs larger than the default allocation grow the
+   * vector instead of overrunning it.
+   *
+   * @param vector vector to allocate and fill
+   * @param values value for each index, in order; {@code null} marks a slot to skip
+   */
+  public static void setVector(VariableWidthFieldVector vector, byte[]... values) {
+    final int length = values.length;
+    vector.allocateNewSafe();
+    for (int i = 0; i < length; i++) {
+      if (values[i] != null) {
+        vector.setSafe(i, values[i]);
+      }
+    }
+    vector.setValueCount(length);
+  }
+
   /**
    * Populate values for LargeVarCharVector.
    */