diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 0317879b580..85889901d92 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -63,6 +63,7 @@ jobs: env: JDK: ${{ matrix.jdk }} MAVEN: ${{ matrix.maven }} + MAVEN_OPTS: -Darrow.memory.debug.allocator=true steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -86,6 +87,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} DEVELOCITY_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + JAVA_TOOL_OPTIONS: -Darrow.memory.debug.allocator=true run: | archery docker run \ -e CI=true \ diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index 212ec6eb114..0bd22e138e0 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -96,13 +96,13 @@ if [ "${ARROW_JAVA_JNI}" = "ON" ]; then fi # Use `2 * ncores` threads -${mvn} -T 2C clean install +${mvn} -T 2C clean install -Darrow.memory.debug.allocator=true if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then # HTTP pooling is turned of to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 # GH-43378: Maven site plugins not compatible with multithreading mkdir -p ${build_dir}/docs/java/reference - ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false clean install site + ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false -Darrow.memory.debug.allocator=true clean install site rsync -a target/site/apidocs/ ${build_dir}/docs/java/reference fi diff --git a/java/c/src/main/java/org/apache/arrow/c/ArrayImporter.java b/java/c/src/main/java/org/apache/arrow/c/ArrayImporter.java index b74fb1b4734..a09c4f90980 100644 --- a/java/c/src/main/java/org/apache/arrow/c/ArrayImporter.java +++ b/java/c/src/main/java/org/apache/arrow/c/ArrayImporter.java @@ -98,7 +98,11 @@ private void doImport(ArrowArray.Snapshot snapshot) { checkState(children[i] != 
NULL, "ArrowArray struct has NULL child at position %s", i); ArrayImporter childImporter = new ArrayImporter(allocator, childVectors.get(i), dictionaryProvider); - childImporter.importChild(this, ArrowArray.wrap(children[i])); + ArrowArray childArray = ArrowArray.wrap(children[i]); + ArrowArray.Snapshot childSnapshot = childArray.snapshot(); + childSnapshot.offset = snapshot.offset; + childArray.save(childSnapshot); + childImporter.importChild(this, childArray); } } @@ -124,7 +128,8 @@ private void doImport(ArrowArray.Snapshot snapshot) { NativeUtil.toJavaArray(snapshot.buffers, checkedCastToInt(snapshot.n_buffers)); try (final BufferImportTypeVisitor visitor = - new BufferImportTypeVisitor(allocator, underlyingAllocation, fieldNode, bufferPointers)) { + new BufferImportTypeVisitor( + allocator, underlyingAllocation, fieldNode, snapshot.offset, bufferPointers)) { final List buffers; if (bufferPointers == null || bufferPointers.length == 0) { buffers = Collections.emptyList(); diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index 633ecd43bd5..1b3a21fe4b0 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -28,6 +28,7 @@ import org.apache.arrow.util.AutoCloseables; import org.apache.arrow.util.VisibleForTesting; import org.apache.arrow.vector.BaseVariableWidthViewVector; +import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DateMilliVector; import org.apache.arrow.vector.DurationVector; @@ -59,6 +60,7 @@ class BufferImportTypeVisitor implements ArrowType.ArrowTypeVisitor imported; @@ -66,10 +68,12 @@ class BufferImportTypeVisitor implements ArrowType.ArrowTypeVisitor(); } @@ -110,14 +114,63 @@ private ArrowBuf importFixedBits(ArrowType type, int index, long bitsPerSlot) { return 
importBuffer(type, index, capacity); } + private ArrowBuf importFixedBitsWithOffset(ArrowType type, int index, long bitsPerSlot) { + // TODO: merge with importFixedBits + // Calculate the total capacity needed, including the offset + final long totalSlots = arrowArrayOffset + fieldNode.getLength(); + final long totalBits = totalSlots * bitsPerSlot; + final long capacity = DataSizeRoundingUtil.divideBy8Ceil(totalBits); + + // Import the buffer with the calculated capacity + ArrowBuf buf = importBuffer(type, index, capacity); + + // Calculate the start and end positions in bits + final long startBit = arrowArrayOffset * bitsPerSlot; + final long endBit = (arrowArrayOffset + fieldNode.getLength()) * bitsPerSlot; + + // Calculate the start and end positions in bytes + // TODO: this cannot process bit boundaries in slicing + final long startByte = DataSizeRoundingUtil.divideBy8Ceil(startBit); + final long endByte = DataSizeRoundingUtil.divideBy8Ceil(endBit); + + if (startByte != endByte) { + return buf.slice(startByte, endByte - startByte); + } else { + ArrowBuf bufCopy = allocator.buffer(buf.capacity()); + bufCopy.setZero(0, buf.capacity()); + for (int i = 0; i < bufCopy.capacity() * 8; i++) { + int bitIndex = (int) (i + arrowArrayOffset); + if (bitIndex < buf.capacity() * 8) { + if (BitVectorHelper.get(buf, bitIndex) == 1) { + BitVectorHelper.setBit(bufCopy, i); + } else { + BitVectorHelper.unsetBit(bufCopy, i); + } + } else { + BitVectorHelper.unsetBit(bufCopy, i); + } + } + imported.add(bufCopy); + return bufCopy; + } + } + private ArrowBuf importFixedBytes(ArrowType type, int index, long bytesPerSlot) { final long capacity = bytesPerSlot * fieldNode.getLength(); return importBuffer(type, index, capacity); } + private ArrowBuf importFixedBytesWithOffset(ArrowType type, int index, long bytesPerSlot) { + final long capacity = bytesPerSlot * (fieldNode.getLength() + arrowArrayOffset); + ArrowBuf buf = importBuffer(type, index, capacity); + return 
buf.slice(arrowArrayOffset * bytesPerSlot, fieldNode.getLength() * bytesPerSlot); + } + private ArrowBuf importOffsets(ArrowType type, long bytesPerSlot) { - final long capacity = bytesPerSlot * (fieldNode.getLength() + 1); - return importBuffer(type, 1, capacity); + final long capacity = bytesPerSlot * (fieldNode.getLength() + arrowArrayOffset + 1); + ArrowBuf offsets = importBuffer(type, 1, capacity); + return offsets.slice( + arrowArrayOffset * bytesPerSlot, (long) (fieldNode.getLength() + 1) * bytesPerSlot); } private ArrowBuf importData(ArrowType type, long capacity) { @@ -137,6 +190,20 @@ private ArrowBuf maybeImportBitmap(ArrowType type) { return importFixedBits(type, 0, /*bitsPerSlot=*/ 1); } + private ArrowBuf maybeImportBitmapWithOffset(ArrowType type) { + // TODO: merge with maybeImportBitMap + checkState( + buffers.length > 0, + "Expected at least %s buffers for type %s, but found %s", + 1, + type, + buffers.length); + if (buffers[0] == NULL) { + return null; + } + return importFixedBitsWithOffset(type, 0, /*bitsPerSlot=*/ 1); + } + @Override public List visit(ArrowType.Null type) { checkState( @@ -155,18 +222,19 @@ public List visit(ArrowType.Struct type) { @Override public List visit(ArrowType.List type) { - return Arrays.asList(maybeImportBitmap(type), importOffsets(type, ListVector.OFFSET_WIDTH)); + return Arrays.asList( + maybeImportBitmapWithOffset(type), importOffsets(type, ListVector.OFFSET_WIDTH)); } @Override public List visit(ArrowType.LargeList type) { return Arrays.asList( - maybeImportBitmap(type), importOffsets(type, LargeListVector.OFFSET_WIDTH)); + maybeImportBitmapWithOffset(type), importOffsets(type, LargeListVector.OFFSET_WIDTH)); } @Override public List visit(ArrowType.FixedSizeList type) { - return Collections.singletonList(maybeImportBitmap(type)); + return Collections.singletonList(maybeImportBitmapWithOffset(type)); } @Override @@ -190,7 +258,8 @@ public List visit(ArrowType.Map type) { @Override public List visit(ArrowType.Int 
type) { - return Arrays.asList(maybeImportBitmap(type), importFixedBits(type, 1, type.getBitWidth())); + return Arrays.asList( + maybeImportBitmapWithOffset(type), importFixedBitsWithOffset(type, 1, type.getBitWidth())); } @Override @@ -212,19 +281,16 @@ public List visit(ArrowType.FloatingPoint type) { @Override public List visit(ArrowType.Utf8 type) { - try (ArrowBuf offsets = importOffsets(type, VarCharVector.OFFSET_WIDTH)) { - final int start = offsets.getInt(0); - final int end = offsets.getInt(fieldNode.getLength() * (long) VarCharVector.OFFSET_WIDTH); - checkState( - end >= start, - "Offset buffer for type %s is malformed: start: %s, end: %s", - type, - start, - end); - final int len = end - start; - offsets.getReferenceManager().retain(); - return Arrays.asList(maybeImportBitmap(type), offsets, importData(type, len)); - } + ArrowBuf offsets = importOffsets(type, VarCharVector.OFFSET_WIDTH); + final int start = offsets.getInt(0); + final int end = offsets.getInt((fieldNode.getLength()) * (long) VarCharVector.OFFSET_WIDTH); + checkState( + end >= start, + "Offset buffer for type %s is malformed: start: %s, end: %s", + type, + start, + end); + return Arrays.asList(maybeImportBitmapWithOffset(type), offsets, importData(type, end)); } private List visitVariableWidthView(ArrowType type) { @@ -238,8 +304,8 @@ private List visitVariableWidthView(ArrowType type) { importBuffer(type, variadicSizeBufferIndex, variadicSizeBufferCapacity); ArrowBuf view = - importFixedBytes(type, viewBufferIndex, BaseVariableWidthViewVector.ELEMENT_SIZE); - buffers.add(maybeImportBitmap(type)); + importFixedBytesWithOffset(type, viewBufferIndex, BaseVariableWidthViewVector.ELEMENT_SIZE); + buffers.add(maybeImportBitmapWithOffset(type)); buffers.add(view); // 0th buffer is validity buffer @@ -280,19 +346,16 @@ public List visit(ArrowType.LargeUtf8 type) { @Override public List visit(ArrowType.Binary type) { - try (ArrowBuf offsets = importOffsets(type, VarBinaryVector.OFFSET_WIDTH)) { - 
final int start = offsets.getInt(0); - final int end = offsets.getInt(fieldNode.getLength() * (long) VarBinaryVector.OFFSET_WIDTH); - checkState( - end >= start, - "Offset buffer for type %s is malformed: start: %s, end: %s", - type, - start, - end); - final int len = end - start; - offsets.getReferenceManager().retain(); - return Arrays.asList(maybeImportBitmap(type), offsets, importData(type, len)); - } + ArrowBuf offsets = importOffsets(type, VarBinaryVector.OFFSET_WIDTH); + final int start = offsets.getInt(0); + final int end = offsets.getInt(fieldNode.getLength() * (long) VarBinaryVector.OFFSET_WIDTH); + checkState( + end >= start, + "Offset buffer for type %s is malformed: start: %s, end: %s", + type, + start, + end); + return Arrays.asList(maybeImportBitmapWithOffset(type), offsets, importData(type, end)); } @Override diff --git a/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java b/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java index 1d4cb411fab..7a8850bd091 100644 --- a/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java @@ -58,7 +58,7 @@ void importBuffer() throws Exception { // Note values are all dummy values here try (BufferImportTypeVisitor notEmptyDataVisitor = new BufferImportTypeVisitor( - allocator, dummyHandle, new ArrowFieldNode(/* length= */ 1, 0), new long[] {0})) { + allocator, dummyHandle, new ArrowFieldNode(/* length= */ 1, 0), 0, new long[] {0})) { // Too few buffers assertThrows( @@ -82,7 +82,7 @@ allocator, dummyHandle, new ArrowFieldNode(/* length= */ 1, 0), new long[] {0})) try (BufferImportTypeVisitor emptyDataVisitor = new BufferImportTypeVisitor( - allocator, dummyHandle, new ArrowFieldNode(/* length= */ 0, 0), new long[] {0})) { + allocator, dummyHandle, new ArrowFieldNode(/* length= */ 0, 0), 0, new long[] {0})) { // Too few buffers assertThrows( @@ -106,7 +106,7 @@ void cleanupAfterFailure() throws Exception 
{ long address = MemoryUtil.allocateMemory(16); try (BufferImportTypeVisitor visitor = new BufferImportTypeVisitor( - allocator, dummyHandle, new ArrowFieldNode(0, 0), new long[] {address})) { + allocator, dummyHandle, new ArrowFieldNode(0, 0), 0, new long[] {address})) { // This fails, but only after we've already imported a buffer. assertThrows(IllegalStateException.class, () -> visitor.visit(new ArrowType.Int(32, true))); } finally { @@ -123,7 +123,8 @@ void bufferAssociatedWithAllocator() throws Exception { long baseline = allocator.getAllocatedMemory(); ArrowFieldNode fieldNode = new ArrowFieldNode(fieldLength, 0); try (BufferImportTypeVisitor visitor = - new BufferImportTypeVisitor(allocator, dummyHandle, fieldNode, new long[] {0, address})) { + new BufferImportTypeVisitor( + allocator, dummyHandle, fieldNode, 0, new long[] {0, address})) { List buffers = visitor.visit(new ArrowType.Int(32, true)); assertThat(buffers).hasSize(2); assertThat(buffers.get(0)).isNull(); diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index 6591d1f7309..9299a0d7e9d 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -951,6 +951,312 @@ public void testImportReleasedArray() { } } + private FieldVector getSlicedVector(FieldVector vector, int offset, int length) { + // Consumer allocates empty structures + try (ArrowSchema consumerArrowSchema = ArrowSchema.allocateNew(allocator); + ArrowArray consumerArrowArray = ArrowArray.allocateNew(allocator)) { + + // Producer creates structures from existing memory pointers + try (ArrowSchema arrowSchema = ArrowSchema.wrap(consumerArrowSchema.memoryAddress()); + ArrowArray arrowArray = ArrowArray.wrap(consumerArrowArray.memoryAddress())) { + // Producer exports vector into the C Data Interface structures + Data.exportVector(allocator, vector, null, arrowArray, arrowSchema); + 
} + + ArrowArray.Snapshot snapshot = consumerArrowArray.snapshot(); + snapshot.offset = offset; + snapshot.length = length; + consumerArrowArray.save(snapshot); + + // Consumer imports vector + FieldVector imported = + Data.importVector(childAllocator, consumerArrowArray, consumerArrowSchema, null); + if (!(imported instanceof NullVector)) { + assertEquals(childAllocator, imported.getAllocator()); + } + + // Check whether the transfer works + TransferPair pair = imported.getTransferPair(allocator); + pair.transfer(); + return (FieldVector) pair.getTo(); + } + } + + @Test + public void testSliceVariableWidthVector() { + try (final VarCharVector vector = new VarCharVector("v", allocator); + VarCharVector target = new VarCharVector("v", allocator)) { + setVector( + vector, + "foo", + "bar", + "baz1", + "", + "baz23445", + null, + "12312baz", + "baz11", + "baz22", + "baz33"); + // slice information + final int startIndex = 2; + final int length = 6; + // create a sliced vector manually to mimic C++ slice behavior + try (VarCharVector slicedVector = + (VarCharVector) getSlicedVector(vector, startIndex, length)) { + vector.splitAndTransferTo(startIndex, length, target); + assertTrue(roundtrip(slicedVector, VarCharVector.class)); + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + } + } + } + + @Test + public void testSliceVarBinaryVector() { + try (final VarBinaryVector vector = new VarBinaryVector("v", allocator); + VarBinaryVector target = new VarBinaryVector("v", allocator)) { + setVector( + vector, + new byte[] {0x01, 0x02, 0x03}, + new byte[] {0x04, 0x05}, + new byte[] {0x06, 0x07, 0x08, 0x09}, + new byte[] {}, + new byte[] {0x0A, 0x0B, 0x0C, 0x0D, 0x0E}, + null, + new byte[] {0x0F, 0x10, 0x11}, + new byte[] {0x12, 0x13}, + new byte[] {0x14, 0x15, 0x16}, + new byte[] {0x17, 0x18, 0x19, 0x1A}); + // slice information + final int startIndex = 2; + final int length = 6; + // create a sliced vector manually to mimic C++ slice behavior + try 
(VarBinaryVector slicedVector = + (VarBinaryVector) getSlicedVector(vector, startIndex, length)) { + vector.splitAndTransferTo(startIndex, length, target); + assertTrue(roundtrip(slicedVector, VarBinaryVector.class)); + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + } + } + } + + @Test + public void testSliceFixedWidthVector() { + try (final IntVector vector = new IntVector("v", allocator); + IntVector target = new IntVector("v", allocator)) { + setVector(vector, 1, 2, null, 3, 4, null, 6, 7, 8, 9, 10); + // slice information + final int startIndex = 2; + final int length = 6; + // create a sliced vector manually to mimic C++ slice behavior + try (IntVector slicedVector = (IntVector) getSlicedVector(vector, startIndex, length)) { + vector.splitAndTransferTo(startIndex, length, target); + assertTrue(roundtrip(slicedVector, IntVector.class)); + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + } + } + } + + @Test + public void testSliceViewVarCharVector() { + try (final ViewVarCharVector vector = new ViewVarCharVector("vu", allocator); + ViewVarCharVector target = new ViewVarCharVector("vu", allocator)) { + setVector( + vector, + "foo", + "bar", + "baz1", + "", + "baz1234567890123", + null, + "12312baz", + "baz11", + "baz22", + "baz33"); + // slice information + final int startIndex = 2; + final int length = 6; + // create a sliced vector manually to mimic C++ slice behavior + try (ViewVarCharVector slicedVector = + (ViewVarCharVector) getSlicedVector(vector, startIndex, length)) { + vector.splitAndTransferTo(startIndex, length, target); + assertTrue(roundtrip(slicedVector, ViewVarCharVector.class)); + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + } + } + } + + @Test + public void testSliceViewVarBinaryVector() { + try (final ViewVarBinaryVector vector = new ViewVarBinaryVector("vz", allocator); + ViewVarBinaryVector target = new ViewVarBinaryVector("vz", allocator)) { + setVector( + vector, + new 
byte[] {0x66, 0x6F, 0x6F}, // "foo" + new byte[] {0x62, 0x61, 0x72}, // "bar" + new byte[] {0x62, 0x61, 0x7A, 0x31}, // "baz1" + new byte[] {}, // empty + new byte[] { + 0x62, 0x61, 0x7A, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, 0x31, + 0x32, 0x33 + }, // "baz1234567890123" + null, // null + new byte[] {0x31, 0x32, 0x33, 0x31, 0x32, 0x62, 0x61, 0x7A}, // "12312baz" + new byte[] {0x62, 0x61, 0x7A, 0x31, 0x31}, // "baz11" + new byte[] {0x62, 0x61, 0x7A, 0x32, 0x32}, // "baz22" + new byte[] {0x62, 0x61, 0x7A, 0x33, 0x33} // "baz33" + ); + // slice information + final int startIndex = 2; + final int length = 6; + // create a sliced vector manually to mimic C++ slice behavior + try (ViewVarBinaryVector slicedVector = + (ViewVarBinaryVector) getSlicedVector(vector, startIndex, length)) { + vector.splitAndTransferTo(startIndex, length, target); + assertTrue(roundtrip(slicedVector, ViewVarBinaryVector.class)); + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + } + } + } + + @Test + public void testSliceListVector() { + try (final ListVector vector = ListVector.empty("v", allocator); + ListVector target = ListVector.empty("v", allocator)) { + // Set values in the ListVector + setVector( + vector, + Arrays.asList(1, 2, 3), + Arrays.asList(4, 5), + Arrays.asList(6, 7, 8, 9), + Collections.emptyList(), + Arrays.asList(10, 11, 12, 13, 14), + null, + Arrays.asList(15, 16, 17), + Arrays.asList(18, 19), + Arrays.asList(20, 21, 22), + Arrays.asList(23, 24, 25, 26)); + + // Slice information + final int startIndex = 2; + final int length = 6; + + // Create a sliced vector manually to mimic C++ slice behavior + try (ListVector slicedVector = (ListVector) getSlicedVector(vector, startIndex, length)) { + vector.splitAndTransferTo(startIndex, length, target); + assertTrue(roundtrip(slicedVector, ListVector.class)); + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + } + } + } + + @Test + public void testSliceLargeListVector() { 
+ try (final LargeListVector vector = LargeListVector.empty("v", allocator); + LargeListVector target = LargeListVector.empty("v", allocator)) { + // Set values in the LargeListVector + setVector( + vector, + Arrays.asList(1, 2, 3), + Arrays.asList(4, 5), + Arrays.asList(6, 7, 8, 9), + Collections.emptyList(), + Arrays.asList(10, 11, 12, 13, 14), + null, + Arrays.asList(15, 16, 17), + Arrays.asList(18, 19), + Arrays.asList(20, 21, 22), + Arrays.asList(23, 24, 25, 26)); + + // Slice information + final int startIndex = 2; + final int length = 6; + + // Create a sliced vector manually to mimic C++ slice behavior + try (LargeListVector slicedVector = + (LargeListVector) getSlicedVector(vector, startIndex, length)) { + vector.splitAndTransferTo(startIndex, length, target); + assertTrue(roundtrip(slicedVector, LargeListVector.class)); + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + } + } + } + + @Test + public void testSliceFixedSizeListVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceUnionVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceMapVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceIntVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceFloatingPointVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceLargeUtf8Vector() { + // TODO: complete this test and function + } + + @Test + public void testSliceFixedSizeBinaryVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceBoolVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceDecimalVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceDateVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceTimeVector() { + // TODO: complete this test and 
function + } + + @Test + public void testSliceTimeStampVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceIntervalVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceDurationVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceListViewVector() { + // TODO: complete this test and function + } + private VectorSchemaRoot createTestVSR() { BitVector bitVector = new BitVector("boolean", allocator); diff --git a/java/c/src/test/python/integration_tests.py b/java/c/src/test/python/integration_tests.py index ab2ee1742f3..769277bb080 100644 --- a/java/c/src/test/python/integration_tests.py +++ b/java/c/src/test/python/integration_tests.py @@ -190,6 +190,17 @@ def round_trip_reader(self, schema, batches): def test_string_array(self): self.round_trip_array(lambda: pa.array([None, "a", "bb", "ccc"])) + def test_string_slice_array(self): + data = pa.array(["foo", "bar", "baz1", "baz223", "baz23445", "baz2121", "12312baz"]) + sliced_array = data.slice(offset=2, length=3) + self.round_trip_array(lambda: sliced_array) + + def test_binary_slice_array(self): + data = pa.array([bytes([97]), bytes([98, 98]), bytes([99, 101, 102]), bytes([99, 101, 103]), + bytes([99, 100, 101, 102]), bytes([98, 101]), bytes([98, 102])], type=pa.binary()) + sliced_array = data.slice(offset=2, length=3) + self.round_trip_array(lambda: sliced_array) + def test_stringview_array(self): # with nulls short strings self.round_trip_array(lambda: pa.array([None, "a", "bb", "c"], type=pa.string_view())) @@ -247,9 +258,31 @@ def test_decimal_array(self): ] self.round_trip_array(lambda: pa.array(data, pa.decimal128(5, 2))) + def test_decimal_slice_array(self): + array_data = [ + round(decimal.Decimal(722.82), 2), + round(decimal.Decimal(-934.11), 2), + None, + round(decimal.Decimal(122.82), 2), + round(decimal.Decimal(934.11), 2), + round(decimal.Decimal(632.11), 2), + round(decimal.Decimal(312.11), 
2), + round(decimal.Decimal(221.11), 2), + ] + data = pa.array(array_data, pa.decimal128(5, 2)) + sliced_array = data.slice(offset=2, length=3) + # TODO: complete this function + # self.round_trip_array(lambda: sliced_array) + def test_int_array(self): self.round_trip_array(lambda: pa.array([1, 2, 3], type=pa.int32())) + def test_int_slice_array(self): + data = pa.array([1, 2, None, 4, 5, 6, 7, 8, 9, 10], type=pa.int32()) + sliced_array = data.slice(offset=2, length=3) + # TODO: complete this function + # self.round_trip_array(lambda: sliced_array) + def test_list_array(self): self.round_trip_array(lambda: pa.array( [[], [0], [1, 2], [4, 5, 6]], pa.list_(pa.int64()) @@ -257,6 +290,16 @@ def test_list_array(self): # is not preserved during round trips (it becomes "$data$"). ), check_metadata=False) + def test_list_slice_array(self): + data = pa.array( + [[], [0], None, [1, 2], [4, 5, 6], [7, 8, 9, 10], [11, 12, 13, 14, 15]], pa.list_(pa.int64()) + # disabled check_metadata since the list internal field name ("item") + # is not preserved during round trips (it becomes "$data$"). + ) + sliced_array = data.slice(offset=2, length=3) + # TODO: complete this function + # self.round_trip_array(lambda: sliced_array, check_metadata=False) + def test_empty_list_array(self): """Validates GH-37056 fix. Empty list of int32 produces a vector with empty child data buffer, however with non-zero capacity. 
@@ -291,10 +334,36 @@ def test_struct_array(self): ] self.round_trip_array(lambda: pa.array(data, type=pa.struct(fields))) + def test_struct_slice_array(self): + fields = [ + ("f1", pa.int32()), + ("f2", pa.string()), + ] + array_data = [ + {"f1": 1, "f2": "a"}, + None, + {"f1": 3, "f2": None}, + {"f1": None, "f2": "d"}, + {"f1": None, "f2": None}, + {"f1": 6, "f2": "f"}, + {"f1": 7, "f2": "g"}, + {"f1": 8, "f2": "h"}, + ] + data = pa.array(array_data, type=pa.struct(fields)) + sliced_array = data.slice(offset=2, length=3) + # TODO: complete this function + # self.round_trip_array(lambda: sliced_array) + def test_dict(self): self.round_trip_array( lambda: pa.array(["a", "b", None, "d"], pa.dictionary(pa.int64(), pa.utf8()))) + def test_slice_dict(self): + data = pa.array(["a", "b", None, "d", "e", "f"], pa.dictionary(pa.int64(), pa.utf8())) + sliced_array = data.slice(offset=2, length=3) + # TODO: complete this function + # self.round_trip_array(lambda: sliced_array) + def test_map(self): offsets = [0, None, 2, 6] pykeys = [b"a", b"b", b"c", b"d", b"e", b"f"] @@ -304,6 +373,17 @@ def test_map(self): self.round_trip_array( lambda: pa.MapArray.from_arrays(offsets, keys, items)) + def test_slice_map(self): + offsets = [0, None, 2, 6] + pykeys = [b"a", b"b", b"c", b"d", b"e", b"f"] + pyitems = [1, 2, 3, None, 4, 5] + keys = pa.array(pykeys, type="binary") + items = pa.array(pyitems, type="i4") + data = pa.MapArray.from_arrays(offsets, keys, items) + sliced_array = data.slice(offset=2, length=3) + # TODO: complete this function + # self.round_trip_array(lambda: sliced_array) + def test_field(self): self.round_trip_field(lambda: pa.field("aa", pa.bool_())) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java index c762eb51725..d120526c8bf 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java +++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java @@ -730,4 +730,17 @@ public void copyValueSafe(int fromIndex, int toIndex) { } } } + + /** + * Slice this vector at desired index and length and transfer the corresponding data to the target + * vector. + * + * @param startIndex start position of the split in source vector. + * @param length length of the split. + * @param target destination vector + */ + public void splitAndTransferTo(int startIndex, int length, FixedSizeListVector target) { + TransferImpl transfer = new TransferImpl(target); + transfer.splitAndTransfer(startIndex, length); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java index ed075352c93..e7c85b184a9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java @@ -1124,4 +1124,17 @@ public long getElementStartIndex(int index) { public long getElementEndIndex(int index) { return offsetBuffer.getLong(((long) index + 1L) * OFFSET_WIDTH); } + + /** + * Slice this vector at desired index and length and transfer the corresponding data to the target + * vector. + * + * @param startIndex start position of the split in source vector. + * @param length length of the split. 
+ * @param target destination vector + */ + public void splitAndTransferTo(int startIndex, int length, LargeListVector target) { + TransferImpl transfer = new TransferImpl(target); + transfer.splitAndTransfer(startIndex, length); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 76682c28fe6..23703d413f3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -648,6 +648,19 @@ public void copyValueSafe(int from, int to) { } } + /** + * Slice this vector at desired index and length and transfer the corresponding data to the target + * vector. + * + * @param startIndex start position of the split in source vector. + * @param length length of the split. + * @param target destination vector + */ + public void splitAndTransferTo(int startIndex, int length, ListVector target) { + TransferImpl transfer = new TransferImpl(target); + transfer.splitAndTransfer(startIndex, length); + } + @Override protected FieldReader getReaderImpl() { return new UnionListReader(this); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java index 69e16dc4703..8fe1d10f4ce 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -60,6 +60,7 @@ import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VariableWidthFieldVector; +import org.apache.arrow.vector.ViewVarCharVector; import org.apache.arrow.vector.complex.BaseRepeatedValueVector; import org.apache.arrow.vector.complex.BaseRepeatedValueViewVector; import 
org.apache.arrow.vector.complex.FixedSizeListVector; @@ -567,6 +568,18 @@ public static void setVector(VarCharVector vector, byte[]... values) { vector.setValueCount(length); } + /** Populate values for ViewVarCharVector. */ + public static void setVector(ViewVarCharVector vector, byte[]... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + public static void setVector(VariableWidthFieldVector vector, byte[]... values) { final int length = values.length; vector.allocateNewSafe(); @@ -602,6 +615,18 @@ public static void setVector(VarCharVector vector, String... values) { vector.setValueCount(length); } + /** Populate values for ViewVarCharVector from UTF-8 encoded strings; null entries are skipped. */ + public static void setVector(ViewVarCharVector vector, String... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.setSafe(i, values[i].getBytes(StandardCharsets.UTF_8)); + } + } + vector.setValueCount(length); + } + /** Populate values for LargeVarCharVector. */ public static void setVector(LargeVarCharVector vector, String... values) { final int length = values.length;