From b2883acb6cc03808b9bc62f353f712038c488b10 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Tue, 25 Jun 2024 13:02:55 +0530 Subject: [PATCH 01/25] fix: temp --- .../org/apache/arrow/c/RoundtripTest.java | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index 6591d1f7309..5e19790ae49 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -1041,4 +1041,35 @@ public void set(int index, UUID uuid) { getUnderlyingVector().set(index, bb.array()); } } + + @Test + public void testSliceVarCharVector() { + try (final VarCharVector vector = new VarCharVector("v", allocator)) { + setVector(vector, "abc", "def", "ghi", "efd", "mkl", "cwp", null); + // assertTrue(roundtrip(vector, VarCharVector.class)); + // then slice the vector and try to use the C Data interface. + System.out.println("Original vector: " + vector); + TransferPair tp = vector.getTransferPair(allocator); + System.out.println("Slice range : (2, 4)"); + tp.splitAndTransfer(2, 4); + final ArrowBuf originalOffsetBuffer = vector.getOffsetBuffer(); + final ArrowBuf originalValueBuffer = vector.getDataBuffer(); + try (VarCharVector sliced = (VarCharVector) tp.getTo()) { + System.out.println("sliced vector: " + sliced); + + final ArrowBuf slicedOffsetBuffer = sliced.getOffsetBuffer(); + final ArrowBuf slicedValueBuffer = sliced.getDataBuffer(); + + final int lastSet = sliced.getLastSet(); + final int firstOffsetSlice = slicedOffsetBuffer.getInt(0); + final int lastOffsetSlice = + slicedOffsetBuffer.getInt((lastSet + 1) * (long) VarCharVector.OFFSET_WIDTH); + System.out.println("Sliced Buffer First Offset: " + firstOffsetSlice); + System.out.println("Sliced Buffer Last Offset: " + lastOffsetSlice); + System.out.println("Sliced Offset Buffer capacity: " + slicedOffsetBuffer.capacity()); + System.out.println("Sliced Buffer ValueBuffer capacity: " + slicedValueBuffer.capacity()); + System.out.println("Original ValueBuffer capacity: " + originalValueBuffer.capacity()); + } + } + } } From fd71ada621a2f74fbeccbd8573776b8a9f62304a Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 25 Jun 2024 19:01:59 +0530 Subject: [PATCH 02/25] fix: temp c++ slice test --- cpp/src/arrow/array/array_binary_test.cc | 73 ++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 04391be0ac7..dd53b48631e 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -245,6 +245,77 @@ class TestStringArray : public ::testing::Test { equal_array.Array::Slice(1)->RangeEquals(0, 2, 0, equal_array2.Array::Slice(1))); } + void showOffsetDetails(std::shared_ptr& str_array) { + std::cout << std::string(50, '=') << std::endl; + int64_t length = str_array->length(); + auto offset_values = str_array->value_offsets(); + auto* raw_offset_values = str_array->raw_value_offsets(); + std::cout << offset_values->ToString() << std::endl; + int64_t size = offset_values->size(); + int64_t capacity = offset_values->capacity(); + std::cout << "Offset" << std::endl; + std::cout << "Size: " << size << std::endl; + std::cout << "Capacity: " << capacity << std::endl; + std::cout << "Offset Values" << std::endl; + std::cout << "Array length : " << length << std::endl; + std::cout << "OffSets of this Array" << std::endl; + for (int64_t i=0; i < length; i++) { + std::cout << raw_offset_values[i] << ", "; + } + std::cout << std::endl; + std::cout << std::string(50, '=') << std::endl; + } + + void showDataDetails(std::shared_ptr& str_array) { + std::cout << std::string(50, '=') << std::endl; + auto value_data = str_array->value_data(); + auto* raw_data = str_array->raw_data(); + std::cout << "Value Data: " << value_data->ToString() << std::endl; + int64_t size = value_data->size(); + int64_t capacity = value_data->capacity(); + std::cout << "Data" << std::endl; + std::cout << "Size: " << size << std::endl; + std::cout << "Capacity: " << capacity << std::endl; + std::cout << "Data length : " << size << std::endl; + std::cout << "Data of this Array" << std::endl; + for (int64_t i=0; i < size; i++) { + std::cout << raw_data[i] << ", "; + } + std::cout << std::endl; + std::cout << std::string(50, '=') << std::endl; + } + + void TestBufferDetailsWithSlice() { + BuilderType builder; + + ASSERT_OK(builder.Append("foo")); + ASSERT_OK(builder.Append("bar")); + ASSERT_OK(builder.Append("baz1")); + ASSERT_OK(builder.Append("baz223")); + ASSERT_OK(builder.Append("baz23445")); + ASSERT_OK(builder.Append("baz2121")); + ASSERT_OK(builder.Append("12312baz")); + + std::shared_ptr array; + FinishAndCheckPadding(&builder, &array); + + std::shared_ptr str_array = std::static_pointer_cast(array); + std::cout << "Original Array" << std::endl; + std::cout << array->ToString() << std::endl; + + showOffsetDetails(str_array); + + showDataDetails(str_array); + + std::shared_ptr sliced_array = std::static_pointer_cast(str_array->Slice(2,3)); + std::cout << "Sliced Array" << std::endl; + std::cout << sliced_array->ToString() << std::endl; + + showOffsetDetails(sliced_array); + + showDataDetails(sliced_array); + } + void TestSliceGetString() { BuilderType builder; @@ -363,6 +434,8 @@ TYPED_TEST(TestStringArray, TestEmptyStringComparison) { TYPED_TEST(TestStringArray, CompareNullByteSlots) { this->TestCompareNullByteSlots(); } +TYPED_TEST(TestStringArray, BufferDetailsWithSlice) { this->TestBufferDetailsWithSlice(); } + TYPED_TEST(TestStringArray, TestSliceGetString) { this->TestSliceGetString(); } TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets(); } From b61ac8fb316972b2484d0f449294c3bec7d61467 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 26 Jun 2024 09:34:39 +0530 Subject: [PATCH 03/25] fix: initial functional code --- .../org/apache/arrow/c/ArrayImporter.java | 3 +- .../arrow/c/BufferImportTypeVisitor.java | 63 +++++++++++-------- .../apache/arrow/c/ArrowArrayUtilityTest.java | 9 +-- java/c/src/test/python/integration_tests.py | 11 ++++ 4 files changed, 54 insertions(+), 32 deletions(-) diff --git a/java/c/src/main/java/org/apache/arrow/c/ArrayImporter.java b/java/c/src/main/java/org/apache/arrow/c/ArrayImporter.java index b74fb1b4734..a6f8fa72ef6 100644 --- a/java/c/src/main/java/org/apache/arrow/c/ArrayImporter.java +++ b/java/c/src/main/java/org/apache/arrow/c/ArrayImporter.java @@ -124,7 +124,8 @@ private void doImport(ArrowArray.Snapshot snapshot) { NativeUtil.toJavaArray(snapshot.buffers, checkedCastToInt(snapshot.n_buffers)); try (final BufferImportTypeVisitor visitor = - new BufferImportTypeVisitor(allocator, underlyingAllocation, fieldNode, bufferPointers)) { + new BufferImportTypeVisitor( + allocator, underlyingAllocation, fieldNode, snapshot.offset, bufferPointers)) { final List buffers; if (bufferPointers == null || bufferPointers.length == 0) { buffers = Collections.emptyList(); diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index 633ecd43bd5..066d3b7670d 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -59,6 +59,7 @@ class BufferImportTypeVisitor implements ArrowType.ArrowTypeVisitor imported; @@ -66,10 +67,12 @@ class BufferImportTypeVisitor implements ArrowType.ArrowTypeVisitor(); } @@ -116,7 +119,7 @@ private ArrowBuf importFixedBytes(ArrowType type, int index, long bytesPerSlot) } private ArrowBuf importOffsets(ArrowType type, long bytesPerSlot) { - final long capacity = bytesPerSlot * (fieldNode.getLength() + 1); + final long capacity = bytesPerSlot * (fieldNode.getLength() + arrowArrayOffset + 1); return importBuffer(type, 1, capacity); } @@ -212,19 +215,22 @@ public List visit(ArrowType.FloatingPoint type) { @Override public List visit(ArrowType.Utf8 type) { - try (ArrowBuf offsets = importOffsets(type, VarCharVector.OFFSET_WIDTH)) { - final int start = offsets.getInt(0); - final int end = offsets.getInt(fieldNode.getLength() * (long) VarCharVector.OFFSET_WIDTH); - checkState( - end >= start, - "Offset buffer for type %s is malformed: start: %s, end: %s", - type, - start, - end); - final int len = end - start; - offsets.getReferenceManager().retain(); - return Arrays.asList(maybeImportBitmap(type), offsets, importData(type, len)); - } + ArrowBuf offsets = importOffsets(type, VarCharVector.OFFSET_WIDTH); + ArrowBuf adjustedOffsets = + offsets.slice( + arrowArrayOffset * VarCharVector.OFFSET_WIDTH, + (long) (fieldNode.getLength() + 1) * VarCharVector.OFFSET_WIDTH); + final int start = adjustedOffsets.getInt(0); + final int end = + adjustedOffsets.getInt((fieldNode.getLength()) * (long) VarCharVector.OFFSET_WIDTH); + checkState( + end >= start, + "Offset buffer for type %s is malformed: start: %s, end: %s", + type, + start, + end); + final int len = end - start; + return Arrays.asList(maybeImportBitmap(type), adjustedOffsets, importData(type, len)); } private List visitVariableWidthView(ArrowType type) { @@ -280,19 +286,22 @@ public List visit(ArrowType.LargeUtf8 type) { @Override public List visit(ArrowType.Binary type) { - try (ArrowBuf offsets = importOffsets(type, VarBinaryVector.OFFSET_WIDTH)) { - final int start = offsets.getInt(0); - final int end = offsets.getInt(fieldNode.getLength() * (long) VarBinaryVector.OFFSET_WIDTH); - checkState( - end >= start, - "Offset buffer for type %s is malformed: start: %s, end: %s", - type, - start, - end); - final int len = end - start; - offsets.getReferenceManager().retain(); - return Arrays.asList(maybeImportBitmap(type), offsets, importData(type, len)); - } + ArrowBuf offsets = importOffsets(type, VarBinaryVector.OFFSET_WIDTH); + ArrowBuf adjustedOffsets = + offsets.slice( + arrowArrayOffset * VarBinaryVector.OFFSET_WIDTH, + (long) (fieldNode.getLength() + 1) * VarBinaryVector.OFFSET_WIDTH); + final int start = adjustedOffsets.getInt(0); + final int end = + adjustedOffsets.getInt(fieldNode.getLength() * (long) VarBinaryVector.OFFSET_WIDTH); + checkState( + end >= start, + "Offset buffer for type %s is malformed: start: %s, end: %s", + type, + start, + end); + final int len = end - start; + return Arrays.asList(maybeImportBitmap(type), adjustedOffsets, importData(type, len)); } @Override diff --git a/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java b/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java index 1d4cb411fab..7a8850bd091 100644 --- a/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java @@ -58,7 +58,7 @@ void importBuffer() throws Exception { // Note values are all dummy values here try (BufferImportTypeVisitor notEmptyDataVisitor = new BufferImportTypeVisitor( - allocator, dummyHandle, new ArrowFieldNode(/* length= */ 1, 0), new long[] {0})) { + allocator, dummyHandle, new ArrowFieldNode(/* length= */ 1, 0), 0, new long[] {0})) { // Too few buffers assertThrows( @@ -82,7 +82,7 @@ allocator, dummyHandle, new ArrowFieldNode(/* length= */ 1, 0), new long[] {0})) try (BufferImportTypeVisitor emptyDataVisitor = new BufferImportTypeVisitor( - allocator, dummyHandle, new ArrowFieldNode(/* length= */ 0, 0), new long[] {0})) { + allocator, dummyHandle, new ArrowFieldNode(/* length= */ 0, 0), 0, new long[] {0})) { // Too few buffers assertThrows( @@ -106,7 +106,7 @@ void cleanupAfterFailure() throws Exception { long address = MemoryUtil.allocateMemory(16); try (BufferImportTypeVisitor visitor = new BufferImportTypeVisitor( - allocator, dummyHandle, new ArrowFieldNode(0, 0), new long[] {address})) { + allocator, dummyHandle, new ArrowFieldNode(0, 0), 0, new long[] {address})) { // This fails, but only after we've already imported a buffer. assertThrows(IllegalStateException.class, () -> visitor.visit(new ArrowType.Int(32, true))); } finally { @@ -123,7 +123,8 @@ void bufferAssociatedWithAllocator() throws Exception { long baseline = allocator.getAllocatedMemory(); ArrowFieldNode fieldNode = new ArrowFieldNode(fieldLength, 0); try (BufferImportTypeVisitor visitor = - new BufferImportTypeVisitor(allocator, dummyHandle, fieldNode, new long[] {0, address})) { + new BufferImportTypeVisitor( + allocator, dummyHandle, fieldNode, 0, new long[] {0, address})) { List buffers = visitor.visit(new ArrowType.Int(32, true)); assertThat(buffers).hasSize(2); assertThat(buffers.get(0)).isNull(); diff --git a/java/c/src/test/python/integration_tests.py b/java/c/src/test/python/integration_tests.py index ab2ee1742f3..d03ba310301 100644 --- a/java/c/src/test/python/integration_tests.py +++ b/java/c/src/test/python/integration_tests.py @@ -190,6 +190,17 @@ def round_trip_reader(self, schema, batches): def test_string_array(self): self.round_trip_array(lambda: pa.array([None, "a", "bb", "ccc"])) + def test_string_slice_array(self): + data = pa.array(["foo", "bar", "baz1", "baz223", "baz23445", "baz2121", "12312baz"]) + sliced_array = data.slice(offset=2, length=3) + self.round_trip_array(lambda: sliced_array) + + def test_binary_slice_array(self): + data = pa.array([bytes([97]), bytes([98, 98]), bytes([99, 101, 102]), bytes([99, 101, 103]), + bytes([99, 100, 101, 102]), bytes([98, 101]), bytes([98, 102])], type=pa.binary()) + sliced_array = data.slice(offset=2, length=3) + self.round_trip_array(lambda: sliced_array) + def test_stringview_array(self): # with nulls short strings self.round_trip_array(lambda: pa.array([None, "a", "bb", "c"], type=pa.string_view())) From 666aa0f524446944a9b4492f0449b8bc6c8224a5 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 26 Jun 2024 09:48:47 +0530 Subject: [PATCH 04/25] fix: revert test in C++ --- cpp/src/arrow/array/array_binary_test.cc | 71 ------------------------ 1 file changed, 71 deletions(-) diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index dd53b48631e..bb5ccd0fe38 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -245,77 +245,6 @@ class TestStringArray : public ::testing::Test { equal_array.Array::Slice(1)->RangeEquals(0, 2, 0, equal_array2.Array::Slice(1))); } - void showOffsetDetails(std::shared_ptr& str_array) { - std::cout << std::string(50, '=') << std::endl; - int64_t length = str_array->length(); - auto offset_values = str_array->value_offsets(); - auto* raw_offset_values = str_array->raw_value_offsets(); - std::cout << offset_values->ToString() << std::endl; - int64_t size = offset_values->size(); - int64_t capacity = offset_values->capacity(); - std::cout << "Offset" << std::endl; - std::cout << "Size: " << size << std::endl; - std::cout << "Capacity: " << capacity << std::endl; - std::cout << "Offset Values" << std::endl; - std::cout << "Array length : " << length << std::endl; - std::cout << "OffSets of this Array" << std::endl; - for (int64_t i=0; i < length; i++) { - std::cout << raw_offset_values[i] << ", "; - } - std::cout << std::endl; - std::cout << std::string(50, '=') << std::endl; - } - - void showDataDetails(std::shared_ptr& str_array) { - std::cout << std::string(50, '=') << std::endl; - auto value_data = str_array->value_data(); - auto* raw_data = str_array->raw_data(); - std::cout << "Value Data: " << value_data->ToString() << std::endl; - int64_t size = value_data->size(); - int64_t capacity = value_data->capacity(); - std::cout << "Data" << std::endl; - std::cout << "Size: " << size << std::endl; - std::cout << "Capacity: " << capacity << std::endl; - std::cout << "Data length : " << size << std::endl; - std::cout << "Data of this Array" << std::endl; - for (int64_t i=0; i < size; i++) { - std::cout << raw_data[i] << ", "; - } - std::cout << std::endl; - std::cout << std::string(50, '=') << std::endl; - } - - void TestBufferDetailsWithSlice() { - BuilderType builder; - - ASSERT_OK(builder.Append("foo")); - ASSERT_OK(builder.Append("bar")); - ASSERT_OK(builder.Append("baz1")); - ASSERT_OK(builder.Append("baz223")); - ASSERT_OK(builder.Append("baz23445")); - ASSERT_OK(builder.Append("baz2121")); - ASSERT_OK(builder.Append("12312baz")); - - std::shared_ptr array; - FinishAndCheckPadding(&builder, &array); - - std::shared_ptr str_array = std::static_pointer_cast(array); - std::cout << "Original Array" << std::endl; - std::cout << array->ToString() << std::endl; - - showOffsetDetails(str_array); - - showDataDetails(str_array); - - std::shared_ptr sliced_array = std::static_pointer_cast(str_array->Slice(2,3)); - std::cout << "Sliced Array" << std::endl; - std::cout << sliced_array->ToString() << std::endl; - - showOffsetDetails(sliced_array); - - showDataDetails(sliced_array); - } - void TestSliceGetString() { BuilderType builder; From d8b0a42e594412ffd9328a88f3d1ae4ebf641adc Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 26 Jun 2024 09:50:51 +0530 Subject: [PATCH 05/25] fix: minor --- cpp/src/arrow/array/array_binary_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index bb5ccd0fe38..04391be0ac7 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -363,8 +363,6 @@ TYPED_TEST(TestStringArray, TestEmptyStringComparison) { TYPED_TEST(TestStringArray, CompareNullByteSlots) { this->TestCompareNullByteSlots(); } -TYPED_TEST(TestStringArray, BufferDetailsWithSlice) { this->TestBufferDetailsWithSlice(); } - TYPED_TEST(TestStringArray, TestSliceGetString) { this->TestSliceGetString(); } TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets(); } From 6ec8384814ff99f812c8a3402862dfb22886c948 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 26 Jun 2024 09:53:04 +0530 Subject: [PATCH 06/25] fix: remove unnecessary java tests --- .../org/apache/arrow/c/RoundtripTest.java | 31 ------------------- 1 file changed, 31 deletions(-) diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index 5e19790ae49..6591d1f7309 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -1041,35 +1041,4 @@ public void set(int index, UUID uuid) { getUnderlyingVector().set(index, bb.array()); } } - - @Test - public void testSliceVarCharVector() { - try (final VarCharVector vector = new VarCharVector("v", allocator)) { - setVector(vector, "abc", "def", "ghi", "efd", "mkl", "cwp", null); - // assertTrue(roundtrip(vector, VarCharVector.class)); - // then slice the vector and try to use the C Data interface. - System.out.println("Original vector: " + vector); - TransferPair tp = vector.getTransferPair(allocator); - System.out.println("Slice range : (2, 4)"); - tp.splitAndTransfer(2, 4); - final ArrowBuf originalOffsetBuffer = vector.getOffsetBuffer(); - final ArrowBuf originalValueBuffer = vector.getDataBuffer(); - try (VarCharVector sliced = (VarCharVector) tp.getTo()) { - System.out.println("sliced vector: " + sliced); - - final ArrowBuf slicedOffsetBuffer = sliced.getOffsetBuffer(); - final ArrowBuf slicedValueBuffer = sliced.getDataBuffer(); - - final int lastSet = sliced.getLastSet(); - final int firstOffsetSlice = slicedOffsetBuffer.getInt(0); - final int lastOffsetSlice = - slicedOffsetBuffer.getInt((lastSet + 1) * (long) VarCharVector.OFFSET_WIDTH); - System.out.println("Sliced Buffer First Offset: " + firstOffsetSlice); - System.out.println("Sliced Buffer Last Offset: " + lastOffsetSlice); - System.out.println("Sliced Offset Buffer capacity: " + slicedOffsetBuffer.capacity()); - System.out.println("Sliced Buffer ValueBuffer capacity: " + slicedValueBuffer.capacity()); - System.out.println("Original ValueBuffer capacity: " + originalValueBuffer.capacity()); - } - } - } } From d257fdd6da8dd02cdc14fa2139457ac41f6b68db Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 24 Jul 2024 14:01:33 +0530 Subject: [PATCH 07/25] fix: data buffer capcity --- .../org/apache/arrow/c/BufferImportTypeVisitor.java | 6 ++---- .../test/java/org/apache/arrow/c/RoundtripTest.java | 11 +++++++++++ java/c/src/test/python/integration_tests.py | 8 ++++---- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index 066d3b7670d..d9699605e56 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -229,8 +229,7 @@ public List visit(ArrowType.Utf8 type) { type, start, end); - final int len = end - start; - return Arrays.asList(maybeImportBitmap(type), adjustedOffsets, importData(type, len)); + return Arrays.asList(maybeImportBitmap(type), adjustedOffsets, importData(type, end)); } private List visitVariableWidthView(ArrowType type) { @@ -300,8 +299,7 @@ public List visit(ArrowType.Binary type) { type, start, end); - final int len = end - start; - return Arrays.asList(maybeImportBitmap(type), adjustedOffsets, importData(type, len)); + return Arrays.asList(maybeImportBitmap(type), adjustedOffsets, importData(type, end)); } @Override diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index 6591d1f7309..5249e3c0b96 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -951,6 +951,17 @@ public void testImportReleasedArray() { } } + @Test + public void testSliceVarCharVector() { + try (final VarCharVector vector = new VarCharVector("v", allocator); + VarCharVector target = new VarCharVector("v", allocator)) { + setVector(vector, "foo", "bar", "baz1", "baz223", "baz23445", "baz2121", "12312baz"); + vector.splitAndTransferTo(2, 3, target); + + System.out.println(target); + } + } + private VectorSchemaRoot createTestVSR() { BitVector bitVector = new BitVector("boolean", allocator); diff --git a/java/c/src/test/python/integration_tests.py b/java/c/src/test/python/integration_tests.py index d03ba310301..40f267b7208 100644 --- a/java/c/src/test/python/integration_tests.py +++ b/java/c/src/test/python/integration_tests.py @@ -54,11 +54,11 @@ def setup_jvm(): # For debugging purpose please uncomment the following, and include *jvm_args, before **kwargs # in startJVM function call - # jvm_args = [ - # "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005" - # ] + jvm_args = [ + "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005" + ] - jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.class.path=" + jar_path, **kwargs) + jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.class.path=" + jar_path, *jvm_args, **kwargs) class Bridge: From 37865ceeed37af1400a61917fbb12808b6855372 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 25 Jul 2024 05:36:18 +0530 Subject: [PATCH 08/25] fix: addressing reviews partially --- .../org/apache/arrow/c/RoundtripTest.java | 76 ++++++++++++++++++- 1 file changed, 73 insertions(+), 3 deletions(-) diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index 5249e3c0b96..ee6948d6746 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -77,6 +77,7 @@ import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VariableWidthFieldVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.ViewVarBinaryVector; import org.apache.arrow.vector.ViewVarCharVector; @@ -954,14 +955,83 @@ public void testImportReleasedArray() { @Test public void testSliceVarCharVector() { try (final VarCharVector vector = new VarCharVector("v", allocator); + VarCharVector slicedVector = new VarCharVector("v", allocator); VarCharVector target = new VarCharVector("v", allocator)) { + slicedVector.allocateNew(); setVector(vector, "foo", "bar", "baz1", "baz223", "baz23445", "baz2121", "12312baz"); - vector.splitAndTransferTo(2, 3, target); - - System.out.println(target); + // slice information + final int startIndex = 2; + final int length = 3; + // create a sliced vector manually to mimic C++ slice behavior + final ArrowBuf offsetBuffer = vector.getOffsetBuffer(); + final ArrowBuf dataBuffer = vector.getDataBuffer(); + final int offsetWidth = VarCharVector.OFFSET_WIDTH; + + final ArrowBuf expectedSliceOffSetBuffer = + offsetBuffer.slice(startIndex * offsetWidth, (length + 1) * offsetWidth); + final int sizeOfDataBuffer = expectedSliceOffSetBuffer.getInt(length * offsetWidth); + final ArrowBuf expectedSliceDataBuffer = dataBuffer.slice(0, sizeOfDataBuffer); + final ArrowBuf expectedSliceValidityBuffer = allocator.buffer(8); + expectedSliceValidityBuffer.setLong(0, 0b00000111); + + createSlicedVector( + slicedVector, + expectedSliceValidityBuffer, + expectedSliceOffSetBuffer, + expectedSliceDataBuffer, + length); + + vector.splitAndTransferTo(startIndex, length, target); + + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + assertTrue(roundtrip(slicedVector, VarCharVector.class)); + + expectedSliceValidityBuffer.close(); } } + private static void createSlicedVector( + VariableWidthFieldVector slicedVector, + ArrowBuf expectedSliceValidityBuffer, + ArrowBuf expectedSliceOffSetBuffer, + ArrowBuf expectedSliceDataBuffer, + int length) { + final ArrowBuf sliceValidityBuffer = slicedVector.getValidityBuffer(); + final ArrowBuf slicedOffsetBuffer = slicedVector.getOffsetBuffer(); + final ArrowBuf slicedDataBuffer = slicedVector.getDataBuffer(); + + setBuffersFromExpectedBuffers( + sliceValidityBuffer, + expectedSliceValidityBuffer, + slicedOffsetBuffer, + expectedSliceOffSetBuffer, + slicedDataBuffer, + expectedSliceDataBuffer, + length); + + // setLastSet before SetValueCount to make sure fillHoles doesn't get called + slicedVector.setLastSet(length - 1); + slicedVector.setValueCount(length); + } + + private static void setBuffersFromExpectedBuffers( + ArrowBuf sliceValidityBuffer, + ArrowBuf expectedSliceValidityBuffer, + ArrowBuf slicedOffsetBuffer, + ArrowBuf expectedSliceOffSetBuffer, + ArrowBuf slicedDataBuffer, + ArrowBuf expectedSliceDataBuffer, + int length) { + final int offsetWidth = VarCharVector.OFFSET_WIDTH; + sliceValidityBuffer.setBytes( + 0, expectedSliceValidityBuffer, 0, expectedSliceValidityBuffer.capacity()); + for (int i = 0; i < length + 1; i++) { + slicedOffsetBuffer.setInt( + (long) i * offsetWidth, expectedSliceOffSetBuffer.getInt((long) i * offsetWidth)); + } + slicedDataBuffer.setBytes(0, expectedSliceDataBuffer, 0, expectedSliceDataBuffer.capacity()); + } + private VectorSchemaRoot createTestVSR() { BitVector bitVector = new BitVector("boolean", allocator); From 5077763d2153b63f54cf12bdecc680c82e0a75da Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 25 Jul 2024 06:22:45 +0530 Subject: [PATCH 09/25] fix: addressing reviews --- .../main/java/org/apache/arrow/c/BufferImportTypeVisitor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index d9699605e56..80e69975569 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -119,7 +119,7 @@ private ArrowBuf importFixedBytes(ArrowType type, int index, long bytesPerSlot) } private ArrowBuf importOffsets(ArrowType type, long bytesPerSlot) { - final long capacity = bytesPerSlot * (fieldNode.getLength() + arrowArrayOffset + 1); + final long capacity = bytesPerSlot * (fieldNode.getLength() + 1); return importBuffer(type, 1, capacity); } From ee62b7e35237a46b50f4a9c0fac19ca56339d31f Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 25 Jul 2024 06:23:56 +0530 Subject: [PATCH 10/25] fix: addressing reviews v2 --- java/c/src/test/python/integration_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/c/src/test/python/integration_tests.py b/java/c/src/test/python/integration_tests.py index 40f267b7208..b4658fd9d40 100644 --- a/java/c/src/test/python/integration_tests.py +++ b/java/c/src/test/python/integration_tests.py @@ -58,7 +58,7 @@ def setup_jvm(): "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005" ] - jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.class.path=" + jar_path, *jvm_args, **kwargs) + jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.class.path=" + jar_path, **kwargs) class Bridge: From 216de2919e7a49e45bd01e81a4ca199a3171531d Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 25 Jul 2024 06:24:46 +0530 Subject: [PATCH 11/25] fix: addressing reviews v3 --- java/c/src/test/python/integration_tests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/java/c/src/test/python/integration_tests.py b/java/c/src/test/python/integration_tests.py index b4658fd9d40..cf341fe0c45 100644 --- a/java/c/src/test/python/integration_tests.py +++ b/java/c/src/test/python/integration_tests.py @@ -54,9 +54,9 @@ def setup_jvm(): # For debugging purpose please uncomment the following, and include *jvm_args, before **kwargs # in startJVM function call - jvm_args = [ - "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005" - ] + # jvm_args = [ + # "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005" + # ] jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.class.path=" + jar_path, **kwargs) From 27ce6856e24fe6d403d1f659f5ba582e5bd28e74 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 25 Jul 2024 06:25:21 +0530 Subject: [PATCH 12/25] fix: addressing reviews v4 --- java/c/src/test/python/integration_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/c/src/test/python/integration_tests.py b/java/c/src/test/python/integration_tests.py index cf341fe0c45..a3475ac57c8 100644 --- a/java/c/src/test/python/integration_tests.py +++ b/java/c/src/test/python/integration_tests.py @@ -55,7 +55,7 @@ def setup_jvm(): # For debugging purpose please uncomment the following, and include *jvm_args, before **kwargs # in startJVM function call # jvm_args = [ - # "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005" + # "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005" # ] jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.class.path=" + jar_path, **kwargs) From eb2dff1fdc6c5d2b29119cd72efffa5d8caa4c3c Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 25 Jul 2024 06:25:58 +0530 Subject: [PATCH 13/25] fix: addressing reviews v4 --- java/c/src/test/python/integration_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/c/src/test/python/integration_tests.py b/java/c/src/test/python/integration_tests.py index a3475ac57c8..d03ba310301 100644 --- a/java/c/src/test/python/integration_tests.py +++ b/java/c/src/test/python/integration_tests.py @@ -55,7 +55,7 @@ def setup_jvm(): # For debugging purpose please uncomment the following, and include *jvm_args, before **kwargs # in startJVM function call # jvm_args = [ - # "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005" + # "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005" # ] jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.class.path=" + jar_path, **kwargs) From a07266f6da247bd95eb8095d6f7b2007c6560c98 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 25 Jul 2024 07:15:04 +0530 Subject: [PATCH 14/25] fix: revert change on offset usage for capacity determination --- .../main/java/org/apache/arrow/c/BufferImportTypeVisitor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index 80e69975569..d9699605e56 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -119,7 +119,7 @@ private ArrowBuf importFixedBytes(ArrowType type, int index, long bytesPerSlot) } private ArrowBuf importOffsets(ArrowType type, long bytesPerSlot) { - final long capacity = bytesPerSlot * (fieldNode.getLength() + 1); + final long capacity = bytesPerSlot * (fieldNode.getLength() + arrowArrayOffset + 1); return importBuffer(type, 1, capacity); } From 69446bf0d0f791efde155102804b925d35eda2b0 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 26 Jul 2024 07:16:25 +0530 Subject: [PATCH 15/25] fix: adding additional test --- .../arrow/c/BufferImportTypeVisitor.java | 26 ++++------- .../org/apache/arrow/c/RoundtripTest.java | 44 +++++++++++++++++++ 2 files changed, 53 insertions(+), 17 deletions(-) diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index d9699605e56..d5fe342ca48 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -120,7 +120,9 @@ private ArrowBuf importFixedBytes(ArrowType type, int index, long bytesPerSlot) private ArrowBuf importOffsets(ArrowType type, long bytesPerSlot) { final long capacity = bytesPerSlot * (fieldNode.getLength() + arrowArrayOffset + 1); - return importBuffer(type, 1, capacity); + ArrowBuf offsets = importBuffer(type, 1, capacity); + return offsets.slice( + arrowArrayOffset * bytesPerSlot, (long) (fieldNode.getLength() + 1) * bytesPerSlot); } private ArrowBuf importData(ArrowType type, long capacity) { @@ -216,20 +218,15 @@ public List visit(ArrowType.FloatingPoint type) { @Override public List visit(ArrowType.Utf8 type) { ArrowBuf offsets = importOffsets(type, VarCharVector.OFFSET_WIDTH); - ArrowBuf adjustedOffsets = - offsets.slice( - arrowArrayOffset * VarCharVector.OFFSET_WIDTH, - (long) (fieldNode.getLength() + 1) * VarCharVector.OFFSET_WIDTH); - final int start = adjustedOffsets.getInt(0); - final int end = - adjustedOffsets.getInt((fieldNode.getLength()) * (long) VarCharVector.OFFSET_WIDTH); + final int start = offsets.getInt(0); + final int end = offsets.getInt((fieldNode.getLength()) * (long) VarCharVector.OFFSET_WIDTH); checkState( end >= start, "Offset buffer for type %s is malformed: start: %s, end: %s", type, start, end); - return Arrays.asList(maybeImportBitmap(type), adjustedOffsets, importData(type, end)); + return Arrays.asList(maybeImportBitmap(type), offsets, importData(type, end)); } private List visitVariableWidthView(ArrowType type) { @@ -286,20 +283,15 @@ public List visit(ArrowType.LargeUtf8 type) { @Override public List visit(ArrowType.Binary type) { ArrowBuf offsets = importOffsets(type, VarBinaryVector.OFFSET_WIDTH); - ArrowBuf adjustedOffsets = - offsets.slice( - arrowArrayOffset * VarBinaryVector.OFFSET_WIDTH, - (long) (fieldNode.getLength() + 1) * VarBinaryVector.OFFSET_WIDTH); - final int start = adjustedOffsets.getInt(0); - final int end = - adjustedOffsets.getInt(fieldNode.getLength() * (long) VarBinaryVector.OFFSET_WIDTH); + final int start = offsets.getInt(0); + final int end = offsets.getInt(fieldNode.getLength() * (long) VarBinaryVector.OFFSET_WIDTH); checkState( end >= start, "Offset buffer for type %s is malformed: start: %s, end: %s", type, start, end); - return Arrays.asList(maybeImportBitmap(type), adjustedOffsets, importData(type, end)); + return Arrays.asList(maybeImportBitmap(type), offsets, importData(type, end)); } @Override diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index ee6948d6746..ec20a6beda4 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -952,6 +952,50 @@ public void testImportReleasedArray() { } } + private FieldVector getSlicedVector(FieldVector vector, int offset) { + // Consumer allocates empty structures + try (ArrowSchema consumerArrowSchema = ArrowSchema.allocateNew(allocator); + ArrowArray consumerArrowArray = ArrowArray.allocateNew(allocator)) { + + // Producer creates structures from existing memory pointers + try (ArrowSchema arrowSchema = ArrowSchema.wrap(consumerArrowSchema.memoryAddress()); + ArrowArray arrowArray = ArrowArray.wrap(consumerArrowArray.memoryAddress())) { + // Producer exports vector into the C Data Interface structures + Data.exportVector(allocator, vector, null, arrowArray, arrowSchema); + } + // consumerArrowArray.snapshot().offset = offset; + + // Consumer imports vector + FieldVector imported = + Data.importVector(childAllocator, consumerArrowArray, consumerArrowSchema, null); + if (!(imported instanceof NullVector)) { + assertEquals(childAllocator, imported.getAllocator()); + } + + // Check that transfers work + TransferPair pair = imported.getTransferPair(allocator); + pair.transfer(); + return (FieldVector) pair.getTo(); + } + } + + @Test + public void testSliceVarCharVector2() { + try (final VarCharVector vector = new VarCharVector("v", allocator); + VarCharVector target = new VarCharVector("v", allocator)) { + setVector(vector, "foo", "bar", "baz1", "baz223", "baz23445", "baz2121", "12312baz"); + // slice information + final int startIndex = 2; + final int length = 3; + // create a sliced vector manually to mimic C++ slice behavior + VarCharVector slicedVector = (VarCharVector) getSlicedVector(vector, startIndex); + vector.splitAndTransferTo(startIndex, length, target); + + // assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + assertTrue(roundtrip(slicedVector, VarCharVector.class)); + } + } + @Test public void testSliceVarCharVector() { try (final VarCharVector vector = new VarCharVector("v", allocator); From a04a412d85099091e8569c6ffb9bd812de9dd708 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 30 Jul 2024 07:53:01 +0530 Subject: [PATCH 16/25] fix: adding other test skeletons with todos --- .../org/apache/arrow/c/RoundtripTest.java | 198 ++++++++++-------- java/c/src/test/python/integration_tests.py | 69 ++++++ .../testing/ValueVectorDataPopulator.java | 25 +++ 3 files changed, 209 insertions(+), 83 deletions(-) diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index ec20a6beda4..9b3209e7f58 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -77,7 +77,6 @@ import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; -import org.apache.arrow.vector.VariableWidthFieldVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.ViewVarBinaryVector; import org.apache.arrow.vector.ViewVarCharVector; @@ -952,7 +951,7 @@ public void testImportReleasedArray() { } } - private FieldVector getSlicedVector(FieldVector vector, int offset) { + private FieldVector getSlicedVector(FieldVector vector, int offset, int length) { // Consumer allocates empty structures try (ArrowSchema consumerArrowSchema = ArrowSchema.allocateNew(allocator); ArrowArray consumerArrowArray = ArrowArray.allocateNew(allocator)) { @@ -963,7 +962,11 @@ private FieldVector getSlicedVector(FieldVector vector, int offset) { // Producer exports vector into the C Data Interface structures Data.exportVector(allocator, vector, null, arrowArray, arrowSchema); } - // consumerArrowArray.snapshot().offset = offset; + + ArrowArray.Snapshot snapshot = consumerArrowArray.snapshot(); + snapshot.offset = offset; + snapshot.length = length; + consumerArrowArray.save(snapshot); // Consumer imports vector FieldVector imported = @@ -972,7 +975,7 @@ private FieldVector getSlicedVector(FieldVector vector, int offset) { assertEquals(childAllocator, imported.getAllocator()); } - // Check that transfers work + // Check whether the transfer works TransferPair pair = imported.getTransferPair(allocator); pair.transfer(); return (FieldVector) pair.getTo(); @@ -980,7 +983,7 @@ private FieldVector getSlicedVector(FieldVector vector, int offset) { } @Test - public void testSliceVarCharVector2() { + public void testSliceVariableWidthVector() { try (final VarCharVector vector = new VarCharVector("v", allocator); VarCharVector target = new VarCharVector("v", allocator)) { setVector(vector, "foo", "bar", "baz1", "baz223", "baz23445", "baz2121", "12312baz"); @@ -988,92 +991,121 @@ public void testSliceVarCharVector2() { final int startIndex = 2; final int length = 3; // create a sliced vector manually to mimic C++ slice behavior - VarCharVector slicedVector = (VarCharVector) getSlicedVector(vector, startIndex); - vector.splitAndTransferTo(startIndex, length, target); - - // assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); - assertTrue(roundtrip(slicedVector, VarCharVector.class)); + try (VarCharVector slicedVector = + (VarCharVector) getSlicedVector(vector, startIndex, length)) { + vector.splitAndTransferTo(startIndex, length, target); + assertTrue(roundtrip(slicedVector, VarCharVector.class)); + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + } } } @Test - public void testSliceVarCharVector() { - try (final VarCharVector vector = new VarCharVector("v", allocator); - VarCharVector slicedVector = new VarCharVector("v", allocator); - VarCharVector target = new VarCharVector("v", allocator)) { - slicedVector.allocateNew(); - setVector(vector, "foo", "bar", "baz1", "baz223", "baz23445", "baz2121", "12312baz"); + public void testSliceFixedWidthVector() { + // TODO: fix the import visitor to support offset + try (final IntVector vector = new IntVector("v", allocator); + IntVector target = new IntVector("v", allocator)) { + setVector(vector, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10); // slice information final int startIndex = 2; final int length = 3; // create a sliced vector manually to mimic C++ slice behavior - final ArrowBuf offsetBuffer = vector.getOffsetBuffer(); - final ArrowBuf dataBuffer = vector.getDataBuffer(); - final int offsetWidth = VarCharVector.OFFSET_WIDTH; - - final ArrowBuf expectedSliceOffSetBuffer = - offsetBuffer.slice(startIndex * offsetWidth, (length + 1) * offsetWidth); - final int sizeOfDataBuffer = expectedSliceOffSetBuffer.getInt(length * offsetWidth); - final ArrowBuf expectedSliceDataBuffer = dataBuffer.slice(0, sizeOfDataBuffer); - final ArrowBuf expectedSliceValidityBuffer = allocator.buffer(8); - expectedSliceValidityBuffer.setLong(0, 0b00000111); - - createSlicedVector( - slicedVector, - expectedSliceValidityBuffer, - expectedSliceOffSetBuffer, - expectedSliceDataBuffer, - length); - - vector.splitAndTransferTo(startIndex, length, target); - - assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); - assertTrue(roundtrip(slicedVector, VarCharVector.class)); - - expectedSliceValidityBuffer.close(); - } - } - - private static void createSlicedVector( - VariableWidthFieldVector slicedVector, - ArrowBuf expectedSliceValidityBuffer, - ArrowBuf expectedSliceOffSetBuffer, - ArrowBuf expectedSliceDataBuffer, - int length) { - final ArrowBuf sliceValidityBuffer = slicedVector.getValidityBuffer(); - final ArrowBuf slicedOffsetBuffer = slicedVector.getOffsetBuffer(); - final ArrowBuf slicedDataBuffer = slicedVector.getDataBuffer(); - - setBuffersFromExpectedBuffers( - sliceValidityBuffer, - expectedSliceValidityBuffer, - slicedOffsetBuffer, - expectedSliceOffSetBuffer, - slicedDataBuffer, - expectedSliceDataBuffer, - length); - - // setLastSet before SetValueCount to make sure fillHoles doesn't get called - slicedVector.setLastSet(length - 1); - slicedVector.setValueCount(length); - } - - private static void setBuffersFromExpectedBuffers( - ArrowBuf sliceValidityBuffer, - ArrowBuf expectedSliceValidityBuffer, - ArrowBuf slicedOffsetBuffer, - ArrowBuf expectedSliceOffSetBuffer, - ArrowBuf slicedDataBuffer, - ArrowBuf expectedSliceDataBuffer, - int length) { - final int offsetWidth = VarCharVector.OFFSET_WIDTH; - sliceValidityBuffer.setBytes( - 0, expectedSliceValidityBuffer, 0, expectedSliceValidityBuffer.capacity()); - for (int i = 0; i < length + 1; i++) { - slicedOffsetBuffer.setInt( - (long) i * offsetWidth, expectedSliceOffSetBuffer.getInt((long) i * offsetWidth)); - } - slicedDataBuffer.setBytes(0, expectedSliceDataBuffer, 0, expectedSliceDataBuffer.capacity()); + try (IntVector slicedVector = (IntVector) getSlicedVector(vector, startIndex, length)) { + vector.splitAndTransferTo(startIndex, length, target); + assertTrue(roundtrip(slicedVector, IntVector.class)); + // assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + } + } + } + + @Test + public void testSliceVariableWidthViewVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceListVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceLargeListVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceFixedSizeListVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceUnionVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceMapVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceIntVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceFloatingPointVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceLargeUtf8Vector() { + // TODO: complete this test and function + } + + @Test + public void testSliceFixedSizeBinaryVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceBoolVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceDecimalVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceDateVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceTimeVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceTimeStampVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceIntervalVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceDurationVector() { + // TODO: complete this test and function + } + + @Test + public void testSliceListViewVector() { + // TODO: complete this test and function } private VectorSchemaRoot createTestVSR() { diff --git a/java/c/src/test/python/integration_tests.py b/java/c/src/test/python/integration_tests.py index d03ba310301..769277bb080 100644 --- a/java/c/src/test/python/integration_tests.py +++ b/java/c/src/test/python/integration_tests.py @@ -258,9 +258,31 @@ def test_decimal_array(self): ] self.round_trip_array(lambda: pa.array(data, pa.decimal128(5, 2))) + def test_decimal_slice_array(self): + array_data = [ + round(decimal.Decimal(722.82), 2), + round(decimal.Decimal(-934.11), 2), + None, + round(decimal.Decimal(122.82), 2), + round(decimal.Decimal(934.11), 2), + round(decimal.Decimal(632.11), 2), + round(decimal.Decimal(312.11), 2), + round(decimal.Decimal(221.11), 2), + ] + data = pa.array(array_data, pa.decimal128(5, 2)) + sliced_array = data.slice(offset=2, length=3) + # TODO: complete this function + # self.round_trip_array(lambda: sliced_array) + def test_int_array(self): self.round_trip_array(lambda: pa.array([1, 2, 3], type=pa.int32())) + def test_int_slice_array(self): + data = pa.array([1, 2, None, 4, 5, 6, 7, 8, 9, 10], type=pa.int32()) + sliced_array = data.slice(offset=2, length=3) + # TODO: complete this function + # self.round_trip_array(lambda: sliced_array) + def test_list_array(self): self.round_trip_array(lambda: pa.array( [[], [0], [1, 2], [4, 5, 6]], pa.list_(pa.int64()) @@ -268,6 +290,16 @@ def test_list_array(self): # is not preserved during round trips (it becomes "$data$"). ), check_metadata=False) + def test_list_slice_array(self): + data = pa.array( + [[], [0], None, [1, 2], [4, 5, 6], [7, 8, 9, 10], [11, 12, 13, 14, 15]], pa.list_(pa.int64()) + # disabled check_metadata since the list internal field name ("item") + # is not preserved during round trips (it becomes "$data$"). + ) + sliced_array = data.slice(offset=2, length=3) + # TODO: complete this function + # self.round_trip_array(lambda: sliced_array, check_metadata=False) + def test_empty_list_array(self): """Validates GH-37056 fix. Empty list of int32 produces a vector with empty child data buffer, however with non-zero capacity. @@ -302,10 +334,36 @@ def test_struct_array(self): ] self.round_trip_array(lambda: pa.array(data, type=pa.struct(fields))) + def test_struct_slice_array(self): + fields = [ + ("f1", pa.int32()), + ("f2", pa.string()), + ] + array_data = [ + {"f1": 1, "f2": "a"}, + None, + {"f1": 3, "f2": None}, + {"f1": None, "f2": "d"}, + {"f1": None, "f2": None}, + {"f1": 6, "f2": "f"}, + {"f1": 7, "f2": "g"}, + {"f1": 8, "f2": "h"}, + ] + data = pa.array(array_data, type=pa.struct(fields)) + sliced_array = data.slice(offset=2, length=3) + # TODO: complete this function + # self.round_trip_array(lambda: sliced_array) + def test_dict(self): self.round_trip_array( lambda: pa.array(["a", "b", None, "d"], pa.dictionary(pa.int64(), pa.utf8()))) + def test_slice_dict(self): + data = pa.array(["a", "b", None, "d", "e", "f"], pa.dictionary(pa.int64(), pa.utf8())) + sliced_array = data.slice(offset=2, length=3) + # TODO: complete this function + # self.round_trip_array(lambda: sliced_array) + def test_map(self): offsets = [0, None, 2, 6] pykeys = [b"a", b"b", b"c", b"d", b"e", b"f"] @@ -315,6 +373,17 @@ def test_map(self): self.round_trip_array( lambda: pa.MapArray.from_arrays(offsets, keys, items)) + def test_slice_map(self): + offsets = [0, None, 2, 6] + pykeys = [b"a", b"b", b"c", b"d", b"e", b"f"] + pyitems = [1, 2, 3, None, 4, 5] + keys = pa.array(pykeys, type="binary") + items = pa.array(pyitems, type="i4") + data = pa.MapArray.from_arrays(offsets, keys, items) + sliced_array = data.slice(offset=2, length=3) + # TODO: complete this function + # self.round_trip_array(lambda: sliced_array) + def test_field(self): self.round_trip_field(lambda: pa.field("aa", pa.bool_())) diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java index 69e16dc4703..8fe1d10f4ce 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -60,6 +60,7 @@ import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VariableWidthFieldVector; +import org.apache.arrow.vector.ViewVarCharVector; import org.apache.arrow.vector.complex.BaseRepeatedValueVector; import org.apache.arrow.vector.complex.BaseRepeatedValueViewVector; import org.apache.arrow.vector.complex.FixedSizeListVector; @@ -567,6 +568,18 @@ public static void setVector(VarCharVector vector, byte[]... values) { vector.setValueCount(length); } + /** Populate values for ViewVarCharVector. */ + public static void setVector(ViewVarCharVector vector, byte[]... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + public static void setVector(VariableWidthFieldVector vector, byte[]... values) { final int length = values.length; vector.allocateNewSafe(); @@ -602,6 +615,18 @@ public static void setVector(VarCharVector vector, String... values) { vector.setValueCount(length); } + /** Populate values for VarCharVector. */ + public static void setVector(ViewVarCharVector vector, String... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.setSafe(i, values[i].getBytes(StandardCharsets.UTF_8)); + } + } + vector.setValueCount(length); + } + /** Populate values for LargeVarCharVector. */ public static void setVector(LargeVarCharVector vector, String... values) { final int length = values.length; From c3424b876e22cf3b8c1c3e8beda919061fd03dda Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 31 Jul 2024 07:08:06 +0530 Subject: [PATCH 17/25] fix: adding experimental offset method for data slicing --- .../arrow/c/BufferImportTypeVisitor.java | 48 ++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index d5fe342ca48..a3b56ef8830 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -113,11 +113,42 @@ private ArrowBuf importFixedBits(ArrowType type, int index, long bitsPerSlot) { return importBuffer(type, index, capacity); } + private ArrowBuf importFixedBitsWithOffset(ArrowType type, int index, long bitsPerSlot) { + // TODO: merge with importFixedBits + // Calculate the total capacity needed, including the offset + final long totalSlots = arrowArrayOffset + fieldNode.getLength(); + final long totalBits = totalSlots * bitsPerSlot; + // final long capacity = DataSizeRoundingUtil.divideBy8Ceil(totalBits); + + // Import the buffer with the calculated capacity + ArrowBuf buf = importBuffer(type, index, totalBits); + + // Calculate the start and end positions in bits + final long startBit = arrowArrayOffset * bitsPerSlot; + final long endBit = (arrowArrayOffset + fieldNode.getLength()) * bitsPerSlot; + + // Calculate the start and end positions in bytes + // TODO: this cannot process bit boundaries in slicing + final long startByte = DataSizeRoundingUtil.divideBy8Ceil(startBit); + final long endByte = DataSizeRoundingUtil.divideBy8Ceil(endBit); + + if (startByte != endByte) { + return buf.slice(startByte, endByte - startByte); + } + return buf; + } + private ArrowBuf importFixedBytes(ArrowType type, int index, long bytesPerSlot) { final long capacity = bytesPerSlot * fieldNode.getLength(); return importBuffer(type, index, capacity); } + private ArrowBuf importFixedBytesWithOffset(ArrowType type, int index, long bytesPerSlot) { + final long capacity = bytesPerSlot * (fieldNode.getLength() + arrowArrayOffset); + ArrowBuf buf = importBuffer(type, index, capacity); + return buf.slice(arrowArrayOffset * bytesPerSlot, fieldNode.getLength() * bytesPerSlot); + } + private ArrowBuf importOffsets(ArrowType type, long bytesPerSlot) { final long capacity = bytesPerSlot * (fieldNode.getLength() + arrowArrayOffset + 1); ArrowBuf offsets = importBuffer(type, 1, capacity); @@ -142,6 +173,20 @@ private ArrowBuf maybeImportBitmap(ArrowType type) { return importFixedBits(type, 0, /*bitsPerSlot=*/ 1); } + private ArrowBuf maybeImportBitmapWithOffset(ArrowType type) { + // TODO: merge with maybeImportBitMap + checkState( + buffers.length > 0, + "Expected at least %s buffers for type %s, but found %s", + 1, + type, + buffers.length); + if (buffers[0] == NULL) { + return null; + } + return importFixedBitsWithOffset(type, 0, /*bitsPerSlot=*/ 1); + } + @Override public List visit(ArrowType.Null type) { checkState( @@ -195,7 +240,8 @@ public List visit(ArrowType.Map type) { @Override public List visit(ArrowType.Int type) { - return Arrays.asList(maybeImportBitmap(type), importFixedBits(type, 1, type.getBitWidth())); + return Arrays.asList( + maybeImportBitmapWithOffset(type), importFixedBitsWithOffset(type, 1, type.getBitWidth())); } @Override From 9563162ff57d3ab7da6961de906854d867c9f5b0 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 1 Aug 2024 12:13:59 +0530 Subject: [PATCH 18/25] fix: temp --- .../apache/arrow/c/BufferImportTypeVisitor.java | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index a3b56ef8830..17f070768fa 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -28,6 +28,7 @@ import org.apache.arrow.util.AutoCloseables; import org.apache.arrow.util.VisibleForTesting; import org.apache.arrow.vector.BaseVariableWidthViewVector; +import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DateMilliVector; import org.apache.arrow.vector.DurationVector; @@ -118,10 +119,10 @@ private ArrowBuf importFixedBitsWithOffset(ArrowType type, int index, long bitsP // Calculate the total capacity needed, including the offset final long totalSlots = arrowArrayOffset + fieldNode.getLength(); final long totalBits = totalSlots * bitsPerSlot; - // final long capacity = DataSizeRoundingUtil.divideBy8Ceil(totalBits); + final long capacity = DataSizeRoundingUtil.divideBy8Ceil(totalBits); // Import the buffer with the calculated capacity - ArrowBuf buf = importBuffer(type, index, totalBits); + ArrowBuf buf = importBuffer(type, index, capacity); // Calculate the start and end positions in bits final long startBit = arrowArrayOffset * bitsPerSlot; @@ -134,7 +135,17 @@ private ArrowBuf importFixedBitsWithOffset(ArrowType type, int index, long bitsP if (startByte != endByte) { return buf.slice(startByte, endByte - startByte); - } + } // else { + // final ArrowBuf bufCopy = allocator.buffer(buf.capacity()); + // bufCopy.setZero(0, buf.capacity()); + // + // for (int i = (int) arrowArrayOffset; i < bufCopy.capacity(); i++) { + // if (BitVectorHelper.get(buf, i) == 1) { + // BitVectorHelper.setBit(buf, i); + // } + // } + // return bufCopy; + // } return buf; } From f2f5fcbf0234a8827d648a2761e491f949f8d614 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 1 Aug 2024 17:37:50 +0530 Subject: [PATCH 19/25] fix: adding tests for Int, List, LargeList, Utf8, Binary --- .../arrow/c/BufferImportTypeVisitor.java | 39 ++++--- .../org/apache/arrow/c/RoundtripTest.java | 108 ++++++++++++++++-- .../arrow/vector/complex/LargeListVector.java | 13 +++ .../arrow/vector/complex/ListVector.java | 13 +++ 4 files changed, 149 insertions(+), 24 deletions(-) diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index 17f070768fa..9a4983fbcba 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -135,18 +135,24 @@ private ArrowBuf importFixedBitsWithOffset(ArrowType type, int index, long bitsP if (startByte != endByte) { return buf.slice(startByte, endByte - startByte); - } // else { - // final ArrowBuf bufCopy = allocator.buffer(buf.capacity()); - // bufCopy.setZero(0, buf.capacity()); - // - // for (int i = (int) arrowArrayOffset; i < bufCopy.capacity(); i++) { - // if (BitVectorHelper.get(buf, i) == 1) { - // BitVectorHelper.setBit(buf, i); - // } - // } - // return bufCopy; - // } - return buf; + } else { + ArrowBuf bufCopy = allocator.buffer(buf.capacity()); + bufCopy.setZero(0, buf.capacity()); + for (int i = 0; i < bufCopy.capacity() * 8; i++) { + int bitIndex = (int) (i + arrowArrayOffset); + if (bitIndex < buf.capacity() * 8) { + if (BitVectorHelper.get(buf, bitIndex) == 1) { + BitVectorHelper.setBit(bufCopy, i); + } else { + BitVectorHelper.unsetBit(bufCopy, i); + } + } else { + BitVectorHelper.unsetBit(bufCopy, i); + } + } + imported.add(bufCopy); + return bufCopy; + } } private ArrowBuf importFixedBytes(ArrowType type, int index, long bytesPerSlot) { @@ -216,13 +222,14 @@ public List visit(ArrowType.Struct type) { @Override public List visit(ArrowType.List type) { - return Arrays.asList(maybeImportBitmap(type), importOffsets(type, ListVector.OFFSET_WIDTH)); + return Arrays.asList( + maybeImportBitmapWithOffset(type), importOffsets(type, ListVector.OFFSET_WIDTH)); } @Override public List visit(ArrowType.LargeList type) { return Arrays.asList( - maybeImportBitmap(type), importOffsets(type, LargeListVector.OFFSET_WIDTH)); + maybeImportBitmapWithOffset(type), importOffsets(type, LargeListVector.OFFSET_WIDTH)); } @Override @@ -283,7 +290,7 @@ public List visit(ArrowType.Utf8 type) { type, start, end); - return Arrays.asList(maybeImportBitmap(type), offsets, importData(type, end)); + return Arrays.asList(maybeImportBitmapWithOffset(type), offsets, importData(type, end)); } private List visitVariableWidthView(ArrowType type) { @@ -348,7 +355,7 @@ public List visit(ArrowType.Binary type) { type, start, end); - return Arrays.asList(maybeImportBitmap(type), offsets, importData(type, end)); + return Arrays.asList(maybeImportBitmapWithOffset(type), offsets, importData(type, end)); } @Override diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index 9b3209e7f58..dff4db8e404 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -986,10 +986,21 @@ private FieldVector getSlicedVector(FieldVector vector, int offset, int length) public void testSliceVariableWidthVector() { try (final VarCharVector vector = new VarCharVector("v", allocator); VarCharVector target = new VarCharVector("v", allocator)) { - setVector(vector, "foo", "bar", "baz1", "baz223", "baz23445", "baz2121", "12312baz"); + setVector( + vector, + "foo", + "bar", + "baz1", + "", + "baz23445", + null, + "12312baz", + "baz11", + "baz22", + "baz33"); // slice information final int startIndex = 2; - final int length = 3; + final int length = 6; // create a sliced vector manually to mimic C++ slice behavior try (VarCharVector slicedVector = (VarCharVector) getSlicedVector(vector, startIndex, length)) { @@ -1000,20 +1011,48 @@ public void testSliceVariableWidthVector() { } } + @Test + public void testSliceVarBinaryVector() { + try (final VarBinaryVector vector = new VarBinaryVector("v", allocator); + VarBinaryVector target = new VarBinaryVector("v", allocator)) { + setVector( + vector, + new byte[] {0x01, 0x02, 0x03}, + new byte[] {0x04, 0x05}, + new byte[] {0x06, 0x07, 0x08, 0x09}, + new byte[] {}, + new byte[] {0x0A, 0x0B, 0x0C, 0x0D, 0x0E}, + null, + new byte[] {0x0F, 0x10, 0x11}, + new byte[] {0x12, 0x13}, + new byte[] {0x14, 0x15, 0x16}, + new byte[] {0x17, 0x18, 0x19, 0x1A}); + // slice information + final int startIndex = 2; + final int length = 6; + // create a sliced vector manually to mimic C++ slice behavior + try (VarBinaryVector slicedVector = + (VarBinaryVector) getSlicedVector(vector, startIndex, length)) { + vector.splitAndTransferTo(startIndex, length, target); + assertTrue(roundtrip(slicedVector, VarBinaryVector.class)); + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + } + } + } + @Test public void testSliceFixedWidthVector() { - // TODO: fix the import visitor to support offset try (final IntVector vector = new IntVector("v", allocator); IntVector target = new IntVector("v", allocator)) { - setVector(vector, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10); + setVector(vector, 1, 2, null, 3, 4, null, 6, 7, 8, 9, 10); // slice information final int startIndex = 2; - final int length = 3; + final int length = 6; // create a sliced vector manually to mimic C++ slice behavior try (IntVector slicedVector = (IntVector) getSlicedVector(vector, startIndex, length)) { vector.splitAndTransferTo(startIndex, length, target); assertTrue(roundtrip(slicedVector, IntVector.class)); - // assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); } } } @@ -1025,12 +1064,65 @@ public void testSliceVariableWidthViewVector() { @Test public void testSliceListVector() { - // TODO: complete this test and function + try (final ListVector vector = ListVector.empty("v", allocator); + ListVector target = ListVector.empty("v", allocator)) { + // Set values in the ListVector + setVector( + vector, + Arrays.asList(1, 2, 3), + Arrays.asList(4, 5), + Arrays.asList(6, 7, 8, 9), + Collections.emptyList(), + Arrays.asList(10, 11, 12, 13, 14), + null, + Arrays.asList(15, 16, 17), + Arrays.asList(18, 19), + Arrays.asList(20, 21, 22), + Arrays.asList(23, 24, 25, 26)); + + // Slice information + final int startIndex = 2; + final int length = 6; + + // Create a sliced vector manually to mimic C++ slice behavior + try (ListVector slicedVector = (ListVector) getSlicedVector(vector, startIndex, length)) { + vector.splitAndTransferTo(startIndex, length, target); + assertTrue(roundtrip(slicedVector, ListVector.class)); + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + } + } } @Test public void testSliceLargeListVector() { - // TODO: complete this test and function + try (final LargeListVector vector = LargeListVector.empty("v", allocator); + LargeListVector target = LargeListVector.empty("v", allocator)) { + // Set values in the LargeListVector + setVector( + vector, + Arrays.asList(1, 2, 3), + Arrays.asList(4, 5), + Arrays.asList(6, 7, 8, 9), + Collections.emptyList(), + Arrays.asList(10, 11, 12, 13, 14), + null, + Arrays.asList(15, 16, 17), + Arrays.asList(18, 19), + Arrays.asList(20, 21, 22), + Arrays.asList(23, 24, 25, 26)); + + // Slice information + final int startIndex = 2; + final int length = 6; + + // Create a sliced vector manually to mimic C++ slice behavior + try (LargeListVector slicedVector = + (LargeListVector) getSlicedVector(vector, startIndex, length)) { + vector.splitAndTransferTo(startIndex, length, target); + assertTrue(roundtrip(slicedVector, LargeListVector.class)); + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + } + } } @Test diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java index ed075352c93..e7c85b184a9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java @@ -1124,4 +1124,17 @@ public long getElementStartIndex(int index) { public long getElementEndIndex(int index) { return offsetBuffer.getLong(((long) index + 1L) * OFFSET_WIDTH); } + + /** + * Slice this vector at desired index and length and transfer the corresponding data to the target + * vector. + * + * @param startIndex start position of the split in source vector. + * @param length length of the split. + * @param target destination vector + */ + public void splitAndTransferTo(int startIndex, int length, LargeListVector target) { + TransferImpl transfer = new TransferImpl(target); + transfer.splitAndTransfer(startIndex, length); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 76682c28fe6..23703d413f3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -648,6 +648,19 @@ public void copyValueSafe(int from, int to) { } } + /** + * Slice this vector at desired index and length and transfer the corresponding data to the target + * vector. + * + * @param startIndex start position of the split in source vector. + * @param length length of the split. + * @param target destination vector + */ + public void splitAndTransferTo(int startIndex, int length, ListVector target) { + TransferImpl transfer = new TransferImpl(target); + transfer.splitAndTransfer(startIndex, length); + } + @Override protected FieldReader getReaderImpl() { return new UnionListReader(this); From c2d23855eeaf7bdd23ff4590b93d2790bf1248aa Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 6 Aug 2024 17:24:13 +0530 Subject: [PATCH 20/25] fix: adding test cases and updates for variable-width view vector --- .../arrow/c/BufferImportTypeVisitor.java | 4 +- .../org/apache/arrow/c/RoundtripTest.java | 61 ++++++++++++++++++- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index 9a4983fbcba..322ec5f9873 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -304,8 +304,8 @@ private List visitVariableWidthView(ArrowType type) { importBuffer(type, variadicSizeBufferIndex, variadicSizeBufferCapacity); ArrowBuf view = - importFixedBytes(type, viewBufferIndex, BaseVariableWidthViewVector.ELEMENT_SIZE); - buffers.add(maybeImportBitmap(type)); + importFixedBytesWithOffset(type, viewBufferIndex, BaseVariableWidthViewVector.ELEMENT_SIZE); + buffers.add(maybeImportBitmapWithOffset(type)); buffers.add(view); // 0th buffer is validity buffer diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index dff4db8e404..9299a0d7e9d 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -1058,8 +1058,65 @@ public void testSliceFixedWidthVector() { } @Test - public void testSliceVariableWidthViewVector() { - // TODO: complete this test and function + public void testSliceViewVarCharVector() { + try (final ViewVarCharVector vector = new ViewVarCharVector("vu", allocator); + ViewVarCharVector target = new ViewVarCharVector("vu", allocator)) { + setVector( + vector, + "foo", + "bar", + "baz1", + "", + "baz1234567890123", + null, + "12312baz", + "baz11", + "baz22", + "baz33"); + // slice information + final int startIndex = 2; + final int length = 6; + // create a sliced vector manually to mimic C++ slice behavior + try (ViewVarCharVector slicedVector = + (ViewVarCharVector) getSlicedVector(vector, startIndex, length)) { + vector.splitAndTransferTo(startIndex, length, target); + assertTrue(roundtrip(slicedVector, ViewVarCharVector.class)); + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + } + } + } + + @Test + public void testSliceViewVarBinaryVector() { + try (final ViewVarBinaryVector vector = new ViewVarBinaryVector("vz", allocator); + ViewVarBinaryVector target = new ViewVarBinaryVector("vz", allocator)) { + setVector( + vector, + new byte[] {0x66, 0x6F, 0x6F}, // "foo" + new byte[] {0x62, 0x61, 0x72}, // "bar" + new byte[] {0x62, 0x61, 0x7A, 0x31}, // "baz1" + new byte[] {}, // empty + new byte[] { + 0x62, 0x61, 0x7A, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, 0x31, + 0x32, 0x33 + }, // "baz1234567890123" + null, // null + new byte[] {0x31, 0x32, 0x33, 0x31, 0x32, 0x62, 0x61, 0x7A}, // "12312baz" + new byte[] {0x62, 0x61, 0x7A, 0x31, 0x31}, // "baz11" + new byte[] {0x62, 0x61, 0x7A, 0x32, 0x32}, // "baz22" + new byte[] {0x62, 0x61, 0x7A, 0x33, 0x33} // "baz33" + ); + // slice information + final int startIndex = 2; + final int length = 6; + // create a sliced vector manually to mimic C++ slice behavior + try (ViewVarBinaryVector slicedVector = + (ViewVarBinaryVector) getSlicedVector(vector, startIndex, length)) { + vector.splitAndTransferTo(startIndex, length, target); + assertTrue(roundtrip(slicedVector, ViewVarBinaryVector.class)); + assertTrue(VectorEqualsVisitor.vectorEquals(target, slicedVector)); + } + } } @Test From 6774efe27ea8640b7e6ba57bb530a5875d067df7 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 8 Aug 2024 15:33:52 +0530 Subject: [PATCH 21/25] fix: temp commit for fixedsizelist --- .../main/java/org/apache/arrow/c/ArrayImporter.java | 6 +++++- .../org/apache/arrow/c/BufferImportTypeVisitor.java | 2 +- .../arrow/vector/complex/FixedSizeListVector.java | 13 +++++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/java/c/src/main/java/org/apache/arrow/c/ArrayImporter.java b/java/c/src/main/java/org/apache/arrow/c/ArrayImporter.java index a6f8fa72ef6..a09c4f90980 100644 --- a/java/c/src/main/java/org/apache/arrow/c/ArrayImporter.java +++ b/java/c/src/main/java/org/apache/arrow/c/ArrayImporter.java @@ -98,7 +98,11 @@ private void doImport(ArrowArray.Snapshot snapshot) { checkState(children[i] != NULL, "ArrowArray struct has NULL child at position %s", i); ArrayImporter childImporter = new ArrayImporter(allocator, childVectors.get(i), dictionaryProvider); - childImporter.importChild(this, ArrowArray.wrap(children[i])); + ArrowArray childArray = ArrowArray.wrap(children[i]); + ArrowArray.Snapshot childSnapshot = childArray.snapshot(); + childSnapshot.offset = snapshot.offset; + childArray.save(childSnapshot); + childImporter.importChild(this, childArray); } } diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index 322ec5f9873..1b3a21fe4b0 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -234,7 +234,7 @@ public List visit(ArrowType.LargeList type) { @Override public List visit(ArrowType.FixedSizeList type) { - return Collections.singletonList(maybeImportBitmap(type)); + return Collections.singletonList(maybeImportBitmapWithOffset(type)); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java index c762eb51725..d120526c8bf 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java @@ -730,4 +730,17 @@ public void copyValueSafe(int fromIndex, int toIndex) { } } } + + /** + * Slice this vector at desired index and length and transfer the corresponding data to the target + * vector. + * + * @param startIndex start position of the split in source vector. + * @param length length of the split. + * @param target destination vector + */ + public void splitAndTransferTo(int startIndex, int length, FixedSizeListVector target) { + TransferImpl transfer = new TransferImpl(target); + transfer.splitAndTransfer(startIndex, length); + } } From 949706698209c82304c682587feb01dabcd7ee2d Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 14 Aug 2024 14:32:27 +0530 Subject: [PATCH 22/25] fix: test ci log enabling --- .github/workflows/java.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 0317879b580..c42d70fa038 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -63,6 +63,7 @@ jobs: env: JDK: ${{ matrix.jdk }} MAVEN: ${{ matrix.maven }} + MAVEN_OPTS: -Darrow.memory.debug.allocator=true steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 From 7e591b3ee236294309bb2cb403d89773b06fb2ed Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 14 Aug 2024 15:27:00 +0530 Subject: [PATCH 23/25] fix: test 3 --- ci/scripts/java_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index 212ec6eb114..1d4e2046357 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -102,7 +102,7 @@ if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then # HTTP pooling is turned of to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 # GH-43378: Maven site plugins not compatible with multithreading mkdir -p ${build_dir}/docs/java/reference - ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false clean install site + ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false -Darrow.memory.debug.allocator=true clean install site rsync -a target/site/apidocs/ ${build_dir}/docs/java/reference fi From f3cf3ff1c1ed3cc88ddee415ea488aa865e0cdde Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 14 Aug 2024 15:41:43 +0530 Subject: [PATCH 24/25] fix: test 4 --- ci/scripts/java_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index 1d4e2046357..0bd22e138e0 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -96,7 +96,7 @@ if [ "${ARROW_JAVA_JNI}" = "ON" ]; then fi # Use `2 * ncores` threads -${mvn} -T 2C clean install +${mvn} -T 2C clean install -Darrow.memory.debug.allocator=true if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then # HTTP pooling is turned of to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 From 07660f169406dcefea05e07ab7e3cce245be01ef Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 14 Aug 2024 16:55:48 +0530 Subject: [PATCH 25/25] fix: test 5 --- .github/workflows/java.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index c42d70fa038..85889901d92 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -87,6 +87,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} DEVELOCITY_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + JAVA_TOOL_OPTIONS: -Darrow.memory.debug.allocator=true run: | archery docker run \ -e CI=true \