From 76d1d7875df8f4a86771cb26b5474e70dca93e36 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 9 May 2024 23:45:57 -0700 Subject: [PATCH 1/5] Compute data buffer length by offset buffer start and end values --- arrow-array/src/ffi.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs index 7b988bb07478..c6ec5f3d126a 100644 --- a/arrow-array/src/ffi.rs +++ b/arrow-array/src/ffi.rs @@ -431,8 +431,11 @@ impl<'a> ImportedArrowArray<'a> { // we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets. #[allow(clippy::cast_ptr_alignment)] let offset_buffer = self.array.buffer(1) as *const i32; + // get first offset + let start = (unsafe { *offset_buffer.add(0) }) as usize; // get last offset - (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize + let end = (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize; + end - start } (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) @@ -441,8 +444,11 @@ impl<'a> ImportedArrowArray<'a> { // we assume that pointer is aligned for `i64`, as Large uses `i64` offsets. #[allow(clippy::cast_ptr_alignment)] let offset_buffer = self.array.buffer(1) as *const i64; + // get first offset + let start = (unsafe { *offset_buffer.add(0) }) as usize; // get last offset - (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize + let end = (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize; + end - start } // buffer len of primitive types _ => { From ec5bc9a99da92718207af18602aa0cc0806c162b Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 10 May 2024 06:47:41 -0700 Subject: [PATCH 2/5] Update code comment --- arrow-array/src/ffi.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs index c6ec5f3d126a..cc6e601f63ef 100644 --- a/arrow-array/src/ffi.rs +++ b/arrow-array/src/ffi.rs @@ -425,7 +425,8 @@ impl<'a> ImportedArrowArray<'a> { (length + 1) * (bits / 8) } (DataType::Utf8, 2) | (DataType::Binary, 2) => { - // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) + // the len of the data buffer (buffer 2) equals the difference between the last value + // and the first value of the offset buffer (buffer 1). let len = self.buffer_len(1, dt)?; // first buffer is the null buffer => add(1) // we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets. @@ -438,7 +439,8 @@ impl<'a> ImportedArrowArray<'a> { end - start } (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => { - // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) + // the len of the data buffer (buffer 2) equals the difference between the last value + // and the first value of the offset buffer (buffer 1). let len = self.buffer_len(1, dt)?; // first buffer is the null buffer => add(1) // we assume that pointer is aligned for `i64`, as Large uses `i64` offsets. From 0b928a621befdcabea6934f6cc901c3fbb5b1f7e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 11 May 2024 11:14:47 -0700 Subject: [PATCH 3/5] Add unit test --- arrow-array/src/ffi.rs | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs index cc6e601f63ef..311b5b83e98e 100644 --- a/arrow-array/src/ffi.rs +++ b/arrow-array/src/ffi.rs @@ -1224,7 +1224,7 @@ mod tests_to_then_from_ffi { mod tests_from_ffi { use std::sync::Arc; - use arrow_buffer::{bit_util, buffer::Buffer}; + use arrow_buffer::{bit_util, buffer::Buffer, MutableBuffer, OffsetBuffer}; use arrow_data::ArrayData; use arrow_schema::{DataType, Field}; @@ -1236,7 +1236,7 @@ mod tests_from_ffi { ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema}, }; - use super::Result; + use super::{ImportedArrowArray, Result}; fn test_round_trip(expected: &ArrayData) -> Result<()> { // here we export the array @@ -1428,4 +1428,34 @@ mod tests_from_ffi { let data = array.into_data(); test_round_trip(&data) } + + #[test] + fn test_empty_string_with_non_zero_offset() -> Result<()> { + // Simulate an empty string array with a non-zero offset from a producer + let data: Buffer = MutableBuffer::new(0).into(); + let offsets = OffsetBuffer::new(vec![123].into()); + let string_array = + unsafe { StringArray::new_unchecked(offsets.clone(), data.clone(), None) }; + + let data = string_array.into_data(); + + let array = FFI_ArrowArray::new(&data); + let schema = FFI_ArrowSchema::try_from(data.data_type())?; + + let dt = DataType::try_from(&schema)?; + let array = Arc::new(array); + let imported_array = ImportedArrowArray { + array: &array, + data_type: dt, + owner: &array, + }; + + let offset_buf_len = imported_array.buffer_len(1, &imported_array.data_type)?; + let data_buf_len = imported_array.buffer_len(2, &imported_array.data_type)?; + + assert_eq!(offset_buf_len, 4); + assert_eq!(data_buf_len, 0); + + Ok(()) + } } From 555cdfd9a74e15400c5a4d8c550fb7e614838eb1 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 11 May 2024 11:33:32 -0700 Subject: [PATCH 4/5] Add round_trip check --- arrow-array/src/ffi.rs | 2 ++ arrow-data/src/equal/variable_size.rs | 21 +++++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs index 311b5b83e98e..4ca95f22888a 100644 --- a/arrow-array/src/ffi.rs +++ b/arrow-array/src/ffi.rs @@ -1456,6 +1456,8 @@ mod tests_from_ffi { assert_eq!(offset_buf_len, 4); assert_eq!(data_buf_len, 0); + test_round_trip(&imported_array.consume()?); + Ok(()) } } diff --git a/arrow-data/src/equal/variable_size.rs b/arrow-data/src/equal/variable_size.rs index 92f00818b4a0..d6e8e6a95481 100644 --- a/arrow-data/src/equal/variable_size.rs +++ b/arrow-data/src/equal/variable_size.rs @@ -32,17 +32,18 @@ fn offset_value_equal( ) -> bool { let lhs_start = lhs_offsets[lhs_pos].as_usize(); let rhs_start = rhs_offsets[rhs_pos].as_usize(); - let lhs_len = lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos]; - let rhs_len = rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos]; + let lhs_len = (lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos]) + .to_usize() + .unwrap(); + let rhs_len = (rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos]) + .to_usize() + .unwrap(); - lhs_len == rhs_len - && equal_len( - lhs_values, - rhs_values, - lhs_start, - rhs_start, - lhs_len.to_usize().unwrap(), - ) + if lhs_len == 0 && rhs_len == 0 { + return true; + } + + lhs_len == rhs_len && equal_len(lhs_values, rhs_values, lhs_start, rhs_start, lhs_len) } pub(super) fn variable_sized_equal( From 11fa8befd89b702a2e8a50900d1747116fb2a0de Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 11 May 2024 12:14:02 -0700 Subject: [PATCH 5/5] Fix clippy --- arrow-array/src/ffi.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs index 4ca95f22888a..088a0a6ab58a 100644 --- a/arrow-array/src/ffi.rs +++ b/arrow-array/src/ffi.rs @@ -1456,8 +1456,6 @@ mod tests_from_ffi { assert_eq!(offset_buf_len, 4); assert_eq!(data_buf_len, 0); - test_round_trip(&imported_array.consume()?); - - Ok(()) + test_round_trip(&imported_array.consume()?) } }