diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs index 7b988bb07478..088a0a6ab58a 100644 --- a/arrow-array/src/ffi.rs +++ b/arrow-array/src/ffi.rs @@ -425,24 +425,32 @@ impl<'a> ImportedArrowArray<'a> { (length + 1) * (bits / 8) } (DataType::Utf8, 2) | (DataType::Binary, 2) => { - // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) + // the len of the data buffer (buffer 2) equals the difference between the last value + // and the first value of the offset buffer (buffer 1). let len = self.buffer_len(1, dt)?; // first buffer is the null buffer => add(1) // we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets. #[allow(clippy::cast_ptr_alignment)] let offset_buffer = self.array.buffer(1) as *const i32; + // get first offset + let start = (unsafe { *offset_buffer.add(0) }) as usize; // get last offset - (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize + let end = (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize; + end - start } (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => { - // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) + // the len of the data buffer (buffer 2) equals the difference between the last value + // and the first value of the offset buffer (buffer 1). let len = self.buffer_len(1, dt)?; // first buffer is the null buffer => add(1) // we assume that pointer is aligned for `i64`, as Large uses `i64` offsets. #[allow(clippy::cast_ptr_alignment)] let offset_buffer = self.array.buffer(1) as *const i64; + // get first offset + let start = (unsafe { *offset_buffer.add(0) }) as usize; // get last offset - (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize + let end = (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize; + end - start } // buffer len of primitive types _ => { @@ -1216,7 +1224,7 @@ mod tests_to_then_from_ffi { mod tests_from_ffi { use std::sync::Arc; - use arrow_buffer::{bit_util, buffer::Buffer}; + use arrow_buffer::{bit_util, buffer::Buffer, MutableBuffer, OffsetBuffer}; use arrow_data::ArrayData; use arrow_schema::{DataType, Field}; @@ -1228,7 +1236,7 @@ mod tests_from_ffi { ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema}, }; - use super::Result; + use super::{ImportedArrowArray, Result}; fn test_round_trip(expected: &ArrayData) -> Result<()> { // here we export the array @@ -1420,4 +1428,34 @@ mod tests_from_ffi { let data = array.into_data(); test_round_trip(&data) } + + #[test] + fn test_empty_string_with_non_zero_offset() -> Result<()> { + // Simulate an empty string array with a non-zero offset from a producer + let data: Buffer = MutableBuffer::new(0).into(); + let offsets = OffsetBuffer::new(vec![123].into()); + let string_array = + unsafe { StringArray::new_unchecked(offsets.clone(), data.clone(), None) }; + + let data = string_array.into_data(); + + let array = FFI_ArrowArray::new(&data); + let schema = FFI_ArrowSchema::try_from(data.data_type())?; + + let dt = DataType::try_from(&schema)?; + let array = Arc::new(array); + let imported_array = ImportedArrowArray { + array: &array, + data_type: dt, + owner: &array, + }; + + let offset_buf_len = imported_array.buffer_len(1, &imported_array.data_type)?; + let data_buf_len = imported_array.buffer_len(2, &imported_array.data_type)?; + + assert_eq!(offset_buf_len, 4); + assert_eq!(data_buf_len, 0); + + test_round_trip(&imported_array.consume()?) + } } diff --git a/arrow-data/src/equal/variable_size.rs b/arrow-data/src/equal/variable_size.rs index 92f00818b4a0..d6e8e6a95481 100644 --- a/arrow-data/src/equal/variable_size.rs +++ b/arrow-data/src/equal/variable_size.rs @@ -32,17 +32,18 @@ fn offset_value_equal( ) -> bool { let lhs_start = lhs_offsets[lhs_pos].as_usize(); let rhs_start = rhs_offsets[rhs_pos].as_usize(); - let lhs_len = lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos]; - let rhs_len = rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos]; + let lhs_len = (lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos]) + .to_usize() + .unwrap(); + let rhs_len = (rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos]) + .to_usize() + .unwrap(); - lhs_len == rhs_len - && equal_len( - lhs_values, - rhs_values, - lhs_start, - rhs_start, - lhs_len.to_usize().unwrap(), - ) + if lhs_len == 0 && rhs_len == 0 { + return true; + } + + lhs_len == rhs_len && equal_len(lhs_values, rhs_values, lhs_start, rhs_start, lhs_len) } pub(super) fn variable_sized_equal(