From 04a9889ac7bced7265643b5a08738cc3fbe2f365 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Thu, 6 Oct 2022 13:39:43 +0100 Subject: [PATCH 1/2] Handle empty offsets buffer (#1824) --- arrow-array/src/array/binary_array.rs | 29 +++++++++++++++++-- arrow-array/src/array/list_array.rs | 41 +++++++++++++++++++++++++-- arrow-array/src/array/string_array.rs | 29 +++++++++++++++++-- 3 files changed, 92 insertions(+), 7 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index cb168daf0720..4d39f0b1ae55 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -17,7 +17,10 @@ use crate::iterator::GenericBinaryIter; use crate::raw_pointer::RawPtrBox; -use crate::{print_long_array, Array, ArrayAccessor, GenericListArray, OffsetSizeTrait}; +use crate::{ + empty_offsets, print_long_array, Array, ArrayAccessor, GenericListArray, + OffsetSizeTrait, +}; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::DataType; @@ -286,7 +289,11 @@ impl From for GenericBinaryArray empty_offsets::().as_ptr() as *const _, + false => data.buffers()[0].as_ptr(), + }; let values = data.buffers()[1].as_ptr(); Self { data, @@ -845,4 +852,22 @@ mod tests { .validate_full() .expect("All null array has valid array data"); } + + #[test] + fn test_empty_offsets() { + let string = BinaryArray::from( + ArrayData::builder(DataType::Binary) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[0]); + let string = LargeBinaryArray::from( + ArrayData::builder(DataType::LargeBinary) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[0]); + } } diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index b45a0f9257f2..b1b243823ed7 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -43,6 +43,17 @@ impl OffsetSizeTrait for i64 { const PREFIX: &'static str = "Large"; } +/// Returns a slice of `OffsetSize` consisting of a single zero value +#[inline] +pub(crate) fn empty_offsets() -> &'static [OffsetSize] { + static OFFSET: &[i64] = &[0]; + // SAFETY: + // OffsetSize is ArrowNativeType and is therefore trivially transmutable + let (prefix, val, suffix) = unsafe { OFFSET.align_to::() }; + assert!(prefix.is_empty() && suffix.is_empty()); + val +} + /// Generic struct for a variable-size list array. /// /// Columnar format in Apache Arrow: @@ -240,8 +251,13 @@ impl GenericListArray { } let values = make_array(values); - let value_offsets = data.buffers()[0].as_ptr(); - let value_offsets = unsafe { RawPtrBox::::new(value_offsets) }; + // Handle case of empty offsets + let offsets = match data.is_empty() && data.buffers()[0].is_empty() { + true => empty_offsets::().as_ptr() as *const _, + false => data.buffers()[0].as_ptr(), + }; + + let value_offsets = unsafe { RawPtrBox::new(offsets) }; Ok(Self { data, values, @@ -941,4 +957,25 @@ mod tests { false, ); } + + #[test] + fn test_empty_offsets() { + let f = Box::new(Field::new("element", DataType::Int32, true)); + let string = ListArray::from( + ArrayData::builder(DataType::List(f.clone())) + .buffers(vec![Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[0]); + let string = LargeListArray::from( + ArrayData::builder(DataType::LargeList(f)) + .buffers(vec![Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[0]); + } } diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 22ad81eaa3f9..8129ca5f085d 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -18,8 +18,8 @@ use crate::iterator::GenericStringIter; use crate::raw_pointer::RawPtrBox; use crate::{ - print_long_array, Array, ArrayAccessor, GenericBinaryArray, GenericListArray, - OffsetSizeTrait, + empty_offsets, print_long_array, Array, ArrayAccessor, GenericBinaryArray, + GenericListArray, OffsetSizeTrait, }; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::ArrayData; @@ -370,7 +370,11 @@ impl From for GenericStringArray empty_offsets::().as_ptr() as *const _, + false => data.buffers()[0].as_ptr(), + }; let values = data.buffers()[1].as_ptr(); Self { data, @@ -823,4 +827,23 @@ mod tests { fn test_large_string_array_from_list_array_wrong_type() { _test_generic_string_array_from_list_array_wrong_type::(); } + + #[test] + fn test_empty_offsets() { + let string = StringArray::from( + ArrayData::builder(DataType::Utf8) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[0]); + + let string = LargeStringArray::from( + ArrayData::builder(DataType::LargeUtf8) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[0]); + } } From 532cf8ba375296b7ce57151ec4a33658611d64d0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 12 Oct 2022 21:32:17 +0100 Subject: [PATCH 2/2] Review feedback --- arrow-array/src/array/binary_array.rs | 1 + arrow-array/src/array/list_array.rs | 1 + arrow-array/src/array/string_array.rs | 2 ++ 3 files changed, 4 insertions(+) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 4d39f0b1ae55..851fb60c0787 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -868,6 +868,7 @@ mod tests { .build() .unwrap(), ); + assert_eq!(string.len(), 0); assert_eq!(string.value_offsets(), &[0]); } } diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index b1b243823ed7..3022db023ab6 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -976,6 +976,7 @@ mod tests { .build() .unwrap(), ); + assert_eq!(string.len(), 0); assert_eq!(string.value_offsets(), &[0]); } } diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 8129ca5f085d..7e2ed3667e22 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -836,6 +836,7 @@ mod tests { .build() .unwrap(), ); + assert_eq!(string.len(), 0); assert_eq!(string.value_offsets(), &[0]); let string = LargeStringArray::from( @@ -844,6 +845,7 @@ mod tests { .build() .unwrap(), ); + assert_eq!(string.len(), 0); assert_eq!(string.value_offsets(), &[0]); } }