diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs index e2acb6b0c1c..57dbfaf0f3f 100644 --- a/rust/arrow/src/array/array.rs +++ b/rust/arrow/src/array/array.rs @@ -399,7 +399,7 @@ mod tests { let array = new_empty_array(&DataType::Utf8); let a = array.as_any().downcast_ref::().unwrap(); assert_eq!(a.len(), 0); - assert_eq!(a.value_offset(0), 0i32); + assert_eq!(a.value_offsets()[0], 0i32); } #[test] @@ -409,6 +409,6 @@ mod tests { let array = new_empty_array(&data_type); let a = array.as_any().downcast_ref::().unwrap(); assert_eq!(a.len(), 0); - assert_eq!(a.value_offset(0), 0i32); + assert_eq!(a.value_offsets()[0], 0i32); } } diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs index c50af60fefd..c59e540551a 100644 --- a/rust/arrow/src/array/array_binary.rs +++ b/rust/arrow/src/array/array_binary.rs @@ -52,26 +52,11 @@ pub struct GenericBinaryArray { } impl GenericBinaryArray { - /// Returns the offset for the element at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. - #[inline] - pub fn value_offset(&self, i: usize) -> OffsetSize { - self.value_offset_at(self.data.offset() + i) - } - - /// Returns the length for the element at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. + /// Returns the length for value at index `i`. #[inline] - pub fn value_length(&self, mut i: usize) -> OffsetSize { - i += self.data.offset(); - self.value_offset_at(i + 1) - self.value_offset_at(i) - } - - /// Returns a clone of the value offset buffer - pub fn value_offsets(&self) -> Buffer { - self.data.buffers()[0].clone() + pub fn value_length(&self, i: usize) -> OffsetSize { + let offsets = self.value_offsets(); + offsets[i + 1] - offsets[i] } /// Returns a clone of the value data buffer @@ -79,20 +64,50 @@ impl GenericBinaryArray { self.data.buffers()[1].clone() } + /// Returns the offset values in the offsets buffer #[inline] - fn value_offset_at(&self, i: usize) -> OffsetSize { - unsafe { *self.value_offsets.as_ptr().add(i) } + pub fn value_offsets(&self) -> &[OffsetSize] { + // Soundness + // pointer alignment & location is ensured by RawPtrBox + // buffer bounds/offset is ensured by the ArrayData instance. + unsafe { + std::slice::from_raw_parts( + self.value_offsets.as_ptr().add(self.data.offset()), + self.len() + 1, + ) + } } - /// Returns the element at index `i` as a byte slice. + /// Returns the element at index `i` as bytes slice + /// # Safety + /// Caller is responsible for ensuring that the index is within the bounds of the array + pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { + let end = *self.value_offsets().get_unchecked(i + 1); + let start = *self.value_offsets().get_unchecked(i); + + // Soundness + // pointer alignment & location is ensured by RawPtrBox + // buffer bounds/offset is ensured by the value_offset invariants + std::slice::from_raw_parts( + self.value_data.as_ptr().offset(start.to_isize()), + (end - start).to_usize().unwrap(), + ) + } + + /// Returns the element at index `i` as bytes slice pub fn value(&self, i: usize) -> &[u8] { assert!(i < self.data.len(), "BinaryArray out of bounds access"); - let offset = i.checked_add(self.data.offset()).unwrap(); + //Soundness: length checked above, offset buffer length is 1 larger than logical array length + let end = unsafe { self.value_offsets().get_unchecked(i + 1) }; + let start = unsafe { self.value_offsets().get_unchecked(i) }; + + // Soundness + // pointer alignment & location is ensured by RawPtrBox + // buffer bounds/offset is ensured by the value_offset invariants unsafe { - let pos = self.value_offset_at(offset); std::slice::from_raw_parts( - self.value_data.as_ptr().offset(pos.to_isize()), - (self.value_offset_at(offset + 1) - pos).to_usize().unwrap(), + self.value_data.as_ptr().offset(start.to_isize()), + (*end - *start).to_usize().unwrap(), ) } } @@ -648,12 +663,19 @@ mod tests { assert_eq!(3, binary_array.len()); assert_eq!(0, binary_array.null_count()); assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0)); + assert_eq!([b'h', b'e', b'l', b'l', b'o'], unsafe { + binary_array.value_unchecked(0) + }); assert_eq!([] as [u8; 0], binary_array.value(1)); + assert_eq!([] as [u8; 0], unsafe { binary_array.value_unchecked(1) }); assert_eq!( [b'p', b'a', b'r', b'q', b'u', b'e', b't'], binary_array.value(2) ); - assert_eq!(5, binary_array.value_offset(2)); + assert_eq!([b'p', b'a', b'r', b'q', b'u', b'e', b't'], unsafe { + binary_array.value_unchecked(2) + }); + assert_eq!(5, binary_array.value_offsets()[2]); assert_eq!(7, binary_array.value_length(2)); for i in 0..3 { assert!(binary_array.is_valid(i)); @@ -672,9 +694,9 @@ mod tests { [b'p', b'a', b'r', b'q', b'u', b'e', b't'], binary_array.value(1) ); - assert_eq!(5, binary_array.value_offset(0)); + assert_eq!(5, binary_array.value_offsets()[0]); assert_eq!(0, binary_array.value_length(0)); - assert_eq!(5, binary_array.value_offset(1)); + assert_eq!(5, binary_array.value_offsets()[1]); assert_eq!(7, binary_array.value_length(1)); } @@ -695,12 +717,19 @@ mod tests { assert_eq!(3, binary_array.len()); assert_eq!(0, binary_array.null_count()); assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0)); + assert_eq!([b'h', b'e', b'l', b'l', b'o'], unsafe { + binary_array.value_unchecked(0) + }); assert_eq!([] as [u8; 0], binary_array.value(1)); + assert_eq!([] as [u8; 0], unsafe { binary_array.value_unchecked(1) }); assert_eq!( [b'p', b'a', b'r', b'q', b'u', b'e', b't'], binary_array.value(2) ); - assert_eq!(5, binary_array.value_offset(2)); + assert_eq!([b'p', b'a', b'r', b'q', b'u', b'e', b't'], unsafe { + binary_array.value_unchecked(2) + }); + assert_eq!(5, binary_array.value_offsets()[2]); assert_eq!(7, binary_array.value_length(2)); for i in 0..3 { assert!(binary_array.is_valid(i)); @@ -719,9 +748,12 @@ mod tests { [b'p', b'a', b'r', b'q', b'u', b'e', b't'], binary_array.value(1) ); - assert_eq!(5, binary_array.value_offset(0)); + assert_eq!([b'p', b'a', b'r', b'q', b'u', b'e', b't'], unsafe { + binary_array.value_unchecked(1) + }); + assert_eq!(5, binary_array.value_offsets()[0]); assert_eq!(0, binary_array.value_length(0)); - assert_eq!(5, binary_array.value_offset(1)); + assert_eq!(5, binary_array.value_offsets()[1]); assert_eq!(7, binary_array.value_length(1)); } @@ -757,9 +789,12 @@ mod tests { assert_eq!(binary_array1.len(), binary_array2.len()); assert_eq!(binary_array1.null_count(), binary_array2.null_count()); + assert_eq!(binary_array1.value_offsets(), binary_array2.value_offsets()); for i in 0..binary_array1.len() { assert_eq!(binary_array1.value(i), binary_array2.value(i)); - assert_eq!(binary_array1.value_offset(i), binary_array2.value_offset(i)); + assert_eq!(binary_array1.value(i), unsafe { + binary_array2.value_unchecked(i) + }); assert_eq!(binary_array1.value_length(i), binary_array2.value_length(i)); } } @@ -796,9 +831,12 @@ mod tests { assert_eq!(binary_array1.len(), binary_array2.len()); assert_eq!(binary_array1.null_count(), binary_array2.null_count()); + assert_eq!(binary_array1.value_offsets(), binary_array2.value_offsets()); for i in 0..binary_array1.len() { assert_eq!(binary_array1.value(i), binary_array2.value(i)); - assert_eq!(binary_array1.value_offset(i), binary_array2.value_offset(i)); + assert_eq!(binary_array1.value(i), unsafe { + binary_array2.value_unchecked(i) + }); assert_eq!(binary_array1.value_length(i), binary_array2.value_length(i)); } } diff --git a/rust/arrow/src/array/array_list.rs b/rust/arrow/src/array/array_list.rs index 0f881a86f23..e80e9296234 100644 --- a/rust/arrow/src/array/array_list.rs +++ b/rust/arrow/src/array/array_list.rs @@ -74,33 +74,42 @@ impl GenericListArray { } /// Returns ith value of this list array. - pub fn value(&self, i: usize) -> ArrayRef { - self.values.slice( - self.value_offset(i).to_usize().unwrap(), - self.value_length(i).to_usize().unwrap(), - ) + /// # Safety + /// Caller must ensure that the index is within the array bounds + pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef { + let end = *self.value_offsets().get_unchecked(i + 1); + let start = *self.value_offsets().get_unchecked(i); + self.values + .slice(start.to_usize().unwrap(), (end - start).to_usize().unwrap()) } - /// Returns the offset for value at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. - #[inline] - pub fn value_offset(&self, i: usize) -> OffsetSize { - self.value_offset_at(self.data.offset() + i) + /// Returns ith value of this list array. + pub fn value(&self, i: usize) -> ArrayRef { + let end = self.value_offsets()[i + 1]; + let start = self.value_offsets()[i]; + self.values + .slice(start.to_usize().unwrap(), (end - start).to_usize().unwrap()) } - /// Returns the length for value at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. + /// Returns the offset values in the offsets buffer #[inline] - pub fn value_length(&self, mut i: usize) -> OffsetSize { - i += self.data.offset(); - self.value_offset_at(i + 1) - self.value_offset_at(i) + pub fn value_offsets(&self) -> &[OffsetSize] { + // Soundness + // pointer alignment & location is ensured by RawPtrBox + // buffer bounds/offset is ensured by the ArrayData instance. + unsafe { + std::slice::from_raw_parts( + self.value_offsets.as_ptr().add(self.data.offset()), + self.len() + 1, + ) + } } + /// Returns the length for value at index `i`. #[inline] - fn value_offset_at(&self, i: usize) -> OffsetSize { - unsafe { *self.value_offsets.as_ptr().add(i) } + pub fn value_length(&self, i: usize) -> OffsetSize { + let offsets = self.value_offsets(); + offsets[i + 1] - offsets[i] } } @@ -334,7 +343,7 @@ mod tests { assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(3, list_array.len()); assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offset(2)); + assert_eq!(6, list_array.value_offsets()[2]); assert_eq!(2, list_array.value_length(2)); assert_eq!( 0, @@ -345,6 +354,14 @@ mod tests { .unwrap() .value(0) ); + assert_eq!( + 0, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); for i in 0..3 { assert!(list_array.is_valid(i)); assert!(!list_array.is_null(i)); @@ -364,7 +381,7 @@ mod tests { assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(3, list_array.len()); assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offset(1)); + assert_eq!(6, list_array.value_offsets()[1]); assert_eq!(2, list_array.value_length(1)); assert_eq!( 3, @@ -375,6 +392,14 @@ mod tests { .unwrap() .value(0) ); + assert_eq!( + 3, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); } #[test] @@ -404,7 +429,7 @@ mod tests { assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(3, list_array.len()); assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offset(2)); + assert_eq!(6, list_array.value_offsets()[2]); assert_eq!(2, list_array.value_length(2)); assert_eq!( 0, @@ -415,6 +440,14 @@ mod tests { .unwrap() .value(0) ); + assert_eq!( + 0, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); for i in 0..3 { assert!(list_array.is_valid(i)); assert!(!list_array.is_null(i)); @@ -434,7 +467,7 @@ mod tests { assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(3, list_array.len()); assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offset(1)); + assert_eq!(6, list_array.value_offsets()[1]); assert_eq!(2, list_array.value_length(1)); assert_eq!( 3, @@ -445,6 +478,14 @@ mod tests { .unwrap() .value(0) ); + assert_eq!( + 3, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); } #[test] @@ -571,7 +612,7 @@ mod tests { assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(9, list_array.len()); assert_eq!(4, list_array.null_count()); - assert_eq!(2, list_array.value_offset(3)); + assert_eq!(2, list_array.value_offsets()[3]); assert_eq!(2, list_array.value_length(3)); let sliced_array = list_array.slice(1, 6); @@ -590,11 +631,11 @@ mod tests { // Check offset and length for each non-null value. let sliced_list_array = sliced_array.as_any().downcast_ref::().unwrap(); - assert_eq!(2, sliced_list_array.value_offset(2)); + assert_eq!(2, sliced_list_array.value_offsets()[2]); assert_eq!(2, sliced_list_array.value_length(2)); - assert_eq!(4, sliced_list_array.value_offset(3)); + assert_eq!(4, sliced_list_array.value_offsets()[3]); assert_eq!(2, sliced_list_array.value_length(3)); - assert_eq!(6, sliced_list_array.value_offset(5)); + assert_eq!(6, sliced_list_array.value_offsets()[5]); assert_eq!(3, sliced_list_array.value_length(5)); } @@ -633,7 +674,7 @@ mod tests { assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(9, list_array.len()); assert_eq!(4, list_array.null_count()); - assert_eq!(2, list_array.value_offset(3)); + assert_eq!(2, list_array.value_offsets()[3]); assert_eq!(2, list_array.value_length(3)); let sliced_array = list_array.slice(1, 6); @@ -654,14 +695,49 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - assert_eq!(2, sliced_list_array.value_offset(2)); + assert_eq!(2, sliced_list_array.value_offsets()[2]); assert_eq!(2, sliced_list_array.value_length(2)); - assert_eq!(4, sliced_list_array.value_offset(3)); + assert_eq!(4, sliced_list_array.value_offsets()[3]); assert_eq!(2, sliced_list_array.value_length(3)); - assert_eq!(6, sliced_list_array.value_offset(5)); + assert_eq!(6, sliced_list_array.value_offsets()[5]); assert_eq!(3, sliced_list_array.value_length(5)); } + #[test] + #[should_panic(expected = "index out of bounds: the len is 10 but the index is 11")] + fn test_list_array_index_out_of_bound() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] + let value_offsets = Buffer::from_slice_ref(&[0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]); + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + + // Construct a list array from the above two + let list_data_type = + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(9) + .add_buffer(value_offsets) + .add_child_data(value_data) + .null_bit_buffer(Buffer::from(null_bits)) + .build(); + let list_array = LargeListArray::from(list_data); + assert_eq!(9, list_array.len()); + + list_array.value(10); + } + #[test] fn test_fixed_size_list_array_slice() { // Construct a value array @@ -721,6 +797,38 @@ mod tests { assert_eq!(8, sliced_list_array.value_offset(3)); } + #[test] + #[should_panic(expected = "assertion failed: (offset + length) <= self.len()")] + fn test_fixed_size_list_array_index_out_of_bound() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build(); + + // Set null buts for the nested array: + // [[0, 1], null, null, [6, 7], [8, 9]] + // 01011001 00000001 + let mut null_bits: [u8; 1] = [0; 1]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + + // Construct a fixed size list array from the above two + let list_data_type = DataType::FixedSizeList( + Box::new(Field::new("item", DataType::Int32, false)), + 2, + ); + let list_data = ArrayData::builder(list_data_type) + .len(5) + .add_child_data(value_data) + .null_bit_buffer(Buffer::from(null_bits)) + .build(); + let list_array = FixedSizeListArray::from(list_data); + + list_array.value(10); + } + #[test] #[should_panic( expected = "ListArray data should contain a single buffer only (value offsets)" @@ -793,7 +901,7 @@ mod tests { let values: [i32; 8] = [0; 8]; let value_data = ArrayData::builder(DataType::Int32) - .add_buffer(Buffer::from(values.to_byte_slice())) + .add_buffer(Buffer::from_slice_ref(&values)) .build(); let list_data_type = diff --git a/rust/arrow/src/array/array_primitive.rs b/rust/arrow/src/array/array_primitive.rs index 984454ce779..409e06509b2 100644 --- a/rust/arrow/src/array/array_primitive.rs +++ b/rust/arrow/src/array/array_primitive.rs @@ -465,9 +465,7 @@ mod tests { fn test_primitive_array_from_vec() { let buf = Buffer::from_slice_ref(&[0, 1, 2, 3, 4]); let arr = Int32Array::from(vec![0, 1, 2, 3, 4]); - let slice = arr.values(); assert_eq!(buf, arr.data.buffers()[0]); - assert_eq!(&[0, 1, 2, 3, 4], slice); assert_eq!(5, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); @@ -476,6 +474,7 @@ mod tests { assert!(arr.is_valid(i)); assert_eq!(i as i32, arr.value(i)); } + assert_eq!(&[0, 1, 2, 3, 4], arr.values()); assert_eq!(64, arr.get_buffer_memory_size()); let internals_of_primitive_array = 8 + 72; // RawPtrBox & Arc combined. @@ -590,8 +589,10 @@ mod tests { assert_eq!(0, arr.offset()); assert_eq!(1, arr.null_count()); assert_eq!(1, arr.value(0)); + assert_eq!(1, arr.values()[0]); assert!(arr.is_null(1)); assert_eq!(-5, arr.value(2)); + assert_eq!(-5, arr.values()[2]); // a day_time interval contains days and milliseconds, but we do not yet have accessors for the values let arr = IntervalDayTimeArray::from(vec![Some(1), None, Some(-5)]); @@ -599,8 +600,10 @@ mod tests { assert_eq!(0, arr.offset()); assert_eq!(1, arr.null_count()); assert_eq!(1, arr.value(0)); + assert_eq!(1, arr.values()[0]); assert!(arr.is_null(1)); assert_eq!(-5, arr.value(2)); + assert_eq!(-5, arr.values()[2]); } #[test] @@ -610,32 +613,40 @@ mod tests { assert_eq!(0, arr.offset()); assert_eq!(1, arr.null_count()); assert_eq!(1, arr.value(0)); + assert_eq!(1, arr.values()[0]); assert!(arr.is_null(1)); assert_eq!(-5, arr.value(2)); + assert_eq!(-5, arr.values()[2]); let arr = DurationMillisecondArray::from(vec![Some(1), None, Some(-5)]); assert_eq!(3, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(1, arr.null_count()); assert_eq!(1, arr.value(0)); + assert_eq!(1, arr.values()[0]); assert!(arr.is_null(1)); assert_eq!(-5, arr.value(2)); + assert_eq!(-5, arr.values()[2]); let arr = DurationMicrosecondArray::from(vec![Some(1), None, Some(-5)]); assert_eq!(3, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(1, arr.null_count()); assert_eq!(1, arr.value(0)); + assert_eq!(1, arr.values()[0]); assert!(arr.is_null(1)); assert_eq!(-5, arr.value(2)); + assert_eq!(-5, arr.values()[2]); let arr = DurationNanosecondArray::from(vec![Some(1), None, Some(-5)]); assert_eq!(3, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(1, arr.null_count()); assert_eq!(1, arr.value(0)); + assert_eq!(1, arr.values()[0]); assert!(arr.is_null(1)); assert_eq!(-5, arr.value(2)); + assert_eq!(-5, arr.values()[2]); } #[test] @@ -646,6 +657,7 @@ mod tests { assert_eq!(0, arr.null_count()); assert_eq!(1, arr.value(0)); assert_eq!(-5, arr.value(1)); + assert_eq!(&[1, -5], arr.values()); let arr = TimestampMillisecondArray::from_vec(vec![1, -5], None); assert_eq!(2, arr.len()); @@ -653,6 +665,7 @@ mod tests { assert_eq!(0, arr.null_count()); assert_eq!(1, arr.value(0)); assert_eq!(-5, arr.value(1)); + assert_eq!(&[1, -5], arr.values()); let arr = TimestampMicrosecondArray::from_vec(vec![1, -5], None); assert_eq!(2, arr.len()); @@ -660,6 +673,7 @@ mod tests { assert_eq!(0, arr.null_count()); assert_eq!(1, arr.value(0)); assert_eq!(-5, arr.value(1)); + assert_eq!(&[1, -5], arr.values()); let arr = TimestampNanosecondArray::from_vec(vec![1, -5], None); assert_eq!(2, arr.len()); @@ -667,6 +681,7 @@ mod tests { assert_eq!(0, arr.null_count()); assert_eq!(1, arr.value(0)); assert_eq!(-5, arr.value(1)); + assert_eq!(&[1, -5], arr.values()); } #[test] @@ -695,16 +710,20 @@ mod tests { assert_eq!(i == 1, arr2.is_null(i)); assert_eq!(i != 1, arr2.is_valid(i)); } + let int_arr2 = arr2.as_any().downcast_ref::().unwrap(); + assert_eq!(2, int_arr2.values()[0]); + assert_eq!(&[4, 5, 6], &int_arr2.values()[2..5]); let arr3 = arr2.slice(2, 3); assert_eq!(3, arr3.len()); assert_eq!(4, arr3.offset()); assert_eq!(0, arr3.null_count()); - let int_arr = arr3.as_any().downcast_ref::().unwrap(); - assert_eq!(4, int_arr.value(0)); - assert_eq!(5, int_arr.value(1)); - assert_eq!(6, int_arr.value(2)); + let int_arr3 = arr3.as_any().downcast_ref::().unwrap(); + assert_eq!(&[4, 5, 6], int_arr3.values()); + assert_eq!(4, int_arr3.value(0)); + assert_eq!(5, int_arr3.value(1)); + assert_eq!(6, int_arr3.value(2)); } #[test] diff --git a/rust/arrow/src/array/array_string.rs b/rust/arrow/src/array/array_string.rs index 4b98fafdeaf..fbce81ea908 100644 --- a/rust/arrow/src/array/array_string.rs +++ b/rust/arrow/src/array/array_string.rs @@ -50,26 +50,25 @@ pub struct GenericStringArray { } impl GenericStringArray { - /// Returns the offset for the element at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. - #[inline] - pub fn value_offset(&self, i: usize) -> OffsetSize { - self.value_offset_at(self.data.offset() + i) - } - /// Returns the length for the element at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. #[inline] - pub fn value_length(&self, mut i: usize) -> OffsetSize { - i += self.data.offset(); - self.value_offset_at(i + 1) - self.value_offset_at(i) + pub fn value_length(&self, i: usize) -> OffsetSize { + let offsets = self.value_offsets(); + offsets[i + 1] - offsets[i] } - /// Returns a clone of the value offset buffer - pub fn value_offsets(&self) -> Buffer { - self.data.buffers()[0].clone() + /// Returns the offset values in the offsets buffer + #[inline] + pub fn value_offsets(&self) -> &[OffsetSize] { + // Soundness + // pointer alignment & location is ensured by RawPtrBox + // buffer bounds/offset is ensured by the ArrayData instance. + unsafe { + std::slice::from_raw_parts( + self.value_offsets.as_ptr().add(self.data.offset()), + self.len() + 1, + ) + } } /// Returns a clone of the value data buffer @@ -77,22 +76,40 @@ impl GenericStringArray { self.data.buffers()[1].clone() } - #[inline] - fn value_offset_at(&self, i: usize) -> OffsetSize { - unsafe { *self.value_offsets.as_ptr().add(i) } + /// Returns the element at index + /// # Safety + /// caller is responsible for ensuring that index is within the array bounds + pub unsafe fn value_unchecked(&self, i: usize) -> &str { + let end = self.value_offsets().get_unchecked(i + 1); + let start = self.value_offsets().get_unchecked(i); + + // Soundness + // pointer alignment & location is ensured by RawPtrBox + // buffer bounds/offset is ensured by the value_offset invariants + // ISSUE: utf-8 well formedness is not checked + let slice = std::slice::from_raw_parts( + self.value_data.as_ptr().offset(start.to_isize()), + (*end - *start).to_usize().unwrap(), + ); + std::str::from_utf8_unchecked(slice) } /// Returns the element at index `i` as &str pub fn value(&self, i: usize) -> &str { assert!(i < self.data.len(), "StringArray out of bounds access"); - let offset = i.checked_add(self.data.offset()).unwrap(); + //Soundness: length checked above, offset buffer length is 1 larger than logical array length + let end = unsafe { self.value_offsets().get_unchecked(i + 1) }; + let start = unsafe { self.value_offsets().get_unchecked(i) }; + + // Soundness + // pointer alignment & location is ensured by RawPtrBox + // buffer bounds/offset is ensured by the value_offset invariants + // ISSUE: utf-8 well formedness is not checked unsafe { - let pos = self.value_offset_at(offset); let slice = std::slice::from_raw_parts( - self.value_data.as_ptr().offset(pos.to_isize()), - (self.value_offset_at(offset + 1) - pos).to_usize().unwrap(), + self.value_data.as_ptr().offset(start.to_isize()), + (*end - *start).to_usize().unwrap(), ); - std::str::from_utf8_unchecked(slice) } } @@ -340,9 +357,12 @@ mod tests { assert_eq!(3, string_array.len()); assert_eq!(0, string_array.null_count()); assert_eq!("hello", string_array.value(0)); + assert_eq!("hello", unsafe { string_array.value_unchecked(0) }); assert_eq!("", string_array.value(1)); + assert_eq!("", unsafe { string_array.value_unchecked(1) }); assert_eq!("parquet", string_array.value(2)); - assert_eq!(5, string_array.value_offset(2)); + assert_eq!("parquet", unsafe { string_array.value_unchecked(2) }); + assert_eq!(5, string_array.value_offsets()[2]); assert_eq!(7, string_array.value_length(2)); for i in 0..3 { assert!(string_array.is_valid(i)); @@ -367,9 +387,12 @@ mod tests { assert_eq!(3, string_array.len()); assert_eq!(0, string_array.null_count()); assert_eq!("hello", string_array.value(0)); + assert_eq!("hello", unsafe { string_array.value_unchecked(0) }); assert_eq!("", string_array.value(1)); + assert_eq!("", unsafe { string_array.value_unchecked(1) }); assert_eq!("parquet", string_array.value(2)); - assert_eq!(5, string_array.value_offset(2)); + assert_eq!("parquet", unsafe { string_array.value_unchecked(2) }); + assert_eq!(5, string_array.value_offsets()[2]); assert_eq!(7, string_array.value_length(2)); for i in 0..3 { assert!(string_array.is_valid(i)); @@ -399,12 +422,15 @@ mod tests { let first_list = first_slot.as_any().downcast_ref::().unwrap(); assert_eq!(first_list.len(), 2); assert_eq!(first_list.value(0), "foo"); + assert_eq!(unsafe { first_list.value_unchecked(0) }, "foo"); assert_eq!(first_list.value(1), "bar"); + assert_eq!(unsafe { first_list.value_unchecked(1) }, "bar"); let second_slot = list_of_strings.value(1); let second_list = second_slot.as_any().downcast_ref::().unwrap(); assert_eq!(second_list.len(), 1); assert_eq!(second_list.value(0), "foobar"); + assert_eq!(unsafe { second_list.value_unchecked(0) }, "foobar"); } #[test] diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index 82d383f9e65..8eb13bbbed8 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -2420,7 +2420,7 @@ mod tests { assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(3, list_array.len()); assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offset(2)); + assert_eq!(6, list_array.value_offsets()[2]); assert_eq!(2, list_array.value_length(2)); for i in 0..3 { assert!(list_array.is_valid(i)); @@ -2456,7 +2456,7 @@ mod tests { assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(3, list_array.len()); assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offset(2)); + assert_eq!(6, list_array.value_offsets()[2]); assert_eq!(2, list_array.value_length(2)); for i in 0..3 { assert!(list_array.is_valid(i)); @@ -2487,7 +2487,7 @@ mod tests { assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(4, list_array.len()); assert_eq!(1, list_array.null_count()); - assert_eq!(3, list_array.value_offset(2)); + assert_eq!(3, list_array.value_offsets()[2]); assert_eq!(3, list_array.value_length(2)); } @@ -2514,7 +2514,7 @@ mod tests { assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(4, list_array.len()); assert_eq!(1, list_array.null_count()); - assert_eq!(3, list_array.value_offset(2)); + assert_eq!(3, list_array.value_offsets()[2]); assert_eq!(3, list_array.value_length(2)); } @@ -2681,7 +2681,7 @@ mod tests { assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0)); assert_eq!([] as [u8; 0], binary_array.value(1)); assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.value(2)); - assert_eq!(5, binary_array.value_offset(2)); + assert_eq!(5, binary_array.value_offsets()[2]); assert_eq!(5, binary_array.value_length(2)); } @@ -2710,7 +2710,7 @@ mod tests { assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0)); assert_eq!([] as [u8; 0], binary_array.value(1)); assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.value(2)); - assert_eq!(5, binary_array.value_offset(2)); + assert_eq!(5, binary_array.value_offsets()[2]); assert_eq!(5, binary_array.value_length(2)); } @@ -2729,7 +2729,7 @@ mod tests { assert_eq!("hello", string_array.value(0)); assert_eq!("", string_array.value(1)); assert_eq!("world", string_array.value(2)); - assert_eq!(5, string_array.value_offset(2)); + assert_eq!(5, string_array.value_offsets()[2]); assert_eq!(5, string_array.value_length(2)); } @@ -2802,7 +2802,7 @@ mod tests { assert_eq!("hello", string_array.value(0)); assert_eq!("", string_array.value(1)); assert_eq!("world", string_array.value(2)); - assert_eq!(5, string_array.value_offset(2)); + assert_eq!(5, string_array.value_offsets()[2]); assert_eq!(5, string_array.value_length(2)); } diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs index e2e29620cc5..4223a4c08b0 100644 --- a/rust/arrow/src/compute/kernels/cast.rs +++ b/rust/arrow/src/compute/kernels/cast.rs @@ -1260,11 +1260,7 @@ mod tests { .unwrap(); assert_eq!(5, b.len()); let arr = b.as_any().downcast_ref::().unwrap(); - assert_eq!(0, arr.value_offset(0)); - assert_eq!(1, arr.value_offset(1)); - assert_eq!(2, arr.value_offset(2)); - assert_eq!(3, arr.value_offset(3)); - assert_eq!(4, arr.value_offset(4)); + assert_eq!(&[0, 1, 2, 3, 4, 5], arr.value_offsets()); assert_eq!(1, arr.value_length(0)); assert_eq!(1, arr.value_length(1)); assert_eq!(1, arr.value_length(2)); @@ -1291,11 +1287,7 @@ mod tests { assert_eq!(5, b.len()); assert_eq!(1, b.null_count()); let arr = b.as_any().downcast_ref::().unwrap(); - assert_eq!(0, arr.value_offset(0)); - assert_eq!(1, arr.value_offset(1)); - assert_eq!(2, arr.value_offset(2)); - assert_eq!(3, arr.value_offset(3)); - assert_eq!(4, arr.value_offset(4)); + assert_eq!(&[0, 1, 2, 3, 4, 5], arr.value_offsets()); assert_eq!(1, arr.value_length(0)); assert_eq!(1, arr.value_length(1)); assert_eq!(1, arr.value_length(2)); @@ -1324,10 +1316,7 @@ mod tests { assert_eq!(4, b.len()); assert_eq!(1, b.null_count()); let arr = b.as_any().downcast_ref::().unwrap(); - assert_eq!(0, arr.value_offset(0)); - assert_eq!(1, arr.value_offset(1)); - assert_eq!(2, arr.value_offset(2)); - assert_eq!(3, arr.value_offset(3)); + assert_eq!(&[0, 1, 2, 3, 4], arr.value_offsets()); assert_eq!(1, arr.value_length(0)); assert_eq!(1, arr.value_length(1)); assert_eq!(1, arr.value_length(2)); diff --git a/rust/arrow/src/compute/kernels/limit.rs b/rust/arrow/src/compute/kernels/limit.rs index 18db31c4df5..ee7d031c245 100644 --- a/rust/arrow/src/compute/kernels/limit.rs +++ b/rust/arrow/src/compute/kernels/limit.rs @@ -124,8 +124,9 @@ mod tests { // Check offset and length for each non-null value. let limit_array: &ListArray = limit_array.as_any().downcast_ref::().unwrap(); + for i in 0..limit_array.len() { - let offset = limit_array.value_offset(i); + let offset = limit_array.value_offsets()[i]; let length = limit_array.value_length(i); if i % 2 == 0 { assert_eq!(2, length); diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs index e613146f719..cbe9ef43bb1 100644 --- a/rust/arrow/src/compute/util.rs +++ b/rust/arrow/src/compute/util.rs @@ -107,9 +107,7 @@ where PrimitiveArray: From>>, { // TODO: benchmark this function, there might be a faster unsafe alternative - // get list array's offsets - let offsets: Vec = - (0..=list.len()).map(|i| list.value_offset(i)).collect(); + let offsets: &[OffsetType::Native] = list.value_offsets(); let mut new_offsets = Vec::with_capacity(indices.len()); let mut values = Vec::new();