From 4e06899941ab862faf440384fefe0f1c548999e3 Mon Sep 17 00:00:00 2001 From: Tyrel Rink <44035897+tyrelr@users.noreply.github.com> Date: Sat, 19 Dec 2020 11:28:27 -0600 Subject: [PATCH 1/8] Iterate primitive buffers by slice Iterate primitive buffers by slice --- rust/arrow/src/array/array_primitive.rs | 6 ++++++ rust/arrow/src/compute/kernels/arithmetic.rs | 13 +++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/rust/arrow/src/array/array_primitive.rs b/rust/arrow/src/array/array_primitive.rs index a04401900b3..bb7dd8d982a 100644 --- a/rust/arrow/src/array/array_primitive.rs +++ b/rust/arrow/src/array/array_primitive.rs @@ -68,6 +68,12 @@ impl PrimitiveArray { unsafe { self.raw_values.get().add(self.data.offset()) } } + /// Returns a slice of the values of this array + pub fn raw_values_slice(&self) -> &[T::Native] { + let raw = unsafe { std::slice::from_raw_parts(self.raw_values(), self.len()) }; + &raw[..] + } + /// Returns a slice for the given offset and length /// /// Note this doesn't do any bound checking, for performance reason. diff --git a/rust/arrow/src/compute/kernels/arithmetic.rs b/rust/arrow/src/compute/kernels/arithmetic.rs index cc955db45f1..45198789b4a 100644 --- a/rust/arrow/src/compute/kernels/arithmetic.rs +++ b/rust/arrow/src/compute/kernels/arithmetic.rs @@ -51,8 +51,10 @@ where T::Native: Neg, F: Fn(T::Native) -> T::Native, { - let values = (0..array.len()) - .map(|i| op(array.value(i))) + let values = array + .raw_values_slice() + .iter() + .map(|v| op(*v)) .collect::>(); let data = ArrayData::new( @@ -141,8 +143,11 @@ where let null_bit_buffer = combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?; - let values = (0..left.len()) - .map(|i| op(left.value(i), right.value(i))) + let values = left + .raw_values_slice() + .iter() + .zip(right.raw_values_slice().iter()) + .map(|(l, r)| op(*l, *r)) .collect::>(); let data = ArrayData::new( From 2c823df03b580f780a7721ac1f8bb8d2a078b114 Mon Sep 17 00:00:00 2001 From: Tyrel Rink <44035897+tyrelr@users.noreply.github.com> Date: Sun, 20 Dec 2020 21:19:23 -0600 Subject: [PATCH 2/8] replace raw_value, value_slice, values with a common function values() returning a slice of the complete array data --- rust/arrow/src/array/array_dictionary.rs | 2 +- rust/arrow/src/array/array_primitive.rs | 37 +++----------------- rust/arrow/src/array/builder.rs | 2 +- rust/arrow/src/array/mod.rs | 2 +- rust/arrow/src/compute/kernels/aggregate.rs | 4 +-- rust/arrow/src/compute/kernels/arithmetic.rs | 6 ++-- 6 files changed, 13 insertions(+), 40 deletions(-) diff --git a/rust/arrow/src/array/array_dictionary.rs b/rust/arrow/src/array/array_dictionary.rs index 3cca55440f4..312e465f50a 100644 --- a/rust/arrow/src/array/array_dictionary.rs +++ b/rust/arrow/src/array/array_dictionary.rs @@ -386,7 +386,7 @@ mod tests { let keys = array.keys_array(); assert_eq!(&DataType::Int8, keys.data_type()); assert_eq!(0, keys.null_count()); - assert_eq!(&[0, 1, 2, 0], keys.value_slice(0, keys.len())); + assert_eq!(&[0, 1, 2, 0], keys.values()); } #[test] diff --git a/rust/arrow/src/array/array_primitive.rs b/rust/arrow/src/array/array_primitive.rs index bb7dd8d982a..db0dec74485 100644 --- a/rust/arrow/src/array/array_primitive.rs +++ b/rust/arrow/src/array/array_primitive.rs @@ -63,23 +63,9 @@ impl PrimitiveArray { self.data.is_empty() } - /// Returns a raw pointer to the values of this array. - pub fn raw_values(&self) -> *const T::Native { - unsafe { self.raw_values.get().add(self.data.offset()) } - } - /// Returns a slice of the values of this array - pub fn raw_values_slice(&self) -> &[T::Native] { - let raw = unsafe { std::slice::from_raw_parts(self.raw_values(), self.len()) }; - &raw[..] - } - - /// Returns a slice for the given offset and length - /// - /// Note this doesn't do any bound checking, for performance reason. - pub fn value_slice(&self, offset: usize, len: usize) -> &[T::Native] { - let raw = - unsafe { std::slice::from_raw_parts(self.raw_values().add(offset), len) }; + pub fn values(&self) -> &[T::Native] { + let raw = unsafe { std::slice::from_raw_parts(self.raw_values.get().add(self.data.offset()), self.len()) }; &raw[..] } @@ -88,13 +74,6 @@ impl PrimitiveArray { PrimitiveBuilder::::new(capacity) } - /// Returns a `Buffer` holding all the values of this array. - /// - /// Note this doesn't take the offset of this array into account. - pub fn values(&self) -> Buffer { - self.data.buffers()[0].clone() - } - /// Returns the primitive value at index `i`. /// /// Note this doesn't do any bound checking, for performance reason. @@ -461,8 +440,8 @@ mod tests { fn test_primitive_array_from_vec() { let buf = Buffer::from(&[0, 1, 2, 3, 4].to_byte_slice()); let arr = Int32Array::from(vec![0, 1, 2, 3, 4]); - let slice = unsafe { std::slice::from_raw_parts(arr.raw_values(), 5) }; - assert_eq!(buf, arr.values()); + let slice = arr.values(); + assert_eq!(buf, arr.data.buffers()[0]); assert_eq!(&[0, 1, 2, 3, 4], slice); assert_eq!(5, arr.len()); assert_eq!(0, arr.offset()); @@ -744,12 +723,6 @@ mod tests { assert_eq!(false, bool_arr.value(4)); } - #[test] - fn test_value_slice_no_bounds_check() { - let arr = Int32Array::from(vec![2, 3, 4]); - let _slice = arr.value_slice(0, 4); - } - #[test] fn test_int32_fmt_debug() { let arr = Int32Array::from(vec![0, 1, 2, 3, 4]); @@ -829,7 +802,7 @@ mod tests { .add_buffer(buf) .build(); let arr = Int32Array::from(data); - assert_eq!(buf2, arr.values()); + assert_eq!(buf2, arr.data.buffers()[0]); assert_eq!(5, arr.len()); assert_eq!(0, arr.null_count()); for i in 0..3 { diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index 97f2978a3b8..3526c03c50c 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -3344,7 +3344,7 @@ mod tests { // Values are polymorphic and so require a downcast. let av = array.values(); let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); - let avs: &[u32] = ava.value_slice(0, array.values().len()); + let avs: &[u32] = ava.values(); assert_eq!(array.is_null(0), false); assert_eq!(array.is_null(1), true); diff --git a/rust/arrow/src/array/mod.rs b/rust/arrow/src/array/mod.rs index 1838f83f2bb..fafa4c70608 100644 --- a/rust/arrow/src/array/mod.rs +++ b/rust/arrow/src/array/mod.rs @@ -75,7 +75,7 @@ //! assert_eq!(2, array.value(2), "Get the value with index 2"); //! //! assert_eq!( -//! array.value_slice(3, 2), +//! &array.values()[3..5], //! &[3, 4], //! "Get slice of len 2 starting at idx 3" //! ) diff --git a/rust/arrow/src/compute/kernels/aggregate.rs b/rust/arrow/src/compute/kernels/aggregate.rs index a7e30f8e2de..b9a9d72ab0f 100644 --- a/rust/arrow/src/compute/kernels/aggregate.rs +++ b/rust/arrow/src/compute/kernels/aggregate.rs @@ -114,7 +114,7 @@ where } let data = array.data(); - let m = array.value_slice(0, data.len()); + let m = array.values(); let mut n; if null_count == 0 { @@ -150,7 +150,7 @@ where return None; } - let data: &[T::Native] = array.value_slice(0, array.len()); + let data: &[T::Native] = array.values(); match array.data().null_buffer() { None => { diff --git a/rust/arrow/src/compute/kernels/arithmetic.rs b/rust/arrow/src/compute/kernels/arithmetic.rs index 45198789b4a..2c0fdd2e764 100644 --- a/rust/arrow/src/compute/kernels/arithmetic.rs +++ b/rust/arrow/src/compute/kernels/arithmetic.rs @@ -52,7 +52,7 @@ where F: Fn(T::Native) -> T::Native, { let values = array - .raw_values_slice() + .values() .iter() .map(|v| op(*v)) .collect::>(); @@ -144,9 +144,9 @@ where combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?; let values = left - .raw_values_slice() + .values() .iter() - .zip(right.raw_values_slice().iter()) + .zip(right.values().iter()) .map(|(l, r)| op(*l, *r)) .collect::>(); From d51aa0490304c4a4b881ebf6700a10d5442012e3 Mon Sep 17 00:00:00 2001 From: Tyrel Rink <44035897+tyrelr@users.noreply.github.com> Date: Tue, 22 Dec 2020 21:14:23 -0600 Subject: [PATCH 3/8] fix simd compilation --- rust/arrow/src/compute/kernels/aggregate.rs | 2 +- rust/arrow/src/compute/kernels/arithmetic.rs | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/rust/arrow/src/compute/kernels/aggregate.rs b/rust/arrow/src/compute/kernels/aggregate.rs index b9a9d72ab0f..4a4e15d7e1e 100644 --- a/rust/arrow/src/compute/kernels/aggregate.rs +++ b/rust/arrow/src/compute/kernels/aggregate.rs @@ -478,7 +478,7 @@ mod simd { return None; } - let data: &[T::Native] = array.value_slice(0, array.len()); + let data: &[T::Native] = array.values(); let mut chunk_acc = A::init_accumulator_chunk(); let mut rem_acc = A::init_accumulator_scalar(); diff --git a/rust/arrow/src/compute/kernels/arithmetic.rs b/rust/arrow/src/compute/kernels/arithmetic.rs index 2c0fdd2e764..59128d320b7 100644 --- a/rust/arrow/src/compute/kernels/arithmetic.rs +++ b/rust/arrow/src/compute/kernels/arithmetic.rs @@ -86,7 +86,7 @@ where let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); let mut result_chunks = result.typed_data_mut().chunks_exact_mut(lanes); - let mut array_chunks = array.value_slice(0, array.len()).chunks_exact(lanes); + let mut array_chunks = array.values().chunks_exact(lanes); result_chunks .borrow_mut() @@ -253,8 +253,8 @@ where let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); let mut result_chunks = result.typed_data_mut().chunks_exact_mut(lanes); - let mut left_chunks = left.value_slice(0, left.len()).chunks_exact(lanes); - let mut right_chunks = right.value_slice(0, left.len()).chunks_exact(lanes); + let mut left_chunks = left.values().chunks_exact(lanes); + let mut right_chunks = right.values().chunks_exact(lanes); result_chunks .borrow_mut() @@ -390,8 +390,8 @@ where // process data in chunks of 64 elements since we also get 64 bits of validity information at a time let mut result_chunks = result.typed_data_mut().chunks_exact_mut(64); - let mut left_chunks = left.value_slice(0, left.len()).chunks_exact(64); - let mut right_chunks = right.value_slice(0, left.len()).chunks_exact(64); + let mut left_chunks = left.values().chunks_exact(64); + let mut right_chunks = right.values().chunks_exact(64); valid_chunks .iter() @@ -435,8 +435,8 @@ where } None => { let mut result_chunks = result.typed_data_mut().chunks_exact_mut(lanes); - let mut left_chunks = left.value_slice(0, left.len()).chunks_exact(lanes); - let mut right_chunks = right.value_slice(0, left.len()).chunks_exact(lanes); + let mut left_chunks = left.values().chunks_exact(lanes); + let mut right_chunks = right.values().chunks_exact(lanes); result_chunks .borrow_mut() From a8e4f8b799b9c4580e8e26a09e83491353ebcd38 Mon Sep 17 00:00:00 2001 From: Tyrel Rink <44035897+tyrelr@users.noreply.github.com> Date: Tue, 22 Dec 2020 21:47:14 -0600 Subject: [PATCH 4/8] Reinstate a temporary value_slice function to avoid duplicating work from ARROW-10990 --- rust/arrow/src/array/array_primitive.rs | 13 ++++++++++++- rust/arrow/src/compute/kernels/comparison.rs | 16 ++++++++++------ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/rust/arrow/src/array/array_primitive.rs b/rust/arrow/src/array/array_primitive.rs index db0dec74485..c6f3f8bb32c 100644 --- a/rust/arrow/src/array/array_primitive.rs +++ b/rust/arrow/src/array/array_primitive.rs @@ -63,9 +63,20 @@ impl PrimitiveArray { self.data.is_empty() } + /// Returns a slice for the given offset and length + /// + /// Note this doesn't do any bound checking, for performance reason. + /// # SAFETY + /// caller must ensure that the passed in offset + len are less than the array len() + #[deprecated(note = "Please use values() instead")] + pub unsafe fn value_slice(&self, offset: usize, len: usize) -> &[T::Native] { + let raw = std::slice::from_raw_parts(self.raw_values.get().add(self.data.offset()).add(offset),len); + &raw[..] + } + /// Returns a slice of the values of this array pub fn values(&self) -> &[T::Native] { - let raw = unsafe { std::slice::from_raw_parts(self.raw_values.get().add(self.data.offset()), self.len()) }; + let raw = unsafe {std::slice::from_raw_parts(self.raw_values.get().add(self.data.offset()),self.len())}; &raw[..] } diff --git a/rust/arrow/src/compute/kernels/comparison.rs b/rust/arrow/src/compute/kernels/comparison.rs index 6c9d575b5b6..35f93304371 100644 --- a/rust/arrow/src/compute/kernels/comparison.rs +++ b/rust/arrow/src/compute/kernels/comparison.rs @@ -398,8 +398,8 @@ where let rem = len % lanes; for i in (0..len - rem).step_by(lanes) { - let simd_left = T::load(left.value_slice(i, lanes)); - let simd_right = T::load(right.value_slice(i, lanes)); + let simd_left = T::load(unsafe { left.value_slice(i, lanes) }); + let simd_right = T::load(unsafe { right.value_slice(i, lanes) }); let simd_result = op(simd_left, simd_right); T::bitmask(&simd_result, |b| { result.extend_from_slice(b); @@ -407,8 +407,10 @@ where } if rem > 0 { - let simd_left = T::load(left.value_slice(len - rem, lanes)); - let simd_right = T::load(right.value_slice(len - rem, lanes)); + //Soundness + // This is not sound because it can read past the end of PrimitiveArray buffer (lanes is always greater than rem), see ARROW-10990 + let simd_left = T::load(unsafe { left.value_slice(len - rem, lanes) }); + let simd_right = T::load(unsafe { right.value_slice(len - rem, lanes) }); let simd_result = op(simd_left, simd_right); let rem_buffer_size = (rem as f32 / 8f32).ceil() as usize; T::bitmask(&simd_result, |b| { @@ -451,7 +453,7 @@ where let rem = len % lanes; for i in (0..len - rem).step_by(lanes) { - let simd_left = T::load(left.value_slice(i, lanes)); + let simd_left = T::load(unsafe { left.value_slice(i, lanes) }); let simd_result = op(simd_left, simd_right); T::bitmask(&simd_result, |b| { result.extend_from_slice(b); @@ -459,7 +461,9 @@ where } if rem > 0 { - let simd_left = T::load(left.value_slice(len - rem, lanes)); + //Soundness + // This is not sound because it can read past the end of PrimitiveArray buffer (lanes is always greater than rem), see ARROW-10990 + let simd_left = T::load(unsafe { left.value_slice(len - rem, lanes) }); let simd_result = op(simd_left, simd_right); let rem_buffer_size = (rem as f32 / 8f32).ceil() as usize; T::bitmask(&simd_result, |b| { From be2ea6ff265d8337b8c466afb51002d66219e8b6 Mon Sep 17 00:00:00 2001 From: Tyrel Rink <44035897+tyrelr@users.noreply.github.com> Date: Tue, 22 Dec 2020 22:00:48 -0600 Subject: [PATCH 5/8] fix format --- rust/arrow/src/array/array_primitive.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/rust/arrow/src/array/array_primitive.rs b/rust/arrow/src/array/array_primitive.rs index c6f3f8bb32c..d8307bf1a8b 100644 --- a/rust/arrow/src/array/array_primitive.rs +++ b/rust/arrow/src/array/array_primitive.rs @@ -70,13 +70,21 @@ impl PrimitiveArray { /// caller must ensure that the passed in offset + len are less than the array len() #[deprecated(note = "Please use values() instead")] pub unsafe fn value_slice(&self, offset: usize, len: usize) -> &[T::Native] { - let raw = std::slice::from_raw_parts(self.raw_values.get().add(self.data.offset()).add(offset),len); + let raw = std::slice::from_raw_parts( + self.raw_values.get().add(self.data.offset()).add(offset), + len, + ); &raw[..] } /// Returns a slice of the values of this array pub fn values(&self) -> &[T::Native] { - let raw = unsafe {std::slice::from_raw_parts(self.raw_values.get().add(self.data.offset()),self.len())}; + let raw = unsafe { + std::slice::from_raw_parts( + self.raw_values.get().add(self.data.offset()), + self.len(), + ) + }; &raw[..] } From 56c9f3b581e6c8c10480ab032a98d6ba5b41c83e Mon Sep 17 00:00:00 2001 From: Tyrel Rink <44035897+tyrelr@users.noreply.github.com> Date: Tue, 22 Dec 2020 22:23:42 -0600 Subject: [PATCH 6/8] fix missing safety section --- rust/arrow/src/array/array_primitive.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/arrow/src/array/array_primitive.rs b/rust/arrow/src/array/array_primitive.rs index d8307bf1a8b..af3aaa70d79 100644 --- a/rust/arrow/src/array/array_primitive.rs +++ b/rust/arrow/src/array/array_primitive.rs @@ -66,7 +66,7 @@ impl PrimitiveArray { /// Returns a slice for the given offset and length /// /// Note this doesn't do any bound checking, for performance reason. - /// # SAFETY + /// # Safety /// caller must ensure that the passed in offset + len are less than the array len() #[deprecated(note = "Please use values() instead")] pub unsafe fn value_slice(&self, offset: usize, len: usize) -> &[T::Native] { From 2e0aa18639cc9dc865b9a0f5c1fedbc04337e449 Mon Sep 17 00:00:00 2001 From: Tyrel Rink <44035897+tyrelr@users.noreply.github.com> Date: Wed, 23 Dec 2020 11:52:29 -0600 Subject: [PATCH 7/8] apply review feedback --- rust/arrow/src/array/array_primitive.rs | 26 ++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/rust/arrow/src/array/array_primitive.rs b/rust/arrow/src/array/array_primitive.rs index af3aaa70d79..625d99dc230 100644 --- a/rust/arrow/src/array/array_primitive.rs +++ b/rust/arrow/src/array/array_primitive.rs @@ -43,17 +43,21 @@ const NANOSECONDS: i64 = 1_000_000_000; /// Array whose elements are of primitive types. pub struct PrimitiveArray { + /// Underlying ArrayData + /// # Safety + /// must have exactly one buffer, aligned to type T data: ArrayDataRef, /// Pointer to the value array. The lifetime of this must be <= to the value buffer /// stored in `data`, so it's safe to store. - /// Also note that boolean arrays are bit-packed, so although the underlying pointer - /// is of type bool it should be cast back to u8 before being used. - /// i.e. `self.raw_values.get() as *const u8` + /// # Safety + /// raw_values must have a value equivalent to data.buffers()[0].raw_data() + /// raw_values must have alignment for type T::NativeType raw_values: RawPtrBox, } impl PrimitiveArray { /// Returns the length of this array. + #[inline] pub fn len(&self) -> usize { self.data.len() } @@ -70,22 +74,24 @@ impl PrimitiveArray { /// caller must ensure that the passed in offset + len are less than the array len() #[deprecated(note = "Please use values() instead")] pub unsafe fn value_slice(&self, offset: usize, len: usize) -> &[T::Native] { - let raw = std::slice::from_raw_parts( + std::slice::from_raw_parts( self.raw_values.get().add(self.data.offset()).add(offset), len, - ); - &raw[..] + ) } /// Returns a slice of the values of this array + #[inline] pub fn values(&self) -> &[T::Native] { - let raw = unsafe { + // Soundness + // raw_values alignment & location is ensured by fn from(ArrayDataRef) + // buffer bounds/offset is ensured by the ArrayData instance. + unsafe { std::slice::from_raw_parts( self.raw_values.get().add(self.data.offset()), self.len(), ) - }; - &raw[..] + } } // Returns a new primitive array builder @@ -96,6 +102,8 @@ impl PrimitiveArray { /// Returns the primitive value at index `i`. /// /// Note this doesn't do any bound checking, for performance reason. + /// # Safety + /// caller must ensure that the passed in offset is less than the array len() pub fn value(&self, i: usize) -> T::Native { let offset = i + self.offset(); unsafe { *self.raw_values.get().add(offset) } From b94c46fd28a25249914d593cb05266601d06c745 Mon Sep 17 00:00:00 2001 From: Tyrel Rink <44035897+tyrelr@users.noreply.github.com> Date: Wed, 23 Dec 2020 14:17:25 -0600 Subject: [PATCH 8/8] fix indentation causing prose to be treated as a doctest --- rust/arrow/src/array/array_primitive.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/arrow/src/array/array_primitive.rs b/rust/arrow/src/array/array_primitive.rs index 625d99dc230..1faf0db1499 100644 --- a/rust/arrow/src/array/array_primitive.rs +++ b/rust/arrow/src/array/array_primitive.rs @@ -45,13 +45,13 @@ const NANOSECONDS: i64 = 1_000_000_000; pub struct PrimitiveArray { /// Underlying ArrayData /// # Safety - /// must have exactly one buffer, aligned to type T + /// must have exactly one buffer, aligned to type T data: ArrayDataRef, /// Pointer to the value array. The lifetime of this must be <= to the value buffer /// stored in `data`, so it's safe to store. /// # Safety - /// raw_values must have a value equivalent to data.buffers()[0].raw_data() - /// raw_values must have alignment for type T::NativeType + /// raw_values must have a value equivalent to data.buffers()[0].raw_data() + /// raw_values must have alignment for type T::NativeType raw_values: RawPtrBox, }