From c675410458bf81f35a9d3c40f05f6d67a529a3fb Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Thu, 16 May 2019 23:48:07 +0200 Subject: [PATCH 1/9] ARROW-5351: [Rust] Take kernel --- rust/arrow/src/array/array.rs | 5 + rust/arrow/src/compute/kernels/mod.rs | 1 + rust/arrow/src/compute/kernels/take.rs | 193 +++++++++++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 rust/arrow/src/compute/kernels/take.rs diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs index 2c353d578f0..410c12608ea 100644 --- a/rust/arrow/src/array/array.rs +++ b/rust/arrow/src/array/array.rs @@ -994,6 +994,11 @@ impl StructArray { pub fn num_columns(&self) -> usize { self.boxed_fields.len() } + + /// Returns the fields of the struct array + pub fn columns(&self) -> Vec<&ArrayRef> { + self.boxed_fields.iter().collect() + } } impl From for StructArray { diff --git a/rust/arrow/src/compute/kernels/mod.rs b/rust/arrow/src/compute/kernels/mod.rs index 2483f519b97..ae1ab0cc45d 100644 --- a/rust/arrow/src/compute/kernels/mod.rs +++ b/rust/arrow/src/compute/kernels/mod.rs @@ -21,4 +21,5 @@ pub mod arithmetic; pub mod boolean; pub mod cast; pub mod comparison; +pub mod take; pub mod temporal; diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs new file mode 100644 index 00000000000..fee11e9df2e --- /dev/null +++ b/rust/arrow/src/compute/kernels/take.rs @@ -0,0 +1,193 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines take kernel for `ArrayRef` + +use std::sync::Arc; + +use crate::array::*; +use crate::builder::*; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +pub fn take(array: &ArrayRef, index: &UInt32Array) -> Result { + use TimeUnit::*; + match array.data_type() { + DataType::Boolean => panic!(), + DataType::Int8 => take_numeric::(array, index), + DataType::Int16 => take_numeric::(array, index), + DataType::Int32 => take_numeric::(array, index), + DataType::Int64 => take_numeric::(array, index), + DataType::UInt8 => take_numeric::(array, index), + DataType::UInt16 => take_numeric::(array, index), + DataType::UInt32 => take_numeric::(array, index), + DataType::UInt64 => take_numeric::(array, index), + DataType::Float32 => take_numeric::(array, index), + DataType::Float64 => take_numeric::(array, index), + DataType::Date32(_) => take_numeric::(array, index), + DataType::Date64(_) => take_numeric::(array, index), + DataType::Time32(Second) => take_numeric::(array, index), + DataType::Time32(Millisecond) => { + take_numeric::(array, index) + } + DataType::Time64(Microsecond) => { + take_numeric::(array, index) + } + DataType::Time64(Nanosecond) => { + take_numeric::(array, index) + } + DataType::Timestamp(Second) => take_numeric::(array, index), + DataType::Timestamp(Millisecond) => { + take_numeric::(array, index) + } + DataType::Timestamp(Microsecond) => { + take_numeric::(array, index) + } + DataType::Timestamp(Nanosecond) => { + take_numeric::(array, index) + } + DataType::Utf8 => panic!(), + DataType::List(_) => unimplemented!(), + DataType::Struct(fields) => { + let struct_: &StructArray = + array.as_any().downcast_ref::().unwrap(); + let arrays: Result> = + struct_.columns().iter().map(|a| take(a, index)).collect(); + let arrays = arrays?; + let pairs: Vec<(Field, ArrayRef)> = + fields.clone().into_iter().zip(arrays).collect(); + Ok(Arc::new(StructArray::from(pairs)) as ArrayRef) + } + t @ _ => unimplemented!("Sort not supported for data type {:?}", t), + } +} + +/// `take` implementation for numeric arrays +fn take_numeric(array: &ArrayRef, index: &UInt32Array) -> Result +where + T: ArrowNumericType, +{ + let mut builder = PrimitiveBuilder::::new(index.len()); + let a = array.as_any().downcast_ref::>().unwrap(); + let len = a.len(); + for i in 0..index.len() { + if index.is_null(i) { + builder.append_null()?; + } else { + let ix = index.value(i) as usize; + if ix >= len { + return Err(ArrowError::ComputeError( + format!("Array index out of bounds, cannot get item at index {} from {} length", ix, len)) + ); + } else { + if a.is_null(ix) { + builder.append_null()?; + } else { + builder.append_value(a.value(ix))?; + } + } + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn take_test_numeric<'a, T>( + data: Vec>, + index: &UInt32Array, + ) -> ArrayRef + where + T: ArrowNumericType, + PrimitiveArray: From>>, + { + let a = PrimitiveArray::::from(data); + take(&(Arc::new(a) as ArrayRef), index).unwrap() + } + + #[test] + fn take_primitive() { + let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(3)]); + + // uint8 + let a = take_test_numeric::( + vec![Some(0), None, Some(2), Some(3), None], + &index, + ); + assert_eq!(index.len(), a.len()); + let b = UInt8Array::from(vec![Some(3), None, None, Some(3), Some(3)]); + let a = a.as_any().downcast_ref::().unwrap(); + assert_eq!(b.data(), a.data()); + + // uint16 + let a = take_test_numeric::( + vec![Some(0), None, Some(2), Some(3), None], + &index, + ); + assert_eq!(index.len(), a.len()); + let b = UInt16Array::from(vec![Some(3), None, None, Some(3), Some(3)]); + let a = a.as_any().downcast_ref::().unwrap(); + assert_eq!(b.data(), a.data()); + + // uint32 + let a = take_test_numeric::( + vec![Some(0), None, Some(2), Some(3), None], + &index, + ); + assert_eq!(index.len(), a.len()); + let b = UInt32Array::from(vec![Some(3), None, None, Some(3), Some(3)]); + let a = a.as_any().downcast_ref::().unwrap(); + assert_eq!(b.data(), a.data()); + + // uint64 + let a = take_test_numeric::( + vec![Some(0), None, Some(2), Some(3), None], + &index, + ); + assert_eq!(index.len(), a.len()); + let b = UInt64Array::from(vec![Some(3), None, None, Some(3), Some(3)]); + let a = a.as_any().downcast_ref::().unwrap(); + assert_eq!(b.data(), a.data()); + + // int8 + let a = take_test_numeric::( + vec![Some(0), None, Some(2), Some(-15), None], + &index, + ); + assert_eq!(index.len(), a.len()); + let b = Int8Array::from(vec![Some(-15), None, None, Some(-15), Some(-15)]); + let a = a.as_any().downcast_ref::().unwrap(); + assert_eq!(b.data(), a.data()); + } + + // #[test] + // fn take_bool() {} + + // #[test] + // fn take_binary() {} + + // #[test] + // fn take_list() {} + + // #[test] + // fn take_struct() {} + + // #[test] + // fn take_out_of_bounds() {} +} From 03014681b4fe1939fbef3c2ac6ffcdbafe6f11f0 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Thu, 23 May 2019 18:46:50 +0200 Subject: [PATCH 2/9] complete take functions for different arrays Tests are still incomplete, and there aren't benchmarks yet --- rust/arrow/src/array/builder.rs | 10 ++ rust/arrow/src/compute/kernels/take.rs | 220 +++++++++++++++++++++++-- rust/arrow/src/compute/util.rs | 35 ++++ 3 files changed, 251 insertions(+), 14 deletions(-) diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index b0b97c22107..f032cd24a54 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -472,6 +472,16 @@ impl BinaryBuilder { Ok(()) } + /// Appends a byte slice into the builder. + /// + /// Automatically calls the `append` method to delimit the slice appended in as a + /// distinct array element. + pub fn append_values(&mut self, value: &[u8]) -> Result<()> { + self.builder.values().append_slice(value)?; + self.builder.append(true)?; + Ok(()) + } + /// Appends a `&String` or `&str` into the builder. /// /// Automatically calls the `append` method to delimit the string appended in as a diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs index fee11e9df2e..e32f07013a5 100644 --- a/rust/arrow/src/compute/kernels/take.rs +++ b/rust/arrow/src/compute/kernels/take.rs @@ -20,14 +20,26 @@ use std::sync::Arc; use crate::array::*; +use crate::array_data::ArrayData; +use crate::buffer::Buffer; use crate::builder::*; +use crate::compute::util::take_index_from_list; use crate::datatypes::*; use crate::error::{ArrowError, Result}; -pub fn take(array: &ArrayRef, index: &UInt32Array) -> Result { +pub fn take( + array: &ArrayRef, + index: &UInt32Array, + options: Option<&TakeOptions>, +) -> Result { use TimeUnit::*; + + let options = options.map(|opt| opt.clone()).unwrap_or(Default::default()); + if options.check_bounds { + // TODO check bounds once, and fail early if index is out of bounds + } match array.data_type() { - DataType::Boolean => panic!(), + DataType::Boolean => take_bool(array, index), DataType::Int8 => take_numeric::(array, index), DataType::Int16 => take_numeric::(array, index), DataType::Int32 => take_numeric::(array, index), @@ -60,13 +72,16 @@ pub fn take(array: &ArrayRef, index: &UInt32Array) -> Result { DataType::Timestamp(Nanosecond) => { take_numeric::(array, index) } - DataType::Utf8 => panic!(), - DataType::List(_) => unimplemented!(), + DataType::Utf8 => take_binary(array, index), + DataType::List(_) => take_list(array, index), DataType::Struct(fields) => { let struct_: &StructArray = array.as_any().downcast_ref::().unwrap(); - let arrays: Result> = - struct_.columns().iter().map(|a| take(a, index)).collect(); + let arrays: Result> = struct_ + .columns() + .iter() + .map(|a| take(a, index, Some(&options))) + .collect(); let arrays = arrays?; let pairs: Vec<(Field, ArrayRef)> = fields.clone().into_iter().zip(arrays).collect(); @@ -76,6 +91,21 @@ pub fn take(array: &ArrayRef, index: &UInt32Array) -> Result { } } +/// Options that define how `take` should behave +#[derive(Clone)] +pub struct TakeOptions { + /// perform bounds check before taking + pub check_bounds: bool, +} + +impl Default for TakeOptions { + fn default() -> Self { + Self { + check_bounds: false, + } + } +} + /// `take` implementation for numeric arrays fn take_numeric(array: &ArrayRef, index: &UInt32Array) -> Result where @@ -105,6 +135,81 @@ where Ok(Arc::new(builder.finish()) as ArrayRef) } +/// `take` implementation for binary arrays +fn take_binary(array: &ArrayRef, index: &UInt32Array) -> Result { + let mut builder = BinaryBuilder::new(index.len()); + let a = array.as_any().downcast_ref::().unwrap(); + let len = a.len(); + for i in 0..index.len() { + if index.is_null(i) { + builder.append(false)?; + } else { + let ix = index.value(i) as usize; + if ix >= len { + return Err(ArrowError::ComputeError( + format!("Array index out of bounds, cannot get item at index {} from {} length", ix, len)) + ); + } else { + if a.is_null(ix) { + builder.append(false)?; + } else { + builder.append_values(a.value(ix))?; + } + } + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) +} + +/// `take` implementation for boolean arrays +fn take_bool(array: &ArrayRef, index: &UInt32Array) -> Result { + let mut builder = BooleanBuilder::new(index.len()); + let a = array.as_any().downcast_ref::().unwrap(); + let len = a.len(); + for i in 0..index.len() { + if index.is_null(i) { + builder.append_null()?; + } else { + let ix = index.value(i) as usize; + if ix >= len { + return Err(ArrowError::ComputeError( + format!("Array index out of bounds, cannot get item at index {} from {} length", ix, len)) + ); + } else { + if a.is_null(ix) { + builder.append_null()?; + } else { + builder.append_value(a.value(ix))?; + } + } + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) +} + +/// `take` implementation for list arrays +/// +/// Works by calculating the index and indexed offset for the inner array, +/// applying `take` on the inner array, then reconstructing a list array +/// with the indexed offsets +fn take_list(array: &ArrayRef, index: &UInt32Array) -> Result { + let list: &ListArray = array.as_any().downcast_ref::().unwrap(); + let (indices, offsets) = take_index_from_list(array, index); + let taken = take(&list.values(), &indices, None)?; + let value_offsets = Buffer::from(offsets[..].to_byte_slice()); + let list_data = ArrayData::new( + list.data_type().clone(), + index.len(), + Some(index.null_count()), + taken.data().null_bitmap().clone().map(|bitmap| bitmap.bits), + 0, + vec![value_offsets], + vec![taken.data()], + ); + let list_array = Arc::new(ListArray::from(Arc::new(list_data))) as ArrayRef; + Ok(list_array) +} + #[cfg(test)] mod tests { use super::*; @@ -118,11 +223,11 @@ mod tests { PrimitiveArray: From>>, { let a = PrimitiveArray::::from(data); - take(&(Arc::new(a) as ArrayRef), index).unwrap() + take(&(Arc::new(a) as ArrayRef), index, None).unwrap() } #[test] - fn take_primitive() { + fn test_take_primitive() { let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(3)]); // uint8 @@ -176,14 +281,101 @@ mod tests { assert_eq!(b.data(), a.data()); } - // #[test] - // fn take_bool() {} + #[test] + fn test_take_bool() { + let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(4)]); + let array = BooleanArray::from(vec![ + Some(true), + Some(false), + None, + Some(false), + Some(true), + None, + ]); + let array = Arc::new(array) as ArrayRef; + let a = take(&array, &index, None).unwrap(); + assert_eq!(a.len(), index.len()); + let b = BooleanArray::from(vec![ + Some(false), + None, + Some(false), + Some(false), + Some(true), + ]); + assert_eq!(a.data(), b.data()); + } - // #[test] - // fn take_binary() {} + #[test] + fn test_take_binary() { + let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(4)]); + let mut builder: BinaryBuilder = BinaryBuilder::new(6); + builder.append_string("one").unwrap(); + builder.append_null().unwrap(); + builder.append_string("three").unwrap(); + builder.append_string("four").unwrap(); + builder.append_string("five").unwrap(); + let array = Arc::new(builder.finish()) as ArrayRef; + let a = take(&array, &index, None).unwrap(); + assert_eq!(a.len(), index.len()); + builder.append_string("four").unwrap(); + builder.append_null().unwrap(); + builder.append_null().unwrap(); + builder.append_string("four").unwrap(); + builder.append_string("five").unwrap(); + let b = builder.finish(); + assert_eq!(a.data(), b.data()); + } - // #[test] - // fn take_list() {} + #[test] + fn test_take_list() { + // Construct a value array, [[0,0,0], [-1,-2,-1], [2,3]] + let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 3]).data(); + // Construct offsets + let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); + // Construct a list array from the above two + let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data = ArrayData::builder(list_data_type.clone()) + .len(3) + .add_buffer(value_offsets.clone()) + .add_child_data(value_data.clone()) + .build(); + let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; + + // index returns: [[2,3], null, [-1,-2,-1], [2,3], [0,0,0]] + let index = UInt32Array::from(vec![Some(2), None, Some(1), Some(2), Some(0)]); + + let a = take(&list_array, &index, None).unwrap(); + let a = a.as_any().downcast_ref::().unwrap(); + assert_eq!(5, a.len()); + let b = a.values(); + let b = Int32Array::from(b.data()); + + let taken_values = Int32Array::from(vec![ + Some(2), + Some(3), + None, + Some(-1), + Some(-2), + Some(-1), + Some(2), + Some(3), + Some(0), + Some(0), + Some(0), + ]); + let taken_offsets = Buffer::from(&[0, 2, 2, 5, 7, 10].to_byte_slice()); + let taken_list_data = ArrayData::builder(list_data_type.clone()) + .len(5) + .null_count(1) + .add_buffer(taken_offsets.clone()) + .add_child_data(taken_values.data().clone()) + .build(); + // taken values should match b + assert_eq!(format!("{:?}", b), format!("{:?}", taken_values)); + // assert_eq!(b.data(), taken_values.data()); + // list data should be equal + // assert_eq!(taken_list_data, a.data()); + } // #[test] // fn take_struct() {} diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs index 55726b85eda..f22b04ff94c 100644 --- a/rust/arrow/src/compute/util.rs +++ b/rust/arrow/src/compute/util.rs @@ -17,6 +17,7 @@ //! Common utilities for computation kernels. +use crate::array::{Array, ArrayRef, ListArray, UInt32Array}; use crate::bitmap::Bitmap; use crate::buffer::Buffer; use crate::error::Result; @@ -44,6 +45,40 @@ where } } +/// Takes/filters a list array's inner offsets +pub(crate) fn take_index_from_list( + array: &ArrayRef, + index: &UInt32Array, +) -> (UInt32Array, Vec) { + // TODO complete documenting, and add an example + // TODO benchmark this function, there might be a faster unsafe alternative + // get list array's offsets + let list: &ListArray = array.as_any().downcast_ref::().unwrap(); + let offsets: Vec = (0..=list.len()) + .map(|i| list.value_offset(i) as u32) + .collect(); + let mut new_offsets = Vec::with_capacity(index.len()); + let mut current_offset = 0; + // add first offset + new_offsets.push(0); + let values: Vec> = (0..index.len()) + .flat_map(|i: usize| { + if index.is_valid(i) { + let ix = index.value(i) as usize; + let start = offsets[ix]; + let end = offsets[ix + 1]; + current_offset += (end - start) as i32; + new_offsets.push(current_offset); + // type annotation needed to guide compiler a bit + (start..end).map(|v| Some(v)).collect::>>() + } else { + vec![None] + } + }) + .collect(); + (UInt32Array::from(values), new_offsets) +} + #[cfg(test)] mod tests { use super::*; From 5f44899447aa19410988cf3987114a3c404f037c Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Thu, 23 May 2019 21:25:34 +0000 Subject: [PATCH 3/9] add list and struct tests --- rust/arrow/src/compute/kernels/take.rs | 173 +++++++++++++++++++------ rust/arrow/src/compute/util.rs | 41 +++++- 2 files changed, 174 insertions(+), 40 deletions(-) diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs index e32f07013a5..48b9a85740d 100644 --- a/rust/arrow/src/compute/kernels/take.rs +++ b/rust/arrow/src/compute/kernels/take.rs @@ -27,6 +27,11 @@ use crate::compute::util::take_index_from_list; use crate::datatypes::*; use crate::error::{ArrowError, Result}; +/// Take elements from `ArrayRef` by supplying an array of indices. +/// +/// Supports: +/// * null indices, returning a null value for the index +/// * checking for overflowing indices pub fn take( array: &ArrayRef, index: &UInt32Array, @@ -36,7 +41,17 @@ pub fn take( let options = options.map(|opt| opt.clone()).unwrap_or(Default::default()); if options.check_bounds { - // TODO check bounds once, and fail early if index is out of bounds + let len = array.len(); + for i in 0..index.len() { + if index.is_valid(i) { + let ix = index.value(i) as usize; + if ix >= len { + return Err(ArrowError::ComputeError( + format!("Array index out of bounds, cannot get item at index {} from {} length", ix, len)) + ); + } + } + } } match array.data_type() { DataType::Boolean => take_bool(array, index), @@ -113,22 +128,15 @@ where { let mut builder = PrimitiveBuilder::::new(index.len()); let a = array.as_any().downcast_ref::>().unwrap(); - let len = a.len(); for i in 0..index.len() { if index.is_null(i) { builder.append_null()?; } else { let ix = index.value(i) as usize; - if ix >= len { - return Err(ArrowError::ComputeError( - format!("Array index out of bounds, cannot get item at index {} from {} length", ix, len)) - ); + if a.is_null(ix) { + builder.append_null()?; } else { - if a.is_null(ix) { - builder.append_null()?; - } else { - builder.append_value(a.value(ix))?; - } + builder.append_value(a.value(ix))?; } } } @@ -139,22 +147,15 @@ where fn take_binary(array: &ArrayRef, index: &UInt32Array) -> Result { let mut builder = BinaryBuilder::new(index.len()); let a = array.as_any().downcast_ref::().unwrap(); - let len = a.len(); for i in 0..index.len() { if index.is_null(i) { builder.append(false)?; } else { let ix = index.value(i) as usize; - if ix >= len { - return Err(ArrowError::ComputeError( - format!("Array index out of bounds, cannot get item at index {} from {} length", ix, len)) - ); + if a.is_null(ix) { + builder.append(false)?; } else { - if a.is_null(ix) { - builder.append(false)?; - } else { - builder.append_values(a.value(ix))?; - } + builder.append_values(a.value(ix))?; } } } @@ -165,22 +166,15 @@ fn take_binary(array: &ArrayRef, index: &UInt32Array) -> Result { fn take_bool(array: &ArrayRef, index: &UInt32Array) -> Result { let mut builder = BooleanBuilder::new(index.len()); let a = array.as_any().downcast_ref::().unwrap(); - let len = a.len(); for i in 0..index.len() { if index.is_null(i) { builder.append_null()?; } else { let ix = index.value(i) as usize; - if ix >= len { - return Err(ArrowError::ComputeError( - format!("Array index out of bounds, cannot get item at index {} from {} length", ix, len)) - ); + if a.is_null(ix) { + builder.append_null()?; } else { - if a.is_null(ix) { - builder.append_null()?; - } else { - builder.append_value(a.value(ix))?; - } + builder.append_value(a.value(ix))?; } } } @@ -189,7 +183,7 @@ fn take_bool(array: &ArrayRef, index: &UInt32Array) -> Result { /// `take` implementation for list arrays /// -/// Works by calculating the index and indexed offset for the inner array, +/// Calculates the index and indexed offset for the inner array, /// applying `take` on the inner array, then reconstructing a list array /// with the indexed offsets fn take_list(array: &ArrayRef, index: &UInt32Array) -> Result { @@ -345,7 +339,7 @@ mod tests { let index = UInt32Array::from(vec![Some(2), None, Some(1), Some(2), Some(0)]); let a = take(&list_array, &index, None).unwrap(); - let a = a.as_any().downcast_ref::().unwrap(); + let a: &ListArray = a.as_any().downcast_ref::().unwrap(); assert_eq!(5, a.len()); let b = a.values(); let b = Int32Array::from(b.data()); @@ -368,17 +362,122 @@ mod tests { .len(5) .null_count(1) .add_buffer(taken_offsets.clone()) + .null_bit_buffer(Buffer::from([0b11111011, 0b00000111])) .add_child_data(taken_values.data().clone()) .build(); // taken values should match b assert_eq!(format!("{:?}", b), format!("{:?}", taken_values)); - // assert_eq!(b.data(), taken_values.data()); + assert_eq!(b.data(), taken_values.data()); + // list offsets should be the same + assert_eq!(a.data_ref().buffers(), &[taken_offsets]); // list data should be equal - // assert_eq!(taken_list_data, a.data()); + assert_eq!(taken_list_data, a.data()); } - // #[test] - // fn take_struct() {} + #[test] + fn test_take_list_with_nulls() { + // Construct a value array, [[0,null,0], [-1,-2,3], null, [2,null]] + let value_data = Int32Array::from(vec![ + Some(0), + None, + Some(0), + Some(-1), + Some(-2), + Some(3), + None, + Some(5), + None, + ]) + .data(); + // Construct offsets + let value_offsets = Buffer::from(&[0, 3, 6, 7, 9].to_byte_slice()); + // Construct a list array from the above two + let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data = ArrayData::builder(list_data_type.clone()) + .len(4) + .add_buffer(value_offsets.clone()) + .null_count(1) + .null_bit_buffer(Buffer::from([0b10111101, 0b00000000])) + .add_child_data(value_data.clone()) + .build(); + let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; + + // index returns: [null, null, [-1,-2,-1], [2,null], [0,null,0]] + let index = UInt32Array::from(vec![Some(2), None, Some(1), Some(3), Some(0)]); + + let a = take(&list_array, &index, None).unwrap(); + let a: &ListArray = a.as_any().downcast_ref::().unwrap(); + assert_eq!(5, a.len()); + let b = a.values(); + let b = Int32Array::from(b.data()); + + let taken_values = Int32Array::from(vec![ + None, + None, + Some(-1), + Some(-2), + Some(3), + Some(5), + None, + Some(0), + None, + Some(0), + ]); + let taken_offsets = Buffer::from(&[0, 1, 1, 4, 6, 9].to_byte_slice()); + let taken_list_data = ArrayData::builder(list_data_type.clone()) + .len(5) + .null_count(2) + .add_buffer(taken_offsets.clone()) + .null_bit_buffer(Buffer::from([0b00111101, 0b00000001])) + .add_child_data(taken_values.data().clone()) + .build(); + // taken values should match b + assert_eq!(format!("{:?}", b), format!("{:?}", taken_values)); + assert_eq!(b.data(), taken_values.data()); + // list offsets should be the same + assert_eq!(a.data_ref().buffers(), &[taken_offsets]); + // list data should be equal + assert_eq!(taken_list_data, a.data()); + } + + #[test] + fn take_struct() { + let boolean_data = ArrayData::builder(DataType::Boolean) + .len(4) + .add_buffer(Buffer::from([true, false, true, false].to_byte_slice())) + .build(); + let int_data = ArrayData::builder(DataType::Int32) + .len(4) + .add_buffer(Buffer::from([42, 28, 19, 31].to_byte_slice())) + .build(); + let mut field_types = vec![]; + field_types.push(Field::new("a", DataType::Boolean, true)); + field_types.push(Field::new("b", DataType::Int32, true)); + let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) + .len(4) + .add_child_data(boolean_data.clone()) + .add_child_data(int_data.clone()) + .build(); + let struct_array = StructArray::from(struct_array_data); + let array = Arc::new(struct_array) as ArrayRef; + + let index = UInt32Array::from(vec![0, 3, 1, 0, 2]); + let a = take(&array, &index, None).unwrap(); + let a: &StructArray = a.as_any().downcast_ref::().unwrap(); + assert_eq!(index.len(), a.len()); + assert_eq!(0, a.null_count()); + + let b = BooleanArray::from(vec![true, false, false, true, false]); + let c = Int32Array::from(vec![42, 31, 28, 42, 19]); + let bools = a.column(0); + let bools = bools.as_any().downcast_ref::().unwrap(); + let ints = a.column(1); + let ints = ints.as_any().downcast_ref::().unwrap(); + assert_eq!(format!("{:?}", bools), format!("{:?}", b)); + assert_eq!(format!("{:?}", ints), format!("{:?}", c)); + assert_eq!(b.data(), bools.data()); + assert_eq!(c.data(), a.column(1).data()); + } // #[test] // fn take_out_of_bounds() {} diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs index f22b04ff94c..7a5cbfa2a00 100644 --- a/rust/arrow/src/compute/util.rs +++ b/rust/arrow/src/compute/util.rs @@ -17,7 +17,7 @@ //! Common utilities for computation kernels. -use crate::array::{Array, ArrayRef, ListArray, UInt32Array}; +use crate::array::*; use crate::bitmap::Bitmap; use crate::buffer::Buffer; use crate::error::Result; @@ -45,12 +45,15 @@ where } } -/// Takes/filters a list array's inner offsets +/// Takes/filters a list array's inner data using the offsets of the list array. +/// +/// Where a list array has index `[0,2,5,10]`, taking an index of `[2,0]` returns +/// an array of the indices `[5..10, 0..2]` and offsets `[0,5,7]` (5 elements and 2 +/// elements) pub(crate) fn take_index_from_list( array: &ArrayRef, index: &UInt32Array, ) -> (UInt32Array, Vec) { - // TODO complete documenting, and add an example // TODO benchmark this function, there might be a faster unsafe alternative // get list array's offsets let list: &ListArray = array.as_any().downcast_ref::().unwrap(); @@ -72,6 +75,7 @@ pub(crate) fn take_index_from_list( // type annotation needed to guide compiler a bit (start..end).map(|v| Some(v)).collect::>>() } else { + new_offsets.push(current_offset); vec![None] } }) @@ -83,6 +87,11 @@ pub(crate) fn take_index_from_list( mod tests { use super::*; + use std::sync::Arc; + + use crate::array_data::ArrayData; + use crate::datatypes::{DataType, ToByteSlice}; + #[test] fn test_apply_bin_op_to_option_bitmap() { assert_eq!( @@ -115,4 +124,30 @@ mod tests { ); } + #[test] + fn test_take_index_from_list() { + let value_data = Int32Array::from((0..10).collect::>()).data(); + let value_offsets = Buffer::from(&[0, 2, 5, 10].to_byte_slice()); + let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data = ArrayData::builder(list_data_type.clone()) + .len(3) + .add_buffer(value_offsets.clone()) + .add_child_data(value_data.clone()) + .build(); + let array = Arc::new(ListArray::from(list_data)) as ArrayRef; + let index = UInt32Array::from(vec![2, 0]); + let (indexed, offsets) = take_index_from_list(&array, &index); + assert_eq!(vec![0, 5, 7], offsets); + let data = UInt32Array::from(vec![ + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + Some(0), + Some(1), + ]) + .data(); + assert_eq!(data, indexed.data()); + } } From b29e160c34056eee0fd57e654837639aa8f71ffc Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Tue, 28 May 2019 20:15:09 +0000 Subject: [PATCH 4/9] add some benchmarks runtime is linear as would be expected --- rust/arrow/Cargo.toml | 4 ++ rust/arrow/benches/take_kernels.rs | 97 ++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 rust/arrow/benches/take_kernels.rs diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml index 2943069b985..0ceb135643f 100644 --- a/rust/arrow/Cargo.toml +++ b/rust/arrow/Cargo.toml @@ -77,6 +77,10 @@ harness = false name = "comparison_kernels" harness = false +[[bench]] +name = "take_kernels" +harness = false + [[bench]] name = "csv_writer" harness = false diff --git a/rust/arrow/benches/take_kernels.rs b/rust/arrow/benches/take_kernels.rs new file mode 100644 index 00000000000..ee420808348 --- /dev/null +++ b/rust/arrow/benches/take_kernels.rs @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[macro_use] +extern crate criterion; +use criterion::Criterion; +use rand::distributions::{Distribution, Standard}; +use rand::prelude::random; +use rand::Rng; + +use std::sync::Arc; + +extern crate arrow; + +use arrow::array::*; +use arrow::compute::{cast, take}; +use arrow::datatypes::*; + +// cast array from specified primitive array type to desired data type +fn create_numeric(size: usize) -> ArrayRef +where + T: ArrowNumericType, + Standard: Distribution, + PrimitiveArray: std::convert::From>, +{ + Arc::new(PrimitiveArray::::from(vec![random::(); size])) as ArrayRef +} + +fn create_random_index(size: usize) -> UInt32Array { + let mut rng = rand::thread_rng(); + let ints = Int32Array::from(vec![rng.gen_range(-24i32, size as i32); size]); + // cast to u32, conveniently marking negative values as nulls + UInt32Array::from( + cast(&(Arc::new(ints) as ArrayRef), &DataType::UInt32) + .unwrap() + .data(), + ) +} + +fn take_numeric(size: usize, index_len: usize) -> () +where + T: ArrowNumericType, + Standard: Distribution, + PrimitiveArray: std::convert::From>, + T::Native: num::NumCast, +{ + let array = create_numeric::(size); + let index = create_random_index(index_len); + criterion::black_box(take(&array, &index, None).unwrap()); +} + +fn take_boolean(size: usize, index_len: usize) -> () { + let array = Arc::new(BooleanArray::from(vec![random::(); size])) as ArrayRef; + let index = create_random_index(index_len); + criterion::black_box(take(&array, &index, None).unwrap()); +} + +fn add_benchmark(c: &mut Criterion) { + c.bench_function("take u8 256", |b| { + b.iter(|| take_numeric::(256, 256)) + }); + c.bench_function("take u8 512", |b| { + b.iter(|| take_numeric::(512, 512)) + }); + c.bench_function("take u8 1024", |b| { + b.iter(|| take_numeric::(1024, 1024)) + }); + c.bench_function("take i32 256", |b| { + b.iter(|| take_numeric::(256, 256)) + }); + c.bench_function("take i32 512", |b| { + b.iter(|| take_numeric::(512, 512)) + }); + c.bench_function("take i32 1024", |b| { + b.iter(|| take_numeric::(1024, 1024)) + }); + c.bench_function("take bool 256", |b| b.iter(|| take_boolean(256, 256))); + c.bench_function("take bool 512", |b| b.iter(|| take_boolean(512, 512))); + c.bench_function("take bool 1024", |b| b.iter(|| take_boolean(1024, 1024))); +} + +criterion_group!(benches, add_benchmark); +criterion_main!(benches); From 38c7a234af95ab2088d99348ecf85b2a598f79dc Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Tue, 28 May 2019 20:16:40 +0000 Subject: [PATCH 5/9] add take kernel to mod exports --- rust/arrow/src/compute/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/arrow/src/compute/mod.rs b/rust/arrow/src/compute/mod.rs index 7e31c52d85d..15af978af0a 100644 --- a/rust/arrow/src/compute/mod.rs +++ b/rust/arrow/src/compute/mod.rs @@ -27,4 +27,5 @@ pub use self::kernels::arithmetic::*; pub use self::kernels::boolean::*; pub use self::kernels::cast::*; pub use self::kernels::comparison::*; +pub use self::kernels::take::*; pub use self::kernels::temporal::*; From 1adfccdc739da26f0081c7f978f5f6dca5728e39 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sun, 2 Jun 2019 16:51:07 +0000 Subject: [PATCH 6/9] update tests, add bounds test --- rust/arrow/src/compute/kernels/take.rs | 229 ++++++++++++++----------- 1 file changed, 132 insertions(+), 97 deletions(-) diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs index 48b9a85740d..ad6c93cd361 100644 --- a/rust/arrow/src/compute/kernels/take.rs +++ b/rust/arrow/src/compute/kernels/take.rs @@ -47,7 +47,7 @@ pub fn take( let ix = index.value(i) as usize; if ix >= len { return Err(ArrowError::ComputeError( - format!("Array index out of bounds, cannot get item at index {} from {} length", ix, len)) + format!("Array index out of bounds, cannot get item at index {} from {} entries", ix, len)) ); } } @@ -133,10 +133,10 @@ where builder.append_null()?; } else { let ix = index.value(i) as usize; - if a.is_null(ix) { - builder.append_null()?; - } else { + if a.is_valid(ix) { builder.append_value(a.value(ix))?; + } else { + builder.append_null()?; } } } @@ -211,68 +211,111 @@ mod tests { fn take_test_numeric<'a, T>( data: Vec>, index: &UInt32Array, + options: Option<&TakeOptions>, ) -> ArrayRef where T: ArrowNumericType, PrimitiveArray: From>>, { let a = PrimitiveArray::::from(data); - take(&(Arc::new(a) as ArrayRef), index, None).unwrap() + take(&(Arc::new(a) as ArrayRef), index, options).unwrap() + } + + // create a simple struct for testing purposes + fn create_test_struct() -> ArrayRef { + let boolean_data = BooleanArray::from(vec![true, false, false, true]).data(); + let int_data = Int32Array::from(vec![42, 28, 19, 31]).data(); + let mut field_types = vec![]; + field_types.push(Field::new("a", DataType::Boolean, true)); + field_types.push(Field::new("b", DataType::Int32, true)); + let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) + .len(4) + .null_count(0) + .add_child_data(boolean_data.clone()) + .add_child_data(int_data.clone()) + .build(); + let struct_array = StructArray::from(struct_array_data); + Arc::new(struct_array) as ArrayRef } #[test] fn test_take_primitive() { - let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(3)]); + let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(2)]); // uint8 let a = take_test_numeric::( vec![Some(0), None, Some(2), Some(3), None], &index, + None, ); assert_eq!(index.len(), a.len()); - let b = UInt8Array::from(vec![Some(3), None, None, Some(3), Some(3)]); let a = a.as_any().downcast_ref::().unwrap(); - assert_eq!(b.data(), a.data()); + assert_eq!(2, a.null_count()); + assert_eq!(3, a.value(0)); + assert_eq!(true, a.is_null(1)); + assert_eq!(true, a.is_null(2)); + assert_eq!(3, a.value(3)); + assert_eq!(2, a.value(4)); // uint16 let a = take_test_numeric::( vec![Some(0), None, Some(2), Some(3), None], &index, + None, ); assert_eq!(index.len(), a.len()); - let b = UInt16Array::from(vec![Some(3), None, None, Some(3), Some(3)]); let a = a.as_any().downcast_ref::().unwrap(); - assert_eq!(b.data(), a.data()); + assert_eq!(2, a.null_count()); + assert_eq!(3, a.value(0)); + assert_eq!(true, a.is_null(1)); + assert_eq!(true, a.is_null(2)); + assert_eq!(3, a.value(3)); + assert_eq!(2, a.value(4)); // uint32 let a = take_test_numeric::( vec![Some(0), None, Some(2), Some(3), None], &index, + None, ); assert_eq!(index.len(), a.len()); - let b = UInt32Array::from(vec![Some(3), None, None, Some(3), Some(3)]); let a = a.as_any().downcast_ref::().unwrap(); - assert_eq!(b.data(), a.data()); - - // uint64 - let a = take_test_numeric::( - vec![Some(0), None, Some(2), Some(3), None], + assert_eq!(2, a.null_count()); + assert_eq!(3, a.value(0)); + assert_eq!(true, a.is_null(1)); + assert_eq!(true, a.is_null(2)); + assert_eq!(3, a.value(3)); + assert_eq!(2, a.value(4)); + + // int64 + let a = take_test_numeric::( + vec![Some(0), None, Some(2), Some(-15), None], &index, + None, ); assert_eq!(index.len(), a.len()); - let b = UInt64Array::from(vec![Some(3), None, None, Some(3), Some(3)]); - let a = a.as_any().downcast_ref::().unwrap(); - assert_eq!(b.data(), a.data()); - - // int8 - let a = take_test_numeric::( - vec![Some(0), None, Some(2), Some(-15), None], + let a = a.as_any().downcast_ref::().unwrap(); + assert_eq!(2, a.null_count()); + assert_eq!(-15, a.value(0)); + assert_eq!(true, a.is_null(1)); + assert_eq!(true, a.is_null(2)); + assert_eq!(-15, a.value(3)); + assert_eq!(2, a.value(4)); + + // float32 + let a = take_test_numeric::( + vec![Some(0.0), None, Some(2.21), Some(-3.1), None], &index, + None, ); assert_eq!(index.len(), a.len()); - let b = Int8Array::from(vec![Some(-15), None, None, Some(-15), Some(-15)]); - let a = a.as_any().downcast_ref::().unwrap(); - assert_eq!(b.data(), a.data()); + let a = a.as_any().downcast_ref::().unwrap(); + assert_eq!(2, a.null_count()); + assert_eq!(-3.1, a.value(0)); + assert_eq!(true, a.is_null(1)); + assert_eq!(true, a.is_null(2)); + assert_eq!(-3.1, a.value(3)); + assert_eq!(2.21, a.value(4)); } #[test] @@ -344,34 +387,22 @@ mod tests { let b = a.values(); let b = Int32Array::from(b.data()); - let taken_values = Int32Array::from(vec![ - Some(2), - Some(3), - None, - Some(-1), - Some(-2), - Some(-1), - Some(2), - Some(3), - Some(0), - Some(0), - Some(0), - ]); let taken_offsets = Buffer::from(&[0, 2, 2, 5, 7, 10].to_byte_slice()); - let taken_list_data = ArrayData::builder(list_data_type.clone()) - .len(5) - .null_count(1) - .add_buffer(taken_offsets.clone()) - .null_bit_buffer(Buffer::from([0b11111011, 0b00000111])) - .add_child_data(taken_values.data().clone()) - .build(); - // taken values should match b - assert_eq!(format!("{:?}", b), format!("{:?}", taken_values)); - assert_eq!(b.data(), taken_values.data()); + assert_eq!(1, b.null_count()); + assert_eq!(11, b.len()); + assert_eq!(2, b.value(0)); + assert_eq!(3, b.value(1)); + assert_eq!(true, b.is_null(2)); + assert_eq!(-1, b.value(3)); + assert_eq!(-2, b.value(4)); + assert_eq!(-1, b.value(5)); + assert_eq!(2, b.value(6)); + assert_eq!(3, b.value(7)); + assert_eq!(0, b.value(8)); + assert_eq!(0, b.value(9)); + assert_eq!(0, b.value(9)); // list offsets should be the same assert_eq!(a.data_ref().buffers(), &[taken_offsets]); - // list data should be equal - assert_eq!(taken_list_data, a.data()); } #[test] @@ -411,55 +442,26 @@ mod tests { let b = a.values(); let b = Int32Array::from(b.data()); - let taken_values = Int32Array::from(vec![ - None, - None, - Some(-1), - Some(-2), - Some(3), - Some(5), - None, - Some(0), - None, - Some(0), - ]); let taken_offsets = Buffer::from(&[0, 1, 1, 4, 6, 9].to_byte_slice()); - let taken_list_data = ArrayData::builder(list_data_type.clone()) - .len(5) - .null_count(2) - .add_buffer(taken_offsets.clone()) - .null_bit_buffer(Buffer::from([0b00111101, 0b00000001])) - .add_child_data(taken_values.data().clone()) - .build(); - // taken values should match b - assert_eq!(format!("{:?}", b), format!("{:?}", taken_values)); - assert_eq!(b.data(), taken_values.data()); + assert_eq!(4, b.null_count()); + assert_eq!(10, b.len()); + assert_eq!(true, b.is_null(0)); + assert_eq!(true, b.is_null(1)); + assert_eq!(-1, b.value(2)); + assert_eq!(-2, b.value(3)); + assert_eq!(3, b.value(4)); + assert_eq!(5, b.value(5)); + assert_eq!(true, b.is_null(6)); + assert_eq!(0, b.value(7)); + assert_eq!(true, b.is_null(8)); + assert_eq!(0, b.value(9)); // list offsets should be the same assert_eq!(a.data_ref().buffers(), &[taken_offsets]); - // list data should be equal - assert_eq!(taken_list_data, a.data()); } #[test] fn take_struct() { - let boolean_data = ArrayData::builder(DataType::Boolean) - .len(4) - .add_buffer(Buffer::from([true, false, true, false].to_byte_slice())) - .build(); - let int_data = ArrayData::builder(DataType::Int32) - .len(4) - .add_buffer(Buffer::from([42, 28, 19, 31].to_byte_slice())) - .build(); - let mut field_types = vec![]; - field_types.push(Field::new("a", DataType::Boolean, true)); - field_types.push(Field::new("b", DataType::Int32, true)); - let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) - .len(4) - .add_child_data(boolean_data.clone()) - .add_child_data(int_data.clone()) - .build(); - let struct_array = StructArray::from(struct_array_data); - let array = Arc::new(struct_array) as ArrayRef; + let array = create_test_struct(); let index = UInt32Array::from(vec![0, 3, 1, 0, 2]); let a = take(&array, &index, None).unwrap(); @@ -467,7 +469,7 @@ mod tests { assert_eq!(index.len(), a.len()); assert_eq!(0, a.null_count()); - let b = BooleanArray::from(vec![true, false, false, true, false]); + let b = BooleanArray::from(vec![true, true, false, true, false]); let c = Int32Array::from(vec![42, 31, 28, 42, 19]); let bools = a.column(0); let bools = bools.as_any().downcast_ref::().unwrap(); @@ -476,9 +478,42 @@ mod tests { assert_eq!(format!("{:?}", bools), format!("{:?}", b)); assert_eq!(format!("{:?}", ints), format!("{:?}", c)); assert_eq!(b.data(), bools.data()); - assert_eq!(c.data(), a.column(1).data()); + assert_eq!(c.data(), ints.data()); } - // #[test] - // fn take_out_of_bounds() {} + #[test] + fn take_struct_with_nulls() { + let array = create_test_struct(); + + let index = UInt32Array::from(vec![None, Some(3), Some(1), None, Some(0)]); + let a = take(&array, &index, None).unwrap(); + let a: &StructArray = a.as_any().downcast_ref::().unwrap(); + assert_eq!(index.len(), a.len()); + assert_eq!(0, a.null_count()); + + let b = BooleanArray::from(vec![None, Some(true), Some(false), None, Some(true)]); + let c = Int32Array::from(vec![None, Some(31), Some(28), None, Some(42)]); + let bools = a.column(0); + let bools = bools.as_any().downcast_ref::().unwrap(); + let ints = a.column(1); + let ints = ints.as_any().downcast_ref::().unwrap(); + assert_eq!(format!("{:?}", bools), format!("{:?}", b)); + assert_eq!(format!("{:?}", ints), format!("{:?}", c)); + } + + #[test] + #[should_panic( + expected = "Array index out of bounds, cannot get item at index 6 from 5 entries" + )] + fn take_out_of_bounds() { + let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(6)]); + let take_opt = TakeOptions { check_bounds: true }; + + // int64 + take_test_numeric::( + vec![Some(0), None, Some(2), Some(3), None], + &index, + Some(&take_opt), + ); + } } From 0fc3f73d14a60647ca02c5d816ae9e2354dc4788 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sun, 23 Jun 2019 02:27:04 +0000 Subject: [PATCH 7/9] address review comments --- rust/arrow/src/compute/kernels/take.rs | 213 ++++++++++++++++--------- rust/arrow/src/compute/util.rs | 27 ++-- 2 files changed, 153 insertions(+), 87 deletions(-) diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs index ad6c93cd361..c58f34ef592 100644 --- a/rust/arrow/src/compute/kernels/take.rs +++ b/rust/arrow/src/compute/kernels/take.rs @@ -23,7 +23,7 @@ use crate::array::*; use crate::array_data::ArrayData; use crate::buffer::Buffer; use crate::builder::*; -use crate::compute::util::take_index_from_list; +use crate::compute::util::take_value_indices_from_list; use crate::datatypes::*; use crate::error::{ArrowError, Result}; @@ -33,18 +33,18 @@ use crate::error::{ArrowError, Result}; /// * null indices, returning a null value for the index /// * checking for overflowing indices pub fn take( - array: &ArrayRef, - index: &UInt32Array, - options: Option<&TakeOptions>, + values: &ArrayRef, + indices: &UInt32Array, + options: Option, ) -> Result { use TimeUnit::*; - let options = options.map(|opt| opt.clone()).unwrap_or(Default::default()); + let options = options.clone().unwrap_or(Default::default()); if options.check_bounds { - let len = array.len(); - for i in 0..index.len() { - if index.is_valid(i) { - let ix = index.value(i) as usize; + let len = values.len(); + for i in 0..indices.len() { + if indices.is_valid(i) { + let ix = indices.value(i) as usize; if ix >= len { return Err(ArrowError::ComputeError( format!("Array index out of bounds, cannot get item at index {} from {} entries", ix, len)) @@ -53,56 +53,58 @@ pub fn take( } } } - match array.data_type() { - DataType::Boolean => take_bool(array, index), - DataType::Int8 => take_numeric::(array, index), - DataType::Int16 => take_numeric::(array, index), - DataType::Int32 => take_numeric::(array, index), - DataType::Int64 => take_numeric::(array, index), - DataType::UInt8 => take_numeric::(array, index), - DataType::UInt16 => take_numeric::(array, index), - DataType::UInt32 => take_numeric::(array, index), - DataType::UInt64 => take_numeric::(array, index), - DataType::Float32 => take_numeric::(array, index), - DataType::Float64 => take_numeric::(array, index), - DataType::Date32(_) => take_numeric::(array, index), - DataType::Date64(_) => take_numeric::(array, index), - DataType::Time32(Second) => take_numeric::(array, index), + match values.data_type() { + DataType::Boolean => take_bool(values, indices), + DataType::Int8 => take_numeric::(values, indices), + DataType::Int16 => take_numeric::(values, indices), + DataType::Int32 => take_numeric::(values, indices), + DataType::Int64 => take_numeric::(values, indices), + DataType::UInt8 => take_numeric::(values, indices), + DataType::UInt16 => take_numeric::(values, indices), + DataType::UInt32 => take_numeric::(values, indices), + DataType::UInt64 => take_numeric::(values, indices), + DataType::Float32 => take_numeric::(values, indices), + DataType::Float64 => take_numeric::(values, indices), + DataType::Date32(_) => take_numeric::(values, indices), + DataType::Date64(_) => take_numeric::(values, indices), + DataType::Time32(Second) => take_numeric::(values, indices), DataType::Time32(Millisecond) => { - take_numeric::(array, index) + take_numeric::(values, indices) } DataType::Time64(Microsecond) => { - take_numeric::(array, index) + take_numeric::(values, indices) } DataType::Time64(Nanosecond) => { - take_numeric::(array, index) + take_numeric::(values, indices) + } + DataType::Timestamp(Second) => { + take_numeric::(values, indices) } - DataType::Timestamp(Second) => take_numeric::(array, index), DataType::Timestamp(Millisecond) => { - take_numeric::(array, index) + take_numeric::(values, indices) } DataType::Timestamp(Microsecond) => { - take_numeric::(array, index) + take_numeric::(values, indices) } DataType::Timestamp(Nanosecond) => { - take_numeric::(array, index) + take_numeric::(values, indices) } - DataType::Utf8 => take_binary(array, index), - DataType::List(_) => take_list(array, index), + DataType::Utf8 => take_binary(values, indices), + DataType::List(_) => take_list(values, indices), DataType::Struct(fields) => { let struct_: &StructArray = - array.as_any().downcast_ref::().unwrap(); + values.as_any().downcast_ref::().unwrap(); let arrays: Result> = struct_ .columns() .iter() - .map(|a| take(a, index, Some(&options))) + .map(|a| take(a, indices, Some(options.clone()))) .collect(); let arrays = arrays?; let pairs: Vec<(Field, ArrayRef)> = fields.clone().into_iter().zip(arrays).collect(); Ok(Arc::new(StructArray::from(pairs)) as ArrayRef) } - t @ _ => unimplemented!("Sort not supported for data type {:?}", t), + t @ _ => unimplemented!("Take not supported for data type {:?}", t), } } @@ -122,17 +124,18 @@ impl Default for TakeOptions { } /// `take` implementation for numeric arrays -fn take_numeric(array: &ArrayRef, index: &UInt32Array) -> Result +fn take_numeric(values: &ArrayRef, indices: &UInt32Array) -> Result where T: ArrowNumericType, { - let mut builder = PrimitiveBuilder::::new(index.len()); - let a = array.as_any().downcast_ref::>().unwrap(); - for i in 0..index.len() { - if index.is_null(i) { + let mut builder = PrimitiveBuilder::::new(indices.len()); + let a = values.as_any().downcast_ref::>().unwrap(); + dbg!(&a); + for i in 0..indices.len() { + if indices.is_null(i) { builder.append_null()?; } else { - let ix = index.value(i) as usize; + let ix = indices.value(i) as usize; if a.is_valid(ix) { builder.append_value(a.value(ix))?; } else { @@ -144,14 +147,14 @@ where } /// `take` implementation for binary arrays -fn take_binary(array: &ArrayRef, index: &UInt32Array) -> Result { - let mut builder = BinaryBuilder::new(index.len()); - let a = array.as_any().downcast_ref::().unwrap(); - for i in 0..index.len() { - if index.is_null(i) { +fn take_binary(values: &ArrayRef, indices: &UInt32Array) -> Result { + let mut builder = BinaryBuilder::new(indices.len()); + let a = values.as_any().downcast_ref::().unwrap(); + for i in 0..indices.len() { + if indices.is_null(i) { builder.append(false)?; } else { - let ix = index.value(i) as usize; + let ix = indices.value(i) as usize; if a.is_null(ix) { builder.append(false)?; } else { @@ -163,14 +166,14 @@ fn take_binary(array: &ArrayRef, index: &UInt32Array) -> Result { } /// `take` implementation for boolean arrays -fn take_bool(array: &ArrayRef, index: &UInt32Array) -> Result { - let mut builder = BooleanBuilder::new(index.len()); - let a = array.as_any().downcast_ref::().unwrap(); - for i in 0..index.len() { - if index.is_null(i) { +fn take_bool(values: &ArrayRef, indices: &UInt32Array) -> Result { + let mut builder = BooleanBuilder::new(indices.len()); + let a = values.as_any().downcast_ref::().unwrap(); + for i in 0..indices.len() { + if indices.is_null(i) { builder.append_null()?; } else { - let ix = index.value(i) as usize; + let ix = indices.value(i) as usize; if a.is_null(ix) { builder.append_null()?; } else { @@ -186,15 +189,17 @@ fn take_bool(array: &ArrayRef, index: &UInt32Array) -> Result { /// Calculates the index and indexed offset for the inner array, /// applying `take` on the inner array, then reconstructing a list array /// with the indexed offsets -fn take_list(array: &ArrayRef, index: &UInt32Array) -> Result { - let list: &ListArray = array.as_any().downcast_ref::().unwrap(); - let (indices, offsets) = take_index_from_list(array, index); - let taken = take(&list.values(), &indices, None)?; +fn take_list(values: &ArrayRef, indices: &UInt32Array) -> Result { + let list: &ListArray = values.as_any().downcast_ref::().unwrap(); + let (list_indices, offsets) = take_value_indices_from_list(values, indices); + dbg!((&list_indices, &offsets)); + let taken = take(&list.values(), &list_indices, None)?; let value_offsets = Buffer::from(offsets[..].to_byte_slice()); + // create a new list with let list_data = ArrayData::new( list.data_type().clone(), - index.len(), - Some(index.null_count()), + indices.len(), + Some(indices.null_count()), taken.data().null_bitmap().clone().map(|bitmap| bitmap.bits), 0, vec![value_offsets], @@ -208,10 +213,10 @@ fn take_list(array: &ArrayRef, index: &UInt32Array) -> Result { mod tests { use super::*; - fn take_test_numeric<'a, T>( + fn test_take_numeric<'a, T>( data: Vec>, index: &UInt32Array, - options: Option<&TakeOptions>, + options: Option, ) -> ArrayRef where T: ArrowNumericType, @@ -243,7 +248,7 @@ mod tests { let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(2)]); // uint8 - let a = take_test_numeric::( + let a = test_take_numeric::( vec![Some(0), None, Some(2), Some(3), None], &index, None, @@ -258,7 +263,7 @@ mod tests { assert_eq!(2, a.value(4)); // uint16 - let a = take_test_numeric::( + let a = test_take_numeric::( vec![Some(0), None, Some(2), Some(3), None], &index, None, @@ -273,7 +278,7 @@ mod tests { assert_eq!(2, a.value(4)); // uint32 - let a = take_test_numeric::( + let a = test_take_numeric::( vec![Some(0), None, Some(2), Some(3), None], &index, None, @@ -288,7 +293,7 @@ mod tests { assert_eq!(2, a.value(4)); // int64 - let a = take_test_numeric::( + let a = test_take_numeric::( vec![Some(0), None, Some(2), Some(-15), None], &index, None, @@ -303,7 +308,7 @@ mod tests { assert_eq!(2, a.value(4)); // float32 - let a = take_test_numeric::( + let a = test_take_numeric::( vec![Some(0.0), None, Some(2.21), Some(-3.1), None], &index, None, @@ -406,8 +411,8 @@ mod tests { } #[test] - fn test_take_list_with_nulls() { - // Construct a value array, [[0,null,0], [-1,-2,3], null, [2,null]] + fn test_take_list_with_value_nulls() { + // Construct a value array, [[0,null,0], [-1,-2,3], [null], [2,null]] let value_data = Int32Array::from(vec![ Some(0), None, @@ -427,18 +432,19 @@ mod tests { let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets.clone()) - .null_count(1) + .null_count(0) .null_bit_buffer(Buffer::from([0b10111101, 0b00000000])) .add_child_data(value_data.clone()) .build(); let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; - // index returns: [null, null, [-1,-2,-1], [2,null], [0,null,0]] + // index returns: [[null], null, [-1,-2,-1], [2,null], [0,null,0]] let index = UInt32Array::from(vec![Some(2), None, Some(1), Some(3), Some(0)]); let a = take(&list_array, &index, None).unwrap(); let a: &ListArray = a.as_any().downcast_ref::().unwrap(); assert_eq!(5, a.len()); + assert_eq!(1, a.null_count()); let b = a.values(); let b = Int32Array::from(b.data()); @@ -460,7 +466,62 @@ mod tests { } #[test] - fn take_struct() { + fn test_take_list_with_list_nulls() { + // Construct a value array, [[0,null,0], [-1,-2,3], null, [2,null]] + let value_data = Int32Array::from(vec![ + Some(0), + None, + Some(0), + Some(-1), + Some(-2), + Some(3), + Some(5), + None, + ]) + .data(); + // Construct offsets + let value_offsets = Buffer::from(&[0, 3, 6, 6, 8].to_byte_slice()); + // Construct a list array from the above two + let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data = ArrayData::builder(list_data_type.clone()) + .len(4) + .add_buffer(value_offsets.clone()) + .null_count(1) + .null_bit_buffer(Buffer::from([0b01111101])) + .add_child_data(value_data.clone()) + .build(); + let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; + + // index returns: [null, null, [-1,-2,-1], [2,null], [0,null,0]] + let index = UInt32Array::from(vec![Some(2), None, Some(1), Some(3), Some(0)]); + + let a = take(&list_array, &index, None).unwrap(); + let a: &ListArray = a.as_any().downcast_ref::().unwrap(); + assert_eq!(5, a.len()); + assert_eq!(1, a.null_count()); + let b = a.values(); + let b = Int32Array::from(b.data()); + + dbg!(&b); + + let taken_offsets = Buffer::from(&[0, 0, 0, 3, 5, 8].to_byte_slice()); + // list offsets should be the same + assert_eq!(a.data_ref().buffers(), &[taken_offsets]); + assert_eq!(3, b.null_count()); + assert_eq!(9, b.len()); + assert_eq!(true, b.is_null(0)); + assert_eq!(-1, b.value(1)); + assert_eq!(-2, b.value(2)); + assert_eq!(3, b.value(3)); + assert_eq!(5, b.value(4)); + assert_eq!(true, b.is_null(5)); + assert_eq!(0, b.value(6)); + assert_eq!(true, b.is_null(7)); + assert_eq!(0, b.value(8)); + } + + #[test] + fn test_take_struct() { let array = create_test_struct(); let index = UInt32Array::from(vec![0, 3, 1, 0, 2]); @@ -482,7 +543,7 @@ mod tests { } #[test] - fn take_struct_with_nulls() { + fn test_take_struct_with_nulls() { let array = create_test_struct(); let index = UInt32Array::from(vec![None, Some(3), Some(1), None, Some(0)]); @@ -505,15 +566,15 @@ mod tests { #[should_panic( expected = "Array index out of bounds, cannot get item at index 6 from 5 entries" )] - fn take_out_of_bounds() { + fn test_take_out_of_bounds() { let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(6)]); let take_opt = TakeOptions { check_bounds: true }; // int64 - take_test_numeric::( + test_take_numeric::( vec![Some(0), None, Some(2), Some(3), None], &index, - Some(&take_opt), + Some(take_opt), ); } } diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs index 7a5cbfa2a00..2c54b0447dd 100644 --- a/rust/arrow/src/compute/util.rs +++ b/rust/arrow/src/compute/util.rs @@ -47,27 +47,27 @@ where /// Takes/filters a list array's inner data using the offsets of the list array. /// -/// Where a list array has index `[0,2,5,10]`, taking an index of `[2,0]` returns +/// Where a list array has indices `[0,2,5,10]`, taking indices of `[2,0]` returns /// an array of the indices `[5..10, 0..2]` and offsets `[0,5,7]` (5 elements and 2 /// elements) -pub(crate) fn take_index_from_list( - array: &ArrayRef, - index: &UInt32Array, +pub(super) fn take_value_indices_from_list( + values: &ArrayRef, + indices: &UInt32Array, ) -> (UInt32Array, Vec) { // TODO benchmark this function, there might be a faster unsafe alternative // get list array's offsets - let list: &ListArray = array.as_any().downcast_ref::().unwrap(); + let list: &ListArray = values.as_any().downcast_ref::().unwrap(); let offsets: Vec = (0..=list.len()) .map(|i| list.value_offset(i) as u32) .collect(); - let mut new_offsets = Vec::with_capacity(index.len()); + let mut new_offsets = Vec::with_capacity(indices.len()); let mut current_offset = 0; // add first offset new_offsets.push(0); - let values: Vec> = (0..index.len()) + let values: Vec> = (0..indices.len()) .flat_map(|i: usize| { - if index.is_valid(i) { - let ix = index.value(i) as usize; + if indices.is_valid(i) { + let ix = indices.value(i) as usize; let start = offsets[ix]; let end = offsets[ix + 1]; current_offset += (end - start) as i32; @@ -79,7 +79,12 @@ pub(crate) fn take_index_from_list( vec![None] } }) + .map(|x| { + dbg!(&x); + x + }) .collect(); + dbg!(&values); (UInt32Array::from(values), new_offsets) } @@ -125,7 +130,7 @@ mod tests { } #[test] - fn test_take_index_from_list() { + fn test_take_value_index_from_list() { let value_data = Int32Array::from((0..10).collect::>()).data(); let value_offsets = Buffer::from(&[0, 2, 5, 10].to_byte_slice()); let list_data_type = DataType::List(Box::new(DataType::Int32)); @@ -136,7 +141,7 @@ mod tests { .build(); let array = Arc::new(ListArray::from(list_data)) as ArrayRef; let index = UInt32Array::from(vec![2, 0]); - let (indexed, offsets) = take_index_from_list(&array, &index); + let (indexed, offsets) = take_value_indices_from_list(&array, &index); assert_eq!(vec![0, 5, 7], offsets); let data = UInt32Array::from(vec![ Some(5), From 2467241f027f4c14d3a19f31309462d152652b5f Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sat, 6 Jul 2019 14:31:04 +0000 Subject: [PATCH 8/9] address review feedback --- rust/arrow/src/array/array.rs | 2 +- rust/arrow/src/array/builder.rs | 24 +- rust/arrow/src/compute/kernels/take.rs | 440 +++++++++++++------------ rust/arrow/src/compute/util.rs | 42 +-- 4 files changed, 256 insertions(+), 252 deletions(-) diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs index 410c12608ea..e4e55d06650 100644 --- a/rust/arrow/src/array/array.rs +++ b/rust/arrow/src/array/array.rs @@ -994,7 +994,7 @@ impl StructArray { pub fn num_columns(&self) -> usize { self.boxed_fields.len() } - + /// Returns the fields of the struct array pub fn columns(&self) -> Vec<&ArrayRef> { self.boxed_fields.iter().collect() diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index f032cd24a54..da0357b4924 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -467,7 +467,7 @@ impl BinaryBuilder { /// /// Note, when appending individual byte values you must call `append` to delimit each /// distinct list value. - pub fn append_value(&mut self, value: u8) -> Result<()> { + pub fn append_byte(&mut self, value: u8) -> Result<()> { self.builder.values().append_value(value)?; Ok(()) } @@ -476,7 +476,7 @@ impl BinaryBuilder { /// /// Automatically calls the `append` method to delimit the slice appended in as a /// distinct array element. - pub fn append_values(&mut self, value: &[u8]) -> Result<()> { + pub fn append_value(&mut self, value: &[u8]) -> Result<()> { self.builder.values().append_slice(value)?; self.builder.append(true)?; Ok(()) @@ -1166,18 +1166,18 @@ mod tests { fn test_binary_array_builder() { let mut builder = BinaryBuilder::new(20); - builder.append_value(b'h').unwrap(); - builder.append_value(b'e').unwrap(); - builder.append_value(b'l').unwrap(); - builder.append_value(b'l').unwrap(); - builder.append_value(b'o').unwrap(); + builder.append_byte(b'h').unwrap(); + builder.append_byte(b'e').unwrap(); + builder.append_byte(b'l').unwrap(); + builder.append_byte(b'l').unwrap(); + builder.append_byte(b'o').unwrap(); builder.append(true).unwrap(); builder.append(true).unwrap(); - builder.append_value(b'w').unwrap(); - builder.append_value(b'o').unwrap(); - builder.append_value(b'r').unwrap(); - builder.append_value(b'l').unwrap(); - builder.append_value(b'd').unwrap(); + builder.append_byte(b'w').unwrap(); + builder.append_byte(b'o').unwrap(); + builder.append_byte(b'r').unwrap(); + builder.append_byte(b'l').unwrap(); + builder.append_byte(b'd').unwrap(); builder.append(true).unwrap(); let array = builder.finish(); diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs index c58f34ef592..21b02f8d200 100644 --- a/rust/arrow/src/compute/kernels/take.rs +++ b/rust/arrow/src/compute/kernels/take.rs @@ -20,12 +20,13 @@ use std::sync::Arc; use crate::array::*; -use crate::array_data::ArrayData; -use crate::buffer::Buffer; -use crate::builder::*; +use crate::buffer::{Buffer, MutableBuffer}; use crate::compute::util::take_value_indices_from_list; use crate::datatypes::*; use crate::error::{ArrowError, Result}; +use crate::util::bit_util; + +use TimeUnit::*; /// Take elements from `ArrayRef` by supplying an array of indices. /// @@ -37,8 +38,6 @@ pub fn take( indices: &UInt32Array, options: Option, ) -> Result { - use TimeUnit::*; - let options = options.clone().unwrap_or(Default::default()); if options.check_bounds { let len = values.len(); @@ -54,40 +53,40 @@ pub fn take( } } match values.data_type() { - DataType::Boolean => take_bool(values, indices), - DataType::Int8 => take_numeric::(values, indices), - DataType::Int16 => take_numeric::(values, indices), - DataType::Int32 => take_numeric::(values, indices), - DataType::Int64 => take_numeric::(values, indices), - DataType::UInt8 => take_numeric::(values, indices), - DataType::UInt16 => take_numeric::(values, indices), - DataType::UInt32 => take_numeric::(values, indices), - DataType::UInt64 => take_numeric::(values, indices), - DataType::Float32 => take_numeric::(values, indices), - DataType::Float64 => take_numeric::(values, indices), - DataType::Date32(_) => take_numeric::(values, indices), - DataType::Date64(_) => take_numeric::(values, indices), - DataType::Time32(Second) => take_numeric::(values, indices), + DataType::Boolean => take_primitive::(values, indices), + DataType::Int8 => take_primitive::(values, indices), + DataType::Int16 => take_primitive::(values, indices), + DataType::Int32 => take_primitive::(values, indices), + DataType::Int64 => take_primitive::(values, indices), + DataType::UInt8 => take_primitive::(values, indices), + DataType::UInt16 => take_primitive::(values, indices), + DataType::UInt32 => take_primitive::(values, indices), + DataType::UInt64 => take_primitive::(values, indices), + DataType::Float32 => take_primitive::(values, indices), + DataType::Float64 => take_primitive::(values, indices), + DataType::Date32(_) => take_primitive::(values, indices), + DataType::Date64(_) => take_primitive::(values, indices), + DataType::Time32(Second) => take_primitive::(values, indices), DataType::Time32(Millisecond) => { - take_numeric::(values, indices) + take_primitive::(values, indices) } DataType::Time64(Microsecond) => { - take_numeric::(values, indices) + take_primitive::(values, indices) } DataType::Time64(Nanosecond) => { - take_numeric::(values, indices) + take_primitive::(values, indices) } DataType::Timestamp(Second) => { - take_numeric::(values, indices) + take_primitive::(values, indices) } DataType::Timestamp(Millisecond) => { - take_numeric::(values, indices) + take_primitive::(values, indices) } DataType::Timestamp(Microsecond) => { - take_numeric::(values, indices) + take_primitive::(values, indices) } DataType::Timestamp(Nanosecond) => { - take_numeric::(values, indices) + take_primitive::(values, indices) } DataType::Utf8 => take_binary(values, indices), DataType::List(_) => take_list(values, indices), @@ -111,7 +110,9 @@ pub fn take( /// Options that define how `take` should behave #[derive(Clone)] pub struct TakeOptions { - /// perform bounds check before taking + /// Perform bounds check before taking indices from values. + /// If enabled, an `ArrowError` is returned if the indices are out of bounds. + /// If not enabled, and indices exceed bounds, the kernel will panic. pub check_bounds: bool, } @@ -123,14 +124,13 @@ impl Default for TakeOptions { } } -/// `take` implementation for numeric arrays -fn take_numeric(values: &ArrayRef, indices: &UInt32Array) -> Result +/// `take` implementation for primitive arrays +fn take_primitive(values: &ArrayRef, indices: &UInt32Array) -> Result where - T: ArrowNumericType, + T: ArrowPrimitiveType, { let mut builder = PrimitiveBuilder::::new(indices.len()); let a = values.as_any().downcast_ref::>().unwrap(); - dbg!(&a); for i in 0..indices.len() { if indices.is_null(i) { builder.append_null()?; @@ -157,25 +157,6 @@ fn take_binary(values: &ArrayRef, indices: &UInt32Array) -> Result { let ix = indices.value(i) as usize; if a.is_null(ix) { builder.append(false)?; - } else { - builder.append_values(a.value(ix))?; - } - } - } - Ok(Arc::new(builder.finish()) as ArrayRef) -} - -/// `take` implementation for boolean arrays -fn take_bool(values: &ArrayRef, indices: &UInt32Array) -> Result { - let mut builder = BooleanBuilder::new(indices.len()); - let a = values.as_any().downcast_ref::().unwrap(); - for i in 0..indices.len() { - if indices.is_null(i) { - builder.append_null()?; - } else { - let ix = indices.value(i) as usize; - if a.is_null(ix) { - builder.append_null()?; } else { builder.append_value(a.value(ix))?; } @@ -190,22 +171,39 @@ fn take_bool(values: &ArrayRef, indices: &UInt32Array) -> Result { /// applying `take` on the inner array, then reconstructing a list array /// with the indexed offsets fn take_list(values: &ArrayRef, indices: &UInt32Array) -> Result { + // TODO Some optimizations can be done here such as if it is taking the whole list or a contiguous sublist let list: &ListArray = values.as_any().downcast_ref::().unwrap(); let (list_indices, offsets) = take_value_indices_from_list(values, indices); - dbg!((&list_indices, &offsets)); let taken = take(&list.values(), &list_indices, None)?; + // determine null count and null buffer, because they are now a function of `values` and `indices` + let mut null_count = 0; + let num_bytes = bit_util::ceil(indices.len(), 8); + let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); + { + let null_slice = null_buf.data_mut(); + &offsets[..] + .windows(2) + .enumerate() + .for_each(|(i, window): (usize, &[i32])| { + if window[0] != window[1] { + // offsets are unequal, slot is not null + bit_util::set_bit(null_slice, i); + } else { + null_count += 1; + } + }); + } let value_offsets = Buffer::from(offsets[..].to_byte_slice()); // create a new list with - let list_data = ArrayData::new( - list.data_type().clone(), - indices.len(), - Some(indices.null_count()), - taken.data().null_bitmap().clone().map(|bitmap| bitmap.bits), - 0, - vec![value_offsets], - vec![taken.data()], - ); - let list_array = Arc::new(ListArray::from(Arc::new(list_data))) as ArrayRef; + let list_data = ArrayDataBuilder::new(list.data_type().clone()) + .len(indices.len()) + .null_count(null_count) + .null_bit_buffer(null_buf.freeze()) + .offset(0) + .add_child_data(taken.data()) + .add_buffer(value_offsets) + .build(); + let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; Ok(list_array) } @@ -213,17 +211,20 @@ fn take_list(values: &ArrayRef, indices: &UInt32Array) -> Result { mod tests { use super::*; - fn test_take_numeric<'a, T>( + fn test_take_primitive_arrays<'a, T>( data: Vec>, index: &UInt32Array, options: Option, - ) -> ArrayRef - where - T: ArrowNumericType, - PrimitiveArray: From>>, + expected_data: Vec>, + ) where + T: ArrowPrimitiveType, + PrimitiveArray: From>> + ArrayEqual, { - let a = PrimitiveArray::::from(data); - take(&(Arc::new(a) as ArrayRef), index, options).unwrap() + let output = PrimitiveArray::::from(data); + let expected = PrimitiveArray::::from(expected_data); + let output = take(&(Arc::new(output) as ArrayRef), index, options).unwrap(); + let output = output.as_any().downcast_ref::>().unwrap(); + assert!(output.equals(&expected)) } // create a simple struct for testing purposes @@ -248,103 +249,61 @@ mod tests { let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(2)]); // uint8 - let a = test_take_numeric::( + test_take_primitive_arrays::( vec![Some(0), None, Some(2), Some(3), None], &index, None, + vec![Some(3), None, None, Some(3), Some(2)], ); - assert_eq!(index.len(), a.len()); - let a = a.as_any().downcast_ref::().unwrap(); - assert_eq!(2, a.null_count()); - assert_eq!(3, a.value(0)); - assert_eq!(true, a.is_null(1)); - assert_eq!(true, a.is_null(2)); - assert_eq!(3, a.value(3)); - assert_eq!(2, a.value(4)); // uint16 - let a = test_take_numeric::( + test_take_primitive_arrays::( vec![Some(0), None, Some(2), Some(3), None], &index, None, + vec![Some(3), None, None, Some(3), Some(2)], ); - assert_eq!(index.len(), a.len()); - let a = a.as_any().downcast_ref::().unwrap(); - assert_eq!(2, a.null_count()); - assert_eq!(3, a.value(0)); - assert_eq!(true, a.is_null(1)); - assert_eq!(true, a.is_null(2)); - assert_eq!(3, a.value(3)); - assert_eq!(2, a.value(4)); // uint32 - let a = test_take_numeric::( + test_take_primitive_arrays::( vec![Some(0), None, Some(2), Some(3), None], &index, None, + vec![Some(3), None, None, Some(3), Some(2)], ); - assert_eq!(index.len(), a.len()); - let a = a.as_any().downcast_ref::().unwrap(); - assert_eq!(2, a.null_count()); - assert_eq!(3, a.value(0)); - assert_eq!(true, a.is_null(1)); - assert_eq!(true, a.is_null(2)); - assert_eq!(3, a.value(3)); - assert_eq!(2, a.value(4)); // int64 - let a = test_take_numeric::( + test_take_primitive_arrays::( vec![Some(0), None, Some(2), Some(-15), None], &index, None, + vec![Some(-15), None, None, Some(-15), Some(2)], ); - assert_eq!(index.len(), a.len()); - let a = a.as_any().downcast_ref::().unwrap(); - assert_eq!(2, a.null_count()); - assert_eq!(-15, a.value(0)); - assert_eq!(true, a.is_null(1)); - assert_eq!(true, a.is_null(2)); - assert_eq!(-15, a.value(3)); - assert_eq!(2, a.value(4)); // float32 - let a = test_take_numeric::( + test_take_primitive_arrays::( vec![Some(0.0), None, Some(2.21), Some(-3.1), None], &index, None, + vec![Some(-3.1), None, None, Some(-3.1), Some(2.21)], ); - assert_eq!(index.len(), a.len()); - let a = a.as_any().downcast_ref::().unwrap(); - assert_eq!(2, a.null_count()); - assert_eq!(-3.1, a.value(0)); - assert_eq!(true, a.is_null(1)); - assert_eq!(true, a.is_null(2)); - assert_eq!(-3.1, a.value(3)); - assert_eq!(2.21, a.value(4)); - } - #[test] - fn test_take_bool() { - let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(4)]); - let array = BooleanArray::from(vec![ - Some(true), - Some(false), - None, - Some(false), - Some(true), + // float64 + test_take_primitive_arrays::( + vec![Some(0.0), None, Some(2.21), Some(-3.1), None], + &index, None, - ]); - let array = Arc::new(array) as ArrayRef; - let a = take(&array, &index, None).unwrap(); - assert_eq!(a.len(), index.len()); - let b = BooleanArray::from(vec![ - Some(false), + vec![Some(-3.1), None, None, Some(-3.1), Some(2.21)], + ); + + // boolean + // float32 + test_take_primitive_arrays::( + vec![Some(false), None, Some(true), Some(false), None], + &index, None, - Some(false), - Some(false), - Some(true), - ]); - assert_eq!(a.data(), b.data()); + vec![Some(false), None, None, Some(false), Some(true)], + ); } #[test] @@ -388,31 +347,41 @@ mod tests { let a = take(&list_array, &index, None).unwrap(); let a: &ListArray = a.as_any().downcast_ref::().unwrap(); - assert_eq!(5, a.len()); - let b = a.values(); - let b = Int32Array::from(b.data()); - - let taken_offsets = Buffer::from(&[0, 2, 2, 5, 7, 10].to_byte_slice()); - assert_eq!(1, b.null_count()); - assert_eq!(11, b.len()); - assert_eq!(2, b.value(0)); - assert_eq!(3, b.value(1)); - assert_eq!(true, b.is_null(2)); - assert_eq!(-1, b.value(3)); - assert_eq!(-2, b.value(4)); - assert_eq!(-1, b.value(5)); - assert_eq!(2, b.value(6)); - assert_eq!(3, b.value(7)); - assert_eq!(0, b.value(8)); - assert_eq!(0, b.value(9)); - assert_eq!(0, b.value(9)); - // list offsets should be the same - assert_eq!(a.data_ref().buffers(), &[taken_offsets]); + + // construct a value aray with expected results: + // [[2,3], null, [-1,-2,-1], [2,3], [0,0,0]] + let expected_data = Int32Array::from(vec![ + Some(2), + Some(3), + Some(-1), + Some(-2), + Some(-1), + Some(2), + Some(3), + Some(0), + Some(0), + Some(0), + ]) + .data(); + // construct offsets + let expected_offsets = Buffer::from(&[0, 2, 2, 5, 7, 10].to_byte_slice()); + // construct list array from the two + let expected_list_data = ArrayData::builder(list_data_type.clone()) + .len(5) + .null_count(1) + // null buffer remains the same as only the indices have nulls + .null_bit_buffer(index.data().null_bitmap().as_ref().unwrap().bits.clone()) + .add_buffer(expected_offsets.clone()) + .add_child_data(expected_data.clone()) + .build(); + let expected_list_array = ListArray::from(expected_list_data); + + assert!(a.equals(&expected_list_array)); } #[test] fn test_take_list_with_value_nulls() { - // Construct a value array, [[0,null,0], [-1,-2,3], [null], [2,null]] + // Construct a value array, [[0,null,0], [-1,-2,3], [null], [5,null]] let value_data = Int32Array::from(vec![ Some(0), None, @@ -438,36 +407,45 @@ mod tests { .build(); let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; - // index returns: [[null], null, [-1,-2,-1], [2,null], [0,null,0]] + // index returns: [[null], null, [-1,-2,3], [2,null], [0,null,0]] let index = UInt32Array::from(vec![Some(2), None, Some(1), Some(3), Some(0)]); let a = take(&list_array, &index, None).unwrap(); let a: &ListArray = a.as_any().downcast_ref::().unwrap(); - assert_eq!(5, a.len()); - assert_eq!(1, a.null_count()); - let b = a.values(); - let b = Int32Array::from(b.data()); - - let taken_offsets = Buffer::from(&[0, 1, 1, 4, 6, 9].to_byte_slice()); - assert_eq!(4, b.null_count()); - assert_eq!(10, b.len()); - assert_eq!(true, b.is_null(0)); - assert_eq!(true, b.is_null(1)); - assert_eq!(-1, b.value(2)); - assert_eq!(-2, b.value(3)); - assert_eq!(3, b.value(4)); - assert_eq!(5, b.value(5)); - assert_eq!(true, b.is_null(6)); - assert_eq!(0, b.value(7)); - assert_eq!(true, b.is_null(8)); - assert_eq!(0, b.value(9)); - // list offsets should be the same - assert_eq!(a.data_ref().buffers(), &[taken_offsets]); + + // construct a value aray with expected results: + // [[null], null, [-1,-2,3], [5,null], [0,null,0]] + let expected_data = Int32Array::from(vec![ + None, + Some(-1), + Some(-2), + Some(3), + Some(5), + None, + Some(0), + None, + Some(0), + ]) + .data(); + // construct offsets + let expected_offsets = Buffer::from(&[0, 1, 1, 4, 6, 9].to_byte_slice()); + // construct list array from the two + let expected_list_data = ArrayData::builder(list_data_type.clone()) + .len(5) + .null_count(1) + // null buffer remains the same as only the indices have nulls + .null_bit_buffer(index.data().null_bitmap().as_ref().unwrap().bits.clone()) + .add_buffer(expected_offsets.clone()) + .add_child_data(expected_data.clone()) + .build(); + let expected_list_array = ListArray::from(expected_list_data); + + assert!(a.equals(&expected_list_array)); } #[test] fn test_take_list_with_list_nulls() { - // Construct a value array, [[0,null,0], [-1,-2,3], null, [2,null]] + // Construct a value array, [[0,null,0], [-1,-2,3], null, [5,null]] let value_data = Int32Array::from(vec![ Some(0), None, @@ -492,32 +470,43 @@ mod tests { .build(); let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; - // index returns: [null, null, [-1,-2,-1], [2,null], [0,null,0]] + // index returns: [null, null, [-1,-2,3], [5,null], [0,null,0]] let index = UInt32Array::from(vec![Some(2), None, Some(1), Some(3), Some(0)]); let a = take(&list_array, &index, None).unwrap(); let a: &ListArray = a.as_any().downcast_ref::().unwrap(); - assert_eq!(5, a.len()); - assert_eq!(1, a.null_count()); - let b = a.values(); - let b = Int32Array::from(b.data()); - - dbg!(&b); - - let taken_offsets = Buffer::from(&[0, 0, 0, 3, 5, 8].to_byte_slice()); - // list offsets should be the same - assert_eq!(a.data_ref().buffers(), &[taken_offsets]); - assert_eq!(3, b.null_count()); - assert_eq!(9, b.len()); - assert_eq!(true, b.is_null(0)); - assert_eq!(-1, b.value(1)); - assert_eq!(-2, b.value(2)); - assert_eq!(3, b.value(3)); - assert_eq!(5, b.value(4)); - assert_eq!(true, b.is_null(5)); - assert_eq!(0, b.value(6)); - assert_eq!(true, b.is_null(7)); - assert_eq!(0, b.value(8)); + + // construct a value aray with expected results: + // [null, null, [-1,-2,3], [5,null], [0,null,0]] + let expected_data = Int32Array::from(vec![ + Some(-1), + Some(-2), + Some(3), + Some(5), + None, + Some(0), + None, + Some(0), + ]) + .data(); + // construct offsets + let expected_offsets = Buffer::from(&[0, 0, 0, 3, 5, 8].to_byte_slice()); + // construct list array from the two + let mut null_bits: [u8; 1] = [0; 1]; + bit_util::set_bit(&mut null_bits, 2); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + let expected_list_data = ArrayData::builder(list_data_type.clone()) + .len(5) + .null_count(2) + // null buffer must be recalculated as both values and indices have nulls + .null_bit_buffer(Buffer::from(null_bits)) + .add_buffer(expected_offsets.clone()) + .add_child_data(expected_data.clone()) + .build(); + let expected_list_array = ListArray::from(expected_list_data); + + assert!(a.equals(&expected_list_array)); } #[test] @@ -530,16 +519,20 @@ mod tests { assert_eq!(index.len(), a.len()); assert_eq!(0, a.null_count()); - let b = BooleanArray::from(vec![true, true, false, true, false]); - let c = Int32Array::from(vec![42, 31, 28, 42, 19]); - let bools = a.column(0); - let bools = bools.as_any().downcast_ref::().unwrap(); - let ints = a.column(1); - let ints = ints.as_any().downcast_ref::().unwrap(); - assert_eq!(format!("{:?}", bools), format!("{:?}", b)); - assert_eq!(format!("{:?}", ints), format!("{:?}", c)); - assert_eq!(b.data(), bools.data()); - assert_eq!(c.data(), ints.data()); + let expected_bool_data = + BooleanArray::from(vec![true, true, false, true, false]).data(); + let expected_int_data = Int32Array::from(vec![42, 31, 28, 42, 19]).data(); + let mut field_types = vec![]; + field_types.push(Field::new("a", DataType::Boolean, true)); + field_types.push(Field::new("b", DataType::Int32, true)); + let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) + .len(5) + .null_count(0) + .add_child_data(expected_bool_data) + .add_child_data(expected_int_data) + .build(); + let struct_array = StructArray::from(struct_array_data); + assert!(a.equals(&struct_array)); } #[test] @@ -552,14 +545,24 @@ mod tests { assert_eq!(index.len(), a.len()); assert_eq!(0, a.null_count()); - let b = BooleanArray::from(vec![None, Some(true), Some(false), None, Some(true)]); - let c = Int32Array::from(vec![None, Some(31), Some(28), None, Some(42)]); - let bools = a.column(0); - let bools = bools.as_any().downcast_ref::().unwrap(); - let ints = a.column(1); - let ints = ints.as_any().downcast_ref::().unwrap(); - assert_eq!(format!("{:?}", bools), format!("{:?}", b)); - assert_eq!(format!("{:?}", ints), format!("{:?}", c)); + let expected_bool_data = + BooleanArray::from(vec![None, Some(true), Some(false), None, Some(true)]) + .data(); + let expected_int_data = + Int32Array::from(vec![None, Some(31), Some(28), None, Some(42)]).data(); + + let mut field_types = vec![]; + field_types.push(Field::new("a", DataType::Boolean, true)); + field_types.push(Field::new("b", DataType::Int32, true)); + let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) + .len(5) + // TODO see https://issues.apache.org/jira/browse/ARROW-5408 for why count != 2 + .null_count(0) + .add_child_data(expected_bool_data) + .add_child_data(expected_int_data) + .build(); + let struct_array = StructArray::from(struct_array_data); + assert!(a.equals(&struct_array)); } #[test] @@ -571,10 +574,11 @@ mod tests { let take_opt = TakeOptions { check_bounds: true }; // int64 - test_take_numeric::( + test_take_primitive_arrays::( vec![Some(0), None, Some(2), Some(3), None], &index, Some(take_opt), + vec![None], ); } } diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs index 2c54b0447dd..e7c110bd2a5 100644 --- a/rust/arrow/src/compute/util.rs +++ b/rust/arrow/src/compute/util.rs @@ -61,30 +61,30 @@ pub(super) fn take_value_indices_from_list( .map(|i| list.value_offset(i) as u32) .collect(); let mut new_offsets = Vec::with_capacity(indices.len()); + let mut values = Vec::new(); let mut current_offset = 0; // add first offset new_offsets.push(0); - let values: Vec> = (0..indices.len()) - .flat_map(|i: usize| { - if indices.is_valid(i) { - let ix = indices.value(i) as usize; - let start = offsets[ix]; - let end = offsets[ix + 1]; - current_offset += (end - start) as i32; - new_offsets.push(current_offset); - // type annotation needed to guide compiler a bit - (start..end).map(|v| Some(v)).collect::>>() - } else { - new_offsets.push(current_offset); - vec![None] + // compute the value indices, and set offsets accordingly + for i in 0..indices.len() { + if indices.is_valid(i) { + let ix = indices.value(i) as usize; + let start = offsets[ix]; + let end = offsets[ix + 1]; + current_offset += (end - start) as i32; + new_offsets.push(current_offset); + // type annotation needed to guide compiler a bit + let mut offsets: Vec> = + (start..end).map(|v| Some(v)).collect::>>(); + if !offsets.is_empty() { + // if offsets are empty, there are no values to append, and thus we create a null slot + values.append(&mut offsets); } - }) - .map(|x| { - dbg!(&x); - x - }) - .collect(); - dbg!(&values); + } else { + new_offsets.push(current_offset); + // values.push(None); + } + } (UInt32Array::from(values), new_offsets) } @@ -94,7 +94,7 @@ mod tests { use std::sync::Arc; - use crate::array_data::ArrayData; + use crate::array::ArrayData; use crate::datatypes::{DataType, ToByteSlice}; #[test] From 6e7af2db5cf9bcc5f8c8fa67e441702f7c0e4ccc Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Tue, 9 Jul 2019 20:34:34 +0000 Subject: [PATCH 9/9] address review comments --- rust/arrow/src/compute/kernels/take.rs | 49 ++++++++++++++++---------- rust/arrow/src/compute/util.rs | 13 ++++--- 2 files changed, 36 insertions(+), 26 deletions(-) diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs index 21b02f8d200..6cce7fb47d9 100644 --- a/rust/arrow/src/compute/kernels/take.rs +++ b/rust/arrow/src/compute/kernels/take.rs @@ -38,7 +38,7 @@ pub fn take( indices: &UInt32Array, options: Option, ) -> Result { - let options = options.clone().unwrap_or(Default::default()); + let options = options.unwrap_or(Default::default()); if options.check_bounds { let len = values.len(); for i in 0..indices.len() { @@ -125,6 +125,14 @@ impl Default for TakeOptions { } /// `take` implementation for primitive arrays +/// +/// This checks if an `indices` slot is populated, and gets the value from `values` +/// as the populated index. +/// If the `indices` slot is null, a null value is returned. +/// For example, given: +/// values: [1, 2, 3, null, 5] +/// indices: [0, null, 4, 3] +/// The result is: [1 (slot 0), null (null slot), 5 (slot 4), null (slot 3)] fn take_primitive(values: &ArrayRef, indices: &UInt32Array) -> Result where T: ArrowPrimitiveType, @@ -133,8 +141,10 @@ where let a = values.as_any().downcast_ref::>().unwrap(); for i in 0..indices.len() { if indices.is_null(i) { + // populate with null if index is null builder.append_null()?; } else { + // get index value to use in looking up the value from `values` let ix = indices.value(i) as usize; if a.is_valid(ix) { builder.append_value(a.value(ix))?; @@ -171,11 +181,12 @@ fn take_binary(values: &ArrayRef, indices: &UInt32Array) -> Result { /// applying `take` on the inner array, then reconstructing a list array /// with the indexed offsets fn take_list(values: &ArrayRef, indices: &UInt32Array) -> Result { - // TODO Some optimizations can be done here such as if it is taking the whole list or a contiguous sublist + // TODO: Some optimizations can be done here such as if it is + // taking the whole list or a contiguous sublist let list: &ListArray = values.as_any().downcast_ref::().unwrap(); let (list_indices, offsets) = take_value_indices_from_list(values, indices); let taken = take(&list.values(), &list_indices, None)?; - // determine null count and null buffer, because they are now a function of `values` and `indices` + // determine null count and null buffer, which are a function of `values` and `indices` let mut null_count = 0; let num_bytes = bit_util::ceil(indices.len(), 8); let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); @@ -194,7 +205,7 @@ fn take_list(values: &ArrayRef, indices: &UInt32Array) -> Result { }); } let value_offsets = Buffer::from(offsets[..].to_byte_slice()); - // create a new list with + // create a new list with taken data and computed null information let list_data = ArrayDataBuilder::new(list.data_type().clone()) .len(indices.len()) .null_count(null_count) @@ -237,8 +248,8 @@ mod tests { let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) .len(4) .null_count(0) - .add_child_data(boolean_data.clone()) - .add_child_data(int_data.clone()) + .add_child_data(boolean_data) + .add_child_data(int_data) .build(); let struct_array = StructArray::from(struct_array_data); Arc::new(struct_array) as ArrayRef @@ -337,8 +348,8 @@ mod tests { let list_data_type = DataType::List(Box::new(DataType::Int32)); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) - .add_buffer(value_offsets.clone()) - .add_child_data(value_data.clone()) + .add_buffer(value_offsets) + .add_child_data(value_data) .build(); let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; @@ -371,8 +382,8 @@ mod tests { .null_count(1) // null buffer remains the same as only the indices have nulls .null_bit_buffer(index.data().null_bitmap().as_ref().unwrap().bits.clone()) - .add_buffer(expected_offsets.clone()) - .add_child_data(expected_data.clone()) + .add_buffer(expected_offsets) + .add_child_data(expected_data) .build(); let expected_list_array = ListArray::from(expected_list_data); @@ -400,10 +411,10 @@ mod tests { let list_data_type = DataType::List(Box::new(DataType::Int32)); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) - .add_buffer(value_offsets.clone()) + .add_buffer(value_offsets) .null_count(0) .null_bit_buffer(Buffer::from([0b10111101, 0b00000000])) - .add_child_data(value_data.clone()) + .add_child_data(value_data) .build(); let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; @@ -435,8 +446,8 @@ mod tests { .null_count(1) // null buffer remains the same as only the indices have nulls .null_bit_buffer(index.data().null_bitmap().as_ref().unwrap().bits.clone()) - .add_buffer(expected_offsets.clone()) - .add_child_data(expected_data.clone()) + .add_buffer(expected_offsets) + .add_child_data(expected_data) .build(); let expected_list_array = ListArray::from(expected_list_data); @@ -463,10 +474,10 @@ mod tests { let list_data_type = DataType::List(Box::new(DataType::Int32)); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) - .add_buffer(value_offsets.clone()) + .add_buffer(value_offsets) .null_count(1) .null_bit_buffer(Buffer::from([0b01111101])) - .add_child_data(value_data.clone()) + .add_child_data(value_data) .build(); let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; @@ -501,8 +512,8 @@ mod tests { .null_count(2) // null buffer must be recalculated as both values and indices have nulls .null_bit_buffer(Buffer::from(null_bits)) - .add_buffer(expected_offsets.clone()) - .add_child_data(expected_data.clone()) + .add_buffer(expected_offsets) + .add_child_data(expected_data) .build(); let expected_list_array = ListArray::from(expected_list_data); @@ -556,7 +567,7 @@ mod tests { field_types.push(Field::new("b", DataType::Int32, true)); let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) .len(5) - // TODO see https://issues.apache.org/jira/browse/ARROW-5408 for why count != 2 + // TODO: see https://issues.apache.org/jira/browse/ARROW-5408 for why count != 2 .null_count(0) .add_child_data(expected_bool_data) .add_child_data(expected_int_data) diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs index e7c110bd2a5..dc1f54fdd2a 100644 --- a/rust/arrow/src/compute/util.rs +++ b/rust/arrow/src/compute/util.rs @@ -54,7 +54,7 @@ pub(super) fn take_value_indices_from_list( values: &ArrayRef, indices: &UInt32Array, ) -> (UInt32Array, Vec) { - // TODO benchmark this function, there might be a faster unsafe alternative + // TODO: benchmark this function, there might be a faster unsafe alternative // get list array's offsets let list: &ListArray = values.as_any().downcast_ref::().unwrap(); let offsets: Vec = (0..=list.len()) @@ -73,16 +73,15 @@ pub(super) fn take_value_indices_from_list( let end = offsets[ix + 1]; current_offset += (end - start) as i32; new_offsets.push(current_offset); - // type annotation needed to guide compiler a bit - let mut offsets: Vec> = - (start..end).map(|v| Some(v)).collect::>>(); - if !offsets.is_empty() { - // if offsets are empty, there are no values to append, and thus we create a null slot + // if start == end, this slot is empty + if start != end { + // type annotation needed to guide compiler a bit + let mut offsets: Vec> = + (start..end).map(|v| Some(v)).collect::>>(); values.append(&mut offsets); } } else { new_offsets.push(current_offset); - // values.push(None); } } (UInt32Array::from(values), new_offsets)