From 10be8bb6a6475fa33636ace69fe845be23400a0a Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Mon, 4 Jan 2021 20:05:38 +0200 Subject: [PATCH 1/6] ARROW-11125: [Rust] Logical equality for list arrays --- rust/arrow/src/array/data.rs | 165 +++++++++++++++- rust/arrow/src/array/equal/boolean.rs | 61 ++++-- rust/arrow/src/array/equal/decimal.rs | 19 +- rust/arrow/src/array/equal/dictionary.rs | 19 +- rust/arrow/src/array/equal/fixed_binary.rs | 19 +- rust/arrow/src/array/equal/fixed_list.rs | 19 +- rust/arrow/src/array/equal/list.rs | 64 +++++- rust/arrow/src/array/equal/mod.rs | 218 +++++++++++---------- rust/arrow/src/array/equal/primitive.rs | 18 +- rust/arrow/src/array/equal/structure.rs | 30 +-- rust/datafusion/tests/sql.rs | 1 + 11 files changed, 470 insertions(+), 163 deletions(-) diff --git a/rust/arrow/src/array/data.rs b/rust/arrow/src/array/data.rs index f4e241e1666..887a22e5d50 100644 --- a/rust/arrow/src/array/data.rs +++ b/rust/arrow/src/array/data.rs @@ -21,11 +21,12 @@ use std::mem; use std::sync::Arc; -use crate::buffer::Buffer; +use crate::buffer::{Buffer, MutableBuffer}; use crate::datatypes::DataType; +use crate::util::bit_util; use crate::{bitmap::Bitmap, datatypes::ArrowNativeType}; -use super::equal::equal; +use super::{equal::equal, OffsetSizeTrait}; #[inline] pub(crate) fn count_nulls( @@ -33,7 +34,7 @@ pub(crate) fn count_nulls( offset: usize, len: usize, ) -> usize { - if let Some(ref buf) = null_bit_buffer { + if let Some(buf) = null_bit_buffer { len.checked_sub(buf.count_set_bits_offset(offset, len)) .unwrap() } else { @@ -136,6 +137,84 @@ impl ArrayData { &self.null_bitmap } + /// Computes the logical validity bitmap of the array data using the + /// parent's array data. The parent should be a list or struct, else + /// the logical bitmap of the array is returned unaltered. + /// + /// Parent data is passed along with the parent's logical bitmap, as + /// nested arrays could have a logical bitmap different to the physical + /// one on the `ArrayData`. + /// + /// Safety + /// + /// As we index into [`ArrayData::child_data`], this function panics if + /// array data is not a nested type, as it will not have child data. + pub fn child_logical_null_buffer( + &self, + logical_null_buffer: Option, + child_index: usize, + ) -> Option { + // This function should only be called when having nested data types. + // However, as a convenience, we return the parent's logical buffer if + // we do not encounter a nested type. + let child_data = self.child_data().get(child_index).unwrap(); + + // TODO: rationalise this logic, prefer not creating populated bitmaps, use Option matching + // I found taking a Bitmap that's populated to be more convenient, but I was more concerned first + // about accuracy, so I'll explore using Option<&Bitmap> directly. + let parent_bitmap = logical_null_buffer.map(Bitmap::from).unwrap_or_else(|| { + let ceil = bit_util::ceil(self.len(), 8); + Bitmap::from(Buffer::from(vec![0b11111111; ceil])) + }); + let self_null_bitmap = child_data.null_bitmap().clone().unwrap_or_else(|| { + let ceil = bit_util::ceil(child_data.len(), 8); + Bitmap::from(Buffer::from(vec![0b11111111; ceil])) + }); + match self.data_type() { + DataType::List(_) => Some(logical_list_bitmap::( + self, + parent_bitmap, + self_null_bitmap, + )), + DataType::LargeList(_) => Some(logical_list_bitmap::( + self, + parent_bitmap, + self_null_bitmap, + )), + DataType::FixedSizeList(_, len) => { + let len = *len as usize; + let array_len = self.len(); + let array_offset = self.offset(); + let bitmap_len = bit_util::ceil(array_len * len, 8); + let mut buffer = + MutableBuffer::new(bitmap_len).with_bitset(bitmap_len, false); + let mut null_slice = buffer.data_mut(); + (array_offset..array_len + array_offset).for_each(|index| { + let start = index * len; + let end = start + len; + let mask = parent_bitmap.is_set(index); + (start..end).for_each(|child_index| { + if mask && self_null_bitmap.is_set(child_index) { + bit_util::set_bit(&mut null_slice, child_index); + } + }); + }); + Some(buffer.freeze()) + } + DataType::Struct(_) => (&parent_bitmap & &self_null_bitmap) + .ok() + .map(|bitmap| bitmap.bits), + DataType::Union(_) => { + panic!("Logical equality not yet implemented for union arrays") + } + _ => { + panic!( + "Encountered array data with child data, but not a known nested type" + ) + } + } + } + /// Returns a reference to the null buffer of this array data. pub fn null_buffer(&self) -> Option<&Buffer> { self.null_bitmap().as_ref().map(|b| b.buffer_ref()) @@ -331,14 +410,43 @@ impl ArrayDataBuilder { } } +#[inline] +fn logical_list_bitmap( + parent_data: &ArrayData, + parent_bitmap: Bitmap, + child_bitmap: Bitmap, +) -> Buffer { + let offsets = parent_data.buffer::(0); + let offset_start = offsets.first().unwrap().to_usize().unwrap(); + let offset_len = offsets.get(parent_data.len()).unwrap().to_usize().unwrap(); + let mut buffer = MutableBuffer::new_null(offset_len - offset_start); + let mut null_slice = buffer.data_mut(); + + offsets + .windows(2) + .enumerate() + .take(offset_len - offset_start) + .for_each(|(index, window)| { + dbg!(&window, index); + let start = window[0].to_usize().unwrap(); + let end = window[1].to_usize().unwrap(); + let mask = parent_bitmap.is_set(index); + (start..end).for_each(|child_index| { + if mask && child_bitmap.is_set(child_index) { + bit_util::set_bit(&mut null_slice, child_index - offset_start); + } + }); + }); + buffer.freeze() +} + #[cfg(test)] mod tests { use super::*; use std::sync::Arc; - use crate::buffer::Buffer; - use crate::datatypes::ToByteSlice; + use crate::datatypes::{Field, ToByteSlice}; use crate::util::bit_util; #[test] @@ -462,4 +570,51 @@ mod tests { let count = count_nulls(null_buffer.as_ref(), 4, 8); assert_eq!(count, 3); } + + #[test] + fn test_logical_null_buffer() { + let child_data = ArrayData::builder(DataType::Int32) + .len(11) + .add_buffer(Buffer::from( + vec![1i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11].to_byte_slice(), + )) + .build(); + + let data = ArrayData::builder(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + false, + )))) + .len(7) + .add_buffer(Buffer::from(vec![0, 0, 3, 5, 6, 9, 10, 11].to_byte_slice())) + .null_bit_buffer(Buffer::from(vec![0b01011010])) + .add_child_data(child_data.clone()) + .build(); + + // Get the child logical null buffer. The child is non-nullable, but because the list has nulls, + // we expect the child to logically have some nulls, inherited from the parent: + // [1, 2, 3, null, null, 6, 7, 8, 9, null, 11] + let nulls = data.child_logical_null_buffer(data.null_buffer().cloned(), 0); + let expected = Some(Buffer::from(vec![0b11100111, 0b00000101])); + assert_eq!(nulls, expected); + + // test with offset + let data = ArrayData::builder(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + false, + )))) + .len(4) + .offset(3) + .add_buffer(Buffer::from(vec![0, 0, 3, 5, 6, 9, 10, 11].to_byte_slice())) + // the null_bit_buffer doesn't have an offset, i.e. cleared the 3 offset bits 0b[---]01011[010] + .null_bit_buffer(Buffer::from(vec![0b00001011])) + .add_child_data(child_data) + .build(); + + let nulls = data.child_logical_null_buffer(data.null_buffer().cloned(), 0); + + let expected = Some(Buffer::from(vec![0b00101111])); + assert_eq!(nulls, expected); + } } diff --git a/rust/arrow/src/array/equal/boolean.rs b/rust/arrow/src/array/equal/boolean.rs index 88bd080ba53..5aa48fcf556 100644 --- a/rust/arrow/src/array/equal/boolean.rs +++ b/rust/arrow/src/array/equal/boolean.rs @@ -15,13 +15,17 @@ // specific language governing permissions and limitations // under the License. -use crate::array::ArrayData; +use crate::array::{data::count_nulls, ArrayData}; +use crate::buffer::Buffer; +use crate::util::bit_util::get_bit; use super::utils::equal_bits; pub(super) fn boolean_equal( lhs: &ArrayData, rhs: &ArrayData, + lhs_nulls: Option<&Buffer>, + rhs_nulls: Option<&Buffer>, lhs_start: usize, rhs_start: usize, len: usize, @@ -29,21 +33,42 @@ pub(super) fn boolean_equal( let lhs_values = lhs.buffers()[0].as_slice(); let rhs_values = rhs.buffers()[0].as_slice(); - // TODO: we can do this more efficiently if all values are not-null - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - let lhs_is_null = lhs.is_null(lhs_pos); - let rhs_is_null = rhs.is_null(rhs_pos); - - lhs_is_null - || (lhs_is_null == rhs_is_null) - && equal_bits( - lhs_values, - rhs_values, - lhs_pos + lhs.offset(), - rhs_pos + rhs.offset(), - 1, - ) - }) + let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); + let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); + + if lhs_null_count == 0 && rhs_null_count == 0 { + (0..len).all(|i| { + let lhs_pos = lhs_start + i; + let rhs_pos = rhs_start + i; + + equal_bits( + lhs_values, + rhs_values, + lhs_pos + lhs.offset(), + rhs_pos + rhs.offset(), + 1, + ) + }) + } else { + // get a ref of the null buffer bytes, to use in testing for nullness + let lhs_null_bytes = lhs_nulls.as_ref().unwrap().data(); + let rhs_null_bytes = rhs_nulls.as_ref().unwrap().data(); + + (0..len).all(|i| { + let lhs_pos = lhs_start + i; + let rhs_pos = rhs_start + i; + let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos); + let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos); + + lhs_is_null + || (lhs_is_null == rhs_is_null) + && equal_bits( + lhs_values, + rhs_values, + lhs_pos + lhs.offset(), + rhs_pos + rhs.offset(), + 1, + ) + }) + } } diff --git a/rust/arrow/src/array/equal/decimal.rs b/rust/arrow/src/array/equal/decimal.rs index a8fdded2fa7..a8c0659ceea 100644 --- a/rust/arrow/src/array/equal/decimal.rs +++ b/rust/arrow/src/array/equal/decimal.rs @@ -15,13 +15,18 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::ArrayData, datatypes::DataType}; +use crate::array::{data::count_nulls, ArrayData}; +use crate::buffer::Buffer; +use crate::datatypes::DataType; +use crate::util::bit_util::get_bit; use super::utils::equal_len; pub(super) fn decimal_equal( lhs: &ArrayData, rhs: &ArrayData, + lhs_nulls: Option<&Buffer>, + rhs_nulls: Option<&Buffer>, lhs_start: usize, rhs_start: usize, len: usize, @@ -34,7 +39,10 @@ pub(super) fn decimal_equal( let lhs_values = &lhs.buffers()[0].as_slice()[lhs.offset() * size..]; let rhs_values = &rhs.buffers()[0].as_slice()[rhs.offset() * size..]; - if lhs.null_count() == 0 && rhs.null_count() == 0 { + let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); + let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); + + if lhs_null_count == 0 && rhs_null_count == 0 { equal_len( lhs_values, rhs_values, @@ -43,13 +51,16 @@ pub(super) fn decimal_equal( size * len, ) } else { + // get a ref of the null buffer bytes, to use in testing for nullness + let lhs_null_bytes = lhs_nulls.as_ref().unwrap().data(); + let rhs_null_bytes = rhs_nulls.as_ref().unwrap().data(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; - let lhs_is_null = lhs.is_null(lhs_pos); - let rhs_is_null = rhs.is_null(rhs_pos); + let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); + let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + lhs.offset()); lhs_is_null || (lhs_is_null == rhs_is_null) diff --git a/rust/arrow/src/array/equal/dictionary.rs b/rust/arrow/src/array/equal/dictionary.rs index 087f8f8329b..df6dd292d9a 100644 --- a/rust/arrow/src/array/equal/dictionary.rs +++ b/rust/arrow/src/array/equal/dictionary.rs @@ -15,13 +15,18 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::ArrayData, datatypes::ArrowNativeType}; +use crate::array::{data::count_nulls, ArrayData}; +use crate::buffer::Buffer; +use crate::datatypes::ArrowNativeType; +use crate::util::bit_util::get_bit; use super::equal_range; pub(super) fn dictionary_equal( lhs: &ArrayData, rhs: &ArrayData, + lhs_nulls: Option<&Buffer>, + rhs_nulls: Option<&Buffer>, lhs_start: usize, rhs_start: usize, len: usize, @@ -32,7 +37,10 @@ pub(super) fn dictionary_equal( let lhs_values = lhs.child_data()[0].as_ref(); let rhs_values = rhs.child_data()[0].as_ref(); - if lhs.null_count() == 0 && rhs.null_count() == 0 { + let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); + let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); + + if lhs_null_count == 0 && rhs_null_count == 0 { (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; @@ -48,12 +56,15 @@ pub(super) fn dictionary_equal( ) }) } else { + // get a ref of the null buffer bytes, to use in testing for nullness + let lhs_null_bytes = lhs_nulls.as_ref().unwrap().data(); + let rhs_null_bytes = rhs_nulls.as_ref().unwrap().data(); (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; - let lhs_is_null = lhs.is_null(lhs_pos); - let rhs_is_null = rhs.is_null(rhs_pos); + let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos); + let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos); lhs_is_null || (lhs_is_null == rhs_is_null) diff --git a/rust/arrow/src/array/equal/fixed_binary.rs b/rust/arrow/src/array/equal/fixed_binary.rs index c6889ba4b43..a0a99c0f7d4 100644 --- a/rust/arrow/src/array/equal/fixed_binary.rs +++ b/rust/arrow/src/array/equal/fixed_binary.rs @@ -15,13 +15,18 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::ArrayData, datatypes::DataType}; +use crate::array::{data::count_nulls, ArrayData}; +use crate::buffer::Buffer; +use crate::datatypes::DataType; +use crate::util::bit_util::get_bit; use super::utils::equal_len; pub(super) fn fixed_binary_equal( lhs: &ArrayData, rhs: &ArrayData, + lhs_nulls: Option<&Buffer>, + rhs_nulls: Option<&Buffer>, lhs_start: usize, rhs_start: usize, len: usize, @@ -34,7 +39,10 @@ pub(super) fn fixed_binary_equal( let lhs_values = &lhs.buffers()[0].as_slice()[lhs.offset() * size..]; let rhs_values = &rhs.buffers()[0].as_slice()[rhs.offset() * size..]; - if lhs.null_count() == 0 && rhs.null_count() == 0 { + let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); + let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); + + if lhs_null_count == 0 && rhs_null_count == 0 { equal_len( lhs_values, rhs_values, @@ -43,13 +51,16 @@ pub(super) fn fixed_binary_equal( size * len, ) } else { + // get a ref of the null buffer bytes, to use in testing for nullness + let lhs_null_bytes = lhs_nulls.as_ref().unwrap().data(); + let rhs_null_bytes = rhs_nulls.as_ref().unwrap().data(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; - let lhs_is_null = lhs.is_null(lhs_pos); - let rhs_is_null = rhs.is_null(rhs_pos); + let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos); + let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos); lhs_is_null || (lhs_is_null == rhs_is_null) diff --git a/rust/arrow/src/array/equal/fixed_list.rs b/rust/arrow/src/array/equal/fixed_list.rs index ee0ee0f9c21..72588fcead2 100644 --- a/rust/arrow/src/array/equal/fixed_list.rs +++ b/rust/arrow/src/array/equal/fixed_list.rs @@ -15,13 +15,18 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::ArrayData, datatypes::DataType}; +use crate::array::{data::count_nulls, ArrayData}; +use crate::buffer::Buffer; +use crate::datatypes::DataType; +use crate::util::bit_util::get_bit; use super::equal_range; pub(super) fn fixed_list_equal( lhs: &ArrayData, rhs: &ArrayData, + lhs_nulls: Option<&Buffer>, + rhs_nulls: Option<&Buffer>, lhs_start: usize, rhs_start: usize, len: usize, @@ -34,7 +39,10 @@ pub(super) fn fixed_list_equal( let lhs_values = lhs.child_data()[0].as_ref(); let rhs_values = rhs.child_data()[0].as_ref(); - if lhs.null_count() == 0 && rhs.null_count() == 0 { + let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); + let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); + + if lhs_null_count == 0 && rhs_null_count == 0 { equal_range( lhs_values, rhs_values, @@ -45,13 +53,16 @@ pub(super) fn fixed_list_equal( size * len, ) } else { + // get a ref of the null buffer bytes, to use in testing for nullness + let lhs_null_bytes = lhs_nulls.as_ref().unwrap().data(); + let rhs_null_bytes = rhs_nulls.as_ref().unwrap().data(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; - let lhs_is_null = lhs.is_null(lhs_pos); - let rhs_is_null = rhs.is_null(rhs_pos); + let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos); + let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos); lhs_is_null || (lhs_is_null == rhs_is_null) diff --git a/rust/arrow/src/array/equal/list.rs b/rust/arrow/src/array/equal/list.rs index c48c716d6e0..c0fd5c01d13 100644 --- a/rust/arrow/src/array/equal/list.rs +++ b/rust/arrow/src/array/equal/list.rs @@ -15,7 +15,12 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::ArrayData, array::OffsetSizeTrait}; +use crate::{ + array::ArrayData, + array::{data::count_nulls, OffsetSizeTrait}, + buffer::Buffer, + util::bit_util::get_bit, +}; use super::equal_range; @@ -45,6 +50,8 @@ fn lengths_equal(lhs: &[T], rhs: &[T]) -> bool { fn offset_value_equal( lhs_values: &ArrayData, rhs_values: &ArrayData, + lhs_nulls: Option<&Buffer>, + rhs_nulls: Option<&Buffer>, lhs_offsets: &[T], rhs_offsets: &[T], lhs_pos: usize, @@ -60,8 +67,8 @@ fn offset_value_equal( && equal_range( lhs_values, rhs_values, - lhs_values.null_buffer(), - rhs_values.null_buffer(), + lhs_nulls, + rhs_nulls, lhs_start, rhs_start, lhs_len.to_usize().unwrap(), @@ -71,6 +78,8 @@ fn offset_value_equal( pub(super) fn list_equal( lhs: &ArrayData, rhs: &ArrayData, + lhs_nulls: Option<&Buffer>, + rhs_nulls: Option<&Buffer>, lhs_start: usize, rhs_start: usize, len: usize, @@ -78,18 +87,52 @@ pub(super) fn list_equal( let lhs_offsets = lhs.buffer::(0); let rhs_offsets = rhs.buffer::(0); + // There is an edge-case where a n-length list that has 0 children, results in panics. + // For example; an array with offsets [0, 0, 0, 0, 0] has 4 slots, but will have + // no valid children. + // Under logical equality, the child null bitmap will be an empty buffer, as there are + // no child values. This causes panics when trying to count set bits. + // + // We caught this by chance from an accidental test-case, but due to the nature of this + // crash only occuring on list equality checks, we are adding a check here, instead of + // on the buffer/bitmap utilities, as a length check would incur a penalty for almost all + // other use-cases. + // + // The solution is to check the number of child values from offsets, and return `true` if + // they = 0. Empty arrays are equal, so this is correct. + // + // It's unlikely that one would create a n-length list array with no values, where n > 0, + // however, one is more likely to slice into a list array and get a region that has 0 + // child values. + // The test that triggered this behaviour had [4, 4] as a slice of 1 value slot. + let lhs_child_length = lhs_offsets.get(len).unwrap().to_usize().unwrap() + - lhs_offsets.first().unwrap().to_usize().unwrap(); + let rhs_child_length = rhs_offsets.get(len).unwrap().to_usize().unwrap() + - rhs_offsets.first().unwrap().to_usize().unwrap(); + + if lhs_child_length == 0 && lhs_child_length == rhs_child_length { + return true; + } + let lhs_values = lhs.child_data()[0].as_ref(); let rhs_values = rhs.child_data()[0].as_ref(); - if lhs.null_count() == 0 && rhs.null_count() == 0 { + let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); + let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); + + // compute the child logical bitmap + let child_lhs_nulls = lhs.child_logical_null_buffer(lhs_nulls.cloned(), 0); + let child_rhs_nulls = rhs.child_logical_null_buffer(rhs_nulls.cloned(), 0); + + if lhs_null_count == 0 && rhs_null_count == 0 { lengths_equal( &lhs_offsets[lhs_start..lhs_start + len], &rhs_offsets[rhs_start..rhs_start + len], ) && equal_range( lhs_values, rhs_values, - lhs_values.null_buffer(), - rhs_values.null_buffer(), + child_lhs_nulls.as_ref(), + child_rhs_nulls.as_ref(), lhs_offsets[lhs_start].to_usize().unwrap(), rhs_offsets[rhs_start].to_usize().unwrap(), (lhs_offsets[len] - lhs_offsets[lhs_start]) @@ -97,19 +140,24 @@ pub(super) fn list_equal( .unwrap(), ) } else { + // get a ref of the null buffer bytes, to use in testing for nullness + let lhs_null_bytes = child_lhs_nulls.as_ref().unwrap().data(); + let rhs_null_bytes = child_rhs_nulls.as_ref().unwrap().data(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; - let lhs_is_null = lhs.is_null(lhs_pos); - let rhs_is_null = rhs.is_null(rhs_pos); + let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos); + let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos); lhs_is_null || (lhs_is_null == rhs_is_null) && offset_value_equal::( lhs_values, rhs_values, + child_lhs_nulls.as_ref(), + child_rhs_nulls.as_ref(), lhs_offsets, rhs_offsets, lhs_pos, diff --git a/rust/arrow/src/array/equal/mod.rs b/rust/arrow/src/array/equal/mod.rs index 412d951da5b..33977b49694 100644 --- a/rust/arrow/src/array/equal/mod.rs +++ b/rust/arrow/src/array/equal/mod.rs @@ -146,118 +146,103 @@ fn equal_values( rhs_start: usize, len: usize, ) -> bool { - // compute the nested buffer of the parent and child - // if the array has no parent, the child is computed with itself - #[allow(unused_assignments)] - let mut temp_lhs: Option = None; - #[allow(unused_assignments)] - let mut temp_rhs: Option = None; - let lhs_merged_nulls = match (lhs_nulls, lhs.null_buffer()) { - (None, None) => None, - (None, Some(c)) => Some(c), - (Some(p), None) => Some(p), - (Some(p), Some(c)) => { - let merged = (p & c).unwrap(); - temp_lhs = Some(merged); - temp_lhs.as_ref() - } - }; - let rhs_merged_nulls = match (rhs_nulls, rhs.null_buffer()) { - (None, None) => None, - (None, Some(c)) => Some(c), - (Some(p), None) => Some(p), - (Some(p), Some(c)) => { - let merged = (p & c).unwrap(); - temp_rhs = Some(merged); - temp_rhs.as_ref() - } - }; - match lhs.data_type() { DataType::Null => null_equal(lhs, rhs, lhs_start, rhs_start, len), - DataType::Boolean => boolean_equal(lhs, rhs, lhs_start, rhs_start, len), - DataType::UInt8 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::UInt16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::UInt32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::UInt64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int8 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Float32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Float64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Boolean => { + boolean_equal(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) + } + DataType::UInt8 => primitive_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::UInt16 => primitive_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::UInt32 => primitive_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::UInt64 => primitive_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::Int8 => primitive_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::Int16 => primitive_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::Int32 => primitive_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::Int64 => primitive_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::Float32 => primitive_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::Float64 => primitive_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), DataType::Date32(_) | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) - } + | DataType::Interval(IntervalUnit::YearMonth) => primitive_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), DataType::Date64(_) | DataType::Interval(IntervalUnit::DayTime) | DataType::Time64(_) | DataType::Timestamp(_, _) - | DataType::Duration(_) => { - primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) - } + | DataType::Duration(_) => primitive_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), DataType::Utf8 | DataType::Binary => variable_sized_equal::( - lhs, - rhs, - lhs_merged_nulls, - rhs_merged_nulls, - lhs_start, - rhs_start, - len, + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, ), DataType::LargeUtf8 | DataType::LargeBinary => variable_sized_equal::( - lhs, - rhs, - lhs_merged_nulls, - rhs_merged_nulls, - lhs_start, - rhs_start, - len, + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, ), DataType::FixedSizeBinary(_) => { - fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len) + fixed_binary_equal(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) + } + DataType::Decimal(_, _) => { + decimal_equal(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) + } + DataType::List(_) => { + list_equal::(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) + } + DataType::LargeList(_) => { + list_equal::(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) } - DataType::Decimal(_, _) => decimal_equal(lhs, rhs, lhs_start, rhs_start, len), - DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::FixedSizeList(_, _) => { - fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len) + fixed_list_equal(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) + } + DataType::Struct(_) => { + struct_equal(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) } - DataType::Struct(_) => struct_equal( - lhs, - rhs, - lhs_merged_nulls, - rhs_merged_nulls, - lhs_start, - rhs_start, - len, - ), DataType::Union(_) => unimplemented!("See ARROW-8576"), DataType::Dictionary(data_type, _) => match data_type.as_ref() { - DataType::Int8 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int16 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Int32 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Int64 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt8 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt16 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt32 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt64 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } + DataType::Int8 => dictionary_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::Int16 => dictionary_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::Int32 => dictionary_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::Int64 => dictionary_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::UInt8 => dictionary_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::UInt16 => dictionary_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::UInt32 => dictionary_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), + DataType::UInt64 => dictionary_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), _ => unreachable!(), }, DataType::Float16 => unreachable!(), @@ -305,14 +290,14 @@ mod tests { use std::sync::Arc; use crate::array::{ - array::Array, ArrayDataRef, ArrayRef, BinaryOffsetSizeTrait, BooleanArray, - DecimalBuilder, FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, - Int32Builder, ListBuilder, NullArray, PrimitiveBuilder, StringArray, - StringDictionaryBuilder, StringOffsetSizeTrait, StructArray, + array::Array, ArrayDataBuilder, ArrayDataRef, ArrayRef, BinaryOffsetSizeTrait, + BooleanArray, DecimalBuilder, FixedSizeBinaryBuilder, FixedSizeListBuilder, + GenericBinaryArray, Int32Builder, ListBuilder, NullArray, PrimitiveBuilder, + StringArray, StringDictionaryBuilder, StringOffsetSizeTrait, StructArray, }; use crate::array::{GenericStringArray, Int32Array}; use crate::buffer::Buffer; - use crate::datatypes::{Field, Int16Type}; + use crate::datatypes::{Field, Int16Type, ToByteSlice}; use super::*; @@ -599,6 +584,41 @@ mod tests { let b = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); test_equal(a.as_ref(), b.as_ref(), false); + + // a list where the nullness of values is determined by the list's bitmap + let c_values = Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]); + let c = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(6) + .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) + .add_child_data(c_values.data()) + .null_bit_buffer(Buffer::from(vec![0b00001001])) + .build(); + + let d_values = Int32Array::from(vec![ + Some(1), + Some(2), + None, + None, + Some(3), + Some(4), + None, + None, + ]); + let d = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(6) + .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) + .add_child_data(d_values.data()) + .null_bit_buffer(Buffer::from(vec![0b00001001])) + .build(); + test_equal(c.as_ref(), d.as_ref(), true); } // Test the case where offset != 0 diff --git a/rust/arrow/src/array/equal/primitive.rs b/rust/arrow/src/array/equal/primitive.rs index 4bb256643ca..6bfd551fb58 100644 --- a/rust/arrow/src/array/equal/primitive.rs +++ b/rust/arrow/src/array/equal/primitive.rs @@ -17,13 +17,17 @@ use std::mem::size_of; -use crate::array::ArrayData; +use crate::array::{data::count_nulls, ArrayData}; +use crate::buffer::Buffer; +use crate::util::bit_util::get_bit; use super::utils::equal_len; pub(super) fn primitive_equal( lhs: &ArrayData, rhs: &ArrayData, + lhs_nulls: Option<&Buffer>, + rhs_nulls: Option<&Buffer>, lhs_start: usize, rhs_start: usize, len: usize, @@ -32,7 +36,10 @@ pub(super) fn primitive_equal( let lhs_values = &lhs.buffers()[0].as_slice()[lhs.offset() * byte_width..]; let rhs_values = &rhs.buffers()[0].as_slice()[rhs.offset() * byte_width..]; - if lhs.null_count() == 0 && rhs.null_count() == 0 { + let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); + let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); + + if lhs_null_count == 0 && rhs_null_count == 0 { // without nulls, we just need to compare slices equal_len( lhs_values, @@ -42,12 +49,15 @@ pub(super) fn primitive_equal( len * byte_width, ) } else { + // get a ref of the null buffer bytes, to use in testing for nullness + let lhs_null_bytes = lhs_nulls.as_ref().unwrap().data(); + let rhs_null_bytes = rhs_nulls.as_ref().unwrap().data(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; - let lhs_is_null = lhs.is_null(lhs_pos); - let rhs_is_null = rhs.is_null(rhs_pos); + let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos); + let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos); lhs_is_null || (lhs_is_null == rhs_is_null) diff --git a/rust/arrow/src/array/equal/structure.rs b/rust/arrow/src/array/equal/structure.rs index 31ccbc870d0..141aecf02a2 100644 --- a/rust/arrow/src/array/equal/structure.rs +++ b/rust/arrow/src/array/equal/structure.rs @@ -38,12 +38,13 @@ fn equal_values( len: usize, ) -> bool { let mut temp_lhs: Option = None; - let mut temp_rhs: Option = None; + // let mut temp_rhs: Option = None; lhs.child_data() .iter() .zip(rhs.child_data()) - .all(|(lhs_values, rhs_values)| { + .enumerate() + .all(|(index, (lhs_values, rhs_values))| { // merge the null data let lhs_merged_nulls = match (lhs_nulls, lhs_values.null_buffer()) { (None, None) => None, @@ -55,21 +56,24 @@ fn equal_values( temp_lhs.as_ref() } }; - let rhs_merged_nulls = match (rhs_nulls, rhs_values.null_buffer()) { - (None, None) => None, - (None, Some(c)) => Some(c), - (Some(p), None) => Some(p), - (Some(p), Some(c)) => { - let merged = (p & c).unwrap(); - temp_rhs = Some(merged); - temp_rhs.as_ref() - } - }; + // TODO: this is intentional, looking at which is the better option + let rhs_merged_nulls = + rhs.child_logical_null_buffer(rhs_nulls.cloned(), index); + // let rhs_merged_nulls = match (rhs_nulls, rhs_values.null_buffer()) { + // (None, None) => None, + // (None, Some(c)) => Some(c), + // (Some(p), None) => Some(p), + // (Some(p), Some(c)) => { + // let merged = (p & c).unwrap(); + // temp_rhs = Some(merged); + // temp_rhs.as_ref() + // } + // }; equal_range( lhs_values, rhs_values, lhs_merged_nulls, - rhs_merged_nulls, + rhs_merged_nulls.as_ref(), lhs_start, rhs_start, len, diff --git a/rust/datafusion/tests/sql.rs b/rust/datafusion/tests/sql.rs index d250774a34a..87b155a902d 100644 --- a/rust/datafusion/tests/sql.rs +++ b/rust/datafusion/tests/sql.rs @@ -132,6 +132,7 @@ async fn parquet_single_nan_schema() { } #[tokio::test] +#[ignore = "Test ignored, will be enabled as part of the nested Parquet reader"] async fn parquet_list_columns() { let mut ctx = ExecutionContext::new(); let testdata = arrow::util::test_util::parquet_test_data(); From 4c4ac46335b41e12f65b7475889c82f1c1e03b39 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Mon, 4 Jan 2021 20:36:08 +0200 Subject: [PATCH 2/6] update renamed functions --- rust/arrow/src/array/data.rs | 8 ++++---- rust/arrow/src/array/equal/boolean.rs | 4 ++-- rust/arrow/src/array/equal/decimal.rs | 4 ++-- rust/arrow/src/array/equal/dictionary.rs | 4 ++-- rust/arrow/src/array/equal/fixed_binary.rs | 4 ++-- rust/arrow/src/array/equal/fixed_list.rs | 4 ++-- rust/arrow/src/array/equal/list.rs | 4 ++-- rust/arrow/src/array/equal/primitive.rs | 4 ++-- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/rust/arrow/src/array/data.rs b/rust/arrow/src/array/data.rs index 887a22e5d50..355e1df0463 100644 --- a/rust/arrow/src/array/data.rs +++ b/rust/arrow/src/array/data.rs @@ -188,7 +188,7 @@ impl ArrayData { let bitmap_len = bit_util::ceil(array_len * len, 8); let mut buffer = MutableBuffer::new(bitmap_len).with_bitset(bitmap_len, false); - let mut null_slice = buffer.data_mut(); + let mut null_slice = buffer.as_slice_mut(); (array_offset..array_len + array_offset).for_each(|index| { let start = index * len; let end = start + len; @@ -199,7 +199,7 @@ impl ArrayData { } }); }); - Some(buffer.freeze()) + Some(buffer.into()) } DataType::Struct(_) => (&parent_bitmap & &self_null_bitmap) .ok() @@ -420,7 +420,7 @@ fn logical_list_bitmap( let offset_start = offsets.first().unwrap().to_usize().unwrap(); let offset_len = offsets.get(parent_data.len()).unwrap().to_usize().unwrap(); let mut buffer = MutableBuffer::new_null(offset_len - offset_start); - let mut null_slice = buffer.data_mut(); + let mut null_slice = buffer.as_slice_mut(); offsets .windows(2) @@ -437,7 +437,7 @@ fn logical_list_bitmap( } }); }); - buffer.freeze() + buffer.into() } #[cfg(test)] diff --git a/rust/arrow/src/array/equal/boolean.rs b/rust/arrow/src/array/equal/boolean.rs index 5aa48fcf556..ef241e86ba1 100644 --- a/rust/arrow/src/array/equal/boolean.rs +++ b/rust/arrow/src/array/equal/boolean.rs @@ -51,8 +51,8 @@ pub(super) fn boolean_equal( }) } else { // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs_nulls.as_ref().unwrap().data(); - let rhs_null_bytes = rhs_nulls.as_ref().unwrap().data(); + let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice(); + let rhs_null_bytes = rhs_nulls.as_ref().unwrap().as_slice(); (0..len).all(|i| { let lhs_pos = lhs_start + i; diff --git a/rust/arrow/src/array/equal/decimal.rs b/rust/arrow/src/array/equal/decimal.rs index a8c0659ceea..0924835b954 100644 --- a/rust/arrow/src/array/equal/decimal.rs +++ b/rust/arrow/src/array/equal/decimal.rs @@ -52,8 +52,8 @@ pub(super) fn decimal_equal( ) } else { // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs_nulls.as_ref().unwrap().data(); - let rhs_null_bytes = rhs_nulls.as_ref().unwrap().data(); + let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice(); + let rhs_null_bytes = rhs_nulls.as_ref().unwrap().as_slice(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; diff --git a/rust/arrow/src/array/equal/dictionary.rs b/rust/arrow/src/array/equal/dictionary.rs index df6dd292d9a..4f4c148e8f0 100644 --- a/rust/arrow/src/array/equal/dictionary.rs +++ b/rust/arrow/src/array/equal/dictionary.rs @@ -57,8 +57,8 @@ pub(super) fn dictionary_equal( }) } else { // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs_nulls.as_ref().unwrap().data(); - let rhs_null_bytes = rhs_nulls.as_ref().unwrap().data(); + let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice(); + let rhs_null_bytes = rhs_nulls.as_ref().unwrap().as_slice(); (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; diff --git a/rust/arrow/src/array/equal/fixed_binary.rs b/rust/arrow/src/array/equal/fixed_binary.rs index a0a99c0f7d4..57158311703 100644 --- a/rust/arrow/src/array/equal/fixed_binary.rs +++ b/rust/arrow/src/array/equal/fixed_binary.rs @@ -52,8 +52,8 @@ pub(super) fn fixed_binary_equal( ) } else { // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs_nulls.as_ref().unwrap().data(); - let rhs_null_bytes = rhs_nulls.as_ref().unwrap().data(); + let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice(); + let rhs_null_bytes = rhs_nulls.as_ref().unwrap().as_slice(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; diff --git a/rust/arrow/src/array/equal/fixed_list.rs b/rust/arrow/src/array/equal/fixed_list.rs index 72588fcead2..f5065bb2918 100644 --- a/rust/arrow/src/array/equal/fixed_list.rs +++ b/rust/arrow/src/array/equal/fixed_list.rs @@ -54,8 +54,8 @@ pub(super) fn fixed_list_equal( ) } else { // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs_nulls.as_ref().unwrap().data(); - let rhs_null_bytes = rhs_nulls.as_ref().unwrap().data(); + let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice(); + let rhs_null_bytes = rhs_nulls.as_ref().unwrap().as_slice(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; diff --git a/rust/arrow/src/array/equal/list.rs b/rust/arrow/src/array/equal/list.rs index c0fd5c01d13..3c0c405b4bd 100644 --- a/rust/arrow/src/array/equal/list.rs +++ b/rust/arrow/src/array/equal/list.rs @@ -141,8 +141,8 @@ pub(super) fn list_equal( ) } else { // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = child_lhs_nulls.as_ref().unwrap().data(); - let rhs_null_bytes = child_rhs_nulls.as_ref().unwrap().data(); + let lhs_null_bytes = child_lhs_nulls.as_ref().unwrap().as_slice(); + let rhs_null_bytes = child_rhs_nulls.as_ref().unwrap().as_slice(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; diff --git a/rust/arrow/src/array/equal/primitive.rs b/rust/arrow/src/array/equal/primitive.rs index 6bfd551fb58..ff061d16607 100644 --- a/rust/arrow/src/array/equal/primitive.rs +++ b/rust/arrow/src/array/equal/primitive.rs @@ -50,8 +50,8 @@ pub(super) fn primitive_equal( ) } else { // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs_nulls.as_ref().unwrap().data(); - let rhs_null_bytes = rhs_nulls.as_ref().unwrap().data(); + let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice(); + let rhs_null_bytes = rhs_nulls.as_ref().unwrap().as_slice(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; From a2941dde5aead6e895201cff4b68e4de630a1b83 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Mon, 4 Jan 2021 23:31:55 +0200 Subject: [PATCH 3/6] ignore clippy warning --- rust/arrow/src/array/equal/list.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/arrow/src/array/equal/list.rs b/rust/arrow/src/array/equal/list.rs index 3c0c405b4bd..bbe11e02b5c 100644 --- a/rust/arrow/src/array/equal/list.rs +++ b/rust/arrow/src/array/equal/list.rs @@ -46,6 +46,7 @@ fn lengths_equal(lhs: &[T], rhs: &[T]) -> bool { }) } +#[allow(clippy::too_many_arguments)] #[inline] fn offset_value_equal( lhs_values: &ArrayData, From 66bb98e240f9ca49a7abe2a00ce83a117a62b426 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Tue, 5 Jan 2021 08:43:47 +0200 Subject: [PATCH 4/6] fix integration test failures --- rust/arrow/src/array/data.rs | 39 ++++++++++++++++++++----- rust/arrow/src/array/equal/list.rs | 6 ++-- rust/arrow/src/array/equal/structure.rs | 16 +++------- 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/rust/arrow/src/array/data.rs b/rust/arrow/src/array/data.rs index 355e1df0463..942a29cdecd 100644 --- a/rust/arrow/src/array/data.rs +++ b/rust/arrow/src/array/data.rs @@ -162,8 +162,9 @@ impl ArrayData { // TODO: rationalise this logic, prefer not creating populated bitmaps, use Option matching // I found taking a Bitmap that's populated to be more convenient, but I was more concerned first // about accuracy, so I'll explore using Option<&Bitmap> directly. + let parent_len = self.len(); let parent_bitmap = logical_null_buffer.map(Bitmap::from).unwrap_or_else(|| { - let ceil = bit_util::ceil(self.len(), 8); + let ceil = bit_util::ceil(parent_len, 8); Bitmap::from(Buffer::from(vec![0b11111111; ceil])) }); let self_null_bitmap = child_data.null_bitmap().clone().unwrap_or_else(|| { @@ -183,13 +184,12 @@ impl ArrayData { )), DataType::FixedSizeList(_, len) => { let len = *len as usize; - let array_len = self.len(); let array_offset = self.offset(); - let bitmap_len = bit_util::ceil(array_len * len, 8); + let bitmap_len = bit_util::ceil(parent_len * len, 8); let mut buffer = MutableBuffer::new(bitmap_len).with_bitset(bitmap_len, false); let mut null_slice = buffer.as_slice_mut(); - (array_offset..array_len + array_offset).for_each(|index| { + (array_offset..parent_len + array_offset).for_each(|index| { let start = index * len; let end = start + len; let mask = parent_bitmap.is_set(index); @@ -201,9 +201,33 @@ impl ArrayData { }); Some(buffer.into()) } - DataType::Struct(_) => (&parent_bitmap & &self_null_bitmap) - .ok() - .map(|bitmap| bitmap.bits), + DataType::Struct(_) => { + // Arrow implementations are free to pad data, which can result in null buffers not + // having the same length. + // Rust bitwise comparisons will return an error if left AND right is performed on + // buffers of different length. + // This might be a valid case during integration testing, where we read Arrow arrays + // from IPC data, which has padding. + // + // We first perform a bitwise comparison, and if there is an error, we revert to a + // slower method that indexes into the buffers one-by-one. + let result = &parent_bitmap & &self_null_bitmap; + if let Ok(bitmap) = result { + return Some(bitmap.bits); + } + // slow path + let array_offset = self.offset(); + let mut buffer = MutableBuffer::new_null(parent_len); + let mut null_slice = buffer.as_slice_mut(); + (0..parent_len).for_each(|index| { + if parent_bitmap.is_set(index + array_offset) + && self_null_bitmap.is_set(index + array_offset) + { + bit_util::set_bit(&mut null_slice, index); + } + }); + Some(buffer.into()) + } DataType::Union(_) => { panic!("Logical equality not yet implemented for union arrays") } @@ -427,7 +451,6 @@ fn logical_list_bitmap( .enumerate() .take(offset_len - offset_start) .for_each(|(index, window)| { - dbg!(&window, index); let start = window[0].to_usize().unwrap(); let end = window[1].to_usize().unwrap(); let mask = parent_bitmap.is_set(index); diff --git a/rust/arrow/src/array/equal/list.rs b/rust/arrow/src/array/equal/list.rs index bbe11e02b5c..17dd893cdb2 100644 --- a/rust/arrow/src/array/equal/list.rs +++ b/rust/arrow/src/array/equal/list.rs @@ -141,9 +141,9 @@ pub(super) fn list_equal( .unwrap(), ) } else { - // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = child_lhs_nulls.as_ref().unwrap().as_slice(); - let rhs_null_bytes = child_rhs_nulls.as_ref().unwrap().as_slice(); + // get a ref of the parent null buffer bytes, to use in testing for nullness + let lhs_null_bytes = rhs_nulls.unwrap().as_slice(); + let rhs_null_bytes = rhs_nulls.unwrap().as_slice(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; diff --git a/rust/arrow/src/array/equal/structure.rs b/rust/arrow/src/array/equal/structure.rs index 141aecf02a2..ed7a4e28f53 100644 --- a/rust/arrow/src/array/equal/structure.rs +++ b/rust/arrow/src/array/equal/structure.rs @@ -37,7 +37,7 @@ fn equal_values( rhs_start: usize, len: usize, ) -> bool { - let mut temp_lhs: Option = None; + // let mut temp_lhs: Option = None; // let mut temp_rhs: Option = None; lhs.child_data() @@ -46,16 +46,8 @@ fn equal_values( .enumerate() .all(|(index, (lhs_values, rhs_values))| { // merge the null data - let lhs_merged_nulls = match (lhs_nulls, lhs_values.null_buffer()) { - (None, None) => None, - (None, Some(c)) => Some(c), - (Some(p), None) => Some(p), - (Some(p), Some(c)) => { - let merged = (p & c).unwrap(); - temp_lhs = Some(merged); - temp_lhs.as_ref() - } - }; + let lhs_merged_nulls = + lhs.child_logical_null_buffer(lhs_nulls.cloned(), index); // TODO: this is intentional, looking at which is the better option let rhs_merged_nulls = rhs.child_logical_null_buffer(rhs_nulls.cloned(), index); @@ -72,7 +64,7 @@ fn equal_values( equal_range( lhs_values, rhs_values, - lhs_merged_nulls, + lhs_merged_nulls.as_ref(), rhs_merged_nulls.as_ref(), lhs_start, rhs_start, From 93017a2e54aa6afa2925cd8ce000ad3473f25d6d Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Tue, 5 Jan 2021 09:32:16 +0200 Subject: [PATCH 5/6] address review comments --- rust/arrow/src/array/data.rs | 185 +---------------------- rust/arrow/src/array/equal/list.rs | 14 +- rust/arrow/src/array/equal/structure.rs | 23 +-- rust/arrow/src/array/equal/utils.rs | 188 +++++++++++++++++++++++- 4 files changed, 205 insertions(+), 205 deletions(-) diff --git a/rust/arrow/src/array/data.rs b/rust/arrow/src/array/data.rs index 942a29cdecd..ecfda3d5d5b 100644 --- a/rust/arrow/src/array/data.rs +++ b/rust/arrow/src/array/data.rs @@ -21,12 +21,11 @@ use std::mem; use std::sync::Arc; -use crate::buffer::{Buffer, MutableBuffer}; +use crate::buffer::Buffer; use crate::datatypes::DataType; -use crate::util::bit_util; use crate::{bitmap::Bitmap, datatypes::ArrowNativeType}; -use super::{equal::equal, OffsetSizeTrait}; +use super::equal::equal; #[inline] pub(crate) fn count_nulls( @@ -137,108 +136,6 @@ impl ArrayData { &self.null_bitmap } - /// Computes the logical validity bitmap of the array data using the - /// parent's array data. The parent should be a list or struct, else - /// the logical bitmap of the array is returned unaltered. - /// - /// Parent data is passed along with the parent's logical bitmap, as - /// nested arrays could have a logical bitmap different to the physical - /// one on the `ArrayData`. - /// - /// Safety - /// - /// As we index into [`ArrayData::child_data`], this function panics if - /// array data is not a nested type, as it will not have child data. - pub fn child_logical_null_buffer( - &self, - logical_null_buffer: Option, - child_index: usize, - ) -> Option { - // This function should only be called when having nested data types. - // However, as a convenience, we return the parent's logical buffer if - // we do not encounter a nested type. - let child_data = self.child_data().get(child_index).unwrap(); - - // TODO: rationalise this logic, prefer not creating populated bitmaps, use Option matching - // I found taking a Bitmap that's populated to be more convenient, but I was more concerned first - // about accuracy, so I'll explore using Option<&Bitmap> directly. - let parent_len = self.len(); - let parent_bitmap = logical_null_buffer.map(Bitmap::from).unwrap_or_else(|| { - let ceil = bit_util::ceil(parent_len, 8); - Bitmap::from(Buffer::from(vec![0b11111111; ceil])) - }); - let self_null_bitmap = child_data.null_bitmap().clone().unwrap_or_else(|| { - let ceil = bit_util::ceil(child_data.len(), 8); - Bitmap::from(Buffer::from(vec![0b11111111; ceil])) - }); - match self.data_type() { - DataType::List(_) => Some(logical_list_bitmap::( - self, - parent_bitmap, - self_null_bitmap, - )), - DataType::LargeList(_) => Some(logical_list_bitmap::( - self, - parent_bitmap, - self_null_bitmap, - )), - DataType::FixedSizeList(_, len) => { - let len = *len as usize; - let array_offset = self.offset(); - let bitmap_len = bit_util::ceil(parent_len * len, 8); - let mut buffer = - MutableBuffer::new(bitmap_len).with_bitset(bitmap_len, false); - let mut null_slice = buffer.as_slice_mut(); - (array_offset..parent_len + array_offset).for_each(|index| { - let start = index * len; - let end = start + len; - let mask = parent_bitmap.is_set(index); - (start..end).for_each(|child_index| { - if mask && self_null_bitmap.is_set(child_index) { - bit_util::set_bit(&mut null_slice, child_index); - } - }); - }); - Some(buffer.into()) - } - DataType::Struct(_) => { - // Arrow implementations are free to pad data, which can result in null buffers not - // having the same length. - // Rust bitwise comparisons will return an error if left AND right is performed on - // buffers of different length. - // This might be a valid case during integration testing, where we read Arrow arrays - // from IPC data, which has padding. - // - // We first perform a bitwise comparison, and if there is an error, we revert to a - // slower method that indexes into the buffers one-by-one. - let result = &parent_bitmap & &self_null_bitmap; - if let Ok(bitmap) = result { - return Some(bitmap.bits); - } - // slow path - let array_offset = self.offset(); - let mut buffer = MutableBuffer::new_null(parent_len); - let mut null_slice = buffer.as_slice_mut(); - (0..parent_len).for_each(|index| { - if parent_bitmap.is_set(index + array_offset) - && self_null_bitmap.is_set(index + array_offset) - { - bit_util::set_bit(&mut null_slice, index); - } - }); - Some(buffer.into()) - } - DataType::Union(_) => { - panic!("Logical equality not yet implemented for union arrays") - } - _ => { - panic!( - "Encountered array data with child data, but not a known nested type" - ) - } - } - } - /// Returns a reference to the null buffer of this array data. pub fn null_buffer(&self) -> Option<&Buffer> { self.null_bitmap().as_ref().map(|b| b.buffer_ref()) @@ -434,42 +331,13 @@ impl ArrayDataBuilder { } } -#[inline] -fn logical_list_bitmap( - parent_data: &ArrayData, - parent_bitmap: Bitmap, - child_bitmap: Bitmap, -) -> Buffer { - let offsets = parent_data.buffer::(0); - let offset_start = offsets.first().unwrap().to_usize().unwrap(); - let offset_len = offsets.get(parent_data.len()).unwrap().to_usize().unwrap(); - let mut buffer = MutableBuffer::new_null(offset_len - offset_start); - let mut null_slice = buffer.as_slice_mut(); - - offsets - .windows(2) - .enumerate() - .take(offset_len - offset_start) - .for_each(|(index, window)| { - let start = window[0].to_usize().unwrap(); - let end = window[1].to_usize().unwrap(); - let mask = parent_bitmap.is_set(index); - (start..end).for_each(|child_index| { - if mask && child_bitmap.is_set(child_index) { - bit_util::set_bit(&mut null_slice, child_index - offset_start); - } - }); - }); - buffer.into() -} - #[cfg(test)] mod tests { use super::*; use std::sync::Arc; - use crate::datatypes::{Field, ToByteSlice}; + use crate::datatypes::ToByteSlice; use crate::util::bit_util; #[test] @@ -593,51 +461,4 @@ mod tests { let count = count_nulls(null_buffer.as_ref(), 4, 8); assert_eq!(count, 3); } - - #[test] - fn test_logical_null_buffer() { - let child_data = ArrayData::builder(DataType::Int32) - .len(11) - .add_buffer(Buffer::from( - vec![1i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11].to_byte_slice(), - )) - .build(); - - let data = ArrayData::builder(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - false, - )))) - .len(7) - .add_buffer(Buffer::from(vec![0, 0, 3, 5, 6, 9, 10, 11].to_byte_slice())) - .null_bit_buffer(Buffer::from(vec![0b01011010])) - .add_child_data(child_data.clone()) - .build(); - - // Get the child logical null buffer. The child is non-nullable, but because the list has nulls, - // we expect the child to logically have some nulls, inherited from the parent: - // [1, 2, 3, null, null, 6, 7, 8, 9, null, 11] - let nulls = data.child_logical_null_buffer(data.null_buffer().cloned(), 0); - let expected = Some(Buffer::from(vec![0b11100111, 0b00000101])); - assert_eq!(nulls, expected); - - // test with offset - let data = ArrayData::builder(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - false, - )))) - .len(4) - .offset(3) - .add_buffer(Buffer::from(vec![0, 0, 3, 5, 6, 9, 10, 11].to_byte_slice())) - // the null_bit_buffer doesn't have an offset, i.e. cleared the 3 offset bits 0b[---]01011[010] - .null_bit_buffer(Buffer::from(vec![0b00001011])) - .add_child_data(child_data) - .build(); - - let nulls = data.child_logical_null_buffer(data.null_buffer().cloned(), 0); - - let expected = Some(Buffer::from(vec![0b00101111])); - assert_eq!(nulls, expected); - } } diff --git a/rust/arrow/src/array/equal/list.rs b/rust/arrow/src/array/equal/list.rs index 17dd893cdb2..7eeca5b90f7 100644 --- a/rust/arrow/src/array/equal/list.rs +++ b/rust/arrow/src/array/equal/list.rs @@ -22,7 +22,7 @@ use crate::{ util::bit_util::get_bit, }; -use super::equal_range; +use super::{equal_range, utils::child_logical_null_buffer}; fn lengths_equal(lhs: &[T], rhs: &[T]) -> bool { // invariant from `base_equal` @@ -122,8 +122,16 @@ pub(super) fn list_equal( let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); // compute the child logical bitmap - let child_lhs_nulls = lhs.child_logical_null_buffer(lhs_nulls.cloned(), 0); - let child_rhs_nulls = rhs.child_logical_null_buffer(rhs_nulls.cloned(), 0); + let child_lhs_nulls = child_logical_null_buffer( + lhs, + lhs_nulls.cloned(), + lhs.child_data().get(0).unwrap(), + ); + let child_rhs_nulls = child_logical_null_buffer( + rhs, + rhs_nulls.cloned(), + rhs.child_data().get(0).unwrap(), + ); if lhs_null_count == 0 && rhs_null_count == 0 { lengths_equal( diff --git a/rust/arrow/src/array/equal/structure.rs b/rust/arrow/src/array/equal/structure.rs index ed7a4e28f53..528ab80c511 100644 --- a/rust/arrow/src/array/equal/structure.rs +++ b/rust/arrow/src/array/equal/structure.rs @@ -19,7 +19,7 @@ use crate::{ array::data::count_nulls, array::ArrayData, buffer::Buffer, util::bit_util::get_bit, }; -use super::equal_range; +use super::{equal_range, utils::child_logical_null_buffer}; /// Compares the values of two [ArrayData] starting at `lhs_start` and `rhs_start` respectively /// for `len` slots. The null buffers `lhs_nulls` and `rhs_nulls` inherit parent nullability. @@ -37,30 +37,15 @@ fn equal_values( rhs_start: usize, len: usize, ) -> bool { - // let mut temp_lhs: Option = None; - // let mut temp_rhs: Option = None; - lhs.child_data() .iter() .zip(rhs.child_data()) - .enumerate() - .all(|(index, (lhs_values, rhs_values))| { + .all(|(lhs_values, rhs_values)| { // merge the null data let lhs_merged_nulls = - lhs.child_logical_null_buffer(lhs_nulls.cloned(), index); - // TODO: this is intentional, looking at which is the better option + child_logical_null_buffer(lhs, lhs_nulls.cloned(), lhs_values); let rhs_merged_nulls = - rhs.child_logical_null_buffer(rhs_nulls.cloned(), index); - // let rhs_merged_nulls = match (rhs_nulls, rhs_values.null_buffer()) { - // (None, None) => None, - // (None, Some(c)) => Some(c), - // (Some(p), None) => Some(p), - // (Some(p), Some(c)) => { - // let merged = (p & c).unwrap(); - // temp_rhs = Some(merged); - // temp_rhs.as_ref() - // } - // }; + child_logical_null_buffer(rhs, rhs_nulls.cloned(), rhs_values); equal_range( lhs_values, rhs_values, diff --git a/rust/arrow/src/array/equal/utils.rs b/rust/arrow/src/array/equal/utils.rs index 3ccc2450852..452e8ad789c 100644 --- a/rust/arrow/src/array/equal/utils.rs +++ b/rust/arrow/src/array/equal/utils.rs @@ -15,7 +15,11 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::data::count_nulls, array::ArrayData, buffer::Buffer, util::bit_util}; +use crate::array::{data::count_nulls, ArrayData, OffsetSizeTrait}; +use crate::bitmap::Bitmap; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::DataType; +use crate::util::bit_util; // whether bits along the positions are equal // `lhs_start`, `rhs_start` and `len` are _measured in bits_. @@ -76,3 +80,185 @@ pub(super) fn equal_len( ) -> bool { lhs_values[lhs_start..(lhs_start + len)] == rhs_values[rhs_start..(rhs_start + len)] } + +/// Computes the logical validity bitmap of the array data using the +/// parent's array data. The parent should be a list or struct, else +/// the logical bitmap of the array is returned unaltered. +/// +/// Parent data is passed along with the parent's logical bitmap, as +/// nested arrays could have a logical bitmap different to the physical +/// one on the `ArrayData`. +pub(super) fn child_logical_null_buffer( + parent_data: &ArrayData, + logical_null_buffer: Option, + child_data: &ArrayData, +) -> Option { + let parent_len = parent_data.len(); + let parent_bitmap = logical_null_buffer.map(Bitmap::from).unwrap_or_else(|| { + let ceil = bit_util::ceil(parent_len, 8); + Bitmap::from(Buffer::from(vec![0b11111111; ceil])) + }); + let self_null_bitmap = child_data.null_bitmap().clone().unwrap_or_else(|| { + let ceil = bit_util::ceil(child_data.len(), 8); + Bitmap::from(Buffer::from(vec![0b11111111; ceil])) + }); + match parent_data.data_type() { + DataType::List(_) => Some(logical_list_bitmap::( + parent_data, + parent_bitmap, + self_null_bitmap, + )), + DataType::LargeList(_) => Some(logical_list_bitmap::( + parent_data, + parent_bitmap, + self_null_bitmap, + )), + DataType::FixedSizeList(_, len) => { + let len = *len as usize; + let array_offset = parent_data.offset(); + let bitmap_len = bit_util::ceil(parent_len * len, 8); + let mut buffer = + MutableBuffer::new(bitmap_len).with_bitset(bitmap_len, false); + let mut null_slice = buffer.as_slice_mut(); + (array_offset..parent_len + array_offset).for_each(|index| { + let start = index * len; + let end = start + len; + let mask = parent_bitmap.is_set(index); + (start..end).for_each(|child_index| { + if mask && self_null_bitmap.is_set(child_index) { + bit_util::set_bit(&mut null_slice, child_index); + } + }); + }); + Some(buffer.into()) + } + DataType::Struct(_) => { + // Arrow implementations are free to pad data, which can result in null buffers not + // having the same length. + // Rust bitwise comparisons will return an error if left AND right is performed on + // buffers of different length. + // This might be a valid case during integration testing, where we read Arrow arrays + // from IPC data, which has padding. + // + // We first perform a bitwise comparison, and if there is an error, we revert to a + // slower method that indexes into the buffers one-by-one. + let result = &parent_bitmap & &self_null_bitmap; + if let Ok(bitmap) = result { + return Some(bitmap.bits); + } + // slow path + let array_offset = parent_data.offset(); + let mut buffer = MutableBuffer::new_null(parent_len); + let mut null_slice = buffer.as_slice_mut(); + (0..parent_len).for_each(|index| { + if parent_bitmap.is_set(index + array_offset) + && self_null_bitmap.is_set(index + array_offset) + { + bit_util::set_bit(&mut null_slice, index); + } + }); + Some(buffer.into()) + } + DataType::Union(_) => { + unimplemented!("Logical equality not yet implemented for union arrays") + } + DataType::Dictionary(_, _) => { + unimplemented!("Logical equality not yet implemented for nested dictionaries") + } + data_type => { + panic!("Data type {:?} is not a supported nested type", data_type) + } + } +} + +// Calculate a list child's logical bitmap/buffer +#[inline] +fn logical_list_bitmap( + parent_data: &ArrayData, + parent_bitmap: Bitmap, + child_bitmap: Bitmap, +) -> Buffer { + let offsets = parent_data.buffer::(0); + let offset_start = offsets.first().unwrap().to_usize().unwrap(); + let offset_len = offsets.get(parent_data.len()).unwrap().to_usize().unwrap(); + let mut buffer = MutableBuffer::new_null(offset_len - offset_start); + let mut null_slice = buffer.as_slice_mut(); + + offsets + .windows(2) + .enumerate() + .take(offset_len - offset_start) + .for_each(|(index, window)| { + let start = window[0].to_usize().unwrap(); + let end = window[1].to_usize().unwrap(); + let mask = parent_bitmap.is_set(index); + (start..end).for_each(|child_index| { + if mask && child_bitmap.is_set(child_index) { + bit_util::set_bit(&mut null_slice, child_index - offset_start); + } + }); + }); + buffer.into() +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::datatypes::{Field, ToByteSlice}; + + #[test] + fn test_logical_null_buffer() { + let child_data = ArrayData::builder(DataType::Int32) + .len(11) + .add_buffer(Buffer::from( + vec![1i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11].to_byte_slice(), + )) + .build(); + + let data = ArrayData::builder(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + false, + )))) + .len(7) + .add_buffer(Buffer::from(vec![0, 0, 3, 5, 6, 9, 10, 11].to_byte_slice())) + .null_bit_buffer(Buffer::from(vec![0b01011010])) + .add_child_data(child_data.clone()) + .build(); + + // Get the child logical null buffer. The child is non-nullable, but because the list has nulls, + // we expect the child to logically have some nulls, inherited from the parent: + // [1, 2, 3, null, null, 6, 7, 8, 9, null, 11] + let nulls = child_logical_null_buffer( + &data, + data.null_buffer().cloned(), + data.child_data().get(0).unwrap(), + ); + let expected = Some(Buffer::from(vec![0b11100111, 0b00000101])); + assert_eq!(nulls, expected); + + // test with offset + let data = ArrayData::builder(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + false, + )))) + .len(4) + .offset(3) + .add_buffer(Buffer::from(vec![0, 0, 3, 5, 6, 9, 10, 11].to_byte_slice())) + // the null_bit_buffer doesn't have an offset, i.e. cleared the 3 offset bits 0b[---]01011[010] + .null_bit_buffer(Buffer::from(vec![0b00001011])) + .add_child_data(child_data) + .build(); + + let nulls = child_logical_null_buffer( + &data, + data.null_buffer().cloned(), + data.child_data().get(0).unwrap(), + ); + + let expected = Some(Buffer::from(vec![0b00101111])); + assert_eq!(nulls, expected); + } +} From 01b28b51317820f279591a21ae51aeaf2768689a Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Tue, 5 Jan 2021 15:08:47 +0200 Subject: [PATCH 6/6] take borrowed reference of buffer --- rust/arrow/src/array/equal/list.rs | 14 ++++---------- rust/arrow/src/array/equal/structure.rs | 6 ++---- rust/arrow/src/array/equal/utils.rs | 17 ++++++++++------- 3 files changed, 16 insertions(+), 21 deletions(-) diff --git a/rust/arrow/src/array/equal/list.rs b/rust/arrow/src/array/equal/list.rs index 7eeca5b90f7..a7a6bd334c1 100644 --- a/rust/arrow/src/array/equal/list.rs +++ b/rust/arrow/src/array/equal/list.rs @@ -122,16 +122,10 @@ pub(super) fn list_equal( let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); // compute the child logical bitmap - let child_lhs_nulls = child_logical_null_buffer( - lhs, - lhs_nulls.cloned(), - lhs.child_data().get(0).unwrap(), - ); - let child_rhs_nulls = child_logical_null_buffer( - rhs, - rhs_nulls.cloned(), - rhs.child_data().get(0).unwrap(), - ); + let child_lhs_nulls = + child_logical_null_buffer(lhs, lhs_nulls, lhs.child_data().get(0).unwrap()); + let child_rhs_nulls = + child_logical_null_buffer(rhs, rhs_nulls, rhs.child_data().get(0).unwrap()); if lhs_null_count == 0 && rhs_null_count == 0 { lengths_equal( diff --git a/rust/arrow/src/array/equal/structure.rs b/rust/arrow/src/array/equal/structure.rs index 528ab80c511..8779a160460 100644 --- a/rust/arrow/src/array/equal/structure.rs +++ b/rust/arrow/src/array/equal/structure.rs @@ -42,10 +42,8 @@ fn equal_values( .zip(rhs.child_data()) .all(|(lhs_values, rhs_values)| { // merge the null data - let lhs_merged_nulls = - child_logical_null_buffer(lhs, lhs_nulls.cloned(), lhs_values); - let rhs_merged_nulls = - child_logical_null_buffer(rhs, rhs_nulls.cloned(), rhs_values); + let lhs_merged_nulls = child_logical_null_buffer(lhs, lhs_nulls, lhs_values); + let rhs_merged_nulls = child_logical_null_buffer(rhs, rhs_nulls, rhs_values); equal_range( lhs_values, rhs_values, diff --git a/rust/arrow/src/array/equal/utils.rs b/rust/arrow/src/array/equal/utils.rs index 452e8ad789c..a880527578f 100644 --- a/rust/arrow/src/array/equal/utils.rs +++ b/rust/arrow/src/array/equal/utils.rs @@ -90,14 +90,17 @@ pub(super) fn equal_len( /// one on the `ArrayData`. pub(super) fn child_logical_null_buffer( parent_data: &ArrayData, - logical_null_buffer: Option, + logical_null_buffer: Option<&Buffer>, child_data: &ArrayData, ) -> Option { let parent_len = parent_data.len(); - let parent_bitmap = logical_null_buffer.map(Bitmap::from).unwrap_or_else(|| { - let ceil = bit_util::ceil(parent_len, 8); - Bitmap::from(Buffer::from(vec![0b11111111; ceil])) - }); + let parent_bitmap = logical_null_buffer + .cloned() + .map(Bitmap::from) + .unwrap_or_else(|| { + let ceil = bit_util::ceil(parent_len, 8); + Bitmap::from(Buffer::from(vec![0b11111111; ceil])) + }); let self_null_bitmap = child_data.null_bitmap().clone().unwrap_or_else(|| { let ceil = bit_util::ceil(child_data.len(), 8); Bitmap::from(Buffer::from(vec![0b11111111; ceil])) @@ -232,7 +235,7 @@ mod tests { // [1, 2, 3, null, null, 6, 7, 8, 9, null, 11] let nulls = child_logical_null_buffer( &data, - data.null_buffer().cloned(), + data.null_buffer(), data.child_data().get(0).unwrap(), ); let expected = Some(Buffer::from(vec![0b11100111, 0b00000101])); @@ -254,7 +257,7 @@ mod tests { let nulls = child_logical_null_buffer( &data, - data.null_buffer().cloned(), + data.null_buffer(), data.child_data().get(0).unwrap(), );