diff --git a/rust/arrow/src/buffer.rs b/rust/arrow/src/buffer.rs index aa1d7fe7f56..703a67a0aa2 100644 --- a/rust/arrow/src/buffer.rs +++ b/rust/arrow/src/buffer.rs @@ -38,6 +38,18 @@ use crate::util::bit_util; #[cfg(feature = "simd")] use std::borrow::BorrowMut; +/// Used to trim offset bits +const TRIM_OFFSET: [u8; 8] = [ + 0b00000001, 0b00000011, 0b00000111, 0b00001111, 0b00011111, 0b00111111, 0b01111111, + 0b11111111, +]; + +/// Used to trim trailing bits +const TRIM_TRAIL: [u8; 8] = [ + 0b10000000, 0b11000000, 0b11100000, 0b11110000, 0b11111000, 0b11111100, 0b11111110, + 0b11111111, +]; + /// Buffer is a contiguous memory region of fixed size and is aligned at a 64-byte /// boundary. Buffer is immutable. #[derive(PartialEq, Debug)] @@ -227,6 +239,62 @@ impl Buffer { } } + /// Returns a slice of this buffer, starting from `offset` and with a length of `len`. + /// + /// Note that this is mostly useful when the offset is not a multiple of 8, as a + /// way of realigning the buffer. + pub(crate) fn slice_with_len(&self, offset: usize, len: usize) -> Self { + let mut arr = self.data().to_vec(); + let mut arr: &mut [u8] = arr.as_mut(); + let offset = self.offset + offset; + // shift by whole bytes + let pop_by = offset / 8; + let offset_rem = offset % 8; + + let offset_diff = 8 - offset_rem; + arr = &mut arr[pop_by..]; + let shift_by = bit_util::ceil(offset_rem + len, 8); + arr = &mut arr[..shift_by]; + + let mut left = 0u8; + // determine the length of the output buffer + let total_rem = (len + offset_rem) % 8; + + // remove trailing bits at end of bytes + if total_rem > 0 { + if let Some(v) = arr.iter_mut().last() { + *v &= TRIM_OFFSET[total_rem - 1]; + } + } + + // shorter route if offset is div by 8 + if offset_rem == 0 { + return Buffer::from(&arr[0..bit_util::ceil(len, 8)]); + } else { + // reverse array to enable carrying over + arr.reverse(); + if let Some(v) = arr.iter_mut().last() { + *v &= TRIM_TRAIL[offset_diff - 1]; + } + } + + // replace bits from each byte in reverse order + arr.iter_mut().for_each(|byte| { + // assign the bits that we'll copy + let new_left = *byte & TRIM_OFFSET[offset_rem - 1]; + // shift bytes to remove offset + *byte >>= offset_rem; + // add the bits copied from the previous byte + *byte |= left; + // shift the bytes we're copying to the next byte + left = new_left << offset_diff; + }); + + arr.reverse(); + + Buffer::from(&arr[0..bit_util::ceil(len, 8)]) + } + /// Returns a raw pointer for this buffer. /// /// Note that this should be used cautiously, and the returned pointer should not be @@ -896,6 +964,30 @@ mod tests { let _buf3 = (&buf1 | &buf2).unwrap(); } + #[test] + fn test_buffer_slice_with_len() { + let buf1 = Buffer::from([0b11111111, 0b11111111]); + let buf2 = buf1.slice_with_len(3, 10); + let buf3 = Buffer::from([0b11111111, 0b00000011]); + assert_eq!(buf2, buf3); + + let buf1 = Buffer::from([0b01001110, 0b01001111]); + let buf2 = buf1.slice_with_len(4, 9); + let buf3 = Buffer::from([0b11110100, 0b00000000]); + assert_eq!(buf2, buf3); + + let buf1 = Buffer::from([0b01001110, 0b01001111, 0b11001111]); + let buf2 = buf1.slice_with_len(7, 17); + let buf3 = Buffer::from([0b10011110, 0b10011110, 0b00000001]); + assert_eq!(buf2, buf3); + + // this should remove the first byte, and mask the 24th bit + let buf1 = Buffer::from([0b01001110, 0b01001111, 0b11111111]); + let buf2 = buf1.slice_with_len(0, 17); + let buf3 = Buffer::from([0b01001110, 0b01001111, 0b00000001]); + assert_eq!(buf2, buf3); + } + #[test] fn test_mutable_new() { let buf = MutableBuffer::new(63); diff --git a/rust/arrow/src/compute/kernels/boolean.rs b/rust/arrow/src/compute/kernels/boolean.rs index e9134095b92..7f0aeae3e5a 100644 --- a/rust/arrow/src/compute/kernels/boolean.rs +++ b/rust/arrow/src/compute/kernels/boolean.rs @@ -46,21 +46,30 @@ where )); } - if left.offset() % 8 != 0 || right.offset() % 8 != 0 { - return Err(ArrowError::ComputeError( - "Cannot perform bitwise operation when offsets are not byte-aligned." - .to_string(), - )); - } - let left_data = left.data_ref(); let right_data = right.data_ref(); let null_bit_buffer = combine_option_bitmap(&left_data, &right_data, left.len())?; - let left_buffer = &left_data.buffers()[0]; - let right_buffer = &right_data.buffers()[0]; - let left_offset = &left.offset() / 8; - let right_offset = &right.offset() / 8; + let mut left_offset = &left.offset() / 8; + let mut right_offset = &right.offset() / 8; + + // A BooleanArray Buffer does not carry the data's offset, thus we offset with data's offset + let left_buffer = if left.offset() % 8 != 0 { + let b = &left_data.buffers()[0].clone(); + let sliced = b.slice_with_len(left.offset(), left.len()); + left_offset = 0; + sliced + } else { + left_data.buffers()[0].clone() + }; + let right_buffer = if right.offset() % 8 != 0 { + let b = &right_data.buffers()[0].clone(); + let sliced = b.slice_with_len(right.offset(), right.len()); + right_offset = 0; + sliced + } else { + right_data.buffers()[0].clone() + }; let len = ceil(left.len(), 8); @@ -93,14 +102,7 @@ pub fn or(left: &BooleanArray, right: &BooleanArray) -> Result { /// Performs unary `NOT` operation on an arrays. If value is null then the result is also /// null. pub fn not(left: &BooleanArray) -> Result { - if left.offset() % 8 != 0 { - return Err(ArrowError::ComputeError( - "Cannot perform bitwise operation when offsets are not byte-aligned." - .to_string(), - )); - } - - let left_offset = left.offset() / 8; + let mut left_offset = left.offset() / 8; let len = ceil(left.len(), 8); let data = left.data_ref(); @@ -109,7 +111,16 @@ pub fn not(left: &BooleanArray) -> Result { .as_ref() .map(|b| b.bits.slice(left_offset)); - let values = buffer_unary_not(&data.buffers()[0], left_offset, len); + let buf = if left.offset() % 8 != 0 { + let b = &data.buffers()[0].clone(); + let sliced = b.slice_with_len(left.offset(), left.len()); + left_offset = 0; + sliced + } else { + data.buffers()[0].clone() + }; + + let values = buffer_unary_not(&buf, left_offset, len); let data = ArrayData::new( DataType::Boolean, @@ -171,6 +182,19 @@ mod tests { assert_eq!(false, c.value(3)); } + #[test] + fn test_bool_array_not_sliced() { + let a = + BooleanArray::from(vec![true, true, false, false, false, false, true, true]); + let a = a.slice(4, 4); + let a = a.as_any().downcast_ref::().unwrap(); + let c = not(&a).unwrap(); + assert_eq!(true, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(false, c.value(2)); + assert_eq!(false, c.value(3)); + } + #[test] fn test_bool_array_and_nulls() { let a = BooleanArray::from(vec![None, Some(false), None, Some(false)]); @@ -182,6 +206,32 @@ mod tests { assert_eq!(false, c.is_null(3)); } + #[test] + fn test_bool_array_and_nulls_and_offsets() { + let a = + BooleanArray::from(vec![Some(true), None, Some(false), None, Some(false)]); + let a = a.slice(1, 4); + let a = a.as_any().downcast_ref::().unwrap(); + + let b = BooleanArray::from(vec![ + None, + Some(false), + None, + None, + None, + Some(false), + Some(false), + ]); + let b = b.slice(3, 4); + let b = b.as_any().downcast_ref::().unwrap(); + + let c = and(&a, &b).unwrap(); + assert_eq!(true, c.is_null(0)); + assert_eq!(true, c.is_null(1)); + assert_eq!(true, c.is_null(2)); + assert_eq!(false, c.is_null(3)); + } + #[test] fn test_bool_array_and_sliced_same_offset() { let a = BooleanArray::from(vec![ @@ -233,12 +283,12 @@ mod tests { #[test] fn test_bool_array_and_sliced_offset1() { let a = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, false, true, - true, + false, false, false, false, false, false, false, false, false, false, false, + true, true, ]); let b = BooleanArray::from(vec![false, true, false, true]); - let a = a.slice(8, 4); + let a = a.slice(9, 4); let a = a.as_any().downcast_ref::().unwrap(); let c = and(&a, &b).unwrap(); @@ -253,18 +303,17 @@ mod tests { fn test_bool_array_and_sliced_offset2() { let a = BooleanArray::from(vec![false, false, true, true]); let b = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, true, false, - true, + true, true, true, false, false, false, false, false, false, true, false, true, ]); - let b = b.slice(8, 4); + let b = b.slice(7, 4); let b = b.as_any().downcast_ref::().unwrap(); let c = and(&a, &b).unwrap(); assert_eq!(4, c.len()); assert_eq!(false, c.value(0)); assert_eq!(false, c.value(1)); - assert_eq!(false, c.value(2)); - assert_eq!(true, c.value(3)); + assert_eq!(true, c.value(2)); + assert_eq!(false, c.value(3)); } } diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs index e8f331223b0..eaa4764d5bd 100644 --- a/rust/arrow/src/compute/kernels/take.rs +++ b/rust/arrow/src/compute/kernels/take.rs @@ -225,10 +225,8 @@ fn take_boolean(values: &ArrayRef, indices: &UInt32Array) -> Result { let index = indices.value(i) as usize; if array.is_null(index) { bit_util::unset_bit(null_slice, i); - } else { - if array.value(index) { - bit_util::set_bit(val_slice, i); - } + } else if array.value(index) { + bit_util::set_bit(val_slice, i); } }); diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs index 85b6296ecd7..1ea10f7ef5b 100644 --- a/rust/arrow/src/compute/util.rs +++ b/rust/arrow/src/compute/util.rs @@ -23,7 +23,7 @@ use crate::bitmap::Bitmap; use crate::buffer::{buffer_bin_and, Buffer}; #[cfg(feature = "simd")] use crate::datatypes::*; -use crate::error::{ArrowError, Result}; +use crate::error::Result; use crate::util::bit_util::ceil; #[cfg(feature = "simd")] use num::One; @@ -38,19 +38,34 @@ pub(super) fn combine_option_bitmap( right_data: &ArrayDataRef, len_in_bits: usize, ) -> Result> { - let left_offset_in_bits = left_data.offset(); - let right_offset_in_bits = right_data.offset(); + let mut left_offset_in_bits = left_data.offset(); + let mut right_offset_in_bits = right_data.offset(); + let left_rem = left_offset_in_bits % 8; + let right_rem = right_offset_in_bits % 8; + + // check if offsets are aligned, and align as necessary let left = left_data.null_buffer(); let right = right_data.null_buffer(); - if (left.is_some() && left_offset_in_bits % 8 != 0) - || (right.is_some() && right_offset_in_bits % 8 != 0) - { - return Err(ArrowError::ComputeError( - "Cannot combine option bitmaps that are not byte-aligned.".to_string(), - )); - } + // if there are remainders, slice the buffers to realign them + let left = if left.is_some() && left_rem > 0 { + let b = left.cloned().unwrap(); + let sliced = b.slice_with_len(left_offset_in_bits, len_in_bits); + left_offset_in_bits = 0; + Some(sliced) + } else { + left.cloned() + }; + + let right = if right.is_some() && right_rem > 0 { + let b = right.cloned().unwrap(); + let sliced = b.slice_with_len(right_offset_in_bits, len_in_bits); + right_offset_in_bits = 0; + Some(sliced) + } else { + right.cloned() + }; let left_offset = left_offset_in_bits / 8; let right_offset = right_offset_in_bits / 8; diff --git a/rust/arrow/src/util/bit_util.rs b/rust/arrow/src/util/bit_util.rs index d8ffa6f19c5..8a11341ebde 100644 --- a/rust/arrow/src/util/bit_util.rs +++ b/rust/arrow/src/util/bit_util.rs @@ -351,9 +351,9 @@ mod tests { unsafe { set_bits_raw(buf.as_mut_ptr(), start, end); } - for i in start..end { + (start..end).for_each(|i| { expected[i] = true; - } + }); } let raw_ptr = buf.as_ptr();