Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 142 additions & 46 deletions rust/arrow/src/array/array_binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ use super::{
FixedSizeListArray, GenericBinaryIter, GenericListArray, OffsetSizeTrait,
};
use crate::buffer::Buffer;
use crate::error::ArrowError;
use crate::util::bit_util;
use crate::{buffer::MutableBuffer, datatypes::DataType};

Expand Down Expand Up @@ -364,62 +365,157 @@ impl FixedSizeBinaryArray {
self.data.buffers()[0].clone()
}

#[inline]
fn value_offset_at(&self, i: usize) -> i32 {
self.length * i as i32
}
}
/// Create an array from an iterable argument of sparse byte slices.
/// Sparsity means that items returned by the iterator are optional, i.e. the input argument can
/// contain `None` items.
///
/// # Examples
///
/// ```
/// use arrow::array::FixedSizeBinaryArray;
/// let input_arg = vec![
/// None,
/// Some(vec![7, 8]),
/// Some(vec![9, 10]),
/// None,
/// Some(vec![13, 14]),
/// None,
/// ];
/// let array = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap();
/// ```
///
/// # Errors
///
/// Returns an error if the argument has length zero, or if the sizes of the nested slices don't match.
pub fn try_from_sparse_iter<T, U>(mut iter: T) -> Result<Self, ArrowError>
where
T: Iterator<Item = Option<U>>,
U: AsRef<[u8]>,
{
let mut len = 0;
let mut size = None;
let mut byte = 0;
let mut null_buf = MutableBuffer::from_len_zeroed(0);
let mut buffer = MutableBuffer::from_len_zeroed(0);
let mut prepend = 0;
iter.try_for_each(|item| -> Result<(), ArrowError> {
// extend the null bitmask by one byte for every 8 items
if byte == 0 {
null_buf.push(0u8);
byte = 8;
}
byte -= 1;

if let Some(slice) = item {
let slice = slice.as_ref();
if let Some(size) = size {
if size != slice.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Nested array size mismatch: one is {}, and the other is {}",
size,
slice.len()
)));
}
} else {
size = Some(slice.len());
buffer.extend_zeros(slice.len() * prepend);
}
bit_util::set_bit(null_buf.as_slice_mut(), len);
buffer.extend_from_slice(slice);
} else {
if let Some(size) = size {
buffer.extend_zeros(size);
} else {
prepend += 1;
}
}

impl From<Vec<Vec<u8>>> for FixedSizeBinaryArray {
fn from(data: Vec<Vec<u8>>) -> Self {
let len = data.len();
assert!(len > 0);
let size = data[0].len();
assert!(data.iter().all(|item| item.len() == size));
let data = data.into_iter().flatten().collect::<Vec<_>>();
let array_data = ArrayData::builder(DataType::FixedSizeBinary(size as i32))
.len(len)
.add_buffer(Buffer::from(&data))
.build();
FixedSizeBinaryArray::from(array_data)
}
}
len += 1;

impl From<Vec<Option<Vec<u8>>>> for FixedSizeBinaryArray {
fn from(data: Vec<Option<Vec<u8>>>) -> Self {
let len = data.len();
assert!(len > 0);
// try to estimate the size. This may not be possible if no entry is valid => panic
let size = data.iter().filter_map(|e| e.as_ref()).next().unwrap().len();
assert!(data
.iter()
.filter_map(|e| e.as_ref())
.all(|item| item.len() == size));

let num_bytes = bit_util::ceil(len, 8);
let mut null_buf = MutableBuffer::from_len_zeroed(num_bytes);
let null_slice = null_buf.as_slice_mut();

data.iter().enumerate().for_each(|(i, entry)| {
if entry.is_some() {
bit_util::set_bit(null_slice, i);
}
});
Ok(())
})?;

let data = data
.into_iter()
.flat_map(|e| e.unwrap_or_else(|| vec![0; size]))
.collect::<Vec<_>>();
let data = ArrayData::new(
if len == 0 {
return Err(ArrowError::InvalidArgumentError(
"Input iterable argument has no data".to_owned(),
));
}

let size = size.unwrap_or(0);
let array_data = ArrayData::new(
DataType::FixedSizeBinary(size as i32),
len,
None,
Some(null_buf.into()),
0,
vec![Buffer::from(&data)],
vec![buffer.into()],
vec![],
);
FixedSizeBinaryArray::from(Arc::new(data))
Ok(FixedSizeBinaryArray::from(Arc::new(array_data)))
}

/// Create an array from an iterable argument of byte slices.
///
/// # Examples
///
/// ```
/// use arrow::array::FixedSizeBinaryArray;
/// let input_arg = vec![
/// vec![1, 2],
/// vec![3, 4],
/// vec![5, 6],
/// ];
/// let array = FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap();
/// ```
///
/// # Errors
///
/// Returns an error if the argument has length zero, or if the sizes of the nested slices don't match.
pub fn try_from_iter<T, U>(mut iter: T) -> Result<Self, ArrowError>
where
T: Iterator<Item = U>,
U: AsRef<[u8]>,
{
let mut len = 0;
let mut size = None;
let mut buffer = MutableBuffer::from_len_zeroed(0);
iter.try_for_each(|item| -> Result<(), ArrowError> {
let slice = item.as_ref();
if let Some(size) = size {
if size != slice.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Nested array size mismatch: one is {}, and the other is {}",
size,
slice.len()
)));
}
} else {
size = Some(slice.len());
}
buffer.extend_from_slice(slice);

len += 1;

Ok(())
})?;

if len == 0 {
return Err(ArrowError::InvalidArgumentError(
"Input iterable argument has no data".to_owned(),
));
}

let size = size.unwrap_or(0);
let array_data = ArrayData::builder(DataType::FixedSizeBinary(size as i32))
.len(len)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider an iterator of size 4 whose every element is Some(&[0, 0, 0]).

I think that len is being increased by 1 for every group of size size. In the example above, won't len be equal to 4 instead of 12?

I think that this should be len * size

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it will be equal to 4. But in the previous implementation (for example, impl From<Vec<Vec<u8>>> for FixedSizeBinaryArray), when we pass a vec with 4 items to from, it puts 4 as len in the array builder. Isn't it the same there?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right (always learning something new here). In FixedSizeBinary[3], each item is [a, b, c], pretty much like in a ListArray, and thus the len should be 4, not 12. So, the invariant is actually the other way around:

array.buffer()[0].len() % size == 0
array.len() * size == array.buffer()[0].len()
array.validity().len() == array.len()

Sorry about the noise and thank you for your patience 🥇

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great work you do 👍

.add_buffer(buffer.into())
.build();
Ok(FixedSizeBinaryArray::from(array_data))
}

/// Returns the offset of the `i`-th value, computed as `self.length * (i as i32)`.
///
/// NOTE(review): `self.length` is presumably the fixed byte width of each value; confirm
/// against the struct definition, which is not visible in this view.
#[inline]
fn value_offset_at(&self, i: usize) -> i32 {
// `as` binds tighter than `*`, so `i` is cast to `i32` (unchecked) before the multiplication.
self.length * i as i32
}
}

Expand Down
29 changes: 21 additions & 8 deletions rust/arrow/src/array/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -834,8 +834,11 @@ mod tests {

#[test]
fn test_binary_fixed_sized_offsets() {
let array =
FixedSizeBinaryArray::from(vec![vec![0, 0], vec![0, 1], vec![0, 2]]).data();
let array = FixedSizeBinaryArray::try_from_iter(
vec![vec![0, 0], vec![0, 1], vec![0, 2]].into_iter(),
)
.expect("Failed to create FixedSizeBinaryArray from iterable")
.data();
let array = array.slice(1, 2);
// = [[0, 1], [0, 2]] due to the offset = 1

Expand All @@ -849,7 +852,9 @@ mod tests {
let result = mutable.freeze();
let result = FixedSizeBinaryArray::from(Arc::new(result));

let expected = FixedSizeBinaryArray::from(vec![vec![0, 2], vec![0, 1]]);
let expected =
FixedSizeBinaryArray::try_from_iter(vec![vec![0, 2], vec![0, 1]].into_iter())
.expect("Failed to create FixedSizeBinaryArray from iterable");
assert_eq!(result, expected);
}

Expand Down Expand Up @@ -1077,16 +1082,21 @@ mod tests {
#[test]
fn test_fixed_size_binary_append() {
let a = vec![Some(vec![1, 2]), Some(vec![3, 4]), Some(vec![5, 6])];
let a = FixedSizeBinaryArray::from(a).data();
let a = FixedSizeBinaryArray::try_from_sparse_iter(a.into_iter())
.expect("Failed to create FixedSizeBinaryArray from iterable")
.data();

let b = vec![
None,
Some(vec![7, 8]),
Some(vec![9, 10]),
None,
Some(vec![13, 14]),
None,
];
let b = FixedSizeBinaryArray::from(b).data();
let b = FixedSizeBinaryArray::try_from_sparse_iter(b.into_iter())
.expect("Failed to create FixedSizeBinaryArray from iterable")
.data();

let mut mutable = MutableArrayData::new(vec![a.as_ref(), b.as_ref()], false, 10);

Expand All @@ -1103,20 +1113,23 @@ mod tests {
Some(vec![3, 4]),
Some(vec![5, 6]),
// b
None,
Some(vec![7, 8]),
Some(vec![9, 10]),
None,
Some(vec![13, 14]),
None,
// b[1..4]
Some(vec![7, 8]),
Some(vec![9, 10]),
None,
Some(vec![13, 14]),
// b[2..3]
None,
Some(vec![9, 10]),
// b[4..4]
];
let expected = FixedSizeBinaryArray::from(expected).data();
let expected = FixedSizeBinaryArray::try_from_sparse_iter(expected.into_iter())
.expect("Failed to create FixedSizeBinaryArray from iterable")
.data();
assert_eq!(&result, expected.as_ref());
}

Expand Down