From 041b9b93858d170f35d1d4fd3c0c9af55a68d84d Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sun, 6 Sep 2020 09:36:03 +0200 Subject: [PATCH 1/4] Added benchmark. --- rust/arrow/benches/array_from_vec.rs | 82 ++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/rust/arrow/benches/array_from_vec.rs b/rust/arrow/benches/array_from_vec.rs index ec9356f8f0a..0f73c1d69dc 100644 --- a/rust/arrow/benches/array_from_vec.rs +++ b/rust/arrow/benches/array_from_vec.rs @@ -48,6 +48,68 @@ fn array_string_from_vec(n: usize) { criterion::black_box(StringArray::from(v)); } +fn struct_array_values(n: usize) -> (Field, Vec>, Field, Vec>) { + let mut strings: Vec> = Vec::with_capacity(n); + let mut ints: Vec> = Vec::with_capacity(n); + for _ in 0..n / 4 { + strings.extend_from_slice(&[Some("joe"), None, None, Some("mark")]); + ints.extend_from_slice(&[Some(1), Some(2), None, Some(4)]); + } + (Field::new("f1", DataType::Utf8, false), + strings, Field::new("f2", DataType::Int32, false), ints) +} + +fn struct_array_from_vec(field1: &Field, strings: &Vec>, field2: &Field, ints: &Vec>) { + + criterion::black_box({ + let len = strings.len(); + // this cheats a bit, as the compiler knows the that they will be strings and i32, but well + let string_builder = StringBuilder::new(len); + let int_builder = Int32Builder::new(len); + + let mut fields = Vec::new(); + let mut field_builders = Vec::new(); + fields.push(field1.clone()); + field_builders.push(Box::new(string_builder) as Box); + fields.push(field2.clone()); + field_builders.push(Box::new(int_builder) as Box); + + let mut builder = StructBuilder::new(fields, field_builders); + assert_eq!(2, builder.num_fields()); + + let string_builder = builder + .field_builder::(0) + .expect("builder at field 0 should be string builder"); + for string in strings { + if string.is_some() { + string_builder.append_value(string.unwrap()).unwrap(); + } else { + string_builder.append_null().unwrap(); + } + } + + let int_builder = builder + .field_builder::(1) + .expect("builder at field 1 should be int builder"); + for int in ints { + if int.is_some() { + int_builder.append_value(int.unwrap()).unwrap(); + } else { + int_builder.append_null().unwrap(); + } + } + + for _ in 0..len / 4 { + builder.append(true).unwrap(); + builder.append(true).unwrap(); + builder.append_null().unwrap(); + builder.append(true).unwrap(); + } + + builder.finish(); + }); +} + fn criterion_benchmark(c: &mut Criterion) { c.bench_function("array_from_vec 128", |b| b.iter(|| array_from_vec(128))); c.bench_function("array_from_vec 256", |b| b.iter(|| array_from_vec(256))); @@ -62,6 +124,26 @@ fn criterion_benchmark(c: &mut Criterion) { c.bench_function("array_string_from_vec 512", |b| { b.iter(|| array_string_from_vec(512)) }); + + let (field1, strings, field2, ints) = struct_array_values(128); + c.bench_function("struct_array_from_vec 128", |b| { + b.iter(|| struct_array_from_vec(&field1, &strings, &field2, &ints)) + }); + + let (field1, strings, field2, ints) = struct_array_values(256); + c.bench_function("struct_array_from_vec 256", |b| { + b.iter(|| struct_array_from_vec(&field1, &strings, &field2, &ints)) + }); + + let (field1, strings, field2, ints) = struct_array_values(512); + c.bench_function("struct_array_from_vec 512", |b| { + b.iter(|| struct_array_from_vec(&field1, &strings, &field2, &ints)) + }); + + let (field1, strings, field2, ints) = struct_array_values(1024); + c.bench_function("struct_array_from_vec 1024", |b| { + b.iter(|| struct_array_from_vec(&field1, &strings, &field2, &ints)) + }); } criterion_group!(benches, criterion_benchmark); From 8d31dd59007f0d62f4152113cd4a44b64d324e18 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sat, 19 Sep 2020 18:02:09 +0200 Subject: [PATCH 2/4] Added StructArray::TryFrom. --- rust/arrow/src/array/array.rs | 181 +++++++++++++++++++++++++++++++--- rust/arrow/src/array/equal.rs | 121 ++++++++--------------- 2 files changed, 205 insertions(+), 97 deletions(-) diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs index 6d56a5aeef5..f2bddb0e61e 100644 --- a/rust/arrow/src/array/array.rs +++ b/rust/arrow/src/array/array.rs @@ -16,7 +16,7 @@ // under the License. use std::any::Any; -use std::convert::From; +use std::convert::{From, TryFrom}; use std::fmt; use std::io::Write; use std::iter::{FromIterator, IntoIterator}; @@ -28,11 +28,14 @@ use chrono::prelude::*; use super::*; use crate::array::builder::StringDictionaryBuilder; use crate::array::equal::JsonEqual; -use crate::buffer::{Buffer, MutableBuffer}; +use crate::buffer::{buffer_bin_or, Buffer, MutableBuffer}; use crate::datatypes::DataType::Struct; use crate::datatypes::*; use crate::memory; -use crate::util::bit_util; +use crate::{ + error::{ArrowError, Result}, + util::bit_util, +}; /// Number of seconds in a day const SECONDS_IN_DAY: i64 = 86_400; @@ -360,6 +363,13 @@ fn slice_data(data: &ArrayDataRef, mut offset: usize, length: usize) -> ArrayDat Arc::new(new_data) } +// creates a new MutableBuffer initializes all falsed +// this is useful to populate null bitmaps +fn make_null_buffer(len: usize) -> MutableBuffer { + let num_bytes = bit_util::ceil(len, 8); + MutableBuffer::new(num_bytes).with_bitset(num_bytes, false) +} + /// ---------------------------------------------------------------------------- /// Implementations of different array types @@ -703,9 +713,7 @@ macro_rules! def_numeric_from_vec { { fn from(data: Vec::Native>>) -> Self { let data_len = data.len(); - let num_bytes = bit_util::ceil(data_len, 8); - let mut null_buf = - MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); + let mut null_buf = make_null_buffer(data_len); let mut val_buf = MutableBuffer::new( data_len * mem::size_of::<<$ty as ArrowPrimitiveType>::Native>(), ); @@ -780,8 +788,7 @@ impl PrimitiveArray { pub fn from_opt_vec(data: Vec>, timezone: Option>) -> Self { // TODO: duplicated from def_numeric_from_vec! macro, it looks possible to convert to generic let data_len = data.len(); - let num_bytes = bit_util::ceil(data_len, 8); - let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); + let mut null_buf = make_null_buffer(data_len); let mut val_buf = MutableBuffer::new(data_len * mem::size_of::()); { @@ -812,8 +819,7 @@ impl PrimitiveArray { /// Constructs a boolean array from a vector. Should only be used for testing. impl From> for BooleanArray { fn from(data: Vec) -> Self { - let num_byte = bit_util::ceil(data.len(), 8); - let mut mut_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, false); + let mut mut_buf = make_null_buffer(data.len()); { let mut_slice = mut_buf.data_mut(); for (i, b) in data.iter().enumerate() { @@ -834,7 +840,7 @@ impl From>> for BooleanArray { fn from(data: Vec>) -> Self { let data_len = data.len(); let num_byte = bit_util::ceil(data_len, 8); - let mut null_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, false); + let mut null_buf = make_null_buffer(data.len()); let mut val_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, false); { @@ -1642,9 +1648,7 @@ macro_rules! def_string_from_vec { fn from(v: Vec>) -> Self { let mut offsets = Vec::with_capacity(v.len() + 1); let mut values = Vec::new(); - let num_bytes = bit_util::ceil(v.len(), 8); - let mut null_buf = - MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); + let mut null_buf = make_null_buffer(v.len()); let mut length_so_far = 0; offsets.push(length_so_far); for (i, s) in v.iter().enumerate() { @@ -2002,6 +2006,67 @@ impl From for StructArray { } } +impl TryFrom> for StructArray { + type Error = ArrowError; + + /// builds a StructArray from a vector of names and arrays. + /// This errors if the values have a different length. + /// An entry is set to Null when all values are null. + fn try_from(values: Vec<(&str, ArrayRef)>) -> Result { + let values_len = values.len(); + + // these will be populated + let mut fields = Vec::with_capacity(values_len); + let mut child_data = Vec::with_capacity(values_len); + + // len: the size of the arrays. + let mut len: Option = None; + // null: the null mask of the arrays. + let mut null: Option = None; + for (field_name, array) in values { + let child_datum = array.data(); + if let Some(len) = len { + if len != child_datum.len() { + return Err(ArrowError::InvalidArgumentError( + format!("Array of field \"{}\" has length {}, but previous elements have length {}. + All arrays in every entry in a struct array must have the same length.", field_name, child_datum.len(), len) + )); + } + } else { + len = Some(child_datum.len()) + } + child_data.push(child_datum.clone()); + fields.push(Field::new( + field_name, + array.data_type().clone(), + child_datum.null_buffer().is_some(), + )); + + if let Some(child_null_buffer) = child_datum.null_buffer() { + null = Some(if let Some(null_buffer) = &null { + buffer_bin_or(null_buffer, 0, child_null_buffer, 0, null_buffer.len()) + } else { + child_null_buffer.clone() + }); + } else if null.is_some() { + // when one of the fields has no nulls, them there is no null in the array + null = None; + } + } + let len = len.unwrap(); + + let mut builder = ArrayData::builder(DataType::Struct(fields.clone())) + .len(len) + .child_data(child_data); + if let Some(null_buffer) = null { + let null_count = len - bit_util::count_set_bits(null_buffer.data()); + builder = builder.null_count(null_count).null_bit_buffer(null_buffer); + } + + Ok(StructArray::from(builder.build())) + } +} + impl Array for StructArray { fn as_any(&self) -> &Any { self @@ -2382,7 +2447,7 @@ mod tests { use crate::buffer::Buffer; use crate::datatypes::{DataType, Field}; - use crate::memory; + use crate::{bitmap::Bitmap, memory}; #[test] fn test_primitive_array_from_vec() { @@ -3858,6 +3923,92 @@ mod tests { assert_eq!(0, struct_array.offset()); } + /// validates that the in-memory representation follows [the spec](https://arrow.apache.org/docs/format/Columnar.html#struct-layout) + #[test] + fn test_struct_array_from_vec() { + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + ])); + let ints: ArrayRef = + Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); + + let arr = + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap(); + + let struct_data = arr.data(); + assert_eq!(4, struct_data.len()); + assert_eq!(1, struct_data.null_count()); + assert_eq!( + // 00001011 + &Some(Bitmap::from(Buffer::from(&[11_u8]))), + struct_data.null_bitmap() + ); + + let expected_string_data = ArrayData::builder(DataType::Utf8) + .len(4) + .null_count(2) + .null_bit_buffer(Buffer::from(&[9_u8])) + .add_buffer(Buffer::from(&[0, 3, 3, 3, 7].to_byte_slice())) + .add_buffer(Buffer::from("joemark".as_bytes())) + .build(); + + let expected_int_data = ArrayData::builder(DataType::Int32) + .len(4) + .null_count(1) + .null_bit_buffer(Buffer::from(&[11_u8])) + .add_buffer(Buffer::from(&[1, 2, 0, 4].to_byte_slice())) + .build(); + + assert_eq!(expected_string_data, arr.column(0).data()); + + // TODO: implement equality for ArrayData + assert_eq!(expected_int_data.len(), arr.column(1).data().len()); + assert_eq!( + expected_int_data.null_count(), + arr.column(1).data().null_count() + ); + assert_eq!( + expected_int_data.null_bitmap(), + arr.column(1).data().null_bitmap() + ); + let expected_value_buf = expected_int_data.buffers()[0].clone(); + let actual_value_buf = arr.column(1).data().buffers()[0].clone(); + for i in 0..expected_int_data.len() { + if !expected_int_data.is_null(i) { + assert_eq!( + expected_value_buf.data()[i * 4..(i + 1) * 4], + actual_value_buf.data()[i * 4..(i + 1) * 4] + ); + } + } + } + + #[test] + fn test_struct_array_from_vec_error() { + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + // 3 elements, not 4 + ])); + let ints: ArrayRef = + Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); + + let arr = + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]); + + match arr { + Err(ArrowError::InvalidArgumentError(e)) => { + assert!(e.starts_with("Array of field \"f2\" has length 4, but previous elements have length 3.")); + } + _ => assert!(false, "This test got an unexpected error type"), + }; + } + #[test] #[should_panic( expected = "the field data types must match the array data in a StructArray" diff --git a/rust/arrow/src/array/equal.rs b/rust/arrow/src/array/equal.rs index f9f7d075ac1..3aef6f9fad2 100644 --- a/rust/arrow/src/array/equal.rs +++ b/rust/arrow/src/array/equal.rs @@ -1544,6 +1544,7 @@ mod tests { use super::*; use crate::error::Result; + use std::{convert::TryFrom, sync::Arc}; #[test] fn test_primitive_equal() { @@ -1910,32 +1911,26 @@ mod tests { #[test] fn test_struct_equal() { - let string_builder = StringBuilder::new(5); - let int_builder = Int32Builder::new(5); - - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(Field::new("f1", DataType::Utf8, false)); - field_builders.push(Box::new(string_builder) as Box); - fields.push(Field::new("f2", DataType::Int32, false)); - field_builders.push(Box::new(int_builder) as Box); - - let mut builder = StructBuilder::new(fields, field_builders); - - let a = create_struct_array( - &mut builder, - &[Some("joe"), None, None, Some("mark"), Some("doe")], - &[Some(1), Some(2), None, Some(4), Some(5)], - &[true, true, false, true, true], - ) - .unwrap(); - let b = create_struct_array( - &mut builder, - &[Some("joe"), None, None, Some("mark"), Some("doe")], - &[Some(1), Some(2), None, Some(4), Some(5)], - &[true, true, false, true, true], - ) - .unwrap(); + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + Some("doe"), + ])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + None, + Some(4), + Some(5), + ])); + + let a = + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap(); + + let b = StructArray::try_from(vec![("f1", strings), ("f2", ints)]).unwrap(); assert!(a.equals(&b)); assert!(b.equals(&a)); @@ -2440,26 +2435,24 @@ mod tests { #[test] fn test_struct_json_equal() { - // Test equal case - let string_builder = StringBuilder::new(5); - let int_builder = Int32Builder::new(5); - - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(Field::new("f1", DataType::Utf8, false)); - field_builders.push(Box::new(string_builder) as Box); - fields.push(Field::new("f2", DataType::Int32, false)); - field_builders.push(Box::new(int_builder) as Box); - - let mut builder = StructBuilder::new(fields, field_builders); - - let arrow_array = create_struct_array( - &mut builder, - &[Some("joe"), None, None, Some("mark"), Some("doe")], - &[Some(1), Some(2), None, Some(4), Some(5)], - &[true, true, false, true, true], - ) - .unwrap(); + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + Some("doe"), + ])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + None, + Some(4), + Some(5), + ])); + + let arrow_array = + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap(); let json_array: Value = serde_json::from_str( r#" @@ -2545,42 +2538,6 @@ mod tests { assert!(json_array.ne(&arrow_array)); } - fn create_struct_array< - 'a, - T: AsRef<[Option<&'a str>]>, - U: AsRef<[Option]>, - V: AsRef<[bool]>, - >( - builder: &'a mut StructBuilder, - first: T, - second: U, - is_valid: V, - ) -> Result { - let string_builder = builder.field_builder::(0).unwrap(); - for v in first.as_ref() { - if let Some(s) = v { - string_builder.append_value(s)?; - } else { - string_builder.append_null()?; - } - } - - let int_builder = builder.field_builder::(1).unwrap(); - for v in second.as_ref() { - if let Some(i) = v { - int_builder.append_value(*i)?; - } else { - int_builder.append_null()?; - } - } - - for v in is_valid.as_ref() { - builder.append(*v)? - } - - Ok(builder.finish()) - } - #[test] fn test_null_json_equal() { // Test equaled array From 231121e165a1d3606bf18d37ddd2fdb7715b2055 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sat, 19 Sep 2020 18:03:54 +0200 Subject: [PATCH 3/4] updated bench. --- rust/arrow/benches/array_from_vec.rs | 75 +++++++++------------------- 1 file changed, 23 insertions(+), 52 deletions(-) diff --git a/rust/arrow/benches/array_from_vec.rs b/rust/arrow/benches/array_from_vec.rs index 0f73c1d69dc..41900b8831a 100644 --- a/rust/arrow/benches/array_from_vec.rs +++ b/rust/arrow/benches/array_from_vec.rs @@ -24,6 +24,7 @@ extern crate arrow; use arrow::array::*; use arrow::buffer::Buffer; use arrow::datatypes::*; +use std::{convert::TryFrom, sync::Arc}; fn array_from_vec(n: usize) { let mut v: Vec = Vec::with_capacity(n); @@ -48,66 +49,36 @@ fn array_string_from_vec(n: usize) { criterion::black_box(StringArray::from(v)); } -fn struct_array_values(n: usize) -> (Field, Vec>, Field, Vec>) { +fn struct_array_values( + n: usize, +) -> ( + &'static str, + Vec>, + &'static str, + Vec>, +) { let mut strings: Vec> = Vec::with_capacity(n); let mut ints: Vec> = Vec::with_capacity(n); for _ in 0..n / 4 { strings.extend_from_slice(&[Some("joe"), None, None, Some("mark")]); ints.extend_from_slice(&[Some(1), Some(2), None, Some(4)]); } - (Field::new("f1", DataType::Utf8, false), - strings, Field::new("f2", DataType::Int32, false), ints) + ("f1", strings, "f2", ints) } -fn struct_array_from_vec(field1: &Field, strings: &Vec>, field2: &Field, ints: &Vec>) { - - criterion::black_box({ - let len = strings.len(); - // this cheats a bit, as the compiler knows the that they will be strings and i32, but well - let string_builder = StringBuilder::new(len); - let int_builder = Int32Builder::new(len); - - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(field1.clone()); - field_builders.push(Box::new(string_builder) as Box); - fields.push(field2.clone()); - field_builders.push(Box::new(int_builder) as Box); - - let mut builder = StructBuilder::new(fields, field_builders); - assert_eq!(2, builder.num_fields()); - - let string_builder = builder - .field_builder::(0) - .expect("builder at field 0 should be string builder"); - for string in strings { - if string.is_some() { - string_builder.append_value(string.unwrap()).unwrap(); - } else { - string_builder.append_null().unwrap(); - } - } - - let int_builder = builder - .field_builder::(1) - .expect("builder at field 1 should be int builder"); - for int in ints { - if int.is_some() { - int_builder.append_value(int.unwrap()).unwrap(); - } else { - int_builder.append_null().unwrap(); - } - } - - for _ in 0..len / 4 { - builder.append(true).unwrap(); - builder.append(true).unwrap(); - builder.append_null().unwrap(); - builder.append(true).unwrap(); - } - - builder.finish(); - }); +fn struct_array_from_vec( + field1: &str, + strings: &Vec>, + field2: &str, + ints: &Vec>, +) { + let strings: ArrayRef = Arc::new(StringArray::from(strings.clone())); + let ints: ArrayRef = Arc::new(Int32Array::from(ints.clone())); + + criterion::black_box( + StructArray::try_from(vec![(field1.clone(), strings), (field2.clone(), ints)]) + .unwrap(), + ); } fn criterion_benchmark(c: &mut Criterion) { From 379ed4d4ffc842e252160eeb46d1e7017a2b5bd3 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sat, 19 Sep 2020 19:22:25 +0200 Subject: [PATCH 4/4] Clippy. --- rust/arrow/src/array/array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs index f2bddb0e61e..9cc1d51039c 100644 --- a/rust/arrow/src/array/array.rs +++ b/rust/arrow/src/array/array.rs @@ -2055,7 +2055,7 @@ impl TryFrom> for StructArray { } let len = len.unwrap(); - let mut builder = ArrayData::builder(DataType::Struct(fields.clone())) + let mut builder = ArrayData::builder(DataType::Struct(fields)) .len(len) .child_data(child_data); if let Some(null_buffer) = null {