From 725cce006a0d57f590b0d2c5f8325bd2d438aede Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Tue, 26 Jan 2021 19:45:46 +0100 Subject: [PATCH 1/4] Use SmallVec in ArrayData to optimize the common usecase of arrays containing only one buffer --- rust/arrow/Cargo.toml | 1 + rust/arrow/src/array/data.rs | 34 ++++++++++++++++++++++++++++++---- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml index 0b14b5bfae8..5253ab239a6 100644 --- a/rust/arrow/Cargo.toml +++ b/rust/arrow/Cargo.toml @@ -51,6 +51,7 @@ flatbuffers = "^0.8" hex = "0.4" prettytable-rs = { version = "0.8.0", optional = true } lexical-core = "^0.7" +smallvec = "1.6" [features] default = [] diff --git a/rust/arrow/src/array/data.rs b/rust/arrow/src/array/data.rs index 09fb019f314..cdd3258d6a8 100644 --- a/rust/arrow/src/array/data.rs +++ b/rust/arrow/src/array/data.rs @@ -29,6 +29,7 @@ use crate::{ }; use super::equal::equal; +use smallvec::SmallVec; #[inline] pub(crate) fn count_nulls( @@ -225,11 +226,11 @@ pub struct ArrayData { /// The buffers for this array data. Note that depending on the array types, this /// could hold different kinds of buffers (e.g., value buffer, value offset buffer) /// at different positions. - buffers: Vec, + buffers: SmallVec<[Buffer; 1]>, /// The child(ren) of this array. Only non-empty for nested types, currently /// `ListArray` and `StructArray`. - child_data: Vec, + child_data: SmallVec<[ArrayDataRef; 1]>, /// The null bitmap. A `None` value for this indicates all values are non-null in /// this array. @@ -258,8 +259,33 @@ impl ArrayData { len, null_count, offset, - buffers, - child_data, + buffers: SmallVec::from_vec(buffers), + child_data: SmallVec::from_vec(child_data), + null_bitmap, + } + } + + pub fn new_smallvec( + data_type: DataType, + len: usize, + null_count: Option, + null_bit_buffer: Option, + offset: usize, + buffer: Buffer, + child_data: Option, + ) -> Self { + let null_count = match null_count { + None => count_nulls(null_bit_buffer.as_ref(), offset, len), + Some(null_count) => null_count, + }; + let null_bitmap = null_bit_buffer.map(Bitmap::from); + Self { + data_type, + len, + null_count, + offset, + buffers: SmallVec::from_buf([buffer]), + child_data: child_data.map(|cd| SmallVec::from_buf([cd])).unwrap_or_default(), null_bitmap, } } From 083f147854308f3dee273e152790f8f3f9312e80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Mon, 1 Feb 2021 16:35:36 +0100 Subject: [PATCH 2/4] Benchmark for slicing nested array --- rust/arrow/benches/array_slice.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/rust/arrow/benches/array_slice.rs b/rust/arrow/benches/array_slice.rs index a535c80d217..8aacf451743 100644 --- a/rust/arrow/benches/array_slice.rs +++ b/rust/arrow/benches/array_slice.rs @@ -35,6 +35,23 @@ fn create_array_with_nulls(size: usize) -> ArrayRef { Arc::new(array) } +fn create_nested_array(size: usize) -> ArrayRef { + let mut builder = ListBuilder::new(StringDictionaryBuilder::new(Int16Builder::new(size), StringBuilder::new(size))); + let strings = &["foo", "bar", "baz"]; + + (0..size).for_each(|i| { + if i%2== 0 { + builder.values().append(&strings[i%strings.len()]).unwrap(); + builder.append(true).unwrap() + } else { + builder.append(false).unwrap(); + } + }); + + Arc::new(builder.finish()) +} + + fn array_slice_benchmark(c: &mut Criterion) { let array = create_array_with_nulls(4096); c.bench_function("array_slice 128", |b| { @@ -46,6 +63,17 @@ fn array_slice_benchmark(c: &mut Criterion) { c.bench_function("array_slice 2048", |b| { b.iter(|| create_array_slice(&array, 2048)) }); + + let nested_array = create_nested_array(4096); + c.bench_function("array_slice nested type 128", |b| { + b.iter(|| create_array_slice(&nested_array, 128)) + }); + c.bench_function("array_slice nested type 512", |b| { + b.iter(|| create_array_slice(&nested_array, 512)) + }); + c.bench_function("array_slice nested type 2048", |b| { + b.iter(|| create_array_slice(&nested_array, 2048)) + }); } criterion_group!(benches, array_slice_benchmark); From f79ee547e07685673a84904d247f6912f80c78c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Mon, 1 Feb 2021 18:05:29 +0100 Subject: [PATCH 3/4] Replace SmallVec with custom enum --- rust/arrow/Cargo.toml | 1 - rust/arrow/src/array/data.rs | 75 +++++++++++++++++++++++++++++++----- 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml index 5253ab239a6..0b14b5bfae8 100644 --- a/rust/arrow/Cargo.toml +++ b/rust/arrow/Cargo.toml @@ -51,7 +51,6 @@ flatbuffers = "^0.8" hex = "0.4" prettytable-rs = { version = "0.8.0", optional = true } lexical-core = "^0.7" -smallvec = "1.6" [features] default = [] diff --git a/rust/arrow/src/array/data.rs b/rust/arrow/src/array/data.rs index cdd3258d6a8..2c8c0d66352 100644 --- a/rust/arrow/src/array/data.rs +++ b/rust/arrow/src/array/data.rs @@ -29,7 +29,7 @@ use crate::{ }; use super::equal::equal; -use smallvec::SmallVec; +use std::ops::Index; #[inline] pub(crate) fn count_nulls( @@ -206,6 +206,57 @@ pub(crate) fn into_buffers( } } +#[derive(Debug, Clone)] +enum SmallContainer { + Zero, + One([T;1]), + Many(Vec) +} + +impl SmallContainer { + fn from_vec(mut v: Vec) -> Self { + match v.len() { + 0 => Self::Zero, + 1 => Self::One([v.remove(0)]), + _ => Self::Many(v) + } + } + + fn get(&self, index: usize) -> &T { + match self { + SmallContainer::One(one) if index == 0 => &one[0], + SmallContainer::Many(v) if index < v.len() => &v[index], + _ => panic!("index out of bounds") + } + } + + fn as_slice(&self) -> &[T] { + match self { + SmallContainer::Zero => &[], + SmallContainer::One(one) => one.as_ref(), + SmallContainer::Many(v) => v.as_slice() + } + } + + fn iter(&self) -> impl Iterator { + self.as_slice().iter() + } +} + +impl Index for SmallContainer { + type Output = T; + + fn index(&self, index: usize) -> &Self::Output { + self.get(index) + } +} + +impl Default for SmallContainer { + fn default() -> Self { + Self::Zero + } +} + /// An generic representation of Arrow array data which encapsulates common attributes and /// operations for Arrow array. Specific operations for different arrays types (e.g., /// primitive, list, struct) are implemented in `Array`. @@ -226,11 +277,11 @@ pub struct ArrayData { /// The buffers for this array data. Note that depending on the array types, this /// could hold different kinds of buffers (e.g., value buffer, value offset buffer) /// at different positions. - buffers: SmallVec<[Buffer; 1]>, + buffers: SmallContainer, /// The child(ren) of this array. Only non-empty for nested types, currently /// `ListArray` and `StructArray`. - child_data: SmallVec<[ArrayDataRef; 1]>, + child_data: SmallContainer, /// The null bitmap. A `None` value for this indicates all values are non-null in /// this array. @@ -259,8 +310,8 @@ impl ArrayData { len, null_count, offset, - buffers: SmallVec::from_vec(buffers), - child_data: SmallVec::from_vec(child_data), + buffers: SmallContainer::from_vec(buffers), + child_data: SmallContainer::from_vec(child_data), null_bitmap, } } @@ -284,8 +335,8 @@ impl ArrayData { len, null_count, offset, - buffers: SmallVec::from_buf([buffer]), - child_data: child_data.map(|cd| SmallVec::from_buf([cd])).unwrap_or_default(), + buffers: SmallContainer::One([buffer]), + child_data: child_data.map(|cd| SmallContainer::One([cd])).unwrap_or_default(), null_bitmap, } } @@ -304,12 +355,12 @@ impl ArrayData { /// Returns a slice of buffers for this array data pub fn buffers(&self) -> &[Buffer] { - &self.buffers[..] + &self.buffers.as_slice() } /// Returns a slice of children data arrays pub fn child_data(&self) -> &[ArrayDataRef] { - &self.child_data[..] + &self.child_data.as_slice() } /// Returns whether the element at index `i` is null @@ -366,6 +417,7 @@ impl ArrayData { /// Returns the total number of bytes of memory occupied by the buffers owned by this [ArrayData]. pub fn get_buffer_memory_size(&self) -> usize { let mut size = 0; + /* for buffer in &self.buffers { size += buffer.capacity(); } @@ -375,6 +427,8 @@ impl ArrayData { for child in &self.child_data { size += child.get_buffer_memory_size(); } + + */ size } @@ -388,6 +442,7 @@ impl ArrayData { - mem::size_of_val(&self.child_data); // Calculate rest of the fields top down which contain actual data + /* for buffer in &self.buffers { size += mem::size_of_val(&buffer); size += buffer.capacity(); @@ -399,6 +454,8 @@ impl ArrayData { size += child.get_array_memory_size(); } + */ + size } From 75ca9fa1bd162625b3f8db0a45379b4ab110e5b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Mon, 1 Feb 2021 18:32:07 +0100 Subject: [PATCH 4/4] Simplify slicing for single element --- rust/arrow/src/array/data.rs | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/rust/arrow/src/array/data.rs b/rust/arrow/src/array/data.rs index 2c8c0d66352..34970c746ad 100644 --- a/rust/arrow/src/array/data.rs +++ b/rust/arrow/src/array/data.rs @@ -209,7 +209,7 @@ pub(crate) fn into_buffers( #[derive(Debug, Clone)] enum SmallContainer { Zero, - One([T;1]), + One(T), Many(Vec) } @@ -217,14 +217,14 @@ impl SmallContainer { fn from_vec(mut v: Vec) -> Self { match v.len() { 0 => Self::Zero, - 1 => Self::One([v.remove(0)]), + 1 => Self::One(v.pop().unwrap()), _ => Self::Many(v) } } fn get(&self, index: usize) -> &T { match self { - SmallContainer::One(one) if index == 0 => &one[0], + SmallContainer::One(one) if index == 0 => one, SmallContainer::Many(v) if index < v.len() => &v[index], _ => panic!("index out of bounds") } @@ -233,7 +233,7 @@ impl SmallContainer { fn as_slice(&self) -> &[T] { match self { SmallContainer::Zero => &[], - SmallContainer::One(one) => one.as_ref(), + SmallContainer::One(one) => std::slice::from_ref(one), SmallContainer::Many(v) => v.as_slice() } } @@ -335,8 +335,8 @@ impl ArrayData { len, null_count, offset, - buffers: SmallContainer::One([buffer]), - child_data: child_data.map(|cd| SmallContainer::One([cd])).unwrap_or_default(), + buffers: SmallContainer::One(buffer), + child_data: child_data.map(SmallContainer::One).unwrap_or_default(), null_bitmap, } } @@ -417,18 +417,16 @@ impl ArrayData { /// Returns the total number of bytes of memory occupied by the buffers owned by this [ArrayData]. pub fn get_buffer_memory_size(&self) -> usize { let mut size = 0; - /* - for buffer in &self.buffers { + for buffer in self.buffers.iter() { size += buffer.capacity(); } if let Some(bitmap) = &self.null_bitmap { size += bitmap.get_buffer_memory_size() } - for child in &self.child_data { + for child in self.child_data.iter() { size += child.get_buffer_memory_size(); } - */ size } @@ -442,20 +440,17 @@ impl ArrayData { - mem::size_of_val(&self.child_data); // Calculate rest of the fields top down which contain actual data - /* - for buffer in &self.buffers { + for buffer in self.buffers.iter() { size += mem::size_of_val(&buffer); size += buffer.capacity(); } if let Some(bitmap) = &self.null_bitmap { size += bitmap.get_array_memory_size() } - for child in &self.child_data { + for child in self.child_data.iter() { size += child.get_array_memory_size(); } - */ - size }