From 765eca48009d4c8c6880a731614091c36b719bac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Wed, 11 Nov 2020 17:32:50 +0100 Subject: [PATCH 01/13] feat: add DecimalArray --- rust/arrow/src/array/array.rs | 168 +++++++++++++++++++++++++ rust/arrow/src/array/equal.rs | 153 +++++++++++++++++++++- rust/arrow/src/array/mod.rs | 1 + rust/arrow/src/datatypes.rs | 6 +- rust/parquet/src/arrow/arrow_writer.rs | 3 + rust/parquet/src/arrow/schema.rs | 6 + 6 files changed, 335 insertions(+), 2 deletions(-) diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs index f35c02aa3d0..56075bd6ebd 100644 --- a/rust/arrow/src/array/array.rs +++ b/rust/arrow/src/array/array.rs @@ -334,6 +334,7 @@ pub fn make_array(data: ArrayDataRef) -> ArrayRef { dt => panic!("Unexpected dictionary key type {:?}", dt), }, DataType::Null => Arc::new(NullArray::from(data)) as ArrayRef, + DataType::Decimal(_, _) => Arc::new(FixedSizeBinaryArray::from(data)) as ArrayRef, dt => panic!("Unexpected data type {:?}", dt), } } @@ -1722,6 +1723,126 @@ impl From>> for LargeStringArray { } } +/// A type of `DecimalArray` whose elements are binaries. +pub struct DecimalArray { + data: ArrayDataRef, + value_data: RawPtrBox, + precision: usize, + scale: usize, + length: i32, +} + +impl DecimalArray { + /// Returns the element at index `i` as i128. + pub fn value(&self, i: usize) -> i128 { + assert!( + i < self.data.len(), + "DecimalArray out of bounds access" + ); + let offset = i.checked_add(self.data.offset()).unwrap(); + let raw_val = unsafe { + let pos = self.value_offset_at(offset); + std::slice::from_raw_parts( + self.value_data.get().offset(pos as isize), + (self.value_offset_at(offset + 1) - pos) as usize, + ) + }; + Self::from_bytes_to_i128(raw_val) + } + + fn from_bytes_to_i128(b: &[u8]) -> i128 { + let first_bit = b[0] & 128u8 == 128u8; + let mut result = if first_bit { [255u8; 16] } else { [0u8; 16] }; + for (i, v) in b.iter().enumerate() { + result[i + (16 - b.len())] = *v; + } + i128::from_be_bytes(result) + } + + /// Returns the offset for the element at index `i`. + /// + /// Note this doesn't do any bound checking, for performance reason. + #[inline] + pub fn value_offset(&self, i: usize) -> i32 { + self.value_offset_at(self.data.offset() + i) + } + + /// Returns the length for an element. + /// + /// All elements have the same length as the array is a fixed size. + #[inline] + pub fn value_length(&self) -> i32 { + self.length + } + + /// Returns a clone of the value data buffer + pub fn value_data(&self) -> Buffer { + self.data.buffers()[0].clone() + } + + #[inline] + fn value_offset_at(&self, i: usize) -> i32 { + self.length * i as i32 + } +} + +impl From for DecimalArray { + fn from(data: ArrayDataRef) -> Self { + assert_eq!( + data.buffers().len(), + 1, + "BinaryArray data should contain 1 buffer only (values)" + ); + let value_data = data.buffers()[0].raw_data(); + let (precision, scale) = match data.data_type() { + DataType::Decimal(precision, scale) => (*precision, *scale), + _ => panic!("Expected data type to be Decimal"), + }; + let length = (10.0_f64.powi(precision as i32).log2() / 8.0).ceil() as i32; + Self { + data, + value_data: RawPtrBox::new(value_data), + precision, + scale, + length, + } + } +} + +impl fmt::Debug for DecimalArray { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "DecimalArray<{}, {}>\n[\n", self.precision, self.scale)?; + print_long_array(self, f, |array, index, f| { + fmt::Debug::fmt(&array.value(index), f) + })?; + write!(f, "]") + } +} + +impl Array for DecimalArray { + fn as_any(&self) -> &Any { + self + } + + fn data(&self) -> ArrayDataRef { + self.data.clone() + } + + fn data_ref(&self) -> &ArrayDataRef { + &self.data + } + + /// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeBinaryArray]. + fn get_buffer_memory_size(&self) -> usize { + self.data.get_buffer_memory_size() + } + + /// Returns the total number of bytes of memory occupied physically by this [FixedSizeBinaryArray]. + fn get_array_memory_size(&self) -> usize { + self.data.get_array_memory_size() + mem::size_of_val(self) + } +} + /// A type of `FixedSizeListArray` whose elements are binaries. pub struct FixedSizeBinaryArray { data: ArrayDataRef, @@ -4322,4 +4443,51 @@ mod tests { assert_eq!(1, keys.value(2)); assert_eq!(0, keys.value(5)); } + + #[test] + fn test_decimal_array() { + let values: [u8; 20] = [0, 0, 0, 0, 0, 2, 17, 180, 219, 192, 255, 255, 255, 255, 255, 253, 238, 75, 36, 64]; + + let array_data = ArrayData::builder(DataType::Decimal(23, 6)) + .len(2) + .add_buffer(Buffer::from(&values[..])) + .build(); + let fixed_size_binary_array = DecimalArray::from(array_data); + // assert_eq!(10, fixed_size_binary_array.len()); + // assert_eq!(0, fixed_size_binary_array.null_count()); + assert_eq!( + 8_887_000_000, + fixed_size_binary_array.value(0) + ); + assert_eq!( + -8_887_000_000, + fixed_size_binary_array.value(1) + ); + assert_eq!(10, fixed_size_binary_array.value_length()); + // assert_eq!(10, fixed_size_binary_array.value_offset(2)); + // for i in 0..3 { + // assert!(fixed_size_binary_array.is_valid(i)); + // assert!(!fixed_size_binary_array.is_null(i)); + // } + + // Test binary array with offset + // let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) + // .len(2) + // .offset(1) + // .add_buffer(Buffer::from(&values[..])) + // .build(); + // let fixed_size_binary_array = FixedSizeBinaryArray::from(array_data); + // assert_eq!( + // [b't', b'h', b'e', b'r', b'e'], + // fixed_size_binary_array.value(0) + // ); + // assert_eq!( + // [b'a', b'r', b'r', b'o', b'w'], + // fixed_size_binary_array.value(1) + // ); + // assert_eq!(2, fixed_size_binary_array.len()); + // assert_eq!(5, fixed_size_binary_array.value_offset(0)); + // assert_eq!(5, fixed_size_binary_array.value_length()); + // assert_eq!(10, fixed_size_binary_array.value_offset(1)); + } } diff --git a/rust/arrow/src/array/equal.rs b/rust/arrow/src/array/equal.rs index 26b4cd45a7a..42d333698f4 100644 --- a/rust/arrow/src/array/equal.rs +++ b/rust/arrow/src/array/equal.rs @@ -20,7 +20,7 @@ use crate::datatypes::*; use crate::util::bit_util; use array::{ Array, BinaryOffsetSizeTrait, GenericBinaryArray, GenericListArray, - GenericStringArray, ListArrayOps, OffsetSizeTrait, StringOffsetSizeTrait, + GenericStringArray, ListArrayOps, OffsetSizeTrait, StringOffsetSizeTrait }; use hex::FromHex; use serde_json::value::Value::{Null as JNull, Object, String as JString}; @@ -156,6 +156,12 @@ impl PartialEq for FixedSizeBinaryArray { } } +impl PartialEq for DecimalArray { + fn eq(&self, other: &Self) -> bool { + self.equals(other) + } +} + impl ArrayEqual for GenericListArray { fn equals(&self, other: &dyn Array) -> bool { if !base_equal(&self.data(), &other.data()) { @@ -668,6 +674,117 @@ impl ArrayEqual for FixedSizeBinaryArray { } } +impl ArrayEqual for DecimalArray { + fn equals(&self, other: &dyn Array) -> bool { + if !base_equal(&self.data(), &other.data()) { + return false; + } + + let other = other + .as_any() + .downcast_ref::() + .unwrap(); + + let this = self + .as_any() + .downcast_ref::() + .unwrap(); + + // TODO: handle null & length == 0 case? + + let value_buf = self.value_data(); + let other_value_buf = other.value_data(); + let value_data = value_buf.data(); + let other_value_data = other_value_buf.data(); + + if self.null_count() == 0 { + // No offset in both - just do memcmp + if self.offset() == 0 && other.offset() == 0 { + let len = self.value_offset(self.len()) as usize; + return value_data[..len] == other_value_data[..len]; + } else { + let start = self.value_offset(0) as usize; + let other_start = other.value_offset(0) as usize; + let len = (self.value_offset(self.len()) - self.value_offset(0)) as usize; + return value_data[start..(start + len)] + == other_value_data[other_start..(other_start + len)]; + } + } else { + for i in 0..self.len() { + if self.is_null(i) { + continue; + } + + let start = self.value_offset(i) as usize; + let other_start = other.value_offset(i) as usize; + let len = self.value_length() as usize; + if value_data[start..(start + len)] + != other_value_data[other_start..(other_start + len)] + { + return false; + } + } + } + + true + } + + fn range_equals( + &self, + other: &dyn Array, + start_idx: usize, + end_idx: usize, + other_start_idx: usize, + ) -> bool { + assert!(other_start_idx + (end_idx - start_idx) <= other.len()); + let other = other + .as_any() + .downcast_ref::() + .unwrap(); + + let mut j = other_start_idx; + for i in start_idx..end_idx { + let is_null = self.is_null(i); + let other_is_null = other.is_null(j); + + if is_null != other_is_null { + return false; + } + + if is_null { + continue; + } + + let start_offset = self.value_offset(i) as usize; + let end_offset = self.value_offset(i + 1) as usize; + let other_start_offset = other.value_offset(j) as usize; + let other_end_offset = other.value_offset(j + 1) as usize; + + if end_offset - start_offset != other_end_offset - other_start_offset { + return false; + } + + let value_buf = self.value_data(); + let other_value_buf = other.value_data(); + let value_data = value_buf.data(); + let other_value_data = other_value_buf.data(); + + if end_offset - start_offset > 0 { + let len = end_offset - start_offset; + if value_data[start_offset..(start_offset + len)] + != other_value_data[other_start_offset..(other_start_offset + len)] + { + return false; + } + } + + j += 1; + } + + true + } +} + impl ArrayEqual for StructArray { fn equals(&self, other: &dyn Array) -> bool { if !base_equal(&self.data(), &other.data()) { @@ -1120,6 +1237,22 @@ impl JsonEqual for FixedSizeBinaryArray { } } +impl JsonEqual for DecimalArray { + fn equals_json(&self, json: &[&Value]) -> bool { + if self.len() != json.len() { + return false; + } + + (0..self.len()).all(|i| match json[i] { + JString(s) => { + self.is_valid(i) && (s.parse::().unwrap() == self.value(i)) + } + JNull => self.is_null(i), + _ => false, + }) + } +} + impl PartialEq for FixedSizeBinaryArray { fn eq(&self, json: &Value) -> bool { match json { @@ -1129,6 +1262,15 @@ impl PartialEq for FixedSizeBinaryArray { } } +impl PartialEq for DecimalArray { + fn eq(&self, json: &Value) -> bool { + match json { + Value::Array(json_array) => self.equals_json_values(&json_array), + _ => false, + } + } +} + impl PartialEq for Value { fn eq(&self, arrow: &FixedSizeBinaryArray) -> bool { match self { @@ -1138,6 +1280,15 @@ impl PartialEq for Value { } } +impl PartialEq for Value { + fn eq(&self, arrow: &DecimalArray) -> bool { + match self { + Value::Array(json_array) => arrow.equals_json_values(&json_array), + _ => false, + } + } +} + impl JsonEqual for UnionArray { fn equals_json(&self, _json: &[&Value]) -> bool { unimplemented!( diff --git a/rust/arrow/src/array/mod.rs b/rust/arrow/src/array/mod.rs index 0a996e93257..f768b2b60a5 100644 --- a/rust/arrow/src/array/mod.rs +++ b/rust/arrow/src/array/mod.rs @@ -104,6 +104,7 @@ pub use self::data::ArrayDataRef; pub use self::array::BinaryArray; pub use self::array::DictionaryArray; +pub use self::array::DecimalArray; pub use self::array::FixedSizeBinaryArray; pub use self::array::FixedSizeListArray; pub use self::array::LargeBinaryArray; diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 6f0dc240ac9..da982647a29 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -145,6 +145,8 @@ pub enum DataType { /// This type mostly used to represent low cardinality string /// arrays or a limited set of primitive types as integers. Dictionary(Box, Box), + /// Decimal value with precision and scale + Decimal(usize, usize), } /// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX @@ -1121,6 +1123,7 @@ impl DataType { TimeUnit::Nanosecond => "NANOSECOND", }}), DataType::Dictionary(_, _) => json!({ "name": "dictionary"}), + DataType::Decimal(precision, scale) => json!({"name": "decimal", "precision": precision, "scale": scale}), } } @@ -1454,7 +1457,8 @@ impl Field { | DataType::FixedSizeList(_, _) | DataType::FixedSizeBinary(_) | DataType::Utf8 - | DataType::LargeUtf8 => { + | DataType::LargeUtf8 + | DataType::Decimal(_, _) => { if self.data_type != from.data_type { return Err(ArrowError::SchemaError( "Fail to merge schema Field due to conflicting datatype" diff --git a/rust/parquet/src/arrow/arrow_writer.rs b/rust/parquet/src/arrow/arrow_writer.rs index de8bc7c1d46..3383eb051db 100644 --- a/rust/parquet/src/arrow/arrow_writer.rs +++ b/rust/parquet/src/arrow/arrow_writer.rs @@ -270,6 +270,7 @@ fn write_leaves( ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Boolean | ArrowDataType::FixedSizeBinary(_) + | ArrowDataType::Decimal(_, _) | ArrowDataType::Union(_) => Err(ParquetError::NYI( "Attempting to write an Arrow type that is not yet implemented".to_string(), )), @@ -482,6 +483,7 @@ fn get_levels( repetition: None, }], ArrowDataType::FixedSizeBinary(_) => unimplemented!(), + ArrowDataType::Decimal(_, _) => unimplemented!(), ArrowDataType::List(_) | ArrowDataType::LargeList(_) => { let array_data = array.data(); let child_data = array_data.child_data().get(0).unwrap(); @@ -566,6 +568,7 @@ fn get_levels( | ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => unimplemented!(), ArrowDataType::FixedSizeBinary(_) => unimplemented!(), + ArrowDataType::Decimal(_, _) => unimplemented!(), ArrowDataType::LargeBinary => unimplemented!(), ArrowDataType::List(_) | ArrowDataType::LargeList(_) => { // nested list diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs index 10270fff464..6cf0f602a93 100644 --- a/rust/parquet/src/arrow/schema.rs +++ b/rust/parquet/src/arrow/schema.rs @@ -400,6 +400,12 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_length(*length) .build() } + DataType::Decimal(_, _) => { + Type::primitive_type_builder(name, PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(repetition) + .with_length(10) + .build() + } DataType::Utf8 | DataType::LargeUtf8 => { Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) .with_logical_type(LogicalType::UTF8) From 1e229e3c3ea4152b6a0579f12b789f6cb7530e31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Wed, 11 Nov 2020 17:46:47 +0100 Subject: [PATCH 02/13] chore: formatting --- rust/arrow/src/array/array.rs | 47 ++++++----------------------------- rust/arrow/src/array/equal.rs | 17 +++---------- rust/arrow/src/array/mod.rs | 2 +- rust/arrow/src/datatypes.rs | 4 ++- rust/arrow/src/error.rs | 4 +-- 5 files changed, 16 insertions(+), 58 deletions(-) diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs index 56075bd6ebd..69a5d6773df 100644 --- a/rust/arrow/src/array/array.rs +++ b/rust/arrow/src/array/array.rs @@ -1735,10 +1735,7 @@ pub struct DecimalArray { impl DecimalArray { /// Returns the element at index `i` as i128. pub fn value(&self, i: usize) -> i128 { - assert!( - i < self.data.len(), - "DecimalArray out of bounds access" - ); + assert!(i < self.data.len(), "DecimalArray out of bounds access"); let offset = i.checked_add(self.data.offset()).unwrap(); let raw_val = unsafe { let pos = self.value_offset_at(offset); @@ -4446,48 +4443,18 @@ mod tests { #[test] fn test_decimal_array() { - let values: [u8; 20] = [0, 0, 0, 0, 0, 2, 17, 180, 219, 192, 255, 255, 255, 255, 255, 253, 238, 75, 36, 64]; + let values: [u8; 20] = [ + 0, 0, 0, 0, 0, 2, 17, 180, 219, 192, 255, 255, 255, 255, 255, 253, 238, 75, + 36, 64, + ]; let array_data = ArrayData::builder(DataType::Decimal(23, 6)) .len(2) .add_buffer(Buffer::from(&values[..])) .build(); let fixed_size_binary_array = DecimalArray::from(array_data); - // assert_eq!(10, fixed_size_binary_array.len()); - // assert_eq!(0, fixed_size_binary_array.null_count()); - assert_eq!( - 8_887_000_000, - fixed_size_binary_array.value(0) - ); - assert_eq!( - -8_887_000_000, - fixed_size_binary_array.value(1) - ); + assert_eq!(8_887_000_000, fixed_size_binary_array.value(0)); + assert_eq!(-8_887_000_000, fixed_size_binary_array.value(1)); assert_eq!(10, fixed_size_binary_array.value_length()); - // assert_eq!(10, fixed_size_binary_array.value_offset(2)); - // for i in 0..3 { - // assert!(fixed_size_binary_array.is_valid(i)); - // assert!(!fixed_size_binary_array.is_null(i)); - // } - - // Test binary array with offset - // let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) - // .len(2) - // .offset(1) - // .add_buffer(Buffer::from(&values[..])) - // .build(); - // let fixed_size_binary_array = FixedSizeBinaryArray::from(array_data); - // assert_eq!( - // [b't', b'h', b'e', b'r', b'e'], - // fixed_size_binary_array.value(0) - // ); - // assert_eq!( - // [b'a', b'r', b'r', b'o', b'w'], - // fixed_size_binary_array.value(1) - // ); - // assert_eq!(2, fixed_size_binary_array.len()); - // assert_eq!(5, fixed_size_binary_array.value_offset(0)); - // assert_eq!(5, fixed_size_binary_array.value_length()); - // assert_eq!(10, fixed_size_binary_array.value_offset(1)); } } diff --git a/rust/arrow/src/array/equal.rs b/rust/arrow/src/array/equal.rs index 42d333698f4..04a2e811f1e 100644 --- a/rust/arrow/src/array/equal.rs +++ b/rust/arrow/src/array/equal.rs @@ -20,7 +20,7 @@ use crate::datatypes::*; use crate::util::bit_util; use array::{ Array, BinaryOffsetSizeTrait, GenericBinaryArray, GenericListArray, - GenericStringArray, ListArrayOps, OffsetSizeTrait, StringOffsetSizeTrait + GenericStringArray, ListArrayOps, OffsetSizeTrait, StringOffsetSizeTrait, }; use hex::FromHex; use serde_json::value::Value::{Null as JNull, Object, String as JString}; @@ -680,15 +680,9 @@ impl ArrayEqual for DecimalArray { return false; } - let other = other - .as_any() - .downcast_ref::() - .unwrap(); + let other = other.as_any().downcast_ref::().unwrap(); - let this = self - .as_any() - .downcast_ref::() - .unwrap(); + let this = self.as_any().downcast_ref::().unwrap(); // TODO: handle null & length == 0 case? @@ -737,10 +731,7 @@ impl ArrayEqual for DecimalArray { other_start_idx: usize, ) -> bool { assert!(other_start_idx + (end_idx - start_idx) <= other.len()); - let other = other - .as_any() - .downcast_ref::() - .unwrap(); + let other = other.as_any().downcast_ref::().unwrap(); let mut j = other_start_idx; for i in start_idx..end_idx { diff --git a/rust/arrow/src/array/mod.rs b/rust/arrow/src/array/mod.rs index f768b2b60a5..4f54ba4ae4e 100644 --- a/rust/arrow/src/array/mod.rs +++ b/rust/arrow/src/array/mod.rs @@ -103,8 +103,8 @@ pub use self::data::ArrayDataBuilder; pub use self::data::ArrayDataRef; pub use self::array::BinaryArray; -pub use self::array::DictionaryArray; pub use self::array::DecimalArray; +pub use self::array::DictionaryArray; pub use self::array::FixedSizeBinaryArray; pub use self::array::FixedSizeListArray; pub use self::array::LargeBinaryArray; diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index da982647a29..5a28aba800d 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1123,7 +1123,9 @@ impl DataType { TimeUnit::Nanosecond => "NANOSECOND", }}), DataType::Dictionary(_, _) => json!({ "name": "dictionary"}), - DataType::Decimal(precision, scale) => json!({"name": "decimal", "precision": precision, "scale": scale}), + DataType::Decimal(precision, scale) => { + json!({"name": "decimal", "precision": precision, "scale": scale}) + } } } diff --git a/rust/arrow/src/error.rs b/rust/arrow/src/error.rs index f428b4f0f99..c9c30d3343a 100644 --- a/rust/arrow/src/error.rs +++ b/rust/arrow/src/error.rs @@ -40,9 +40,7 @@ pub enum ArrowError { impl ArrowError { /// Wraps an external error in an `ArrowError`. - pub fn from_external_error( - error: Box, - ) -> Self { + pub fn from_external_error(error: Box) -> Self { Self::ExternalError(error) } } From b81bb097989b8fd1cace9af61312da8253ccc853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Thu, 12 Nov 2020 17:01:37 +0100 Subject: [PATCH 03/13] feat: add builder from binary data --- rust/arrow/src/array/array_binary.rs | 60 ++++++++++-- rust/arrow/src/array/builder.rs | 131 +++++++++++++++++++++++++++ rust/arrow/src/array/equal/mod.rs | 74 ++++++++++++++- rust/arrow/src/array/mod.rs | 1 + 4 files changed, 258 insertions(+), 8 deletions(-) diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs index 732016705af..5bf7c6f685e 100644 --- a/rust/arrow/src/array/array_binary.rs +++ b/rust/arrow/src/array/array_binary.rs @@ -506,7 +506,7 @@ impl From for DecimalArray { assert_eq!( data.buffers().len(), 1, - "BinaryArray data should contain 1 buffer only (values)" + "DecimalArray data should contain 1 buffer only (values)" ); let value_data = data.buffers()[0].raw_data(); let (precision, scale) = match data.data_type() { @@ -524,6 +524,35 @@ impl From for DecimalArray { } } +/// Creates a `FixedSizeBinaryArray` from `FixedSizeList` array +impl From for DecimalArray { + fn from(v: FixedSizeListArray) -> Self { + assert_eq!( + v.data_ref().child_data()[0].child_data().len(), + 0, + "DecimalArray can only be created from list array of u8 values \ + (i.e. FixedSizeList>)." + ); + assert_eq!( + v.data_ref().child_data()[0].data_type(), + &DataType::UInt8, + "DecimalArray can only be created from FixedSizeList arrays, mismatched data types." + ); + + let mut builder = ArrayData::builder(DataType::Decimal(23, 6)) + .len(v.len()) + .add_buffer(v.data_ref().child_data()[0].buffers()[0].clone()); + if let Some(bitmap) = v.data_ref().null_bitmap() { + builder = builder + .null_count(v.data_ref().null_count()) + .null_bit_buffer(bitmap.bits.clone()) + } + + let data = builder.build(); + Self::from(data) + } +} + impl fmt::Debug for DecimalArray { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "DecimalArray<{}, {}>\n[\n", self.precision, self.scale)?; @@ -547,12 +576,12 @@ impl Array for DecimalArray { &self.data } - /// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeBinaryArray]. + /// Returns the total number of bytes of memory occupied by the buffers owned by this [DecimalArray]. fn get_buffer_memory_size(&self) -> usize { self.data.get_buffer_memory_size() } - /// Returns the total number of bytes of memory occupied physically by this [FixedSizeBinaryArray]. + /// Returns the total number of bytes of memory occupied physically by this [DecimalArray]. fn get_array_memory_size(&self) -> usize { self.data.get_array_memory_size() + mem::size_of_val(self) } @@ -923,9 +952,26 @@ mod tests { .len(2) .add_buffer(Buffer::from(&values[..])) .build(); - let fixed_size_binary_array = DecimalArray::from(array_data); - assert_eq!(8_887_000_000, fixed_size_binary_array.value(0)); - assert_eq!(-8_887_000_000, fixed_size_binary_array.value(1)); - assert_eq!(10, fixed_size_binary_array.value_length()); + let decimal_array = DecimalArray::from(array_data); + assert_eq!(8_887_000_000, decimal_array.value(0)); + assert_eq!(-8_887_000_000, decimal_array.value(1)); + assert_eq!(10, decimal_array.value_length()); + } + + #[test] + fn test_decimal_array_fmt_debug() { + let values: [u8; 20] = [ + 0, 0, 0, 0, 0, 2, 17, 180, 219, 192, 255, 255, 255, 255, 255, 253, 238, 75, + 36, 64, + ]; + let array_data = ArrayData::builder(DataType::Decimal(23, 6)) + .len(2) + .add_buffer(Buffer::from(&values[..])) + .build(); + let arr = DecimalArray::from(array_data); + assert_eq!( + "DecimalArray<23, 6>\n[\n 8887000000,\n -8887000000,\n]", + format!("{:?}", arr) + ); } } diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index c0300eeb510..b1b62dde37c 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -1285,6 +1285,11 @@ pub struct FixedSizeBinaryBuilder { builder: FixedSizeListBuilder, } +#[derive(Debug)] +pub struct DecimalBuilder { + builder: FixedSizeListBuilder, +} + impl ArrayBuilder for BinaryBuilder { /// Returns the builder as a non-mutable `Any` reference. fn as_any(&self) -> &Any { @@ -1663,6 +1668,92 @@ impl ArrayBuilder for FixedSizeBinaryBuilder { } } +impl ArrayBuilder for DecimalBuilder { + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &Any { + self + } + + /// Appends data from other arrays into the builder + /// + /// This is most useful when concatenating arrays of the same type into a builder. + fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> { + // validate arraydata and reserve memory + for array in data { + if array.data_type() != &self.data_type() { + return Err(ArrowError::InvalidArgumentError( + "Cannot append data to builder if data types are different" + .to_string(), + )); + } + if array.buffers().len() != 1 { + return Err(ArrowError::InvalidArgumentError( + "Decimal arrays should have 1 buffer".to_string(), + )); + } + } + for array in data { + // convert string to FixedSizeList to reuse list's append + let int_data = &array.buffers()[0]; + let int_data = Arc::new(ArrayData::new( + DataType::UInt8, + int_data.len(), + None, + None, + 0, + vec![int_data.clone()], + vec![], + )) as ArrayDataRef; + let list_data = Arc::new(ArrayData::new( + DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt8, true)), + self.builder.list_len, + ), + array.len(), + None, + array.null_buffer().cloned(), + array.offset(), + vec![], + vec![int_data], + )); + self.builder.append_data(&[list_data])?; + } + Ok(()) + } + + /// Returns the data type of the builder + /// + /// This is used for validating array data types in `append_data` + fn data_type(&self) -> DataType { + DataType::Decimal(23, 6) + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.builder.len() + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.builder.is_empty() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } +} + impl BinaryBuilder { /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values /// array @@ -1882,6 +1973,43 @@ impl FixedSizeBinaryBuilder { } } +impl DecimalBuilder { + /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values + /// array + pub fn new(capacity: usize, byte_width: i32) -> Self { + let values_builder = UInt8Builder::new(capacity); + Self { + builder: FixedSizeListBuilder::new(values_builder, byte_width), + } + } + + /// Appends a byte slice into the builder. + /// + /// Automatically calls the `append` method to delimit the slice appended in as a + /// distinct array element. + pub fn append_value(&mut self, value: &[u8]) -> Result<()> { + if self.builder.value_length() != value.len() as i32 { + return Err(ArrowError::InvalidArgumentError( + "Byte slice does not have the same length as DecimalBuilder value lengths".to_string() + )); + } + self.builder.values().append_slice(value)?; + self.builder.append(true) + } + + /// Append a null value to the array. + pub fn append_null(&mut self) -> Result<()> { + let length: usize = self.builder.value_length() as usize; + self.builder.values().append_slice(&vec![0u8; length][..])?; + self.builder.append(false) + } + + /// Builds the `DecimalArray` and reset this builder. + pub fn finish(&mut self) -> DecimalArray { + DecimalArray::from(self.builder.finish()) + } +} + /// Array builder for Struct types. /// /// Note that callers should make sure that methods of all the child field builders are @@ -2024,6 +2152,9 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { DataType::FixedSizeBinary(len) => { Box::new(FixedSizeBinaryBuilder::new(capacity, *len)) } + DataType::Decimal(precision, _) => { + Box::new(DecimalBuilder::new(capacity, (10.0_f64.powi(*precision as i32).log2() / 8.0).ceil() as i32)) + } DataType::Utf8 => Box::new(StringBuilder::new(capacity)), DataType::Date32(DateUnit::Day) => Box::new(Date32Builder::new(capacity)), DataType::Date64(DateUnit::Millisecond) => Box::new(Date64Builder::new(capacity)), diff --git a/rust/arrow/src/array/equal/mod.rs b/rust/arrow/src/array/equal/mod.rs index 134201a8137..3d2e242285b 100644 --- a/rust/arrow/src/array/equal/mod.rs +++ b/rust/arrow/src/array/equal/mod.rs @@ -230,7 +230,7 @@ mod tests { array::Array, ArrayDataRef, ArrayRef, BinaryOffsetSizeTrait, BooleanArray, FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, Int32Builder, ListBuilder, NullArray, PrimitiveBuilder, StringArray, StringDictionaryBuilder, - StringOffsetSizeTrait, StructArray, + StringOffsetSizeTrait, StructArray, DecimalBuilder }; use crate::array::{GenericStringArray, Int32Array}; use crate::datatypes::Int16Type; @@ -615,6 +615,78 @@ mod tests { test_equal(&a_slice, &b_slice, true); } + fn create_decimal_array, T: AsRef<[Option]>>( + data: T, + ) -> ArrayDataRef { + let mut builder = DecimalBuilder::new(20, 10); + + for d in data.as_ref() { + if let Some(v) = d { + builder.append_value(v.as_ref()).unwrap(); + } else { + builder.append_null().unwrap(); + } + } + builder.finish().data() + } + + #[test] + fn test_decimal_equal() { + let a = create_decimal_array(&[Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), Some([255, 255, 255, 255, 255, 253, 238, 75, 36, 64])]); + let b = create_decimal_array(&[Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), Some([255, 255, 255, 255, 255, 253, 238, 75, 36, 64])]); + test_equal(a.as_ref(), b.as_ref(), true); + + let b = create_decimal_array(&[Some([0, 0, 0, 0, 0, 3, 17, 180, 219, 192]), Some([255, 255, 255, 255, 255, 253, 238, 75, 36, 64])]); + test_equal(a.as_ref(), b.as_ref(), false); + } + + // Test the case where null_count > 0 + #[test] + fn test_decimal_null() { + let a = create_decimal_array(&[Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), None, Some([255, 255, 255, 255, 255, 253, 238, 75, 36, 64])]); + let b = create_decimal_array(&[Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), None, Some([255, 255, 255, 255, 255, 253, 238, 75, 36, 64])]); + test_equal(a.as_ref(), b.as_ref(), true); + + let b = create_decimal_array(&[Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), Some([255, 255, 255, 255, 255, 253, 238, 75, 36, 64]), None]); + test_equal(a.as_ref(), b.as_ref(), false); + + let b = create_decimal_array(&[Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), None, Some([0, 0, 0, 0, 0, 3, 17, 180, 219, 192])]); + test_equal(a.as_ref(), b.as_ref(), false); + } + + #[test] + fn test_decimal_offsets() { + // Test the case where offset != 0 + let a = create_decimal_array(&[ + Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), + None, + None, + Some([255, 255, 255, 255, 255, 253, 238, 75, 36, 64]), + None, + None, + ]); + let b = create_decimal_array(&[ + Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), + None, + None, + Some([0, 0, 0, 0, 0, 3, 17, 180, 219, 192]), + None, + None, + ]); + + let a_slice = a.slice(0, 3); + let b_slice = b.slice(0, 3); + test_equal(&a_slice, &b_slice, true); + + let a_slice = a.slice(0, 5); + let b_slice = b.slice(0, 5); + test_equal(&a_slice, &b_slice, false); + + let a_slice = a.slice(4, 1); + let b_slice = b.slice(4, 1); + test_equal(&a_slice, &b_slice, true); + } + /// Create a fixed size list of 2 value lengths fn create_fixed_size_list_array, T: AsRef<[Option]>>( data: T, diff --git a/rust/arrow/src/array/mod.rs b/rust/arrow/src/array/mod.rs index 03a4b70c72c..0fc227aff73 100644 --- a/rust/arrow/src/array/mod.rs +++ b/rust/arrow/src/array/mod.rs @@ -208,6 +208,7 @@ pub type DurationNanosecondBufferBuilder = BufferBuilder pub use self::builder::ArrayBuilder; pub use self::builder::BinaryBuilder; +pub use self::builder::DecimalBuilder; pub use self::builder::FixedSizeBinaryBuilder; pub use self::builder::FixedSizeListBuilder; pub use self::builder::LargeBinaryBuilder; From 1f9fc5783e8be67facf7522ac1e88cc323fbc6d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Sun, 15 Nov 2020 14:07:02 +0100 Subject: [PATCH 04/13] feat: construct decimal array from i128 values --- rust/arrow/src/array/builder.rs | 48 ++++++++++++++++++++++++++----- rust/arrow/src/array/equal/mod.rs | 32 ++++++++++----------- 2 files changed, 57 insertions(+), 23 deletions(-) diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index b1b62dde37c..ab0e487f6a4 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -1288,6 +1288,8 @@ pub struct FixedSizeBinaryBuilder { #[derive(Debug)] pub struct DecimalBuilder { builder: FixedSizeListBuilder, + precision: usize, + scale: usize } impl ArrayBuilder for BinaryBuilder { @@ -1725,7 +1727,7 @@ impl ArrayBuilder for DecimalBuilder { /// /// This is used for validating array data types in `append_data` fn data_type(&self) -> DataType { - DataType::Decimal(23, 6) + DataType::Decimal(self.precision, self.scale) } /// Returns the builder as a mutable `Any` reference. @@ -1976,10 +1978,13 @@ impl FixedSizeBinaryBuilder { impl DecimalBuilder { /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values /// array - pub fn new(capacity: usize, byte_width: i32) -> Self { + pub fn new(capacity: usize, precision: usize, scale: usize) -> Self { let values_builder = UInt8Builder::new(capacity); + let byte_width = (10.0_f64.powi(precision as i32).log2() / 8.0).ceil() as i32; Self { builder: FixedSizeListBuilder::new(values_builder, byte_width), + precision, + scale } } @@ -1987,16 +1992,23 @@ impl DecimalBuilder { /// /// Automatically calls the `append` method to delimit the slice appended in as a /// distinct array element. - pub fn append_value(&mut self, value: &[u8]) -> Result<()> { - if self.builder.value_length() != value.len() as i32 { + pub fn append_value(&mut self, value: i128) -> Result<()> { + let value_as_bytes = Self::from_i128_to_fixed_size_bytes(value, self.builder.value_length() as usize); + if self.builder.value_length() != value_as_bytes.len() as i32 { return Err(ArrowError::InvalidArgumentError( "Byte slice does not have the same length as DecimalBuilder value lengths".to_string() )); } - self.builder.values().append_slice(value)?; + self.builder.values().append_slice(value_as_bytes.as_slice())?; self.builder.append(true) } + fn from_i128_to_fixed_size_bytes(v: i128, size: usize) -> Vec { + let res = v.to_be_bytes(); + let start_byte = 16 - size; + res[start_byte..16].to_vec() + } + /// Append a null value to the array. pub fn append_null(&mut self) -> Result<()> { let length: usize = self.builder.value_length() as usize; @@ -2152,8 +2164,8 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { DataType::FixedSizeBinary(len) => { Box::new(FixedSizeBinaryBuilder::new(capacity, *len)) } - DataType::Decimal(precision, _) => { - Box::new(DecimalBuilder::new(capacity, (10.0_f64.powi(*precision as i32).log2() / 8.0).ceil() as i32)) + DataType::Decimal(precision, scale) => { + Box::new(DecimalBuilder::new(capacity, *precision, *scale)) } DataType::Utf8 => Box::new(StringBuilder::new(capacity)), DataType::Date32(DateUnit::Day) => Box::new(Date32Builder::new(capacity)), @@ -3317,6 +3329,28 @@ mod tests { assert_eq!(5, fixed_size_binary_array.value_length()); } + #[test] + fn test_decimal_builder() { + let mut builder = DecimalBuilder::new(30, 23, 6); + let val_1 = 8_887_000_000; + let val_2 = -8_887_000_000; + + // [b"hello", null, "arrow"] + builder.append_value(val_1).unwrap(); + builder.append_null().unwrap(); + builder.append_value(val_2).unwrap(); + let decimal_array: DecimalArray = builder.finish(); + + assert_eq!( + &DataType::Decimal(23, 6), + decimal_array.data_type() + ); + assert_eq!(3, decimal_array.len()); + assert_eq!(1, decimal_array.null_count()); + assert_eq!(20, decimal_array.value_offset(2)); + assert_eq!(10, decimal_array.value_length()); + } + #[test] fn test_string_array_builder_finish() { let mut builder = StringBuilder::new(10); diff --git a/rust/arrow/src/array/equal/mod.rs b/rust/arrow/src/array/equal/mod.rs index 3d2e242285b..191625a938e 100644 --- a/rust/arrow/src/array/equal/mod.rs +++ b/rust/arrow/src/array/equal/mod.rs @@ -615,14 +615,14 @@ mod tests { test_equal(&a_slice, &b_slice, true); } - fn create_decimal_array, T: AsRef<[Option]>>( - data: T, + fn create_decimal_array( + data: &[Option], ) -> ArrayDataRef { - let mut builder = DecimalBuilder::new(20, 10); + let mut builder = DecimalBuilder::new(20, 23, 6); - for d in data.as_ref() { + for d in data { if let Some(v) = d { - builder.append_value(v.as_ref()).unwrap(); + builder.append_value(*v).unwrap(); } else { builder.append_null().unwrap(); } @@ -632,25 +632,25 @@ mod tests { #[test] fn test_decimal_equal() { - let a = create_decimal_array(&[Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), Some([255, 255, 255, 255, 255, 253, 238, 75, 36, 64])]); - let b = create_decimal_array(&[Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), Some([255, 255, 255, 255, 255, 253, 238, 75, 36, 64])]); + let a = create_decimal_array(&[Some(8_887_000_000), Some(-8_887_000_000)]); + let b = create_decimal_array(&[Some(8_887_000_000), Some(-8_887_000_000)]); test_equal(a.as_ref(), b.as_ref(), true); - let b = create_decimal_array(&[Some([0, 0, 0, 0, 0, 3, 17, 180, 219, 192]), Some([255, 255, 255, 255, 255, 253, 238, 75, 36, 64])]); + let b = create_decimal_array(&[Some(15_887_000_000), Some(-8_887_000_000)]); test_equal(a.as_ref(), b.as_ref(), false); } // Test the case where null_count > 0 #[test] fn test_decimal_null() { - let a = create_decimal_array(&[Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), None, Some([255, 255, 255, 255, 255, 253, 238, 75, 36, 64])]); - let b = create_decimal_array(&[Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), None, Some([255, 255, 255, 255, 255, 253, 238, 75, 36, 64])]); + let a = create_decimal_array(&[Some(8_887_000_000), None, Some(-8_887_000_000)]); + let b = create_decimal_array(&[Some(8_887_000_000), None, Some(-8_887_000_000)]); test_equal(a.as_ref(), b.as_ref(), true); - let b = create_decimal_array(&[Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), Some([255, 255, 255, 255, 255, 253, 238, 75, 36, 64]), None]); + let b = create_decimal_array(&[Some(8_887_000_000), Some(-8_887_000_000), None]); test_equal(a.as_ref(), b.as_ref(), false); - let b = create_decimal_array(&[Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), None, Some([0, 0, 0, 0, 0, 3, 17, 180, 219, 192])]); + let b = create_decimal_array(&[Some(15_887_000_000), None, Some(-8_887_000_000)]); test_equal(a.as_ref(), b.as_ref(), false); } @@ -658,18 +658,18 @@ mod tests { fn test_decimal_offsets() { // Test the case where offset != 0 let a = create_decimal_array(&[ - Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), + Some(8_887_000_000), None, None, - Some([255, 255, 255, 255, 255, 253, 238, 75, 36, 64]), + Some(-8_887_000_000), None, None, ]); let b = create_decimal_array(&[ - Some([0, 0, 0, 0, 0, 2, 17, 180, 219, 192]), + Some(8_887_000_000), None, None, - Some([0, 0, 0, 0, 0, 3, 17, 180, 219, 192]), + Some(15_887_000_000), None, None, ]); From 069e90a6724d8dda44e30eee3ad84c577419549c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Sun, 15 Nov 2020 15:54:59 +0100 Subject: [PATCH 05/13] refactor: explicit conversion from fixed size list array to decimal array --- rust/arrow/src/array/array.rs | 2 +- rust/arrow/src/array/array_binary.rs | 53 +++++++++++++--------------- rust/arrow/src/array/builder.rs | 2 +- 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs index 31a67a86940..4ff5a8dd2b2 100644 --- a/rust/arrow/src/array/array.rs +++ b/rust/arrow/src/array/array.rs @@ -307,7 +307,7 @@ pub fn make_array(data: ArrayDataRef) -> ArrayRef { dt => panic!("Unexpected dictionary key type {:?}", dt), }, DataType::Null => Arc::new(NullArray::from(data)) as ArrayRef, - DataType::Decimal(_, _) => Arc::new(FixedSizeBinaryArray::from(data)) as ArrayRef, + DataType::Decimal(_, _) => Arc::new(DecimalArray::from(data)) as ArrayRef, dt => panic!("Unexpected data type {:?}", dt), } } diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs index 5bf7c6f685e..371d6dec5ec 100644 --- a/rust/arrow/src/array/array_binary.rs +++ b/rust/arrow/src/array/array_binary.rs @@ -499,34 +499,8 @@ impl DecimalArray { fn value_offset_at(&self, i: usize) -> i32 { self.length * i as i32 } -} -impl From for DecimalArray { - fn from(data: ArrayDataRef) -> Self { - assert_eq!( - data.buffers().len(), - 1, - "DecimalArray data should contain 1 buffer only (values)" - ); - let value_data = data.buffers()[0].raw_data(); - let (precision, scale) = match data.data_type() { - DataType::Decimal(precision, scale) => (*precision, *scale), - _ => panic!("Expected data type to be Decimal"), - }; - let length = (10.0_f64.powi(precision as i32).log2() / 8.0).ceil() as i32; - Self { - data, - value_data: RawPtrBox::new(value_data), - precision, - scale, - length, - } - } -} - -/// Creates a `FixedSizeBinaryArray` from `FixedSizeList` array -impl From for DecimalArray { - fn from(v: FixedSizeListArray) -> Self { + pub fn from_fixed_size_list_array(v: FixedSizeListArray, precision: usize, scale: usize) -> Self { assert_eq!( v.data_ref().child_data()[0].child_data().len(), 0, @@ -539,7 +513,7 @@ impl From for DecimalArray { "DecimalArray can only be created from FixedSizeList arrays, mismatched data types." ); - let mut builder = ArrayData::builder(DataType::Decimal(23, 6)) + let mut builder = ArrayData::builder(DataType::Decimal(precision, scale)) .len(v.len()) .add_buffer(v.data_ref().child_data()[0].buffers()[0].clone()); if let Some(bitmap) = v.data_ref().null_bitmap() { @@ -553,6 +527,29 @@ impl From for DecimalArray { } } +impl From for DecimalArray { + fn from(data: ArrayDataRef) -> Self { + assert_eq!( + data.buffers().len(), + 1, + "DecimalArray data should contain 1 buffer only (values)" + ); + let value_data = data.buffers()[0].raw_data(); + let (precision, scale) = match data.data_type() { + DataType::Decimal(precision, scale) => (*precision, *scale), + _ => panic!("Expected data type to be Decimal"), + }; + let length = (10.0_f64.powi(precision as i32).log2() / 8.0).ceil() as i32; + Self { + data, + value_data: RawPtrBox::new(value_data), + precision, + scale, + length, + } + } +} + impl fmt::Debug for DecimalArray { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "DecimalArray<{}, {}>\n[\n", self.precision, self.scale)?; diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index ab0e487f6a4..eed7a79272c 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -2018,7 +2018,7 @@ impl DecimalBuilder { /// Builds the `DecimalArray` and reset this builder. pub fn finish(&mut self) -> DecimalArray { - DecimalArray::from(self.builder.finish()) + DecimalArray::from_fixed_size_list_array(self.builder.finish(), self.precision, self.scale) } } From 2cdfccb86bf3addf0f1b5112fe27230801b897de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Sun, 15 Nov 2020 17:21:40 +0100 Subject: [PATCH 06/13] refactor: remove unwrap in equality check and add test --- rust/arrow/src/array/array_binary.rs | 8 ++- rust/arrow/src/array/builder.rs | 2 +- rust/arrow/src/array/equal_json.rs | 84 +++++++++++++++++++++++++++- 3 files changed, 91 insertions(+), 3 deletions(-) diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs index 371d6dec5ec..613b31753a3 100644 --- a/rust/arrow/src/array/array_binary.rs +++ b/rust/arrow/src/array/array_binary.rs @@ -466,6 +466,7 @@ impl DecimalArray { } fn from_bytes_to_i128(b: &[u8]) -> i128 { + assert!(b.len() <= 16, "DecimalArray supports only up to size 16"); let first_bit = b[0] & 128u8 == 128u8; let mut result = if first_bit { [255u8; 16] } else { [0u8; 16] }; for (i, v) in b.iter().enumerate() { @@ -474,6 +475,11 @@ impl DecimalArray { i128::from_be_bytes(result) } + /// Returns the byte size per value for Decimal arrays with a given precision + pub fn calc_fixed_byte_size(precision: usize) -> i32 { + (10.0_f64.powi(precision as i32).log2() / 8.0).ceil() as i32 + } + /// Returns the offset for the element at index `i`. /// /// Note this doesn't do any bound checking, for performance reason. @@ -539,7 +545,7 @@ impl From for DecimalArray { DataType::Decimal(precision, scale) => (*precision, *scale), _ => panic!("Expected data type to be Decimal"), }; - let length = (10.0_f64.powi(precision as i32).log2() / 8.0).ceil() as i32; + let length = Self::calc_fixed_byte_size(precision); Self { data, value_data: RawPtrBox::new(value_data), diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index eed7a79272c..677d4985681 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -1980,7 +1980,7 @@ impl DecimalBuilder { /// array pub fn new(capacity: usize, precision: usize, scale: usize) -> Self { let values_builder = UInt8Builder::new(capacity); - let byte_width = (10.0_f64.powi(precision as i32).log2() / 8.0).ceil() as i32; + let byte_width = DecimalArray::calc_fixed_byte_size(precision); Self { builder: FixedSizeListBuilder::new(values_builder, byte_width), precision, diff --git a/rust/arrow/src/array/equal_json.rs b/rust/arrow/src/array/equal_json.rs index f2cf52eb81d..6531082d01d 100644 --- a/rust/arrow/src/array/equal_json.rs +++ b/rust/arrow/src/array/equal_json.rs @@ -332,7 +332,7 @@ impl JsonEqual for DecimalArray { (0..self.len()).all(|i| match json[i] { JString(s) => { - self.is_valid(i) && (s.parse::().unwrap() == self.value(i)) + self.is_valid(i) && (s.parse::().map_or_else(|_| false, |v| v == self.value(i))) } JNull => self.is_null(i), _ => false, @@ -880,6 +880,88 @@ mod tests { assert!(json_array.ne(&arrow_array)); } + #[test] + fn test_decimal_json_equal() { + // Test the equal case + let mut builder = DecimalBuilder::new(30, 23, 6); + builder.append_value(1_000).unwrap(); + builder.append_null().unwrap(); + builder.append_value(-250).unwrap(); + let arrow_array: DecimalArray = builder.finish(); + let json_array: Value = serde_json::from_str( + r#" + [ + "1000", + null, + "-250" + ] + "#, + ) + .unwrap(); + println!("{:?}", arrow_array); + assert!(arrow_array.eq(&json_array)); + assert!(json_array.eq(&arrow_array)); + + // Test unequal case + builder.append_value(1_000).unwrap(); + builder.append_null().unwrap(); + builder.append_value(55).unwrap(); + let arrow_array: DecimalArray = builder.finish(); + let json_array: Value = serde_json::from_str( + r#" + [ + "1000", + null, + "-250" + ] + "#, + ) + .unwrap(); + assert!(arrow_array.ne(&json_array)); + assert!(json_array.ne(&arrow_array)); + + // Test unequal length case + let json_array: Value = serde_json::from_str( + r#" + [ + "1000", + null, + null, + "55" + ] + "#, + ) + .unwrap(); + assert!(arrow_array.ne(&json_array)); + assert!(json_array.ne(&arrow_array)); + + // Test incorrect type case + let json_array: Value = serde_json::from_str( + r#" + { + "a": 1 + } + "#, + ) + .unwrap(); + assert!(arrow_array.ne(&json_array)); + assert!(json_array.ne(&arrow_array)); + + // Test incorrect value type case + let json_array: Value = serde_json::from_str( + r#" + [ + "hello", + null, + 1 + ] + "#, + ) + .unwrap(); + assert!(arrow_array.ne(&json_array)); + assert!(json_array.ne(&arrow_array)); + } + #[test] fn test_struct_json_equal() { let strings: ArrayRef = Arc::new(StringArray::from(vec![ From 31754804145b2360ce997fef777674055f149ebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Sun, 15 Nov 2020 17:25:38 +0100 Subject: [PATCH 07/13] chore: formatting --- rust/arrow/src/array/array_binary.rs | 6 +++++- rust/arrow/src/array/builder.rs | 26 ++++++++++++++++---------- rust/arrow/src/array/equal/decimal.rs | 4 +++- rust/arrow/src/array/equal/mod.rs | 20 ++++++++------------ rust/arrow/src/array/equal_json.rs | 5 ++++- rust/arrow/src/error.rs | 4 +++- 6 files changed, 39 insertions(+), 26 deletions(-) diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs index 613b31753a3..15d6ccd0045 100644 --- a/rust/arrow/src/array/array_binary.rs +++ b/rust/arrow/src/array/array_binary.rs @@ -506,7 +506,11 @@ impl DecimalArray { self.length * i as i32 } - pub fn from_fixed_size_list_array(v: FixedSizeListArray, precision: usize, scale: usize) -> Self { + pub fn from_fixed_size_list_array( + v: FixedSizeListArray, + precision: usize, + scale: usize, + ) -> Self { assert_eq!( v.data_ref().child_data()[0].child_data().len(), 0, diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index 677d4985681..b95c1be9ed8 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -1289,7 +1289,7 @@ pub struct FixedSizeBinaryBuilder { pub struct DecimalBuilder { builder: FixedSizeListBuilder, precision: usize, - scale: usize + scale: usize, } impl ArrayBuilder for BinaryBuilder { @@ -1984,7 +1984,7 @@ impl DecimalBuilder { Self { builder: FixedSizeListBuilder::new(values_builder, byte_width), precision, - scale + scale, } } @@ -1993,13 +1993,18 @@ impl DecimalBuilder { /// Automatically calls the `append` method to delimit the slice appended in as a /// distinct array element. pub fn append_value(&mut self, value: i128) -> Result<()> { - let value_as_bytes = Self::from_i128_to_fixed_size_bytes(value, self.builder.value_length() as usize); + let value_as_bytes = Self::from_i128_to_fixed_size_bytes( + value, + self.builder.value_length() as usize, + ); if self.builder.value_length() != value_as_bytes.len() as i32 { return Err(ArrowError::InvalidArgumentError( "Byte slice does not have the same length as DecimalBuilder value lengths".to_string() )); } - self.builder.values().append_slice(value_as_bytes.as_slice())?; + self.builder + .values() + .append_slice(value_as_bytes.as_slice())?; self.builder.append(true) } @@ -2018,7 +2023,11 @@ impl DecimalBuilder { /// Builds the `DecimalArray` and reset this builder. pub fn finish(&mut self) -> DecimalArray { - DecimalArray::from_fixed_size_list_array(self.builder.finish(), self.precision, self.scale) + DecimalArray::from_fixed_size_list_array( + self.builder.finish(), + self.precision, + self.scale, + ) } } @@ -3333,7 +3342,7 @@ mod tests { fn test_decimal_builder() { let mut builder = DecimalBuilder::new(30, 23, 6); let val_1 = 8_887_000_000; - let val_2 = -8_887_000_000; + let val_2 = -8_887_000_000; // [b"hello", null, "arrow"] builder.append_value(val_1).unwrap(); @@ -3341,10 +3350,7 @@ mod tests { builder.append_value(val_2).unwrap(); let decimal_array: DecimalArray = builder.finish(); - assert_eq!( - &DataType::Decimal(23, 6), - decimal_array.data_type() - ); + assert_eq!(&DataType::Decimal(23, 6), decimal_array.data_type()); assert_eq!(3, decimal_array.len()); assert_eq!(1, decimal_array.null_count()); assert_eq!(20, decimal_array.value_offset(2)); diff --git a/rust/arrow/src/array/equal/decimal.rs b/rust/arrow/src/array/equal/decimal.rs index 14bafee222d..fabcf0d2bf2 100644 --- a/rust/arrow/src/array/equal/decimal.rs +++ b/rust/arrow/src/array/equal/decimal.rs @@ -27,7 +27,9 @@ pub(super) fn decimal_equal( len: usize, ) -> bool { let size = match lhs.data_type() { - DataType::Decimal(p, _) => (10.0_f64.powi(*p as i32).log2() / 8.0).ceil() as usize, + DataType::Decimal(p, _) => { + (10.0_f64.powi(*p as i32).log2() / 8.0).ceil() as usize + } _ => unreachable!(), }; diff --git a/rust/arrow/src/array/equal/mod.rs b/rust/arrow/src/array/equal/mod.rs index 191625a938e..20e54a5fa6b 100644 --- a/rust/arrow/src/array/equal/mod.rs +++ b/rust/arrow/src/array/equal/mod.rs @@ -20,9 +20,9 @@ //! depend on dynamic casting of `Array`. use super::{ - Array, ArrayData, BinaryOffsetSizeTrait, FixedSizeBinaryArray, GenericBinaryArray, - GenericListArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, - StringOffsetSizeTrait, StructArray, DecimalArray + Array, ArrayData, BinaryOffsetSizeTrait, DecimalArray, FixedSizeBinaryArray, + GenericBinaryArray, GenericListArray, GenericStringArray, OffsetSizeTrait, + PrimitiveArray, StringOffsetSizeTrait, StructArray, }; use crate::datatypes::{ArrowPrimitiveType, DataType, IntervalUnit}; @@ -151,9 +151,7 @@ fn equal_values( DataType::FixedSizeBinary(_) => { fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len) } - DataType::Decimal(_, _) => { - decimal_equal(lhs, rhs, lhs_start, rhs_start, len) - } + DataType::Decimal(_, _) => decimal_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::FixedSizeList(_, _) => { @@ -228,9 +226,9 @@ mod tests { use crate::array::{ array::Array, ArrayDataRef, ArrayRef, BinaryOffsetSizeTrait, BooleanArray, - FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, Int32Builder, - ListBuilder, NullArray, PrimitiveBuilder, StringArray, StringDictionaryBuilder, - StringOffsetSizeTrait, StructArray, DecimalBuilder + DecimalBuilder, FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, + Int32Builder, ListBuilder, NullArray, PrimitiveBuilder, StringArray, + StringDictionaryBuilder, StringOffsetSizeTrait, StructArray, }; use crate::array::{GenericStringArray, Int32Array}; use crate::datatypes::Int16Type; @@ -615,9 +613,7 @@ mod tests { test_equal(&a_slice, &b_slice, true); } - fn create_decimal_array( - data: &[Option], - ) -> ArrayDataRef { + fn create_decimal_array(data: &[Option]) -> ArrayDataRef { let mut builder = DecimalBuilder::new(20, 23, 6); for d in data { diff --git a/rust/arrow/src/array/equal_json.rs b/rust/arrow/src/array/equal_json.rs index 6531082d01d..808cb5bb772 100644 --- a/rust/arrow/src/array/equal_json.rs +++ b/rust/arrow/src/array/equal_json.rs @@ -332,7 +332,10 @@ impl JsonEqual for DecimalArray { (0..self.len()).all(|i| match json[i] { JString(s) => { - self.is_valid(i) && (s.parse::().map_or_else(|_| false, |v| v == self.value(i))) + self.is_valid(i) + && (s + .parse::() + .map_or_else(|_| false, |v| v == self.value(i))) } JNull => self.is_null(i), _ => false, diff --git a/rust/arrow/src/error.rs b/rust/arrow/src/error.rs index 1ffcf6ce88d..51b614ae44c 100644 --- a/rust/arrow/src/error.rs +++ b/rust/arrow/src/error.rs @@ -40,7 +40,9 @@ pub enum ArrowError { impl ArrowError { /// Wraps an external error in an `ArrowError`. - pub fn from_external_error(error: Box) -> Self { + pub fn from_external_error( + error: Box, + ) -> Self { Self::ExternalError(error) } } From d534cfe59ada16a7de192efb967ce77ca8aa5b52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Sun, 15 Nov 2020 17:32:38 +0100 Subject: [PATCH 08/13] refactor: reuse calc method for byte size --- rust/arrow/src/array/equal/decimal.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/arrow/src/array/equal/decimal.rs b/rust/arrow/src/array/equal/decimal.rs index fabcf0d2bf2..497a9b2976c 100644 --- a/rust/arrow/src/array/equal/decimal.rs +++ b/rust/arrow/src/array/equal/decimal.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::ArrayData, datatypes::DataType}; +use crate::{array::ArrayData, array::DecimalArray, datatypes::DataType}; use super::utils::equal_len; @@ -27,8 +27,8 @@ pub(super) fn decimal_equal( len: usize, ) -> bool { let size = match lhs.data_type() { - DataType::Decimal(p, _) => { - (10.0_f64.powi(*p as i32).log2() / 8.0).ceil() as usize + DataType::Decimal(precision, _) => { + DecimalArray::calc_fixed_byte_size(*precision) as usize } _ => unreachable!(), }; From f0a36bbd9b523dbe19e5b061ac1bfe24513214cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Sun, 15 Nov 2020 17:42:37 +0100 Subject: [PATCH 09/13] refactor: error handling for i128->byte --- rust/arrow/src/array/builder.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index b95c1be9ed8..5e97ca14dac 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -1996,7 +1996,7 @@ impl DecimalBuilder { let value_as_bytes = Self::from_i128_to_fixed_size_bytes( value, self.builder.value_length() as usize, - ); + )?; if self.builder.value_length() != value_as_bytes.len() as i32 { return Err(ArrowError::InvalidArgumentError( "Byte slice does not have the same length as DecimalBuilder value lengths".to_string() @@ -2008,10 +2008,15 @@ impl DecimalBuilder { self.builder.append(true) } - fn from_i128_to_fixed_size_bytes(v: i128, size: usize) -> Vec { + fn from_i128_to_fixed_size_bytes(v: i128, size: usize) -> Result> { + if size > 16 { + return Err(ArrowError::InvalidArgumentError( + "DecimalBuilder only supports values up to 16 bytes.".to_string(), + )); + } let res = v.to_be_bytes(); let start_byte = 16 - size; - res[start_byte..16].to_vec() + Ok(res[start_byte..16].to_vec()) } /// Append a null value to the array. From 2c2b9868720cb735de98fe85fb39c505d9c4c5ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Sun, 15 Nov 2020 17:47:18 +0100 Subject: [PATCH 10/13] chore: clean up --- rust/arrow/src/array/builder.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index 5e97ca14dac..64d3e5d1fb6 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -3346,13 +3346,10 @@ mod tests { #[test] fn test_decimal_builder() { let mut builder = DecimalBuilder::new(30, 23, 6); - let val_1 = 8_887_000_000; - let val_2 = -8_887_000_000; - // [b"hello", null, "arrow"] - builder.append_value(val_1).unwrap(); + builder.append_value(8_887_000_000).unwrap(); builder.append_null().unwrap(); - builder.append_value(val_2).unwrap(); + builder.append_value(-8_887_000_000).unwrap(); let decimal_array: DecimalArray = builder.finish(); assert_eq!(&DataType::Decimal(23, 6), decimal_array.data_type()); From 4fafe186ab18e4f8c8498a527288841fdc28266a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Tue, 17 Nov 2020 07:56:06 +0100 Subject: [PATCH 11/13] test: add equality check with leading null --- rust/arrow/src/array/equal/mod.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/rust/arrow/src/array/equal/mod.rs b/rust/arrow/src/array/equal/mod.rs index 20e54a5fa6b..62fe7d66aed 100644 --- a/rust/arrow/src/array/equal/mod.rs +++ b/rust/arrow/src/array/equal/mod.rs @@ -681,6 +681,22 @@ mod tests { let a_slice = a.slice(4, 1); let b_slice = b.slice(4, 1); test_equal(&a_slice, &b_slice, true); + + let a_slice = a.slice(1, 3); + let b_slice = b.slice(1, 3); + test_equal(&a_slice, &b_slice, false); + + let b = create_decimal_array(&[ + None, + None, + None, + Some(-8_887_000_000), + Some(-3_000), + None, + ]); + let a_slice = a.slice(1, 3); + let b_slice = b.slice(1, 3); + test_equal(&a_slice, &b_slice, true); } /// Create a fixed size list of 2 value lengths From 2fa1981b24b3651cb735ee3c247ebc8e68de68c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Tue, 17 Nov 2020 18:21:24 +0100 Subject: [PATCH 12/13] fix: failing tests with offsets --- rust/arrow/src/array/equal/mod.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/rust/arrow/src/array/equal/mod.rs b/rust/arrow/src/array/equal/mod.rs index 62fe7d66aed..76d583f9b1f 100644 --- a/rust/arrow/src/array/equal/mod.rs +++ b/rust/arrow/src/array/equal/mod.rs @@ -611,6 +611,10 @@ mod tests { let a_slice = a.slice(4, 1); let b_slice = b.slice(4, 1); test_equal(&a_slice, &b_slice, true); + + let a_slice = a.slice(3, 3); + let b_slice = b.slice(3, 3); + test_equal(&a_slice, &b_slice, false); } fn create_decimal_array(data: &[Option]) -> ArrayDataRef { @@ -682,6 +686,10 @@ mod tests { let b_slice = b.slice(4, 1); test_equal(&a_slice, &b_slice, true); + let a_slice = a.slice(3, 3); + let b_slice = b.slice(3, 3); + test_equal(&a_slice, &b_slice, false); + let a_slice = a.slice(1, 3); let b_slice = b.slice(1, 3); test_equal(&a_slice, &b_slice, false); From 2cbe5cd84e7a137bbdc369e05140e19b6da0840e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Wed, 18 Nov 2020 07:32:59 +0100 Subject: [PATCH 13/13] fix: correct equality check with offsets --- rust/arrow/src/array/equal/decimal.rs | 4 ++-- rust/arrow/src/array/equal/mod.rs | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/rust/arrow/src/array/equal/decimal.rs b/rust/arrow/src/array/equal/decimal.rs index 497a9b2976c..d8534b825ac 100644 --- a/rust/arrow/src/array/equal/decimal.rs +++ b/rust/arrow/src/array/equal/decimal.rs @@ -33,8 +33,8 @@ pub(super) fn decimal_equal( _ => unreachable!(), }; - let lhs_values = lhs.buffer::(0); - let rhs_values = rhs.buffer::(0); + let lhs_values = &lhs.buffers()[0].data()[lhs.offset() * size..]; + let rhs_values = &rhs.buffers()[0].data()[rhs.offset() * size..]; if lhs.null_count() == 0 && rhs.null_count() == 0 { equal_len( diff --git a/rust/arrow/src/array/equal/mod.rs b/rust/arrow/src/array/equal/mod.rs index 76d583f9b1f..920235802c6 100644 --- a/rust/arrow/src/array/equal/mod.rs +++ b/rust/arrow/src/array/equal/mod.rs @@ -611,10 +611,6 @@ mod tests { let a_slice = a.slice(4, 1); let b_slice = b.slice(4, 1); test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(3, 3); - let b_slice = b.slice(3, 3); - test_equal(&a_slice, &b_slice, false); } fn create_decimal_array(data: &[Option]) -> ArrayDataRef {