From 8bf9024f4933849a9d04990d47e1d7ff0f29e7bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Fri, 27 Nov 2020 16:50:22 +0100 Subject: [PATCH 1/9] feat: first working ipc test --- rust/arrow/src/array/array_binary.rs | 25 +++++++++++-------------- rust/arrow/src/array/builder.rs | 8 ++++---- rust/arrow/src/array/equal/decimal.rs | 6 +++--- rust/arrow/src/array/equal_json.rs | 1 - rust/arrow/src/datatypes.rs | 16 ++++++++++++++++ rust/arrow/src/ipc/convert.rs | 4 ++++ rust/arrow/src/ipc/reader.rs | 14 ++++++++++++++ rust/arrow/src/util/integration_util.rs | 5 +++++ rust/parquet/src/arrow/schema.rs | 6 +++--- 9 files changed, 60 insertions(+), 25 deletions(-) diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs index 15d6ccd0045..18fbeff9cce 100644 --- a/rust/arrow/src/array/array_binary.rs +++ b/rust/arrow/src/array/array_binary.rs @@ -467,12 +467,14 @@ impl DecimalArray { fn from_bytes_to_i128(b: &[u8]) -> i128 { assert!(b.len() <= 16, "DecimalArray supports only up to size 16"); - let first_bit = b[0] & 128u8 == 128u8; + // println!("First as bytes: {:?}", (-11697 as i64).to_be_bytes()); + // println!("First as bytes: {:?}", (-11697 as i64).to_le_bytes()); + let first_bit = b[b.len() - 1] & 128u8 == 128u8; let mut result = if first_bit { [255u8; 16] } else { [0u8; 16] }; for (i, v) in b.iter().enumerate() { - result[i + (16 - b.len())] = *v; + result[i] = *v; } - i128::from_be_bytes(result) + i128::from_le_bytes(result) } /// Returns the byte size per value for Decimal arrays with a given precision @@ -549,7 +551,7 @@ impl From for DecimalArray { DataType::Decimal(precision, scale) => (*precision, *scale), _ => panic!("Expected data type to be Decimal"), }; - let length = Self::calc_fixed_byte_size(precision); + let length = 16; Self { data, value_data: RawPtrBox::new(value_data), @@ -950,11 +952,9 @@ mod tests { #[test] fn test_decimal_array() { - let values: [u8; 20] = [ - 0, 0, 0, 0, 0, 2, 17, 180, 219, 192, 255, 255, 255, 255, 255, 253, 238, 75, - 36, 64, - ]; - + // let val_8887: [u8; 16] = [192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + // let val_neg_8887: [u8; 16] = [64, 36, 75, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]; + let values: [u8; 32] = [192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]; let array_data = ArrayData::builder(DataType::Decimal(23, 6)) .len(2) .add_buffer(Buffer::from(&values[..])) @@ -962,15 +962,12 @@ mod tests { let decimal_array = DecimalArray::from(array_data); assert_eq!(8_887_000_000, decimal_array.value(0)); assert_eq!(-8_887_000_000, decimal_array.value(1)); - assert_eq!(10, decimal_array.value_length()); + assert_eq!(16, decimal_array.value_length()); } #[test] fn test_decimal_array_fmt_debug() { - let values: [u8; 20] = [ - 0, 0, 0, 0, 0, 2, 17, 180, 219, 192, 255, 255, 255, 255, 255, 253, 238, 75, - 36, 64, - ]; + let values: [u8; 32] = [192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]; let array_data = ArrayData::builder(DataType::Decimal(23, 6)) .len(2) .add_buffer(Buffer::from(&values[..])) diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index 08fcd6468b6..cbd24762126 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -1971,7 +1971,7 @@ impl DecimalBuilder { /// array pub fn new(capacity: usize, precision: usize, scale: usize) -> Self { let values_builder = UInt8Builder::new(capacity); - let byte_width = DecimalArray::calc_fixed_byte_size(precision); + let byte_width = 16; Self { builder: FixedSizeListBuilder::new(values_builder, byte_width), precision, @@ -2005,7 +2005,7 @@ impl DecimalBuilder { "DecimalBuilder only supports values up to 16 bytes.".to_string(), )); } - let res = v.to_be_bytes(); + let res = v.to_le_bytes(); let start_byte = 16 - size; Ok(res[start_byte..16].to_vec()) } @@ -3612,8 +3612,8 @@ mod tests { assert_eq!(&DataType::Decimal(23, 6), decimal_array.data_type()); assert_eq!(3, decimal_array.len()); assert_eq!(1, decimal_array.null_count()); - assert_eq!(20, decimal_array.value_offset(2)); - assert_eq!(10, decimal_array.value_length()); + assert_eq!(32, decimal_array.value_offset(2)); + assert_eq!(16, decimal_array.value_length()); } #[test] diff --git a/rust/arrow/src/array/equal/decimal.rs b/rust/arrow/src/array/equal/decimal.rs index d8534b825ac..75881778836 100644 --- a/rust/arrow/src/array/equal/decimal.rs +++ b/rust/arrow/src/array/equal/decimal.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::ArrayData, array::DecimalArray, datatypes::DataType}; +use crate::{array::ArrayData, datatypes::DataType}; use super::utils::equal_len; @@ -27,8 +27,8 @@ pub(super) fn decimal_equal( len: usize, ) -> bool { let size = match lhs.data_type() { - DataType::Decimal(precision, _) => { - DecimalArray::calc_fixed_byte_size(*precision) as usize + DataType::Decimal(_, _) => { + 16 } _ => unreachable!(), }; diff --git a/rust/arrow/src/array/equal_json.rs b/rust/arrow/src/array/equal_json.rs index 8016eb4c8d6..9c68c0dbf3c 100644 --- a/rust/arrow/src/array/equal_json.rs +++ b/rust/arrow/src/array/equal_json.rs @@ -900,7 +900,6 @@ mod tests { "#, ) .unwrap(); - println!("{:?}", arrow_array); assert!(arrow_array.eq(&json_array)); assert!(json_array.eq(&arrow_array)); diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 0a26d2e5fd2..abf9f98a180 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -895,6 +895,22 @@ impl DataType { )) } } + Some(s) if s == "decimal" => { + // return a list with any type as its child isn't defined in the map + let precision = match map.get("precision") { + Some(p) => Ok(p.as_u64().unwrap() as usize), + None => Err(ArrowError::ParseError( + "Expecting a precision for decimal".to_string(), + )) + }; + let scale = match map.get("scale") { + Some(s) => Ok(s.as_u64().unwrap() as usize), + _ => Err(ArrowError::ParseError( + "Expecting a scale for decimal".to_string(), + ))}; + + Ok(DataType::Decimal(precision?, scale?)) + } Some(s) if s == "floatingpoint" => match map.get("precision") { Some(p) if p == "HALF" => Ok(DataType::Float16), Some(p) if p == "SINGLE" => Ok(DataType::Float32), diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index 5c5544297a1..271dce21477 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -270,6 +270,10 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT DataType::Struct(fields) } + ipc::Type::Decimal => { + let fsb = field.type_as_decimal().unwrap(); + DataType::Decimal(fsb.precision() as usize, fsb.scale() as usize) + } t => unimplemented!("Type {:?} not supported", t), } } diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs index 76ad6b77cf3..ec3f5db03d2 100644 --- a/rust/arrow/src/ipc/reader.rs +++ b/rust/arrow/src/ipc/reader.rs @@ -337,6 +337,19 @@ fn create_primitive_array( } builder.build() } + Decimal(_, _) => { + // read 3 buffers + let mut builder = ArrayData::builder(data_type.clone()) + .len(length) + .buffers(buffers[1..2].to_vec()) + .offset(0); + if null_count > 0 { + builder = builder + .null_count(null_count) + .null_bit_buffer(buffers[0].clone()) + } + builder.build() + } t => panic!("Data type {:?} either unsupported or not primitive", t), }; @@ -978,6 +991,7 @@ mod tests { "generated_primitive_no_batches", "generated_primitive_zerolength", "generated_primitive", + "generated_decimal" ]; paths.iter().for_each(|path| { let file = File::open(format!( diff --git a/rust/arrow/src/util/integration_util.rs b/rust/arrow/src/util/integration_util.rs index 94d0a9b75a0..ee1895ca096 100644 --- a/rust/arrow/src/util/integration_util.rs +++ b/rust/arrow/src/util/integration_util.rs @@ -331,6 +331,11 @@ impl ArrowJsonBatch { let arr = arr.as_any().downcast_ref::().unwrap(); arr.equals_json(&json_array.iter().collect::>()[..]) } + DataType::Decimal(_, _) => { + let arr = + arr.as_any().downcast_ref::().unwrap(); + arr.equals_json(&json_array.iter().collect::>()[..]) + } DataType::Dictionary(ref key_type, _) => match key_type.as_ref() { DataType::Int8 => { let arr = arr diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs index c93325b79b1..9857487f231 100644 --- a/rust/parquet/src/arrow/schema.rs +++ b/rust/parquet/src/arrow/schema.rs @@ -26,7 +26,7 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use arrow::datatypes::{DataType, DateUnit, Field, Schema, TimeUnit}; +use arrow::{array::DecimalArray, datatypes::{DataType, DateUnit, Field, Schema, TimeUnit}}; use arrow::ipc::writer; use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; @@ -400,10 +400,10 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_length(*length) .build() } - DataType::Decimal(_, _) => { + DataType::Decimal(precision, _) => { Type::primitive_type_builder(name, PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(repetition) - .with_length(10) + .with_length(DecimalArray::calc_fixed_byte_size(*precision)) .build() } DataType::Utf8 | DataType::LargeUtf8 => { From 59cecd221fc0e7ce385f1096a65c4bbc311e4f8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Fri, 27 Nov 2020 17:01:30 +0100 Subject: [PATCH 2/9] chore: remove printlns --- rust/arrow/src/array/array_binary.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs index 18fbeff9cce..1642b002319 100644 --- a/rust/arrow/src/array/array_binary.rs +++ b/rust/arrow/src/array/array_binary.rs @@ -467,8 +467,6 @@ impl DecimalArray { fn from_bytes_to_i128(b: &[u8]) -> i128 { assert!(b.len() <= 16, "DecimalArray supports only up to size 16"); - // println!("First as bytes: {:?}", (-11697 as i64).to_be_bytes()); - // println!("First as bytes: {:?}", (-11697 as i64).to_le_bytes()); let first_bit = b[b.len() - 1] & 128u8 == 128u8; let mut result = if first_bit { [255u8; 16] } else { [0u8; 16] }; for (i, v) in b.iter().enumerate() { From 0394d7840ba2b49632f9a752442ae1a4845a5552 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Mon, 30 Nov 2020 11:00:00 +0100 Subject: [PATCH 3/9] feat: add ipc convert for decimal --- rust/arrow/src/ipc/convert.rs | 11 +++++++++++ rust/arrow/src/ipc/reader.rs | 1 + rust/arrow/src/ipc/writer.rs | 2 ++ 3 files changed, 14 insertions(+) diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index 271dce21477..c8be77d1654 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -566,6 +566,17 @@ pub(crate) fn get_fb_field_type<'a>( // type in the DictionaryEncoding metadata in the parent field get_fb_field_type(value_type, is_nullable, fbb) } + Decimal(precision, scale) => { + let mut builder = ipc::DecimalBuilder::new(fbb); + builder.add_precision(*precision as i32); + builder.add_scale(*scale as i32); + builder.add_bitWidth(128); + FBFieldType { + type_type: ipc::Type::Decimal, + type_: builder.finish().as_union_value(), + children: Some(fbb.create_vector(&empty_fields[..])), + } + } t => unimplemented!("Type {:?} not supported", t), } } diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs index ec3f5db03d2..c7694560d25 100644 --- a/rust/arrow/src/ipc/reader.rs +++ b/rust/arrow/src/ipc/reader.rs @@ -1020,6 +1020,7 @@ mod tests { "generated_primitive_no_batches", "generated_primitive_zerolength", "generated_primitive", + "generated_decimal" ]; paths.iter().for_each(|path| { let file = File::open(format!( diff --git a/rust/arrow/src/ipc/writer.rs b/rust/arrow/src/ipc/writer.rs index cf32eaa2096..d4747d9f5c0 100644 --- a/rust/arrow/src/ipc/writer.rs +++ b/rust/arrow/src/ipc/writer.rs @@ -821,6 +821,7 @@ mod tests { "generated_primitive_no_batches", "generated_primitive_zerolength", "generated_primitive", + "generated_decimal", ]; paths.iter().for_each(|path| { let file = File::open(format!( @@ -865,6 +866,7 @@ mod tests { "generated_primitive_no_batches", "generated_primitive_zerolength", "generated_primitive", + "generated_decimal", ]; paths.iter().for_each(|path| { let file = File::open(format!( From 0f8f22ea614fd9c43b0d70e0c5528f462d489773 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Mon, 30 Nov 2020 14:21:48 +0100 Subject: [PATCH 4/9] chore: reformat --- rust/arrow/src/array/array_binary.rs | 10 ++++++++-- rust/arrow/src/array/equal/decimal.rs | 4 +--- rust/arrow/src/datatypes.rs | 5 +++-- rust/arrow/src/ipc/reader.rs | 4 ++-- rust/arrow/src/util/integration_util.rs | 3 +-- rust/parquet/src/arrow/schema.rs | 5 ++++- 6 files changed, 19 insertions(+), 12 deletions(-) diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs index 1642b002319..cfaf1aaed11 100644 --- a/rust/arrow/src/array/array_binary.rs +++ b/rust/arrow/src/array/array_binary.rs @@ -952,7 +952,10 @@ mod tests { fn test_decimal_array() { // let val_8887: [u8; 16] = [192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; // let val_neg_8887: [u8; 16] = [64, 36, 75, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]; - let values: [u8; 32] = [192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]; + let values: [u8; 32] = [ + 192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ]; let array_data = ArrayData::builder(DataType::Decimal(23, 6)) .len(2) .add_buffer(Buffer::from(&values[..])) @@ -965,7 +968,10 @@ mod tests { #[test] fn test_decimal_array_fmt_debug() { - let values: [u8; 32] = [192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]; + let values: [u8; 32] = [ + 192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ]; let array_data = ArrayData::builder(DataType::Decimal(23, 6)) .len(2) .add_buffer(Buffer::from(&values[..])) diff --git a/rust/arrow/src/array/equal/decimal.rs b/rust/arrow/src/array/equal/decimal.rs index 75881778836..715b308c4b8 100644 --- a/rust/arrow/src/array/equal/decimal.rs +++ b/rust/arrow/src/array/equal/decimal.rs @@ -27,9 +27,7 @@ pub(super) fn decimal_equal( len: usize, ) -> bool { let size = match lhs.data_type() { - DataType::Decimal(_, _) => { - 16 - } + DataType::Decimal(_, _) => 16, _ => unreachable!(), }; diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index abf9f98a180..f123f0dc816 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -901,13 +901,14 @@ impl DataType { Some(p) => Ok(p.as_u64().unwrap() as usize), None => Err(ArrowError::ParseError( "Expecting a precision for decimal".to_string(), - )) + )), }; let scale = match map.get("scale") { Some(s) => Ok(s.as_u64().unwrap() as usize), _ => Err(ArrowError::ParseError( "Expecting a scale for decimal".to_string(), - ))}; + )), + }; Ok(DataType::Decimal(precision?, scale?)) } diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs index c7694560d25..eb0cb573b56 100644 --- a/rust/arrow/src/ipc/reader.rs +++ b/rust/arrow/src/ipc/reader.rs @@ -991,7 +991,7 @@ mod tests { "generated_primitive_no_batches", "generated_primitive_zerolength", "generated_primitive", - "generated_decimal" + "generated_decimal", ]; paths.iter().for_each(|path| { let file = File::open(format!( @@ -1020,7 +1020,7 @@ mod tests { "generated_primitive_no_batches", "generated_primitive_zerolength", "generated_primitive", - "generated_decimal" + "generated_decimal", ]; paths.iter().for_each(|path| { let file = File::open(format!( diff --git a/rust/arrow/src/util/integration_util.rs b/rust/arrow/src/util/integration_util.rs index ee1895ca096..621960b73a3 100644 --- a/rust/arrow/src/util/integration_util.rs +++ b/rust/arrow/src/util/integration_util.rs @@ -332,8 +332,7 @@ impl ArrowJsonBatch { arr.equals_json(&json_array.iter().collect::>()[..]) } DataType::Decimal(_, _) => { - let arr = - arr.as_any().downcast_ref::().unwrap(); + let arr = arr.as_any().downcast_ref::().unwrap(); arr.equals_json(&json_array.iter().collect::>()[..]) } DataType::Dictionary(ref key_type, _) => match key_type.as_ref() { diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs index 9857487f231..6633b4d3291 100644 --- a/rust/parquet/src/arrow/schema.rs +++ b/rust/parquet/src/arrow/schema.rs @@ -26,8 +26,11 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use arrow::{array::DecimalArray, datatypes::{DataType, DateUnit, Field, Schema, TimeUnit}}; use arrow::ipc::writer; +use arrow::{ + array::DecimalArray, + datatypes::{DataType, DateUnit, Field, Schema, TimeUnit}, +}; use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; use crate::errors::{ParquetError::ArrowError, Result}; From a4c7da74b679b8f37ef2e30998b22c3dc37a829c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Thu, 3 Dec 2020 08:30:28 +0100 Subject: [PATCH 5/9] chore: address review comments --- rust/arrow/src/array/array_binary.rs | 21 +++++---------------- rust/arrow/src/ipc/convert.rs | 7 +++++++ 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs index cfaf1aaed11..db652f8714a 100644 --- a/rust/arrow/src/array/array_binary.rs +++ b/rust/arrow/src/array/array_binary.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::convert::From; +use std::convert::{From, TryInto}; use std::fmt; use std::mem; use std::{any::Any, iter::FromIterator}; @@ -462,22 +462,11 @@ impl DecimalArray { (self.value_offset_at(offset + 1) - pos) as usize, ) }; - Self::from_bytes_to_i128(raw_val) - } - - fn from_bytes_to_i128(b: &[u8]) -> i128 { - assert!(b.len() <= 16, "DecimalArray supports only up to size 16"); - let first_bit = b[b.len() - 1] & 128u8 == 128u8; - let mut result = if first_bit { [255u8; 16] } else { [0u8; 16] }; - for (i, v) in b.iter().enumerate() { - result[i] = *v; + let as_array = raw_val.try_into(); + match as_array { + Ok(v) if raw_val.len() == 16 => i128::from_le_bytes(v), + _ => panic!("DecimalArray elements are not 128bit integers."), } - i128::from_le_bytes(result) - } - - /// Returns the byte size per value for Decimal arrays with a given precision - pub fn calc_fixed_byte_size(precision: usize) -> i32 { - (10.0_f64.powi(precision as i32).log2() / 8.0).ceil() as i32 } /// Returns the offset for the element at index `i`. diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index c8be77d1654..2bc15f70b77 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -97,6 +97,12 @@ pub fn fb_to_schema(fb: ipc::Schema) -> Schema { let len = c_fields.len(); for i in 0..len { let c_field: ipc::Field = c_fields.get(i); + match c_field.type_type() { + ipc::Type::Decimal if fb.endianness() == ipc::Endianness::Big => { + unimplemented!("Big Endian is not supported for Decimal!") + } + _ => (), + }; fields.push(c_field.into()); } @@ -753,6 +759,7 @@ mod tests { 123, true, ), + Field::new("decimal", DataType::Decimal(10, 6), false), ], md, ); From 5a4ce5124d7642e10beb7d60a69427235cbd8af1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Thu, 3 Dec 2020 08:38:40 +0100 Subject: [PATCH 6/9] chore: move fixed size calculation from array to parquet schema --- rust/parquet/src/arrow/schema.rs | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs index 6633b4d3291..8674dfa6b2b 100644 --- a/rust/parquet/src/arrow/schema.rs +++ b/rust/parquet/src/arrow/schema.rs @@ -26,11 +26,8 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use arrow::datatypes::{DataType, DateUnit, Field, Schema, TimeUnit}; use arrow::ipc::writer; -use arrow::{ - array::DecimalArray, - datatypes::{DataType, DateUnit, Field, Schema, TimeUnit}, -}; use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; use crate::errors::{ParquetError::ArrowError, Result}; @@ -403,12 +400,13 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_length(*length) .build() } - DataType::Decimal(precision, _) => { - Type::primitive_type_builder(name, PhysicalType::FIXED_LEN_BYTE_ARRAY) - .with_repetition(repetition) - .with_length(DecimalArray::calc_fixed_byte_size(*precision)) - .build() - } + DataType::Decimal(precision, _) => Type::primitive_type_builder( + name, + PhysicalType::FIXED_LEN_BYTE_ARRAY, + ) + .with_repetition(repetition) + .with_length((10.0_f64.powi(*precision as i32).log2() / 8.0).ceil() as i32) + .build(), DataType::Utf8 | DataType::LargeUtf8 => { Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) .with_logical_type(LogicalType::UTF8) From 44665312af83604a11f30eacf34ffffa766f5199 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Thu, 3 Dec 2020 18:07:54 +0100 Subject: [PATCH 7/9] chore: add test for decimal big-endian panic --- rust/arrow/src/ipc/reader.rs | 42 ++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs index eb0cb573b56..2786fdeb091 100644 --- a/rust/arrow/src/ipc/reader.rs +++ b/rust/arrow/src/ipc/reader.rs @@ -1008,6 +1008,48 @@ mod tests { }); } + #[test] + #[should_panic(expected = "Big Endian is not supported for Decimal!")] + fn read_decimal_file_be() { + let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined"); + let paths = vec![ + "generated_decimal", + ]; + paths.iter().for_each(|path| { + let file = File::open(format!( + "{}/arrow-ipc-stream/integration/1.0.0-bigendian/{}.arrow_file", + testdata, path + )) + .unwrap(); + + FileReader::try_new(file).unwrap(); + }); + } + + #[test] + fn read_generated_files_be() { + // complementary to the previous test + let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined"); + let paths = vec![ + "generated_interval", + "generated_datetime", + "generated_dictionary", + "generated_nested", + "generated_primitive_no_batches", + "generated_primitive_zerolength", + "generated_primitive", + ]; + paths.iter().for_each(|path| { + let file = File::open(format!( + "{}/arrow-ipc-stream/integration/1.0.0-bigendian/{}.arrow_file", + testdata, path + )) + .unwrap(); + + FileReader::try_new(file).unwrap(); + }); + } + #[test] fn read_generated_streams() { let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined"); From 3dadd2cb22e3d6814faf49b4284191424b67f549 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Thu, 3 Dec 2020 18:08:39 +0100 Subject: [PATCH 8/9] chore: formatting --- rust/arrow/src/ipc/reader.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs index 2786fdeb091..39f07531bc6 100644 --- a/rust/arrow/src/ipc/reader.rs +++ b/rust/arrow/src/ipc/reader.rs @@ -1012,9 +1012,7 @@ mod tests { #[should_panic(expected = "Big Endian is not supported for Decimal!")] fn read_decimal_file_be() { let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined"); - let paths = vec![ - "generated_decimal", - ]; + let paths = vec!["generated_decimal"]; paths.iter().for_each(|path| { let file = File::open(format!( "{}/arrow-ipc-stream/integration/1.0.0-bigendian/{}.arrow_file", From da795890fc3751d80f96b3a6ae19c75a7a1c0f4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Fri, 4 Dec 2020 08:01:47 +0100 Subject: [PATCH 9/9] chore: test renaming --- rust/arrow/src/ipc/reader.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs index 39f07531bc6..329b99671fe 100644 --- a/rust/arrow/src/ipc/reader.rs +++ b/rust/arrow/src/ipc/reader.rs @@ -1010,22 +1010,18 @@ mod tests { #[test] #[should_panic(expected = "Big Endian is not supported for Decimal!")] - fn read_decimal_file_be() { + fn read_decimal_be_file_should_panic() { let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined"); - let paths = vec!["generated_decimal"]; - paths.iter().for_each(|path| { - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/1.0.0-bigendian/{}.arrow_file", - testdata, path + let file = File::open(format!( + "{}/arrow-ipc-stream/integration/1.0.0-bigendian/generated_decimal.arrow_file", + testdata )) .unwrap(); - - FileReader::try_new(file).unwrap(); - }); + FileReader::try_new(file).unwrap(); } #[test] - fn read_generated_files_be() { + fn read_generated_be_files_should_work() { // complementary to the previous test let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined"); let paths = vec![