diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index bbdd46af5d01f..ae8395aef6a41 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -81,6 +81,15 @@ macro_rules! get_statistic { Some(DataType::Int16) => { Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap()))) } + Some(DataType::UInt8) => { + Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap()))) + } + Some(DataType::UInt16) => { + Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap()))) + } + Some(DataType::UInt32) => { + Some(ScalarValue::UInt32(Some((*s.$func()) as u32))) + } Some(DataType::Date32) => { Some(ScalarValue::Date32(Some(*s.$func()))) } @@ -100,6 +109,9 @@ macro_rules! get_statistic { *scale, )) } + Some(DataType::UInt64) => { + Some(ScalarValue::UInt64(Some((*s.$func()) as u64))) + } _ => Some(ScalarValue::Int64(Some(*s.$func()))), } } diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs index 93cb7636b82ea..eebf3447cbe9f 100644 --- a/datafusion/core/tests/parquet/arrow_statistics.rs +++ b/datafusion/core/tests/parquet/arrow_statistics.rs @@ -26,7 +26,8 @@ use arrow::datatypes::{Date32Type, Date64Type}; use arrow_array::{ make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, UInt64Array, + Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, }; use arrow_schema::{DataType, Field, Schema}; use datafusion::datasource::physical_plan::parquet::{ @@ -703,8 +704,6 @@ async fn test_dates_64_diff_rg_sizes() { .run(); } -// BUG: -// https://github.com/apache/datafusion/issues/10604 #[tokio::test] async fn test_uint() { // This creates a parquet files of 4 columns named "u8", "u16", "u32", "u64" @@ -719,48 +718,40 @@ async fn test_uint() { row_per_group: 4, }; - // u8 - // BUG: expect UInt8Array but returns Int32Array Test { reader: reader.build().await, - expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt8Array - expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt8Array + expected_min: Arc::new(UInt8Array::from(vec![0, 1, 4, 7, 251])), + expected_max: Arc::new(UInt8Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]), column_name: "u8", } .run(); - // u16 - // BUG: expect UInt16Array but returns Int32Array Test { reader: reader.build().await, - expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt16Array - expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt16Array + expected_min: Arc::new(UInt16Array::from(vec![0, 1, 4, 7, 251])), + expected_max: Arc::new(UInt16Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]), column_name: "u16", } .run(); - // u32 - // BUG: expect UInt32Array but returns Int32Array Test { reader: reader.build().await, - expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt32Array - expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt32Array + expected_min: Arc::new(UInt32Array::from(vec![0, 1, 4, 7, 251])), + expected_max: Arc::new(UInt32Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]), column_name: "u32", } .run(); - // u64 - // BUG: expect UInt64rray but returns Int64Array Test { reader: reader.build().await, - expected_min: Arc::new(Int64Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt64Array - expected_max: Arc::new(Int64Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt64Array + expected_min: Arc::new(UInt64Array::from(vec![0, 1, 4, 7, 251])), + expected_max: Arc::new(UInt64Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]), column_name: "u64", @@ -788,8 +779,6 @@ async fn test_int32_range() { .run(); } -// BUG: not convert UInt32Array to Int32Array -// https://github.com/apache/datafusion/issues/10604 #[tokio::test] async fn test_uint32_range() { // This creates a parquet file of 1 column "u" @@ -801,8 +790,8 @@ async fn test_uint32_range() { Test { reader: reader.build().await, - expected_min: Arc::new(Int32Array::from(vec![0])), // should be UInt32Array - expected_max: Arc::new(Int32Array::from(vec![300000])), // should be UInt32Array + expected_min: Arc::new(UInt32Array::from(vec![0])), + expected_max: Arc::new(UInt32Array::from(vec![300000])), expected_null_counts: UInt64Array::from(vec![0]), expected_row_counts: UInt64Array::from(vec![4]), column_name: "u", @@ -820,44 +809,45 @@ async fn test_numeric_limits_unsigned() { Test { reader: reader.build().await, - expected_min: Arc::new(Int8Array::from(vec![i8::MIN, -100])), - expected_max: Arc::new(Int8Array::from(vec![100, i8::MAX])), + expected_min: Arc::new(UInt8Array::from(vec![u8::MIN, 100])), + expected_max: Arc::new(UInt8Array::from(vec![100, u8::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: UInt64Array::from(vec![5, 2]), - column_name: "i8", + column_name: "u8", } .run(); Test { reader: reader.build().await, - expected_min: Arc::new(Int16Array::from(vec![i16::MIN, -100])), - expected_max: Arc::new(Int16Array::from(vec![100, i16::MAX])), + expected_min: Arc::new(UInt16Array::from(vec![u16::MIN, 100])), + expected_max: Arc::new(UInt16Array::from(vec![100, u16::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: UInt64Array::from(vec![5, 2]), - column_name: "i16", + column_name: "u16", } .run(); Test { reader: reader.build().await, - expected_min: Arc::new(Int32Array::from(vec![i32::MIN, -100])), - expected_max: Arc::new(Int32Array::from(vec![100, i32::MAX])), + expected_min: Arc::new(UInt32Array::from(vec![u32::MIN, 100])), + expected_max: Arc::new(UInt32Array::from(vec![100, u32::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: UInt64Array::from(vec![5, 2]), - column_name: "i32", + column_name: "u32", } .run(); Test { reader: reader.build().await, - expected_min: Arc::new(Int64Array::from(vec![i64::MIN, -100])), - expected_max: Arc::new(Int64Array::from(vec![100, i64::MAX])), + expected_min: Arc::new(UInt64Array::from(vec![u64::MIN, 100])), + expected_max: Arc::new(UInt64Array::from(vec![100, u64::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: UInt64Array::from(vec![5, 2]), - column_name: "i64", + column_name: "u64", } .run(); } + #[tokio::test] async fn test_numeric_limits_signed() { // file has 7 rows, 2 row groups: one with 5 rows, one with 2 rows.