Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 80 additions & 1 deletion datafusion/core/tests/parquet/arrow_statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ use arrow::datatypes::{
use arrow_array::{
make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, Float32Array,
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray,
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, IntervalDayTimeArray,
IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeBinaryArray,
LargeStringArray, RecordBatch, StringArray, Time32MillisecondArray,
Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
Expand Down Expand Up @@ -1072,6 +1073,84 @@ async fn test_dates_64_diff_rg_sizes() {
.run();
}

#[tokio::test]
#[should_panic]
// Currently this test `should_panic` because statistics for `Interval`
// columns are not yet supported, and `IntervalMonthDayNano` cannot be
// written to parquet at all.
// Refer to issue: https://github.com/apache/arrow-rs/issues/5847
// and https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_writer/mod.rs#L747
// Once the upstream issue is resolved, drop `should_panic` and replace the
// all-`None` expected min/max below with the commented-out values.
async fn test_interval_diff_rg_sizes() {
    // This creates a parquet file of 3 columns:
    // "year_month"     --> IntervalYearMonthArray
    // "day_time"       --> IntervalDayTimeArray
    // "month_day_nano" --> IntervalMonthDayNanoArray
    //
    // The file is created from 4 record batches of 5 rows each (one null row
    // per batch, 20 rows total), split into 2 row groups of sizes 13 and 7.
    let reader = TestReader {
        scenario: Scenario::Interval,
        row_per_group: 13,
    }
    .build()
    .await;

    // TODO: expected values need to be changed once issue is resolved
    // expected_min: Arc::new(IntervalYearMonthArray::from(vec![
    //     IntervalYearMonthType::make_value(1, 10),
    //     IntervalYearMonthType::make_value(4, 13),
    // ])),
    // expected_max: Arc::new(IntervalYearMonthArray::from(vec![
    //     IntervalYearMonthType::make_value(6, 51),
    //     IntervalYearMonthType::make_value(8, 53),
    // ])),
    Test {
        reader: &reader,
        // min/max statistics are unavailable for interval types today, so
        // extraction yields null for every row group.
        expected_min: Arc::new(IntervalYearMonthArray::from(vec![None, None])),
        expected_max: Arc::new(IntervalYearMonthArray::from(vec![None, None])),
        // each of the 4 source batches contributes one null row: 2 nulls land
        // in each row group given the 13/7 split
        expected_null_counts: UInt64Array::from(vec![2, 2]),
        expected_row_counts: UInt64Array::from(vec![13, 7]),
        column_name: "year_month",
    }
    .run();

    // TODO: expected values need to be changed once issue is resolved
    // expected_min: Arc::new(IntervalDayTimeArray::from(vec![
    //     IntervalDayTimeType::make_value(1, 10),
    //     IntervalDayTimeType::make_value(4, 13),
    // ])),
    // expected_max: Arc::new(IntervalDayTimeArray::from(vec![
    //     IntervalDayTimeType::make_value(6, 51),
    //     IntervalDayTimeType::make_value(8, 53),
    // ])),
    Test {
        reader: &reader,
        expected_min: Arc::new(IntervalDayTimeArray::from(vec![None, None])),
        expected_max: Arc::new(IntervalDayTimeArray::from(vec![None, None])),
        expected_null_counts: UInt64Array::from(vec![2, 2]),
        expected_row_counts: UInt64Array::from(vec![13, 7]),
        column_name: "day_time",
    }
    .run();

    // TODO: expected values need to be changed once issue is resolved
    // expected_min: Arc::new(IntervalMonthDayNanoArray::from(vec![
    //     IntervalMonthDayNanoType::make_value(1, 10, 100),
    //     IntervalMonthDayNanoType::make_value(4, 13, 103),
    // ])),
    // expected_max: Arc::new(IntervalMonthDayNanoArray::from(vec![
    //     IntervalMonthDayNanoType::make_value(6, 51, 501),
    //     IntervalMonthDayNanoType::make_value(8, 53, 503),
    // ])),
    Test {
        reader: &reader,
        expected_min: Arc::new(IntervalMonthDayNanoArray::from(vec![None, None])),
        expected_max: Arc::new(IntervalMonthDayNanoArray::from(vec![None, None])),
        expected_null_counts: UInt64Array::from(vec![2, 2]),
        expected_row_counts: UInt64Array::from(vec![13, 7]),
        column_name: "month_day_nano",
    }
    .run();
}

#[tokio::test]
async fn test_uint() {
// This creates a parquet files of 4 columns named "u8", "u16", "u32", "u64"
Expand Down
80 changes: 79 additions & 1 deletion datafusion/core/tests/parquet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@

//! Parquet integration tests
use arrow::array::Decimal128Array;
use arrow::datatypes::i256;
use arrow::datatypes::{
i256, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType,
};
use arrow::{
array::{
make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
Expand All @@ -33,6 +35,10 @@ use arrow::{
record_batch::RecordBatch,
util::pretty::pretty_format_batches,
};
use arrow_array::{
IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray,
};
use arrow_schema::IntervalUnit;
use chrono::{Datelike, Duration, TimeDelta};
use datafusion::{
datasource::{physical_plan::ParquetExec, provider_as_source, TableProvider},
Expand Down Expand Up @@ -80,6 +86,7 @@ enum Scenario {
Time32Millisecond,
Time64Nanosecond,
Time64Microsecond,
Interval,
/// 7 Rows, for each i8, i16, i32, i64, u8, u16, u32, u64, f32, f64
/// -MIN, -100, -1, 0, 1, 100, MAX
NumericLimits,
Expand Down Expand Up @@ -925,6 +932,71 @@ fn make_dict_batch() -> RecordBatch {
.unwrap()
}

fn make_interval_batch(offset: i32) -> RecordBatch {
let schema = Schema::new(vec![
Field::new(
"year_month",
DataType::Interval(IntervalUnit::YearMonth),
true,
),
Field::new("day_time", DataType::Interval(IntervalUnit::DayTime), true),
Field::new(
"month_day_nano",
DataType::Interval(IntervalUnit::MonthDayNano),
true,
),
]);
let schema = Arc::new(schema);

let ym_arr = IntervalYearMonthArray::from(vec![
Some(IntervalYearMonthType::make_value(1 + offset, 10 + offset)),
Some(IntervalYearMonthType::make_value(2 + offset, 20 + offset)),
Some(IntervalYearMonthType::make_value(3 + offset, 30 + offset)),
None,
Some(IntervalYearMonthType::make_value(5 + offset, 50 + offset)),
]);

let dt_arr = IntervalDayTimeArray::from(vec![
Some(IntervalDayTimeType::make_value(1 + offset, 10 + offset)),
Some(IntervalDayTimeType::make_value(2 + offset, 20 + offset)),
Some(IntervalDayTimeType::make_value(3 + offset, 30 + offset)),
None,
Some(IntervalDayTimeType::make_value(5 + offset, 50 + offset)),
]);

// Not yet implemented, refer to:
// https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_writer/mod.rs#L747
let mdn_arr = IntervalMonthDayNanoArray::from(vec![
Some(IntervalMonthDayNanoType::make_value(
1 + offset,
10 + offset,
100 + (offset as i64),
)),
Some(IntervalMonthDayNanoType::make_value(
2 + offset,
20 + offset,
200 + (offset as i64),
)),
Some(IntervalMonthDayNanoType::make_value(
3 + offset,
30 + offset,
300 + (offset as i64),
)),
None,
Some(IntervalMonthDayNanoType::make_value(
5 + offset,
50 + offset,
500 + (offset as i64),
)),
]);

RecordBatch::try_new(
schema,
vec![Arc::new(ym_arr), Arc::new(dt_arr), Arc::new(mdn_arr)],
)
.unwrap()
}

fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
match scenario {
Scenario::Boolean => {
Expand Down Expand Up @@ -1346,6 +1418,12 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
]),
]
}
Scenario::Interval => vec![
make_interval_batch(0),
make_interval_batch(1),
make_interval_batch(2),
make_interval_batch(3),
],
}
}

Expand Down