From a4d39f188c02e9475092fa4cf4f39a6cbbde43b7 Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Wed, 6 Nov 2024 15:42:30 -0500 Subject: [PATCH 1/5] Added Timestamp/Binary/Float to fuzz --- .../core/tests/fuzz_cases/aggregate_fuzz.rs | 20 +++ .../aggregation_fuzzer/data_generator.rs | 130 ++++++++++++++++-- test-utils/src/array_gen/binary.rs | 94 +++++++++++++ test-utils/src/array_gen/mod.rs | 2 + test-utils/src/array_gen/primitive.rs | 7 +- test-utils/src/array_gen/random_data.rs | 9 +- 6 files changed, 245 insertions(+), 17 deletions(-) create mode 100644 test-utils/src/array_gen/binary.rs diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index 16f539b75967f..fdcdb7ebeed97 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -50,6 +50,7 @@ use datafusion_common::HashMap; use datafusion_physical_expr_common::sort_expr::LexOrdering; use rand::rngs::StdRng; use rand::{thread_rng, Rng, SeedableRng}; +use std::str; use tokio::task::JoinSet; // ======================================================================== @@ -171,6 +172,25 @@ fn baseline_config() -> DatasetGeneratorConfig { ColumnDescr::new("time32_ms", DataType::Time32(TimeUnit::Millisecond)), ColumnDescr::new("time64_us", DataType::Time64(TimeUnit::Microsecond)), ColumnDescr::new("time64_ns", DataType::Time64(TimeUnit::Nanosecond)), + // TODO: randomize timezones for timestamp types + ColumnDescr::new("timestamp_s", DataType::Timestamp(TimeUnit::Second, None)), + ColumnDescr::new( + "timestamp_ms", + DataType::Timestamp(TimeUnit::Millisecond, None), + ), + ColumnDescr::new( + "timestamp_us", + DataType::Timestamp(TimeUnit::Microsecond, None), + ), + ColumnDescr::new( + "timestamp_ns", + DataType::Timestamp(TimeUnit::Nanosecond, None), + ), + ColumnDescr::new("binary", DataType::Binary), + ColumnDescr::new("large_binary", DataType::LargeBinary), + ColumnDescr::new("binaryview", DataType::BinaryView), + ColumnDescr::new("float32", DataType::Float32), + ColumnDescr::new("float64", DataType::Float64), ColumnDescr::new( "interval_year_month", DataType::Interval(IntervalUnit::YearMonth), diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs index 88133a134e4da..146890e27f1e0 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs @@ -18,11 +18,13 @@ use std::sync::Arc; use arrow::datatypes::{ - ByteArrayType, ByteViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, - IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, LargeUtf8Type, - StringViewType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, - Time64NanosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, Utf8Type, + BinaryType, BinaryViewType, ByteArrayType, ByteViewType, Date32Type, Date64Type, + Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type, + Int64Type, Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType, + IntervalYearMonthType, LargeBinaryType, LargeUtf8Type, StringViewType, + Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, + TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, Utf8Type, }; use arrow_array::{ArrayRef, RecordBatch}; use arrow_schema::{DataType, Field, IntervalUnit, Schema, TimeUnit}; @@ -35,7 +37,10 @@ use rand::{ thread_rng, Rng, SeedableRng, }; use test_utils::{ - array_gen::{DecimalArrayGenerator, PrimitiveArrayGenerator, StringArrayGenerator}, + array_gen::{ + BinaryArrayGenerator, DecimalArrayGenerator, PrimitiveArrayGenerator, + StringArrayGenerator, + }, stagger_batch, }; @@ -71,21 +76,19 @@ pub struct DatasetGeneratorConfig { } impl DatasetGeneratorConfig { - /// return a list of all column names + /// Return a list of all column names pub fn all_columns(&self) -> Vec<&str> { self.columns.iter().map(|d| d.name.as_str()).collect() } - /// return a list of column names that are "numeric" + /// Return a list of column names that are "numeric" pub fn numeric_columns(&self) -> Vec<&str> { self.columns .iter() .filter_map(|d| { - if d.column_type.is_numeric() { - Some(d.name.as_str()) - } else { - None - } + (d.column_type.is_numeric() + && !matches!(d.column_type, DataType::Float32 | DataType::Float64)) + .then(|| d.name.as_str()) }) .collect() } @@ -278,6 +281,37 @@ macro_rules! generate_primitive_array { }}; } +macro_rules! generate_binary_array { + ( + $SELF:ident, + $NUM_ROWS:ident, + $MAX_NUM_DISTINCT:expr, + $BATCH_GEN_RNG:ident, + $ARRAY_GEN_RNG:ident, + $ARROW_TYPE:ident + ) => {{ + let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len()); + let null_pct = $SELF.candidate_null_pcts[null_pct_idx]; + + let max_len = $BATCH_GEN_RNG.gen_range(1..100); + + let mut generator = BinaryArrayGenerator { + max_len, + num_binaries: $NUM_ROWS, + num_distinct_binaries: $MAX_NUM_DISTINCT, + null_pct, + rng: $ARRAY_GEN_RNG, + }; + + match $ARROW_TYPE::DATA_TYPE { + DataType::Binary => generator.gen_data::(), + DataType::LargeBinary => generator.gen_data::(), + DataType::BinaryView => generator.gen_binary_view(), + _ => unreachable!(), + } + }}; +} + impl RecordBatchGenerator { fn new(min_rows_nun: usize, max_rows_num: usize, columns: Vec) -> Self { let candidate_null_pcts = vec![0.0, 0.01, 0.1, 0.5]; @@ -527,6 +561,76 @@ impl RecordBatchGenerator { IntervalMonthDayNanoType ) } + DataType::Timestamp(TimeUnit::Second, None) => { + generate_primitive_array!( + self, + num_rows, + max_num_distinct, + batch_gen_rng, + array_gen_rng, + TimestampSecondType + ) + } + DataType::Timestamp(TimeUnit::Millisecond, None) => { + generate_primitive_array!( + self, + num_rows, + max_num_distinct, + batch_gen_rng, + array_gen_rng, + TimestampMillisecondType + ) + } + DataType::Timestamp(TimeUnit::Microsecond, None) => { + generate_primitive_array!( + self, + num_rows, + max_num_distinct, + batch_gen_rng, + array_gen_rng, + TimestampMicrosecondType + ) + } + DataType::Timestamp(TimeUnit::Nanosecond, None) => { + generate_primitive_array!( + self, + num_rows, + max_num_distinct, + batch_gen_rng, + array_gen_rng, + TimestampNanosecondType + ) + } + DataType::Binary => { + generate_binary_array!( + self, + num_rows, + max_num_distinct, + batch_gen_rng, + array_gen_rng, + BinaryType + ) + } + DataType::LargeBinary => { + generate_binary_array!( + self, + num_rows, + max_num_distinct, + batch_gen_rng, + array_gen_rng, + LargeBinaryType + ) + } + DataType::BinaryView => { + generate_binary_array!( + self, + num_rows, + max_num_distinct, + batch_gen_rng, + array_gen_rng, + BinaryViewType + ) + } DataType::Decimal128(precision, scale) => { generate_decimal_array!( self, diff --git a/test-utils/src/array_gen/binary.rs b/test-utils/src/array_gen/binary.rs new file mode 100644 index 0000000000000..d342118fa85d3 --- /dev/null +++ b/test-utils/src/array_gen/binary.rs @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ + ArrayRef, BinaryViewArray, GenericBinaryArray, OffsetSizeTrait, UInt32Array, +}; +use arrow::compute; +use rand::rngs::StdRng; +use rand::Rng; + +/// Randomly generate binary arrays +pub struct BinaryArrayGenerator { + /// The maximum length of each binary element + pub max_len: usize, + /// The total number of binaries in the output + pub num_binaries: usize, + /// The number of distinct binary values in the columns + pub num_distinct_binaries: usize, + /// The percentage of nulls in the columns + pub null_pct: f64, + /// Random number generator + pub rng: StdRng, +} + +impl BinaryArrayGenerator { + /// Creates a BinaryArray or LargeBinaryArray with random binary data. + pub fn gen_data(&mut self) -> ArrayRef { + let distinct_binaries: GenericBinaryArray = (0..self.num_distinct_binaries) + .map(|_| Some(random_binary(&mut self.rng, self.max_len))) + .collect(); + + // Pick num_binaries randomly from the distinct binary table + let indices: UInt32Array = (0..self.num_binaries) + .map(|_| { + if self.rng.gen::() < self.null_pct { + None + } else if self.num_distinct_binaries > 1 { + let range = 0..(self.num_distinct_binaries as u32); + Some(self.rng.gen_range(range)) + } else { + Some(0) + } + }) + .collect(); + + compute::take(&distinct_binaries, &indices, None).unwrap() + } + + /// Creates a BinaryViewArray with random binary data. + pub fn gen_binary_view(&mut self) -> ArrayRef { + let distinct_binary_views: BinaryViewArray = (0..self.num_distinct_binaries) + .map(|_| Some(random_binary(&mut self.rng, self.max_len))) + .collect(); + + let indices: UInt32Array = (0..self.num_binaries) + .map(|_| { + if self.rng.gen::() < self.null_pct { + None + } else if self.num_distinct_binaries > 1 { + let range = 0..(self.num_distinct_binaries as u32); + Some(self.rng.gen_range(range)) + } else { + Some(0) + } + }) + .collect(); + + compute::take(&distinct_binary_views, &indices, None).unwrap() + } +} + +/// Return a binary vector of random bytes of length 1..=max_len +fn random_binary(rng: &mut StdRng, max_len: usize) -> Vec { + if max_len == 0 { + Vec::new() + } else { + let len = rng.gen_range(1..=max_len); + (0..len).map(|_| rng.gen()).collect() + } +} diff --git a/test-utils/src/array_gen/mod.rs b/test-utils/src/array_gen/mod.rs index 8e0e39ddfdce1..d076bb1b6f0b8 100644 --- a/test-utils/src/array_gen/mod.rs +++ b/test-utils/src/array_gen/mod.rs @@ -15,11 +15,13 @@ // specific language governing permissions and limitations // under the License. +mod binary; mod decimal; mod primitive; mod random_data; mod string; +pub use binary::BinaryArrayGenerator; pub use decimal::DecimalArrayGenerator; pub use primitive::PrimitiveArrayGenerator; pub use string::StringArrayGenerator; diff --git a/test-utils/src/array_gen/primitive.rs b/test-utils/src/array_gen/primitive.rs index 2469cbf446601..500a68143f034 100644 --- a/test-utils/src/array_gen/primitive.rs +++ b/test-utils/src/array_gen/primitive.rs @@ -56,10 +56,13 @@ impl PrimitiveArrayGenerator { | DataType::Date64 | DataType::Time32(_) | DataType::Time64(_) - | DataType::Interval(_) => (0..self.num_distinct_primitives) + | DataType::Interval(_) + | DataType::Binary + | DataType::LargeBinary + | DataType::BinaryView + | DataType::Timestamp(_, _) => (0..self.num_distinct_primitives) .map(|_| Some(A::generate_random_native_data(&mut self.rng))) .collect(), - _ => { let arrow_type = A::DATA_TYPE; panic!("Unsupported arrow data type: {arrow_type}") diff --git a/test-utils/src/array_gen/random_data.rs b/test-utils/src/array_gen/random_data.rs index 23227100d73fd..a7297d45fdf07 100644 --- a/test-utils/src/array_gen/random_data.rs +++ b/test-utils/src/array_gen/random_data.rs @@ -21,8 +21,9 @@ use arrow::datatypes::{ Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalDayTime, IntervalDayTimeType, IntervalMonthDayNano, IntervalMonthDayNanoType, IntervalYearMonthType, Time32MillisecondType, Time32SecondType, - Time64MicrosecondType, Time64NanosecondType, UInt16Type, UInt32Type, UInt64Type, - UInt8Type, + Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType, + TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, + UInt32Type, UInt64Type, UInt8Type, }; use rand::distributions::Standard; use rand::prelude::Distribution; @@ -66,6 +67,10 @@ basic_random_data!(Time64MicrosecondType); basic_random_data!(Time64NanosecondType); basic_random_data!(IntervalYearMonthType); basic_random_data!(Decimal128Type); +basic_random_data!(TimestampSecondType); +basic_random_data!(TimestampMillisecondType); +basic_random_data!(TimestampMicrosecondType); +basic_random_data!(TimestampNanosecondType); impl RandomNativeData for Date64Type { fn generate_random_native_data(rng: &mut StdRng) -> Self::Native { From 49f2d52ca77ef2ff3760e6e84c93002ada2733fc Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Wed, 6 Nov 2024 16:06:46 -0500 Subject: [PATCH 2/5] clippy fix --- .../fuzz_cases/aggregation_fuzzer/data_generator.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs index 146890e27f1e0..803f11a653bea 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs @@ -86,9 +86,13 @@ impl DatasetGeneratorConfig { self.columns .iter() .filter_map(|d| { - (d.column_type.is_numeric() - && !matches!(d.column_type, DataType::Float32 | DataType::Float64)) - .then(|| d.name.as_str()) + if d.column_type.is_numeric() + && !matches!(d.column_type, DataType::Float32 | DataType::Float64) + { + Some(d.name.as_str()) + } else { + None + } }) .collect() } From fbbe37dbf5d81be28d08137dc2e314e88a14ecf8 Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Thu, 7 Nov 2024 13:22:42 -0500 Subject: [PATCH 3/5] small fix --- datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index fdcdb7ebeed97..f1b55d3002c2b 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -186,9 +186,6 @@ fn baseline_config() -> DatasetGeneratorConfig { "timestamp_ns", DataType::Timestamp(TimeUnit::Nanosecond, None), ), - ColumnDescr::new("binary", DataType::Binary), - ColumnDescr::new("large_binary", DataType::LargeBinary), - ColumnDescr::new("binaryview", DataType::BinaryView), ColumnDescr::new("float32", DataType::Float32), ColumnDescr::new("float64", DataType::Float64), ColumnDescr::new( @@ -230,6 +227,9 @@ fn baseline_config() -> DatasetGeneratorConfig { // low cardinality columns ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10), ColumnDescr::new("utf8_low", DataType::Utf8).with_max_num_distinct(10), + ColumnDescr::new("binary", DataType::Binary), + ColumnDescr::new("large_binary", DataType::LargeBinary), + ColumnDescr::new("binaryview", DataType::BinaryView), ]; let min_num_rows = 512; From 0599f1d2732368d62a96ca1f215487e9ac3c65a6 Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Thu, 7 Nov 2024 23:17:55 -0500 Subject: [PATCH 4/5] remove todo --- datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index f1b55d3002c2b..c423d5a662662 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -172,7 +172,6 @@ fn baseline_config() -> DatasetGeneratorConfig { ColumnDescr::new("time32_ms", DataType::Time32(TimeUnit::Millisecond)), ColumnDescr::new("time64_us", DataType::Time64(TimeUnit::Microsecond)), ColumnDescr::new("time64_ns", DataType::Time64(TimeUnit::Nanosecond)), - // TODO: randomize timezones for timestamp types ColumnDescr::new("timestamp_s", DataType::Timestamp(TimeUnit::Second, None)), ColumnDescr::new( "timestamp_ms", From 6fc968d524d0ef96e9ccd9a29990a8128fa793d8 Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Fri, 8 Nov 2024 19:03:23 -0500 Subject: [PATCH 5/5] remove todo --- datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index c423d5a662662..792e23b519e04 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -222,7 +222,6 @@ fn baseline_config() -> DatasetGeneratorConfig { ColumnDescr::new("utf8", DataType::Utf8), ColumnDescr::new("largeutf8", DataType::LargeUtf8), ColumnDescr::new("utf8view", DataType::Utf8View), - // todo binary // low cardinality columns ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10), ColumnDescr::new("utf8_low", DataType::Utf8).with_max_num_distinct(10),