Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ use datafusion_common::HashMap;
use datafusion_physical_expr_common::sort_expr::LexOrdering;
use rand::rngs::StdRng;
use rand::{thread_rng, Rng, SeedableRng};
use std::str;
use tokio::task::JoinSet;

// ========================================================================
Expand Down Expand Up @@ -171,6 +172,21 @@ fn baseline_config() -> DatasetGeneratorConfig {
ColumnDescr::new("time32_ms", DataType::Time32(TimeUnit::Millisecond)),
ColumnDescr::new("time64_us", DataType::Time64(TimeUnit::Microsecond)),
ColumnDescr::new("time64_ns", DataType::Time64(TimeUnit::Nanosecond)),
ColumnDescr::new("timestamp_s", DataType::Timestamp(TimeUnit::Second, None)),
ColumnDescr::new(
"timestamp_ms",
DataType::Timestamp(TimeUnit::Millisecond, None),
),
ColumnDescr::new(
"timestamp_us",
DataType::Timestamp(TimeUnit::Microsecond, None),
),
ColumnDescr::new(
"timestamp_ns",
DataType::Timestamp(TimeUnit::Nanosecond, None),
),
ColumnDescr::new("float32", DataType::Float32),
ColumnDescr::new("float64", DataType::Float64),
ColumnDescr::new(
"interval_year_month",
DataType::Interval(IntervalUnit::YearMonth),
Expand Down Expand Up @@ -206,10 +222,12 @@ fn baseline_config() -> DatasetGeneratorConfig {
ColumnDescr::new("utf8", DataType::Utf8),
ColumnDescr::new("largeutf8", DataType::LargeUtf8),
ColumnDescr::new("utf8view", DataType::Utf8View),
// todo binary
// low cardinality columns
ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10),
ColumnDescr::new("utf8_low", DataType::Utf8).with_max_num_distinct(10),
ColumnDescr::new("binary", DataType::Binary),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we could potentially remove the todo binary a few lines above

ColumnDescr::new("large_binary", DataType::LargeBinary),
ColumnDescr::new("binaryview", DataType::BinaryView),
];

let min_num_rows = 512;
Expand Down
126 changes: 117 additions & 9 deletions datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@
use std::sync::Arc;

use arrow::datatypes::{
ByteArrayType, ByteViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type,
Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, LargeUtf8Type,
StringViewType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
Time64NanosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, Utf8Type,
BinaryType, BinaryViewType, ByteArrayType, ByteViewType, Date32Type, Date64Type,
Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type,
Int64Type, Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType,
IntervalYearMonthType, LargeBinaryType, LargeUtf8Type, StringViewType,
Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType,
TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, Utf8Type,
};
use arrow_array::{ArrayRef, RecordBatch};
use arrow_schema::{DataType, Field, IntervalUnit, Schema, TimeUnit};
Expand All @@ -35,7 +37,10 @@ use rand::{
thread_rng, Rng, SeedableRng,
};
use test_utils::{
array_gen::{DecimalArrayGenerator, PrimitiveArrayGenerator, StringArrayGenerator},
array_gen::{
BinaryArrayGenerator, DecimalArrayGenerator, PrimitiveArrayGenerator,
StringArrayGenerator,
},
stagger_batch,
};

Expand Down Expand Up @@ -71,17 +76,19 @@ pub struct DatasetGeneratorConfig {
}

impl DatasetGeneratorConfig {
/// return a list of all column names
/// Return a list of all column names
pub fn all_columns(&self) -> Vec<&str> {
self.columns.iter().map(|d| d.name.as_str()).collect()
}

/// return a list of column names that are "numeric"
/// Return a list of column names that are "numeric"
pub fn numeric_columns(&self) -> Vec<&str> {
self.columns
.iter()
.filter_map(|d| {
if d.column_type.is_numeric() {
if d.column_type.is_numeric()
&& !matches!(d.column_type, DataType::Float32 | DataType::Float64)
{
Some(d.name.as_str())
} else {
None
Expand Down Expand Up @@ -278,6 +285,37 @@ macro_rules! generate_primitive_array {
}};
}

/// Expands to an expression that builds one random binary column.
///
/// Arguments, in order: the generator (`self`), the number of rows, the
/// maximum number of distinct values, the RNG used for per-batch choices,
/// the RNG handed to the array generator, and the arrow type marker
/// (`BinaryType`, `LargeBinaryType` or `BinaryViewType`) selecting the layout.
macro_rules! generate_binary_array {
    (
        $SELF:ident,
        $NUM_ROWS:ident,
        $MAX_NUM_DISTINCT:expr,
        $BATCH_GEN_RNG:ident,
        $ARRAY_GEN_RNG:ident,
        $ARROW_TYPE:ident
    ) => {{
        // Draw order on the batch RNG is significant for reproducibility:
        // first the null-percentage candidate, then the maximum value length.
        let null_pct = {
            let candidates = &$SELF.candidate_null_pcts;
            candidates[$BATCH_GEN_RNG.gen_range(0..candidates.len())]
        };
        let max_len = $BATCH_GEN_RNG.gen_range(1..100);

        let mut binary_gen = BinaryArrayGenerator {
            max_len,
            num_binaries: $NUM_ROWS,
            num_distinct_binaries: $MAX_NUM_DISTINCT,
            null_pct,
            rng: $ARRAY_GEN_RNG,
        };

        // Pick the physical layout from the arrow type marker.
        let target_type = $ARROW_TYPE::DATA_TYPE;
        match target_type {
            DataType::Binary => binary_gen.gen_data::<i32>(),
            DataType::LargeBinary => binary_gen.gen_data::<i64>(),
            DataType::BinaryView => binary_gen.gen_binary_view(),
            _ => unreachable!(),
        }
    }};
}

impl RecordBatchGenerator {
fn new(min_rows_nun: usize, max_rows_num: usize, columns: Vec<ColumnDescr>) -> Self {
let candidate_null_pcts = vec![0.0, 0.01, 0.1, 0.5];
Expand Down Expand Up @@ -527,6 +565,76 @@ impl RecordBatchGenerator {
IntervalMonthDayNanoType
)
}
DataType::Timestamp(TimeUnit::Second, None) => {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
TimestampSecondType
)
}
DataType::Timestamp(TimeUnit::Millisecond, None) => {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
TimestampMillisecondType
)
}
DataType::Timestamp(TimeUnit::Microsecond, None) => {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
TimestampMicrosecondType
)
}
DataType::Timestamp(TimeUnit::Nanosecond, None) => {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
TimestampNanosecondType
)
}
DataType::Binary => {
generate_binary_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
BinaryType
)
}
DataType::LargeBinary => {
generate_binary_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
LargeBinaryType
)
}
DataType::BinaryView => {
generate_binary_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
BinaryViewType
)
}
DataType::Decimal128(precision, scale) => {
generate_decimal_array!(
self,
Expand Down
94 changes: 94 additions & 0 deletions test-utils/src/array_gen/binary.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::array::{
ArrayRef, BinaryViewArray, GenericBinaryArray, OffsetSizeTrait, UInt32Array,
};
use arrow::compute;
use rand::rngs::StdRng;
use rand::Rng;

/// Randomly generate binary arrays
///
/// The generator first builds a pool of `num_distinct_binaries` random byte
/// strings and then fills each of the `num_binaries` output rows with either
/// a null or an entry picked at random from that pool.
pub struct BinaryArrayGenerator {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

/// The maximum length of each binary element
pub max_len: usize,
/// The total number of binaries in the output
pub num_binaries: usize,
/// The number of distinct binary values in the columns
///
/// This is the size of the pool of randomly generated candidate values;
/// since the candidates are random, it is an upper bound on the number of
/// distinct values rather than an exact count.
pub num_distinct_binaries: usize,
/// The percentage of nulls in the columns
///
/// A row is null when a random `f64` draw is below this value, so it is
/// presumably meant as a fraction (e.g. `0.1` for 10%) — TODO confirm.
pub null_pct: f64,
/// Random number generator
pub rng: StdRng,
}

impl BinaryArrayGenerator {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Love it! I'm wondering whether we should add tests for this generator?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the generator itself is part of a test 🤔 What would we test? Maybe that the distinct values are as specified?

/// Creates a BinaryArray or LargeBinaryArray with random binary data.
///
/// The offset type `O` selects the layout (`i32` for `BinaryArray`, `i64`
/// for `LargeBinaryArray`). A pool of `num_distinct_binaries` random values
/// is generated first; every output row is then either null or a randomly
/// chosen entry of that pool.
///
/// NOTE(review): with `num_distinct_binaries == 0` any non-null row refers
/// to index 0 of an empty pool, so the final `take` is expected to fail —
/// callers should pass at least one distinct value.
pub fn gen_data<O: OffsetSizeTrait>(&mut self) -> ArrayRef {
    let max_len = self.max_len;
    let pool_size = self.num_distinct_binaries;
    let row_count = self.num_binaries;
    let null_pct = self.null_pct;
    let rng = &mut self.rng;

    // Build the table of candidate values.
    let mut candidates: Vec<Option<Vec<u8>>> = Vec::with_capacity(pool_size);
    for _ in 0..pool_size {
        candidates.push(Some(random_binary(rng, max_len)));
    }
    let pool: GenericBinaryArray<O> = candidates.into_iter().collect();

    // Pick num_binaries entries (or nulls) at random from the candidate table.
    // The null draw always happens before the index draw.
    let mut picks: Vec<Option<u32>> = Vec::with_capacity(row_count);
    for _ in 0..row_count {
        let is_null = rng.gen::<f64>() < null_pct;
        let pick = match (is_null, pool_size > 1) {
            (true, _) => None,
            (false, true) => Some(rng.gen_range(0..(pool_size as u32))),
            // A single candidate needs no random draw.
            (false, false) => Some(0),
        };
        picks.push(pick);
    }
    let indices: UInt32Array = picks.into_iter().collect();

    compute::take(&pool, &indices, None).unwrap()
}

/// Creates a BinaryViewArray with random binary data.
///
/// Works like the offset-based variant: a pool of `num_distinct_binaries`
/// random values is generated, and each of the `num_binaries` rows is either
/// null or a randomly chosen entry of that pool.
pub fn gen_binary_view(&mut self) -> ArrayRef {
    let (max_len, pool_size, row_count, null_pct) = (
        self.max_len,
        self.num_distinct_binaries,
        self.num_binaries,
        self.null_pct,
    );
    let rng = &mut self.rng;

    // Table of candidate values the rows are sampled from.
    let pool: BinaryViewArray =
        std::iter::repeat_with(|| Some(random_binary(rng, max_len)))
            .take(pool_size)
            .collect();

    // One index (or null) per output row; the null draw precedes the index draw.
    let indices: UInt32Array = (0..row_count)
        .map(|_| {
            let is_null = rng.gen::<f64>() < null_pct;
            match (is_null, pool_size > 1) {
                (true, _) => None,
                (false, true) => Some(rng.gen_range(0..(pool_size as u32))),
                // A single candidate needs no random draw.
                (false, false) => Some(0),
            }
        })
        .collect();

    compute::take(&pool, &indices, None).unwrap()
}
}

/// Return a binary vector of random bytes of length 1..=max_len
///
/// When `max_len` is 0 an empty vector is returned and `rng` is not used.
fn random_binary(rng: &mut StdRng, max_len: usize) -> Vec<u8> {
if max_len == 0 {
Vec::new()
} else {
// The range is inclusive, so the drawn length may be equal to `max_len`
let len = rng.gen_range(1..=max_len);
// One random byte per position
(0..len).map(|_| rng.gen()).collect()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wondering if len differs from max_len?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that len is the actual length of the value, which is drawn between 1..max_len

}
}
2 changes: 2 additions & 0 deletions test-utils/src/array_gen/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
// specific language governing permissions and limitations
// under the License.

mod binary;
mod decimal;
mod primitive;
mod random_data;
mod string;

pub use binary::BinaryArrayGenerator;
pub use decimal::DecimalArrayGenerator;
pub use primitive::PrimitiveArrayGenerator;
pub use string::StringArrayGenerator;
7 changes: 5 additions & 2 deletions test-utils/src/array_gen/primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,13 @@ impl PrimitiveArrayGenerator {
| DataType::Date64
| DataType::Time32(_)
| DataType::Time64(_)
| DataType::Interval(_) => (0..self.num_distinct_primitives)
| DataType::Interval(_)
| DataType::Binary
| DataType::LargeBinary
| DataType::BinaryView
| DataType::Timestamp(_, _) => (0..self.num_distinct_primitives)
.map(|_| Some(A::generate_random_native_data(&mut self.rng)))
.collect(),

_ => {
let arrow_type = A::DATA_TYPE;
panic!("Unsupported arrow data type: {arrow_type}")
Expand Down
9 changes: 7 additions & 2 deletions test-utils/src/array_gen/random_data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ use arrow::datatypes::{
Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalDayTime,
IntervalDayTimeType, IntervalMonthDayNano, IntervalMonthDayNanoType,
IntervalYearMonthType, Time32MillisecondType, Time32SecondType,
Time64MicrosecondType, Time64NanosecondType, UInt16Type, UInt32Type, UInt64Type,
UInt8Type,
Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType,
TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type,
UInt32Type, UInt64Type, UInt8Type,
};
use rand::distributions::Standard;
use rand::prelude::Distribution;
Expand Down Expand Up @@ -66,6 +67,10 @@ basic_random_data!(Time64MicrosecondType);
basic_random_data!(Time64NanosecondType);
basic_random_data!(IntervalYearMonthType);
basic_random_data!(Decimal128Type);
basic_random_data!(TimestampSecondType);
basic_random_data!(TimestampMillisecondType);
basic_random_data!(TimestampMicrosecondType);
basic_random_data!(TimestampNanosecondType);

impl RandomNativeData for Date64Type {
fn generate_random_native_data(rng: &mut StdRng) -> Self::Native {
Expand Down