diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index 792e23b519e04..29e1d7bc22ecb 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -225,6 +225,7 @@ fn baseline_config() -> DatasetGeneratorConfig { // low cardinality columns ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10), ColumnDescr::new("utf8_low", DataType::Utf8).with_max_num_distinct(10), + ColumnDescr::new("bool", DataType::Boolean), ColumnDescr::new("binary", DataType::Binary), ColumnDescr::new("large_binary", DataType::LargeBinary), ColumnDescr::new("binaryview", DataType::BinaryView), diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs index fd4e3c40db2a2..e4c0cb6fe77f7 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs @@ -18,9 +18,9 @@ use std::sync::Arc; use arrow::datatypes::{ - BinaryType, BinaryViewType, ByteArrayType, ByteViewType, Date32Type, Date64Type, - Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type, - Int64Type, Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType, + BinaryType, BinaryViewType, BooleanType, ByteArrayType, ByteViewType, Date32Type, + Date64Type, Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int16Type, + Int32Type, Int64Type, Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, LargeBinaryType, LargeUtf8Type, StringViewType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, @@ -38,8 +38,8 @@ use rand::{ }; use test_utils::{ array_gen::{ - BinaryArrayGenerator, DecimalArrayGenerator, PrimitiveArrayGenerator, - 
StringArrayGenerator, + BinaryArrayGenerator, BooleanArrayGenerator, DecimalArrayGenerator, + PrimitiveArrayGenerator, StringArrayGenerator, }, stagger_batch, }; @@ -269,6 +269,26 @@ macro_rules! generate_decimal_array { }}; } +// Generating `BooleanArray` due to it being a special type in Arrow (bit-packed) +macro_rules! generate_boolean_array { + ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE: ident) => {{ + // Select a null percentage from the candidate percentages + let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len()); + let null_pct = $SELF.candidate_null_pcts[null_pct_idx]; + + let num_distinct_booleans = if $MAX_NUM_DISTINCT >= 2 { 2 } else { 1 }; + + let mut generator = BooleanArrayGenerator { + num_booleans: $NUM_ROWS, + num_distinct_booleans, + null_pct, + rng: $ARRAY_GEN_RNG, + }; + + generator.gen_data::<$ARROW_TYPE>() + }}; +} + macro_rules! generate_primitive_array { ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => {{ let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len()); @@ -689,6 +709,16 @@ impl RecordBatchGenerator { StringViewType ) } + DataType::Boolean => { + generate_boolean_array! { + self, + num_rows, + max_num_distinct, + batch_gen_rng, + array_gen_rng, + BooleanType + } + } _ => { panic!("Unsupported data generator type: {}", col.column_type) } diff --git a/test-utils/src/array_gen/boolean.rs b/test-utils/src/array_gen/boolean.rs new file mode 100644 index 0000000000000..f3b83dd245f72 --- /dev/null +++ b/test-utils/src/array_gen/boolean.rs @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, BooleanArray, BooleanBuilder, UInt32Array}; +use arrow::compute::take; +use rand::rngs::StdRng; +use rand::Rng; + +/// Randomly generate boolean arrays +pub struct BooleanArrayGenerator { + pub num_booleans: usize, + pub num_distinct_booleans: usize, + pub null_pct: f64, + pub rng: StdRng, +} + +impl BooleanArrayGenerator { + /// Generate BooleanArray with bit-packed values + pub fn gen_data<D>(&mut self) -> ArrayRef { + // Table of booleans from which to draw (distinct means 1 or 2) + let distinct_booleans: BooleanArray = match self.num_distinct_booleans { + 1 => { + let value = self.rng.gen::<bool>(); + let mut builder = BooleanBuilder::with_capacity(1); + builder.append_value(value); + builder.finish() + } + 2 => { + let mut builder = BooleanBuilder::with_capacity(2); + builder.append_value(true); + builder.append_value(false); + builder.finish() + } + _ => unreachable!(), + }; + + // Generate indices to select from the distinct booleans + let indices: UInt32Array = (0..self.num_booleans) + .map(|_| { + if self.rng.gen::<f64>() < self.null_pct { + None + } else if self.num_distinct_booleans > 1 { + Some(self.rng.gen_range(0..self.num_distinct_booleans as u32)) + } else { + Some(0) + } + }) + .collect(); + + let options = None; + + take(&distinct_booleans, &indices, options).unwrap() + } +} diff --git a/test-utils/src/array_gen/mod.rs
b/test-utils/src/array_gen/mod.rs index d076bb1b6f0b8..1d420c543f9f4 100644 --- a/test-utils/src/array_gen/mod.rs +++ b/test-utils/src/array_gen/mod.rs @@ -16,12 +16,14 @@ // under the License. mod binary; +mod boolean; mod decimal; mod primitive; mod random_data; mod string; pub use binary::BinaryArrayGenerator; +pub use boolean::BooleanArrayGenerator; pub use decimal::DecimalArrayGenerator; pub use primitive::PrimitiveArrayGenerator; pub use string::StringArrayGenerator;