From a1a7194a11fd2c70af37208a31bed9d4bee1fe3c Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Sun, 10 Nov 2024 00:19:34 -0500 Subject: [PATCH 1/5] add bool col --- .../core/tests/fuzz_cases/aggregate_fuzz.rs | 1 + .../aggregation_fuzzer/data_generator.rs | 39 ++++++++++- test-utils/src/array_gen/boolean.rs | 69 +++++++++++++++++++ test-utils/src/array_gen/mod.rs | 2 + 4 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 test-utils/src/array_gen/boolean.rs diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index 16f539b75967f..f353c597add1a 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -210,6 +210,7 @@ fn baseline_config() -> DatasetGeneratorConfig { // low cardinality columns ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10), ColumnDescr::new("utf8_low", DataType::Utf8).with_max_num_distinct(10), + ColumnDescr::new("bool", DataType::Boolean), ]; let min_num_rows = 512; diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs index f0973826b5073..7fddb9608e90d 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs @@ -18,8 +18,8 @@ use std::sync::Arc; use arrow::datatypes::{ - ByteArrayType, ByteViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, + BooleanType, ByteArrayType, ByteViewType, Date32Type, Date64Type, Decimal128Type, + Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, LargeUtf8Type, StringViewType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, 
UInt16Type, UInt32Type, UInt64Type, UInt8Type, Utf8Type, @@ -35,7 +35,10 @@ use rand::{ thread_rng, Rng, SeedableRng, }; use test_utils::{ - array_gen::{DecimalArrayGenerator, PrimitiveArrayGenerator, StringArrayGenerator}, + array_gen::{ + BooleanArrayGenerator, DecimalArrayGenerator, PrimitiveArrayGenerator, + StringArrayGenerator, + }, stagger_batch, }; @@ -262,6 +265,26 @@ macro_rules! generate_decimal_array { }}; } +// Generating `BooleanArray` due to it being a special type in Arrow (bit-packed) +macro_rules! generate_boolean_array { + ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE: ident) => {{ + // Select a null percentage from the candidate percentages + let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len()); + let null_pct = $SELF.candidate_null_pcts[null_pct_idx]; + + let num_distinct_booleans = if $MAX_NUM_DISTINCT >= 2 { 2 } else { 1 }; + + let mut generator = BooleanArrayGenerator { + num_booleans: $NUM_ROWS, + num_distinct_booleans, + null_pct, + rng: $ARRAY_GEN_RNG, + }; + + generator.gen_data::<$ARROW_TYPE>() + }}; +} + macro_rules! generate_primitive_array { ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => {{ let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len()); @@ -581,6 +604,16 @@ impl RecordBatchGenerator { StringViewType ) } + DataType::Boolean => { + generate_boolean_array! { + self, + num_rows, + max_num_distinct, + batch_gen_rng, + array_gen_rng, + BooleanType + } + } _ => { panic!("Unsupported data generator type: {}", col.column_type) } diff --git a/test-utils/src/array_gen/boolean.rs b/test-utils/src/array_gen/boolean.rs new file mode 100644 index 0000000000000..7733f0cd97bfb --- /dev/null +++ b/test-utils/src/array_gen/boolean.rs @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, BooleanArray, BooleanBuilder, UInt32Array}; +use arrow::compute::take; +use rand::rngs::StdRng; +use rand::Rng; + +/// Randomly generate decimal arrays +pub struct BooleanArrayGenerator { + pub num_booleans: usize, + pub num_distinct_booleans: usize, + pub null_pct: f64, + pub rng: StdRng, +} + +impl BooleanArrayGenerator { + /// Generate BooleanArray with bit-packed values + pub fn gen_data<D>(&mut self) -> ArrayRef { + // Table of booleans from which to draw (distinct means 1 or 2) + let distinct_booleans: BooleanArray = match self.num_distinct_booleans { + 1 => { + let value = self.rng.gen::<bool>(); + let mut builder = BooleanBuilder::with_capacity(1); + builder.append_value(value); + builder.finish() + } + 2 => { + let mut builder = BooleanBuilder::with_capacity(2); + builder.append_value(true); + builder.append_value(false); + builder.finish() + } + _ => unreachable!(), + }; + + // Generate indices to select from the distinct booleans + let indices: UInt32Array = (0..self.num_booleans) + .map(|_| { + if self.rng.gen::<f64>() < self.null_pct { + None + } else if self.num_distinct_booleans > 1 { + Some(self.rng.gen_range(0..self.num_distinct_booleans as u32)) + } else { + Some(0) + } + }) + .collect(); + + let options = None; + let result = 
take(&distinct_booleans, &indices, options).unwrap(); + + result + } +} diff --git a/test-utils/src/array_gen/mod.rs b/test-utils/src/array_gen/mod.rs index 8e0e39ddfdce1..d03de8d0bdd75 100644 --- a/test-utils/src/array_gen/mod.rs +++ b/test-utils/src/array_gen/mod.rs @@ -15,11 +15,13 @@ // specific language governing permissions and limitations // under the License. +mod boolean; mod decimal; mod primitive; mod random_data; mod string; +pub use boolean::BooleanArrayGenerator; pub use decimal::DecimalArrayGenerator; pub use primitive::PrimitiveArrayGenerator; pub use string::StringArrayGenerator; From caf3ba4ca9930a07a0796f72f800106808889469 Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Sun, 10 Nov 2024 01:00:22 -0500 Subject: [PATCH 2/5] clippy fix --- datafusion/functions/src/encoding/inner.rs | 2 +- test-utils/src/array_gen/boolean.rs | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index 4f91879f94db7..cb5bba0ab6d13 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -195,7 +195,7 @@ impl ScalarUDFImpl for DecodeFunc { } match arg_types[0] { - DataType::Utf8 | DataType::Binary | DataType::Null => { + DataType::Utf8 | DataType::Utf8View | DataType::Binary | DataType::Null => { Ok(vec![DataType::Binary, DataType::Utf8]) } DataType::LargeUtf8 | DataType::LargeBinary => { diff --git a/test-utils/src/array_gen/boolean.rs b/test-utils/src/array_gen/boolean.rs index 7733f0cd97bfb..0d2c72d1b1ecd 100644 --- a/test-utils/src/array_gen/boolean.rs +++ b/test-utils/src/array_gen/boolean.rs @@ -62,8 +62,7 @@ impl BooleanArrayGenerator { .collect(); let options = None; - let result = take(&distinct_booleans, &indices, options).unwrap(); - result + take(&distinct_booleans, &indices, options).unwrap() } } From 684a90571fc898d6b23618b2ee85b8bc73fbb23b Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Sun, 10 
Nov 2024 01:01:33 -0500 Subject: [PATCH 3/5] remove change --- datafusion/functions/src/encoding/inner.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index cb5bba0ab6d13..4f91879f94db7 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -195,7 +195,7 @@ impl ScalarUDFImpl for DecodeFunc { } match arg_types[0] { - DataType::Utf8 | DataType::Utf8View | DataType::Binary | DataType::Null => { + DataType::Utf8 | DataType::Binary | DataType::Null => { Ok(vec![DataType::Binary, DataType::Utf8]) } DataType::LargeUtf8 | DataType::LargeBinary => { From 0e9f09246e3c8b149e35e1ec852f894696f63b70 Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Mon, 11 Nov 2024 11:05:11 -0500 Subject: [PATCH 4/5] fmt fix --- .../fuzz_cases/aggregation_fuzzer/data_generator.rs | 11 +++++------ test-utils/src/array_gen/mod.rs | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs index 3e937d0765768..e4c0cb6fe77f7 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs @@ -18,9 +18,9 @@ use std::sync::Arc; use arrow::datatypes::{ - BinaryType, BinaryViewType, BooleanType, ByteArrayType, ByteViewType, Date32Type, Date64Type, - Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type, - Int64Type, Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType, + BinaryType, BinaryViewType, BooleanType, ByteArrayType, ByteViewType, Date32Type, + Date64Type, Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int16Type, + Int32Type, Int64Type, Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, LargeBinaryType, LargeUtf8Type, 
StringViewType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, @@ -38,9 +38,8 @@ use rand::{ }; use test_utils::{ array_gen::{ - BooleanArrayGenerator, - BinaryArrayGenerator, DecimalArrayGenerator, PrimitiveArrayGenerator, - StringArrayGenerator, + BinaryArrayGenerator, BooleanArrayGenerator, DecimalArrayGenerator, + PrimitiveArrayGenerator, StringArrayGenerator, }, stagger_batch, }; diff --git a/test-utils/src/array_gen/mod.rs b/test-utils/src/array_gen/mod.rs index 12cf4625d53ac..1d420c543f9f4 100644 --- a/test-utils/src/array_gen/mod.rs +++ b/test-utils/src/array_gen/mod.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -mod boolean; mod binary; +mod boolean; mod decimal; mod primitive; mod random_data; From d58d7f4fec22b5b62a20bad7a19fefd99b760cad Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Tue, 12 Nov 2024 13:18:58 -0500 Subject: [PATCH 5/5] typo fix --- test-utils/src/array_gen/boolean.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test-utils/src/array_gen/boolean.rs b/test-utils/src/array_gen/boolean.rs index 0d2c72d1b1ecd..f3b83dd245f72 100644 --- a/test-utils/src/array_gen/boolean.rs +++ b/test-utils/src/array_gen/boolean.rs @@ -20,7 +20,7 @@ use arrow::compute::take; use rand::rngs::StdRng; use rand::Rng; -/// Randomly generate decimal arrays +/// Randomly generate boolean arrays pub struct BooleanArrayGenerator { pub num_booleans: usize, pub num_distinct_booleans: usize,