From 0bea528a1b2ff9b9ca06b3029570bf6b2f56808c Mon Sep 17 00:00:00 2001
From: Neil Conway
Date: Tue, 24 Feb 2026 10:57:27 -0500
Subject: [PATCH] Add tests and benchmarks

---
Summary (text between `---` and the first `diff --git` is not part of the
commit): adds a Criterion benchmark `array_position` that runs
`ArrayPosition` over Int64 list haystacks with the needle found once, found
in every other slot, and never found, for list lengths 10, 100 and 500; and
extends array.slt with NULL-needle, start_from, column-argument, empty-array
and FixedSizeList cases.

 datafusion/functions-nested/Cargo.toml        |   4 +
 .../benches/array_position.rs                 | 237 ++++++++++++++++++
 datafusion/sqllogictest/test_files/array.slt  | 105 ++++++++
 3 files changed, 346 insertions(+)
 create mode 100644 datafusion/functions-nested/benches/array_position.rs

diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml
index e5e601f30ae84..ee1d92ce5db83 100644
--- a/datafusion/functions-nested/Cargo.toml
+++ b/datafusion/functions-nested/Cargo.toml
@@ -96,3 +96,7 @@ name = "array_repeat"
 [[bench]]
 harness = false
 name = "array_set_ops"
+
+[[bench]]
+harness = false
+name = "array_position"
diff --git a/datafusion/functions-nested/benches/array_position.rs b/datafusion/functions-nested/benches/array_position.rs
new file mode 100644
index 0000000000000..08367648449d2
--- /dev/null
+++ b/datafusion/functions-nested/benches/array_position.rs
@@ -0,0 +1,237 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, Int64Array, ListArray};
+use arrow::buffer::OffsetBuffer;
+use arrow::datatypes::{DataType, Field};
+use criterion::{
+    criterion_group, criterion_main, {BenchmarkId, Criterion},
+};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_functions_nested::position::ArrayPosition;
+use rand::Rng;
+use rand::SeedableRng;
+use rand::rngs::StdRng;
+use std::hint::black_box;
+use std::sync::Arc;
+
+const NUM_ROWS: usize = 10000;
+const SEED: u64 = 42;
+const NULL_DENSITY: f64 = 0.1;
+const SENTINEL_NEEDLE: i64 = -1;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    for size in [10, 100, 500] {
+        bench_array_position(c, size);
+    }
+}
+
+fn bench_array_position(c: &mut Criterion, array_size: usize) {
+    let mut group = c.benchmark_group("array_position_i64");
+    let haystack_found_once = create_haystack_with_sentinel(
+        NUM_ROWS,
+        array_size,
+        NULL_DENSITY,
+        SENTINEL_NEEDLE,
+        0,
+    );
+    let haystack_found_many = create_haystack_with_sentinels(
+        NUM_ROWS,
+        array_size,
+        NULL_DENSITY,
+        SENTINEL_NEEDLE,
+    );
+    let haystack_not_found =
+        create_haystack_without_sentinel(NUM_ROWS, array_size, NULL_DENSITY);
+    let num_rows = haystack_not_found.len();
+    let arg_fields: Vec<Arc<Field>> = vec![
+        Field::new("haystack", haystack_not_found.data_type().clone(), false).into(),
+        Field::new("needle", DataType::Int64, false).into(),
+    ];
+    let return_field: Arc<Field> = Field::new("result", DataType::UInt64, true).into();
+    let config_options = Arc::new(ConfigOptions::default());
+    let needle = ScalarValue::Int64(Some(SENTINEL_NEEDLE));
+
+    // Benchmark: one match per row.
+    let args_found_once = vec![
+        ColumnarValue::Array(haystack_found_once.clone()),
+        ColumnarValue::Scalar(needle.clone()),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("found_once", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayPosition::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_found_once.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: num_rows,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    // Benchmark: many matches per row.
+    let args_found_many = vec![
+        ColumnarValue::Array(haystack_found_many.clone()),
+        ColumnarValue::Scalar(needle.clone()),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("found_many", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayPosition::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_found_many.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: num_rows,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    // Benchmark: needle is not found in any row.
+    let args_not_found = vec![
+        ColumnarValue::Array(haystack_not_found.clone()),
+        ColumnarValue::Scalar(needle.clone()),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("not_found", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayPosition::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_not_found.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: num_rows,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    group.finish();
+}
+
+fn create_haystack_without_sentinel(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+) -> ArrayRef {
+    create_haystack_from_fn(num_rows, array_size, |_, _, rng| {
+        random_haystack_value(rng, array_size, null_density)
+    })
+}
+
+fn create_haystack_with_sentinel(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+    sentinel: i64,
+    sentinel_index: usize,
+) -> ArrayRef {
+    assert!(sentinel_index < array_size);
+
+    create_haystack_from_fn(num_rows, array_size, |_, col, rng| {
+        if col == sentinel_index {
+            Some(sentinel)
+        } else {
+            random_haystack_value(rng, array_size, null_density)
+        }
+    })
+}
+
+fn create_haystack_with_sentinels(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+    sentinel: i64,
+) -> ArrayRef {
+    create_haystack_from_fn(num_rows, array_size, |_, col, rng| {
+        // Place the sentinel in half the positions to create many matches per row.
+        if col % 2 == 0 {
+            Some(sentinel)
+        } else {
+            random_haystack_value(rng, array_size, null_density)
+        }
+    })
+}
+
+fn create_haystack_from_fn<F>(
+    num_rows: usize,
+    array_size: usize,
+    mut value_at: F,
+) -> ArrayRef
+where
+    F: FnMut(usize, usize, &mut StdRng) -> Option<i64>,
+{
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let mut values = Vec::with_capacity(num_rows * array_size);
+    for row in 0..num_rows {
+        for col in 0..array_size {
+            values.push(value_at(row, col, &mut rng));
+        }
+    }
+    let values = values.into_iter().collect::<Int64Array>();
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Int64, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+fn random_haystack_value(
+    rng: &mut StdRng,
+    array_size: usize,
+    null_density: f64,
+) -> Option<i64> {
+    if rng.random::<f64>() < null_density {
+        None
+    } else {
+        Some(rng.random_range(0..array_size as i64))
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt
index 66503c957c5ad..cf3494394e3ef 100644
--- a/datafusion/sqllogictest/test_files/array.slt
+++ b/datafusion/sqllogictest/test_files/array.slt
@@ -3880,6 +3880,111 @@ select array_position(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [11, 12, 13]),
 NULL 6
 4 NULL
 1 NULL
+# array_position with NULL element in haystack array (NULL = NULL semantics)
+query III
+select array_position([1, NULL, 3], arrow_cast(NULL, 'Int64')), array_position([NULL, 2, 3], arrow_cast(NULL, 'Int64')), array_position([1, 2, NULL], arrow_cast(NULL, 'Int64'));
+----
+2 1 3
+
+query I
+select array_position(arrow_cast([1, NULL, 3], 'LargeList(Int64)'), arrow_cast(NULL, 'Int64'));
+----
+2
+
+# array_position with NULL element in array and start_from
+query II
+select array_position([NULL, 1, NULL, 2], arrow_cast(NULL, 'Int64'), 2), array_position([NULL, 1, NULL, 2], arrow_cast(NULL, 'Int64'), 1);
+----
+3 1
+
+# array_position with column array and scalar element
+query IIII
+select array_position(column1, 3), array_position(column1, 10), array_position(column1, 20), array_position(column1, 999) from arrays_values_without_nulls;
+----
+3 10 NULL NULL
+NULL NULL 10 NULL
+NULL NULL NULL NULL
+NULL NULL NULL NULL
+
+query II
+select array_position(column1, 3), array_position(column1, 20) from large_arrays_values_without_nulls;
+----
+3 NULL
+NULL 10
+NULL NULL
+NULL NULL
+
+query II
+select array_position(column1, 3), array_position(column1, 20) from fixed_size_arrays_values_without_nulls;
+----
+3 NULL
+NULL 10
+NULL NULL
+NULL NULL
+
+# array_position with column array, scalar element, and scalar start_from
+query II
+select array_position(column1, 3, 1), array_position(column1, 3, 4) from arrays_values_without_nulls;
+----
+3 NULL
+NULL NULL
+NULL NULL
+NULL NULL
+
+query II
+select array_position(column1, 3, 1), array_position(column1, 3, 4) from large_arrays_values_without_nulls;
+----
+3 NULL
+NULL NULL
+NULL NULL
+NULL NULL
+
+# array_position with column array, scalar element, and column start_from
+query I
+select array_position(column1, 3, column3) from arrays_values_without_nulls;
+----
+3
+NULL
+NULL
+NULL
+
+# array_position with scalar haystack, scalar element, and column start_from
+query I
+select array_position([1, 2, 1, 2], 2, column3) from arrays_values_without_nulls;
+----
+2
+2
+4
+4
+
+# array_position start_from boundary cases
+query IIII
+select array_position([1, 2, 3], 3, 3), array_position([1, 2, 3], 1, 2), array_position([1, 2, 3], 1, 1), array_position([1, 2, 3], 3, 4);
+----
+3 NULL 1 NULL
+
+query II
+select array_position([1, 2, 3], 3, 4), array_position([1], 1, 2);
+----
+NULL NULL
+
+# array_position with empty array in various contexts
+query II
+select array_position(arrow_cast(make_array(), 'List(Int64)'), 1), array_position(arrow_cast(make_array(), 'LargeList(Int64)'), 1);
+----
+NULL NULL
+
+# FixedSizeList with start_from
+query II
+select array_position(arrow_cast([1, 2, 3, 1, 2], 'FixedSizeList(5, Int64)'), 1, 2), array_position(arrow_cast([1, 2, 3, 1, 2], 'FixedSizeList(5, Int64)'), 2, 4);
+----
+4 5
+
+query I
+select array_position(arrow_cast(['a', 'b', 'c', 'b'], 'FixedSizeList(4, Utf8)'), 'b', 3);
+----
+4
+
 ## array_positions (aliases: `list_positions`)
 
 query ?