From 7a7f215f007722dc38387c5dae439ca9614a66ff Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 18 Sep 2024 02:26:33 +0800 Subject: [PATCH 1/9] complete benchmark for ltrim. --- datafusion/functions/benches/ltrim.rs | 162 +++++++++++++++++++++++--- 1 file changed, 147 insertions(+), 15 deletions(-) diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs index 01acb9de3381..a54a1d3b8f87 100644 --- a/datafusion/functions/benches/ltrim.rs +++ b/datafusion/functions/benches/ltrim.rs @@ -17,32 +17,164 @@ extern crate criterion; -use arrow::array::{ArrayRef, StringArray}; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use arrow::{ + array::{ArrayRef, LargeStringArray, StringArray, StringViewArray}, +}; +use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode}; use datafusion_common::ScalarValue; use datafusion_expr::ColumnarValue; use datafusion_functions::string; +use rand::{distributions::Alphanumeric, rngs::StdRng, Rng, SeedableRng}; use std::sync::Arc; -fn create_args(size: usize, characters: &str) -> Vec { - let iter = - std::iter::repeat(format!("{}datafusion{}", characters, characters)).take(size); - let array = Arc::new(StringArray::from_iter_values(iter)) as ArrayRef; +pub fn seedable_rng() -> StdRng { + StdRng::seed_from_u64(42) +} + +pub enum StringArrayType { + Utf8View, + Utf8, + LargeUtf8, +} + +pub fn create_prefixed_string_array_and_pattern( + size: usize, + prefix: &str, + generated_len: usize, + string_array_type: StringArrayType, +) -> (ArrayRef, ScalarValue) { + let rng = &mut seedable_rng(); + + let lens = vec![generated_len; size]; + let string_iter = lens.into_iter().map(|len| { + if rng.gen::() < 0.1 { + None + } else { + let mut value = prefix.as_bytes().to_vec(); + let generated = rng.sample_iter(&Alphanumeric).take(len); + value.extend(generated); + Some(String::from_utf8(value).unwrap()) + } + }); + + match string_array_type { + StringArrayType::Utf8View => ( + Arc::new(string_iter.collect::()), + ScalarValue::Utf8View(Some(prefix.to_string())), + ), + StringArrayType::Utf8 => ( + Arc::new(string_iter.collect::()), + ScalarValue::Utf8(Some(prefix.to_string())), + ), + StringArrayType::LargeUtf8 => ( + Arc::new(string_iter.collect::()), + ScalarValue::LargeUtf8(Some(prefix.to_string())), + ), + } +} + +fn create_args( + size: usize, + characters: &str, + generated_len: usize, + string_array_type: StringArrayType, +) -> Vec { + let (string_array, pattern) = create_prefixed_string_array_and_pattern( + size, + characters, + generated_len, + string_array_type, + ); vec![ - ColumnarValue::Array(array), - ColumnarValue::Scalar(ScalarValue::Utf8(Some(characters.to_string()))), + ColumnarValue::Array(string_array), + ColumnarValue::Scalar(pattern), ] } fn criterion_benchmark(c: &mut Criterion) { let ltrim = string::ltrim(); - for char in ["\"", "Header:"] { - for size in [1024, 4096, 8192] { - let args = create_args(size, char); - c.bench_function(&format!("ltrim {}: {}", char, size), |b| { - b.iter(|| black_box(ltrim.invoke(&args))) - }); - } + let prefix = ",!()"; + for size in [1024, 4096, 8192] { + // len=12, prefix=4, len_after_ltrim=8 + let len = 12; + let len_exclude_prefix = len - prefix.len(); + let mut group = c.benchmark_group("INPUT LEN <= 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::Utf8View); + group.bench_function( + format!("ltrim_string_view [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(ltrim.invoke(&args))), + ); + + let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::Utf8); + group.bench_function( + format!("ltrim_string [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(ltrim.invoke(&args))), + ); + + let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::LargeUtf8); + group.bench_function( + format!("ltrim_large_string [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(ltrim.invoke(&args))), + ); + + group.finish(); + + // len=64, prefix=4, len_after_ltrim=60 + let len = 64; + let len_exclude_prefix = len - prefix.len(); + let mut group = c.benchmark_group("INPUT LEN > 12, OUTPUT LEN > 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::Utf8View); + group.bench_function( + format!("ltrim_string_view [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(ltrim.invoke(&args))), + ); + + let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::Utf8); + group.bench_function( + format!("ltrim_string [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(ltrim.invoke(&args))), + ); + + let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::LargeUtf8); + group.bench_function( + format!("ltrim_large_string [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(ltrim.invoke(&args))), + ); + + group.finish(); + + // len=15, prefix=4, len_after_ltrim=11 + let len = 15; + let len_exclude_prefix = len - prefix.len(); + let mut group = c.benchmark_group("INPUT LEN > 12, OUTPUT LEN <= 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::Utf8View); + group.bench_function( + format!("ltrim_string_view [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(ltrim.invoke(&args))), + ); + + let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::Utf8); + group.bench_function( + format!("ltrim_string [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(ltrim.invoke(&args))), + ); + + let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::LargeUtf8); + group.bench_function( + format!("ltrim_large_string [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(ltrim.invoke(&args))), + ); + + group.finish(); } } From db8e57636677540eda3e82bcfd39ab5c47ab43f4 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 18 Sep 2024 09:06:03 +0800 Subject: [PATCH 2/9] improve benchmarks. --- datafusion/functions/benches/ltrim.rs | 178 ++++++++++++++++++++------ 1 file changed, 138 insertions(+), 40 deletions(-) diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs index a54a1d3b8f87..90873ca6cbd6 100644 --- a/datafusion/functions/benches/ltrim.rs +++ b/datafusion/functions/benches/ltrim.rs @@ -17,9 +17,7 @@ extern crate criterion; -use arrow::{ - array::{ArrayRef, LargeStringArray, StringArray, StringViewArray}, -}; +use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray}; use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode}; use datafusion_common::ScalarValue; use datafusion_expr::ColumnarValue; @@ -37,20 +35,21 @@ pub enum StringArrayType { LargeUtf8, } -pub fn create_prefixed_string_array_and_pattern( +pub fn create_string_array_and_characters( size: usize, - prefix: &str, - generated_len: usize, + characters: &str, + trimmed: &str, + remaining_len: usize, string_array_type: StringArrayType, ) -> (ArrayRef, ScalarValue) { let rng = &mut seedable_rng(); - let lens = vec![generated_len; size]; + let lens = vec![remaining_len; size]; let string_iter = lens.into_iter().map(|len| { if rng.gen::() < 0.1 { None } else { - let mut value = prefix.as_bytes().to_vec(); + let mut value = trimmed.as_bytes().to_vec(); let generated = rng.sample_iter(&Alphanumeric).take(len); value.extend(generated); Some(String::from_utf8(value).unwrap()) @@ -60,29 +59,43 @@ pub fn create_prefixed_string_array_and_pattern( match string_array_type { StringArrayType::Utf8View => ( Arc::new(string_iter.collect::()), - ScalarValue::Utf8View(Some(prefix.to_string())), + ScalarValue::Utf8View(Some(trimmed.to_string())), ), StringArrayType::Utf8 => ( Arc::new(string_iter.collect::()), - ScalarValue::Utf8(Some(prefix.to_string())), + ScalarValue::Utf8(Some(trimmed.to_string())), ), StringArrayType::LargeUtf8 => ( Arc::new(string_iter.collect::()), - ScalarValue::LargeUtf8(Some(prefix.to_string())), + ScalarValue::LargeUtf8(Some(trimmed.to_string())), ), } } +/// Create args for the ltrim benchmark +/// Inputs: +/// - size: rows num of the test array +/// - characters: the characters we need to trim +/// - trimmed: the part in the testing string that will be trimmed +/// - remaining_len: the len of the remaining part of testing string after trimming +/// - string_array_type: the method used to store the testing strings +/// +/// Outputs: +/// - testing string array +/// - trimmed characters +/// fn create_args( size: usize, characters: &str, - generated_len: usize, + trimmed: &str, + remaining_len: usize, string_array_type: StringArrayType, ) -> Vec { - let (string_array, pattern) = create_prefixed_string_array_and_pattern( + let (string_array, pattern) = create_string_array_and_characters( size, characters, - generated_len, + trimmed, + remaining_len, string_array_type, ); vec![ @@ -93,84 +106,169 @@ fn create_args( fn criterion_benchmark(c: &mut Criterion) { let ltrim = string::ltrim(); - let prefix = ",!()"; + let characters = ",!()"; + for size in [1024, 4096, 8192] { - // len=12, prefix=4, len_after_ltrim=8 + // len=12, trimmed_len=4, len_after_ltrim=8 let len = 12; - let len_exclude_prefix = len - prefix.len(); + let trimmed = characters; + let remaining_len = len - trimmed.len(); let mut group = c.benchmark_group("INPUT LEN <= 12"); group.sampling_mode(SamplingMode::Flat); group.sample_size(10); - let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::Utf8View); + let args = create_args( + size, + &characters, + &trimmed, + remaining_len, + StringArrayType::Utf8View, + ); group.bench_function( - format!("ltrim_string_view [size={}, strlen={}]", size, len), + format!( + "ltrim_string_view [size={}, len_before_ltrim={}, len_after_ltrim={}]", + size, len, remaining_len + ), |b| b.iter(|| black_box(ltrim.invoke(&args))), ); - let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::Utf8); + let args = create_args( + size, + &characters, + &trimmed, + remaining_len, + StringArrayType::Utf8, + ); group.bench_function( - format!("ltrim_string [size={}, strlen={}]", size, len), + format!( + "ltrim_string [size={}, len_before_ltrim={}, len_after_ltrim={}]", + size, len, remaining_len + ), |b| b.iter(|| black_box(ltrim.invoke(&args))), ); - let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::LargeUtf8); + let args = create_args( + size, + &characters, + &trimmed, + remaining_len, + StringArrayType::LargeUtf8, + ); group.bench_function( - format!("ltrim_large_string [size={}, strlen={}]", size, len), + format!( + "ltrim_large_string [size={}, len_before_ltrim={}, len_after_ltrim={}]", + size, len, remaining_len + ), |b| b.iter(|| black_box(ltrim.invoke(&args))), ); group.finish(); - // len=64, prefix=4, len_after_ltrim=60 + // len=64, trimmed_len=4, len_after_ltrim=60 let len = 64; - let len_exclude_prefix = len - prefix.len(); + let trimmed = characters; + let remaining_len = len - trimmed.len(); let mut group = c.benchmark_group("INPUT LEN > 12, OUTPUT LEN > 12"); group.sampling_mode(SamplingMode::Flat); group.sample_size(10); - let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::Utf8View); + let args = create_args( + size, + &characters, + &trimmed, + remaining_len, + StringArrayType::Utf8View, + ); group.bench_function( - format!("ltrim_string_view [size={}, strlen={}]", size, len), + format!( + "ltrim_string_view [size={}, len_before_ltrim={}, len_after_ltrim={}]", + size, len, remaining_len + ), |b| b.iter(|| black_box(ltrim.invoke(&args))), ); - let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::Utf8); + let args = create_args( + size, + &characters, + &trimmed, + remaining_len, + StringArrayType::Utf8, + ); group.bench_function( - format!("ltrim_string [size={}, strlen={}]", size, len), + format!( + "ltrim_string [size={}, len_before_ltrim={}, len_after_ltrim={}]", + size, len, remaining_len + ), |b| b.iter(|| black_box(ltrim.invoke(&args))), ); - let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::LargeUtf8); + let args = create_args( + size, + &characters, + &trimmed, + remaining_len, + StringArrayType::LargeUtf8, + ); group.bench_function( - format!("ltrim_large_string [size={}, strlen={}]", size, len), + format!( + "ltrim_large_string [size={}, len_before_ltrim={}, len_after_ltrim={}]", + size, len, remaining_len + ), |b| b.iter(|| black_box(ltrim.invoke(&args))), ); group.finish(); - // len=15, prefix=4, len_after_ltrim=11 - let len = 15; - let len_exclude_prefix = len - prefix.len(); + // len=64, trimmed_len=56, len_after_ltrim=8 + let len = 64; + let trimmed = characters.repeat(15); + let remaining_len = len - trimmed.len(); let mut group = c.benchmark_group("INPUT LEN > 12, OUTPUT LEN <= 12"); group.sampling_mode(SamplingMode::Flat); group.sample_size(10); - let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::Utf8View); + let args = create_args( + size, + &characters, + &trimmed, + remaining_len, + StringArrayType::Utf8View, + ); group.bench_function( - format!("ltrim_string_view [size={}, strlen={}]", size, len), + format!( + "ltrim_string_view [size={}, len_before_ltrim={}, len_after_ltrim={}]", + size, len, remaining_len + ), |b| b.iter(|| black_box(ltrim.invoke(&args))), ); - let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::Utf8); + let args = create_args( + size, + &characters, + &trimmed, + remaining_len, + StringArrayType::Utf8, + ); group.bench_function( - format!("ltrim_string [size={}, strlen={}]", size, len), + format!( + "ltrim_string [size={}, len_before_ltrim={}, len_after_ltrim={}]", + size, len, remaining_len + ), |b| b.iter(|| black_box(ltrim.invoke(&args))), ); - let args = create_args(size, &prefix, len_exclude_prefix, StringArrayType::LargeUtf8); + let args = create_args( + size, + &characters, + &trimmed, + remaining_len, + StringArrayType::LargeUtf8, + ); group.bench_function( - format!("ltrim_large_string [size={}, strlen={}]", size, len), + format!( + "ltrim_large_string [size={}, len_before_ltrim={}, len_after_ltrim={}]", + size, len, remaining_len + ), |b| b.iter(|| black_box(ltrim.invoke(&args))), ); From 6f03eb22f4513fba4387fc9ff6fe7290da7ff156 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 18 Sep 2024 09:25:03 +0800 Subject: [PATCH 3/9] remove unused param. --- datafusion/functions/benches/ltrim.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs index 90873ca6cbd6..23fff5b71953 100644 --- a/datafusion/functions/benches/ltrim.rs +++ b/datafusion/functions/benches/ltrim.rs @@ -37,7 +37,6 @@ pub enum StringArrayType { pub fn create_string_array_and_characters( size: usize, - characters: &str, trimmed: &str, remaining_len: usize, string_array_type: StringArrayType, @@ -93,7 +92,6 @@ fn create_args( ) -> Vec { let (string_array, pattern) = create_string_array_and_characters( size, - characters, trimmed, remaining_len, string_array_type, From a5625aa5758b5391853f7471fdc690bba3027a28 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 18 Sep 2024 09:28:53 +0800 Subject: [PATCH 4/9] fix bench. --- datafusion/functions/benches/ltrim.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs index 23fff5b71953..8007945f5624 100644 --- a/datafusion/functions/benches/ltrim.rs +++ b/datafusion/functions/benches/ltrim.rs @@ -37,6 +37,7 @@ pub enum StringArrayType { pub fn create_string_array_and_characters( size: usize, + characters: &str, trimmed: &str, remaining_len: usize, string_array_type: StringArrayType, @@ -58,15 +59,15 @@ pub fn create_string_array_and_characters( match string_array_type { StringArrayType::Utf8View => ( Arc::new(string_iter.collect::()), - ScalarValue::Utf8View(Some(trimmed.to_string())), + ScalarValue::Utf8View(Some(characters.to_string())), ), StringArrayType::Utf8 => ( Arc::new(string_iter.collect::()), - ScalarValue::Utf8(Some(trimmed.to_string())), + ScalarValue::Utf8(Some(characters.to_string())), ), StringArrayType::LargeUtf8 => ( Arc::new(string_iter.collect::()), - ScalarValue::LargeUtf8(Some(trimmed.to_string())), + ScalarValue::LargeUtf8(Some(characters.to_string())), ), } } @@ -92,6 +93,7 @@ fn create_args( ) -> Vec { let (string_array, pattern) = create_string_array_and_characters( size, + characters, trimmed, remaining_len, string_array_type, From 35b6ca49c5c268ce340af041d29b846340e2fdff Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 18 Sep 2024 12:05:46 +0800 Subject: [PATCH 5/9] refactor to remove repeated codes. --- datafusion/functions/benches/ltrim.rs | 228 ++++++++++---------------- 1 file changed, 90 insertions(+), 138 deletions(-) diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs index 8007945f5624..838719b9ed59 100644 --- a/datafusion/functions/benches/ltrim.rs +++ b/datafusion/functions/benches/ltrim.rs @@ -18,23 +18,37 @@ extern crate criterion; use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray}; -use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode}; +use criterion::{ + black_box, criterion_group, criterion_main, measurement::Measurement, BenchmarkGroup, + Criterion, SamplingMode, +}; use datafusion_common::ScalarValue; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, ScalarUDF}; use datafusion_functions::string; use rand::{distributions::Alphanumeric, rngs::StdRng, Rng, SeedableRng}; -use std::sync::Arc; +use std::{fmt, sync::Arc}; pub fn seedable_rng() -> StdRng { StdRng::seed_from_u64(42) } +#[derive(Clone, Copy)] pub enum StringArrayType { Utf8View, Utf8, LargeUtf8, } +impl fmt::Display for StringArrayType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + StringArrayType::Utf8View => f.write_str("string_view"), + StringArrayType::Utf8 => f.write_str("string"), + StringArrayType::LargeUtf8 => f.write_str("large_string"), + } + } +} + pub fn create_string_array_and_characters( size: usize, characters: &str, @@ -104,175 +118,113 @@ fn create_args( ] } +fn run_with_string_type<'a, M: Measurement>( + group: &mut BenchmarkGroup<'a, M>, + ltrim: &ScalarUDF, + size: usize, + len: usize, + characters: &str, + trimmed: &str, + remaining_len: usize, + string_type: StringArrayType, +) { + let args = create_args(size, &characters, &trimmed, remaining_len, string_type); + group.bench_function( + format!( + "{string_type} [size={size}, len_before={len}, len_after={remaining_len}]", + ), + |b| b.iter(|| black_box(ltrim.invoke(&args))), + ); +} + +fn run_one_group( + c: &mut Criterion, + group_name: &str, + ltrim: &ScalarUDF, + string_types: &[StringArrayType], + size: usize, + len: usize, + characters: &str, + trimmed: &str, + remaining_len: usize, +) { + let mut group = c.benchmark_group(group_name); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + for string_type in string_types { + run_with_string_type( + &mut group, + ltrim, + size, + len, + characters, + trimmed, + remaining_len, + *string_type, + ); + } + + group.finish(); +} + fn criterion_benchmark(c: &mut Criterion) { let ltrim = string::ltrim(); let characters = ",!()"; + let string_types = [ + StringArrayType::Utf8View, + StringArrayType::Utf8, + StringArrayType::LargeUtf8, + ]; for size in [1024, 4096, 8192] { // len=12, trimmed_len=4, len_after_ltrim=8 let len = 12; let trimmed = characters; let remaining_len = len - trimmed.len(); - let mut group = c.benchmark_group("INPUT LEN <= 12"); - group.sampling_mode(SamplingMode::Flat); - group.sample_size(10); - - let args = create_args( - size, - &characters, - &trimmed, - remaining_len, - StringArrayType::Utf8View, - ); - group.bench_function( - format!( - "ltrim_string_view [size={}, len_before_ltrim={}, len_after_ltrim={}]", - size, len, remaining_len - ), - |b| b.iter(|| black_box(ltrim.invoke(&args))), - ); - - let args = create_args( - size, - &characters, - &trimmed, - remaining_len, - StringArrayType::Utf8, - ); - group.bench_function( - format!( - "ltrim_string [size={}, len_before_ltrim={}, len_after_ltrim={}]", - size, len, remaining_len - ), - |b| b.iter(|| black_box(ltrim.invoke(&args))), - ); - - let args = create_args( + run_one_group( + c, + "INPUT LEN <= 12", + <rim, + &string_types, size, + len, &characters, &trimmed, remaining_len, - StringArrayType::LargeUtf8, - ); - group.bench_function( - format!( - "ltrim_large_string [size={}, len_before_ltrim={}, len_after_ltrim={}]", - size, len, remaining_len - ), - |b| b.iter(|| black_box(ltrim.invoke(&args))), ); - group.finish(); - // len=64, trimmed_len=4, len_after_ltrim=60 let len = 64; let trimmed = characters; let remaining_len = len - trimmed.len(); - let mut group = c.benchmark_group("INPUT LEN > 12, OUTPUT LEN > 12"); - group.sampling_mode(SamplingMode::Flat); - group.sample_size(10); - - let args = create_args( + run_one_group( + c, + "INPUT LEN > 12, OUTPUT LEN > 12", + <rim, + &string_types, size, + len, &characters, &trimmed, remaining_len, - StringArrayType::Utf8View, - ); - group.bench_function( - format!( - "ltrim_string_view [size={}, len_before_ltrim={}, len_after_ltrim={}]", - size, len, remaining_len - ), - |b| b.iter(|| black_box(ltrim.invoke(&args))), - ); - - let args = create_args( - size, - &characters, - &trimmed, - remaining_len, - StringArrayType::Utf8, - ); - group.bench_function( - format!( - "ltrim_string [size={}, len_before_ltrim={}, len_after_ltrim={}]", - size, len, remaining_len - ), - |b| b.iter(|| black_box(ltrim.invoke(&args))), - ); - - let args = create_args( - size, - &characters, - &trimmed, - remaining_len, - StringArrayType::LargeUtf8, - ); - group.bench_function( - format!( - "ltrim_large_string [size={}, len_before_ltrim={}, len_after_ltrim={}]", - size, len, remaining_len - ), - |b| b.iter(|| black_box(ltrim.invoke(&args))), ); - group.finish(); - // len=64, trimmed_len=56, len_after_ltrim=8 let len = 64; let trimmed = characters.repeat(15); let remaining_len = len - trimmed.len(); - let mut group = c.benchmark_group("INPUT LEN > 12, OUTPUT LEN <= 12"); - group.sampling_mode(SamplingMode::Flat); - group.sample_size(10); - - let args = create_args( - size, - &characters, - &trimmed, - remaining_len, - StringArrayType::Utf8View, - ); - group.bench_function( - format!( - "ltrim_string_view [size={}, len_before_ltrim={}, len_after_ltrim={}]", - size, len, remaining_len - ), - |b| b.iter(|| black_box(ltrim.invoke(&args))), - ); - - let args = create_args( + run_one_group( + c, + "INPUT LEN > 12, OUTPUT LEN <= 12", + <rim, + &string_types, size, + len, &characters, &trimmed, remaining_len, - StringArrayType::Utf8, ); - group.bench_function( - format!( - "ltrim_string [size={}, len_before_ltrim={}, len_after_ltrim={}]", - size, len, remaining_len - ), - |b| b.iter(|| black_box(ltrim.invoke(&args))), - ); - - let args = create_args( - size, - &characters, - &trimmed, - remaining_len, - StringArrayType::LargeUtf8, - ); - group.bench_function( - format!( - "ltrim_large_string [size={}, len_before_ltrim={}, len_after_ltrim={}]", - size, len, remaining_len - ), - |b| b.iter(|| black_box(ltrim.invoke(&args))), - ); - - group.finish(); } } From a3d7da62c015c6eb3850f892202c26f7e06330a0 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 18 Sep 2024 12:10:16 +0800 Subject: [PATCH 6/9] fix clippy. --- datafusion/functions/benches/ltrim.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs index 838719b9ed59..02d90a9a70ea 100644 --- a/datafusion/functions/benches/ltrim.rs +++ b/datafusion/functions/benches/ltrim.rs @@ -118,8 +118,9 @@ fn create_args( ] } -fn run_with_string_type<'a, M: Measurement>( - group: &mut BenchmarkGroup<'a, M>, +#[allow(clippy::too_many_arguments)] +fn run_with_string_type( + group: &mut BenchmarkGroup<'_, M>, ltrim: &ScalarUDF, size: usize, len: usize, @@ -128,7 +129,7 @@ fn run_with_string_type<'a, M: Measurement>( remaining_len: usize, string_type: StringArrayType, ) { - let args = create_args(size, &characters, &trimmed, remaining_len, string_type); + let args = create_args(size, characters, trimmed, remaining_len, string_type); group.bench_function( format!( "{string_type} [size={size}, len_before={len}, len_after={remaining_len}]", @@ -137,6 +138,7 @@ fn run_with_string_type<'a, M: Measurement>( ); } +#[allow(clippy::too_many_arguments)] fn run_one_group( c: &mut Criterion, group_name: &str, @@ -189,8 +191,8 @@ fn criterion_benchmark(c: &mut Criterion) { &string_types, size, len, - &characters, - &trimmed, + characters, + trimmed, remaining_len, ); @@ -205,8 +207,8 @@ fn criterion_benchmark(c: &mut Criterion) { &string_types, size, len, - &characters, - &trimmed, + characters, + trimmed, remaining_len, ); @@ -221,7 +223,7 @@ fn criterion_benchmark(c: &mut Criterion) { &string_types, size, len, - &characters, + characters, &trimmed, remaining_len, ); From 3f02b45cfb54f54c24acb3fce71ee943575acad9 Mon Sep 17 00:00:00 2001 From: kamille <3144148605@qq.com> Date: Wed, 18 Sep 2024 19:24:44 +0800 Subject: [PATCH 7/9] Update datafusion/functions/benches/ltrim.rs Co-authored-by: Andrew Lamb --- datafusion/functions/benches/ltrim.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs index 02d90a9a70ea..ced0ac77c692 100644 --- a/datafusion/functions/benches/ltrim.rs +++ b/datafusion/functions/benches/ltrim.rs @@ -49,6 +49,7 @@ impl fmt::Display for StringArrayType { } } +/// returns an array of strings, and `characters` as a ScalarValue pub fn create_string_array_and_characters( size: usize, characters: &str, From 2485abb1c71b1e63559219ac6a88dd2959aa408d Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 18 Sep 2024 19:37:27 +0800 Subject: [PATCH 8/9] improve codes and add more comments. --- datafusion/functions/benches/ltrim.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs index ced0ac77c692..6d8772d320a1 100644 --- a/datafusion/functions/benches/ltrim.rs +++ b/datafusion/functions/benches/ltrim.rs @@ -59,18 +59,22 @@ pub fn create_string_array_and_characters( ) -> (ArrayRef, ScalarValue) { let rng = &mut seedable_rng(); - let lens = vec![remaining_len; size]; - let string_iter = lens.into_iter().map(|len| { + // Create `size` rows: + // - 10% rows will be `None` + // - Other 90% will be strings with same `remaining_len` lengths + // We will build the string array on it later. + let string_iter = (0..size).into_iter().map(|_| { if rng.gen::() < 0.1 { None } else { let mut value = trimmed.as_bytes().to_vec(); - let generated = rng.sample_iter(&Alphanumeric).take(len); + let generated = rng.sample_iter(&Alphanumeric).take(remaining_len); value.extend(generated); Some(String::from_utf8(value).unwrap()) } }); + // Build the target `string array` and `characters` according to `string_array_type` match string_array_type { StringArrayType::Utf8View => ( Arc::new(string_iter.collect::()), From 161fd189ba562bb3183531a6685e148969654858 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 18 Sep 2024 19:39:11 +0800 Subject: [PATCH 9/9] fix clippy. --- datafusion/functions/benches/ltrim.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs index 6d8772d320a1..b3fa5ef4fdff 100644 --- a/datafusion/functions/benches/ltrim.rs +++ b/datafusion/functions/benches/ltrim.rs @@ -63,7 +63,7 @@ pub fn create_string_array_and_characters( // - 10% rows will be `None` // - Other 90% will be strings with same `remaining_len` lengths // We will build the string array on it later. - let string_iter = (0..size).into_iter().map(|_| { + let string_iter = (0..size).map(|_| { if rng.gen::() < 0.1 { None } else {