diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml index 0b14b5bfae8..1b814fe3b75 100644 --- a/rust/arrow/Cargo.toml +++ b/rust/arrow/Cargo.toml @@ -111,7 +111,7 @@ name = "take_kernels" harness = false [[bench]] -name = "length_kernel" +name = "octet_length_kernel" harness = false [[bench]] diff --git a/rust/arrow/benches/length_kernel.rs b/rust/arrow/benches/octet_length_kernel.rs similarity index 85% rename from rust/arrow/benches/length_kernel.rs rename to rust/arrow/benches/octet_length_kernel.rs index b70f6374f8f..17c7e427ef0 100644 --- a/rust/arrow/benches/length_kernel.rs +++ b/rust/arrow/benches/octet_length_kernel.rs @@ -22,10 +22,10 @@ use criterion::Criterion; extern crate arrow; use arrow::array::*; -use arrow::compute::kernels::length::length; +use arrow::compute::kernels::octet_length::octet_length; -fn bench_length(array: &StringArray) { - criterion::black_box(length(array).unwrap()); +fn bench_octet_length(array: &StringArray) { + criterion::black_box(octet_length(array).unwrap()); } fn add_benchmark(c: &mut Criterion) { @@ -40,7 +40,7 @@ fn add_benchmark(c: &mut Criterion) { } let array = StringArray::from(values); - c.bench_function("length", |b| b.iter(|| bench_length(&array))); + c.bench_function("length", |b| b.iter(|| bench_octet_length(&array))); } criterion_group!(benches, add_benchmark); diff --git a/rust/arrow/src/compute/kernels/mod.rs b/rust/arrow/src/compute/kernels/mod.rs index 49283e430bd..0ac5793b962 100644 --- a/rust/arrow/src/compute/kernels/mod.rs +++ b/rust/arrow/src/compute/kernels/mod.rs @@ -25,8 +25,8 @@ pub mod cast; pub mod comparison; pub mod concat; pub mod filter; -pub mod length; pub mod limit; +pub mod octet_length; pub mod sort; pub mod substring; pub mod take; diff --git a/rust/arrow/src/compute/kernels/length.rs b/rust/arrow/src/compute/kernels/octet_length.rs similarity index 86% rename from rust/arrow/src/compute/kernels/length.rs rename to rust/arrow/src/compute/kernels/octet_length.rs index 5f2ed59e262..bb050825a8c 100644 --- a/rust/arrow/src/compute/kernels/length.rs +++ b/rust/arrow/src/compute/kernels/octet_length.rs @@ -24,7 +24,7 @@ use crate::{ }; use std::sync::Arc; -fn length_string(array: &Array, data_type: DataType) -> Result +fn octet_length_string(array: &Array, data_type: DataType) -> Result where OffsetSize: OffsetSizeTrait, { @@ -41,7 +41,7 @@ where // Benefit // ~60% speedup // Soundness - // `values` is an iterator with a known size. + // `lengths` is an iterator with a known size. let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) }; let null_bit_buffer = array @@ -62,17 +62,17 @@ where Ok(make_array(Arc::new(data))) } -/// Returns an array of Int32/Int64 denoting the number of characters in each string in the array. +/// Returns an array of Int32/Int64 denoting the number of octets in each string in the array. /// /// * this only accepts StringArray/Utf8 and LargeString/LargeUtf8 /// * length of null is null. -/// * length is in number of bytes -pub fn length(array: &Array) -> Result { +/// * length is in number of octets +pub fn octet_length(array: &Array) -> Result { match array.data_type() { - DataType::Utf8 => length_string::(array, DataType::Int32), - DataType::LargeUtf8 => length_string::(array, DataType::Int64), + DataType::Utf8 => octet_length_string::(array, DataType::Int32), + DataType::LargeUtf8 => octet_length_string::(array, DataType::Int64), _ => Err(ArrowError::ComputeError(format!( - "length not supported for {:?}", + "octet_length not supported for {:?}", array.data_type() ))), } @@ -99,6 +99,7 @@ mod tests { (vec!["hello", " ", "world"], 3, vec![5, 1, 5]), (vec!["hello", " ", "world", "!"], 4, vec![5, 1, 5, 1]), (vec!["💖"], 1, vec![4]), + (vec!["josé"], 1, vec![5]), (values, 4096, expected), ] } @@ -107,7 +108,7 @@ mod tests { fn test_string() -> Result<()> { cases().into_iter().try_for_each(|(input, len, expected)| { let array = StringArray::from(input); - let result = length(&array)?; + let result = octet_length(&array)?; assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); expected.iter().enumerate().for_each(|(i, value)| { @@ -121,7 +122,7 @@ mod tests { fn test_large_string() -> Result<()> { cases().into_iter().try_for_each(|(input, len, expected)| { let array = LargeStringArray::from(input); - let result = length(&array)?; + let result = octet_length(&array)?; assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); expected.iter().enumerate().for_each(|(i, value)| { @@ -145,7 +146,7 @@ mod tests { .into_iter() .try_for_each(|(input, len, expected)| { let array = StringArray::from(input); - let result = length(&array)?; + let result = octet_length(&array)?; assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); @@ -161,7 +162,7 @@ mod tests { .into_iter() .try_for_each(|(input, len, expected)| { let array = LargeStringArray::from(input); - let result = length(&array)?; + let result = octet_length(&array)?; assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); @@ -181,7 +182,7 @@ mod tests { fn wrong_type() -> Result<()> { let array: UInt64Array = vec![1u64].into(); - assert!(length(&array).is_err()); + assert!(octet_length(&array).is_err()); Ok(()) } @@ -196,7 +197,7 @@ mod tests { .buffers(a.data_ref().buffers().to_vec()) .build(), ); - let result = length(b.as_ref())?; + let result = octet_length(b.as_ref())?; let expected = Int32Array::from(vec![1, 5]); assert_eq!(expected.data(), result.data()); diff --git a/rust/datafusion/src/logical_plan/expr.rs b/rust/datafusion/src/logical_plan/expr.rs index 31898d3e695..364c2274418 100644 --- a/rust/datafusion/src/logical_plan/expr.rs +++ b/rust/datafusion/src/logical_plan/expr.rs @@ -820,20 +820,14 @@ unary_scalar_expr!(Trim, trim); unary_scalar_expr!(Ltrim, ltrim); unary_scalar_expr!(Rtrim, rtrim); unary_scalar_expr!(Upper, upper); +unary_scalar_expr!(Length, length); +unary_scalar_expr!(OctetLength, octet_length); unary_scalar_expr!(MD5, md5); unary_scalar_expr!(SHA224, sha224); unary_scalar_expr!(SHA256, sha256); unary_scalar_expr!(SHA384, sha384); unary_scalar_expr!(SHA512, sha512); -/// returns the length of a string in bytes -pub fn length(e: Expr) -> Expr { - Expr::ScalarFunction { - fun: functions::BuiltinScalarFunction::Length, - args: vec![e], - } -} - /// returns the concatenation of string expressions pub fn concat(args: Vec) -> Expr { Expr::ScalarFunction { diff --git a/rust/datafusion/src/logical_plan/mod.rs b/rust/datafusion/src/logical_plan/mod.rs index cceb94794b5..690945d724d 100644 --- a/rust/datafusion/src/logical_plan/mod.rs +++ b/rust/datafusion/src/logical_plan/mod.rs @@ -36,9 +36,9 @@ pub use display::display_schema; pub use expr::{ abs, acos, and, array, asin, atan, avg, binary_expr, case, ceil, col, concat, cos, count, count_distinct, create_udaf, create_udf, exp, exprlist_to_fields, floor, - in_list, length, lit, ln, log10, log2, lower, ltrim, max, md5, min, or, round, rtrim, - sha224, sha256, sha384, sha512, signum, sin, sqrt, sum, tan, trim, trunc, upper, - when, Expr, ExpressionVisitor, Literal, Recursion, + in_list, length, lit, ln, log10, log2, lower, ltrim, max, md5, min, octet_length, or, + round, rtrim, sha224, sha256, sha384, sha512, signum, sin, sqrt, sum, tan, trim, + trunc, upper, when, Expr, ExpressionVisitor, Literal, Recursion, }; pub use extension::UserDefinedLogicalNode; pub use operators::Operator; diff --git a/rust/datafusion/src/physical_plan/functions.rs b/rust/datafusion/src/physical_plan/functions.rs index a5da899d097..2a94a552b75 100644 --- a/rust/datafusion/src/physical_plan/functions.rs +++ b/rust/datafusion/src/physical_plan/functions.rs @@ -42,7 +42,7 @@ use crate::physical_plan::math_expressions; use crate::physical_plan::string_expressions; use arrow::{ array::ArrayRef, - compute::kernels::length::length, + compute::kernels::octet_length::octet_length, datatypes::TimeUnit, datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, @@ -117,6 +117,8 @@ pub enum BuiltinScalarFunction { Signum, /// length Length, + /// octet_length + OctetLength, /// concat Concat, /// lower @@ -177,6 +179,7 @@ impl FromStr for BuiltinScalarFunction { "truc" => BuiltinScalarFunction::Trunc, "abs" => BuiltinScalarFunction::Abs, "signum" => BuiltinScalarFunction::Signum, + "octet_length" => BuiltinScalarFunction::OctetLength, "length" => BuiltinScalarFunction::Length, "char_length" => BuiltinScalarFunction::Length, "character_length" => BuiltinScalarFunction::Length, @@ -228,6 +231,16 @@ pub fn return_type( // the return type of the built in function. // Some built-in functions' return type depends on the incoming type. match fun { + BuiltinScalarFunction::OctetLength => Ok(match arg_types[0] { + DataType::LargeUtf8 => DataType::Int64, + DataType::Utf8 => DataType::Int32, + _ => { + // this error is internal as `data_types` should have captured this. + return Err(DataFusionError::Internal( + "The octet_length function can only accept strings.".to_string(), + )); + } + }), BuiltinScalarFunction::Length => Ok(match arg_types[0] { DataType::LargeUtf8 => DataType::Int64, DataType::Utf8 => DataType::Int32, @@ -424,7 +437,8 @@ pub fn create_physical_expr( other, ))), }, - BuiltinScalarFunction::Length => |args| Ok(length(args[0].as_ref())?), + BuiltinScalarFunction::OctetLength => |args| Ok(octet_length(args[0].as_ref())?), + BuiltinScalarFunction::Length => |args| Ok(octet_length(args[0].as_ref())?), BuiltinScalarFunction::Concat => { |args| Ok(Arc::new(string_expressions::concatenate(args)?)) } @@ -501,6 +515,7 @@ fn signature(fun: &BuiltinScalarFunction) -> Signature { BuiltinScalarFunction::Concat => Signature::Variadic(vec![DataType::Utf8]), BuiltinScalarFunction::Upper | BuiltinScalarFunction::Lower + | BuiltinScalarFunction::OctetLength | BuiltinScalarFunction::Length | BuiltinScalarFunction::Trim | BuiltinScalarFunction::Ltrim