Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rust/arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ name = "take_kernels"
harness = false

[[bench]]
name = "length_kernel"
name = "octet_length_kernel"
harness = false

[[bench]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ use criterion::Criterion;
extern crate arrow;

use arrow::array::*;
use arrow::compute::kernels::length::length;
use arrow::compute::kernels::octet_length::octet_length;

fn bench_length(array: &StringArray) {
criterion::black_box(length(array).unwrap());
fn bench_octet_length(array: &StringArray) {
criterion::black_box(octet_length(array).unwrap());
}

fn add_benchmark(c: &mut Criterion) {
Expand All @@ -40,7 +40,7 @@ fn add_benchmark(c: &mut Criterion) {
}
let array = StringArray::from(values);

c.bench_function("length", |b| b.iter(|| bench_length(&array)));
c.bench_function("length", |b| b.iter(|| bench_octet_length(&array)));
}

criterion_group!(benches, add_benchmark);
Expand Down
2 changes: 1 addition & 1 deletion rust/arrow/src/compute/kernels/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ pub mod cast;
pub mod comparison;
pub mod concat;
pub mod filter;
pub mod length;
pub mod limit;
pub mod octet_length;
pub mod sort;
pub mod substring;
pub mod take;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use crate::{
};
use std::sync::Arc;

fn length_string<OffsetSize>(array: &Array, data_type: DataType) -> Result<ArrayRef>
fn octet_length_string<OffsetSize>(array: &Array, data_type: DataType) -> Result<ArrayRef>
where
OffsetSize: OffsetSizeTrait,
{
Expand All @@ -41,7 +41,7 @@ where
// Benefit
// ~60% speedup
// Soundness
// `values` is an iterator with a known size.
// `lengths` is an iterator with a known size.
let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) };

let null_bit_buffer = array
Expand All @@ -62,17 +62,17 @@ where
Ok(make_array(Arc::new(data)))
}

/// Returns an array of Int32/Int64 denoting the number of characters in each string in the array.
/// Returns an array of Int32/Int64 denoting the number of octets in each string in the array.
///
/// * this only accepts StringArray/Utf8 and LargeString/LargeUtf8
/// * length of null is null.
/// * length is in number of bytes
pub fn length(array: &Array) -> Result<ArrayRef> {
/// * length is in number of octets
pub fn octet_length(array: &Array) -> Result<ArrayRef> {
match array.data_type() {
DataType::Utf8 => length_string::<i32>(array, DataType::Int32),
DataType::LargeUtf8 => length_string::<i64>(array, DataType::Int64),
DataType::Utf8 => octet_length_string::<i32>(array, DataType::Int32),
DataType::LargeUtf8 => octet_length_string::<i64>(array, DataType::Int64),
_ => Err(ArrowError::ComputeError(format!(
"length not supported for {:?}",
"octet_length not supported for {:?}",
array.data_type()
))),
}
Expand All @@ -99,6 +99,7 @@ mod tests {
(vec!["hello", " ", "world"], 3, vec![5, 1, 5]),
(vec!["hello", " ", "world", "!"], 4, vec![5, 1, 5, 1]),
(vec!["💖"], 1, vec![4]),
(vec!["josé"], 1, vec![5]),
(values, 4096, expected),
]
}
Expand All @@ -107,7 +108,7 @@ mod tests {
fn test_string() -> Result<()> {
cases().into_iter().try_for_each(|(input, len, expected)| {
let array = StringArray::from(input);
let result = length(&array)?;
let result = octet_length(&array)?;
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
expected.iter().enumerate().for_each(|(i, value)| {
Expand All @@ -121,7 +122,7 @@ mod tests {
fn test_large_string() -> Result<()> {
cases().into_iter().try_for_each(|(input, len, expected)| {
let array = LargeStringArray::from(input);
let result = length(&array)?;
let result = octet_length(&array)?;
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int64Array>().unwrap();
expected.iter().enumerate().for_each(|(i, value)| {
Expand All @@ -145,7 +146,7 @@ mod tests {
.into_iter()
.try_for_each(|(input, len, expected)| {
let array = StringArray::from(input);
let result = length(&array)?;
let result = octet_length(&array)?;
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int32Array>().unwrap();

Expand All @@ -161,7 +162,7 @@ mod tests {
.into_iter()
.try_for_each(|(input, len, expected)| {
let array = LargeStringArray::from(input);
let result = length(&array)?;
let result = octet_length(&array)?;
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int64Array>().unwrap();

Expand All @@ -181,7 +182,7 @@ mod tests {
fn wrong_type() -> Result<()> {
let array: UInt64Array = vec![1u64].into();

assert!(length(&array).is_err());
assert!(octet_length(&array).is_err());
Ok(())
}

Expand All @@ -196,7 +197,7 @@ mod tests {
.buffers(a.data_ref().buffers().to_vec())
.build(),
);
let result = length(b.as_ref())?;
let result = octet_length(b.as_ref())?;

let expected = Int32Array::from(vec![1, 5]);
assert_eq!(expected.data(), result.data());
Expand Down
10 changes: 2 additions & 8 deletions rust/datafusion/src/logical_plan/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -820,20 +820,14 @@ unary_scalar_expr!(Trim, trim);
unary_scalar_expr!(Ltrim, ltrim);
unary_scalar_expr!(Rtrim, rtrim);
unary_scalar_expr!(Upper, upper);
unary_scalar_expr!(Length, length);
unary_scalar_expr!(OctetLength, octet_length);
unary_scalar_expr!(MD5, md5);
unary_scalar_expr!(SHA224, sha224);
unary_scalar_expr!(SHA256, sha256);
unary_scalar_expr!(SHA384, sha384);
unary_scalar_expr!(SHA512, sha512);

/// returns the length of a string in bytes
pub fn length(e: Expr) -> Expr {
Expr::ScalarFunction {
fun: functions::BuiltinScalarFunction::Length,
args: vec![e],
}
}

/// returns the concatenation of string expressions
pub fn concat(args: Vec<Expr>) -> Expr {
Expr::ScalarFunction {
Expand Down
6 changes: 3 additions & 3 deletions rust/datafusion/src/logical_plan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ pub use display::display_schema;
pub use expr::{
abs, acos, and, array, asin, atan, avg, binary_expr, case, ceil, col, concat, cos,
count, count_distinct, create_udaf, create_udf, exp, exprlist_to_fields, floor,
in_list, length, lit, ln, log10, log2, lower, ltrim, max, md5, min, or, round, rtrim,
sha224, sha256, sha384, sha512, signum, sin, sqrt, sum, tan, trim, trunc, upper,
when, Expr, ExpressionVisitor, Literal, Recursion,
in_list, length, lit, ln, log10, log2, lower, ltrim, max, md5, min, octet_length, or,
round, rtrim, sha224, sha256, sha384, sha512, signum, sin, sqrt, sum, tan, trim,
trunc, upper, when, Expr, ExpressionVisitor, Literal, Recursion,
};
pub use extension::UserDefinedLogicalNode;
pub use operators::Operator;
Expand Down
19 changes: 17 additions & 2 deletions rust/datafusion/src/physical_plan/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ use crate::physical_plan::math_expressions;
use crate::physical_plan::string_expressions;
use arrow::{
array::ArrayRef,
compute::kernels::length::length,
compute::kernels::octet_length::octet_length,
datatypes::TimeUnit,
datatypes::{DataType, Field, Schema},
record_batch::RecordBatch,
Expand Down Expand Up @@ -117,6 +117,8 @@ pub enum BuiltinScalarFunction {
Signum,
/// length
Length,
/// octet_length
OctetLength,
/// concat
Concat,
/// lower
Expand Down Expand Up @@ -177,6 +179,7 @@ impl FromStr for BuiltinScalarFunction {
"truc" => BuiltinScalarFunction::Trunc,
"abs" => BuiltinScalarFunction::Abs,
"signum" => BuiltinScalarFunction::Signum,
"octet_length" => BuiltinScalarFunction::OctetLength,
"length" => BuiltinScalarFunction::Length,
"char_length" => BuiltinScalarFunction::Length,
"character_length" => BuiltinScalarFunction::Length,
Expand Down Expand Up @@ -228,6 +231,16 @@ pub fn return_type(
// the return type of the built in function.
// Some built-in functions' return type depends on the incoming type.
match fun {
BuiltinScalarFunction::OctetLength => Ok(match arg_types[0] {
DataType::LargeUtf8 => DataType::Int64,
DataType::Utf8 => DataType::Int32,
_ => {
// this error is internal as `data_types` should have captured this.
return Err(DataFusionError::Internal(
"The octet_length function can only accept strings.".to_string(),
));
}
}),
BuiltinScalarFunction::Length => Ok(match arg_types[0] {
DataType::LargeUtf8 => DataType::Int64,
DataType::Utf8 => DataType::Int32,
Expand Down Expand Up @@ -424,7 +437,8 @@ pub fn create_physical_expr(
other,
))),
},
BuiltinScalarFunction::Length => |args| Ok(length(args[0].as_ref())?),
BuiltinScalarFunction::OctetLength => |args| Ok(octet_length(args[0].as_ref())?),
BuiltinScalarFunction::Length => |args| Ok(octet_length(args[0].as_ref())?),
BuiltinScalarFunction::Concat => {
|args| Ok(Arc::new(string_expressions::concatenate(args)?))
}
Expand Down Expand Up @@ -501,6 +515,7 @@ fn signature(fun: &BuiltinScalarFunction) -> Signature {
BuiltinScalarFunction::Concat => Signature::Variadic(vec![DataType::Utf8]),
BuiltinScalarFunction::Upper
| BuiltinScalarFunction::Lower
| BuiltinScalarFunction::OctetLength
| BuiltinScalarFunction::Length
| BuiltinScalarFunction::Trim
| BuiltinScalarFunction::Ltrim
Expand Down