Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions rust/arrow/benches/length_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ use criterion::Criterion;

extern crate arrow;

use arrow::array::*;
use arrow::compute::kernels::length::length;
use arrow::{array::*, compute::kernels::length::length};

fn bench_length(array: &StringArray) {
criterion::black_box(length(array).unwrap());
Expand Down
208 changes: 208 additions & 0 deletions rust/arrow/src/compute/kernels/bit_length.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Defines kernel for length of a string array
use crate::{array::*, buffer::Buffer};
use crate::{
datatypes::DataType,
error::{ArrowError, Result},
};
use std::sync::Arc;

/// Builds a primitive array holding the bit length (byte length * 8) of every
/// string in `array`.
///
/// `data_type` must be the integer type whose physical representation matches
/// `OffsetSize` (`Int32` for `i32` offsets, `Int64` for `i64` offsets); the
/// caller is responsible for keeping the two consistent.
fn bit_length_string<OffsetSize>(array: &Array, data_type: DataType) -> ArrayRef
where
    OffsetSize: OffsetSizeTrait,
{
    // note: offsets are stored as u8, but they can be interpreted as OffsetSize
    let offsets = &array.data_ref().buffers()[0];
    // this is a 30% improvement over iterating over u8s and building OffsetSize, which
    // justifies the usage of `unsafe`.
    // SAFETY: for Utf8/LargeUtf8 arrays, buffer 0 is the offsets buffer whose
    // contents are valid `OffsetSize` values; skipping the first
    // `array.offset()` entries aligns the slice with this array's logical start.
    let slice: &[OffsetSize] =
        &unsafe { offsets.typed_data::<OffsetSize>() }[array.offset()..];

    // Byte length of string i is offsets[i + 1] - offsets[i]; scale to bits.
    let bit_size = OffsetSize::from_usize(8).unwrap();
    let lengths = slice
        .windows(2)
        .map(|offset| (offset[1] - offset[0]) * bit_size);

    // JUSTIFICATION
    //  Benefit
    //      ~60% speedup
    //  Soundness
    //      `values` is an iterator with a known size.
    let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) };

    // Reuse the input's validity bitmap so that bit_length(null) is null.
    // NOTE(review): the bitmap is cloned as-is while the output ArrayData below
    // is built with offset 0 — for a sliced input (offset > 0) that contains
    // nulls, the validity bits would be misaligned with the values; confirm
    // against callers that slice arrays carrying nulls.
    let null_bit_buffer = array
        .data_ref()
        .null_bitmap()
        .as_ref()
        .map(|b| b.bits.clone());

    let data = ArrayData::new(
        data_type,
        array.len(),
        None, // null count: not precomputed here
        null_bit_buffer,
        0,
        vec![buffer],
        vec![],
    );
    make_array(Arc::new(data))
}

/// Computes the bit length (byte length * 8) of every string in `array`.
///
/// Accepts only `Utf8` (returning an `Int32` array) and `LargeUtf8`
/// (returning an `Int64` array); null entries map to null. Any other input
/// type yields a `ComputeError`.
pub fn bit_length(array: &Array) -> Result<ArrayRef> {
    match array.data_type() {
        DataType::Utf8 => Ok(bit_length_string::<i32>(array, DataType::Int32)),
        DataType::LargeUtf8 => Ok(bit_length_string::<i64>(array, DataType::Int64)),
        other => Err(ArrowError::ComputeError(format!(
            "bit_length not supported for {:?}",
            other
        ))),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Non-null fixtures: (input strings, expected length, expected bit lengths).
    fn cases() -> Vec<(Vec<&'static str>, usize, Vec<i32>)> {
        fn double_vec<T: Clone>(v: Vec<T>) -> Vec<T> {
            [&v[..], &v[..]].concat()
        }

        // Grow a 4-element seed into a 4096-element case by doubling ten times.
        let mut values = vec!["one", "on", "o", ""];
        let mut expected = vec![24, 16, 8, 0];
        for _ in 0..10 {
            values = double_vec(values);
            expected = double_vec(expected);
        }

        vec![
            (vec!["hello", " ", "world", "!"], 4, vec![40, 8, 40, 8]),
            (vec!["💖"], 1, vec![32]),
            (vec!["josé"], 1, vec![40]),
            (values, 4096, expected),
        ]
    }

    #[test]
    fn test_string() -> Result<()> {
        for (input, len, expected) in cases() {
            let array = StringArray::from(input);
            let result = bit_length(&array)?;
            assert_eq!(len, result.len());
            let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
            for (i, value) in expected.iter().enumerate() {
                assert_eq!(*value, result.value(i));
            }
        }
        Ok(())
    }

    #[test]
    fn test_large_string() -> Result<()> {
        for (input, len, expected) in cases() {
            let array = LargeStringArray::from(input);
            let result = bit_length(&array)?;
            assert_eq!(len, result.len());
            let result = result.as_any().downcast_ref::<Int64Array>().unwrap();
            for (i, value) in expected.iter().enumerate() {
                assert_eq!(*value as i64, result.value(i));
            }
        }
        Ok(())
    }

    /// Fixtures containing nulls: (input, expected length, expected bit lengths).
    fn null_cases() -> Vec<(Vec<Option<&'static str>>, usize, Vec<Option<i32>>)> {
        vec![(
            vec![Some("one"), None, Some("three"), Some("four")],
            4,
            vec![Some(24), None, Some(40), Some(32)],
        )]
    }

    #[test]
    fn null_string() -> Result<()> {
        for (input, len, expected) in null_cases() {
            let array = StringArray::from(input);
            let result = bit_length(&array)?;
            assert_eq!(len, result.len());
            let result = result.as_any().downcast_ref::<Int32Array>().unwrap();

            let expected: Int32Array = expected.into();
            assert_eq!(expected.data(), result.data());
        }
        Ok(())
    }

    #[test]
    fn null_large_string() -> Result<()> {
        for (input, len, expected) in null_cases() {
            let array = LargeStringArray::from(input);
            let result = bit_length(&array)?;
            assert_eq!(len, result.len());
            let result = result.as_any().downcast_ref::<Int64Array>().unwrap();

            // widen the expected values to i64 before comparing
            let expected: Int64Array = expected
                .iter()
                .map(|e| e.map(i64::from))
                .collect::<Vec<_>>()
                .into();
            assert_eq!(expected.data(), result.data());
        }
        Ok(())
    }

    /// bit_length must reject non-string input such as u64.
    #[test]
    fn wrong_type() {
        let array: UInt64Array = vec![1u64].into();

        assert!(bit_length(&array).is_err());
    }

    /// bit_length must honor a non-zero array offset.
    #[test]
    fn offsets() -> Result<()> {
        let a = StringArray::from(vec!["hello", " ", "world"]);
        let b = make_array(
            ArrayData::builder(DataType::Utf8)
                .len(2)
                .offset(1)
                .buffers(a.data_ref().buffers().to_vec())
                .build(),
        );
        let result = bit_length(b.as_ref())?;

        let expected = Int32Array::from(vec![8, 40]);
        assert_eq!(expected.data(), result.data());

        Ok(())
    }
}
1 change: 1 addition & 0 deletions rust/arrow/src/compute/kernels/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
pub mod aggregate;
pub mod arithmetic;
pub mod arity;
pub mod bit_length;
pub mod boolean;
pub mod cast;
pub mod cast_utils;
Expand Down
2 changes: 2 additions & 0 deletions rust/datafusion/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ md-5 = "^0.9.1"
sha2 = "^0.9.1"
ordered-float = "2.0"
unicode-segmentation = "^1.7.1"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is something I have been thinking a lot about -- how can we keep DataFusion's dependency stack reasonable (it is already pretty large and it just keeps getting larger).

One thing I was thinking about was making some of these dependencies optional (so that we had features like regex and unicode and hash which would only pull in the dependencies / have those functions if the features were enabled.

What do you think @jorgecarleitao / @andygrove / @ovr ? If it is a reasonable idea (I think we mentioned it before) I will file a JIRA to track?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I was nervous about additional dependencies. Perhaps this topic can be raised at the next Arrow Rust call so we can agree on some sort of assessment criteria.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, regex / lazy_static are already non-optional dependencies of arrow, so I think not that much can be gained there, unless we make it optional in Arrow as well.

I think it is a good idea to make some features optional, to reduce compile times whenever you are not working on them.

Another thing we can do is to split benchmarks / examples / etc. out of the crate to make compile times a bit shorter, which I started doing here: #9494 and #9493

regex = "1"
lazy_static = "^1.4.0"

[dev-dependencies]
rand = "0.8"
Expand Down
3 changes: 3 additions & 0 deletions rust/datafusion/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,9 @@
extern crate arrow;
extern crate sqlparser;

#[macro_use]
extern crate lazy_static;

pub mod dataframe;
pub mod datasource;
pub mod error;
Expand Down
28 changes: 19 additions & 9 deletions rust/datafusion/src/logical_plan/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -874,29 +874,39 @@ unary_scalar_expr!(Log2, log2);
unary_scalar_expr!(Log10, log10);

// string functions
unary_scalar_expr!(Ascii, ascii);
unary_scalar_expr!(BitLength, bit_length);
unary_scalar_expr!(Btrim, btrim);
unary_scalar_expr!(CharacterLength, character_length);
unary_scalar_expr!(CharacterLength, length);
unary_scalar_expr!(Chr, chr);
unary_scalar_expr!(Concat, concat);
unary_scalar_expr!(ConcatWithSeparator, concat_ws);
unary_scalar_expr!(InitCap, initcap);
unary_scalar_expr!(Left, left);
unary_scalar_expr!(Lower, lower);
unary_scalar_expr!(Lpad, lpad);
unary_scalar_expr!(Ltrim, ltrim);
unary_scalar_expr!(MD5, md5);
unary_scalar_expr!(OctetLength, octet_length);
unary_scalar_expr!(RegexpReplace, regexp_replace);
unary_scalar_expr!(Repeat, repeat);
unary_scalar_expr!(Replace, replace);
unary_scalar_expr!(Reverse, reverse);
unary_scalar_expr!(Right, right);
unary_scalar_expr!(Rpad, rpad);
unary_scalar_expr!(Rtrim, rtrim);
unary_scalar_expr!(SHA224, sha224);
unary_scalar_expr!(SHA256, sha256);
unary_scalar_expr!(SHA384, sha384);
unary_scalar_expr!(SHA512, sha512);
unary_scalar_expr!(SplitPart, split_part);
unary_scalar_expr!(StartsWith, starts_with);
unary_scalar_expr!(Strpos, strpos);
unary_scalar_expr!(Substr, substr);
unary_scalar_expr!(Translate, translate);
unary_scalar_expr!(Trim, trim);
unary_scalar_expr!(Upper, upper);

/// Returns the concatenation of string expressions.
///
/// Builds a `Concat` scalar-function expression over `args`.
pub fn concat(args: Vec<Expr>) -> Expr {
    Expr::ScalarFunction {
        fun: functions::BuiltinScalarFunction::Concat,
        args,
    }
}

/// returns an array of fixed size with each argument on it.
pub fn array(args: Vec<Expr>) -> Expr {
Expr::ScalarFunction {
Expand Down
14 changes: 8 additions & 6 deletions rust/datafusion/src/logical_plan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,14 @@ pub use builder::LogicalPlanBuilder;
pub use dfschema::{DFField, DFSchema, DFSchemaRef, ToDFSchema};
pub use display::display_schema;
pub use expr::{
abs, acos, and, array, asin, atan, avg, binary_expr, bit_length, case, ceil,
character_length, col, combine_filters, concat, cos, count, count_distinct,
create_udaf, create_udf, exp, exprlist_to_fields, floor, in_list, length, lit, ln,
log10, log2, lower, ltrim, max, md5, min, octet_length, or, round, rtrim, sha224,
sha256, sha384, sha512, signum, sin, sqrt, sum, tan, trim, trunc, upper, when, Expr,
ExpressionVisitor, Literal, Recursion,
abs, acos, and, array, ascii, asin, atan, avg, binary_expr, bit_length, btrim, case,
ceil, character_length, chr, col, combine_filters, concat, concat_ws, cos, count,
count_distinct, create_udaf, create_udf, exp, exprlist_to_fields, floor, in_list,
initcap, left, lit, ln, log10, log2, lower, lpad, ltrim, max, md5, min, octet_length,
or, regexp_replace, repeat, replace, reverse, right, round, rpad, rtrim, sha224,
sha256, sha384, sha512, signum, sin, split_part, sqrt, starts_with, strpos, substr,
sum, tan, translate, trim, trunc, upper, when, Expr, ExpressionVisitor, Literal,
Recursion,
};
pub use extension::UserDefinedLogicalNode;
pub use operators::Operator;
Expand Down
Loading