Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions rust/arrow/benches/length_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ use criterion::Criterion;

extern crate arrow;

use arrow::array::*;
use arrow::compute::kernels::length::length;
use arrow::{array::*, compute::kernels::length::length};

fn bench_length(array: &StringArray) {
criterion::black_box(length(array).unwrap());
Expand Down
208 changes: 208 additions & 0 deletions rust/arrow/src/compute/kernels/bit_length.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Defines kernel for length of a string array
use crate::{array::*, buffer::Buffer};
use crate::{
datatypes::DataType,
error::{ArrowError, Result},
};
use std::sync::Arc;

/// Builds a primitive array holding the bit length (byte length * 8) of every
/// string in `array`.
///
/// `data_type` must be the integer type whose physical representation matches
/// `OffsetSize` (`Int32` for `i32` offsets, `Int64` for `i64` offsets); the
/// caller is responsible for keeping the two consistent.
fn bit_length_string<OffsetSize>(array: &Array, data_type: DataType) -> ArrayRef
where
    OffsetSize: OffsetSizeTrait,
{
    // note: offsets are stored as u8, but they can be interpreted as OffsetSize
    let offsets = &array.data_ref().buffers()[0];
    // this is a 30% improvement over iterating over u8s and building OffsetSize, which
    // justifies the usage of `unsafe`.
    // SAFETY: for Utf8/LargeUtf8 arrays, buffer 0 is the offsets buffer whose
    // contents are valid `OffsetSize` values; skipping the first
    // `array.offset()` entries aligns the slice with this array's logical start.
    let slice: &[OffsetSize] =
        &unsafe { offsets.typed_data::<OffsetSize>() }[array.offset()..];

    // Byte length of string i is offsets[i + 1] - offsets[i]; scale to bits.
    let bit_size = OffsetSize::from_usize(8).unwrap();
    let lengths = slice
        .windows(2)
        .map(|offset| (offset[1] - offset[0]) * bit_size);

    // JUSTIFICATION
    //  Benefit
    //      ~60% speedup
    //  Soundness
    //      `values` is an iterator with a known size.
    let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) };

    // Reuse the input's validity bitmap so that bit_length(null) is null.
    // NOTE(review): the bitmap is cloned as-is while the output ArrayData below
    // is built with offset 0 — for a sliced input (offset > 0) that contains
    // nulls, the validity bits would be misaligned with the values; confirm
    // against callers that slice arrays carrying nulls.
    let null_bit_buffer = array
        .data_ref()
        .null_bitmap()
        .as_ref()
        .map(|b| b.bits.clone());

    let data = ArrayData::new(
        data_type,
        array.len(),
        None, // null count: not precomputed here
        null_bit_buffer,
        0,
        vec![buffer],
        vec![],
    );
    make_array(Arc::new(data))
}

/// Computes the bit length (byte length * 8) of every string in `array`.
///
/// Accepts only `Utf8` (returning an `Int32` array) and `LargeUtf8`
/// (returning an `Int64` array); null entries map to null. Any other input
/// type yields a `ComputeError`.
pub fn bit_length(array: &Array) -> Result<ArrayRef> {
    match array.data_type() {
        DataType::Utf8 => Ok(bit_length_string::<i32>(array, DataType::Int32)),
        DataType::LargeUtf8 => Ok(bit_length_string::<i64>(array, DataType::Int64)),
        other => Err(ArrowError::ComputeError(format!(
            "bit_length not supported for {:?}",
            other
        ))),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Non-null fixtures: (input strings, expected length, expected bit lengths).
    fn cases() -> Vec<(Vec<&'static str>, usize, Vec<i32>)> {
        fn double_vec<T: Clone>(v: Vec<T>) -> Vec<T> {
            [&v[..], &v[..]].concat()
        }

        // Grow a 4-element seed into a 4096-element case by doubling ten times.
        let mut values = vec!["one", "on", "o", ""];
        let mut expected = vec![24, 16, 8, 0];
        for _ in 0..10 {
            values = double_vec(values);
            expected = double_vec(expected);
        }

        vec![
            (vec!["hello", " ", "world", "!"], 4, vec![40, 8, 40, 8]),
            (vec!["💖"], 1, vec![32]),
            (vec!["josé"], 1, vec![40]),
            (values, 4096, expected),
        ]
    }

    #[test]
    fn test_string() -> Result<()> {
        for (input, len, expected) in cases() {
            let array = StringArray::from(input);
            let result = bit_length(&array)?;
            assert_eq!(len, result.len());
            let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
            for (i, value) in expected.iter().enumerate() {
                assert_eq!(*value, result.value(i));
            }
        }
        Ok(())
    }

    #[test]
    fn test_large_string() -> Result<()> {
        for (input, len, expected) in cases() {
            let array = LargeStringArray::from(input);
            let result = bit_length(&array)?;
            assert_eq!(len, result.len());
            let result = result.as_any().downcast_ref::<Int64Array>().unwrap();
            for (i, value) in expected.iter().enumerate() {
                assert_eq!(*value as i64, result.value(i));
            }
        }
        Ok(())
    }

    /// Fixtures containing nulls: (input, expected length, expected bit lengths).
    fn null_cases() -> Vec<(Vec<Option<&'static str>>, usize, Vec<Option<i32>>)> {
        vec![(
            vec![Some("one"), None, Some("three"), Some("four")],
            4,
            vec![Some(24), None, Some(40), Some(32)],
        )]
    }

    #[test]
    fn null_string() -> Result<()> {
        for (input, len, expected) in null_cases() {
            let array = StringArray::from(input);
            let result = bit_length(&array)?;
            assert_eq!(len, result.len());
            let result = result.as_any().downcast_ref::<Int32Array>().unwrap();

            let expected: Int32Array = expected.into();
            assert_eq!(expected.data(), result.data());
        }
        Ok(())
    }

    #[test]
    fn null_large_string() -> Result<()> {
        for (input, len, expected) in null_cases() {
            let array = LargeStringArray::from(input);
            let result = bit_length(&array)?;
            assert_eq!(len, result.len());
            let result = result.as_any().downcast_ref::<Int64Array>().unwrap();

            // widen the expected values to i64 before comparing
            let expected: Int64Array = expected
                .iter()
                .map(|e| e.map(i64::from))
                .collect::<Vec<_>>()
                .into();
            assert_eq!(expected.data(), result.data());
        }
        Ok(())
    }

    /// bit_length must reject non-string input such as u64.
    #[test]
    fn wrong_type() {
        let array: UInt64Array = vec![1u64].into();

        assert!(bit_length(&array).is_err());
    }

    /// bit_length must honor a non-zero array offset.
    #[test]
    fn offsets() -> Result<()> {
        let a = StringArray::from(vec!["hello", " ", "world"]);
        let b = make_array(
            ArrayData::builder(DataType::Utf8)
                .len(2)
                .offset(1)
                .buffers(a.data_ref().buffers().to_vec())
                .build(),
        );
        let result = bit_length(b.as_ref())?;

        let expected = Int32Array::from(vec![8, 40]);
        assert_eq!(expected.data(), result.data());

        Ok(())
    }
}
1 change: 1 addition & 0 deletions rust/arrow/src/compute/kernels/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
pub mod aggregate;
pub mod arithmetic;
pub mod arity;
pub mod bit_length;
pub mod boolean;
pub mod cast;
pub mod cast_utils;
Expand Down
2 changes: 2 additions & 0 deletions rust/datafusion/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ md-5 = "^0.9.1"
sha2 = "^0.9.1"
ordered-float = "2.0"
unicode-segmentation = "^1.7.1"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is something I have been thinking a lot about -- how can we keep DataFusion's dependency stack reasonable (it is already pretty large and it just keeps getting larger).

One thing I was thinking about was making some of these dependencies optional (so that we had features like regex and unicode and hash which would only pull in the dependencies / have those functions if the features were enabled.

What do you think @jorgecarleitao / @andygrove / @ovr ? If it is a reasonable idea (I think we mentioned it before) I will file a JIRA to track?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I was nervous about additional dependencies. Perhaps this topic can be raised at the next Arrow Rust call so we can agree on some sort of assessment criteria.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, regex / lazy_static are already non-optional dependencies of arrow, so I think not that much can be gained there, unless we make it optional in Arrow as well.

I think it is a good idea to make some features optional, to reduce compile times whenever you are not working on them.

Another thing we can do is to split benchmarks / examples / etc. out of the crate to make compile times a bit shorter, which I started doing here: #9494 and #9493

regex = "1"
lazy_static = "^1.4.0"

[dev-dependencies]
rand = "0.8"
Expand Down
3 changes: 3 additions & 0 deletions rust/datafusion/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,9 @@
extern crate arrow;
extern crate sqlparser;

#[macro_use]
extern crate lazy_static;

pub mod dataframe;
pub mod datasource;
pub mod error;
Expand Down
28 changes: 19 additions & 9 deletions rust/datafusion/src/logical_plan/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -874,29 +874,39 @@ unary_scalar_expr!(Log2, log2);
unary_scalar_expr!(Log10, log10);

// string functions
unary_scalar_expr!(Ascii, ascii);
unary_scalar_expr!(BitLength, bit_length);
unary_scalar_expr!(Btrim, btrim);
unary_scalar_expr!(CharacterLength, character_length);
unary_scalar_expr!(CharacterLength, length);
unary_scalar_expr!(Chr, chr);
unary_scalar_expr!(Concat, concat);
unary_scalar_expr!(ConcatWithSeparator, concat_ws);
unary_scalar_expr!(InitCap, initcap);
unary_scalar_expr!(Left, left);
unary_scalar_expr!(Lower, lower);
unary_scalar_expr!(Lpad, lpad);
unary_scalar_expr!(Ltrim, ltrim);
unary_scalar_expr!(MD5, md5);
unary_scalar_expr!(OctetLength, octet_length);
unary_scalar_expr!(RegexpReplace, regexp_replace);
unary_scalar_expr!(Repeat, repeat);
unary_scalar_expr!(Replace, replace);
unary_scalar_expr!(Reverse, reverse);
unary_scalar_expr!(Right, right);
unary_scalar_expr!(Rpad, rpad);
unary_scalar_expr!(Rtrim, rtrim);
unary_scalar_expr!(SHA224, sha224);
unary_scalar_expr!(SHA256, sha256);
unary_scalar_expr!(SHA384, sha384);
unary_scalar_expr!(SHA512, sha512);
unary_scalar_expr!(SplitPart, split_part);
unary_scalar_expr!(StartsWith, starts_with);
unary_scalar_expr!(Strpos, strpos);
unary_scalar_expr!(Substr, substr);
unary_scalar_expr!(Translate, translate);
unary_scalar_expr!(Trim, trim);
unary_scalar_expr!(Upper, upper);

/// Returns the concatenation of string expressions.
///
/// Builds a `Concat` scalar-function expression over `args`.
pub fn concat(args: Vec<Expr>) -> Expr {
    Expr::ScalarFunction {
        fun: functions::BuiltinScalarFunction::Concat,
        args,
    }
}

/// returns an array of fixed size with each argument on it.
pub fn array(args: Vec<Expr>) -> Expr {
Expr::ScalarFunction {
Expand Down
14 changes: 8 additions & 6 deletions rust/datafusion/src/logical_plan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,14 @@ pub use builder::LogicalPlanBuilder;
pub use dfschema::{DFField, DFSchema, DFSchemaRef, ToDFSchema};
pub use display::display_schema;
pub use expr::{
abs, acos, and, array, asin, atan, avg, binary_expr, bit_length, case, ceil,
character_length, col, combine_filters, concat, cos, count, count_distinct,
create_udaf, create_udf, exp, exprlist_to_fields, floor, in_list, length, lit, ln,
log10, log2, lower, ltrim, max, md5, min, octet_length, or, round, rtrim, sha224,
sha256, sha384, sha512, signum, sin, sqrt, sum, tan, trim, trunc, upper, when, Expr,
ExpressionVisitor, Literal, Recursion,
abs, acos, and, array, ascii, asin, atan, avg, binary_expr, bit_length, btrim, case,
ceil, character_length, chr, col, combine_filters, concat, concat_ws, cos, count,
count_distinct, create_udaf, create_udf, exp, exprlist_to_fields, floor, in_list,
initcap, left, lit, ln, log10, log2, lower, lpad, ltrim, max, md5, min, octet_length,
or, regexp_replace, repeat, replace, reverse, right, round, rpad, rtrim, sha224,
sha256, sha384, sha512, signum, sin, split_part, sqrt, starts_with, strpos, substr,
sum, tan, translate, trim, trunc, upper, when, Expr, ExpressionVisitor, Literal,
Recursion,
};
pub use extension::UserDefinedLogicalNode;
pub use operators::Operator;
Expand Down
Loading