diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 1904d58cfc926..b3f17ae3c2ca4 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -123,18 +123,12 @@ pub enum BuiltinScalarFunction { Lpad, /// random Random, - /// repeat - Repeat, - /// replace - Replace, /// reverse Reverse, /// right Right, /// rpad Rpad, - /// split_part - SplitPart, /// strpos Strpos, /// substr @@ -238,12 +232,9 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Left => Volatility::Immutable, BuiltinScalarFunction::Lpad => Volatility::Immutable, BuiltinScalarFunction::Radians => Volatility::Immutable, - BuiltinScalarFunction::Repeat => Volatility::Immutable, - BuiltinScalarFunction::Replace => Volatility::Immutable, BuiltinScalarFunction::Reverse => Volatility::Immutable, BuiltinScalarFunction::Right => Volatility::Immutable, BuiltinScalarFunction::Rpad => Volatility::Immutable, - BuiltinScalarFunction::SplitPart => Volatility::Immutable, BuiltinScalarFunction::Strpos => Volatility::Immutable, BuiltinScalarFunction::Substr => Volatility::Immutable, BuiltinScalarFunction::Translate => Volatility::Immutable, @@ -293,12 +284,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Lpad => utf8_to_str_type(&input_expr_types[0], "lpad"), BuiltinScalarFunction::Pi => Ok(Float64), BuiltinScalarFunction::Random => Ok(Float64), - BuiltinScalarFunction::Repeat => { - utf8_to_str_type(&input_expr_types[0], "repeat") - } - BuiltinScalarFunction::Replace => { - utf8_to_str_type(&input_expr_types[0], "replace") - } BuiltinScalarFunction::Reverse => { utf8_to_str_type(&input_expr_types[0], "reverse") } @@ -306,9 +291,6 @@ impl BuiltinScalarFunction { utf8_to_str_type(&input_expr_types[0], "right") } BuiltinScalarFunction::Rpad => utf8_to_str_type(&input_expr_types[0], "rpad"), - BuiltinScalarFunction::SplitPart => { - utf8_to_str_type(&input_expr_types[0], "split_part") - } BuiltinScalarFunction::EndsWith => Ok(Boolean), BuiltinScalarFunction::Strpos => { utf8_to_int_type(&input_expr_types[0], "strpos/instr/position") @@ -417,21 +399,12 @@ impl BuiltinScalarFunction { self.volatility(), ) } - BuiltinScalarFunction::Left - | BuiltinScalarFunction::Repeat - | BuiltinScalarFunction::Right => Signature::one_of( - vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])], - self.volatility(), - ), - BuiltinScalarFunction::SplitPart => Signature::one_of( - vec![ - Exact(vec![Utf8, Utf8, Int64]), - Exact(vec![LargeUtf8, Utf8, Int64]), - Exact(vec![Utf8, LargeUtf8, Int64]), - Exact(vec![LargeUtf8, LargeUtf8, Int64]), - ], - self.volatility(), - ), + BuiltinScalarFunction::Left | BuiltinScalarFunction::Right => { + Signature::one_of( + vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])], + self.volatility(), + ) + } BuiltinScalarFunction::EndsWith | BuiltinScalarFunction::Strpos => { Signature::one_of( @@ -467,7 +440,7 @@ impl BuiltinScalarFunction { self.volatility(), ), - BuiltinScalarFunction::Replace | BuiltinScalarFunction::Translate => { + BuiltinScalarFunction::Translate => { Signature::one_of(vec![Exact(vec![Utf8, Utf8, Utf8])], self.volatility()) } BuiltinScalarFunction::Pi => Signature::exact(vec![], self.volatility()), @@ -637,12 +610,9 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::InitCap => &["initcap"], BuiltinScalarFunction::Left => &["left"], BuiltinScalarFunction::Lpad => &["lpad"], - BuiltinScalarFunction::Repeat => &["repeat"], - BuiltinScalarFunction::Replace => &["replace"], BuiltinScalarFunction::Reverse => &["reverse"], BuiltinScalarFunction::Right => &["right"], BuiltinScalarFunction::Rpad => &["rpad"], - BuiltinScalarFunction::SplitPart => &["split_part"], BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"], BuiltinScalarFunction::Substr => &["substr"], BuiltinScalarFunction::Translate => &["translate"], diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 60db21e5f5fef..f75d8869671e1 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -598,11 +598,8 @@ scalar_expr!( ); scalar_expr!(InitCap, initcap, string, "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase"); scalar_expr!(Left, left, string n, "returns the first `n` characters in the `string`"); -scalar_expr!(Replace, replace, string from to, "replaces all occurrences of `from` with `to` in the `string`"); -scalar_expr!(Repeat, repeat, string n, "repeats the `string` to `n` times"); scalar_expr!(Reverse, reverse, string, "reverses the `string`"); scalar_expr!(Right, right, string n, "returns the last `n` characters in the `string`"); -scalar_expr!(SplitPart, split_part, string delimiter index, "splits a string based on a delimiter and picks out the desired field based on the index."); scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends with the `suffix`"); scalar_expr!(Strpos, strpos, string substring, "finds the position from where the `substring` matches the `string`"); scalar_expr!(Substr, substr, string position, "substring from the `position` to the end"); @@ -1056,13 +1053,10 @@ mod test { test_scalar_expr!(Left, left, string, count); test_nary_scalar_expr!(Lpad, lpad, string, count); test_nary_scalar_expr!(Lpad, lpad, string, count, characters); - test_scalar_expr!(Replace, replace, string, from, to); - test_scalar_expr!(Repeat, repeat, string, count); test_scalar_expr!(Reverse, reverse, string); test_scalar_expr!(Right, right, string, count); test_nary_scalar_expr!(Rpad, rpad, string, count); test_nary_scalar_expr!(Rpad, rpad, string, count, characters); - test_scalar_expr!(SplitPart, split_part, expr, delimiter, index); test_scalar_expr!(EndsWith, ends_with, string, characters); test_scalar_expr!(Strpos, strpos, string, substring); test_scalar_expr!(Substr, substr, string, position); diff --git a/datafusion/functions/src/string/mod.rs b/datafusion/functions/src/string/mod.rs index 165a7c6604043..d2b9fb2da8053 100644 --- a/datafusion/functions/src/string/mod.rs +++ b/datafusion/functions/src/string/mod.rs @@ -29,7 +29,10 @@ mod lower; mod ltrim; mod octet_length; mod overlay; +mod repeat; +mod replace; mod rtrim; +mod split_part; mod starts_with; mod to_hex; mod upper; @@ -43,8 +46,11 @@ make_udf_function!(ltrim::LtrimFunc, LTRIM, ltrim); make_udf_function!(lower::LowerFunc, LOWER, lower); make_udf_function!(octet_length::OctetLengthFunc, OCTET_LENGTH, octet_length); make_udf_function!(overlay::OverlayFunc, OVERLAY, overlay); +make_udf_function!(repeat::RepeatFunc, REPEAT, repeat); +make_udf_function!(replace::ReplaceFunc, REPLACE, replace); make_udf_function!(rtrim::RtrimFunc, RTRIM, rtrim); make_udf_function!(starts_with::StartsWithFunc, STARTS_WITH, starts_with); +make_udf_function!(split_part::SplitPartFunc, SPLIT_PART, split_part); make_udf_function!(to_hex::ToHexFunc, TO_HEX, to_hex); make_udf_function!(upper::UpperFunc, UPPER, upper); make_udf_function!(uuid::UuidFunc, UUID, uuid); @@ -87,11 +93,26 @@ pub mod expr_fn { super::overlay().call(args) } + #[doc = "Repeats the `string` to `n` times"] + pub fn repeat(string: Expr, n: Expr) -> Expr { + super::repeat().call(vec![string, n]) + } + + #[doc = "Replaces all occurrences of `from` with `to` in the `string`"] + pub fn replace(string: Expr, from: Expr, to: Expr) -> Expr { + super::replace().call(vec![string, from, to]) + } + #[doc = "Removes all characters, spaces by default, from the end of a string"] pub fn rtrim(args: Vec) -> Expr { super::rtrim().call(args) } + #[doc = "Splits a string based on a delimiter and picks out the desired field based on the index."] + pub fn split_part(string: Expr, delimiter: Expr, index: Expr) -> Expr { + super::split_part().call(vec![string, delimiter, index]) + } + #[doc = "Returns true if string starts with prefix."] pub fn starts_with(arg1: Expr, arg2: Expr) -> Expr { super::starts_with().call(vec![arg1, arg2]) @@ -128,7 +149,10 @@ pub fn functions() -> Vec> { ltrim(), octet_length(), overlay(), + repeat(), + replace(), rtrim(), + split_part(), starts_with(), to_hex(), upper(), diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs new file mode 100644 index 0000000000000..83bc929cb9a48 --- /dev/null +++ b/datafusion/functions/src/string/repeat.rs @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; + +use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::*; +use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::{ScalarUDFImpl, Signature}; + +use crate::string::common::*; + +#[derive(Debug)] +pub(super) struct RepeatFunc { + signature: Signature, +} + +impl RepeatFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for RepeatFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "repeat" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "repeat") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(repeat::, vec![])(args), + DataType::LargeUtf8 => make_scalar_function(repeat::, vec![])(args), + other => exec_err!("Unsupported data type {other:?} for function repeat"), + } + } +} + +/// Repeats string the specified number of times. +/// repeat('Pg', 4) = 'PgPgPgPg' +fn repeat(args: &[ArrayRef]) -> Result { + let string_array = as_generic_string_array::(&args[0])?; + let number_array = as_int64_array(&args[1])?; + + let result = string_array + .iter() + .zip(number_array.iter()) + .map(|(string, number)| match (string, number) { + (Some(string), Some(number)) => Some(string.repeat(number as usize)), + _ => None, + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) +} + +#[cfg(test)] +mod tests { + use arrow::array::{Array, StringArray}; + use arrow::datatypes::DataType::Utf8; + + use datafusion_common::Result; + use datafusion_common::ScalarValue; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use crate::string::common::test::test_function; + use crate::string::repeat::RepeatFunc; + + #[test] + fn test_functions() -> Result<()> { + test_function!( + RepeatFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("Pg")))), + ColumnarValue::Scalar(ScalarValue::Int64(Some(4))), + ], + Ok(Some("PgPgPgPg")), + &str, + Utf8, + StringArray + ); + + test_function!( + RepeatFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::Int64(Some(4))), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + RepeatFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("Pg")))), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + + Ok(()) + } +} diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs new file mode 100644 index 0000000000000..e352442960907 --- /dev/null +++ b/datafusion/functions/src/string/replace.rs @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; + +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::*; +use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::{ScalarUDFImpl, Signature}; + +use crate::string::common::*; + +#[derive(Debug)] +pub(super) struct ReplaceFunc { + signature: Signature, +} + +impl ReplaceFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![Exact(vec![Utf8, Utf8, Utf8])], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for ReplaceFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "replace" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "replace") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(replace::, vec![])(args), + DataType::LargeUtf8 => make_scalar_function(replace::, vec![])(args), + other => { + exec_err!("Unsupported data type {other:?} for function replace") + } + } + } +} + +/// Replaces all occurrences in string of substring from with substring to. +/// replace('abcdefabcdef', 'cd', 'XX') = 'abXXefabXXef' +fn replace(args: &[ArrayRef]) -> Result { + let string_array = as_generic_string_array::(&args[0])?; + let from_array = as_generic_string_array::(&args[1])?; + let to_array = as_generic_string_array::(&args[2])?; + + let result = string_array + .iter() + .zip(from_array.iter()) + .zip(to_array.iter()) + .map(|((string, from), to)| match (string, from, to) { + (Some(string), Some(from), Some(to)) => Some(string.replace(from, to)), + _ => None, + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) +} + +mod test {} diff --git a/datafusion/functions/src/string/split_part.rs b/datafusion/functions/src/string/split_part.rs new file mode 100644 index 0000000000000..af201e90fcf6d --- /dev/null +++ b/datafusion/functions/src/string/split_part.rs @@ -0,0 +1,170 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; + +use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::*; +use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::{ScalarUDFImpl, Signature}; + +use crate::string::common::*; + +#[derive(Debug)] +pub(super) struct SplitPartFunc { + signature: Signature, +} + +impl SplitPartFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Utf8, Int64]), + Exact(vec![LargeUtf8, Utf8, Int64]), + Exact(vec![Utf8, LargeUtf8, Int64]), + Exact(vec![LargeUtf8, LargeUtf8, Int64]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for SplitPartFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "split_part" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "split_part") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(split_part::, vec![])(args), + DataType::LargeUtf8 => make_scalar_function(split_part::, vec![])(args), + other => { + exec_err!("Unsupported data type {other:?} for function split_part") + } + } + } +} + +/// Splits string at occurrences of delimiter and returns the n'th field (counting from one). +/// split_part('abc~@~def~@~ghi', '~@~', 2) = 'def' +fn split_part(args: &[ArrayRef]) -> Result { + let string_array = as_generic_string_array::(&args[0])?; + let delimiter_array = as_generic_string_array::(&args[1])?; + let n_array = as_int64_array(&args[2])?; + let result = string_array + .iter() + .zip(delimiter_array.iter()) + .zip(n_array.iter()) + .map(|((string, delimiter), n)| match (string, delimiter, n) { + (Some(string), Some(delimiter), Some(n)) => { + if n <= 0 { + exec_err!("field position must be greater than zero") + } else { + let split_string: Vec<&str> = string.split(delimiter).collect(); + match split_string.get(n as usize - 1) { + Some(s) => Ok(Some(*s)), + None => Ok(Some("")), + } + } + } + _ => Ok(None), + }) + .collect::>>()?; + + Ok(Arc::new(result) as ArrayRef) +} + +#[cfg(test)] +mod tests { + use arrow::array::{Array, StringArray}; + use arrow::datatypes::DataType::Utf8; + + use datafusion_common::ScalarValue; + use datafusion_common::{exec_err, Result}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use crate::string::common::test::test_function; + use crate::string::split_part::SplitPartFunc; + + #[test] + fn test_functions() -> Result<()> { + test_function!( + SplitPartFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from( + "abc~@~def~@~ghi" + )))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("~@~")))), + ColumnarValue::Scalar(ScalarValue::Int64(Some(2))), + ], + Ok(Some("def")), + &str, + Utf8, + StringArray + ); + test_function!( + SplitPartFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from( + "abc~@~def~@~ghi" + )))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("~@~")))), + ColumnarValue::Scalar(ScalarValue::Int64(Some(20))), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + test_function!( + SplitPartFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from( + "abc~@~def~@~ghi" + )))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("~@~")))), + ColumnarValue::Scalar(ScalarValue::Int64(Some(-1))), + ], + exec_err!("field position must be greater than zero"), + &str, + Utf8, + StringArray + ); + + Ok(()) + } +} diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 8759adc89b404..163598c2df827 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -30,17 +30,16 @@ //! an argument i32 is passed to a function that supports f64, the //! argument is automatically is coerced to f64. -use crate::sort_properties::SortProperties; -use crate::{ - conditional_expressions, math_expressions, string_expressions, PhysicalExpr, - ScalarFunctionExpr, -}; +use std::ops::Neg; +use std::sync::Arc; + use arrow::{ array::ArrayRef, compute::kernels::length::bit_length, datatypes::{DataType, Int32Type, Int64Type, Schema}, }; use arrow_array::Array; + use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::execution_props::ExecutionProps; pub use datafusion_expr::FuncMonotonicity; @@ -49,8 +48,12 @@ use datafusion_expr::{ type_coercion::functions::data_types, BuiltinScalarFunction, ColumnarValue, ScalarFunctionImplementation, }; -use std::ops::Neg; -use std::sync::Arc; + +use crate::sort_properties::SortProperties; +use crate::{ + conditional_expressions, math_expressions, string_expressions, PhysicalExpr, + ScalarFunctionExpr, +}; /// Create a physical (function) expression. /// This function errors when `args`' can't be coerced to a valid argument type of the function. @@ -328,26 +331,6 @@ pub fn create_physical_fun( } other => exec_err!("Unsupported data type {other:?} for function lpad"), }), - BuiltinScalarFunction::Repeat => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - make_scalar_function_inner(string_expressions::repeat::)(args) - } - DataType::LargeUtf8 => { - make_scalar_function_inner(string_expressions::repeat::)(args) - } - other => exec_err!("Unsupported data type {other:?} for function repeat"), - }), - BuiltinScalarFunction::Replace => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - make_scalar_function_inner(string_expressions::replace::)(args) - } - DataType::LargeUtf8 => { - make_scalar_function_inner(string_expressions::replace::)(args) - } - other => { - exec_err!("Unsupported data type {other:?} for function replace") - } - }), BuiltinScalarFunction::Reverse => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { let func = @@ -387,17 +370,6 @@ pub fn create_physical_fun( } other => exec_err!("Unsupported data type {other:?} for function rpad"), }), - BuiltinScalarFunction::SplitPart => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - make_scalar_function_inner(string_expressions::split_part::)(args) - } - DataType::LargeUtf8 => { - make_scalar_function_inner(string_expressions::split_part::)(args) - } - other => { - exec_err!("Unsupported data type {other:?} for function split_part") - } - }), BuiltinScalarFunction::EndsWith => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function_inner(string_expressions::ends_with::)(args) @@ -568,9 +540,6 @@ fn func_order_in_one_dimension( #[cfg(test)] mod tests { - use super::*; - use crate::expressions::lit; - use crate::expressions::try_cast; use arrow::{ array::{ Array, ArrayRef, BooleanArray, Float32Array, Float64Array, Int32Array, @@ -579,12 +548,18 @@ mod tests { datatypes::Field, record_batch::RecordBatch, }; + use datafusion_common::cast::as_uint64_array; use datafusion_common::{exec_err, internal_err, plan_err}; use datafusion_common::{DataFusionError, Result, ScalarValue}; use datafusion_expr::type_coercion::functions::data_types; use datafusion_expr::Signature; + use crate::expressions::lit; + use crate::expressions::try_cast; + + use super::*; + /// $FUNC function to test /// $ARGS arguments (vec) to pass to function /// $EXPECTED a Result> where Result allows testing errors and Option allows testing Null @@ -1124,33 +1099,6 @@ mod tests { Utf8, StringArray ); - test_function!( - Repeat, - &[lit("Pg"), lit(ScalarValue::Int64(Some(4))),], - Ok(Some("PgPgPgPg")), - &str, - Utf8, - StringArray - ); - test_function!( - Repeat, - &[ - lit(ScalarValue::Utf8(None)), - lit(ScalarValue::Int64(Some(4))), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - test_function!( - Repeat, - &[lit("Pg"), lit(ScalarValue::Int64(None)),], - Ok(None), - &str, - Utf8, - StringArray - ); #[cfg(feature = "unicode_expressions")] test_function!( Reverse, @@ -1447,42 +1395,6 @@ mod tests { Utf8, StringArray ); - test_function!( - SplitPart, - &[ - lit("abc~@~def~@~ghi"), - lit("~@~"), - lit(ScalarValue::Int64(Some(2))), - ], - Ok(Some("def")), - &str, - Utf8, - StringArray - ); - test_function!( - SplitPart, - &[ - lit("abc~@~def~@~ghi"), - lit("~@~"), - lit(ScalarValue::Int64(Some(20))), - ], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - test_function!( - SplitPart, - &[ - lit("abc~@~def~@~ghi"), - lit("~@~"), - lit(ScalarValue::Int64(Some(-1))), - ], - exec_err!("field position must be greater than zero"), - &str, - Utf8, - StringArray - ); test_function!( EndsWith, &[lit("alphabet"), lit("alph"),], @@ -1812,7 +1724,7 @@ mod tests { let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); // pick some arbitrary functions to test - let funs = [BuiltinScalarFunction::Concat, BuiltinScalarFunction::Repeat]; + let funs = [BuiltinScalarFunction::Concat]; for fun in funs.iter() { let expr = create_physical_expr_with_type_coercion( diff --git a/datafusion/physical-expr/src/string_expressions.rs b/datafusion/physical-expr/src/string_expressions.rs index 766e167a94261..812b746354a4a 100644 --- a/datafusion/physical-expr/src/string_expressions.rs +++ b/datafusion/physical-expr/src/string_expressions.rs @@ -242,73 +242,6 @@ pub fn instr(args: &[ArrayRef]) -> Result { } } -/// Repeats string the specified number of times. -/// repeat('Pg', 4) = 'PgPgPgPg' -pub fn repeat(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - let number_array = as_int64_array(&args[1])?; - - let result = string_array - .iter() - .zip(number_array.iter()) - .map(|(string, number)| match (string, number) { - (Some(string), Some(number)) => Some(string.repeat(number as usize)), - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -/// Replaces all occurrences in string of substring from with substring to. -/// replace('abcdefabcdef', 'cd', 'XX') = 'abXXefabXXef' -pub fn replace(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - let from_array = as_generic_string_array::(&args[1])?; - let to_array = as_generic_string_array::(&args[2])?; - - let result = string_array - .iter() - .zip(from_array.iter()) - .zip(to_array.iter()) - .map(|((string, from), to)| match (string, from, to) { - (Some(string), Some(from), Some(to)) => Some(string.replace(from, to)), - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -/// Splits string at occurrences of delimiter and returns the n'th field (counting from one). -/// split_part('abc~@~def~@~ghi', '~@~', 2) = 'def' -pub fn split_part(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - let delimiter_array = as_generic_string_array::(&args[1])?; - let n_array = as_int64_array(&args[2])?; - let result = string_array - .iter() - .zip(delimiter_array.iter()) - .zip(n_array.iter()) - .map(|((string, delimiter), n)| match (string, delimiter, n) { - (Some(string), Some(delimiter), Some(n)) => { - if n <= 0 { - exec_err!("field position must be greater than zero") - } else { - let split_string: Vec<&str> = string.split(delimiter).collect(); - match split_string.get(n as usize - 1) { - Some(s) => Ok(Some(*s)), - None => Ok(Some("")), - } - } - } - _ => Ok(None), - }) - .collect::>>()?; - - Ok(Arc::new(result) as ArrayRef) -} - /// Returns true if string starts with prefix. /// starts_with('alphabet', 'alph') = 't' pub fn starts_with(args: &[ArrayRef]) -> Result { diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 795995ce2c468..297e355dd7b18 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -581,8 +581,8 @@ enum ScalarFunction { // 37 was OctetLength Random = 38; // 39 was RegexpReplace - Repeat = 40; - Replace = 41; + // 40 was Repeat + // 41 was Replace Reverse = 42; Right = 43; Rpad = 44; @@ -591,7 +591,7 @@ enum ScalarFunction { // 47 was SHA256 // 48 was SHA384 // 49 was SHA512 - SplitPart = 50; + // 50 was SplitPart // StartsWith = 51; Strpos = 52; Substr = 53; diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 3941171e4fe6e..dce815f0f2345 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22937,12 +22937,9 @@ impl serde::Serialize for ScalarFunction { Self::Left => "Left", Self::Lpad => "Lpad", Self::Random => "Random", - Self::Repeat => "Repeat", - Self::Replace => "Replace", Self::Reverse => "Reverse", Self::Right => "Right", Self::Rpad => "Rpad", - Self::SplitPart => "SplitPart", Self::Strpos => "Strpos", Self::Substr => "Substr", Self::Translate => "Translate", @@ -23002,12 +22999,9 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Left", "Lpad", "Random", - "Repeat", - "Replace", "Reverse", "Right", "Rpad", - "SplitPart", "Strpos", "Substr", "Translate", @@ -23096,12 +23090,9 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Left" => Ok(ScalarFunction::Left), "Lpad" => Ok(ScalarFunction::Lpad), "Random" => Ok(ScalarFunction::Random), - "Repeat" => Ok(ScalarFunction::Repeat), - "Replace" => Ok(ScalarFunction::Replace), "Reverse" => Ok(ScalarFunction::Reverse), "Right" => Ok(ScalarFunction::Right), "Rpad" => Ok(ScalarFunction::Rpad), - "SplitPart" => Ok(ScalarFunction::SplitPart), "Strpos" => Ok(ScalarFunction::Strpos), "Substr" => Ok(ScalarFunction::Substr), "Translate" => Ok(ScalarFunction::Translate), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 58fda7fcb5ad6..2292687b45a6c 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2880,8 +2880,8 @@ pub enum ScalarFunction { /// 37 was OctetLength Random = 38, /// 39 was RegexpReplace - Repeat = 40, - Replace = 41, + /// 40 was Repeat + /// 41 was Replace Reverse = 42, Right = 43, Rpad = 44, @@ -2890,7 +2890,7 @@ pub enum ScalarFunction { /// 47 was SHA256 /// 48 was SHA384 /// 49 was SHA512 - SplitPart = 50, + /// 50 was SplitPart /// StartsWith = 51; Strpos = 52, Substr = 53, @@ -3010,12 +3010,9 @@ impl ScalarFunction { ScalarFunction::Left => "Left", ScalarFunction::Lpad => "Lpad", ScalarFunction::Random => "Random", - ScalarFunction::Repeat => "Repeat", - ScalarFunction::Replace => "Replace", ScalarFunction::Reverse => "Reverse", ScalarFunction::Right => "Right", ScalarFunction::Rpad => "Rpad", - ScalarFunction::SplitPart => "SplitPart", ScalarFunction::Strpos => "Strpos", ScalarFunction::Substr => "Substr", ScalarFunction::Translate => "Translate", @@ -3069,12 +3066,9 @@ impl ScalarFunction { "Left" => Some(Self::Left), "Lpad" => Some(Self::Lpad), "Random" => Some(Self::Random), - "Repeat" => Some(Self::Repeat), - "Replace" => Some(Self::Replace), "Reverse" => Some(Self::Reverse), "Right" => Some(Self::Right), "Rpad" => Some(Self::Rpad), - "SplitPart" => Some(Self::SplitPart), "Strpos" => Some(Self::Strpos), "Substr" => Some(Self::Substr), "Translate" => Some(Self::Translate), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 3b44c1cb276dd..b78e3ae6dc618 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -53,11 +53,10 @@ use datafusion_expr::{ expr::{self, InList, Sort, WindowFunction}, factorial, find_in_set, floor, gcd, initcap, iszero, lcm, left, ln, log, log10, log2, logical_plan::{PlanType, StringifiedPlan}, - lpad, nanvl, pi, power, radians, random, repeat, replace, reverse, right, round, - rpad, signum, sin, sinh, split_part, sqrt, strpos, substr, substr_index, substring, - translate, trunc, AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, - BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, GetIndexedField, - GroupingSet, + lpad, nanvl, pi, power, radians, random, reverse, right, round, rpad, signum, sin, + sinh, sqrt, strpos, substr, substr_index, substring, translate, trunc, + AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, + Case, Cast, Expr, GetFieldAccess, GetIndexedField, GroupingSet, GroupingSet::GroupingSets, JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound, WindowFrameUnits, @@ -468,12 +467,9 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Left => Self::Left, ScalarFunction::Lpad => Self::Lpad, ScalarFunction::Random => Self::Random, - ScalarFunction::Repeat => Self::Repeat, - ScalarFunction::Replace => Self::Replace, ScalarFunction::Reverse => Self::Reverse, ScalarFunction::Right => Self::Right, ScalarFunction::Rpad => Self::Rpad, - ScalarFunction::SplitPart => Self::SplitPart, ScalarFunction::Strpos => Self::Strpos, ScalarFunction::Substr => Self::Substr, ScalarFunction::Translate => Self::Translate, @@ -1445,15 +1441,6 @@ pub fn parse_expr( parse_expr(&args[1], registry, codec)?, )), ScalarFunction::Random => Ok(random()), - ScalarFunction::Repeat => Ok(repeat( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - )), - ScalarFunction::Replace => Ok(replace( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - parse_expr(&args[2], registry, codec)?, - )), ScalarFunction::Reverse => { Ok(reverse(parse_expr(&args[0], registry, codec)?)) } @@ -1485,11 +1472,6 @@ pub fn parse_expr( .map(|expr| parse_expr(expr, registry, codec)) .collect::, _>>()?, )), - ScalarFunction::SplitPart => Ok(split_part( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - parse_expr(&args[2], registry, codec)?, - )), ScalarFunction::EndsWith => Ok(ends_with( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 446a91a39a1b1..0c0f0c6e0a922 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1490,12 +1490,9 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Left => Self::Left, BuiltinScalarFunction::Lpad => Self::Lpad, BuiltinScalarFunction::Random => Self::Random, - BuiltinScalarFunction::Repeat => Self::Repeat, - BuiltinScalarFunction::Replace => Self::Replace, BuiltinScalarFunction::Reverse => Self::Reverse, BuiltinScalarFunction::Right => Self::Right, BuiltinScalarFunction::Rpad => Self::Rpad, - BuiltinScalarFunction::SplitPart => Self::SplitPart, BuiltinScalarFunction::Strpos => Self::Strpos, BuiltinScalarFunction::Substr => Self::Substr, BuiltinScalarFunction::Translate => Self::Translate,