Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions datafusion/expr-common/src/type_coercion/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1048,11 +1048,23 @@ pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataTyp
.or_else(|| null_coercion(lhs_type, rhs_type))
}

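/// Coercion rules for regular expression comparison operations with NULL input:
/// NULL paired with a string type coerces to that string type, and NULL paired
/// with NULL coerces to `Utf8`. Any other combination yields `None`.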
fn regex_null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
(DataType::Null, Utf8View | Utf8 | LargeUtf8) => Some(rhs_type.clone()),
(Utf8View | Utf8 | LargeUtf8, DataType::Null) => Some(lhs_type.clone()),
(DataType::Null, DataType::Null) => Some(Utf8),
Comment on lines +1055 to +1057
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if there is any reason to explicitly list out the types?

Like maybe you could simplify this a bit like

let lhs_null = matches!(lhs_type, DataType::Null);
let rhs_null = matches!(rhs_type, DataType::Null);
match (lhs_null, rhs_null) {
  (true, false) => Some(rhs_type.clone()),
  (false, true) => Some(lhs_type.clone()),
  (true, true) => Some(Utf8),
  (false, false) => None,
}

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we only accept string and NULL inputs, not other types. For example, `select 'abc' ~ null;` is supported,
but `select 1 ~ null;` is not.

_ => None,
}
}

/// Coercion rules for regular expression comparison operations.
///
/// The candidate rule sets are tried in order and the first one that produces
/// a type wins: string coercion, then dictionary coercion, then the
/// NULL-input regex coercion. Returns `None` when none of them apply.
pub fn regex_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
    if let Some(coerced) = string_coercion(lhs_type, rhs_type) {
        return Some(coerced);
    }
    if let Some(coerced) = dictionary_coercion(lhs_type, rhs_type, false) {
        return Some(coerced);
    }
    // Last resort: one or both sides are NULL.
    regex_null_coercion(lhs_type, rhs_type)
}

/// Checks if the TimeUnit associated with a Time32 or Time64 type is consistent,
Expand Down
105 changes: 105 additions & 0 deletions datafusion/physical-expr/src/expressions/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2498,6 +2498,111 @@ mod tests {
Ok(())
}

#[test]
fn regex_with_nulls() -> Result<()> {
    // Row-wise operands shared by the Utf8 and LargeUtf8 runs; every row with
    // a NULL on either side is expected to produce a NULL result.
    let values = vec![Some("abc"), None, Some("abc"), None, Some("abc")];
    let patterns = vec![Some("^a"), Some("^A"), None, None, Some("^(b|c)")];

    let regex_expected =
        BooleanArray::from(vec![Some(true), None, None, None, Some(false)]);
    let regex_not_expected =
        BooleanArray::from(vec![Some(false), None, None, None, Some(true)]);

    // Each regex operator paired with the result it must produce.
    let cases = || {
        vec![
            (Operator::RegexMatch, regex_expected.clone()),
            (Operator::RegexIMatch, regex_expected.clone()),
            (Operator::RegexNotMatch, regex_not_expected.clone()),
            (Operator::RegexNotIMatch, regex_not_expected.clone()),
        ]
    };

    // Utf8 operands
    let utf8_schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Utf8, true),
        Field::new("b", DataType::Utf8, true),
    ]));
    let utf8_a = Arc::new(StringArray::from(values.clone())) as ArrayRef;
    let utf8_b = Arc::new(StringArray::from(patterns.clone())) as ArrayRef;
    for (op, expected) in cases() {
        apply_logic_op(&utf8_schema, &utf8_a, &utf8_b, op, expected)?;
    }

    // LargeUtf8 operands
    let large_schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::LargeUtf8, true),
        Field::new("b", DataType::LargeUtf8, true),
    ]));
    let large_a = Arc::new(LargeStringArray::from(values)) as ArrayRef;
    let large_b = Arc::new(LargeStringArray::from(patterns)) as ArrayRef;
    for (op, expected) in cases() {
        apply_logic_op(&large_schema, &large_a, &large_b, op, expected)?;
    }

    Ok(())
}

#[test]
fn or_with_nulls_op() -> Result<()> {
let schema = Schema::new(vec![
Expand Down
60 changes: 60 additions & 0 deletions datafusion/sqllogictest/test_files/regexp.slt
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,66 @@ SELECT regexp_match('aaa-555', '.*-(\d*)');
----
[555]

# Regex operators (~, ~*, !~, !~*) with a NULL operand on the left, the right,
# or both sides: every query below is expected to return NULL.

# operator: ~
query B
select 'abc' ~ null;
----
NULL

query B
select null ~ null;
----
NULL

query B
select null ~ 'abc';
----
NULL

# operator: ~*
query B
select 'abc' ~* null;
----
NULL

query B
select null ~* null;
----
NULL

query B
select null ~* 'abc';
----
NULL

# operator: !~
query B
select 'abc' !~ null;
----
NULL

query B
select null !~ null;
----
NULL

query B
select null !~ 'abc';
----
NULL

# operator: !~*
query B
select 'abc' !~* null;
----
NULL

query B
select null !~* null;
----
NULL

query B
select null !~* 'abc';
----
NULL

#
# regexp_replace tests
#
Expand Down