From 052add63f335ddc9030aba8202e0776bbfa7c253 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 8 Mar 2026 01:32:51 +0800 Subject: [PATCH 1/6] Native engine crashes on all-literal RLIKE expression --- .../spark-expr/src/predicate_funcs/rlike.rs | 28 +++++++++++++++++-- .../expressions/string/rlike_enabled.sql | 2 +- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/native/spark-expr/src/predicate_funcs/rlike.rs b/native/spark-expr/src/predicate_funcs/rlike.rs index 099e9852cb..02dc414ec5 100644 --- a/native/spark-expr/src/predicate_funcs/rlike.rs +++ b/native/spark-expr/src/predicate_funcs/rlike.rs @@ -21,7 +21,7 @@ use arrow::array::types::Int32Type; use arrow::array::{Array, BooleanArray, DictionaryArray, RecordBatch, StringArray}; use arrow::compute::take; use arrow::datatypes::{DataType, Schema}; -use datafusion::common::{internal_err, Result}; +use datafusion::common::{internal_err, Result, ScalarValue}; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::ColumnarValue; use regex::Regex; @@ -140,8 +140,30 @@ impl PhysicalExpr for RLike { let array = self.is_match(inputs); Ok(ColumnarValue::Array(Arc::new(array))) } - ColumnarValue::Scalar(_) => { - internal_err!("non scalar regexp patterns are not supported") + ColumnarValue::Scalar(scalar) => { + // Handle scalar input (all-literal RLIKE expressions) + // This case occurs when ConstantFolding is disabled and both + // the input string and pattern are literals + if scalar.is_null() { + // NULL RLIKE pattern -> NULL result + return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None))); + } + + // Extract string value from scalar and match pattern + // We handle each type separately to avoid lifetime issues with Utf8View + let is_match = match scalar { + ScalarValue::Utf8(Some(s)) => self.pattern.is_match(s.as_str()), + ScalarValue::LargeUtf8(Some(s)) => self.pattern.is_match(s.as_str()), + ScalarValue::Utf8View(Some(s)) => self.pattern.is_match(s.as_str()), + _ => { + return internal_err!( + "RLike requires string type for input, got {:?}", + scalar.data_type() + ); + } + }; + + Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(is_match)))) } } } diff --git a/spark/src/test/resources/sql-tests/expressions/string/rlike_enabled.sql b/spark/src/test/resources/sql-tests/expressions/string/rlike_enabled.sql index 822fb3ddb8..1de215a770 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/rlike_enabled.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/rlike_enabled.sql @@ -35,5 +35,5 @@ query SELECT s RLIKE '' FROM test_rlike_enabled -- literal arguments -query ignore(https://github.com/apache/datafusion-comet/issues/3343) +query SELECT 'hello' RLIKE '^[a-z]+$', '12345' RLIKE '^[a-z]+$', '' RLIKE '', NULL RLIKE 'a' From 778e3f77d6bc2e156e4655729b1444f890695987 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 9 Mar 2026 12:27:32 +0800 Subject: [PATCH 2/6] add test --- .../spark-expr/src/predicate_funcs/rlike.rs | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/native/spark-expr/src/predicate_funcs/rlike.rs b/native/spark-expr/src/predicate_funcs/rlike.rs index 02dc414ec5..a70190f8dc 100644 --- a/native/spark-expr/src/predicate_funcs/rlike.rs +++ b/native/spark-expr/src/predicate_funcs/rlike.rs @@ -22,6 +22,8 @@ use arrow::array::{Array, BooleanArray, DictionaryArray, RecordBatch, StringArra use arrow::compute::take; use arrow::datatypes::{DataType, Schema}; use datafusion::common::{internal_err, Result, ScalarValue}; +#[cfg(test)] +use datafusion::physical_expr::expressions::Literal; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::ColumnarValue; use regex::Regex; @@ -141,20 +143,14 @@ impl PhysicalExpr for RLike { Ok(ColumnarValue::Array(Arc::new(array))) } ColumnarValue::Scalar(scalar) => { - // Handle scalar input (all-literal RLIKE expressions) - // This case occurs when ConstantFolding is disabled and both - // the input string and pattern are literals if scalar.is_null() { - // NULL RLIKE pattern -> NULL result return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None))); } - // Extract string value from scalar and match pattern - // We handle each type separately to avoid lifetime issues with Utf8View let is_match = match scalar { - ScalarValue::Utf8(Some(s)) => self.pattern.is_match(s.as_str()), - ScalarValue::LargeUtf8(Some(s)) => self.pattern.is_match(s.as_str()), - ScalarValue::Utf8View(Some(s)) => self.pattern.is_match(s.as_str()), + ScalarValue::Utf8(Some(s)) + | ScalarValue::LargeUtf8(Some(s)) + | ScalarValue::Utf8View(Some(s)) => self.pattern.is_match(s.as_str()), _ => { return internal_err!( "RLike requires string type for input, got {:?}", @@ -187,3 +183,25 @@ impl PhysicalExpr for RLike { Display::fmt(self, f) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_rlike_scalar_utf8_literal() { + let expr = RLike::try_new( + Arc::new(Literal::new(ScalarValue::Utf8(Some("Rose".to_string())))), + "R[a-z]+", + ) + .unwrap(); + let result = expr + .evaluate(&RecordBatch::new_empty(Arc::new(Schema::empty()))) + .unwrap(); + let ColumnarValue::Scalar(result) = result else { + panic!("expected scalar result"); + }; + + assert_eq!(result, ScalarValue::Boolean(Some(true))); + } +} From 8e33b42b5eb9f9730533bf381df623f00a57fb56 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 11 Mar 2026 11:50:46 +0800 Subject: [PATCH 3/6] address comment --- native/spark-expr/src/predicate_funcs/rlike.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/native/spark-expr/src/predicate_funcs/rlike.rs b/native/spark-expr/src/predicate_funcs/rlike.rs index a70190f8dc..80cfb94980 100644 --- a/native/spark-expr/src/predicate_funcs/rlike.rs +++ b/native/spark-expr/src/predicate_funcs/rlike.rs @@ -22,8 +22,6 @@ use arrow::array::{Array, BooleanArray, DictionaryArray, RecordBatch, StringArra use arrow::compute::take; use arrow::datatypes::{DataType, Schema}; use datafusion::common::{internal_err, Result, ScalarValue}; -#[cfg(test)] -use datafusion::physical_expr::expressions::Literal; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::ColumnarValue; use regex::Regex; @@ -187,6 +185,7 @@ impl PhysicalExpr for RLike { #[cfg(test)] mod tests { use super::*; + use datafusion::physical_expr::expressions::Literal; #[test] fn test_rlike_scalar_utf8_literal() { From 44db06797a77b1cffe803fa6ed02d7ca44e2aab7 Mon Sep 17 00:00:00 2001 From: ChenChen Lai <72776271+0lai0@users.noreply.github.com> Date: Wed, 11 Mar 2026 22:01:24 +0800 Subject: [PATCH 4/6] Update native/spark-expr/src/predicate_funcs/rlike.rs Co-authored-by: Martin Grigorov --- native/spark-expr/src/predicate_funcs/rlike.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/native/spark-expr/src/predicate_funcs/rlike.rs b/native/spark-expr/src/predicate_funcs/rlike.rs index 80cfb94980..3d327eafba 100644 --- a/native/spark-expr/src/predicate_funcs/rlike.rs +++ b/native/spark-expr/src/predicate_funcs/rlike.rs @@ -148,7 +148,7 @@ impl PhysicalExpr for RLike { let is_match = match scalar { ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) - | ScalarValue::Utf8View(Some(s)) => self.pattern.is_match(s.as_str()), + | ScalarValue::Utf8View(Some(s)) => self.pattern.is_match(&s), _ => { return internal_err!( "RLike requires string type for input, got {:?}", From 7161fa438a00214cc2854be0aee3bbb8b8a87655 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 11 Mar 2026 22:15:53 +0800 Subject: [PATCH 5/6] address comment test --- .../spark-expr/src/predicate_funcs/rlike.rs | 39 +++++++++++++++++-- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/native/spark-expr/src/predicate_funcs/rlike.rs b/native/spark-expr/src/predicate_funcs/rlike.rs index 3d327eafba..7c5ebb8ecf 100644 --- a/native/spark-expr/src/predicate_funcs/rlike.rs +++ b/native/spark-expr/src/predicate_funcs/rlike.rs @@ -188,10 +188,30 @@ mod tests { use datafusion::physical_expr::expressions::Literal; #[test] - fn test_rlike_scalar_utf8_literal() { + fn test_rlike_scalar_string_variants() { + let pattern = "R[a-z]+"; + let scalars = [ + ScalarValue::Utf8(Some("Rose".to_string())), + ScalarValue::LargeUtf8(Some("Rose".to_string())), + ScalarValue::Utf8View(Some("Rose".to_string())), + ]; + + for scalar in scalars { + let expr = + RLike::try_new(Arc::new(Literal::new(scalar.clone())), pattern).unwrap(); + let result = expr + .evaluate(&RecordBatch::new_empty(Arc::new(Schema::empty()))) + .unwrap(); + let ColumnarValue::Scalar(result) = result else { + panic!("expected scalar result"); + }; + assert_eq!(result, ScalarValue::Boolean(Some(true))); + } + + // Null input should produce a null boolean result let expr = RLike::try_new( - Arc::new(Literal::new(ScalarValue::Utf8(Some("Rose".to_string())))), - "R[a-z]+", + Arc::new(Literal::new(ScalarValue::Utf8(None))), + pattern, ) .unwrap(); let result = expr @@ -200,7 +220,18 @@ mod tests { let ColumnarValue::Scalar(result) = result else { panic!("expected scalar result"); }; + assert_eq!(result, ScalarValue::Boolean(None)); + } + + #[test] + fn test_rlike_scalar_non_string_error() { + let expr = RLike::try_new( + Arc::new(Literal::new(ScalarValue::Boolean(Some(true)))), + "R[a-z]+", + ) + .unwrap(); - assert_eq!(result, ScalarValue::Boolean(Some(true))); + let result = expr.evaluate(&RecordBatch::new_empty(Arc::new(Schema::empty()))); + assert!(result.is_err()); } } From 8c6070d980b71112d21c90377ec8f1cf4064751a Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 12 Mar 2026 00:00:05 +0800 Subject: [PATCH 6/6] fix fmt --- native/spark-expr/src/predicate_funcs/rlike.rs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/native/spark-expr/src/predicate_funcs/rlike.rs b/native/spark-expr/src/predicate_funcs/rlike.rs index 7c5ebb8ecf..ed5970a6a2 100644 --- a/native/spark-expr/src/predicate_funcs/rlike.rs +++ b/native/spark-expr/src/predicate_funcs/rlike.rs @@ -197,8 +197,7 @@ mod tests { ]; for scalar in scalars { - let expr = - RLike::try_new(Arc::new(Literal::new(scalar.clone())), pattern).unwrap(); + let expr = RLike::try_new(Arc::new(Literal::new(scalar.clone())), pattern).unwrap(); let result = expr .evaluate(&RecordBatch::new_empty(Arc::new(Schema::empty()))) .unwrap(); @@ -209,11 +208,8 @@ mod tests { } // Null input should produce a null boolean result - let expr = RLike::try_new( - Arc::new(Literal::new(ScalarValue::Utf8(None))), - pattern, - ) - .unwrap(); + let expr = + RLike::try_new(Arc::new(Literal::new(ScalarValue::Utf8(None))), pattern).unwrap(); let result = expr .evaluate(&RecordBatch::new_empty(Arc::new(Schema::empty()))) .unwrap();