From 447f57150506a7d76ab9bf82ea81d28dbeaac898 Mon Sep 17 00:00:00 2001 From: aryan-212 Date: Thu, 22 Jan 2026 13:14:33 +0530 Subject: [PATCH 1/4] feat: implement `StringView` for SparkConcat --- .../spark/src/function/string/concat.rs | 86 +++++++++++++++++-- .../test_files/spark/string/concat.slt | 17 ++++ 2 files changed, 97 insertions(+), 6 deletions(-) diff --git a/datafusion/spark/src/function/string/concat.rs b/datafusion/spark/src/function/string/concat.rs index f3dae22866c2..1d77365ca31c 100644 --- a/datafusion/spark/src/function/string/concat.rs +++ b/datafusion/spark/src/function/string/concat.rs @@ -53,9 +53,13 @@ impl Default for SparkConcat { impl SparkConcat { pub fn new() -> Self { + use DataType::*; Self { signature: Signature::one_of( - vec![TypeSignature::UserDefined, TypeSignature::Nullary], + vec![ + TypeSignature::Variadic(vec![Utf8View, Utf8, LargeUtf8]), + TypeSignature::Nullary, + ], Volatility::Immutable, ), } @@ -89,10 +93,21 @@ impl ScalarUDFImpl for SparkConcat { ) } fn return_field_from_args(&self, args: ReturnFieldArgs<'_>) -> Result { + use DataType::*; + // Spark semantics: concat returns NULL if ANY input is NULL let nullable = args.arg_fields.iter().any(|f| f.is_nullable()); - Ok(Arc::new(Field::new("concat", DataType::Utf8, nullable))) + // Determine return type: Utf8View > LargeUtf8 > Utf8 + let mut dt = &Utf8; + for field in args.arg_fields { + let data_type = field.data_type(); + if data_type == &Utf8View || (data_type == &LargeUtf8 && dt != &Utf8View) { + dt = data_type; + } + } + + Ok(Arc::new(Field::new("concat", dt.clone(), nullable))) } } @@ -110,9 +125,18 @@ fn spark_concat(args: ScalarFunctionArgs) -> Result { // Handle zero-argument case: return empty string if arg_values.is_empty() { - return Ok(ColumnarValue::Scalar(ScalarValue::Utf8( - Some(String::new()), - ))); + let return_type = return_field.data_type(); + return match return_type { + DataType::Utf8View => Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + String::new(), + )))), + DataType::LargeUtf8 => Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8( + Some(String::new()), + ))), + _ => Ok(ColumnarValue::Scalar(ScalarValue::Utf8( + Some(String::new()), + ))), + }; } // Step 1: Check for NULL mask in incoming args @@ -120,7 +144,14 @@ fn spark_concat(args: ScalarFunctionArgs) -> Result { // If all scalars and any is NULL, return NULL immediately if matches!(null_mask, NullMaskResolution::ReturnNull) { - return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))); + let return_type = return_field.data_type(); + return match return_type { + DataType::Utf8View => Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(None))), + DataType::LargeUtf8 => { + Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(None))) + } + _ => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))), + }; } // Step 2: Delegate to DataFusion's concat @@ -181,6 +212,24 @@ mod tests { ); Ok(()) } + + #[test] + fn test_concat_utf8view() -> Result<()> { + use arrow::array::StringViewArray; + test_scalar_function!( + SparkConcat::new(), + vec![ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some("Spark".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some("SQL".to_string()))), + ], + Ok(Some("SparkSQL")), + &str, + DataType::Utf8View, + StringViewArray + ); + Ok(()) + } + #[test] fn test_spark_concat_return_field_non_nullable() -> Result<()> { let func = SparkConcat::new(); @@ -227,4 +276,29 @@ mod tests { Ok(()) } + + #[test] + fn test_spark_concat_return_field_largeutf8() -> Result<()> { + let func = SparkConcat::new(); + + let fields = vec![ + Arc::new(Field::new("a", DataType::Utf8, false)), + Arc::new(Field::new("b", DataType::LargeUtf8, false)), + ]; + + let args = ReturnFieldArgs { + arg_fields: &fields, + scalar_arguments: &[], + }; + + let field = func.return_field_from_args(args)?; + + assert_eq!( + field.data_type(), + &DataType::LargeUtf8, + "Expected concat result to be LargeUtf8 when any input is LargeUtf8" + ); + + Ok(()) + } } diff --git a/datafusion/sqllogictest/test_files/spark/string/concat.slt b/datafusion/sqllogictest/test_files/spark/string/concat.slt index 258cb829d7d4..0cbcade21cdf 100644 --- a/datafusion/sqllogictest/test_files/spark/string/concat.slt +++ b/datafusion/sqllogictest/test_files/spark/string/concat.slt @@ -46,3 +46,20 @@ SELECT concat(a, b, c) from (select 'a' a, 'b' b, 'c' c union all select null a, ---- abc NULL + +# Utf8View: no extra CAST in plan +statement ok +CREATE TABLE test_concat_view (a VARCHAR, b VARCHAR) AS VALUES ('foo', 'bar'), ('hello', 'world'); + +query TT +EXPLAIN SELECT concat(arrow_cast(a, 'Utf8View'), arrow_cast(b, 'Utf8View')) FROM test_concat_view; +---- +logical_plan +01)Projection: concat(test_concat_view.a, test_concat_view.b) AS concat(arrow_cast(test_concat_view.a,Utf8("Utf8View")),arrow_cast(test_concat_view.b,Utf8("Utf8View"))) +02)--TableScan: test_concat_view projection=[a, b] +physical_plan +01)ProjectionExec: expr=[concat(a@0, b@1) as concat(arrow_cast(test_concat_view.a,Utf8("Utf8View")),arrow_cast(test_concat_view.b,Utf8("Utf8View")))] +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +statement ok +DROP TABLE test_concat_view; From 2d0eea04c1e7f893d09e856b1ff389cd892cb86a Mon Sep 17 00:00:00 2001 From: aryan-212 Date: Sun, 25 Jan 2026 15:58:26 +0530 Subject: [PATCH 2/4] tests(spark): additional sparkconcat tests --- .../sqllogictest/test_files/spark/string/concat.slt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/datafusion/sqllogictest/test_files/spark/string/concat.slt b/datafusion/sqllogictest/test_files/spark/string/concat.slt index 0cbcade21cdf..7f116c7eb974 100644 --- a/datafusion/sqllogictest/test_files/spark/string/concat.slt +++ b/datafusion/sqllogictest/test_files/spark/string/concat.slt @@ -47,6 +47,18 @@ SELECT concat(a, b, c) from (select 'a' a, 'b' b, 'c' c union all select null a, abc NULL +# Test mixed types: Utf8View + Utf8 +query T +SELECT concat(arrow_cast('hello', 'Utf8View'), ' world'); +---- +hello world + +# Test all three types mixed together +query T +SELECT concat('a', arrow_cast('b', 'LargeUtf8'), arrow_cast('c', 'Utf8View')); +---- +abc + # Utf8View: no extra CAST in plan statement ok CREATE TABLE test_concat_view (a VARCHAR, b VARCHAR) AS VALUES ('foo', 'bar'), ('hello', 'world'); From 78f2080c79000b4d8497ed6c1e998f873ac61d73 Mon Sep 17 00:00:00 2001 From: aryan-212 Date: Sun, 25 Jan 2026 20:39:49 +0530 Subject: [PATCH 3/4] fix: restore UserDefined in TypeSignature --- datafusion/spark/src/function/string/concat.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/datafusion/spark/src/function/string/concat.rs b/datafusion/spark/src/function/string/concat.rs index 1d77365ca31c..952dcf381e8f 100644 --- a/datafusion/spark/src/function/string/concat.rs +++ b/datafusion/spark/src/function/string/concat.rs @@ -53,13 +53,9 @@ impl Default for SparkConcat { impl SparkConcat { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( - vec![ - TypeSignature::Variadic(vec![Utf8View, Utf8, LargeUtf8]), - TypeSignature::Nullary, - ], + vec![TypeSignature::UserDefined, TypeSignature::Nullary], Volatility::Immutable, ), } From 1358178f0f41b8ad723f99815eb017894f3d4920 Mon Sep 17 00:00:00 2001 From: aryan-212 Date: Sun, 25 Jan 2026 22:45:44 +0530 Subject: [PATCH 4/4] feat: more sensible tests --- .../spark/src/function/string/concat.rs | 42 ------------------- .../test_files/spark/string/concat.slt | 37 +++++++--------- 2 files changed, 16 insertions(+), 63 deletions(-) diff --git a/datafusion/spark/src/function/string/concat.rs b/datafusion/spark/src/function/string/concat.rs index 952dcf381e8f..d280b9c1c002 100644 --- a/datafusion/spark/src/function/string/concat.rs +++ b/datafusion/spark/src/function/string/concat.rs @@ -209,23 +209,6 @@ mod tests { Ok(()) } - #[test] - fn test_concat_utf8view() -> Result<()> { - use arrow::array::StringViewArray; - test_scalar_function!( - SparkConcat::new(), - vec![ - ColumnarValue::Scalar(ScalarValue::Utf8View(Some("Spark".to_string()))), - ColumnarValue::Scalar(ScalarValue::Utf8View(Some("SQL".to_string()))), - ], - Ok(Some("SparkSQL")), - &str, - DataType::Utf8View, - StringViewArray - ); - Ok(()) - } - #[test] fn test_spark_concat_return_field_non_nullable() -> Result<()> { let func = SparkConcat::new(); @@ -272,29 +255,4 @@ mod tests { Ok(()) } - - #[test] - fn test_spark_concat_return_field_largeutf8() -> Result<()> { - let func = SparkConcat::new(); - - let fields = vec![ - Arc::new(Field::new("a", DataType::Utf8, false)), - Arc::new(Field::new("b", DataType::LargeUtf8, false)), - ]; - - let args = ReturnFieldArgs { - arg_fields: &fields, - scalar_arguments: &[], - }; - - let field = func.return_field_from_args(args)?; - - assert_eq!( - field.data_type(), - &DataType::LargeUtf8, - "Expected concat result to be LargeUtf8 when any input is LargeUtf8" - ); - - Ok(()) - } } diff --git a/datafusion/sqllogictest/test_files/spark/string/concat.slt b/datafusion/sqllogictest/test_files/spark/string/concat.slt index 7f116c7eb974..97e7b57f7d06 100644 --- a/datafusion/sqllogictest/test_files/spark/string/concat.slt +++ b/datafusion/sqllogictest/test_files/spark/string/concat.slt @@ -20,6 +20,12 @@ SELECT concat('Spark', 'SQL'); ---- SparkSQL +# Test two Utf8View inputs: value and return type +query TT +SELECT concat(arrow_cast('Spark', 'Utf8View'), arrow_cast('SQL', 'Utf8View')), arrow_typeof(concat(arrow_cast('Spark', 'Utf8View'), arrow_cast('SQL', 'Utf8View'))); +---- +SparkSQL Utf8View + query T SELECT concat('Spark', 'SQL', NULL); ---- @@ -48,30 +54,19 @@ abc NULL # Test mixed types: Utf8View + Utf8 -query T -SELECT concat(arrow_cast('hello', 'Utf8View'), ' world'); +query TT +SELECT concat(arrow_cast('hello', 'Utf8View'), ' world'), arrow_typeof(concat(arrow_cast('hello', 'Utf8View'), ' world')); ---- -hello world +hello world Utf8View -# Test all three types mixed together -query T -SELECT concat('a', arrow_cast('b', 'LargeUtf8'), arrow_cast('c', 'Utf8View')); +# Test Utf8 + LargeUtf8 => return type LargeUtf8 +query TT +SELECT concat('a', arrow_cast('b', 'LargeUtf8')), arrow_typeof(concat('a', arrow_cast('b', 'LargeUtf8'))); ---- -abc - -# Utf8View: no extra CAST in plan -statement ok -CREATE TABLE test_concat_view (a VARCHAR, b VARCHAR) AS VALUES ('foo', 'bar'), ('hello', 'world'); +ab LargeUtf8 +# Test all three types mixed together query TT -EXPLAIN SELECT concat(arrow_cast(a, 'Utf8View'), arrow_cast(b, 'Utf8View')) FROM test_concat_view; +SELECT concat('a', arrow_cast('b', 'LargeUtf8'), arrow_cast('c', 'Utf8View')), arrow_typeof(concat('a', arrow_cast('b', 'LargeUtf8'), arrow_cast('c', 'Utf8View'))); ---- -logical_plan -01)Projection: concat(test_concat_view.a, test_concat_view.b) AS concat(arrow_cast(test_concat_view.a,Utf8("Utf8View")),arrow_cast(test_concat_view.b,Utf8("Utf8View"))) -02)--TableScan: test_concat_view projection=[a, b] -physical_plan -01)ProjectionExec: expr=[concat(a@0, b@1) as concat(arrow_cast(test_concat_view.a,Utf8("Utf8View")),arrow_cast(test_concat_view.b,Utf8("Utf8View")))] -02)--DataSourceExec: partitions=1, partition_sizes=[1] - -statement ok -DROP TABLE test_concat_view; +abc Utf8View