From 5018e8173d83f9de2afe73917002f1c02cbc3c8b Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 16 Apr 2026 18:30:13 -0400 Subject: [PATCH 1/2] chore: Rename concat-specific string builders, make pub(crate) Rename the three builders in `datafusion/functions/src/strings.rs` to make their special-purpose nature explicit: - `StringArrayBuilder` -> `ConcatStringBuilder` - `LargeStringArrayBuilder` -> `ConcatLargeStringBuilder` - `StringViewArrayBuilder` -> `ConcatStringViewBuilder` These builders are used only by `concat` and `concat_ws`, and their API is specific enough to the requirements of `concat`-like callers that it seems unlikely that other call-sites will develop in the future. This also frees the previous names of these builders to be used by a more broadly useful string builder API. Also make these types `pub(crate)`, instead of `pub`; there is no good reason to make them part of the public API. This is a breaking API change. --- datafusion/functions/src/string/concat.rs | 9 ++-- datafusion/functions/src/string/concat_ws.rs | 9 ++-- datafusion/functions/src/strings.rs | 50 +++++++++++-------- .../library-user-guide/upgrading/54.0.0.md | 8 +++ 4 files changed, 47 insertions(+), 29 deletions(-) diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index e22c3b80dacb4..b10db23472c99 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -22,7 +22,8 @@ use std::sync::Arc; use crate::string::concat; use crate::strings::{ - ColumnarValueRef, LargeStringArrayBuilder, StringArrayBuilder, StringViewArrayBuilder, + ColumnarValueRef, ConcatLargeStringBuilder, ConcatStringBuilder, + ConcatStringViewBuilder, }; use datafusion_common::cast::{as_binary_array, as_string_array, as_string_view_array}; use datafusion_common::{ @@ -242,7 +243,7 @@ impl ScalarUDFImpl for ConcatFunc { match return_datatype { DataType::Utf8 => { - let mut builder = StringArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatStringBuilder::with_capacity(len, data_size); for i in 0..len { columns .iter() @@ -254,7 +255,7 @@ impl ScalarUDFImpl for ConcatFunc { Ok(ColumnarValue::Array(Arc::new(string_array))) } DataType::Utf8View => { - let mut builder = StringViewArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatStringViewBuilder::with_capacity(len, data_size); for i in 0..len { columns .iter() @@ -266,7 +267,7 @@ impl ScalarUDFImpl for ConcatFunc { Ok(ColumnarValue::Array(Arc::new(string_array))) } DataType::LargeUtf8 => { - let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatLargeStringBuilder::with_capacity(len, data_size); for i in 0..len { columns .iter() diff --git a/datafusion/functions/src/string/concat_ws.rs b/datafusion/functions/src/string/concat_ws.rs index fc4cc6e43b160..2c2d4bd42165b 100644 --- a/datafusion/functions/src/string/concat_ws.rs +++ b/datafusion/functions/src/string/concat_ws.rs @@ -24,7 +24,8 @@ use crate::string::concat; use crate::string::concat::simplify_concat; use crate::string::concat_ws; use crate::strings::{ - ColumnarValueRef, LargeStringArrayBuilder, StringArrayBuilder, StringViewArrayBuilder, + ColumnarValueRef, ConcatLargeStringBuilder, ConcatStringBuilder, + ConcatStringViewBuilder, }; use datafusion_common::cast::{ as_large_string_array, as_string_array, as_string_view_array, @@ -311,7 +312,7 @@ impl ScalarUDFImpl for ConcatWsFunc { match return_datatype { DataType::Utf8View => { - let mut builder = StringViewArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatStringViewBuilder::with_capacity(len, data_size); for i in 0..len { if !sep.is_valid(i) { builder.append_offset()?; @@ -332,7 +333,7 @@ impl ScalarUDFImpl for ConcatWsFunc { Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls())?))) } DataType::LargeUtf8 => { - let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatLargeStringBuilder::with_capacity(len, data_size); for i in 0..len { if !sep.is_valid(i) { builder.append_offset()?; @@ -353,7 +354,7 @@ impl ScalarUDFImpl for ConcatWsFunc { Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls())?))) } _ => { - let mut builder = StringArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatStringBuilder::with_capacity(len, data_size); for i in 0..len { if !sep.is_valid(i) { builder.append_offset()?; diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index d0e1ecf8a8e7c..e44a14f97a41a 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -26,17 +26,20 @@ use arrow::array::{ use arrow::buffer::{Buffer, MutableBuffer, NullBuffer, ScalarBuffer}; use arrow::datatypes::DataType; -/// Optimized version of the StringBuilder in Arrow that: -/// 1. Precalculating the expected length of the result, avoiding reallocations. -/// 2. Avoids creating / incrementally creating a `NullBufferBuilder` -pub struct StringArrayBuilder { +/// Builder used by `concat`/`concat_ws` to assemble a [`StringArray`] one row +/// at a time from multiple input columns. +/// +/// Each row is written via repeated `write` calls, followed by a single +/// `append_offset` to commit the row. The output null buffer is computed in +/// bulk by the caller and supplied to `finish`. +pub(crate) struct ConcatStringBuilder { offsets_buffer: MutableBuffer, value_buffer: MutableBuffer, /// If true, a safety check is required during the `finish` call tainted: bool, } -impl StringArrayBuilder { +impl ConcatStringBuilder { pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { let capacity = item_capacity .checked_add(1) @@ -151,13 +154,15 @@ impl StringArrayBuilder { } } -/// Optimized version of Arrow's [`StringViewBuilder`]. Rather than adding NULLs -/// on a row-by-row basis, the caller should provide nulls when calling -/// [`finish`](Self::finish). This allows callers to compute nulls more -/// efficiently (e.g., via bulk bitmap operations). +/// Builder used by `concat`/`concat_ws` to assemble a [`StringViewArray`] one +/// row at a time from multiple input columns. +/// +/// Each row is written via repeated `write` calls, followed by a single +/// `append_offset` to commit the row as a single string view. The output null +/// buffer is supplied by the caller at `finish` time. /// -/// [`StringViewBuilder`]: arrow::array::StringViewBuilder -pub struct StringViewArrayBuilder { +/// [`StringViewArray`]: arrow::array::StringViewArray +pub(crate) struct ConcatStringViewBuilder { views: Vec, data: Vec, block: Vec, @@ -165,7 +170,7 @@ pub struct StringViewArrayBuilder { tainted: bool, } -impl StringViewArrayBuilder { +impl ConcatStringViewBuilder { pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { Self { views: Vec::with_capacity(item_capacity), @@ -286,14 +291,17 @@ impl StringViewArrayBuilder { } } -pub struct LargeStringArrayBuilder { +/// Builder used by `concat`/`concat_ws` to assemble a [`LargeStringArray`] one +/// row at a time from multiple input columns. See [`ConcatStringBuilder`] for +/// details on the row-composition contract. +pub(crate) struct ConcatLargeStringBuilder { offsets_buffer: MutableBuffer, value_buffer: MutableBuffer, /// If true, a safety check is required during the `finish` call tainted: bool, } -impl LargeStringArrayBuilder { +impl ConcatLargeStringBuilder { pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { let capacity = item_capacity .checked_add(1) @@ -424,9 +432,9 @@ impl LargeStringArrayBuilder { /// - start_offset: The start offset of the substring in the view /// /// LLVM is apparently overly eager to inline this function into some hot loops, -/// which bloats them and regresses performance, so we disable inling for now. +/// which bloats them and regresses performance, so we disable inlining for now. #[inline(never)] -pub fn append_view( +pub(crate) fn append_view( views_buffer: &mut Vec, original_view: &u128, substr: &str, @@ -447,7 +455,7 @@ pub fn append_view( } #[derive(Debug)] -pub enum ColumnarValueRef<'a> { +pub(crate) enum ColumnarValueRef<'a> { Scalar(&'a [u8]), NullableArray(&'a StringArray), NonNullableArray(&'a StringArray), @@ -497,13 +505,13 @@ mod tests { #[test] #[should_panic(expected = "capacity integer overflow")] - fn test_overflow_string_array_builder() { - let _builder = StringArrayBuilder::with_capacity(usize::MAX, usize::MAX); + fn test_overflow_concat_string_builder() { + let _builder = ConcatStringBuilder::with_capacity(usize::MAX, usize::MAX); } #[test] #[should_panic(expected = "capacity integer overflow")] - fn test_overflow_large_string_array_builder() { - let _builder = LargeStringArrayBuilder::with_capacity(usize::MAX, usize::MAX); + fn test_overflow_concat_large_string_builder() { + let _builder = ConcatLargeStringBuilder::with_capacity(usize::MAX, usize::MAX); } } diff --git a/docs/source/library-user-guide/upgrading/54.0.0.md b/docs/source/library-user-guide/upgrading/54.0.0.md index da1db6de324e4..0ad9abf86834b 100644 --- a/docs/source/library-user-guide/upgrading/54.0.0.md +++ b/docs/source/library-user-guide/upgrading/54.0.0.md @@ -320,4 +320,12 @@ The difference is only observable for strings containing combining characters clusters (e.g., ZWJ emoji sequences). For ASCII and most common Unicode text, behavior is unchanged. +### Items in `datafusion_functions::strings` are no longer public + +`StringArrayBuilder`, `LargeStringArrayBuilder`, `StringViewArrayBuilder`, +`ColumnarValueRef`, and `append_view` have been reduced to `pub(crate)`. They +were only ever used to implement `concat` and `concat_ws` inside the crate. If +you were importing them externally, use Arrow's corresponding builders with a +caller-computed `NullBuffer`. + [#17861]: https://github.com/apache/datafusion/pull/17861 From ba61baaa723736f013f413ab813e50ef4f6e8aba Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 17 Apr 2026 08:22:16 -0400 Subject: [PATCH 2/2] . --- datafusion/functions/src/strings.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index e44a14f97a41a..f986ffd2e3756 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -30,8 +30,8 @@ use arrow::datatypes::DataType; /// at a time from multiple input columns. /// /// Each row is written via repeated `write` calls, followed by a single -/// `append_offset` to commit the row. The output null buffer is computed in -/// bulk by the caller and supplied to `finish`. +/// `append_offset` call to commit the row. The output null buffer is supplied +/// by the caller at `finish` time. pub(crate) struct ConcatStringBuilder { offsets_buffer: MutableBuffer, value_buffer: MutableBuffer, @@ -158,10 +158,8 @@ impl ConcatStringBuilder { /// row at a time from multiple input columns. /// /// Each row is written via repeated `write` calls, followed by a single -/// `append_offset` to commit the row as a single string view. The output null -/// buffer is supplied by the caller at `finish` time. -/// -/// [`StringViewArray`]: arrow::array::StringViewArray +/// `append_offset` call to commit the row as a single string view. The output +/// null buffer is supplied by the caller at `finish` time. pub(crate) struct ConcatStringViewBuilder { views: Vec, data: Vec,