diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index e22c3b80dacb4..b10db23472c99 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -22,7 +22,8 @@ use std::sync::Arc; use crate::string::concat; use crate::strings::{ - ColumnarValueRef, LargeStringArrayBuilder, StringArrayBuilder, StringViewArrayBuilder, + ColumnarValueRef, ConcatLargeStringBuilder, ConcatStringBuilder, + ConcatStringViewBuilder, }; use datafusion_common::cast::{as_binary_array, as_string_array, as_string_view_array}; use datafusion_common::{ @@ -242,7 +243,7 @@ impl ScalarUDFImpl for ConcatFunc { match return_datatype { DataType::Utf8 => { - let mut builder = StringArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatStringBuilder::with_capacity(len, data_size); for i in 0..len { columns .iter() @@ -254,7 +255,7 @@ impl ScalarUDFImpl for ConcatFunc { Ok(ColumnarValue::Array(Arc::new(string_array))) } DataType::Utf8View => { - let mut builder = StringViewArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatStringViewBuilder::with_capacity(len, data_size); for i in 0..len { columns .iter() @@ -266,7 +267,7 @@ impl ScalarUDFImpl for ConcatFunc { Ok(ColumnarValue::Array(Arc::new(string_array))) } DataType::LargeUtf8 => { - let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatLargeStringBuilder::with_capacity(len, data_size); for i in 0..len { columns .iter() diff --git a/datafusion/functions/src/string/concat_ws.rs b/datafusion/functions/src/string/concat_ws.rs index fc4cc6e43b160..2c2d4bd42165b 100644 --- a/datafusion/functions/src/string/concat_ws.rs +++ b/datafusion/functions/src/string/concat_ws.rs @@ -24,7 +24,8 @@ use crate::string::concat; use crate::string::concat::simplify_concat; use crate::string::concat_ws; use crate::strings::{ - ColumnarValueRef, LargeStringArrayBuilder, StringArrayBuilder, StringViewArrayBuilder, + ColumnarValueRef, ConcatLargeStringBuilder, ConcatStringBuilder, + ConcatStringViewBuilder, }; use datafusion_common::cast::{ as_large_string_array, as_string_array, as_string_view_array, @@ -311,7 +312,7 @@ impl ScalarUDFImpl for ConcatWsFunc { match return_datatype { DataType::Utf8View => { - let mut builder = StringViewArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatStringViewBuilder::with_capacity(len, data_size); for i in 0..len { if !sep.is_valid(i) { builder.append_offset()?; @@ -332,7 +333,7 @@ impl ScalarUDFImpl for ConcatWsFunc { Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls())?))) } DataType::LargeUtf8 => { - let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatLargeStringBuilder::with_capacity(len, data_size); for i in 0..len { if !sep.is_valid(i) { builder.append_offset()?; @@ -353,7 +354,7 @@ impl ScalarUDFImpl for ConcatWsFunc { Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls())?))) } _ => { - let mut builder = StringArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatStringBuilder::with_capacity(len, data_size); for i in 0..len { if !sep.is_valid(i) { builder.append_offset()?; diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index d0e1ecf8a8e7c..f986ffd2e3756 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -26,17 +26,20 @@ use arrow::array::{ use arrow::buffer::{Buffer, MutableBuffer, NullBuffer, ScalarBuffer}; use arrow::datatypes::DataType; -/// Optimized version of the StringBuilder in Arrow that: -/// 1. Precalculating the expected length of the result, avoiding reallocations. -/// 2. Avoids creating / incrementally creating a `NullBufferBuilder` -pub struct StringArrayBuilder { +/// Builder used by `concat`/`concat_ws` to assemble a [`StringArray`] one row +/// at a time from multiple input columns. +/// +/// Each row is written via repeated `write` calls, followed by a single +/// `append_offset` call to commit the row. The output null buffer is supplied +/// by the caller at `finish` time. +pub(crate) struct ConcatStringBuilder { offsets_buffer: MutableBuffer, value_buffer: MutableBuffer, /// If true, a safety check is required during the `finish` call tainted: bool, } -impl StringArrayBuilder { +impl ConcatStringBuilder { pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { let capacity = item_capacity .checked_add(1) @@ -151,13 +154,13 @@ impl StringArrayBuilder { } } -/// Optimized version of Arrow's [`StringViewBuilder`]. Rather than adding NULLs -/// on a row-by-row basis, the caller should provide nulls when calling -/// [`finish`](Self::finish). This allows callers to compute nulls more -/// efficiently (e.g., via bulk bitmap operations). +/// Builder used by `concat`/`concat_ws` to assemble a [`StringViewArray`] one +/// row at a time from multiple input columns. /// -/// [`StringViewBuilder`]: arrow::array::StringViewBuilder -pub struct StringViewArrayBuilder { +/// Each row is written via repeated `write` calls, followed by a single +/// `append_offset` call to commit the row as a single string view. The output +/// null buffer is supplied by the caller at `finish` time. +pub(crate) struct ConcatStringViewBuilder { views: Vec, data: Vec, block: Vec, @@ -165,7 +168,7 @@ pub struct StringViewArrayBuilder { tainted: bool, } -impl StringViewArrayBuilder { +impl ConcatStringViewBuilder { pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { Self { views: Vec::with_capacity(item_capacity), @@ -286,14 +289,17 @@ impl StringViewArrayBuilder { } } -pub struct LargeStringArrayBuilder { +/// Builder used by `concat`/`concat_ws` to assemble a [`LargeStringArray`] one +/// row at a time from multiple input columns. See [`ConcatStringBuilder`] for +/// details on the row-composition contract. +pub(crate) struct ConcatLargeStringBuilder { offsets_buffer: MutableBuffer, value_buffer: MutableBuffer, /// If true, a safety check is required during the `finish` call tainted: bool, } -impl LargeStringArrayBuilder { +impl ConcatLargeStringBuilder { pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { let capacity = item_capacity .checked_add(1) @@ -424,9 +430,9 @@ impl LargeStringArrayBuilder { /// - start_offset: The start offset of the substring in the view /// /// LLVM is apparently overly eager to inline this function into some hot loops, -/// which bloats them and regresses performance, so we disable inling for now. +/// which bloats them and regresses performance, so we disable inlining for now. #[inline(never)] -pub fn append_view( +pub(crate) fn append_view( views_buffer: &mut Vec, original_view: &u128, substr: &str, @@ -447,7 +453,7 @@ pub fn append_view( } #[derive(Debug)] -pub enum ColumnarValueRef<'a> { +pub(crate) enum ColumnarValueRef<'a> { Scalar(&'a [u8]), NullableArray(&'a StringArray), NonNullableArray(&'a StringArray), @@ -497,13 +503,13 @@ mod tests { #[test] #[should_panic(expected = "capacity integer overflow")] - fn test_overflow_string_array_builder() { - let _builder = StringArrayBuilder::with_capacity(usize::MAX, usize::MAX); + fn test_overflow_concat_string_builder() { + let _builder = ConcatStringBuilder::with_capacity(usize::MAX, usize::MAX); } #[test] #[should_panic(expected = "capacity integer overflow")] - fn test_overflow_large_string_array_builder() { - let _builder = LargeStringArrayBuilder::with_capacity(usize::MAX, usize::MAX); + fn test_overflow_concat_large_string_builder() { + let _builder = ConcatLargeStringBuilder::with_capacity(usize::MAX, usize::MAX); } } diff --git a/docs/source/library-user-guide/upgrading/54.0.0.md b/docs/source/library-user-guide/upgrading/54.0.0.md index da1db6de324e4..0ad9abf86834b 100644 --- a/docs/source/library-user-guide/upgrading/54.0.0.md +++ b/docs/source/library-user-guide/upgrading/54.0.0.md @@ -320,4 +320,12 @@ The difference is only observable for strings containing combining characters clusters (e.g., ZWJ emoji sequences). For ASCII and most common Unicode text, behavior is unchanged. +### Items in `datafusion_functions::strings` are no longer public + +`StringArrayBuilder`, `LargeStringArrayBuilder`, `StringViewArrayBuilder`, +`ColumnarValueRef`, and `append_view` have been reduced to `pub(crate)`. They +were only ever used to implement `concat` and `concat_ws` inside the crate. If +you were importing them externally, use Arrow's corresponding builders with a +caller-computed `NullBuffer`. + [#17861]: https://github.com/apache/datafusion/pull/17861