From ccb32e7c0a26eda65c73049a467313fcb4d87431 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Fri, 1 Aug 2025 17:28:52 +0800 Subject: [PATCH 1/2] Port arrow-rs optimization for get_buffer_memory_size for gc string view --- datafusion/physical-plan/src/coalesce/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/coalesce/mod.rs b/datafusion/physical-plan/src/coalesce/mod.rs index 0eca27f8e40e0..b4c55d58bec1b 100644 --- a/datafusion/physical-plan/src/coalesce/mod.rs +++ b/datafusion/physical-plan/src/coalesce/mod.rs @@ -240,7 +240,8 @@ fn gc_string_view_batch(batch: &RecordBatch) -> RecordBatch { } }) .sum(); - let actual_buffer_size = s.get_buffer_memory_size(); + let actual_buffer_size = + s.data_buffers().iter().map(|b| b.capacity()).sum::<usize>(); // Re-creating the array copies data and can be time consuming. // We only do it if the array is sparse From cddce851ea13369f69398ec9dc871dff2545a481 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Fri, 1 Aug 2025 17:32:41 +0800 Subject: [PATCH 2/2] add comments and fast path --- datafusion/physical-plan/src/coalesce/mod.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/datafusion/physical-plan/src/coalesce/mod.rs b/datafusion/physical-plan/src/coalesce/mod.rs index b4c55d58bec1b..8e0ba072b7467 100644 --- a/datafusion/physical-plan/src/coalesce/mod.rs +++ b/datafusion/physical-plan/src/coalesce/mod.rs @@ -228,6 +228,12 @@ fn gc_string_view_batch(batch: &RecordBatch) -> RecordBatch { let Some(s) = c.as_string_view_opt() else { return Arc::clone(c); }; + + // Fast path: if the data buffers are empty, we can return the original array + if s.data_buffers().is_empty() { + return Arc::clone(c); + } + let ideal_buffer_size: usize = s .views() .iter() @@ -240,6 +246,9 @@ fn gc_string_view_batch(batch: &RecordBatch) -> RecordBatch { } }) .sum(); + + // We don't use get_buffer_memory_size here, because gc is for the contents of the + 
// data buffers, not views and nulls. let actual_buffer_size = s.data_buffers().iter().map(|b| b.capacity()).sum::<usize>();