From c7c9982b1b918553e69a18482dc0306fe5ca747a Mon Sep 17 00:00:00 2001 From: SeoYoung Lee Date: Mon, 4 Aug 2025 06:11:09 +0000 Subject: [PATCH 1/3] use GreedyMemoryPool for sanity check --- datafusion/execution/src/memory_pool/pool.rs | 4 +++- datafusion/physical-plan/src/sorts/merge.rs | 4 +++- .../src/sorts/multi_level_merge.rs | 17 ++++++++++++++--- .../physical-plan/src/sorts/streaming_merge.rs | 9 ++++++--- 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/datafusion/execution/src/memory_pool/pool.rs b/datafusion/execution/src/memory_pool/pool.rs index da456b7071f77..017cbe65a2840 100644 --- a/datafusion/execution/src/memory_pool/pool.rs +++ b/datafusion/execution/src/memory_pool/pool.rs @@ -44,6 +44,7 @@ impl MemoryPool for UnboundedMemoryPool { fn try_grow(&self, reservation: &MemoryReservation, additional: usize) -> Result<()> { self.grow(reservation, additional); + println!("[mem pool] {} used", self.reserved()); Ok(()) } @@ -70,7 +71,8 @@ pub struct GreedyMemoryPool { impl GreedyMemoryPool { /// Create a new pool that can allocate up to `pool_size` bytes pub fn new(pool_size: usize) -> Self { - debug!("Created new GreedyMemoryPool(pool_size={pool_size})"); + // debug!("Created new GreedyMemoryPool(pool_size={pool_size})"); + println!("Created new GreedyMemoryPool(pool_size={pool_size})"); Self { pool_size, used: AtomicUsize::new(0), diff --git a/datafusion/physical-plan/src/sorts/merge.rs b/datafusion/physical-plan/src/sorts/merge.rs index 0b0136cd12ced..3c22a14d11e55 100644 --- a/datafusion/physical-plan/src/sorts/merge.rs +++ b/datafusion/physical-plan/src/sorts/merge.rs @@ -195,7 +195,9 @@ impl SortPreservingMergeStream { match futures::ready!(self.streams.poll_next(cx, idx)) { None => Poll::Ready(Ok(())), - Some(Err(e)) => Poll::Ready(Err(e)), + Some(Err(e)) => { + Poll::Ready(Err(e)) + } Some(Ok((cursor, batch))) => { self.cursors[idx] = Some(Cursor::new(cursor)); Poll::Ready(self.in_progress.push_batch(idx, batch)) diff --git a/datafusion/physical-plan/src/sorts/multi_level_merge.rs b/datafusion/physical-plan/src/sorts/multi_level_merge.rs index bb6fc751b8974..7821999f6dd22 100644 --- a/datafusion/physical-plan/src/sorts/multi_level_merge.rs +++ b/datafusion/physical-plan/src/sorts/multi_level_merge.rs @@ -248,11 +248,15 @@ impl MultiLevelMergeBuilder { // If we have no sorted spill files left, this is the last run true, true, + 0, // TODO(ding-young) ) } // Need to merge multiple streams - (_, _) => { + (spill, inmem) => { + println!( + "[merge_sorted_runs_within_mem_limit] spill{spill}, inmem{inmem}" + ); let mut memory_reservation = self.reservation.new_empty(); // Don't account for existing streams memory @@ -277,12 +281,17 @@ impl MultiLevelMergeBuilder { .read_spill_as_stream(spill.file)?; sorted_streams.push(stream); } - + println!( + "[merge_sorted_runs_within_mem_limit] memory_reservation:{}", + memory_reservation.size() + ); + // TODO(ding-young) pass memory limit but what about in mem size? let merge_sort_stream = self.create_new_merge_sort( sorted_streams, // If we have no sorted spill files left, this is the last run self.sorted_spill_files.is_empty(), is_only_merging_memory_streams, + memory_reservation.size(), )?; // If we're only merging memory streams, we don't need to attach the memory reservation @@ -308,6 +317,7 @@ impl MultiLevelMergeBuilder { streams: Vec, is_output: bool, all_in_memory: bool, + expected_max_memory_usage: usize, ) -> Result { let mut builder = StreamingMergeBuilder::new() .with_schema(Arc::clone(&self.schema)) @@ -328,7 +338,8 @@ impl MultiLevelMergeBuilder { // (reserving memory for the biggest batch in each stream) // TODO - avoid this hack as this can be broken easily when `SortPreservingMergeStream` // changes the implementation to use more/less memory - builder = builder.with_bypass_mempool(); + println!("create new merge sort with bypass mempool"); + builder = builder.with_bypass_mempool(expected_max_memory_usage); } else { // If we are only merging in-memory streams, we need to use the memory reservation // because we don't know the maximum size of the batches in the streams diff --git a/datafusion/physical-plan/src/sorts/streaming_merge.rs b/datafusion/physical-plan/src/sorts/streaming_merge.rs index 191b135753412..246be18732773 100644 --- a/datafusion/physical-plan/src/sorts/streaming_merge.rs +++ b/datafusion/physical-plan/src/sorts/streaming_merge.rs @@ -29,8 +29,9 @@ use arrow::array::*; use arrow::datatypes::{DataType, SchemaRef}; use datafusion_common::{internal_err, Result}; use datafusion_execution::disk_manager::RefCountedTempFile; +#[allow(unused_imports)] use datafusion_execution::memory_pool::{ - human_readable_size, MemoryConsumer, MemoryPool, MemoryReservation, + human_readable_size, GreedyMemoryPool, MemoryConsumer, MemoryPool, MemoryReservation, UnboundedMemoryPool, }; use datafusion_physical_expr_common::sort_expr::LexOrdering; @@ -161,8 +162,10 @@ impl<'a> StreamingMergeBuilder<'a> { /// Bypass the mempool and avoid using the memory reservation. /// /// This is not marked as `pub` because it is not recommended to use this method - pub(super) fn with_bypass_mempool(self) -> Self { - let mem_pool: Arc = Arc::new(UnboundedMemoryPool::default()); + pub(super) fn with_bypass_mempool(self, memory_limit: usize) -> Self { + // TODO(ding-young) Bypass main memory pool and use separate GreedyMemoryPool for sanity check + // let mem_pool: Arc = Arc::new(UnboundedMemoryPool::default()); + let mem_pool: Arc = Arc::new(GreedyMemoryPool::new(memory_limit)); self.with_reservation( MemoryConsumer::new("merge stream mock memory").register(&mem_pool), From af0aac6eb33bc9f7dcd053310c85895334a44519 Mon Sep 17 00:00:00 2001 From: SeoYoung Lee Date: Fri, 8 Aug 2025 04:16:26 +0000 Subject: [PATCH 2/3] validate whether batch read from spill exceeds max_record_batch_mem --- datafusion/core/tests/sql/runtime_config.rs | 2 +- datafusion/physical-plan/benches/spill_io.rs | 14 ++++-- datafusion/physical-plan/src/sorts/merge.rs | 4 +- .../src/sorts/multi_level_merge.rs | 24 ++++------ .../src/sorts/streaming_merge.rs | 6 +-- datafusion/physical-plan/src/spill/mod.rs | 47 ++++++++++++++++--- .../physical-plan/src/spill/spill_manager.rs | 2 + 7 files changed, 63 insertions(+), 36 deletions(-) diff --git a/datafusion/core/tests/sql/runtime_config.rs b/datafusion/core/tests/sql/runtime_config.rs index 9627d7bccdb04..dfa27f6307670 100644 --- a/datafusion/core/tests/sql/runtime_config.rs +++ b/datafusion/core/tests/sql/runtime_config.rs @@ -193,7 +193,7 @@ async fn test_max_temp_directory_size_enforcement() { .unwrap(); let result = ctx.sql(query).await.unwrap().collect().await; - + println!("result is {result:?}"); assert!( result.is_ok(), "Should not fail due to max temp directory size limit" diff --git a/datafusion/physical-plan/benches/spill_io.rs b/datafusion/physical-plan/benches/spill_io.rs index e42c8073ae1c0..40c8f7634c8c4 100644 --- a/datafusion/physical-plan/benches/spill_io.rs +++ b/datafusion/physical-plan/benches/spill_io.rs @@ -115,8 +115,9 @@ fn bench_spill_io(c: &mut Criterion) { // - Wait for the consumer to finish processing |spill_file| { rt.block_on(async { - let stream = - spill_manager.read_spill_as_stream(spill_file).unwrap(); + let stream = spill_manager + .read_spill_as_stream(spill_file, None) + .unwrap(); let _ = collect(stream).await.unwrap(); }) }, @@ -519,8 +520,9 @@ fn benchmark_spill_batches_for_all_codec( ) .unwrap() .unwrap(); - let stream = - spill_manager.read_spill_as_stream(spill_file).unwrap(); + let stream = spill_manager + .read_spill_as_stream(spill_file, None) + .unwrap(); let _ = collect(stream).await.unwrap(); }) }, @@ -553,7 +555,9 @@ fn benchmark_spill_batches_for_all_codec( let rt = Runtime::new().unwrap(); let start = Instant::now(); rt.block_on(async { - let stream = spill_manager.read_spill_as_stream(spill_file).unwrap(); + let stream = spill_manager + .read_spill_as_stream(spill_file, None) + .unwrap(); let _ = collect(stream).await.unwrap(); }); let read_time = start.elapsed(); diff --git a/datafusion/physical-plan/src/sorts/merge.rs b/datafusion/physical-plan/src/sorts/merge.rs index 3c22a14d11e55..0b0136cd12ced 100644 --- a/datafusion/physical-plan/src/sorts/merge.rs +++ b/datafusion/physical-plan/src/sorts/merge.rs @@ -195,9 +195,7 @@ impl SortPreservingMergeStream { match futures::ready!(self.streams.poll_next(cx, idx)) { None => Poll::Ready(Ok(())), - Some(Err(e)) => { - Poll::Ready(Err(e)) - } + Some(Err(e)) => Poll::Ready(Err(e)), Some(Ok((cursor, batch))) => { self.cursors[idx] = Some(Cursor::new(cursor)); Poll::Ready(self.in_progress.push_batch(idx, batch)) diff --git a/datafusion/physical-plan/src/sorts/multi_level_merge.rs b/datafusion/physical-plan/src/sorts/multi_level_merge.rs index 7821999f6dd22..58d046cc90911 100644 --- a/datafusion/physical-plan/src/sorts/multi_level_merge.rs +++ b/datafusion/physical-plan/src/sorts/multi_level_merge.rs @@ -237,7 +237,8 @@ impl MultiLevelMergeBuilder { let spill_file = self.sorted_spill_files.remove(0); // Not reserving any memory for this disk as we are not holding it in memory - self.spill_manager.read_spill_as_stream(spill_file.file) + self.spill_manager + .read_spill_as_stream(spill_file.file, None) } // Only in memory streams, so merge them all in a single pass @@ -248,15 +249,11 @@ impl MultiLevelMergeBuilder { // If we have no sorted spill files left, this is the last run true, true, - 0, // TODO(ding-young) ) } // Need to merge multiple streams - (spill, inmem) => { - println!( - "[merge_sorted_runs_within_mem_limit] spill{spill}, inmem{inmem}" - ); + (_, _) => { let mut memory_reservation = self.reservation.new_empty(); // Don't account for existing streams memory @@ -278,20 +275,17 @@ impl MultiLevelMergeBuilder { .spill_manager .clone() .with_batch_read_buffer_capacity(buffer_size) - .read_spill_as_stream(spill.file)?; + .read_spill_as_stream( + spill.file, + Some(spill.max_record_batch_memory), + )?; sorted_streams.push(stream); } - println!( - "[merge_sorted_runs_within_mem_limit] memory_reservation:{}", - memory_reservation.size() - ); - // TODO(ding-young) pass memory limit but what about in mem size? let merge_sort_stream = self.create_new_merge_sort( sorted_streams, // If we have no sorted spill files left, this is the last run self.sorted_spill_files.is_empty(), is_only_merging_memory_streams, - memory_reservation.size(), )?; // If we're only merging memory streams, we don't need to attach the memory reservation @@ -317,7 +311,6 @@ impl MultiLevelMergeBuilder { streams: Vec, is_output: bool, all_in_memory: bool, - expected_max_memory_usage: usize, ) -> Result { let mut builder = StreamingMergeBuilder::new() .with_schema(Arc::clone(&self.schema)) @@ -338,8 +331,7 @@ impl MultiLevelMergeBuilder { // (reserving memory for the biggest batch in each stream) // TODO - avoid this hack as this can be broken easily when `SortPreservingMergeStream` // changes the implementation to use more/less memory - println!("create new merge sort with bypass mempool"); - builder = builder.with_bypass_mempool(expected_max_memory_usage); + builder = builder.with_bypass_mempool(); } else { // If we are only merging in-memory streams, we need to use the memory reservation // because we don't know the maximum size of the batches in the streams diff --git a/datafusion/physical-plan/src/sorts/streaming_merge.rs b/datafusion/physical-plan/src/sorts/streaming_merge.rs index 246be18732773..f980a0aa35bad 100644 --- a/datafusion/physical-plan/src/sorts/streaming_merge.rs +++ b/datafusion/physical-plan/src/sorts/streaming_merge.rs @@ -162,10 +162,8 @@ impl<'a> StreamingMergeBuilder<'a> { /// Bypass the mempool and avoid using the memory reservation. /// /// This is not marked as `pub` because it is not recommended to use this method - pub(super) fn with_bypass_mempool(self, memory_limit: usize) -> Self { - // TODO(ding-young) Bypass main memory pool and use separate GreedyMemoryPool for sanity check - // let mem_pool: Arc = Arc::new(UnboundedMemoryPool::default()); - let mem_pool: Arc = Arc::new(GreedyMemoryPool::new(memory_limit)); + pub(super) fn with_bypass_mempool(self) -> Self { + let mem_pool: Arc = Arc::new(UnboundedMemoryPool::default()); self.with_reservation( MemoryConsumer::new("merge stream mock memory").register(&mem_pool), diff --git a/datafusion/physical-plan/src/spill/mod.rs b/datafusion/physical-plan/src/spill/mod.rs index fab62bff840f6..59ebc7cd1e700 100644 --- a/datafusion/physical-plan/src/spill/mod.rs +++ b/datafusion/physical-plan/src/spill/mod.rs @@ -54,8 +54,13 @@ use futures::{FutureExt as _, Stream}; struct SpillReaderStream { schema: SchemaRef, state: SpillReaderStreamState, + /// how much memory the largest memory batch is taking + pub max_record_batch_memory: Option, } +// Small margin allowed to accommodate slight memory accounting variation +const MEMORY_MARGIN: usize = 4096; + /// When we poll for the next batch, we will get back both the batch and the reader, /// so we can call `next` again. type NextRecordBatchResult = Result<(StreamReader>, Option)>; @@ -76,10 +81,15 @@ enum SpillReaderStreamState { } impl SpillReaderStream { - fn new(schema: SchemaRef, spill_file: RefCountedTempFile) -> Self { + fn new( + schema: SchemaRef, + spill_file: RefCountedTempFile, + max_record_batch_memory: Option, + ) -> Self { Self { schema, state: SpillReaderStreamState::Uninitialized(spill_file), + max_record_batch_memory, } } @@ -125,6 +135,27 @@ impl SpillReaderStream { Ok((reader, batch)) => { match batch { Some(batch) => { + if let Some(max_record_batch_memory) = + self.max_record_batch_memory + { + let actual_size = + get_record_batch_memory_size(&batch); + if actual_size + > max_record_batch_memory + MEMORY_MARGIN + { + return Poll::Ready(Some(Err( + DataFusionError::ResourcesExhausted( + format!( + "Record batch memory usage ({actual_size} bytes) exceeds the expected limit ({max_record_batch_memory} bytes)\n + by more than the allowed tolerance ({MEMORY_MARGIN} bytes).\n + This likely indicates a bug in memory accounting during spilling.\n + Please report this issue", + ) + .to_owned(), + ), + ))); + } + } self.state = SpillReaderStreamState::Waiting(reader); Poll::Ready(Some(Ok(batch))) @@ -417,7 +448,7 @@ mod tests { let spilled_rows = spill_manager.metrics.spilled_rows.value(); assert_eq!(spilled_rows, num_rows); - let stream = spill_manager.read_spill_as_stream(spill_file)?; + let stream = spill_manager.read_spill_as_stream(spill_file, None)?; assert_eq!(stream.schema(), schema); let batches = collect(stream).await?; @@ -481,7 +512,7 @@ mod tests { let spilled_rows = spill_manager.metrics.spilled_rows.value(); assert_eq!(spilled_rows, num_rows); - let stream = spill_manager.read_spill_as_stream(spill_file)?; + let stream = spill_manager.read_spill_as_stream(spill_file, None)?; assert_eq!(stream.schema(), dict_schema); let batches = collect(stream).await?; assert_eq!(batches.len(), 2); @@ -512,7 +543,7 @@ mod tests { assert!(spill_file.path().exists()); assert!(max_batch_mem > 0); - let stream = spill_manager.read_spill_as_stream(spill_file)?; + let stream = spill_manager.read_spill_as_stream(spill_file, None)?; assert_eq!(stream.schema(), schema); let batches = collect(stream).await?; @@ -547,7 +578,7 @@ mod tests { let spilled_rows = spill_manager.metrics.spilled_rows.value(); assert_eq!(spilled_rows, num_rows); - let stream = spill_manager.read_spill_as_stream(spill_file)?; + let stream = spill_manager.read_spill_as_stream(spill_file, None)?; assert_eq!(stream.schema(), schema); let batches = collect(stream).await?; @@ -931,8 +962,10 @@ mod tests { .spill_record_batch_and_finish(&batches, "Test2")? .unwrap(); - let mut stream_1 = spill_manager.read_spill_as_stream(spill_file_1)?; - let mut stream_2 = spill_manager.read_spill_as_stream(spill_file_2)?; + let mut stream_1 = + spill_manager.read_spill_as_stream(spill_file_1, None)?; + let mut stream_2 = + spill_manager.read_spill_as_stream(spill_file_2, None)?; stream_1.next().await; stream_2.next().await; diff --git a/datafusion/physical-plan/src/spill/spill_manager.rs b/datafusion/physical-plan/src/spill/spill_manager.rs index ad23bd66a021a..cc39102d89819 100644 --- a/datafusion/physical-plan/src/spill/spill_manager.rs +++ b/datafusion/physical-plan/src/spill/spill_manager.rs @@ -174,10 +174,12 @@ impl SpillManager { pub fn read_spill_as_stream( &self, spill_file_path: RefCountedTempFile, + max_record_batch_memory: Option, ) -> Result { let stream = Box::pin(cooperative(SpillReaderStream::new( Arc::clone(&self.schema), spill_file_path, + max_record_batch_memory, ))); Ok(spawn_buffered(stream, self.batch_read_buffer_capacity)) From 5a510cc7bb500e87f06f42f915e11fd997eb7eed Mon Sep 17 00:00:00 2001 From: SeoYoung Lee Date: Fri, 8 Aug 2025 05:29:25 +0000 Subject: [PATCH 3/3] replace err with warn log --- datafusion/core/tests/sql/runtime_config.rs | 2 +- datafusion/execution/src/memory_pool/pool.rs | 4 +-- .../src/sorts/streaming_merge.rs | 3 +- datafusion/physical-plan/src/spill/mod.rs | 30 +++++++++---------- 4 files changed, 18 insertions(+), 21 deletions(-) diff --git a/datafusion/core/tests/sql/runtime_config.rs b/datafusion/core/tests/sql/runtime_config.rs index dfa27f6307670..9627d7bccdb04 100644 --- a/datafusion/core/tests/sql/runtime_config.rs +++ b/datafusion/core/tests/sql/runtime_config.rs @@ -193,7 +193,7 @@ async fn test_max_temp_directory_size_enforcement() { .unwrap(); let result = ctx.sql(query).await.unwrap().collect().await; - println!("result is {result:?}"); + assert!( result.is_ok(), "Should not fail due to max temp directory size limit" diff --git a/datafusion/execution/src/memory_pool/pool.rs b/datafusion/execution/src/memory_pool/pool.rs index 017cbe65a2840..da456b7071f77 100644 --- a/datafusion/execution/src/memory_pool/pool.rs +++ b/datafusion/execution/src/memory_pool/pool.rs @@ -44,7 +44,6 @@ impl MemoryPool for UnboundedMemoryPool { fn try_grow(&self, reservation: &MemoryReservation, additional: usize) -> Result<()> { self.grow(reservation, additional); - println!("[mem pool] {} used", self.reserved()); Ok(()) } @@ -71,8 +70,7 @@ pub struct GreedyMemoryPool { impl GreedyMemoryPool { /// Create a new pool that can allocate up to `pool_size` bytes pub fn new(pool_size: usize) -> Self { - // debug!("Created new GreedyMemoryPool(pool_size={pool_size})"); - println!("Created new GreedyMemoryPool(pool_size={pool_size})"); + debug!("Created new GreedyMemoryPool(pool_size={pool_size})"); Self { pool_size, used: AtomicUsize::new(0), diff --git a/datafusion/physical-plan/src/sorts/streaming_merge.rs b/datafusion/physical-plan/src/sorts/streaming_merge.rs index f980a0aa35bad..191b135753412 100644 --- a/datafusion/physical-plan/src/sorts/streaming_merge.rs +++ b/datafusion/physical-plan/src/sorts/streaming_merge.rs @@ -29,9 +29,8 @@ use arrow::array::*; use arrow::datatypes::{DataType, SchemaRef}; use datafusion_common::{internal_err, Result}; use datafusion_execution::disk_manager::RefCountedTempFile; -#[allow(unused_imports)] use datafusion_execution::memory_pool::{ - human_readable_size, GreedyMemoryPool, MemoryConsumer, MemoryPool, MemoryReservation, + human_readable_size, MemoryConsumer, MemoryPool, MemoryReservation, UnboundedMemoryPool, }; use datafusion_physical_expr_common::sort_expr::LexOrdering; diff --git a/datafusion/physical-plan/src/spill/mod.rs b/datafusion/physical-plan/src/spill/mod.rs index 59ebc7cd1e700..33a7cdbe3f721 100644 --- a/datafusion/physical-plan/src/spill/mod.rs +++ b/datafusion/physical-plan/src/spill/mod.rs @@ -43,6 +43,7 @@ use datafusion_common_runtime::SpawnedTask; use datafusion_execution::disk_manager::RefCountedTempFile; use datafusion_execution::RecordBatchStream; use futures::{FutureExt as _, Stream}; +use log::warn; /// Stream that reads spill files from disk where each batch is read in a spawned blocking task /// It will read one batch at a time and will not do any buffering, to buffer data use [`crate::common::spawn_buffered`] @@ -54,12 +55,15 @@ use futures::{FutureExt as _, Stream}; struct SpillReaderStream { schema: SchemaRef, state: SpillReaderStreamState, - /// how much memory the largest memory batch is taking - pub max_record_batch_memory: Option, + /// Maximum memory size observed among spilling sorted record batches. + /// This is used for validation purposes during reading each RecordBatch from spill. + /// For context on why this value is recorded and validated, + /// see `physical_plan/sort/multi_level_merge.rs`. + max_record_batch_memory: Option, } // Small margin allowed to accommodate slight memory accounting variation -const MEMORY_MARGIN: usize = 4096; +const SPILL_BATCH_MEMORY_MARGIN: usize = 4096; /// When we poll for the next batch, we will get back both the batch and the reader, /// so we can call `next` again. @@ -141,19 +145,15 @@ impl SpillReaderStream { let actual_size = get_record_batch_memory_size(&batch); if actual_size - > max_record_batch_memory + MEMORY_MARGIN + > max_record_batch_memory + + SPILL_BATCH_MEMORY_MARGIN { - return Poll::Ready(Some(Err( - DataFusionError::ResourcesExhausted( - format!( - "Record batch memory usage ({actual_size} bytes) exceeds the expected limit ({max_record_batch_memory} bytes)\n - by more than the allowed tolerance ({MEMORY_MARGIN} bytes).\n - This likely indicates a bug in memory accounting during spilling.\n - Please report this issue", - ) - .to_owned(), - ), - ))); + warn!( + "Record batch memory usage ({actual_size} bytes) exceeds the expected limit ({max_record_batch_memory} bytes) \n\ + by more than the allowed tolerance ({SPILL_BATCH_MEMORY_MARGIN} bytes).\n\ + This likely indicates a bug in memory accounting during spilling.\n\ + Please report this issue in https://github.com/apache/datafusion/issues/17340." + ); } } self.state = SpillReaderStreamState::Waiting(reader);