From 290200c241b4badc0de2527c2d0f4737b3f44c53 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 3 Apr 2026 18:53:32 +0300 Subject: [PATCH 1/6] perf(decoding): pre-allocate decode buffer from sequence block analysis - Block-level: calculate exact output size (sum of match lengths + literals buffer length) before executing sequences, issue a single reserve() call instead of per-sequence re-allocations - Frame-level: when frame_content_size is declared in the header, pre-allocate the decode buffer upfront to avoid incremental growth - RLE/Raw blocks: reserve decompressed_size before the push loop Eliminates repeated re-allocations in the hot decode path. Closes #20 --- zstd/src/decoding/block_decoder.rs | 4 ++++ zstd/src/decoding/decode_buffer.rs | 8 ++++++++ zstd/src/decoding/frame_decoder.rs | 18 +++++++++++++++++- zstd/src/decoding/sequence_execution.rs | 6 ++++++ 4 files changed, 35 insertions(+), 1 deletion(-) diff --git a/zstd/src/decoding/block_decoder.rs b/zstd/src/decoding/block_decoder.rs index 023ad964..abae0560 100644 --- a/zstd/src/decoding/block_decoder.rs +++ b/zstd/src/decoding/block_decoder.rs @@ -58,6 +58,8 @@ impl BlockDecoder { let full_reads = header.decompressed_size / BATCH_SIZE as u32; let single_read_size = header.decompressed_size % BATCH_SIZE as u32; + workspace.buffer.reserve(header.decompressed_size as usize); + source.read_exact(&mut buf[0..1]).map_err(|err| { DecodeBlockContentError::ReadError { step: block_type, @@ -84,6 +86,8 @@ impl BlockDecoder { let full_reads = header.decompressed_size / BATCH_SIZE as u32; let single_read_size = header.decompressed_size % BATCH_SIZE as u32; + workspace.buffer.reserve(header.decompressed_size as usize); + for _ in 0..full_reads { source.read_exact(&mut buf[..]).map_err(|err| { DecodeBlockContentError::ReadError { diff --git a/zstd/src/decoding/decode_buffer.rs b/zstd/src/decoding/decode_buffer.rs index 51f7b7be..e8fe11bc 100644 --- a/zstd/src/decoding/decode_buffer.rs +++ b/zstd/src/decoding/decode_buffer.rs @@ -60,6 +60,14 @@ impl DecodeBuffer { self.buffer.len() } + /// Pre-allocate capacity for `amount` additional bytes. + /// + /// Call this before a batch of `push`/`repeat` operations to avoid + /// repeated re-allocations inside the hot decode loop. + pub fn reserve(&mut self, amount: usize) { + self.buffer.reserve(amount); + } + pub fn push(&mut self, data: &[u8]) { self.buffer.extend(data); self.total_output_counter += data.len() as u64; diff --git a/zstd/src/decoding/frame_decoder.rs b/zstd/src/decoding/frame_decoder.rs index ca89326c..3e9b7b6c 100644 --- a/zstd/src/decoding/frame_decoder.rs +++ b/zstd/src/decoding/frame_decoder.rs @@ -96,11 +96,21 @@ impl FrameDecoderState { pub fn new(source: impl Read) -> Result { let (frame, header_size) = frame::read_frame_header(source)?; let window_size = frame.window_size()?; + let mut decoder_scratch = DecoderScratch::new(window_size as usize); + // When the frame header declares the decompressed size, pre-allocate + // for the full content to avoid incremental re-allocations. + let fcs = frame.frame_content_size(); + let reserve = if fcs > 0 && fcs <= MAXIMUM_ALLOWED_WINDOW_SIZE { + fcs as usize + } else { + window_size as usize + }; + decoder_scratch.buffer.reserve(reserve); Ok(FrameDecoderState { frame_header: frame, frame_finished: false, block_counter: 0, - decoder_scratch: DecoderScratch::new(window_size as usize), + decoder_scratch, bytes_read_counter: u64::from(header_size), check_sum: None, using_dict: None, @@ -121,6 +131,12 @@ impl FrameDecoderState { self.frame_finished = false; self.block_counter = 0; self.decoder_scratch.reset(window_size as usize); + // When the frame header declares the decompressed size, pre-allocate + // for the full content to avoid incremental re-allocations. + let fcs = self.frame_header.frame_content_size(); + if fcs > 0 && fcs <= MAXIMUM_ALLOWED_WINDOW_SIZE { + self.decoder_scratch.buffer.reserve(fcs as usize); + } self.bytes_read_counter = u64::from(header_size); self.check_sum = None; self.using_dict = None; diff --git a/zstd/src/decoding/sequence_execution.rs b/zstd/src/decoding/sequence_execution.rs index afaf9b87..7ab1a2a1 100644 --- a/zstd/src/decoding/sequence_execution.rs +++ b/zstd/src/decoding/sequence_execution.rs @@ -8,6 +8,12 @@ pub fn execute_sequences(scratch: &mut DecoderScratch) -> Result<(), ExecuteSequ let old_buffer_size = scratch.buffer.len(); let mut seq_sum = 0; + // Pre-allocate the exact output size for this block in one shot. + // Total output = all literals (copied via sequences + trailing) + all match bytes. + let total_match_len: usize = scratch.sequences.iter().map(|s| s.ml as usize).sum(); + let total_output = total_match_len + scratch.literals_buffer.len(); + scratch.buffer.reserve(total_output); + for idx in 0..scratch.sequences.len() { let seq = scratch.sequences[idx]; From a7c66e04734a2bf900f5719865d14b0da2577094 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 3 Apr 2026 19:18:45 +0300 Subject: [PATCH 2/6] fix(decoding): bound pre-allocation against malformed input - Enforce MAXIMUM_ALLOWED_WINDOW_SIZE in FrameDecoderState::new() - Clamp sequence pre-allocation to MAX_BLOCK_SIZE (128KB) - Document decode_block_content, FrameDecoderState::new/reset --- zstd/src/decoding/block_decoder.rs | 7 ++++++- zstd/src/decoding/frame_decoder.rs | 14 ++++++++++++++ zstd/src/decoding/sequence_execution.rs | 6 +++++- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/zstd/src/decoding/block_decoder.rs b/zstd/src/decoding/block_decoder.rs index abae0560..0938d680 100644 --- a/zstd/src/decoding/block_decoder.rs +++ b/zstd/src/decoding/block_decoder.rs @@ -36,10 +36,15 @@ pub fn new() -> BlockDecoder { } impl BlockDecoder { + /// Decode the body of a single block described by `header` from `source` into `workspace`. + /// + /// Returns the number of bytes consumed from `source`. + /// The decode buffer inside `workspace` is pre-allocated for the expected + /// decompressed size before any data is written. pub fn decode_block_content( &mut self, header: &BlockHeader, - workspace: &mut DecoderScratch, //reuse this as often as possible. Not only if the trees are reused but also reuse the allocations when building new trees + workspace: &mut DecoderScratch, mut source: impl Read, ) -> Result { match self.internal_state { diff --git a/zstd/src/decoding/frame_decoder.rs b/zstd/src/decoding/frame_decoder.rs index 3e9b7b6c..f7d89ece 100644 --- a/zstd/src/decoding/frame_decoder.rs +++ b/zstd/src/decoding/frame_decoder.rs @@ -93,9 +93,20 @@ pub enum BlockDecodingStrategy { } impl FrameDecoderState { + /// Read the frame header from `source` and create a new decoder state. + /// + /// Pre-allocates the decode buffer based on the declared frame content size + /// (when available) or falls back to the window size. pub fn new(source: impl Read) -> Result { let (frame, header_size) = frame::read_frame_header(source)?; let window_size = frame.window_size()?; + + if window_size > MAXIMUM_ALLOWED_WINDOW_SIZE { + return Err(FrameDecoderError::WindowSizeTooBig { + requested: window_size, + }); + } + let mut decoder_scratch = DecoderScratch::new(window_size as usize); // When the frame header declares the decompressed size, pre-allocate // for the full content to avoid incremental re-allocations. @@ -117,6 +128,9 @@ impl FrameDecoderState { }) } + /// Reset this state for a new frame read from `source`, reusing existing allocations. + /// + /// Pre-allocates the decode buffer when the frame content size is declared. pub fn reset(&mut self, source: impl Read) -> Result<(), FrameDecoderError> { let (frame_header, header_size) = frame::read_frame_header(source)?; let window_size = frame_header.window_size()?; diff --git a/zstd/src/decoding/sequence_execution.rs b/zstd/src/decoding/sequence_execution.rs index 7ab1a2a1..0e4a97ea 100644 --- a/zstd/src/decoding/sequence_execution.rs +++ b/zstd/src/decoding/sequence_execution.rs @@ -1,5 +1,6 @@ use super::prefetch; use super::scratch::DecoderScratch; +use crate::common::MAX_BLOCK_SIZE; use crate::decoding::errors::ExecuteSequencesError; /// Take the provided decoder and execute the sequences stored within @@ -10,8 +11,11 @@ pub fn execute_sequences(scratch: &mut DecoderScratch) -> Result<(), ExecuteSequ // Pre-allocate the exact output size for this block in one shot. // Total output = all literals (copied via sequences + trailing) + all match bytes. + // Clamped to MAX_BLOCK_SIZE to guard against corrupted inputs that could + // trigger huge allocations before semantic validation catches the error. let total_match_len: usize = scratch.sequences.iter().map(|s| s.ml as usize).sum(); - let total_output = total_match_len + scratch.literals_buffer.len(); + let total_output = + (total_match_len + scratch.literals_buffer.len()).min(MAX_BLOCK_SIZE as usize); scratch.buffer.reserve(total_output); for idx in 0..scratch.sequences.len() { From e1b3bf561a46544f02681bf2e3d7ee687fdcf54c Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 3 Apr 2026 19:35:21 +0300 Subject: [PATCH 3/6] perf(decoding): use MAX_BLOCK_SIZE for sequence pre-allocation - Replace per-block exact sum with constant MAX_BLOCK_SIZE reserve, eliminating extra iteration over sequences and overflow risk - Fix WindowSizeTooBig error message to report the enforced implementation limit (100 MiB) instead of the spec maximum - Make MAXIMUM_ALLOWED_WINDOW_SIZE pub(crate) with doc comment --- zstd/src/decoding/errors.rs | 4 ++-- zstd/src/decoding/frame_decoder.rs | 9 ++++++--- zstd/src/decoding/sequence_execution.rs | 13 +++++-------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/zstd/src/decoding/errors.rs b/zstd/src/decoding/errors.rs index 06a0085b..f5218d8a 100644 --- a/zstd/src/decoding/errors.rs +++ b/zstd/src/decoding/errors.rs @@ -531,9 +531,9 @@ impl core::fmt::Display for FrameDecoderError { FrameDecoderError::WindowSizeTooBig { requested } => { write!( f, - "Specified window_size is too big; Requested: {}, Max: {}", + "Specified window_size is too big; Requested: {}, Allowed: {}", requested, - crate::common::MAX_WINDOW_SIZE, + crate::decoding::frame_decoder::MAXIMUM_ALLOWED_WINDOW_SIZE, ) } FrameDecoderError::DictionaryDecodeError(e) => { diff --git a/zstd/src/decoding/frame_decoder.rs b/zstd/src/decoding/frame_decoder.rs index f7d89ece..711f529d 100644 --- a/zstd/src/decoding/frame_decoder.rs +++ b/zstd/src/decoding/frame_decoder.rs @@ -16,7 +16,10 @@ use core::convert::TryInto; /// While the maximum window size allowed by the spec is significantly larger, /// our implementation limits it to 100mb to protect against malformed frames. -const MAXIMUM_ALLOWED_WINDOW_SIZE: u64 = 1024 * 1024 * 100; +/// Implementation limit for window size (100 MiB) to protect against +/// malformed frames. The zstd spec allows much larger windows, but this +/// cap prevents excessive memory allocation on untrusted input. +pub(crate) const MAXIMUM_ALLOWED_WINDOW_SIZE: u64 = 1024 * 1024 * 100; /// Low level Zstandard decoder that can be used to decompress frames with fine control over when and how many bytes are decoded. /// @@ -144,9 +147,9 @@ impl FrameDecoderState { self.frame_header = frame_header; self.frame_finished = false; self.block_counter = 0; + // reset() already reserves window_size internally via DecodeBuffer::reset, + // so we only need an additional reserve when FCS exceeds that. self.decoder_scratch.reset(window_size as usize); - // When the frame header declares the decompressed size, pre-allocate - // for the full content to avoid incremental re-allocations. let fcs = self.frame_header.frame_content_size(); if fcs > 0 && fcs <= MAXIMUM_ALLOWED_WINDOW_SIZE { self.decoder_scratch.buffer.reserve(fcs as usize); diff --git a/zstd/src/decoding/sequence_execution.rs b/zstd/src/decoding/sequence_execution.rs index 0e4a97ea..fd5e5656 100644 --- a/zstd/src/decoding/sequence_execution.rs +++ b/zstd/src/decoding/sequence_execution.rs @@ -9,14 +9,11 @@ pub fn execute_sequences(scratch: &mut DecoderScratch) -> Result<(), ExecuteSequ let old_buffer_size = scratch.buffer.len(); let mut seq_sum = 0; - // Pre-allocate the exact output size for this block in one shot. - // Total output = all literals (copied via sequences + trailing) + all match bytes. - // Clamped to MAX_BLOCK_SIZE to guard against corrupted inputs that could - // trigger huge allocations before semantic validation catches the error. - let total_match_len: usize = scratch.sequences.iter().map(|s| s.ml as usize).sum(); - let total_output = - (total_match_len + scratch.literals_buffer.len()).min(MAX_BLOCK_SIZE as usize); - scratch.buffer.reserve(total_output); + // Reserve once for the maximum possible decoded block output (128 KB per + // the zstd spec). This avoids repeated re-allocations inside the hot + // execute loop without an extra scan over the sequence vector, and is + // inherently bounded against corrupted inputs. + scratch.buffer.reserve(MAX_BLOCK_SIZE as usize); for idx in 0..scratch.sequences.len() { let seq = scratch.sequences[idx]; From c006a9b7b797b566c18ebad840d6ae7ccb1b183a Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 3 Apr 2026 19:55:32 +0300 Subject: [PATCH 4/6] perf(decoding): limit frame-level reserve to window_size - Remove FCS-based pre-allocation that could reserve up to 100 MiB even for streaming callers that drain incrementally - Keep window_size reservation in new() for initial capacity - Consolidate duplicate doc comment on MAXIMUM_ALLOWED_WINDOW_SIZE --- zstd/src/decoding/frame_decoder.rs | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/zstd/src/decoding/frame_decoder.rs b/zstd/src/decoding/frame_decoder.rs index 711f529d..c2e5635a 100644 --- a/zstd/src/decoding/frame_decoder.rs +++ b/zstd/src/decoding/frame_decoder.rs @@ -14,8 +14,6 @@ use alloc::collections::BTreeMap; use alloc::vec::Vec; use core::convert::TryInto; -/// While the maximum window size allowed by the spec is significantly larger, -/// our implementation limits it to 100mb to protect against malformed frames. /// Implementation limit for window size (100 MiB) to protect against /// malformed frames. The zstd spec allows much larger windows, but this /// cap prevents excessive memory allocation on untrusted input. @@ -98,8 +96,8 @@ pub enum BlockDecodingStrategy { impl FrameDecoderState { /// Read the frame header from `source` and create a new decoder state. /// - /// Pre-allocates the decode buffer based on the declared frame content size - /// (when available) or falls back to the window size. + /// Pre-allocates the decode buffer to `window_size` so the first block + /// does not trigger incremental growth from zero capacity. pub fn new(source: impl Read) -> Result { let (frame, header_size) = frame::read_frame_header(source)?; let window_size = frame.window_size()?; @@ -111,15 +109,7 @@ impl FrameDecoderState { } let mut decoder_scratch = DecoderScratch::new(window_size as usize); - // When the frame header declares the decompressed size, pre-allocate - // for the full content to avoid incremental re-allocations. - let fcs = frame.frame_content_size(); - let reserve = if fcs > 0 && fcs <= MAXIMUM_ALLOWED_WINDOW_SIZE { - fcs as usize - } else { - window_size as usize - }; - decoder_scratch.buffer.reserve(reserve); + decoder_scratch.buffer.reserve(window_size as usize); Ok(FrameDecoderState { frame_header: frame, frame_finished: false, @@ -133,7 +123,9 @@ impl FrameDecoderState { /// Reset this state for a new frame read from `source`, reusing existing allocations. /// - /// Pre-allocates the decode buffer when the frame content size is declared. + /// `DecodeBuffer::reset` reserves `window_size` internally; no additional + /// frame-level reservation is needed since block-level pre-allocation + /// (MAX_BLOCK_SIZE per block) handles growth during decoding. pub fn reset(&mut self, source: impl Read) -> Result<(), FrameDecoderError> { let (frame_header, header_size) = frame::read_frame_header(source)?; let window_size = frame_header.window_size()?; @@ -147,13 +139,7 @@ impl FrameDecoderState { self.frame_header = frame_header; self.frame_finished = false; self.block_counter = 0; - // reset() already reserves window_size internally via DecodeBuffer::reset, - // so we only need an additional reserve when FCS exceeds that. self.decoder_scratch.reset(window_size as usize); - let fcs = self.frame_header.frame_content_size(); - if fcs > 0 && fcs <= MAXIMUM_ALLOWED_WINDOW_SIZE { - self.decoder_scratch.buffer.reserve(fcs as usize); - } self.bytes_read_counter = u64::from(header_size); self.check_sum = None; self.using_dict = None; From 4d38ca773ea4ffc18da68138c07f2f5209b41f3e Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 3 Apr 2026 20:23:35 +0300 Subject: [PATCH 5/6] refactor(decoding): move MAXIMUM_ALLOWED_WINDOW_SIZE to common module - Relocate constant from frame_decoder to crate::common - Clarify decode_block_content and reset() doc strings - Reference shared constant from errors module --- zstd/src/common/mod.rs | 5 +++++ zstd/src/decoding/block_decoder.rs | 5 +++-- zstd/src/decoding/errors.rs | 2 +- zstd/src/decoding/frame_decoder.rs | 11 ++++------- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/zstd/src/common/mod.rs b/zstd/src/common/mod.rs index e4736bae..ee51adf6 100644 --- a/zstd/src/common/mod.rs +++ b/zstd/src/common/mod.rs @@ -19,3 +19,8 @@ pub const MAX_WINDOW_SIZE: u64 = (1 << 41) + 7 * (1 << 38); /// /// pub const MAX_BLOCK_SIZE: u32 = 128 * 1024; + +/// Implementation limit for window size (100 MiB) to protect against +/// malformed frames. The zstd spec allows much larger windows, but this +/// cap prevents excessive memory allocation on untrusted input. +pub const MAXIMUM_ALLOWED_WINDOW_SIZE: u64 = 1024 * 1024 * 100; diff --git a/zstd/src/decoding/block_decoder.rs b/zstd/src/decoding/block_decoder.rs index 0938d680..cf7f5504 100644 --- a/zstd/src/decoding/block_decoder.rs +++ b/zstd/src/decoding/block_decoder.rs @@ -39,8 +39,9 @@ impl BlockDecoder { /// Decode the body of a single block described by `header` from `source` into `workspace`. /// /// Returns the number of bytes consumed from `source`. - /// The decode buffer inside `workspace` is pre-allocated for the expected - /// decompressed size before any data is written. + /// The decode buffer inside `workspace` may be reserved or grown during + /// decoding. For some block types the decompressed size is known up front, + /// but this is not guaranteed before any data is written. pub fn decode_block_content( &mut self, header: &BlockHeader, diff --git a/zstd/src/decoding/errors.rs b/zstd/src/decoding/errors.rs index f5218d8a..a7381b22 100644 --- a/zstd/src/decoding/errors.rs +++ b/zstd/src/decoding/errors.rs @@ -533,7 +533,7 @@ impl core::fmt::Display for FrameDecoderError { f, "Specified window_size is too big; Requested: {}, Allowed: {}", requested, - crate::decoding::frame_decoder::MAXIMUM_ALLOWED_WINDOW_SIZE, + crate::common::MAXIMUM_ALLOWED_WINDOW_SIZE, ) } FrameDecoderError::DictionaryDecodeError(e) => { diff --git a/zstd/src/decoding/frame_decoder.rs b/zstd/src/decoding/frame_decoder.rs index c2e5635a..020f299a 100644 --- a/zstd/src/decoding/frame_decoder.rs +++ b/zstd/src/decoding/frame_decoder.rs @@ -14,10 +14,7 @@ use alloc::collections::BTreeMap; use alloc::vec::Vec; use core::convert::TryInto; -/// Implementation limit for window size (100 MiB) to protect against -/// malformed frames. The zstd spec allows much larger windows, but this -/// cap prevents excessive memory allocation on untrusted input. -pub(crate) const MAXIMUM_ALLOWED_WINDOW_SIZE: u64 = 1024 * 1024 * 100; +use crate::common::MAXIMUM_ALLOWED_WINDOW_SIZE; /// Low level Zstandard decoder that can be used to decompress frames with fine control over when and how many bytes are decoded. /// @@ -123,9 +120,9 @@ impl FrameDecoderState { /// Reset this state for a new frame read from `source`, reusing existing allocations. /// - /// `DecodeBuffer::reset` reserves `window_size` internally; no additional - /// frame-level reservation is needed since block-level pre-allocation - /// (MAX_BLOCK_SIZE per block) handles growth during decoding. + /// `DecodeBuffer::reset` reserves `window_size` internally, so no + /// additional frame-level reservation is needed here. Further buffer + /// growth during decoding is performed on demand by the active block path. pub fn reset(&mut self, source: impl Read) -> Result<(), FrameDecoderError> { let (frame_header, header_size) = frame::read_frame_header(source)?; let window_size = frame_header.window_size()?; From b6ddf29738972a937aa10d40c67175bb75a1e63a Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 3 Apr 2026 21:02:38 +0300 Subject: [PATCH 6/6] perf(decoding): inline DecodeBuffer::reserve forwarding method --- zstd/src/decoding/decode_buffer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/zstd/src/decoding/decode_buffer.rs b/zstd/src/decoding/decode_buffer.rs index e8fe11bc..def2db96 100644 --- a/zstd/src/decoding/decode_buffer.rs +++ b/zstd/src/decoding/decode_buffer.rs @@ -64,6 +64,7 @@ impl DecodeBuffer { /// /// Call this before a batch of `push`/`repeat` operations to avoid /// repeated re-allocations inside the hot decode loop. + #[inline] pub fn reserve(&mut self, amount: usize) { self.buffer.reserve(amount); }