From a40e31729598772513dfebdf770d0a2fb50964b8 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 14:04:47 +0300 Subject: [PATCH 01/18] feat(encoding): numeric compression levels (1-22) API - Add CompressionLevel::Level(i32) variant and from_level() constructor - Port C zstd level parameter table (levels 1-22) with per-level window size, hash/chain config, search depth, and lazy depth - Negative levels (-1..-131072) select ultra-fast mode via Simple backend with progressively coarser hash insertion - Level 0 maps to default (level 3), matching C zstd semantics - Named variants map exactly to their numeric equivalents: Fastest=1, Default=3, Better=7, Best=11 - Refactor MatchGeneratorDriver::reset() to use centralized LevelParams instead of per-variant hardcoded constants - Update CLI to accept zstd-compatible numeric levels (-5..22) - Add 11 roundtrip tests covering equivalence, all 22 levels, negative levels, monotonic ratio, streaming, and clamping Closes #21 --- README.md | 7 +- cli/src/main.rs | 56 +++++---- zstd/src/encoding/frame_compressor.rs | 15 ++- zstd/src/encoding/match_generator.rs | 167 ++++++++++++++++--------- zstd/src/encoding/mod.rs | 37 +++++- zstd/src/encoding/streaming_encoder.rs | 11 +- zstd/src/tests/roundtrip_integrity.rs | 126 +++++++++++++++++++ 7 files changed, 326 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index 3bb97f6a..f6578fdf 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ This is a **maintained fork** of [KillingSpark/zstd-rs](https://github.com/Killi **Fork goals:** - Dictionary compression improvements (critical for per-label trained dictionaries in LSM-tree) - Performance parity with C zstd for decompression (currently 1.4-3.5x slower) -- Additional compression levels (Fastest, Default, Better, and Best are all implemented) +- Full numeric compression levels (1–22 plus negative ultra-fast, C zstd compatible) - No FFI — pure `cargo build`, no cmake/system libraries (ADR-013 
compliance) **Upstream relationship:** We periodically sync with upstream but maintain an independent development trajectory focused on CoordiNode requirements. @@ -46,6 +46,8 @@ Complete RFC 8878 implementation. Performance: ~1.4-3.5x slower than C zstd depe - [x] Default (roughly level 3) - [x] Better (roughly level 7) - [x] Best (roughly level 11) +- [x] Numeric levels 1–22 via `CompressionLevel::from_level(n)` (C zstd compatible numbering) +- [x] Negative levels for ultra-fast compression - [x] Checksums - [x] Frame Content Size — `FrameCompressor` writes FCS automatically; `StreamingEncoder` requires `set_pledged_content_size()` before first write - [x] Dictionary compression @@ -67,7 +69,10 @@ Performance tracking lives in [BENCHMARKS.md](BENCHMARKS.md). The suite compares use structured_zstd::encoding::{compress, compress_to_vec, CompressionLevel}; let data: &[u8] = b"hello world"; +// Named level let compressed = compress_to_vec(data, CompressionLevel::Fastest); +// Numeric level (C zstd compatible: 1-22, negative for ultra-fast) +let compressed = compress_to_vec(data, CompressionLevel::from_level(7)); ``` ```rust,no_run diff --git a/cli/src/main.rs b/cli/src/main.rs index 1562da3c..cfd652d1 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -34,23 +34,25 @@ enum Commands { /// Where the compressed file is written /// [default: .zst] output_file: Option, - /// How thoroughly the file should be compressed. A higher level will take - /// more time to compress but result in a smaller file, and vice versa. + /// Compression level using C zstd numbering (higher = smaller, slower). 
/// - /// - 0: Uncompressed - /// - 1: Fastest - /// - 2: Default - /// - 3: Better (lazy2, ~zstd level 7) - /// - 4: Best (deep lazy2, ~zstd level 11) + /// - 0: Uncompressed (no compression, raw zstd frame) + /// - 1: Fastest (fast hash, ~zstd level 1) + /// - 3: Default (dfast, ~zstd level 3) + /// - 7: Better (lazy2, ~zstd level 7) + /// - 11: Best (deep lazy2, ~zstd level 11) + /// - Negative: ultra-fast modes (less compression, more speed) + /// - 12-22: progressively higher ratio (capped at lazy2 backend) #[arg( short, long, - value_name = "COMPRESSION_LEVEL", - default_value_t = 2, - value_parser = clap::value_parser!(u8).range(0..=4), - verbatim_doc_comment + value_name = "LEVEL", + default_value_t = 3, + value_parser = clap::value_parser!(i32).range(-5..=22), + verbatim_doc_comment, + allow_hyphen_values = true, )] - level: u8, + level: i32, }, Decompress { /// .zst archive to decompress @@ -101,15 +103,11 @@ fn main() -> color_eyre::Result<()> { Ok(()) } -fn compress(input: PathBuf, output: PathBuf, level: u8) -> color_eyre::Result<()> { +fn compress(input: PathBuf, output: PathBuf, level: i32) -> color_eyre::Result<()> { info!("compressing {input:?} to {output:?}"); let compression_level: structured_zstd::encoding::CompressionLevel = match level { 0 => CompressionLevel::Uncompressed, - 1 => CompressionLevel::Fastest, - 2 => CompressionLevel::Default, - 3 => CompressionLevel::Better, - 4 => CompressionLevel::Best, - _ => return Err(eyre!("unsupported compression level: {level}")), + n => CompressionLevel::from_level(n), }; ensure_distinct_paths(&input, &output)?; ensure_regular_output_destination(&output)?; @@ -402,7 +400,19 @@ mod tests { #[test] fn cli_rejects_unsupported_compression_level_at_parse_time() { - let parse = Cli::try_parse_from(["structured-zstd", "compress", "in.bin", "--level", "5"]); + let parse = Cli::try_parse_from(["structured-zstd", "compress", "in.bin", "--level", "23"]); + assert!(parse.is_err()); + } + + #[test] + fn 
cli_accepts_negative_compression_level() { + let parse = Cli::try_parse_from(["structured-zstd", "compress", "in.bin", "--level", "-3"]); + assert!(parse.is_ok()); + } + + #[test] + fn cli_rejects_too_negative_compression_level() { + let parse = Cli::try_parse_from(["structured-zstd", "compress", "in.bin", "--level", "-6"]); assert!(parse.is_err()); } @@ -415,7 +425,7 @@ mod tests { let input = std::env::temp_dir().join(format!("structured-zstd-cli-alias-{unique}.txt")); fs::write(&input, b"streaming-cli-alias-check").unwrap(); - let err = compress(input.clone(), input.clone(), 2).unwrap_err(); + let err = compress(input.clone(), input.clone(), 3).unwrap_err(); let message = format!("{err:#}"); assert!( message.contains("input and output"), @@ -434,7 +444,7 @@ mod tests { fs::write(&input, b"streaming-cli-hardlink-check").unwrap(); fs::hard_link(&input, &output).unwrap(); - let err = compress(input.clone(), output.clone(), 2).unwrap_err(); + let err = compress(input.clone(), output.clone(), 3).unwrap_err(); let message = format!("{err:#}"); assert!( message.contains("input and output"), @@ -455,7 +465,7 @@ mod tests { let output = std::env::temp_dir().join(format!("structured-zstd-cli-missing-output-{unique}.zst")); - let err = compress(missing_input, output.clone(), 2).unwrap_err(); + let err = compress(missing_input, output.clone(), 3).unwrap_err(); let message = format!("{err:#}"); assert!( message.contains("failed to open input file"), @@ -473,7 +483,7 @@ mod tests { let output = dir.join("existing-dir"); fs::create_dir(&output).unwrap(); - let err = compress(input, output.clone(), 2).unwrap_err(); + let err = compress(input, output.clone(), 3).unwrap_err(); let message = format!("{err:#}"); assert!( message.contains("not a regular file"), diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs index 42c941b9..c4cdc951 100644 --- a/zstd/src/encoding/frame_compressor.rs +++ b/zstd/src/encoding/frame_compressor.rs @@ -274,7 
+274,8 @@ impl FrameCompressor { CompressionLevel::Fastest | CompressionLevel::Default | CompressionLevel::Better - | CompressionLevel::Best => compress_block_encoded( + | CompressionLevel::Best + | CompressionLevel::Level(_) => compress_block_encoded( &mut self.state, last_block, uncompressed_data, @@ -476,7 +477,7 @@ mod tests { data.len() as u64, "FCS mismatch for len={} level={:?}", data.len(), - level as u8, + level, ); // Confirm the FCS field is actually present in the header // (not just the decoder returning 0 for absent FCS). @@ -485,7 +486,7 @@ mod tests { 0, "FCS field must be present for len={} level={:?}", data.len(), - level as u8, + level, ); // Verify C zstd can decompress let mut decoded = Vec::new(); @@ -883,8 +884,10 @@ mod tests { crate::decoding::Dictionary::from_raw_content(dict_id, b"abcdefgh".to_vec()) .expect("raw dictionary should be valid"); - let payload = b"abcdefgh".repeat(512); - let matcher = MatchGeneratorDriver::new(8, 1); + // Payload must exceed the encoder's advertised window (128 KiB for + // Fastest) so the test actually exercises cross-window-boundary behavior. + let payload = b"abcdefgh".repeat(128 * 1024 / 8 + 64); + let matcher = MatchGeneratorDriver::new(1024, 1); let mut no_dict_output = Vec::new(); let mut no_dict_compressor = @@ -900,7 +903,7 @@ mod tests { .expect("window size should be present"); let mut output = Vec::new(); - let matcher = MatchGeneratorDriver::new(8, 1); + let matcher = MatchGeneratorDriver::new(1024, 1); let mut compressor = FrameCompressor::new_with_matcher(matcher, super::CompressionLevel::Fastest); compressor diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index 7eaae2a5..d3e1cb52 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -23,8 +23,6 @@ const DFAST_TARGET_LEN: usize = 48; // measurements show we can shrink them without regressing acceptance tests. 
const DFAST_HASH_BITS: usize = 20; const DFAST_SEARCH_DEPTH: usize = 4; -const DFAST_DEFAULT_WINDOW_SIZE: usize = 1 << 22; -const BETTER_DEFAULT_WINDOW_SIZE: usize = 1 << 23; const DFAST_EMPTY_SLOT: usize = usize::MAX; const HC_HASH_LOG: usize = 20; @@ -36,7 +34,6 @@ const HC_TARGET_LEN: usize = 48; // that can never collide with any valid position, even at the 4 GiB boundary. const HC_EMPTY: u32 = 0; -const BEST_DEFAULT_WINDOW_SIZE: usize = 1 << 24; // Maximum search depth across all HC-based levels. Used to size the // fixed-length candidate array returned by chain_candidates(). const MAX_HC_SEARCH_DEPTH: usize = 32; @@ -66,6 +63,91 @@ const BEST_HC_CONFIG: HcConfig = HcConfig { target_len: 128, }; +/// Resolved tuning parameters for a compression level. +#[derive(Copy, Clone)] +struct LevelParams { + backend: MatcherBackend, + window_log: u8, + hash_fill_step: usize, + lazy_depth: u8, + hc: HcConfig, +} + +/// Parameter table for numeric compression levels 1–22. +/// +/// Each entry maps a zstd compression level to the best-available matcher +/// backend and tuning knobs. Levels that require strategies this crate does +/// not implement (greedy, btopt, btultra) are approximated with the closest +/// available backend. +/// +/// Index 0 = level 1, index 21 = level 22. 
+#[rustfmt::skip] +const LEVEL_TABLE: [LevelParams; 22] = [ + // Lvl Strategy wlog step lazy HC config + // --- -------------- ---- ---- ---- ------------------------------------------ + /* 1 */ LevelParams { backend: MatcherBackend::Simple, window_log: 17, hash_fill_step: 3, lazy_depth: 0, hc: HC_CONFIG }, + /* 2 */ LevelParams { backend: MatcherBackend::Dfast, window_log: 19, hash_fill_step: 1, lazy_depth: 1, hc: HC_CONFIG }, + /* 3 */ LevelParams { backend: MatcherBackend::Dfast, window_log: 22, hash_fill_step: 1, lazy_depth: 1, hc: HC_CONFIG }, + /* 4 */ LevelParams { backend: MatcherBackend::Dfast, window_log: 22, hash_fill_step: 1, lazy_depth: 1, hc: HC_CONFIG }, + /* 5 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 22, hash_fill_step: 1, lazy_depth: 1, hc: HcConfig { hash_log: 18, chain_log: 17, search_depth: 4, target_len: 32 } }, + /* 6 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 23, hash_fill_step: 1, lazy_depth: 1, hc: HcConfig { hash_log: 19, chain_log: 18, search_depth: 8, target_len: 48 } }, + /* 7 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 23, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 20, chain_log: 19, search_depth: 16, target_len: 48 } }, + /* 8 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 23, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 20, chain_log: 19, search_depth: 24, target_len: 64 } }, + /* 9 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 23, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 21, chain_log: 20, search_depth: 24, target_len: 64 } }, + /*10 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 24, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 21, chain_log: 20, search_depth: 28, target_len: 96 } }, + /*11 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 24, hash_fill_step: 1, lazy_depth: 2, hc: BEST_HC_CONFIG }, + /*12 */ LevelParams { backend: 
MatcherBackend::HashChain, window_log: 25, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 22, chain_log: 21, search_depth: 32, target_len: 128 } }, + /*13 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 25, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 22, chain_log: 21, search_depth: 32, target_len: 160 } }, + /*14 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 25, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 22, chain_log: 22, search_depth: 32, target_len: 192 } }, + /*15 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 26, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 23, chain_log: 22, search_depth: 32, target_len: 192 } }, + /*16 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 26, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 23, chain_log: 22, search_depth: 32, target_len: 256 } }, + /*17 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 26, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 23, chain_log: 23, search_depth: 32, target_len: 256 } }, + /*18 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 26, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 23, chain_log: 23, search_depth: 32, target_len: 256 } }, + /*19 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 26, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 23, chain_log: 23, search_depth: 32, target_len: 256 } }, + /*20 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 26, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 23, chain_log: 23, search_depth: 32, target_len: 256 } }, + /*21 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 26, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 23, chain_log: 23, search_depth: 32, target_len: 256 } }, + /*22 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 26, hash_fill_step: 1, 
lazy_depth: 2, hc: HcConfig { hash_log: 23, chain_log: 23, search_depth: 32, target_len: 256 } }, +]; + +/// Resolve a [`CompressionLevel`] to internal tuning parameters. +fn resolve_level_params(level: CompressionLevel) -> LevelParams { + match level { + CompressionLevel::Uncompressed => LevelParams { + backend: MatcherBackend::Simple, + window_log: 17, + hash_fill_step: 1, + lazy_depth: 0, + hc: HC_CONFIG, + }, + CompressionLevel::Fastest => LEVEL_TABLE[0], + CompressionLevel::Default => LEVEL_TABLE[2], + CompressionLevel::Better => LEVEL_TABLE[6], + CompressionLevel::Best => LEVEL_TABLE[10], + CompressionLevel::Level(n) => { + if n > 0 { + let idx = (n as usize).min(CompressionLevel::MAX_LEVEL as usize) - 1; + LEVEL_TABLE[idx] + } else if n == 0 { + // Level 0 = default, matching C zstd semantics. + LEVEL_TABLE[2] + } else { + // Negative levels: ultra-fast with the Simple backend. + // Acceleration grows with magnitude, expressed as larger + // hash_fill_step (fewer positions indexed). + let acceleration = ((-n) as usize).min(131072); + let step = (acceleration + 3).min(128); + LevelParams { + backend: MatcherBackend::Simple, + window_log: 17, + hash_fill_step: step, + lazy_depth: 0, + hc: HC_CONFIG, + } + } + } + } +} + #[derive(Copy, Clone, Debug, PartialEq, Eq)] enum MatcherBackend { Simple, @@ -83,7 +165,6 @@ pub struct MatchGeneratorDriver { active_backend: MatcherBackend, slice_size: usize, base_slice_size: usize, - base_window_size: usize, // Frame header window size must stay at the configured live-window budget. // Dictionary retention expands internal matcher capacity only. 
reported_window_size: usize, @@ -106,45 +187,13 @@ impl MatchGeneratorDriver { active_backend: MatcherBackend::Simple, slice_size, base_slice_size: slice_size, - base_window_size: max_window_size, reported_window_size: max_window_size, dictionary_retained_budget: 0, } } - fn level_config(&self, level: CompressionLevel) -> (MatcherBackend, usize, usize, usize) { - match level { - CompressionLevel::Uncompressed => ( - MatcherBackend::Simple, - self.base_slice_size, - self.base_window_size, - 1, - ), - CompressionLevel::Fastest => ( - MatcherBackend::Simple, - self.base_slice_size, - self.base_window_size, - FAST_HASH_FILL_STEP, - ), - CompressionLevel::Default => ( - MatcherBackend::Dfast, - self.base_slice_size, - DFAST_DEFAULT_WINDOW_SIZE, - 1, - ), - CompressionLevel::Better => ( - MatcherBackend::HashChain, - self.base_slice_size, - BETTER_DEFAULT_WINDOW_SIZE, - 1, - ), - CompressionLevel::Best => ( - MatcherBackend::HashChain, - self.base_slice_size, - BEST_DEFAULT_WINDOW_SIZE, - 1, - ), - } + fn level_params(level: CompressionLevel) -> LevelParams { + resolve_level_params(level) } fn dfast_matcher(&self) -> &DfastMatchGenerator { @@ -248,9 +297,10 @@ impl Matcher for MatchGeneratorDriver { } fn reset(&mut self, level: CompressionLevel) { - let (backend, slice_size, max_window_size, hash_fill_step) = self.level_config(level); + let params = Self::level_params(level); + let max_window_size = 1usize << params.window_log; self.dictionary_retained_budget = 0; - if self.active_backend != backend { + if self.active_backend != params.backend { match self.active_backend { MatcherBackend::Simple => { let vec_pool = &mut self.vec_pool; @@ -288,15 +338,15 @@ impl Matcher for MatchGeneratorDriver { } } - self.active_backend = backend; - self.slice_size = slice_size; + self.active_backend = params.backend; + self.slice_size = self.base_slice_size; self.reported_window_size = max_window_size; match self.active_backend { MatcherBackend::Simple => { let vec_pool = &mut 
self.vec_pool; let suffix_pool = &mut self.suffix_pool; self.match_generator.max_window_size = max_window_size; - self.match_generator.hash_fill_step = hash_fill_step; + self.match_generator.hash_fill_step = params.hash_fill_step; self.match_generator.reset(|mut data, mut suffixes| { data.resize(data.capacity(), 0); vec_pool.push(data); @@ -310,7 +360,7 @@ impl Matcher for MatchGeneratorDriver { .dfast_match_generator .get_or_insert_with(|| DfastMatchGenerator::new(max_window_size)); dfast.max_window_size = max_window_size; - dfast.lazy_depth = 1; + dfast.lazy_depth = params.lazy_depth; let vec_pool = &mut self.vec_pool; dfast.reset(|mut data| { data.resize(data.capacity(), 0); @@ -322,11 +372,8 @@ impl Matcher for MatchGeneratorDriver { .hc_match_generator .get_or_insert_with(|| HcMatchGenerator::new(max_window_size)); hc.max_window_size = max_window_size; - hc.lazy_depth = 2; - match level { - CompressionLevel::Best => hc.configure(BEST_HC_CONFIG), - _ => hc.configure(HC_CONFIG), - } + hc.lazy_depth = params.lazy_depth; + hc.configure(params.hc); let vec_pool = &mut self.vec_pool; hc.reset(|mut data| { data.resize(data.capacity(), 0); @@ -1975,7 +2022,7 @@ fn dfast_matches_roundtrip_multi_block_pattern() { let first_block: Vec = pattern.iter().copied().cycle().take(128 * 1024).collect(); let second_block: Vec = pattern.iter().copied().cycle().take(128 * 1024).collect(); - let mut matcher = DfastMatchGenerator::new(DFAST_DEFAULT_WINDOW_SIZE); + let mut matcher = DfastMatchGenerator::new(1 << 22); let replay_sequence = |decoded: &mut Vec, seq: Sequence<'_>| match seq { Sequence::Literals { literals } => decoded.extend_from_slice(literals), Sequence::Triple { @@ -2009,7 +2056,7 @@ fn driver_switches_backends_and_initializes_dfast_via_reset() { let mut driver = MatchGeneratorDriver::new(32, 2); driver.reset(CompressionLevel::Default); - assert_eq!(driver.window_size(), DFAST_DEFAULT_WINDOW_SIZE as u64); + assert_eq!(driver.window_size(), (1u64 << 22)); let mut first 
= driver.get_next_space(); first[..12].copy_from_slice(b"abcabcabcabc"); @@ -2042,7 +2089,7 @@ fn driver_switches_backends_and_initializes_dfast_via_reset() { assert_eq!(reconstructed, b"abcabcabcabcabcabcabcabc"); driver.reset(CompressionLevel::Fastest); - assert_eq!(driver.window_size(), 64); + assert_eq!(driver.window_size(), (1u64 << 17)); } #[test] @@ -2051,7 +2098,7 @@ fn driver_best_to_fastest_releases_oversized_hc_tables() { // Initialize at Best — allocates large HC tables (2M hash, 1M chain). driver.reset(CompressionLevel::Best); - assert_eq!(driver.window_size(), BEST_DEFAULT_WINDOW_SIZE as u64); + assert_eq!(driver.window_size(), (1u64 << 24)); // Feed data so tables are actually allocated via ensure_tables(). let mut space = driver.get_next_space(); @@ -2062,7 +2109,7 @@ fn driver_best_to_fastest_releases_oversized_hc_tables() { // Switch to Fastest — must release HC tables. driver.reset(CompressionLevel::Fastest); - assert_eq!(driver.window_size(), 64); + assert_eq!(driver.window_size(), (1u64 << 17)); // HC matcher should have empty tables after backend switch. let hc = driver.hc_match_generator.as_ref().unwrap(); @@ -2082,7 +2129,7 @@ fn driver_better_to_best_resizes_hc_tables() { // Initialize at Better — allocates small HC tables (1M hash, 512K chain). driver.reset(CompressionLevel::Better); - assert_eq!(driver.window_size(), BETTER_DEFAULT_WINDOW_SIZE as u64); + assert_eq!(driver.window_size(), (1u64 << 23)); let mut space = driver.get_next_space(); space[..12].copy_from_slice(b"abcabcabcabc"); @@ -2096,7 +2143,7 @@ fn driver_better_to_best_resizes_hc_tables() { // Switch to Best — must resize to larger tables. driver.reset(CompressionLevel::Best); - assert_eq!(driver.window_size(), BEST_DEFAULT_WINDOW_SIZE as u64); + assert_eq!(driver.window_size(), (1u64 << 24)); // Feed data to trigger ensure_tables with new sizes. 
let mut space = driver.get_next_space(); @@ -2300,6 +2347,10 @@ fn dfast_prime_with_dictionary_counts_four_byte_tail_budget() { fn prime_with_dictionary_budget_shrinks_after_simple_eviction() { let mut driver = MatchGeneratorDriver::new(8, 1); driver.reset(CompressionLevel::Fastest); + // Use a small live window so dictionary-primed slices are evicted + // quickly and budget retirement can be asserted deterministically. + driver.match_generator.max_window_size = 8; + driver.reported_window_size = 8; let base_window = driver.match_generator.max_window_size; driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]); @@ -2443,7 +2494,7 @@ fn fastest_reset_uses_interleaved_hash_fill_step() { // happened and the lazy_depth is configured correctly. driver.reset(CompressionLevel::Better); assert_eq!(driver.active_backend, MatcherBackend::HashChain); - assert_eq!(driver.window_size(), BETTER_DEFAULT_WINDOW_SIZE as u64); + assert_eq!(driver.window_size(), (1u64 << 23)); assert_eq!(driver.hc_matcher().lazy_depth, 2); } @@ -2723,7 +2774,7 @@ fn dfast_trim_to_window_callback_reports_evicted_len_not_capacity() { #[test] fn dfast_inserts_tail_positions_for_next_block_matching() { - let mut matcher = DfastMatchGenerator::new(DFAST_DEFAULT_WINDOW_SIZE); + let mut matcher = DfastMatchGenerator::new(1 << 22); matcher.add_data(b"012345bcdea".to_vec(), |_| {}); let mut history = Vec::new(); diff --git a/zstd/src/encoding/mod.rs b/zstd/src/encoding/mod.rs index d075b9bc..ee8fd941 100644 --- a/zstd/src/encoding/mod.rs +++ b/zstd/src/encoding/mod.rs @@ -45,7 +45,7 @@ pub fn compress_to_vec(source: R, level: CompressionLevel) -> Vec { /// The compression mode used impacts the speed of compression, /// and resulting compression ratios. Faster compression will result /// in worse compression ratios, and vice versa. 
-#[derive(Copy, Clone)] +#[derive(Copy, Clone, Debug)] pub enum CompressionLevel { /// This level does not compress the data at all, and simply wraps /// it in a Zstandard frame. @@ -88,6 +88,41 @@ pub enum CompressionLevel { /// Prefer [`CompressionLevel::Default`] for very large single-frame /// streams until table rebasing is implemented. Best, + /// Numeric compression level. + /// + /// Levels 1–22 correspond to the C zstd level numbering. Higher values + /// produce smaller output at the cost of more CPU time. Negative values + /// select ultra-fast modes that trade ratio for speed. Level 0 is + /// treated as [`DEFAULT_LEVEL`](Self::DEFAULT_LEVEL), matching C zstd + /// semantics. + /// + /// Named variants map to specific numeric levels: + /// [`Fastest`](Self::Fastest) = 1, [`Default`](Self::Default) = 3, + /// [`Better`](Self::Better) = 7, [`Best`](Self::Best) = 11. + /// + /// Levels above 11 use progressively larger windows and deeper search + /// with the lazy2 hash-chain backend. Levels that require strategies + /// this crate has not yet implemented (btopt, btultra) are approximated + /// with the closest available matcher. + Level(i32), +} + +impl CompressionLevel { + /// The minimum supported numeric compression level (ultra-fast mode). + pub const MIN_LEVEL: i32 = -131072; + /// The maximum supported numeric compression level. + pub const MAX_LEVEL: i32 = 22; + /// The default numeric compression level (equivalent to [`Default`](Self::Default)). + pub const DEFAULT_LEVEL: i32 = 3; + + /// Create a compression level from a numeric value. + /// + /// Wraps the raw integer in [`Level`](Self::Level). Values outside + /// [`MIN_LEVEL`](Self::MIN_LEVEL)..=[`MAX_LEVEL`](Self::MAX_LEVEL) are + /// silently clamped during parameter resolution. 
+ pub const fn from_level(level: i32) -> Self { + CompressionLevel::Level(level) + } } /// Trait used by the encoder that users can use to extend the matching facilities with their own algorithm diff --git a/zstd/src/encoding/streaming_encoder.rs b/zstd/src/encoding/streaming_encoder.rs index 6f2e3f0a..d2541339 100644 --- a/zstd/src/encoding/streaming_encoder.rs +++ b/zstd/src/encoding/streaming_encoder.rs @@ -246,8 +246,9 @@ impl StreamingEncoder { CompressionLevel::Fastest | CompressionLevel::Default | CompressionLevel::Better - | CompressionLevel::Best => self.state.matcher.get_next_space(), - _ => Vec::new(), + | CompressionLevel::Best + | CompressionLevel::Level(_) => self.state.matcher.get_next_space(), + CompressionLevel::Uncompressed => Vec::new(), }; space.clear(); if space.capacity() > block_capacity { @@ -303,7 +304,8 @@ impl StreamingEncoder { | CompressionLevel::Fastest | CompressionLevel::Default | CompressionLevel::Better - | CompressionLevel::Best => Ok(()), + | CompressionLevel::Best + | CompressionLevel::Level(_) => Ok(()), } } @@ -338,7 +340,8 @@ impl StreamingEncoder { CompressionLevel::Fastest | CompressionLevel::Default | CompressionLevel::Better - | CompressionLevel::Best => { + | CompressionLevel::Best + | CompressionLevel::Level(_) => { let block = raw_block.take().expect("raw block missing"); debug_assert!(!block.is_empty(), "empty blocks handled above"); compress_block_encoded(&mut self.state, last_block, block, &mut encoded); diff --git a/zstd/src/tests/roundtrip_integrity.rs b/zstd/src/tests/roundtrip_integrity.rs index 29e9d8f3..0aedc16c 100644 --- a/zstd/src/tests/roundtrip_integrity.rs +++ b/zstd/src/tests/roundtrip_integrity.rs @@ -544,3 +544,129 @@ fn roundtrip_best_level_streaming_multi_block() { let data = generate_compressible(5555, 512 * 1024); assert_eq!(roundtrip_best_streaming(&data), data); } + +// ─── Numeric compression levels (CompressionLevel::Level) ───────── + +/// `from_level(3)` must be equivalent to `Default` — 
same compressed output. +#[test] +fn numeric_level_3_matches_default() { + let data = generate_compressible(9000, 64 * 1024); + let default = compress_to_vec(&data[..], CompressionLevel::Default); + let level_3 = compress_to_vec(&data[..], CompressionLevel::from_level(3)); + assert_eq!( + default, level_3, + "Level(3) output must be identical to Default" + ); +} + +/// `from_level(1)` must be equivalent to `Fastest`. +#[test] +fn numeric_level_1_matches_fastest() { + let data = generate_compressible(9001, 64 * 1024); + let fastest = compress_to_vec(&data[..], CompressionLevel::Fastest); + let level_1 = compress_to_vec(&data[..], CompressionLevel::from_level(1)); + assert_eq!( + fastest, level_1, + "Level(1) output must be identical to Fastest" + ); +} + +/// `from_level(7)` must be equivalent to `Better`. +#[test] +fn numeric_level_7_matches_better() { + let data = generate_compressible(9002, 64 * 1024); + let better = compress_to_vec(&data[..], CompressionLevel::Better); + let level_7 = compress_to_vec(&data[..], CompressionLevel::from_level(7)); + assert_eq!( + better, level_7, + "Level(7) output must be identical to Better" + ); +} + +/// `from_level(11)` must be equivalent to `Best`. +#[test] +fn numeric_level_11_matches_best() { + let data = generate_compressible(9003, 64 * 1024); + let best = compress_to_vec(&data[..], CompressionLevel::Best); + let level_11 = compress_to_vec(&data[..], CompressionLevel::from_level(11)); + assert_eq!(best, level_11, "Level(11) output must be identical to Best"); +} + +/// `from_level(0)` maps to default compression (level 3), matching C zstd. 
+#[test] +fn numeric_level_0_is_default_compression() { + let data = generate_compressible(9004, 64 * 1024); + let level_0 = compress_to_vec(&data[..], CompressionLevel::from_level(0)); + let level_3 = compress_to_vec(&data[..], CompressionLevel::from_level(3)); + assert_eq!(level_0, level_3, "Level(0) should map to default (level 3)"); +} + +/// All 22 positive levels produce valid output that round-trips correctly. +#[test] +fn all_22_levels_roundtrip() { + let data = generate_compressible(9100, 32 * 1024); + for level in 1..=22 { + let result = roundtrip_at_level(&data, CompressionLevel::from_level(level)); + assert_eq!(data, result, "Roundtrip failed for Level({level})"); + } +} + +/// Negative levels produce valid compressed output (ultra-fast mode). +#[test] +fn negative_levels_roundtrip() { + let data = generate_compressible(9200, 32 * 1024); + for level in [-1, -2, -3, -5] { + let result = roundtrip_at_level(&data, CompressionLevel::from_level(level)); + assert_eq!(data, result, "Roundtrip failed for Level({level})"); + } +} + +/// Higher levels should generally not produce *larger* output than lower levels +/// on reasonably compressible data. +#[test] +fn levels_monotonic_compression_ratio() { + let data = generate_compressible(9300, 64 * 1024); + let mut prev_size = usize::MAX; + for level in [1, 3, 7, 11] { + let compressed = compress_to_vec(&data[..], CompressionLevel::from_level(level)); + assert!( + compressed.len() <= prev_size, + "Level {level} produced larger output ({}) than a lower level ({prev_size})", + compressed.len(), + ); + prev_size = compressed.len(); + } +} + +/// Numeric levels work with the streaming encoder. 
+#[test] +fn numeric_level_streaming_roundtrip() { + use crate::encoding::StreamingEncoder; + use crate::io::Write; + + let data = generate_compressible(9400, 200 * 1024); + for level in [1, 3, 5, 7, 9, 11, -1] { + let mut encoder = StreamingEncoder::new(Vec::new(), CompressionLevel::from_level(level)); + for chunk in data.chunks(4096) { + encoder.write_all(chunk).unwrap(); + } + let compressed = encoder.finish().unwrap(); + let mut decoder = StreamingDecoder::new(compressed.as_slice()).unwrap(); + let mut result = Vec::new(); + decoder.read_to_end(&mut result).unwrap(); + assert_eq!( + data, result, + "Streaming roundtrip failed for Level({level})" + ); + } +} + +/// Values beyond MAX_LEVEL are clamped — they must still produce valid output. +#[test] +fn out_of_range_level_clamped() { + let data = generate_compressible(9500, 16 * 1024); + let result = roundtrip_at_level(&data, CompressionLevel::from_level(100)); + assert_eq!(data, result, "Clamped Level(100) must still roundtrip"); + let result = roundtrip_at_level(&data, CompressionLevel::from_level(-200000)); + assert_eq!(data, result, "Clamped Level(-200000) must still roundtrip"); +} From bd152e0772bcad0e11a1b50c2964cbcbdf0e1240 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 14:13:37 +0300 Subject: [PATCH 02/18] fix(cli): align level 0 with library API and prevent i32::MIN overflow - Use saturating_abs() instead of (-n) cast to prevent overflow when n == i32::MIN in negative level resolution - Use CompressionLevel::MIN_LEVEL constant instead of hardcoded 131072 - CLI level 0 now maps to default compression (level 3), consistent with CompressionLevel::from_level(0) and C zstd semantics - Add --store flag for uncompressed zstd frames (replaces old level 0 = Uncompressed behavior) - Remove "C zstd numbering" claim from CLI help text since the CLI extends standard numbering with --store --- cli/src/main.rs | 35 +++++++++++++++++++--------- zstd/src/encoding/match_generator.rs | 3 ++- 2 files 
changed, 26 insertions(+), 12 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index cfd652d1..812f1578 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -34,15 +34,20 @@ enum Commands { /// Where the compressed file is written /// [default: .zst] output_file: Option, - /// Compression level using C zstd numbering (higher = smaller, slower). + /// Compression level (higher = smaller, slower). /// - /// - 0: Uncompressed (no compression, raw zstd frame) + /// Numeric levels follow the zstd convention where 0 means + /// "use the default level" (currently 3). + /// + /// - 0: Default (same as 3) /// - 1: Fastest (fast hash, ~zstd level 1) /// - 3: Default (dfast, ~zstd level 3) /// - 7: Better (lazy2, ~zstd level 7) /// - 11: Best (deep lazy2, ~zstd level 11) /// - Negative: ultra-fast modes (less compression, more speed) /// - 12-22: progressively higher ratio (capped at lazy2 backend) + /// + /// Use --store to write an uncompressed zstd frame. #[arg( short, long, @@ -53,6 +58,12 @@ enum Commands { allow_hyphen_values = true, )] level: i32, + /// Write an uncompressed zstd frame (no compression). + /// + /// When set, --level is ignored and the input is wrapped in a + /// raw zstd frame without any compression. 
+ #[arg(long)] + store: bool, }, Decompress { /// .zst archive to decompress @@ -83,9 +94,10 @@ fn main() -> color_eyre::Result<()> { input_file, output_file, level, + store, } => { let output_file = output_file.unwrap_or_else(|| add_extension(&input_file, ".zst")); - compress(input_file, output_file, level)?; + compress(input_file, output_file, level, store)?; } Commands::Decompress { input_file, @@ -103,11 +115,12 @@ fn main() -> color_eyre::Result<()> { Ok(()) } -fn compress(input: PathBuf, output: PathBuf, level: i32) -> color_eyre::Result<()> { +fn compress(input: PathBuf, output: PathBuf, level: i32, store: bool) -> color_eyre::Result<()> { info!("compressing {input:?} to {output:?}"); - let compression_level: structured_zstd::encoding::CompressionLevel = match level { - 0 => CompressionLevel::Uncompressed, - n => CompressionLevel::from_level(n), + let compression_level = if store { + CompressionLevel::Uncompressed + } else { + CompressionLevel::from_level(level) }; ensure_distinct_paths(&input, &output)?; ensure_regular_output_destination(&output)?; @@ -425,7 +438,7 @@ mod tests { let input = std::env::temp_dir().join(format!("structured-zstd-cli-alias-{unique}.txt")); fs::write(&input, b"streaming-cli-alias-check").unwrap(); - let err = compress(input.clone(), input.clone(), 3).unwrap_err(); + let err = compress(input.clone(), input.clone(), 3, false).unwrap_err(); let message = format!("{err:#}"); assert!( message.contains("input and output"), @@ -444,7 +457,7 @@ mod tests { fs::write(&input, b"streaming-cli-hardlink-check").unwrap(); fs::hard_link(&input, &output).unwrap(); - let err = compress(input.clone(), output.clone(), 3).unwrap_err(); + let err = compress(input.clone(), output.clone(), 3, false).unwrap_err(); let message = format!("{err:#}"); assert!( message.contains("input and output"), @@ -465,7 +478,7 @@ mod tests { let output = std::env::temp_dir().join(format!("structured-zstd-cli-missing-output-{unique}.zst")); - let err = 
compress(missing_input, output.clone(), 3).unwrap_err(); + let err = compress(missing_input, output.clone(), 3, false).unwrap_err(); let message = format!("{err:#}"); assert!( message.contains("failed to open input file"), @@ -483,7 +496,7 @@ mod tests { let output = dir.join("existing-dir"); fs::create_dir(&output).unwrap(); - let err = compress(input, output.clone(), 3).unwrap_err(); + let err = compress(input, output.clone(), 3, false).unwrap_err(); let message = format!("{err:#}"); assert!( message.contains("not a regular file"), diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index d3e1cb52..ef594a6e 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -134,7 +134,8 @@ fn resolve_level_params(level: CompressionLevel) -> LevelParams { // Negative levels: ultra-fast with the Simple backend. // Acceleration grows with magnitude, expressed as larger // hash_fill_step (fewer positions indexed). - let acceleration = ((-n) as usize).min(131072); + let acceleration = + (n.saturating_abs() as usize).min((-CompressionLevel::MIN_LEVEL) as usize); let step = (acceleration + 3).min(128); LevelParams { backend: MatcherBackend::Simple, From e20949590f34d35dae0707fcbc367c123dd05bba Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 14:36:52 +0300 Subject: [PATCH 03/18] feat(encoding): source-size-aware parameter selection and review fixes Source-size-aware selection (issue #21 requirement): - Add Matcher::set_source_size_hint() trait method (default no-op) - MatchGeneratorDriver stores hint and adjusts window_log + HC table sizes for small inputs, following C zstd source-size-class behavior - FrameCompressor::set_source_size_hint() passes through to matcher - StreamingEncoder::set_pledged_content_size() also sets the hint - StreamingEncoder::set_source_size_hint() for hint without enforcement - For small inputs, window and hash tables are capped proportionally to source 
size, avoiding multi-MB allocations for tiny payloads Review fixes: - Derive Level(0) index from DEFAULT_LEVEL constant (Copilot #5) - Tighten test doc comment to match strict assertion (Copilot #6) - Widen CLI level range to MIN_LEVEL..=MAX_LEVEL (CodeRabbit #7) - Use saturating_abs() for negative level overflow (Copilot #1, prev) --- cli/src/main.rs | 13 ++- zstd/src/encoding/frame_compressor.rs | 9 +++ zstd/src/encoding/match_generator.rs | 62 +++++++++++++-- zstd/src/encoding/mod.rs | 9 +++ zstd/src/encoding/streaming_encoder.rs | 20 +++++ zstd/src/tests/roundtrip_integrity.rs | 105 ++++++++++++++++++++++++- 6 files changed, 207 insertions(+), 11 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 812f1578..9f191657 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -53,7 +53,9 @@ enum Commands { long, value_name = "LEVEL", default_value_t = 3, - value_parser = clap::value_parser!(i32).range(-5..=22), + value_parser = clap::value_parser!(i32).range( + (CompressionLevel::MIN_LEVEL as i64)..=(CompressionLevel::MAX_LEVEL as i64) + ), verbatim_doc_comment, allow_hyphen_values = true, )] @@ -425,7 +427,14 @@ mod tests { #[test] fn cli_rejects_too_negative_compression_level() { - let parse = Cli::try_parse_from(["structured-zstd", "compress", "in.bin", "--level", "-6"]); + // MIN_LEVEL is -131072; anything below that should be rejected + let parse = Cli::try_parse_from([ + "structured-zstd", + "compress", + "in.bin", + "--level", + "-131073", + ]); assert!(parse.is_err()); } diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs index c4cdc951..bd5ad612 100644 --- a/zstd/src/encoding/frame_compressor.rs +++ b/zstd/src/encoding/frame_compressor.rs @@ -157,6 +157,15 @@ impl FrameCompressor { self.compressed_data.replace(compressed_data) } + /// Provide a hint about the total uncompressed size for the next frame. 
+ /// + /// When set, the encoder selects smaller hash tables and windows for + /// small inputs, matching the C zstd source-size-class behavior. + /// Must be called before [`compress`](Self::compress). + pub fn set_source_size_hint(&mut self, size: u64) { + self.state.matcher.set_source_size_hint(size); + } + /// Compress the uncompressed data from the provided source as one Zstd frame and write it to the provided drain /// /// This will repeatedly call [Read::read] on the source to fill up blocks until the source returns 0 on the read call. diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index ef594a6e..5dba03a7 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -109,9 +109,44 @@ const LEVEL_TABLE: [LevelParams; 22] = [ /*22 */ LevelParams { backend: MatcherBackend::HashChain, window_log: 26, hash_fill_step: 1, lazy_depth: 2, hc: HcConfig { hash_log: 23, chain_log: 23, search_depth: 32, target_len: 256 } }, ]; -/// Resolve a [`CompressionLevel`] to internal tuning parameters. -fn resolve_level_params(level: CompressionLevel) -> LevelParams { - match level { +/// Smallest window_log the encoder will use regardless of source size. +const MIN_WINDOW_LOG: u8 = 10; + +/// Adjust level parameters for a known source size. +/// +/// Follows the C zstd `clevels.h` approach: for small inputs, cap +/// window_log (and hash/chain for HC) so the encoder doesn't allocate +/// oversized tables. The four C size classes are: +/// >256 KiB (default table), ≤256 KiB, ≤128 KiB, ≤16 KiB. +fn adjust_params_for_source_size(mut params: LevelParams, src_size: u64) -> LevelParams { + if src_size == 0 { + return params; + } + // Cap window_log so the window doesn't exceed the source. + // ceil_log2(src_size): the minimum number of bits to represent src_size. 
+ let src_log = 64 - (src_size - 1).leading_zeros(); // ceil_log2 + let src_log = (src_log as u8).max(MIN_WINDOW_LOG); + if src_log < params.window_log { + params.window_log = src_log; + } + // For HC backend: also cap hash_log and chain_log so tables are + // proportional to the source, avoiding multi-MB allocations for + // tiny inputs. + if params.backend == MatcherBackend::HashChain { + if (src_log + 2) < params.hc.hash_log as u8 { + params.hc.hash_log = (src_log + 2) as usize; + } + if (src_log + 1) < params.hc.chain_log as u8 { + params.hc.chain_log = (src_log + 1) as usize; + } + } + params +} + +/// Resolve a [`CompressionLevel`] to internal tuning parameters, +/// optionally adjusted for a known source size. +fn resolve_level_params(level: CompressionLevel, source_size: Option<u64>) -> LevelParams { + let params = match level { CompressionLevel::Uncompressed => LevelParams { backend: MatcherBackend::Simple, window_log: 17, @@ -129,7 +164,7 @@ fn resolve_level_params(level: CompressionLevel) -> LevelParams { LEVEL_TABLE[idx] } else if n == 0 { // Level 0 = default, matching C zstd semantics. - LEVEL_TABLE[2] + LEVEL_TABLE[CompressionLevel::DEFAULT_LEVEL as usize - 1] } else { // Negative levels: ultra-fast with the Simple backend. // Acceleration grows with magnitude, expressed as larger @@ -146,6 +181,11 @@ fn resolve_level_params(level: CompressionLevel) -> LevelParams { } } } + }; + if let Some(size) = source_size { + adjust_params_for_source_size(params, size) + } else { + params } } @@ -172,6 +212,8 @@ pub struct MatchGeneratorDriver { // Tracks currently retained bytes that originated from primed dictionary // history and have not been evicted yet. dictionary_retained_budget: usize, + // Source size hint for next frame (set via set_source_size_hint, cleared on reset).
+ source_size_hint: Option<u64>, } impl MatchGeneratorDriver { @@ -190,11 +232,12 @@ impl MatchGeneratorDriver { base_slice_size: slice_size, reported_window_size: max_window_size, dictionary_retained_budget: 0, + source_size_hint: None, } } - fn level_params(level: CompressionLevel) -> LevelParams { - resolve_level_params(level) + fn level_params(level: CompressionLevel, source_size: Option<u64>) -> LevelParams { + resolve_level_params(level, source_size) } fn dfast_matcher(&self) -> &DfastMatchGenerator { @@ -297,8 +340,13 @@ impl Matcher for MatchGeneratorDriver { true } + fn set_source_size_hint(&mut self, size: u64) { + self.source_size_hint = Some(size); + } + fn reset(&mut self, level: CompressionLevel) { - let params = Self::level_params(level); + let hint = self.source_size_hint.take(); + let params = Self::level_params(level, hint); let max_window_size = 1usize << params.window_log; self.dictionary_retained_budget = 0; if self.active_backend != params.backend { diff --git a/zstd/src/encoding/mod.rs b/zstd/src/encoding/mod.rs index ee8fd941..18bd7f74 100644 --- a/zstd/src/encoding/mod.rs +++ b/zstd/src/encoding/mod.rs @@ -153,6 +153,15 @@ pub trait Matcher { fn start_matching(&mut self, handle_sequence: impl for<'a> FnMut(Sequence<'a>)); /// Reset this matcher so it can be used for the next new frame fn reset(&mut self, level: CompressionLevel); + /// Provide a hint about the total uncompressed size for the next frame. + /// + /// Implementations may use this to select smaller hash tables and windows + /// for small inputs, matching the C zstd source-size-class behavior. + /// Called before [`reset`](Self::reset) when the caller knows the input + /// size (e.g. from pledged content size or file metadata). + /// + /// The default implementation is a no-op for custom matchers. + fn set_source_size_hint(&mut self, _size: u64) {} /// Prime matcher state with dictionary history before compressing the next frame.
/// Default implementation is a no-op for custom matchers that do not support this. fn prime_with_dictionary(&mut self, _dict_content: &[u8], _offset_hist: [u32; 3]) {} diff --git a/zstd/src/encoding/streaming_encoder.rs b/zstd/src/encoding/streaming_encoder.rs index d2541339..eeebe419 100644 --- a/zstd/src/encoding/streaming_encoder.rs +++ b/zstd/src/encoding/streaming_encoder.rs @@ -93,6 +93,26 @@ impl StreamingEncoder { )); } self.pledged_content_size = Some(size); + // Also use pledged size as source-size hint so the matcher + // can select smaller tables for small inputs. + self.state.matcher.set_source_size_hint(size); + Ok(()) + } + + /// Provide a hint about the total uncompressed size for the next frame. + /// + /// Unlike [`set_pledged_content_size`](Self::set_pledged_content_size), + /// this does **not** enforce that exactly `size` bytes are written; it + /// only optimises matcher parameters for small inputs. Must be called + /// before the first [`write`](Write::write). + pub fn set_source_size_hint(&mut self, size: u64) -> Result<(), Error> { + self.ensure_open()?; + if self.frame_started { + return Err(invalid_input_error( + "source size hint must be set before the first write", + )); + } + self.state.matcher.set_source_size_hint(size); Ok(()) } diff --git a/zstd/src/tests/roundtrip_integrity.rs b/zstd/src/tests/roundtrip_integrity.rs index 0aedc16c..61204b21 100644 --- a/zstd/src/tests/roundtrip_integrity.rs +++ b/zstd/src/tests/roundtrip_integrity.rs @@ -621,8 +621,8 @@ fn negative_levels_roundtrip() { } } -/// Higher levels should generally not produce *larger* output than lower levels -/// on reasonably compressible data. +/// For this reasonably compressible fixture, the sampled higher levels are +/// expected not to produce larger output than the lower sampled levels. 
#[test] fn levels_monotonic_compression_ratio() { let data = generate_compressible(9300, 64 * 1024); @@ -670,3 +670,104 @@ fn out_of_range_level_clamped() { let result = roundtrip_at_level(&data, CompressionLevel::from_level(-200000)); assert_eq!(data, result, "Clamped Level(-200000) must still roundtrip"); } + +// ─── Source-size-aware selection ─────────────────────────────────── + +/// Small input with source size hint should produce valid output. +#[test] +fn source_size_hint_small_input_roundtrip() { + let data = generate_compressible(9600, 4 * 1024); // 4 KiB + let compressed = { + let mut compressor = FrameCompressor::new(CompressionLevel::from_level(7)); + compressor.set_source_size_hint(data.len() as u64); + compressor.set_source(data.as_slice()); + let mut out = Vec::new(); + compressor.set_drain(&mut out); + compressor.compress(); + out + }; + let mut decoder = StreamingDecoder::new(compressed.as_slice()).unwrap(); + let mut result = Vec::new(); + decoder.read_to_end(&mut result).unwrap(); + assert_eq!(data, result, "Small input with size hint must roundtrip"); +} + +/// Source size hint should reduce compressed output overhead for small inputs +/// by avoiding oversized windows/tables. 
+#[test] +fn source_size_hint_reduces_window_for_small_input() { + let data = generate_compressible(9601, 1024); // 1 KiB + // Without hint: uses full level-11 window (16 MiB) + let no_hint = compress_to_vec(&data[..], CompressionLevel::from_level(11)); + // With hint: should use smaller window + let with_hint = { + let mut compressor = FrameCompressor::new(CompressionLevel::from_level(11)); + compressor.set_source_size_hint(data.len() as u64); + compressor.set_source(data.as_slice()); + let mut out = Vec::new(); + compressor.set_drain(&mut out); + compressor.compress(); + out + }; + // Both must decompress correctly + let mut decoder = StreamingDecoder::new(no_hint.as_slice()).unwrap(); + let mut r = Vec::new(); + decoder.read_to_end(&mut r).unwrap(); + assert_eq!(data, r); + + let mut decoder = StreamingDecoder::new(with_hint.as_slice()).unwrap(); + let mut r = Vec::new(); + decoder.read_to_end(&mut r).unwrap(); + assert_eq!(data, r); + + // With hint should produce output no larger than without + // (smaller window descriptor in frame header, similar or identical blocks) + assert!( + with_hint.len() <= no_hint.len(), + "Size hint should not produce larger output: hint={} no_hint={}", + with_hint.len(), + no_hint.len(), + ); +} + +/// Streaming encoder with pledged content size automatically uses source size hint. 
+#[test] +fn streaming_pledged_size_uses_source_hint() { + use crate::encoding::StreamingEncoder; + use crate::io::Write; + + let data = generate_compressible(9602, 2 * 1024); // 2 KiB + let mut encoder = StreamingEncoder::new(Vec::new(), CompressionLevel::from_level(11)); + encoder.set_pledged_content_size(data.len() as u64).unwrap(); + encoder.write_all(&data).unwrap(); + let compressed = encoder.finish().unwrap(); + + let mut decoder = StreamingDecoder::new(compressed.as_slice()).unwrap(); + let mut result = Vec::new(); + decoder.read_to_end(&mut result).unwrap(); + assert_eq!(data, result, "Pledged-size streaming must roundtrip"); +} + +/// All 22 levels produce valid output for a tiny (256 byte) input with size hint. +#[test] +fn all_levels_tiny_input_with_hint() { + let data = generate_compressible(9603, 256); + for level in 1..=22 { + let compressed = { + let mut compressor = FrameCompressor::new(CompressionLevel::from_level(level)); + compressor.set_source_size_hint(data.len() as u64); + compressor.set_source(data.as_slice()); + let mut out = Vec::new(); + compressor.set_drain(&mut out); + compressor.compress(); + out + }; + let mut decoder = StreamingDecoder::new(compressed.as_slice()).unwrap(); + let mut result = Vec::new(); + decoder.read_to_end(&mut result).unwrap(); + assert_eq!( + data, result, + "Tiny input with hint failed for Level({level})" + ); + } +} From 625c1f09eb4b61df227084e57195f47de8ff0506 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 16:59:28 +0300 Subject: [PATCH 04/18] fix(encoding): honor source-size hints in matcher and cli - pass source size from CLI into StreamingEncoder before writing - resize dfast hash tables based on hinted window and clamp slice size to window - strengthen roundtrip tests for i32::MIN clamping and hinted frame window assertions --- cli/src/main.rs | 3 + zstd/src/encoding/match_generator.rs | 98 ++++++++++++++++++++++----- zstd/src/tests/roundtrip_integrity.rs | 27 ++++++-- 3 files 
changed, 104 insertions(+), 24 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 9f191657..cbfbc8e5 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -141,6 +141,9 @@ fn compress(input: PathBuf, output: PathBuf, level: i32, store: bool) -> color_e let compression_result: color_eyre::Result = (|| { let mut encoder = structured_zstd::encoding::StreamingEncoder::new(temporary_output, compression_level); + encoder + .set_source_size_hint(source_size as u64) + .wrap_err("failed to configure source size hint")?; std::io::copy(&mut encoder_input, &mut encoder).wrap_err("streaming compression failed")?; encoder.finish().wrap_err("failed to finalize zstd frame") })(); diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index 5dba03a7..8d40070a 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -73,6 +73,11 @@ struct LevelParams { hc: HcConfig, } +fn dfast_hash_bits_for_window(max_window_size: usize) -> usize { + let window_log = (usize::BITS - 1 - max_window_size.leading_zeros()) as usize; + window_log.clamp(MIN_WINDOW_LOG as usize, DFAST_HASH_BITS) +} + /// Parameter table for numeric compression levels 1–22. /// /// Each entry maps a zstd compression level to the best-available matcher @@ -217,8 +222,10 @@ pub struct MatchGeneratorDriver { } impl MatchGeneratorDriver { - /// slice_size says how big the slices should be that are allocated to work with - /// max_slices_in_window says how many slices should at most be used while looking for matches + /// `slice_size` sets the base block allocation size used for matcher input chunks. + /// `max_slices_in_window` determines the initial window capacity at construction + /// time. Effective window sizing is recalculated on every [`reset`](Self::reset) + /// from the resolved compression level and optional source-size hint. 
pub(crate) fn new(slice_size: usize, max_slices_in_window: usize) -> Self { let max_window_size = max_slices_in_window * slice_size; Self { @@ -388,7 +395,7 @@ impl Matcher for MatchGeneratorDriver { } self.active_backend = params.backend; - self.slice_size = self.base_slice_size; + self.slice_size = self.base_slice_size.min(max_window_size); self.reported_window_size = max_window_size; match self.active_backend { MatcherBackend::Simple => { @@ -410,6 +417,7 @@ impl Matcher for MatchGeneratorDriver { .get_or_insert_with(|| DfastMatchGenerator::new(max_window_size)); dfast.max_window_size = max_window_size; dfast.lazy_depth = params.lazy_depth; + dfast.set_hash_bits(dfast_hash_bits_for_window(max_window_size)); let vec_pool = &mut self.vec_pool; dfast.reset(|mut data| { data.resize(data.capacity(), 0); @@ -1098,6 +1106,7 @@ struct DfastMatchGenerator { offset_hist: [u32; 3], short_hash: Vec<[usize; DFAST_SEARCH_DEPTH]>, long_hash: Vec<[usize; DFAST_SEARCH_DEPTH]>, + hash_bits: usize, // Lazy match lookahead depth (internal tuning parameter). lazy_depth: u8, } @@ -1121,10 +1130,20 @@ impl DfastMatchGenerator { offset_hist: [1, 4, 8], short_hash: Vec::new(), long_hash: Vec::new(), + hash_bits: DFAST_HASH_BITS, lazy_depth: 1, } } + fn set_hash_bits(&mut self, bits: usize) { + let clamped = bits.clamp(MIN_WINDOW_LOG as usize, DFAST_HASH_BITS); + if self.hash_bits != clamped { + self.hash_bits = clamped; + self.short_hash = Vec::new(); + self.long_hash = Vec::new(); + } + } + fn reset(&mut self, mut reuse_space: impl FnMut(Vec)) { self.window_size = 0; self.history.clear(); @@ -1234,14 +1253,13 @@ impl DfastMatchGenerator { } fn ensure_hash_tables(&mut self) { - if self.short_hash.is_empty() { + let table_len = 1usize << self.hash_bits; + if self.short_hash.len() != table_len { // This is intentionally lazy so Fastest/Uncompressed never pay the // ~dfast-level memory cost. 
The current size tracks the issue's // zstd level-3 style parameters rather than a generic low-memory preset. - self.short_hash = - alloc::vec![[DFAST_EMPTY_SLOT; DFAST_SEARCH_DEPTH]; 1 << DFAST_HASH_BITS]; - self.long_hash = - alloc::vec![[DFAST_EMPTY_SLOT; DFAST_SEARCH_DEPTH]; 1 << DFAST_HASH_BITS]; + self.short_hash = alloc::vec![[DFAST_EMPTY_SLOT; DFAST_SEARCH_DEPTH]; table_len]; + self.long_hash = alloc::vec![[DFAST_EMPTY_SLOT; DFAST_SEARCH_DEPTH]; table_len]; } } @@ -1437,7 +1455,7 @@ impl DfastMatchGenerator { let idx = pos - self.history_abs_start; let short = { let concat = self.live_history(); - (idx + 4 <= concat.len()).then(|| Self::hash4(&concat[idx..])) + (idx + 4 <= concat.len()).then(|| self.hash4(&concat[idx..])) }; if let Some(short) = short { let bucket = &mut self.short_hash[short]; @@ -1449,7 +1467,7 @@ impl DfastMatchGenerator { let long = { let concat = self.live_history(); - (idx + 8 <= concat.len()).then(|| Self::hash8(&concat[idx..])) + (idx + 8 <= concat.len()).then(|| self.hash8(&concat[idx..])) }; if let Some(long) = long { let bucket = &mut self.long_hash[long]; @@ -1464,7 +1482,7 @@ impl DfastMatchGenerator { let concat = self.live_history(); let idx = pos - self.history_abs_start; (idx + 4 <= concat.len()) - .then(|| self.short_hash[Self::hash4(&concat[idx..])]) + .then(|| self.short_hash[self.hash4(&concat[idx..])]) .into_iter() .flatten() .filter(|candidate| *candidate != DFAST_EMPTY_SLOT) @@ -1474,25 +1492,25 @@ impl DfastMatchGenerator { let concat = self.live_history(); let idx = pos - self.history_abs_start; (idx + 8 <= concat.len()) - .then(|| self.long_hash[Self::hash8(&concat[idx..])]) + .then(|| self.long_hash[self.hash8(&concat[idx..])]) .into_iter() .flatten() .filter(|candidate| *candidate != DFAST_EMPTY_SLOT) } - fn hash4(data: &[u8]) -> usize { + fn hash4(&self, data: &[u8]) -> usize { let value = u32::from_le_bytes(data[..4].try_into().unwrap()) as u64; - Self::hash_bits(value) + self.hash_bits(value) } - fn 
hash8(data: &[u8]) -> usize { + fn hash8(&self, data: &[u8]) -> usize { let value = u64::from_le_bytes(data[..8].try_into().unwrap()); - Self::hash_bits(value) + self.hash_bits(value) } - fn hash_bits(value: u64) -> usize { + fn hash_bits(&self, value: u64) -> usize { const PRIME: u64 = 0x9E37_79B1_85EB_CA87; - ((value.wrapping_mul(PRIME)) >> (64 - DFAST_HASH_BITS)) as usize + ((value.wrapping_mul(PRIME)) >> (64 - self.hash_bits)) as usize } } @@ -2141,6 +2159,50 @@ fn driver_switches_backends_and_initializes_dfast_via_reset() { assert_eq!(driver.window_size(), (1u64 << 17)); } +#[test] +fn driver_small_source_hint_shrinks_dfast_hash_tables() { + let mut driver = MatchGeneratorDriver::new(32, 2); + + driver.reset(CompressionLevel::Default); + let mut space = driver.get_next_space(); + space[..12].copy_from_slice(b"abcabcabcabc"); + space.truncate(12); + driver.commit_space(space); + driver.skip_matching(); + let full_tables = driver.dfast_matcher().short_hash.len(); + assert_eq!(full_tables, 1 << DFAST_HASH_BITS); + + driver.set_source_size_hint(1024); + driver.reset(CompressionLevel::Default); + let mut space = driver.get_next_space(); + space[..12].copy_from_slice(b"xyzxyzxyzxyz"); + space.truncate(12); + driver.commit_space(space); + driver.skip_matching(); + let hinted_tables = driver.dfast_matcher().short_hash.len(); + + assert_eq!(driver.window_size(), 1 << MIN_WINDOW_LOG); + assert!( + hinted_tables < full_tables, + "tiny source hint should reduce dfast table footprint" + ); +} + +#[test] +fn source_hint_clamps_driver_slice_size_to_window() { + let mut driver = MatchGeneratorDriver::new(128 * 1024, 2); + driver.set_source_size_hint(1024); + driver.reset(CompressionLevel::Default); + + let window = driver.window_size() as usize; + assert_eq!(window, 1024); + assert_eq!(driver.slice_size, window); + + let space = driver.get_next_space(); + assert_eq!(space.len(), window); + driver.commit_space(space); +} + #[test] fn 
driver_best_to_fastest_releases_oversized_hc_tables() { let mut driver = MatchGeneratorDriver::new(32, 2); diff --git a/zstd/src/tests/roundtrip_integrity.rs b/zstd/src/tests/roundtrip_integrity.rs index 61204b21..08ff2918 100644 --- a/zstd/src/tests/roundtrip_integrity.rs +++ b/zstd/src/tests/roundtrip_integrity.rs @@ -669,6 +669,8 @@ fn out_of_range_level_clamped() { assert_eq!(data, result, "Clamped Level(100) must still roundtrip"); let result = roundtrip_at_level(&data, CompressionLevel::from_level(-200000)); assert_eq!(data, result, "Clamped Level(-200000) must still roundtrip"); + let result = roundtrip_at_level(&data, CompressionLevel::from_level(i32::MIN)); + assert_eq!(data, result, "Clamped Level(i32::MIN) must still roundtrip"); } // ─── Source-size-aware selection ─────────────────────────────────── @@ -699,6 +701,11 @@ fn source_size_hint_reduces_window_for_small_input() { let data = generate_compressible(9601, 1024); // 1 KiB // Without hint: uses full level-11 window (16 MiB) let no_hint = compress_to_vec(&data[..], CompressionLevel::from_level(11)); + let no_hint_header = crate::decoding::frame::read_frame_header(no_hint.as_slice()) + .unwrap() + .0 + .window_size() + .unwrap(); // With hint: should use smaller window let with_hint = { let mut compressor = FrameCompressor::new(CompressionLevel::from_level(11)); @@ -709,6 +716,11 @@ fn source_size_hint_reduces_window_for_small_input() { compressor.compress(); out }; + let with_hint_header = crate::decoding::frame::read_frame_header(with_hint.as_slice()) + .unwrap() + .0 + .window_size() + .unwrap(); // Both must decompress correctly let mut decoder = StreamingDecoder::new(no_hint.as_slice()).unwrap(); let mut r = Vec::new(); @@ -720,13 +732,16 @@ fn source_size_hint_reduces_window_for_small_input() { decoder.read_to_end(&mut r).unwrap(); assert_eq!(data, r); - // With hint should produce output no larger than without - // (smaller window descriptor in frame header, similar or identical blocks) 
assert!( - with_hint.len() <= no_hint.len(), - "Size hint should not produce larger output: hint={} no_hint={}", - with_hint.len(), - no_hint.len(), + with_hint_header <= no_hint_header, + "size hint should not increase frame window size: hint={} no_hint={}", + with_hint_header, + no_hint_header + ); + assert!( + with_hint_header < (16 * 1024 * 1024), + "hinted level-11 frame should advertise smaller-than-default window, got {}", + with_hint_header ); } From ec9a61a68263ff181765bda69bae8e8b977b3daf Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 17:36:31 +0300 Subject: [PATCH 05/18] test(cli): clarify --store level validation behavior - make pledged-size streaming test assert observable window-size effect - clarify CLI help: --store skips compression but parse-time level validation remains - add regression test for --store with out-of-range --level --- cli/src/main.rs | 17 +++++++++++++++-- zstd/src/tests/roundtrip_integrity.rs | 23 +++++++++++++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index cbfbc8e5..aa197a8f 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -62,8 +62,8 @@ enum Commands { level: i32, /// Write an uncompressed zstd frame (no compression). /// - /// When set, --level is ignored and the input is wrapped in a - /// raw zstd frame without any compression. + /// When set, compression itself ignores `--level` and writes a raw + /// zstd frame. The CLI still validates `--level` range at parse time. 
#[arg(long)] store: bool, }, @@ -441,6 +441,19 @@ mod tests { assert!(parse.is_err()); } + #[test] + fn cli_store_still_validates_level_range_at_parse_time() { + let parse = Cli::try_parse_from([ + "structured-zstd", + "compress", + "in.bin", + "--store", + "--level", + "23", + ]); + assert!(parse.is_err()); + } + #[test] fn compress_rejects_same_input_and_output_paths() { let unique = SystemTime::now() diff --git a/zstd/src/tests/roundtrip_integrity.rs b/zstd/src/tests/roundtrip_integrity.rs index 08ff2918..f1d35bda 100644 --- a/zstd/src/tests/roundtrip_integrity.rs +++ b/zstd/src/tests/roundtrip_integrity.rs @@ -752,15 +752,38 @@ fn streaming_pledged_size_uses_source_hint() { use crate::io::Write; let data = generate_compressible(9602, 2 * 1024); // 2 KiB + let no_hint = compress_to_vec(&data[..], CompressionLevel::from_level(11)); + let no_hint_header = crate::decoding::frame::read_frame_header(no_hint.as_slice()) + .unwrap() + .0 + .window_size() + .unwrap(); + let mut encoder = StreamingEncoder::new(Vec::new(), CompressionLevel::from_level(11)); encoder.set_pledged_content_size(data.len() as u64).unwrap(); encoder.write_all(&data).unwrap(); let compressed = encoder.finish().unwrap(); + let hinted_header = crate::decoding::frame::read_frame_header(compressed.as_slice()) + .unwrap() + .0 + .window_size() + .unwrap(); let mut decoder = StreamingDecoder::new(compressed.as_slice()).unwrap(); let mut result = Vec::new(); decoder.read_to_end(&mut result).unwrap(); assert_eq!(data, result, "Pledged-size streaming must roundtrip"); + assert!( + hinted_header <= no_hint_header, + "pledged source hint should not increase window size: hinted={} no_hint={}", + hinted_header, + no_hint_header + ); + assert!( + hinted_header < (16 * 1024 * 1024), + "pledged source hint should reduce level-11 advertised window, got {}", + hinted_header + ); } /// All 22 levels produce valid output for a tiny (256 byte) input with size hint. 
From 70dd7f626bd9d27ecf37ff732eecdd424ec28d1f Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 18:02:34 +0300 Subject: [PATCH 06/18] test(cli): derive parse boundary inputs from level constants --- cli/src/main.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index aa197a8f..a67924a2 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -418,7 +418,15 @@ mod tests { #[test] fn cli_rejects_unsupported_compression_level_at_parse_time() { - let parse = Cli::try_parse_from(["structured-zstd", "compress", "in.bin", "--level", "23"]); + let too_high = + (structured_zstd::encoding::CompressionLevel::MAX_LEVEL as i64 + 1).to_string(); + let parse = Cli::try_parse_from([ + "structured-zstd", + "compress", + "in.bin", + "--level", + too_high.as_str(), + ]); assert!(parse.is_err()); } @@ -430,26 +438,29 @@ mod tests { #[test] fn cli_rejects_too_negative_compression_level() { - // MIN_LEVEL is -131072; anything below that should be rejected + let too_low = + (structured_zstd::encoding::CompressionLevel::MIN_LEVEL as i64 - 1).to_string(); let parse = Cli::try_parse_from([ "structured-zstd", "compress", "in.bin", "--level", - "-131073", + too_low.as_str(), ]); assert!(parse.is_err()); } #[test] fn cli_store_still_validates_level_range_at_parse_time() { + let too_high = + (structured_zstd::encoding::CompressionLevel::MAX_LEVEL as i64 + 1).to_string(); let parse = Cli::try_parse_from([ "structured-zstd", "compress", "in.bin", "--store", "--level", - "23", + too_high.as_str(), ]); assert!(parse.is_err()); } From eba6cbb76f4c0a062cd1e8807eb656122fb938b4 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 19:00:51 +0300 Subject: [PATCH 07/18] docs(encoding): align source-size docs with runtime behavior --- zstd/src/encoding/match_generator.rs | 14 ++++++++------ zstd/src/encoding/streaming_encoder.rs | 2 ++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git 
a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index 8d40070a..cc17335a 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -119,16 +119,18 @@ const MIN_WINDOW_LOG: u8 = 10; /// Adjust level parameters for a known source size. /// -/// Follows the C zstd `clevels.h` approach: for small inputs, cap -/// window_log (and hash/chain for HC) so the encoder doesn't allocate -/// oversized tables. The four C size classes are: -/// >256 KiB (default table), ≤256 KiB, ≤128 KiB, ≤16 KiB. +/// For non-empty inputs, this derives a cap from `ceil(log2(src_size))`, +/// then clamps it to [`MIN_WINDOW_LOG`]. This keeps tables bounded for +/// small inputs while preserving the encoder's minimum supported window. +/// For the HC backend, `hash_log` and `chain_log` are reduced +/// proportionally. fn adjust_params_for_source_size(mut params: LevelParams, src_size: u64) -> LevelParams { if src_size == 0 { return params; } - // Cap window_log so the window doesn't exceed the source. - // ceil_log2(src_size): the minimum number of bits to represent src_size. + // Derive a source-size-based cap from ceil(log2(src_size)), then + // clamp to MIN_WINDOW_LOG. For inputs smaller than 1 KiB we keep the + // 1 KiB minimum window instead of shrinking below that floor. let src_log = 64 - (src_size - 1).leading_zeros(); // ceil_log2 let src_log = (src_log as u8).max(MIN_WINDOW_LOG); if src_log < params.window_log { diff --git a/zstd/src/encoding/streaming_encoder.rs b/zstd/src/encoding/streaming_encoder.rs index eeebe419..d5c4eb2c 100644 --- a/zstd/src/encoding/streaming_encoder.rs +++ b/zstd/src/encoding/streaming_encoder.rs @@ -81,6 +81,8 @@ impl StreamingEncoder { /// /// When set, the frame header will include a `Frame_Content_Size` field. /// This enables decoders to pre-allocate output buffers. 
+ /// The pledged size is also forwarded as a source-size hint to the + /// matcher so small inputs can use smaller matching tables. /// /// Must be called **before** the first [`write`](Write::write) call; /// calling it after the frame header has already been emitted returns an From 42e6ecf383df20ff4ed2dd07d2361a1cab74476a Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 19:09:00 +0300 Subject: [PATCH 08/18] test(roundtrip): replace brittle monotonic ratio assertion --- zstd/src/tests/roundtrip_integrity.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/zstd/src/tests/roundtrip_integrity.rs b/zstd/src/tests/roundtrip_integrity.rs index f1d35bda..f7715f30 100644 --- a/zstd/src/tests/roundtrip_integrity.rs +++ b/zstd/src/tests/roundtrip_integrity.rs @@ -621,20 +621,24 @@ fn negative_levels_roundtrip() { } } -/// For this reasonably compressible fixture, the sampled higher levels are -/// expected not to produce larger output than the lower sampled levels. +/// Sampled numeric levels should produce valid compressed output and preserve +/// data through a full compress/decompress roundtrip. 
#[test] fn levels_monotonic_compression_ratio() { let data = generate_compressible(9300, 64 * 1024); - let mut prev_size = usize::MAX; for level in [1, 3, 7, 11] { let compressed = compress_to_vec(&data[..], CompressionLevel::from_level(level)); assert!( - compressed.len() <= prev_size, - "Level {level} produced larger output ({}) than a lower level ({prev_size})", - compressed.len(), + !compressed.is_empty(), + "Level {level} produced empty compressed output" + ); + let mut decoder = StreamingDecoder::new(compressed.as_slice()).unwrap(); + let mut result = Vec::new(); + decoder.read_to_end(&mut result).unwrap(); + assert_eq!( + data, result, + "Roundtrip failed for sampled compression level {level}" ); - prev_size = compressed.len(); } } From d7f0a79e8c98deacc5d59ea43b89a2e1f068cc43 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 19:23:32 +0300 Subject: [PATCH 09/18] fix(encoding): clamp zero source-size hint to minimum window --- zstd/src/encoding/match_generator.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index cc17335a..956df2f1 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -119,20 +119,22 @@ const MIN_WINDOW_LOG: u8 = 10; /// Adjust level parameters for a known source size. /// -/// For non-empty inputs, this derives a cap from `ceil(log2(src_size))`, -/// then clamps it to [`MIN_WINDOW_LOG`]. This keeps tables bounded for +/// This derives a cap from `ceil(log2(src_size))`, then clamps it to +/// [`MIN_WINDOW_LOG`]. A zero-byte size hint is treated as +/// [`MIN_WINDOW_LOG`]. This keeps tables bounded for /// small inputs while preserving the encoder's minimum supported window. /// For the HC backend, `hash_log` and `chain_log` are reduced /// proportionally. 
fn adjust_params_for_source_size(mut params: LevelParams, src_size: u64) -> LevelParams { - if src_size == 0 { - return params; - } // Derive a source-size-based cap from ceil(log2(src_size)), then - // clamp to MIN_WINDOW_LOG. For inputs smaller than 1 KiB we keep the + // clamp to MIN_WINDOW_LOG. For inputs smaller than 1 KiB (or zero) we keep the // 1 KiB minimum window instead of shrinking below that floor. - let src_log = 64 - (src_size - 1).leading_zeros(); // ceil_log2 - let src_log = (src_log as u8).max(MIN_WINDOW_LOG); + let src_log = if src_size == 0 { + MIN_WINDOW_LOG + } else { + (64 - (src_size - 1).leading_zeros()) as u8 // ceil_log2 + }; + let src_log = src_log.max(MIN_WINDOW_LOG); if src_log < params.window_log { params.window_log = src_log; } From 8bb19c8cfa761699225a3dd92c998e142310a55f Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 19:25:23 +0300 Subject: [PATCH 10/18] test(encoding): tighten level roundtrip checks and pool sizing --- zstd/src/encoding/match_generator.rs | 14 +++++++++----- zstd/src/tests/roundtrip_integrity.rs | 15 +++++++++++++-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index 956df2f1..3f46e45a 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -535,11 +535,15 @@ impl Matcher for MatchGeneratorDriver { } fn get_next_space(&mut self) -> Vec { - self.vec_pool.pop().unwrap_or_else(|| { - let mut space = alloc::vec![0; self.slice_size]; - space.resize(space.capacity(), 0); - space - }) + if let Some(mut space) = self.vec_pool.pop() { + space.clear(); + if space.capacity() > self.slice_size { + space.shrink_to(self.slice_size); + } + space.resize(self.slice_size, 0); + return space; + } + alloc::vec![0; self.slice_size] } fn get_last_space(&mut self) -> &[u8] { diff --git a/zstd/src/tests/roundtrip_integrity.rs b/zstd/src/tests/roundtrip_integrity.rs index 
f7715f30..1e62ffa8 100644 --- a/zstd/src/tests/roundtrip_integrity.rs +++ b/zstd/src/tests/roundtrip_integrity.rs @@ -606,7 +606,18 @@ fn numeric_level_0_is_default_compression() { fn all_22_levels_roundtrip() { let data = generate_compressible(9100, 32 * 1024); for level in 1..=22 { - let result = roundtrip_at_level(&data, CompressionLevel::from_level(level)); + let compressed = { + let mut compressor = FrameCompressor::new(CompressionLevel::from_level(level)); + compressor.set_source_size_hint(data.len() as u64); + compressor.set_source(data.as_slice()); + let mut out = Vec::new(); + compressor.set_drain(&mut out); + compressor.compress(); + out + }; + let mut decoder = StreamingDecoder::new(compressed.as_slice()).unwrap(); + let mut result = Vec::new(); + decoder.read_to_end(&mut result).unwrap(); assert_eq!(data, result, "Roundtrip failed for Level({level})"); } } @@ -624,7 +635,7 @@ fn negative_levels_roundtrip() { /// Sampled numeric levels should produce valid compressed output and preserve /// data through a full compress/decompress roundtrip. 
#[test] -fn levels_monotonic_compression_ratio() { +fn sampled_levels_roundtrip_validity() { let data = generate_compressible(9300, 64 * 1024); for level in [1, 3, 7, 11] { let compressed = compress_to_vec(&data[..], CompressionLevel::from_level(level)); From 899a8f07922e871bb66d24daeeb7c09cc5e210b7 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 19:34:17 +0300 Subject: [PATCH 11/18] docs(cli): sync numeric defaults and source-hint coverage --- README.md | 6 ++--- cli/src/main.rs | 2 +- zstd/src/encoding/streaming_encoder.rs | 35 ++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f6578fdf..3f5e783b 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ This is a **maintained fork** of [KillingSpark/zstd-rs](https://github.com/Killi **Fork goals:** - Dictionary compression improvements (critical for per-label trained dictionaries in LSM-tree) - Performance parity with C zstd for decompression (currently 1.4-3.5x slower) -- Full numeric compression levels (1–22 plus negative ultra-fast, C zstd compatible) +- Full numeric compression levels (0 = default, 1–22 plus negative ultra-fast, C zstd compatible) - No FFI — pure `cargo build`, no cmake/system libraries (ADR-013 compliance) **Upstream relationship:** We periodically sync with upstream but maintain an independent development trajectory focused on CoordiNode requirements. @@ -46,7 +46,7 @@ Complete RFC 8878 implementation. 
Performance: ~1.4-3.5x slower than C zstd depe - [x] Default (roughly level 3) - [x] Better (roughly level 7) - [x] Best (roughly level 11) -- [x] Numeric levels 1–22 via `CompressionLevel::from_level(n)` (C zstd compatible numbering) +- [x] Numeric levels `0` (default), `1–22`, and negative ultra-fast levels via `CompressionLevel::from_level(n)` (C zstd compatible numbering) - [x] Negative levels for ultra-fast compression - [x] Checksums - [x] Frame Content Size — `FrameCompressor` writes FCS automatically; `StreamingEncoder` requires `set_pledged_content_size()` before first write @@ -71,7 +71,7 @@ use structured_zstd::encoding::{compress, compress_to_vec, CompressionLevel}; let data: &[u8] = b"hello world"; // Named level let compressed = compress_to_vec(data, CompressionLevel::Fastest); -// Numeric level (C zstd compatible: 1-22, negative for ultra-fast) +// Numeric level (C zstd compatible: 0 = default, 1-22, negative for ultra-fast) let compressed = compress_to_vec(data, CompressionLevel::from_level(7)); ``` diff --git a/cli/src/main.rs b/cli/src/main.rs index a67924a2..945f30e2 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -52,7 +52,7 @@ enum Commands { short, long, value_name = "LEVEL", - default_value_t = 3, + default_value_t = CompressionLevel::DEFAULT_LEVEL, value_parser = clap::value_parser!(i32).range( (CompressionLevel::MIN_LEVEL as i64)..=(CompressionLevel::MAX_LEVEL as i64) ), diff --git a/zstd/src/encoding/streaming_encoder.rs b/zstd/src/encoding/streaming_encoder.rs index d5c4eb2c..afd3db60 100644 --- a/zstd/src/encoding/streaming_encoder.rs +++ b/zstd/src/encoding/streaming_encoder.rs @@ -1016,6 +1016,41 @@ mod tests { assert_eq!(err.kind(), ErrorKind::InvalidInput); } + #[test] + fn source_size_hint_directly_reduces_window_header() { + let payload = b"streaming-source-size-hint".repeat(64); + + let mut no_hint = StreamingEncoder::new(Vec::new(), CompressionLevel::from_level(11)); + no_hint.write_all(payload.as_slice()).unwrap(); + let 
no_hint_frame = no_hint.finish().unwrap(); + let no_hint_header = crate::decoding::frame::read_frame_header(no_hint_frame.as_slice()) + .unwrap() + .0; + let no_hint_window = no_hint_header.window_size().unwrap(); + + let mut with_hint = StreamingEncoder::new(Vec::new(), CompressionLevel::from_level(11)); + with_hint + .set_source_size_hint(payload.len() as u64) + .unwrap(); + with_hint.write_all(payload.as_slice()).unwrap(); + let with_hint_frame = with_hint.finish().unwrap(); + let with_hint_header = + crate::decoding::frame::read_frame_header(with_hint_frame.as_slice()) + .unwrap() + .0; + let with_hint_window = with_hint_header.window_size().unwrap(); + + assert!( + with_hint_window <= no_hint_window, + "source size hint should not increase advertised window" + ); + + let mut decoder = StreamingDecoder::new(with_hint_frame.as_slice()).unwrap(); + let mut decoded = Vec::new(); + decoder.read_to_end(&mut decoded).unwrap(); + assert_eq!(decoded, payload); + } + #[cfg(feature = "std")] #[test] fn pledged_content_size_c_zstd_compatible() { From c37f6cfa5010913990f6869193b8ee7f7a8c2218 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 20:56:30 +0300 Subject: [PATCH 12/18] fix(cli): document clap i64 range bounds for level parser --- cli/src/main.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cli/src/main.rs b/cli/src/main.rs index 945f30e2..239074d6 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -53,6 +53,8 @@ enum Commands { long, value_name = "LEVEL", default_value_t = CompressionLevel::DEFAULT_LEVEL, + // clap's ranged parser expects i64 bounds here (RangedI64ValueParser), + // even though the target value type is i32. 
value_parser = clap::value_parser!(i32).range( (CompressionLevel::MIN_LEVEL as i64)..=(CompressionLevel::MAX_LEVEL as i64) ), From 6aa76184c676e628918c85f3c9d8cbf7fe47ad86 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 22:12:33 +0300 Subject: [PATCH 13/18] docs(encoding): note semver impact of level variant --- zstd/src/encoding/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/zstd/src/encoding/mod.rs b/zstd/src/encoding/mod.rs index 18bd7f74..0d797c1c 100644 --- a/zstd/src/encoding/mod.rs +++ b/zstd/src/encoding/mod.rs @@ -104,6 +104,10 @@ pub enum CompressionLevel { /// with the lazy2 hash-chain backend. Levels that require strategies /// this crate has not yet implemented (btopt, btultra) are approximated /// with the closest available matcher. + /// + /// Semver note: this variant was added after the initial enum shape and + /// is a breaking API change for downstream crates that exhaustively + /// `match` on [`CompressionLevel`] without a wildcard arm. Level(i32), } From 672ac260962c9c9f40a7d492c003fcb927331f90 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 22:23:03 +0300 Subject: [PATCH 14/18] fix(encoding): canonicalize named levels in from_level --- zstd/src/encoding/mod.rs | 16 +++++++++++---- zstd/src/tests/roundtrip_integrity.rs | 29 +++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/zstd/src/encoding/mod.rs b/zstd/src/encoding/mod.rs index 0d797c1c..1f83313e 100644 --- a/zstd/src/encoding/mod.rs +++ b/zstd/src/encoding/mod.rs @@ -121,11 +121,19 @@ impl CompressionLevel { /// Create a compression level from a numeric value. /// - /// Wraps the raw integer in [`Level`](Self::Level). Values outside - /// [`MIN_LEVEL`](Self::MIN_LEVEL)..=[`MAX_LEVEL`](Self::MAX_LEVEL) are - /// silently clamped during parameter resolution. + /// Returns named variants for canonical levels (`0`/`3`, `1`, `7`, `11`) + /// and [`Level`](Self::Level) for all other values. 
+ /// + /// Values outside [`MIN_LEVEL`](Self::MIN_LEVEL)..=[`MAX_LEVEL`](Self::MAX_LEVEL) + /// are silently clamped during parameter resolution. pub const fn from_level(level: i32) -> Self { - CompressionLevel::Level(level) + match level { + 0 | Self::DEFAULT_LEVEL => Self::Default, + 1 => Self::Fastest, + 7 => Self::Better, + 11 => Self::Best, + _ => Self::Level(level), + } } } diff --git a/zstd/src/tests/roundtrip_integrity.rs b/zstd/src/tests/roundtrip_integrity.rs index 1e62ffa8..1c498bfc 100644 --- a/zstd/src/tests/roundtrip_integrity.rs +++ b/zstd/src/tests/roundtrip_integrity.rs @@ -547,6 +547,35 @@ fn roundtrip_best_level_streaming_multi_block() { // ─── Numeric compression levels (CompressionLevel::Level) ───────── +/// Canonical numeric levels should map to named enum variants for pattern/equality checks. +#[test] +fn numeric_levels_map_to_named_variants() { + assert!(matches!( + CompressionLevel::from_level(0), + CompressionLevel::Default + )); + assert!(matches!( + CompressionLevel::from_level(3), + CompressionLevel::Default + )); + assert!(matches!( + CompressionLevel::from_level(1), + CompressionLevel::Fastest + )); + assert!(matches!( + CompressionLevel::from_level(7), + CompressionLevel::Better + )); + assert!(matches!( + CompressionLevel::from_level(11), + CompressionLevel::Best + )); + assert!(matches!( + CompressionLevel::from_level(2), + CompressionLevel::Level(2) + )); +} + /// `from_level(3)` must be equivalent to `Default` — same compressed output. 
#[test] fn numeric_level_3_matches_default() { From fec2b3f4f34eb4f73c886a58513ae8b034dc697c Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 22:36:17 +0300 Subject: [PATCH 15/18] test(encoding): cover direct Level(0/3) default equivalence --- zstd/src/tests/roundtrip_integrity.rs | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/zstd/src/tests/roundtrip_integrity.rs b/zstd/src/tests/roundtrip_integrity.rs index 1c498bfc..e2e39a0a 100644 --- a/zstd/src/tests/roundtrip_integrity.rs +++ b/zstd/src/tests/roundtrip_integrity.rs @@ -576,15 +576,20 @@ fn numeric_levels_map_to_named_variants() { )); } -/// `from_level(3)` must be equivalent to `Default` — same compressed output. +/// `from_level(3)` and direct `Level(3)` must be equivalent to `Default`. #[test] fn numeric_level_3_matches_default() { let data = generate_compressible(9000, 64 * 1024); let default = compress_to_vec(&data[..], CompressionLevel::Default); - let level_3 = compress_to_vec(&data[..], CompressionLevel::from_level(3)); + let from_level_3 = compress_to_vec(&data[..], CompressionLevel::from_level(3)); + let direct_level_3 = compress_to_vec(&data[..], CompressionLevel::Level(3)); + assert_eq!( + default, from_level_3, + "from_level(3) output must be identical to Default" + ); assert_eq!( - default, level_3, - "Level(3) output must be identical to Default" + default, direct_level_3, + "direct Level(3) output must be identical to Default" ); } @@ -621,13 +626,21 @@ fn numeric_level_11_matches_best() { assert_eq!(best, level_11, "Level(11) output must be identical to Best"); } -/// `from_level(0)` maps to default compression (level 3), matching C zstd. +/// `from_level(0)` and direct `Level(0)` map to default compression (level 3). 
#[test] fn numeric_level_0_is_default_compression() { let data = generate_compressible(9004, 64 * 1024); - let level_0 = compress_to_vec(&data[..], CompressionLevel::from_level(0)); + let from_level_0 = compress_to_vec(&data[..], CompressionLevel::from_level(0)); + let direct_level_0 = compress_to_vec(&data[..], CompressionLevel::Level(0)); let level_3 = compress_to_vec(&data[..], CompressionLevel::from_level(3)); - assert_eq!(level_0, level_3, "Level(0) should map to default (level 3)"); + assert_eq!( + from_level_0, level_3, + "from_level(0) should map to default (level 3)" + ); + assert_eq!( + direct_level_0, level_3, + "direct Level(0) should map to default (level 3)" + ); } /// All 22 positive levels produce valid output that round-trips correctly. From 5d8703d30727623a3792fa2914fce999fa2c2a9f Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 22:54:49 +0300 Subject: [PATCH 16/18] docs(encoding): scope clamping guarantee to default matcher --- zstd/src/encoding/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/zstd/src/encoding/mod.rs b/zstd/src/encoding/mod.rs index 1f83313e..9fc5750a 100644 --- a/zstd/src/encoding/mod.rs +++ b/zstd/src/encoding/mod.rs @@ -124,8 +124,9 @@ impl CompressionLevel { /// Returns named variants for canonical levels (`0`/`3`, `1`, `7`, `11`) /// and [`Level`](Self::Level) for all other values. /// - /// Values outside [`MIN_LEVEL`](Self::MIN_LEVEL)..=[`MAX_LEVEL`](Self::MAX_LEVEL) - /// are silently clamped during parameter resolution. + /// With the default matcher backend (`MatchGeneratorDriver`), values + /// outside [`MIN_LEVEL`](Self::MIN_LEVEL)..=[`MAX_LEVEL`](Self::MAX_LEVEL) + /// are silently clamped during built-in level parameter resolution. 
pub const fn from_level(level: i32) -> Self { match level { 0 | Self::DEFAULT_LEVEL => Self::Default, From dcbdbd14857edae029b0c0dbbc9894fd61beef0d Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 23:27:44 +0300 Subject: [PATCH 17/18] perf(encoding): avoid eager zero-fill in pooled block buffers --- zstd/src/encoding/match_generator.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index 3f46e45a..e303160c 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -536,11 +536,15 @@ impl Matcher for MatchGeneratorDriver { fn get_next_space(&mut self) -> Vec { if let Some(mut space) = self.vec_pool.pop() { - space.clear(); + if space.len() > self.slice_size { + space.truncate(self.slice_size); + } if space.capacity() > self.slice_size { space.shrink_to(self.slice_size); } - space.resize(self.slice_size, 0); + if space.len() < self.slice_size { + space.resize(self.slice_size, 0); + } return space; } alloc::vec![0; self.slice_size] From 734f7489c5becb37c2df90fe72f063f9949f2630 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 4 Apr 2026 23:44:27 +0300 Subject: [PATCH 18/18] docs(encoding): clarify source-size hint scope --- zstd/src/encoding/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/zstd/src/encoding/mod.rs b/zstd/src/encoding/mod.rs index 9fc5750a..976d28fd 100644 --- a/zstd/src/encoding/mod.rs +++ b/zstd/src/encoding/mod.rs @@ -173,7 +173,9 @@ pub trait Matcher { /// Called before [`reset`](Self::reset) when the caller knows the input /// size (e.g. from pledged content size or file metadata). /// - /// The default implementation is a no-op for custom matchers. + /// The default implementation is a no-op for custom matchers and + /// test stubs. 
The built-in runtime matcher (`MatchGeneratorDriver`) + /// overrides this hook and applies the hint during level resolution. fn set_source_size_hint(&mut self, _size: u64) {} /// Prime matcher state with dictionary history before compressing the next frame. /// Default implementation is a no-op for custom matchers that do not support this.