From f45a5a8717f962569e2af2f63109521a38876181 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sat, 28 Mar 2026 21:54:06 +0200
Subject: [PATCH 01/24] feat(encoding): add dictionary compression support

- add FrameCompressor dictionary APIs, including parse-from-bytes helper
- write dictionary id into frame header and prime matcher with dictionary history
- support raw-content dictionaries for dict_builder outputs
- add regression tests for dict-id enforcement, C interop, and dict_builder roundtrip

Closes #8
---
 zstd/src/decoding/dictionary.rs       |  24 ++++
 zstd/src/decoding/errors.rs           |   4 +
 zstd/src/encoding/frame_compressor.rs | 182 +++++++++++++++++++++++++-
 zstd/src/encoding/match_generator.rs  |  22 ++++
 zstd/src/encoding/mod.rs              |   3 +
 5 files changed, 234 insertions(+), 1 deletion(-)
diff --git a/zstd/src/decoding/dictionary.rs b/zstd/src/decoding/dictionary.rs
index f0f7b7ad..072c93c6 100644
--- a/zstd/src/decoding/dictionary.rs
+++ b/zstd/src/decoding/dictionary.rs
@@ -40,6 +40,27 @@ pub struct Dictionary {
 pub const MAGIC_NUM: [u8; 4] = [0x37, 0xA4, 0x30, 0xEC];
 
 impl Dictionary {
+    /// Build a dictionary from raw content bytes (without entropy table sections).
+    ///
+    /// This is primarily intended for dictionaries produced by the `dict_builder`
+    /// module, which currently emits raw-content dictionaries.
+    pub fn from_raw_content(
+        id: u32,
+        dict_content: Vec<u8>,
+    ) -> Result<Dictionary, DictionaryDecodeError> {
+        if id == 0 {
+            return Err(DictionaryDecodeError::ZeroDictionaryId);
+        }
+
+        Ok(Dictionary {
+            id,
+            fse: FSEScratch::new(),
+            huf: HuffmanScratch::new(),
+            dict_content,
+            offset_hist: [1, 4, 8],
+        })
+    }
+
     /// Parses the dictionary from `raw` and set the tables
     /// it returns the dict_id for checking with the frame's `dict_id``
     pub fn decode_dict(raw: &[u8]) -> Result<Dictionary, DictionaryDecodeError> {
@@ -58,6 +79,9 @@ impl Dictionary {
 
         let dict_id = raw[4..8].try_into().expect("optimized away");
         let dict_id = u32::from_le_bytes(dict_id);
+        if dict_id == 0 {
+            return Err(DictionaryDecodeError::ZeroDictionaryId);
+        }
         new_dict.id = dict_id;
 
         let raw_tables = &raw[8..];
diff --git a/zstd/src/decoding/errors.rs b/zstd/src/decoding/errors.rs
index 466ffe1a..ddc9335a 100644
--- a/zstd/src/decoding/errors.rs
+++ b/zstd/src/decoding/errors.rs
@@ -425,6 +425,7 @@ impl core::fmt::Display for DecodeBufferError {
 #[non_exhaustive]
 pub enum DictionaryDecodeError {
     BadMagicNum { got: [u8; 4] },
+    ZeroDictionaryId,
     FSETableError(FSETableError),
     HuffmanTableError(HuffmanTableError),
 }
@@ -451,6 +452,9 @@ impl core::fmt::Display for DictionaryDecodeError {
                     crate::decoding::dictionary::MAGIC_NUM,
                 )
             }
+            DictionaryDecodeError::ZeroDictionaryId => {
+                write!(f, "Dictionary id must be non-zero")
+            }
             DictionaryDecodeError::FSETableError(e) => write!(f, "{e:?}"),
             DictionaryDecodeError::HuffmanTableError(e) => write!(f, "{e:?}"),
         }
diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index c87806b7..413837ec 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -39,6 +39,7 @@ pub struct FrameCompressor<R: Read, W: Write, M: Matcher> {
     uncompressed_data: Option<R>,
     compressed_data: Option<W>,
     compression_level: CompressionLevel,
+    dictionary: Option<crate::decoding::Dictionary>,
     state: CompressState<M>,
     #[cfg(feature = "hash")]
     hasher: XxHash64,
@@ -99,6 +100,7 @@ impl<R: Read, W: Write> FrameCompressor<R, W, MatchGeneratorDriver> {
             uncompressed_data: None,
             compressed_data: None,
             compression_level,
+            dictionary: None,
             state: CompressState {
                 matcher: MatchGeneratorDriver::new(1024 * 128, 1),
                 last_huff_table: None,
@@ -117,6 +119,7 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
         Self {
             uncompressed_data: None,
             compressed_data: None,
+            dictionary: None,
             state: CompressState {
                 matcher,
                 last_huff_table: None,
@@ -158,6 +161,12 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
         self.state.fse_tables.ml_previous = None;
         self.state.fse_tables.of_previous = None;
         self.state.offset_hist = [1, 4, 8];
+        if let Some(dict) = self.dictionary.as_ref() {
+            self.state.offset_hist = dict.offset_hist;
+            self.state
+                .matcher
+                .prime_with_dictionary(dict.dict_content.as_slice(), dict.offset_hist);
+        }
         #[cfg(feature = "hash")]
         {
             self.hasher = XxHash64::with_seed(0);
@@ -171,7 +180,7 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
             frame_content_size: None,
             single_segment: false,
             content_checksum: cfg!(feature = "hash"),
-            dictionary_id: None,
+            dictionary_id: self.dictionary.as_ref().map(|dict| dict.id as u64),
             window_size: Some(self.state.matcher.window_size()),
         };
         header.serialize(output);
@@ -301,10 +310,38 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
     pub fn compression_level(&self) -> CompressionLevel {
         self.compression_level
     }
+
+    /// Attach a pre-parsed dictionary to be used for subsequent compressions.
+    ///
+    /// The dictionary id will be written to the frame header and the matcher will be
+    /// primed with dictionary content/history before compressing each frame.
+    pub fn set_dictionary(
+        &mut self,
+        dictionary: crate::decoding::Dictionary,
+    ) -> Option<crate::decoding::Dictionary> {
+        self.dictionary.replace(dictionary)
+    }
+
+    /// Parse and attach a serialized dictionary blob.
+    pub fn set_dictionary_from_bytes(
+        &mut self,
+        raw_dictionary: &[u8],
+    ) -> Result<Option<crate::decoding::Dictionary>, crate::decoding::errors::DictionaryDecodeError>
+    {
+        let dictionary = crate::decoding::Dictionary::decode_dict(raw_dictionary)?;
+        Ok(self.set_dictionary(dictionary))
+    }
+
+    /// Remove the attached dictionary.
+    pub fn clear_dictionary(&mut self) -> Option<crate::decoding::Dictionary> {
+        self.dictionary.take()
+    }
 }
 
 #[cfg(test)]
 mod tests {
+    #[cfg(all(feature = "dict_builder", feature = "std"))]
+    use alloc::format;
     use alloc::vec;
 
     use super::FrameCompressor;
@@ -395,6 +432,149 @@ mod tests {
         assert_eq!(mock_data, decoded);
     }
 
+    #[test]
+    fn dictionary_compression_sets_required_dict_id_and_roundtrips() {
+        let dict_raw = include_bytes!("../../dict_tests/dictionary");
+        let dict_for_encoder = crate::decoding::Dictionary::decode_dict(dict_raw).unwrap();
+        let dict_for_decoder = crate::decoding::Dictionary::decode_dict(dict_raw).unwrap();
+
+        let mut data = Vec::new();
+        for _ in 0..8 {
+            data.extend_from_slice(&dict_for_decoder.dict_content[..2048]);
+        }
+
+        let mut plain = Vec::new();
+        crate::encoding::compress(
+            data.as_slice(),
+            &mut plain,
+            super::CompressionLevel::Fastest,
+        );
+
+        let mut with_dict = Vec::new();
+        let mut compressor = FrameCompressor::new(super::CompressionLevel::Fastest);
+        compressor
+            .set_dictionary_from_bytes(dict_raw)
+            .expect("dictionary bytes should parse");
+        assert_eq!(
+            compressor
+                .set_dictionary(dict_for_encoder)
+                .expect("set_dictionary_from_bytes inserted previous dictionary")
+                .id,
+            dict_for_decoder.id
+        );
+        compressor.set_source(data.as_slice());
+        compressor.set_drain(&mut with_dict);
+        compressor.compress();
+
+        let (frame_header, _) = crate::decoding::frame::read_frame_header(with_dict.as_slice())
+            .expect("encoded stream should have a frame header");
+        assert_eq!(frame_header.dictionary_id(), Some(dict_for_decoder.id));
+
+        assert!(
+            with_dict.len() < plain.len(),
+            "dictionary compression should improve ratio for dictionary-like payloads (plain={}, dict={})",
+            plain.len(),
+            with_dict.len()
+        );
+
+        let mut decoder = FrameDecoder::new();
+        let mut missing_dict_target = Vec::with_capacity(data.len());
+        let err = decoder
+            .decode_all_to_vec(&with_dict, &mut missing_dict_target)
+            .unwrap_err();
+        assert!(
+            matches!(
+                err,
+                crate::decoding::errors::FrameDecoderError::DictNotProvided { .. }
+            ),
+            "dict-compressed stream should require dictionary id, got: {err:?}"
+        );
+
+        let mut decoder = FrameDecoder::new();
+        decoder.add_dict(dict_for_decoder).unwrap();
+        let mut decoded = Vec::with_capacity(data.len());
+        decoder.decode_all_to_vec(&with_dict, &mut decoded).unwrap();
+        assert_eq!(decoded, data);
+
+        let mut ffi_decoder = zstd::bulk::Decompressor::with_dictionary(dict_raw).unwrap();
+        let mut ffi_decoded = Vec::with_capacity(data.len());
+        let ffi_written = ffi_decoder
+            .decompress_to_buffer(with_dict.as_slice(), &mut ffi_decoded)
+            .unwrap();
+        assert_eq!(ffi_written, data.len());
+        assert_eq!(ffi_decoded, data);
+    }
+
+    #[cfg(all(feature = "dict_builder", feature = "std"))]
+    #[test]
+    fn dictionary_compression_roundtrips_with_dict_builder_dictionary() {
+        use std::io::Cursor;
+
+        let mut training = Vec::new();
+        for idx in 0..256u32 {
+            training.extend_from_slice(
+                format!("tenant=demo table=orders key={idx} region=eu\n").as_bytes(),
+            );
+        }
+        let mut raw_dict = Vec::new();
+        crate::dictionary::create_raw_dict_from_source(
+            Cursor::new(training.as_slice()),
+            training.len(),
+            &mut raw_dict,
+            4096,
+        );
+        assert!(
+            !raw_dict.is_empty(),
+            "dict_builder produced an empty dictionary"
+        );
+
+        let dict_id = 0xD1C7_0008;
+        let encoder_dict =
+            crate::decoding::Dictionary::from_raw_content(dict_id, raw_dict.clone()).unwrap();
+        let decoder_dict =
+            crate::decoding::Dictionary::from_raw_content(dict_id, raw_dict.clone()).unwrap();
+
+        let mut payload = Vec::new();
+        for idx in 0..512u32 {
+            payload.extend_from_slice(
+                format!(
+                    "tenant=demo table=orders op=put key={idx} value=aaaaabbbbbcccccdddddeeeee\n"
+                )
+                .as_bytes(),
+            );
+        }
+
+        let mut plain = Vec::new();
+        crate::encoding::compress(
+            payload.as_slice(),
+            &mut plain,
+            super::CompressionLevel::Fastest,
+        );
+
+        let mut with_dict = Vec::new();
+        let mut compressor = FrameCompressor::new(super::CompressionLevel::Fastest);
+        compressor.set_dictionary(encoder_dict);
+        compressor.set_source(payload.as_slice());
+        compressor.set_drain(&mut with_dict);
+        compressor.compress();
+
+        let (frame_header, _) = crate::decoding::frame::read_frame_header(with_dict.as_slice())
+            .expect("encoded stream should have a frame header");
+        assert_eq!(frame_header.dictionary_id(), Some(dict_id));
+        assert!(
+            with_dict.len() < plain.len(),
+            "dict_builder dictionary should improve ratio for matching payloads (plain={}, dict={})",
+            plain.len(),
+            with_dict.len()
+        );
+
+        let mut decoder = FrameDecoder::new();
+        decoder.add_dict(decoder_dict).unwrap();
+        let mut decoded = Vec::with_capacity(payload.len());
+        decoder.decode_all_to_vec(&with_dict, &mut decoded).unwrap();
+        assert_eq!(decoded, payload);
+    }
+
     #[cfg(feature = "hash")]
     #[test]
     fn checksum_two_frames_reused_compressor() {
diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs
index d0c91245..e0f4d595 100644
--- a/zstd/src/encoding/match_generator.rs
+++ b/zstd/src/encoding/match_generator.rs
@@ -167,6 +167,28 @@ impl Matcher for MatchGeneratorDriver {
         }
     }
 
+    fn prime_with_dictionary(&mut self, dict_content: &[u8], offset_hist: [u32; 3]) {
+        if dict_content.is_empty() {
+            return;
+        }
+
+        let mut start = 0usize;
+        while start < dict_content.len() {
+            let end = (start + self.slice_size).min(dict_content.len());
+            let mut space = self.get_next_space();
+            space.clear();
+            space.extend_from_slice(&dict_content[start..end]);
+            self.commit_space(space);
+            self.skip_matching();
+            start = end;
+        }
+
+        match self.active_backend {
+            MatcherBackend::Simple => self.match_generator.offset_hist = offset_hist,
+            MatcherBackend::Dfast => self.dfast_matcher_mut().offset_hist = offset_hist,
+        }
+    }
+
     fn window_size(&self) -> u64 {
         match self.active_backend {
             MatcherBackend::Simple => self.match_generator.max_window_size as u64,
diff --git a/zstd/src/encoding/mod.rs b/zstd/src/encoding/mod.rs
index aa640f66..855f8bb6 100644
--- a/zstd/src/encoding/mod.rs
+++ b/zstd/src/encoding/mod.rs
@@ -95,6 +95,9 @@ pub trait Matcher {
     fn start_matching(&mut self, handle_sequence: impl for<'a> FnMut(Sequence<'a>));
     /// Reset this matcher so it can be used for the next new frame
     fn reset(&mut self, level: CompressionLevel);
+    /// Prime matcher state with dictionary history before compressing the next frame.
+    /// Default implementation is a no-op for custom matchers that do not support this.
+    fn prime_with_dictionary(&mut self, _dict_content: &[u8], _offset_hist: [u32; 3]) {}
     /// The size of the window the decoder will need to execute all sequences produced by this matcher
     ///
     /// May change after a call to reset with a different compression level

From 373eef0a28736a8d3c484febfe19a66b234028db Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sat, 28 Mar 2026 22:13:45 +0200
Subject: [PATCH 02/24] fix(encoding): harden dictionary input handling

- reject dictionary id 0 in FrameCompressor::set_dictionary (fail-fast: panics via assert_ne!)

- return explicit DictionaryDecodeError::DictionaryTooSmall on undersized dictionary buffers

- keep dict_tests assets in crate package so include_bytes tests compile downstream
---
 zstd/Cargo.toml                       |  2 +-
 zstd/src/decoding/dictionary.rs       | 46 +++++++++++++++++++++++++++
 zstd/src/decoding/errors.rs           |  7 ++++
 zstd/src/encoding/frame_compressor.rs | 25 +++++++++++++++
 4 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/zstd/Cargo.toml b/zstd/Cargo.toml
index 0d435fc7..052450c1 100644
--- a/zstd/Cargo.toml
+++ b/zstd/Cargo.toml
@@ -11,7 +11,7 @@ license = "Apache-2.0"
 homepage = "https://github.com/structured-world/structured-zstd"
 repository = "https://github.com/structured-world/structured-zstd"
 description = "Pure Rust zstd implementation — managed fork of ruzstd. Dictionary decompression, no FFI."
-exclude = ["dict_tests/*", "fuzz_decodecorpus/*", "decodecorpus_files/*"]
+exclude = ["fuzz_decodecorpus/*", "decodecorpus_files/*"]
 # Package metadata points at a crate-local symlink so the packaged crate and repo root README stay in sync.
 readme = "README.md"
 keywords = ["zstd", "zstandard", "decompression", "compression", "pure-rust"]
diff --git a/zstd/src/decoding/dictionary.rs b/zstd/src/decoding/dictionary.rs
index 072c93c6..8818eb94 100644
--- a/zstd/src/decoding/dictionary.rs
+++ b/zstd/src/decoding/dictionary.rs
@@ -64,6 +64,16 @@ impl Dictionary {
     /// Parses the dictionary from `raw` and set the tables
     /// it returns the dict_id for checking with the frame's `dict_id``
     pub fn decode_dict(raw: &[u8]) -> Result<Dictionary, DictionaryDecodeError> {
+        const MIN_MAGIC_AND_ID_LEN: usize = 8;
+        const OFFSET_HISTORY_LEN: usize = 12;
+
+        if raw.len() < MIN_MAGIC_AND_ID_LEN {
+            return Err(DictionaryDecodeError::DictionaryTooSmall {
+                got: raw.len(),
+                need: MIN_MAGIC_AND_ID_LEN,
+            });
+        }
+
         let mut new_dict = Dictionary {
             id: 0,
             fse: FSEScratch::new(),
@@ -107,6 +117,13 @@ impl Dictionary {
         )?;
         let raw_tables = &raw_tables[ll_size..];
 
+        if raw_tables.len() < OFFSET_HISTORY_LEN {
+            return Err(DictionaryDecodeError::DictionaryTooSmall {
+                got: raw_tables.len(),
+                need: OFFSET_HISTORY_LEN,
+            });
+        }
+
         let offset1 = raw_tables[0..4].try_into().expect("optimized away");
         let offset1 = u32::from_le_bytes(offset1);
 
@@ -126,3 +143,32 @@ impl Dictionary {
         Ok(new_dict)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn decode_dict_rejects_short_buffer_before_magic_and_id() {
+        let err = match Dictionary::decode_dict(&[]) {
+            Ok(_) => panic!("expected short dictionary to fail"),
+            Err(err) => err,
+        };
+        assert!(matches!(
+            err,
+            DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 8 }
+        ));
+    }
+
+    #[test]
+    fn decode_dict_malformed_input_returns_error_instead_of_panicking() {
+        let mut raw = Vec::new();
+        raw.extend_from_slice(&MAGIC_NUM);
+        raw.extend_from_slice(&1u32.to_le_bytes());
+        raw.extend_from_slice(&[0u8; 7]);
+
+        let result = std::panic::catch_unwind(|| Dictionary::decode_dict(&raw));
+        assert!(result.is_ok(), "decode_dict must not panic on malformed input");
+        assert!(result.unwrap().is_err(), "malformed dictionary must return error");
+    }
+}
diff --git a/zstd/src/decoding/errors.rs b/zstd/src/decoding/errors.rs
index ddc9335a..cbccc960 100644
--- a/zstd/src/decoding/errors.rs
+++ b/zstd/src/decoding/errors.rs
@@ -425,6 +425,7 @@ impl core::fmt::Display for DecodeBufferError {
 #[non_exhaustive]
 pub enum DictionaryDecodeError {
     BadMagicNum { got: [u8; 4] },
+    DictionaryTooSmall { got: usize, need: usize },
     ZeroDictionaryId,
     FSETableError(FSETableError),
     HuffmanTableError(HuffmanTableError),
@@ -452,6 +453,12 @@ impl core::fmt::Display for DictionaryDecodeError {
                     crate::decoding::dictionary::MAGIC_NUM,
                 )
             }
+            DictionaryDecodeError::DictionaryTooSmall { got, need } => {
+                write!(
+                    f,
+                    "Dictionary is too small: got {got} bytes, need at least {need} bytes",
+                )
+            }
             DictionaryDecodeError::ZeroDictionaryId => {
                 write!(f, "Dictionary id must be non-zero")
             }
diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index 413837ec..bfc4c362 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -319,6 +319,10 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
         &mut self,
         dictionary: crate::decoding::Dictionary,
     ) -> Option<crate::decoding::Dictionary> {
+        assert_ne!(
+            dictionary.id, 0,
+            "FrameCompressor::set_dictionary: dictionary.id must be non-zero (0 means 'no dictionary' in the frame header)."
+        );
         self.dictionary.replace(dictionary)
     }
 
@@ -575,6 +579,27 @@ mod tests {
         assert_eq!(decoded, payload);
     }
 
+    #[test]
+    #[should_panic(
+        expected = "FrameCompressor::set_dictionary: dictionary.id must be non-zero (0 means 'no dictionary' in the frame header)."
+    )]
+    fn set_dictionary_rejects_zero_dictionary_id() {
+        let invalid = crate::decoding::Dictionary {
+            id: 0,
+            fse: crate::decoding::scratch::FSEScratch::new(),
+            huf: crate::decoding::scratch::HuffmanScratch::new(),
+            dict_content: vec![1, 2, 3],
+            offset_hist: [1, 4, 8],
+        };
+
+        let mut compressor: FrameCompressor<
+            &[u8],
+            Vec<u8>,
+            crate::encoding::match_generator::MatchGeneratorDriver,
+        > = FrameCompressor::new(super::CompressionLevel::Fastest);
+        let _ = compressor.set_dictionary(invalid);
+    }
+
     #[cfg(feature = "hash")]
     #[test]
     fn checksum_two_frames_reused_compressor() {

From 05a712cbddd26216cd409b734be5e6452bdded1b Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sat, 28 Mar 2026 22:16:31 +0200
Subject: [PATCH 03/24] style(tests): format dictionary panic regression
 asserts

---
 zstd/src/decoding/dictionary.rs | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/zstd/src/decoding/dictionary.rs b/zstd/src/decoding/dictionary.rs
index 8818eb94..8f22ecea 100644
--- a/zstd/src/decoding/dictionary.rs
+++ b/zstd/src/decoding/dictionary.rs
@@ -168,7 +168,13 @@ mod tests {
         raw.extend_from_slice(&[0u8; 7]);
 
         let result = std::panic::catch_unwind(|| Dictionary::decode_dict(&raw));
-        assert!(result.is_ok(), "decode_dict must not panic on malformed input");
-        assert!(result.unwrap().is_err(), "malformed dictionary must return error");
+        assert!(
+            result.is_ok(),
+            "decode_dict must not panic on malformed input"
+        );
+        assert!(
+            result.unwrap().is_err(),
+            "malformed dictionary must return error"
+        );
     }
 }

From dc4ec49f98c1ca9603812967be2c1de27b92cd91 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sat, 28 Mar 2026 22:25:54 +0200
Subject: [PATCH 04/24] fix(dict): align defaults and clarify dictionary
 priming

- use RFC default repeat offsets [1,4,8] in decode_dict initialization

- document intentional dual offset history priming in compressor state and matcher

- document fail-fast zero-id contract for set_dictionary
---
 zstd/src/decoding/dictionary.rs       | 2 +-
 zstd/src/encoding/frame_compressor.rs | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/zstd/src/decoding/dictionary.rs b/zstd/src/decoding/dictionary.rs
index 8f22ecea..437dd4bc 100644
--- a/zstd/src/decoding/dictionary.rs
+++ b/zstd/src/decoding/dictionary.rs
@@ -79,7 +79,7 @@ impl Dictionary {
             fse: FSEScratch::new(),
             huf: HuffmanScratch::new(),
             dict_content: Vec::new(),
-            offset_hist: [2, 4, 8],
+            offset_hist: [1, 4, 8],
         };
 
         let magic_num: [u8; 4] = raw[..4].try_into().expect("optimized away");
diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index bfc4c362..ed03e0df 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -162,6 +162,8 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
         self.state.fse_tables.of_previous = None;
         self.state.offset_hist = [1, 4, 8];
         if let Some(dict) = self.dictionary.as_ref() {
+            // This state drives sequence encoding, while matcher priming below updates
+            // the match generator's internal repeat-offset history for match finding.
             self.state.offset_hist = dict.offset_hist;
             self.state
                 .matcher
@@ -319,6 +321,8 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
         &mut self,
         dictionary: crate::decoding::Dictionary,
     ) -> Option<crate::decoding::Dictionary> {
+        // Keep this as a fail-fast contract for manually-constructed dictionaries.
+        // Parsing helpers already return Result and reject id==0 at the boundary.
         assert_ne!(
             dictionary.id, 0,
             "FrameCompressor::set_dictionary: dictionary.id must be non-zero (0 means 'no dictionary' in the frame header)."

From 9093834a77646480f5c43857c414e62c7315c87a Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sat, 28 Mar 2026 22:27:11 +0200
Subject: [PATCH 05/24] test(packaging): handle must_use and trim crate payload

- assert first set_dictionary_from_bytes insert returns None

- explicitly discard optional previous dictionary in dict_builder roundtrip test

- exclude dict_tests/files/** while keeping dict_tests/dictionary for include_bytes tests
---
 zstd/Cargo.toml                       | 2 +-
 zstd/src/encoding/frame_compressor.rs | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/zstd/Cargo.toml b/zstd/Cargo.toml
index 052450c1..ee02ca61 100644
--- a/zstd/Cargo.toml
+++ b/zstd/Cargo.toml
@@ -11,7 +11,7 @@ license = "Apache-2.0"
 homepage = "https://github.com/structured-world/structured-zstd"
 repository = "https://github.com/structured-world/structured-zstd"
 description = "Pure Rust zstd implementation — managed fork of ruzstd. Dictionary decompression, no FFI."
-exclude = ["fuzz_decodecorpus/*", "decodecorpus_files/*"]
+exclude = ["fuzz_decodecorpus/*", "decodecorpus_files/*", "dict_tests/files/**"]
 # Package metadata points at a crate-local symlink so the packaged crate and repo root README stay in sync.
 readme = "README.md"
 keywords = ["zstd", "zstandard", "decompression", "compression", "pure-rust"]
diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index ed03e0df..c1da346e 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -460,9 +460,13 @@ mod tests {
 
         let mut with_dict = Vec::new();
         let mut compressor = FrameCompressor::new(super::CompressionLevel::Fastest);
-        compressor
+        let previous = compressor
             .set_dictionary_from_bytes(dict_raw)
             .expect("dictionary bytes should parse");
+        assert!(
+            previous.is_none(),
+            "first dictionary insert should return None"
+        );
         assert_eq!(
             compressor
                 .set_dictionary(dict_for_encoder)
@@ -561,7 +565,7 @@ mod tests {
 
         let mut with_dict = Vec::new();
         let mut compressor = FrameCompressor::new(super::CompressionLevel::Fastest);
-        compressor.set_dictionary(encoder_dict);
+        let _ = compressor.set_dictionary(encoder_dict);
         compressor.set_source(payload.as_slice());
         compressor.set_drain(&mut with_dict);
         compressor.compress();

From f12d234c3a0b3d2c76858fa0840e0ccc68afc2cb Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sat, 28 Mar 2026 22:49:20 +0200
Subject: [PATCH 06/24] fix(dict): seed encoder entropy and reject zero
 repcodes

- restore previous Huffman/FSE encoder tables from parsed dictionaries before first block

- convert decoder-side entropy tables into encoder tables for dictionary priming

- reject zero repeat offsets during dictionary parsing with explicit decode error

- add regression tests for entropy seeding and zero-repeat-offset rejection
---
 zstd/src/decoding/dictionary.rs       | 65 +++++++++++++++++++++++++++
 zstd/src/decoding/errors.rs           |  4 ++
 zstd/src/encoding/frame_compressor.rs | 49 ++++++++++++++++++++
 zstd/src/fse/fse_decoder.rs           | 12 +++++
 zstd/src/huff0/huff0_decoder.rs       | 22 +++++++++
 5 files changed, 152 insertions(+)

diff --git a/zstd/src/decoding/dictionary.rs b/zstd/src/decoding/dictionary.rs
index 437dd4bc..54fd0821 100644
--- a/zstd/src/decoding/dictionary.rs
+++ b/zstd/src/decoding/dictionary.rs
@@ -133,6 +133,16 @@ impl Dictionary {
         let offset3 = raw_tables[8..12].try_into().expect("optimized away");
         let offset3 = u32::from_le_bytes(offset3);
 
+        if offset1 == 0 {
+            return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 0 });
+        }
+        if offset2 == 0 {
+            return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 1 });
+        }
+        if offset3 == 0 {
+            return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 2 });
+        }
+
         new_dict.offset_hist[0] = offset1;
         new_dict.offset_hist[1] = offset2;
         new_dict.offset_hist[2] = offset3;
@@ -148,6 +158,47 @@ impl Dictionary {
 mod tests {
     use super::*;
 
+    fn offset_history_start(raw: &[u8]) -> usize {
+        let mut huf = crate::decoding::scratch::HuffmanScratch::new();
+        let mut fse = crate::decoding::scratch::FSEScratch::new();
+        let mut cursor = 8usize;
+
+        let huf_size = huf
+            .table
+            .build_decoder(&raw[cursor..])
+            .expect("reference dictionary huffman table should decode");
+        cursor += huf_size as usize;
+
+        let of_size = fse
+            .offsets
+            .build_decoder(
+                &raw[cursor..],
+                crate::decoding::sequence_section_decoder::OF_MAX_LOG,
+            )
+            .expect("reference dictionary OF table should decode");
+        cursor += of_size;
+
+        let ml_size = fse
+            .match_lengths
+            .build_decoder(
+                &raw[cursor..],
+                crate::decoding::sequence_section_decoder::ML_MAX_LOG,
+            )
+            .expect("reference dictionary ML table should decode");
+        cursor += ml_size;
+
+        let ll_size = fse
+            .literal_lengths
+            .build_decoder(
+                &raw[cursor..],
+                crate::decoding::sequence_section_decoder::LL_MAX_LOG,
+            )
+            .expect("reference dictionary LL table should decode");
+        cursor += ll_size;
+
+        cursor
+    }
+
     #[test]
     fn decode_dict_rejects_short_buffer_before_magic_and_id() {
         let err = match Dictionary::decode_dict(&[]) {
@@ -177,4 +228,18 @@ mod tests {
             "malformed dictionary must return error"
         );
     }
+
+    #[test]
+    fn decode_dict_rejects_zero_repeat_offsets() {
+        let mut raw = include_bytes!("../../dict_tests/dictionary").to_vec();
+        let offset_start = offset_history_start(&raw);
+
+        // Corrupt rep0 to zero.
+        raw[offset_start..offset_start + 4].copy_from_slice(&0u32.to_le_bytes());
+        let decoded = Dictionary::decode_dict(&raw);
+        assert!(
+            decoded.is_err(),
+            "dictionary with zero repeat offset must be rejected"
+        );
+    }
 }
diff --git a/zstd/src/decoding/errors.rs b/zstd/src/decoding/errors.rs
index cbccc960..06a0085b 100644
--- a/zstd/src/decoding/errors.rs
+++ b/zstd/src/decoding/errors.rs
@@ -427,6 +427,7 @@ pub enum DictionaryDecodeError {
     BadMagicNum { got: [u8; 4] },
     DictionaryTooSmall { got: usize, need: usize },
     ZeroDictionaryId,
+    ZeroRepeatOffsetInDictionary { index: u8 },
     FSETableError(FSETableError),
     HuffmanTableError(HuffmanTableError),
 }
@@ -462,6 +463,9 @@ impl core::fmt::Display for DictionaryDecodeError {
             DictionaryDecodeError::ZeroDictionaryId => {
                 write!(f, "Dictionary id must be non-zero")
             }
+            DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index } => {
+                write!(f, "Dictionary repeat offset rep{index} must be non-zero")
+            }
             DictionaryDecodeError::FSETableError(e) => write!(f, "{e:?}"),
             DictionaryDecodeError::HuffmanTableError(e) => write!(f, "{e:?}"),
         }
diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index c1da346e..64f9ed8e 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -168,6 +168,21 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
             self.state
                 .matcher
                 .prime_with_dictionary(dict.dict_content.as_slice(), dict.offset_hist);
+            if let Some(huff_table) = dict.huf.table.to_encoder_table() {
+                self.state.last_huff_table = Some(huff_table);
+            }
+            if let Some(ll_previous) = dict.fse.literal_lengths.to_encoder_table() {
+                self.state.fse_tables.ll_previous =
+                    Some(PreviousFseTable::Custom(Box::new(ll_previous)));
+            }
+            if let Some(ml_previous) = dict.fse.match_lengths.to_encoder_table() {
+                self.state.fse_tables.ml_previous =
+                    Some(PreviousFseTable::Custom(Box::new(ml_previous)));
+            }
+            if let Some(of_previous) = dict.fse.offsets.to_encoder_table() {
+                self.state.fse_tables.of_previous =
+                    Some(PreviousFseTable::Custom(Box::new(of_previous)));
+            }
         }
         #[cfg(feature = "hash")]
         {
@@ -587,6 +602,40 @@ mod tests {
         assert_eq!(decoded, payload);
     }
 
+    #[test]
+    fn set_dictionary_from_bytes_seeds_entropy_tables_for_first_block() {
+        let dict_raw = include_bytes!("../../dict_tests/dictionary");
+        let mut output = Vec::new();
+        let input = b"short-payload-without-obvious-repetitions";
+
+        let mut compressor = FrameCompressor::new(super::CompressionLevel::Fastest);
+        let previous = compressor
+            .set_dictionary_from_bytes(dict_raw)
+            .expect("dictionary bytes should parse");
+        assert!(previous.is_none());
+
+        compressor.set_source(input.as_slice());
+        compressor.set_drain(&mut output);
+        compressor.compress();
+
+        assert!(
+            compressor.state.last_huff_table.is_some(),
+            "dictionary entropy should seed previous huffman table before first block"
+        );
+        assert!(
+            compressor.state.fse_tables.ll_previous.is_some(),
+            "dictionary entropy should seed previous ll table before first block"
+        );
+        assert!(
+            compressor.state.fse_tables.ml_previous.is_some(),
+            "dictionary entropy should seed previous ml table before first block"
+        );
+        assert!(
+            compressor.state.fse_tables.of_previous.is_some(),
+            "dictionary entropy should seed previous of table before first block"
+        );
+    }
+
     #[test]
     #[should_panic(
         expected = "FrameCompressor::set_dictionary: dictionary.id must be non-zero (0 means 'no dictionary' in the frame header)."
diff --git a/zstd/src/fse/fse_decoder.rs b/zstd/src/fse/fse_decoder.rs
index 7cd59dc6..8d05e142 100644
--- a/zstd/src/fse/fse_decoder.rs
+++ b/zstd/src/fse/fse_decoder.rs
@@ -112,6 +112,18 @@ impl FSETable {
         self.accuracy_log = 0;
     }
 
+    /// Build the equivalent encoder-side table from a parsed decoder table.
+    pub(crate) fn to_encoder_table(&self) -> Option<crate::fse::fse_encoder::FSETable> {
+        if self.accuracy_log == 0 || self.symbol_probabilities.is_empty() {
+            return None;
+        }
+
+        Some(crate::fse::fse_encoder::build_table_from_probabilities(
+            &self.symbol_probabilities,
+            self.accuracy_log,
+        ))
+    }
+
     /// returns how many BYTEs (not bits) were read while building the decoder
     pub fn build_decoder(&mut self, source: &[u8], max_log: u8) -> Result<usize, FSETableError> {
         self.accuracy_log = 0;
diff --git a/zstd/src/huff0/huff0_decoder.rs b/zstd/src/huff0/huff0_decoder.rs
index 1952aea3..b220cdc5 100644
--- a/zstd/src/huff0/huff0_decoder.rs
+++ b/zstd/src/huff0/huff0_decoder.rs
@@ -111,6 +111,28 @@ impl HuffmanTable {
         self.fse_table.reset();
     }
 
+    /// Build the equivalent encoder-side Huffman table from parsed weights.
+    pub(crate) fn to_encoder_table(&self) -> Option<crate::huff0::huff0_encoder::HuffmanTable> {
+        if self.bits.is_empty() || self.max_num_bits == 0 {
+            return None;
+        }
+
+        let max_bits = usize::from(self.max_num_bits);
+        let weights = self
+            .bits
+            .iter()
+            .copied()
+            .map(|num_bits| {
+                if num_bits == 0 {
+                    0
+                } else {
+                    max_bits - usize::from(num_bits) + 1
+                }
+            })
+            .collect::<Vec<_>>();
+        Some(crate::huff0::huff0_encoder::HuffmanTable::build_from_weights(&weights))
+    }
+
     /// Read from `source` and decode the input, populating the huffman decoding table.
     ///
     /// Returns the number of bytes read.

From 0fd86e8f97ab6c8fad5bb7ed82027f3a6e902f84 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sat, 28 Mar 2026 23:36:28 +0200
Subject: [PATCH 07/24] perf(encoding): cache dictionary entropy tables

- precompute decoder->encoder entropy conversions when dictionary is set

- reuse cached tables across compress() calls to avoid per-frame rebuild

- keep explicit fail-fast comment for zero dictionary id API contract

- derive Clone for encoder HuffmanTable to support cache reuse
---
 zstd/src/encoding/frame_compressor.rs | 47 +++++++++++++++++++++++----
 zstd/src/huff0/huff0_encoder.rs       |  1 +
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index 64f9ed8e..685fa991 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -40,11 +40,20 @@ pub struct FrameCompressor<R: Read, W: Write, M: Matcher> {
     compressed_data: Option<W>,
     compression_level: CompressionLevel,
     dictionary: Option<crate::decoding::Dictionary>,
+    dictionary_entropy_cache: Option<CachedDictionaryEntropy>,
     state: CompressState<M>,
     #[cfg(feature = "hash")]
     hasher: XxHash64,
 }
 
+#[derive(Clone, Default)]
+struct CachedDictionaryEntropy {
+    huff: Option<crate::huff0::huff0_encoder::HuffmanTable>,
+    ll_previous: Option<FSETable>,
+    ml_previous: Option<FSETable>,
+    of_previous: Option<FSETable>,
+}
+
 #[derive(Clone)]
 pub(crate) enum PreviousFseTable {
     // Default tables are immutable and already stored alongside the state, so
@@ -101,6 +110,7 @@ impl<R: Read, W: Write> FrameCompressor<R, W, MatchGeneratorDriver> {
             compressed_data: None,
             compression_level,
             dictionary: None,
+            dictionary_entropy_cache: None,
             state: CompressState {
                 matcher: MatchGeneratorDriver::new(1024 * 128, 1),
                 last_huff_table: None,
@@ -120,6 +130,7 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
             uncompressed_data: None,
             compressed_data: None,
             dictionary: None,
+            dictionary_entropy_cache: None,
             state: CompressState {
                 matcher,
                 last_huff_table: None,
@@ -168,18 +179,34 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
             self.state
                 .matcher
                 .prime_with_dictionary(dict.dict_content.as_slice(), dict.offset_hist);
-            if let Some(huff_table) = dict.huf.table.to_encoder_table() {
+            if let Some(huff_table) = self
+                .dictionary_entropy_cache
+                .as_ref()
+                .and_then(|cache| cache.huff.clone())
+            {
                 self.state.last_huff_table = Some(huff_table);
             }
-            if let Some(ll_previous) = dict.fse.literal_lengths.to_encoder_table() {
+            if let Some(ll_previous) = self
+                .dictionary_entropy_cache
+                .as_ref()
+                .and_then(|cache| cache.ll_previous.clone())
+            {
                 self.state.fse_tables.ll_previous =
                     Some(PreviousFseTable::Custom(Box::new(ll_previous)));
             }
-            if let Some(ml_previous) = dict.fse.match_lengths.to_encoder_table() {
+            if let Some(ml_previous) = self
+                .dictionary_entropy_cache
+                .as_ref()
+                .and_then(|cache| cache.ml_previous.clone())
+            {
                 self.state.fse_tables.ml_previous =
                     Some(PreviousFseTable::Custom(Box::new(ml_previous)));
             }
-            if let Some(of_previous) = dict.fse.offsets.to_encoder_table() {
+            if let Some(of_previous) = self
+                .dictionary_entropy_cache
+                .as_ref()
+                .and_then(|cache| cache.of_previous.clone())
+            {
                 self.state.fse_tables.of_previous =
                     Some(PreviousFseTable::Custom(Box::new(of_previous)));
             }
@@ -336,12 +363,19 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
         &mut self,
         dictionary: crate::decoding::Dictionary,
     ) -> Option<crate::decoding::Dictionary> {
-        // Keep this as a fail-fast contract for manually-constructed dictionaries.
-        // Parsing helpers already return Result and reject id==0 at the boundary.
+        // Keep this as a fail-fast contract for manually-constructed dictionaries:
+        // id=0 would produce a malformed frame header (signals "no dictionary").
+        // Parsing helpers stay fallible and reject zero-id at input boundaries.
         assert_ne!(
             dictionary.id, 0,
             "FrameCompressor::set_dictionary: dictionary.id must be non-zero (0 means 'no dictionary' in the frame header)."
         );
+        self.dictionary_entropy_cache = Some(CachedDictionaryEntropy {
+            huff: dictionary.huf.table.to_encoder_table(),
+            ll_previous: dictionary.fse.literal_lengths.to_encoder_table(),
+            ml_previous: dictionary.fse.match_lengths.to_encoder_table(),
+            of_previous: dictionary.fse.offsets.to_encoder_table(),
+        });
         self.dictionary.replace(dictionary)
     }
 
@@ -357,6 +391,7 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
 
     /// Remove the attached dictionary.
     pub fn clear_dictionary(&mut self) -> Option<crate::decoding::Dictionary> {
+        self.dictionary_entropy_cache = None;
         self.dictionary.take()
     }
 }
diff --git a/zstd/src/huff0/huff0_encoder.rs b/zstd/src/huff0/huff0_encoder.rs
index 7d35fc32..828c056f 100644
--- a/zstd/src/huff0/huff0_encoder.rs
+++ b/zstd/src/huff0/huff0_encoder.rs
@@ -150,6 +150,7 @@ impl<V: AsMut<Vec<u8>>> HuffmanEncoder<'_, '_, V> {
     }
 }
 
+#[derive(Clone)]
 pub struct HuffmanTable {
     /// Index is the symbol, values are the bitstring in the lower bits of the u32 and the amount of bits in the u8
     codes: Vec<(u32, u8)>,

From 7d8a5f050aa2fb5d6ec276d6727b68d1fa1c21f7 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sat, 28 Mar 2026 23:51:08 +0200
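Subject: [PATCH 08/24] fix(encoding): harden dictionary priming suffix reuse

- stop priming once the remaining dictionary tail is shorter than
  MIN_MATCH_LEN, so a tiny tail chunk is never committed to the matcher
- return slot 0 from SuffixStore::key when len_log is 0 (capacity-1 store)
- add regression tests for the tiny-tail and single-slot cases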
---
 zstd/src/encoding/match_generator.rs | 40 ++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs
index e0f4d595..c565a199 100644
--- a/zstd/src/encoding/match_generator.rs
+++ b/zstd/src/encoding/match_generator.rs
@@ -175,6 +175,9 @@ impl Matcher for MatchGeneratorDriver {
         let mut start = 0usize;
         while start < dict_content.len() {
             let end = (start + self.slice_size).min(dict_content.len());
+            if end - start < MIN_MATCH_LEN {
+                break;
+            }
             let mut space = self.get_next_space();
             space.clear();
             space.extend_from_slice(&dict_content[start..end]);
@@ -298,6 +301,10 @@ impl SuffixStore {
 
     #[inline(always)]
     fn key(&self, suffix: &[u8]) -> usize {
+        if self.len_log == 0 {
+            return 0;
+        }
+
         let s0 = suffix[0] as u64;
         let s1 = suffix[1] as u64;
         let s2 = suffix[2] as u64;
@@ -1314,6 +1321,39 @@ fn driver_switches_backends_and_initializes_dfast_via_reset() {
     assert_eq!(driver.window_size(), 64);
 }
 
+#[test]
+fn prime_with_dictionary_does_not_reuse_tiny_suffix_store() {
+    let mut driver = MatchGeneratorDriver::new(8, 2);
+    driver.reset(CompressionLevel::Fastest);
+
+    // This dictionary leaves a 1-byte tail chunk (capacity=1 suffix table),
+    // which should never be committed to the matcher window.
+    driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);
+
+    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+        for block in [b"mnopqrstu", b"vwxyzabcd", b"efghijklm"] {
+            let mut space = driver.get_next_space();
+            space.clear();
+            space.extend_from_slice(block);
+            driver.commit_space(space);
+            driver.skip_matching();
+        }
+    }));
+
+    assert!(
+        result.is_ok(),
+        "tiny dictionary tail must not poison suffix store reuse"
+    );
+}
+
+#[test]
+fn suffix_store_with_single_slot_does_not_panic_on_keying() {
+    let mut suffixes = SuffixStore::with_capacity(1);
+    suffixes.insert(b"abcde", 0);
+    assert!(suffixes.contains_key(b"abcde"));
+    assert_eq!(suffixes.get(b"abcde"), Some(0));
+}
+
 #[test]
 fn fastest_reset_uses_interleaved_hash_fill_step() {
     let mut driver = MatchGeneratorDriver::new(32, 2);

From 253e788cc6c181bc8b9fa3810c49db7de5dacc90 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 00:07:25 +0200
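Subject: [PATCH 09/24] fix(encoding): harden dictionary priming invariants

- make set_dictionary return Result and report ZeroDictionaryId instead of
  panicking on a zero dictionary id
- apply the dictionary offset history before the empty-content early return
- grow the matcher window by min(dict_content.len(), slice_size) so primed
  history is not evicted by the first full block
- update existing tests and add coverage for both priming changes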
---
 zstd/src/encoding/frame_compressor.rs | 31 ++++++------
 zstd/src/encoding/match_generator.rs  | 70 +++++++++++++++++++++++++--
 2 files changed, 81 insertions(+), 20 deletions(-)

diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index 685fa991..3107543a 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -362,21 +362,18 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
     pub fn set_dictionary(
         &mut self,
         dictionary: crate::decoding::Dictionary,
-    ) -> Option<crate::decoding::Dictionary> {
-        // Keep this as a fail-fast contract for manually-constructed dictionaries:
-        // id=0 would produce a malformed frame header (signals "no dictionary").
-        // Parsing helpers stay fallible and reject zero-id at input boundaries.
-        assert_ne!(
-            dictionary.id, 0,
-            "FrameCompressor::set_dictionary: dictionary.id must be non-zero (0 means 'no dictionary' in the frame header)."
-        );
+    ) -> Result<Option<crate::decoding::Dictionary>, crate::decoding::errors::DictionaryDecodeError>
+    {
+        if dictionary.id == 0 {
+            return Err(crate::decoding::errors::DictionaryDecodeError::ZeroDictionaryId);
+        }
         self.dictionary_entropy_cache = Some(CachedDictionaryEntropy {
             huff: dictionary.huf.table.to_encoder_table(),
             ll_previous: dictionary.fse.literal_lengths.to_encoder_table(),
             ml_previous: dictionary.fse.match_lengths.to_encoder_table(),
             of_previous: dictionary.fse.offsets.to_encoder_table(),
         });
-        self.dictionary.replace(dictionary)
+        Ok(self.dictionary.replace(dictionary))
     }
 
     /// Parse and attach a serialized dictionary blob.
@@ -386,7 +383,7 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
     ) -> Result<Option<crate::decoding::Dictionary>, crate::decoding::errors::DictionaryDecodeError>
     {
         let dictionary = crate::decoding::Dictionary::decode_dict(raw_dictionary)?;
-        Ok(self.set_dictionary(dictionary))
+        self.set_dictionary(dictionary)
     }
 
     /// Remove the attached dictionary.
@@ -520,6 +517,7 @@ mod tests {
         assert_eq!(
             compressor
                 .set_dictionary(dict_for_encoder)
+                .expect("valid dictionary should attach")
                 .expect("set_dictionary_from_bytes inserted previous dictionary")
                 .id,
             dict_for_decoder.id
@@ -615,7 +613,9 @@ mod tests {
 
         let mut with_dict = Vec::new();
         let mut compressor = FrameCompressor::new(super::CompressionLevel::Fastest);
-        let _ = compressor.set_dictionary(encoder_dict);
+        compressor
+            .set_dictionary(encoder_dict)
+            .expect("valid dict_builder dictionary should attach");
         compressor.set_source(payload.as_slice());
         compressor.set_drain(&mut with_dict);
         compressor.compress();
@@ -672,9 +672,6 @@ mod tests {
     }
 
     #[test]
-    #[should_panic(
-        expected = "FrameCompressor::set_dictionary: dictionary.id must be non-zero (0 means 'no dictionary' in the frame header)."
-    )]
     fn set_dictionary_rejects_zero_dictionary_id() {
         let invalid = crate::decoding::Dictionary {
             id: 0,
@@ -689,7 +686,11 @@ mod tests {
             Vec<u8>,
             crate::encoding::match_generator::MatchGeneratorDriver,
         > = FrameCompressor::new(super::CompressionLevel::Fastest);
-        let _ = compressor.set_dictionary(invalid);
+        let result = compressor.set_dictionary(invalid);
+        assert!(matches!(
+            result,
+            Err(crate::decoding::errors::DictionaryDecodeError::ZeroDictionaryId)
+        ));
     }
 
     #[cfg(feature = "hash")]
diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs
index c565a199..6e47627d 100644
--- a/zstd/src/encoding/match_generator.rs
+++ b/zstd/src/encoding/match_generator.rs
@@ -168,10 +168,32 @@ impl Matcher for MatchGeneratorDriver {
     }
 
     fn prime_with_dictionary(&mut self, dict_content: &[u8], offset_hist: [u32; 3]) {
+        match self.active_backend {
+            MatcherBackend::Simple => self.match_generator.offset_hist = offset_hist,
+            MatcherBackend::Dfast => self.dfast_matcher_mut().offset_hist = offset_hist,
+        }
+
         if dict_content.is_empty() {
             return;
         }
 
+        // Keep enough budget so dictionary-primed history survives adding
+        // one full data block (otherwise reserve() evicts dictionary first).
+        let retained_dict_budget = dict_content.len().min(self.slice_size);
+        match self.active_backend {
+            MatcherBackend::Simple => {
+                self.match_generator.max_window_size = self
+                    .match_generator
+                    .max_window_size
+                    .saturating_add(retained_dict_budget);
+            }
+            MatcherBackend::Dfast => {
+                let matcher = self.dfast_matcher_mut();
+                matcher.max_window_size =
+                    matcher.max_window_size.saturating_add(retained_dict_budget);
+            }
+        }
+
         let mut start = 0usize;
         while start < dict_content.len() {
             let end = (start + self.slice_size).min(dict_content.len());
@@ -185,11 +207,6 @@ impl Matcher for MatchGeneratorDriver {
             self.skip_matching();
             start = end;
         }
-
-        match self.active_backend {
-            MatcherBackend::Simple => self.match_generator.offset_hist = offset_hist,
-            MatcherBackend::Dfast => self.dfast_matcher_mut().offset_hist = offset_hist,
-        }
     }
 
     fn window_size(&self) -> u64 {
@@ -1321,6 +1338,49 @@ fn driver_switches_backends_and_initializes_dfast_via_reset() {
     assert_eq!(driver.window_size(), 64);
 }
 
+#[test]
+fn prime_with_dictionary_preserves_history_for_first_full_block() {
+    let mut driver = MatchGeneratorDriver::new(8, 1);
+    driver.reset(CompressionLevel::Fastest);
+
+    driver.prime_with_dictionary(b"abcdefgh", [1, 4, 8]);
+
+    let mut space = driver.get_next_space();
+    space.clear();
+    space.extend_from_slice(b"abcdefgh");
+    driver.commit_space(space);
+
+    let mut saw_match = false;
+    driver.start_matching(|seq| {
+        if let Sequence::Triple {
+            literals,
+            offset,
+            match_len,
+        } = seq
+            && literals.is_empty()
+            && offset == 8
+            && match_len >= MIN_MATCH_LEN
+        {
+            saw_match = true;
+        }
+    });
+
+    assert!(
+        saw_match,
+        "first full block should still match dictionary-primed history"
+    );
+}
+
+#[test]
+fn prime_with_dictionary_applies_offset_history_even_when_content_is_empty() {
+    let mut driver = MatchGeneratorDriver::new(8, 1);
+    driver.reset(CompressionLevel::Fastest);
+
+    driver.prime_with_dictionary(&[], [11, 7, 3]);
+
+    assert_eq!(driver.match_generator.offset_hist, [11, 7, 3]);
+}
+
 #[test]
 fn prime_with_dictionary_does_not_reuse_tiny_suffix_store() {
     let mut driver = MatchGeneratorDriver::new(8, 2);

From c8c477b10ee7aa14565e4dbb43c10c9e4745bbc9 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 00:54:11 +0200
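Subject: [PATCH 10/24] fix(encoding): validate repcodes and retain dict
 history

- reject dictionaries whose offset_hist contains a zero entry with
  ZeroRepeatOffsetInDictionary { index }
- retain the full dictionary length in the matcher window instead of capping
  the retention budget at slice_size
- add tests for zero repeat offsets and large-dictionary early history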
---
 zstd/src/encoding/frame_compressor.rs | 33 +++++++++++++++++++++++
 zstd/src/encoding/match_generator.rs  | 39 ++++++++++++++++++++++++---
 2 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index 3107543a..02ed6f7e 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -367,6 +367,13 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
         if dictionary.id == 0 {
             return Err(crate::decoding::errors::DictionaryDecodeError::ZeroDictionaryId);
         }
+        if let Some(index) = dictionary.offset_hist.iter().position(|&rep| rep == 0) {
+            return Err(
+                crate::decoding::errors::DictionaryDecodeError::ZeroRepeatOffsetInDictionary {
+                    index: index as u8,
+                },
+            );
+        }
         self.dictionary_entropy_cache = Some(CachedDictionaryEntropy {
             huff: dictionary.huf.table.to_encoder_table(),
             ll_previous: dictionary.fse.literal_lengths.to_encoder_table(),
@@ -693,6 +700,32 @@ mod tests {
         ));
     }
 
+    #[test]
+    fn set_dictionary_rejects_zero_repeat_offsets() {
+        let invalid = crate::decoding::Dictionary {
+            id: 1,
+            fse: crate::decoding::scratch::FSEScratch::new(),
+            huf: crate::decoding::scratch::HuffmanScratch::new(),
+            dict_content: vec![1, 2, 3],
+            offset_hist: [0, 4, 8],
+        };
+
+        let mut compressor: FrameCompressor<
+            &[u8],
+            Vec<u8>,
+            crate::encoding::match_generator::MatchGeneratorDriver,
+        > = FrameCompressor::new(super::CompressionLevel::Fastest);
+        let result = compressor.set_dictionary(invalid);
+        assert!(matches!(
+            result,
+            Err(
+                crate::decoding::errors::DictionaryDecodeError::ZeroRepeatOffsetInDictionary {
+                    index: 0
+                }
+            )
+        ));
+    }
+
     #[cfg(feature = "hash")]
     #[test]
     fn checksum_two_frames_reused_compressor() {
diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs
index 6e47627d..b07a9acb 100644
--- a/zstd/src/encoding/match_generator.rs
+++ b/zstd/src/encoding/match_generator.rs
@@ -177,9 +177,9 @@ impl Matcher for MatchGeneratorDriver {
             return;
         }
 
-        // Keep enough budget so dictionary-primed history survives adding
-        // one full data block (otherwise reserve() evicts dictionary first).
-        let retained_dict_budget = dict_content.len().min(self.slice_size);
+        // Dictionary bytes should stay addressable until produced frame output
+        // itself exceeds the live window size.
+        let retained_dict_budget = dict_content.len();
         match self.active_backend {
             MatcherBackend::Simple => {
                 self.match_generator.max_window_size = self
@@ -1371,6 +1371,39 @@ fn prime_with_dictionary_preserves_history_for_first_full_block() {
     );
 }
 
+#[test]
+fn prime_with_large_dictionary_preserves_early_history_until_first_block() {
+    let mut driver = MatchGeneratorDriver::new(8, 1);
+    driver.reset(CompressionLevel::Fastest);
+
+    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
+
+    let mut space = driver.get_next_space();
+    space.clear();
+    space.extend_from_slice(b"abcdefgh");
+    driver.commit_space(space);
+
+    let mut saw_match = false;
+    driver.start_matching(|seq| {
+        if let Sequence::Triple {
+            literals,
+            offset,
+            match_len,
+        } = seq
+            && literals.is_empty()
+            && offset == 24
+            && match_len >= MIN_MATCH_LEN
+        {
+            saw_match = true;
+        }
+    });
+
+    assert!(
+        saw_match,
+        "dictionary bytes should remain addressable until frame output exceeds the live window"
+    );
+}
+
 #[test]
 fn prime_with_dictionary_applies_offset_history_even_when_content_is_empty() {
     let mut driver = MatchGeneratorDriver::new(8, 1);

From 5770877cd753bffa18e7bde041da19ba80de6dfa Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 01:04:57 +0200
Subject: [PATCH 11/24] test(encoding): avoid moving frame decode error in
 assert

---
 zstd/src/encoding/frame_compressor.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index 02ed6f7e..f7e4cca8 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -551,7 +551,7 @@ mod tests {
             .unwrap_err();
         assert!(
             matches!(
-                err,
+                &err,
                 crate::decoding::errors::FrameDecoderError::DictNotProvided { .. }
             ),
             "dict-compressed stream should require dictionary id, got: {err:?}"

From 29ac4e71f3760abfe9a9dafd50f54cec359df90b Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 01:19:07 +0200
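Subject: [PATCH 12/24] test(encoding): cover dfast priming and decouple header
 window

- add reported_window_size so window_size() returns the configured window
  rather than the matcher capacity enlarged by dictionary retention
- add a dfast-backend priming test and a test that priming leaves the
  reported window size unchanged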
---
 zstd/src/encoding/match_generator.rs | 58 ++++++++++++++++++++++++++--
 1 file changed, 54 insertions(+), 4 deletions(-)

diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs
index b07a9acb..949dde14 100644
--- a/zstd/src/encoding/match_generator.rs
+++ b/zstd/src/encoding/match_generator.rs
@@ -42,6 +42,9 @@ pub struct MatchGeneratorDriver {
     slice_size: usize,
     base_slice_size: usize,
     base_window_size: usize,
+    // Frame header window size must stay at the configured live-window budget.
+    // Dictionary retention expands internal matcher capacity only.
+    reported_window_size: usize,
 }
 
 impl MatchGeneratorDriver {
@@ -58,6 +61,7 @@ impl MatchGeneratorDriver {
             slice_size,
             base_slice_size: slice_size,
             base_window_size: max_window_size,
+            reported_window_size: max_window_size,
         }
     }
 
@@ -139,6 +143,7 @@ impl Matcher for MatchGeneratorDriver {
 
         self.active_backend = backend;
         self.slice_size = slice_size;
+        self.reported_window_size = max_window_size;
         match self.active_backend {
             MatcherBackend::Simple => {
                 let vec_pool = &mut self.vec_pool;
@@ -210,10 +215,7 @@ impl Matcher for MatchGeneratorDriver {
     }
 
     fn window_size(&self) -> u64 {
-        match self.active_backend {
-            MatcherBackend::Simple => self.match_generator.max_window_size as u64,
-            MatcherBackend::Dfast => self.dfast_matcher().max_window_size as u64,
-        }
+        self.reported_window_size as u64
     }
 
     fn get_next_space(&mut self) -> Vec<u8> {
@@ -1414,6 +1416,54 @@ fn prime_with_dictionary_applies_offset_history_even_when_content_is_empty() {
     assert_eq!(driver.match_generator.offset_hist, [11, 7, 3]);
 }
 
+#[test]
+fn dfast_prime_with_dictionary_preserves_history_for_first_full_block() {
+    let mut driver = MatchGeneratorDriver::new(8, 1);
+    driver.reset(CompressionLevel::Default);
+
+    driver.prime_with_dictionary(b"abcdefgh", [1, 4, 8]);
+
+    let mut space = driver.get_next_space();
+    space.clear();
+    space.extend_from_slice(b"abcdefgh");
+    driver.commit_space(space);
+
+    let mut saw_match = false;
+    driver.start_matching(|seq| {
+        if let Sequence::Triple {
+            literals,
+            offset,
+            match_len,
+        } = seq
+            && literals.is_empty()
+            && offset == 8
+            && match_len >= DFAST_MIN_MATCH_LEN
+        {
+            saw_match = true;
+        }
+    });
+
+    assert!(
+        saw_match,
+        "dfast backend should match dictionary-primed history in first full block"
+    );
+}
+
+#[test]
+fn prime_with_dictionary_does_not_inflate_reported_window_size() {
+    let mut driver = MatchGeneratorDriver::new(8, 1);
+    driver.reset(CompressionLevel::Fastest);
+
+    let before = driver.window_size();
+    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
+    let after = driver.window_size();
+
+    assert_eq!(
+        after, before,
+        "dictionary retention budget must not change reported frame window size"
+    );
+}
+
 #[test]
 fn prime_with_dictionary_does_not_reuse_tiny_suffix_store() {
     let mut driver = MatchGeneratorDriver::new(8, 2);

From b8ed7222bb078f28bf7af1b8fae67d690a195245 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 01:36:42 +0200
Subject: [PATCH 13/24] chore(encoding): document len_log zero guard

---
 zstd/src/encoding/match_generator.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs
index 949dde14..8f78b5df 100644
--- a/zstd/src/encoding/match_generator.rs
+++ b/zstd/src/encoding/match_generator.rs
@@ -320,6 +320,7 @@ impl SuffixStore {
 
     #[inline(always)]
     fn key(&self, suffix: &[u8]) -> usize {
+        // Capacity=1 yields len_log=0; shifting by 64 would panic.
         if self.len_log == 0 {
             return 0;
         }

From 4701190c40352c8b8f9454d601180743199d7f12 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 01:44:59 +0200
Subject: [PATCH 14/24] fix(encoding): cap dictionary retention to committed
 bytes

---
 zstd/src/encoding/match_generator.rs | 35 ++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs
index 8f78b5df..f3fddf01 100644
--- a/zstd/src/encoding/match_generator.rs
+++ b/zstd/src/encoding/match_generator.rs
@@ -200,6 +200,7 @@ impl Matcher for MatchGeneratorDriver {
         }
 
         let mut start = 0usize;
+        let mut committed_dict_budget = 0usize;
         while start < dict_content.len() {
             let end = (start + self.slice_size).min(dict_content.len());
             if end - start < MIN_MATCH_LEN {
@@ -210,8 +211,26 @@ impl Matcher for MatchGeneratorDriver {
             space.extend_from_slice(&dict_content[start..end]);
             self.commit_space(space);
             self.skip_matching();
+            committed_dict_budget += end - start;
             start = end;
         }
+
+        let uncommitted_tail_budget = retained_dict_budget.saturating_sub(committed_dict_budget);
+        if uncommitted_tail_budget > 0 {
+            match self.active_backend {
+                MatcherBackend::Simple => {
+                    self.match_generator.max_window_size = self
+                        .match_generator
+                        .max_window_size
+                        .saturating_sub(uncommitted_tail_budget);
+                }
+                MatcherBackend::Dfast => {
+                    let matcher = self.dfast_matcher_mut();
+                    matcher.max_window_size =
+                        matcher.max_window_size.saturating_sub(uncommitted_tail_budget);
+                }
+            }
+        }
     }
 
     fn window_size(&self) -> u64 {
@@ -1490,6 +1509,22 @@ fn prime_with_dictionary_does_not_reuse_tiny_suffix_store() {
     );
 }
 
+#[test]
+fn prime_with_dictionary_counts_only_committed_tail_budget() {
+    let mut driver = MatchGeneratorDriver::new(8, 1);
+    driver.reset(CompressionLevel::Fastest);
+
+    let before = driver.match_generator.max_window_size;
+    // One full slice plus a 1-byte tail that cannot be committed.
+    driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);
+
+    assert_eq!(
+        driver.match_generator.max_window_size,
+        before + 8,
+        "retention budget must account only for dictionary bytes actually committed to history"
+    );
+}
+
 #[test]
 fn suffix_store_with_single_slot_does_not_panic_on_keying() {
     let mut suffixes = SuffixStore::with_capacity(1);

From 2fb69e2bf3d7b26ac6161c3365fa9e9013bb4ceb Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 01:47:32 +0200
Subject: [PATCH 15/24] style(encoding): format tail budget subtraction

---
 zstd/src/encoding/match_generator.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs
index f3fddf01..53ad65bd 100644
--- a/zstd/src/encoding/match_generator.rs
+++ b/zstd/src/encoding/match_generator.rs
@@ -226,8 +226,9 @@ impl Matcher for MatchGeneratorDriver {
                 }
                 MatcherBackend::Dfast => {
                     let matcher = self.dfast_matcher_mut();
-                    matcher.max_window_size =
-                        matcher.max_window_size.saturating_sub(uncommitted_tail_budget);
+                    matcher.max_window_size = matcher
+                        .max_window_size
+                        .saturating_sub(uncommitted_tail_budget);
                 }
             }
         }

From 80907a82d698ba8c539b3e1860d9d5e5647e3016 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 09:06:49 +0300
Subject: [PATCH 16/24] fix(encoding): skip dictionary state for raw frames

---
 zstd/src/encoding/frame_compressor.rs | 41 +++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index f7e4cca8..bc596173 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -172,7 +172,9 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
         self.state.fse_tables.ml_previous = None;
         self.state.fse_tables.of_previous = None;
         self.state.offset_hist = [1, 4, 8];
-        if let Some(dict) = self.dictionary.as_ref() {
+        let use_dictionary_state =
+            !matches!(self.compression_level, CompressionLevel::Uncompressed);
+        if use_dictionary_state && let Some(dict) = self.dictionary.as_ref() {
             // This state drives sequence encoding, while matcher priming below updates
             // the match generator's internal repeat-offset history for match finding.
             self.state.offset_hist = dict.offset_hist;
@@ -224,7 +226,11 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
             frame_content_size: None,
             single_segment: false,
             content_checksum: cfg!(feature = "hash"),
-            dictionary_id: self.dictionary.as_ref().map(|dict| dict.id as u64),
+            dictionary_id: if use_dictionary_state {
+                self.dictionary.as_ref().map(|dict| dict.id as u64)
+            } else {
+                None
+            },
             window_size: Some(self.state.matcher.window_size()),
         };
         header.serialize(output);
@@ -726,6 +732,37 @@ mod tests {
         ));
     }
 
+    #[test]
+    fn uncompressed_mode_does_not_require_dictionary() {
+        let dict_id = 0xABCD_0001;
+        let dict =
+            crate::decoding::Dictionary::from_raw_content(dict_id, b"shared-history".to_vec())
+                .expect("raw dictionary should be valid");
+
+        let payload = b"plain-bytes-that-should-stay-raw";
+        let mut output = Vec::new();
+        let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
+        compressor
+            .set_dictionary(dict)
+            .expect("dictionary should attach in uncompressed mode");
+        compressor.set_source(payload.as_slice());
+        compressor.set_drain(&mut output);
+        compressor.compress();
+
+        let (frame_header, _) = crate::decoding::frame::read_frame_header(output.as_slice())
+            .expect("encoded frame should have a header");
+        assert_eq!(
+            frame_header.dictionary_id(),
+            None,
+            "raw/uncompressed frames must not advertise dictionary dependency"
+        );
+
+        let mut decoder = FrameDecoder::new();
+        let mut decoded = Vec::with_capacity(payload.len());
+        decoder.decode_all_to_vec(&output, &mut decoded).unwrap();
+        assert_eq!(decoded, payload);
+    }
+
     #[cfg(feature = "hash")]
     #[test]
     fn checksum_two_frames_reused_compressor() {

From a4cf2f05d105bcfefdf4475bdfcd94f65f134241 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 11:40:01 +0300
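Subject: [PATCH 17/24] fix(encoding): address renewed review threads

- reword Dictionary::decode_dict docs to state that it returns a constructed dictionary
- seed or clear the Huffman/FSE previous tables from the dictionary entropy cache via clone_from
- drop the ratio-vs-plain assertions from the dictionary round-trip tests
- use a backend-specific minimum tail length when priming (MIN_MATCH_LEN for simple, 4 for dfast)
- assert on committed window entries instead of catch_unwind in the tiny-tail test
- add a dfast regression test for the 4-byte dictionary tail budget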
---
 zstd/src/decoding/dictionary.rs       |  5 +-
 zstd/src/encoding/frame_compressor.rs | 94 ++++++++++-----------------
 zstd/src/encoding/match_generator.rs  | 41 ++++++++----
 3 files changed, 66 insertions(+), 74 deletions(-)

diff --git a/zstd/src/decoding/dictionary.rs b/zstd/src/decoding/dictionary.rs
index 54fd0821..baa974d6 100644
--- a/zstd/src/decoding/dictionary.rs
+++ b/zstd/src/decoding/dictionary.rs
@@ -61,8 +61,9 @@ impl Dictionary {
         })
     }
 
-    /// Parses the dictionary from `raw` and set the tables
-    /// it returns the dict_id for checking with the frame's `dict_id``
+    /// Parses the dictionary from `raw`, initializes its tables,
+    /// and returns a fully constructed [`Dictionary`] whose `id` can be
+    /// checked against the frame's `dict_id`.
     pub fn decode_dict(raw: &[u8]) -> Result<Dictionary, DictionaryDecodeError> {
         const MIN_MAGIC_AND_ID_LEN: usize = 8;
         const OFFSET_HISTORY_LEN: usize = 12;
diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index bc596173..8722f2a4 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -174,6 +174,11 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
         self.state.offset_hist = [1, 4, 8];
         let use_dictionary_state =
             !matches!(self.compression_level, CompressionLevel::Uncompressed);
+        let cached_entropy = if use_dictionary_state {
+            self.dictionary_entropy_cache.as_ref()
+        } else {
+            None
+        };
         if use_dictionary_state && let Some(dict) = self.dictionary.as_ref() {
             // This state drives sequence encoding, while matcher priming below updates
             // the match generator's internal repeat-offset history for match finding.
@@ -181,38 +186,37 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
             self.state
                 .matcher
                 .prime_with_dictionary(dict.dict_content.as_slice(), dict.offset_hist);
-            if let Some(huff_table) = self
-                .dictionary_entropy_cache
-                .as_ref()
-                .and_then(|cache| cache.huff.clone())
-            {
-                self.state.last_huff_table = Some(huff_table);
-            }
-            if let Some(ll_previous) = self
-                .dictionary_entropy_cache
-                .as_ref()
-                .and_then(|cache| cache.ll_previous.clone())
-            {
-                self.state.fse_tables.ll_previous =
-                    Some(PreviousFseTable::Custom(Box::new(ll_previous)));
-            }
-            if let Some(ml_previous) = self
-                .dictionary_entropy_cache
-                .as_ref()
-                .and_then(|cache| cache.ml_previous.clone())
-            {
-                self.state.fse_tables.ml_previous =
-                    Some(PreviousFseTable::Custom(Box::new(ml_previous)));
-            }
-            if let Some(of_previous) = self
-                .dictionary_entropy_cache
-                .as_ref()
-                .and_then(|cache| cache.of_previous.clone())
-            {
-                self.state.fse_tables.of_previous =
-                    Some(PreviousFseTable::Custom(Box::new(of_previous)));
-            }
         }
+        self.state
+            .last_huff_table
+            .clone_from(&cached_entropy.and_then(|cache| cache.huff.clone()));
+        self.state
+            .fse_tables
+            .ll_previous
+            .clone_from(&cached_entropy.and_then(|cache| {
+                cache
+                    .ll_previous
+                    .clone()
+                    .map(|table| PreviousFseTable::Custom(Box::new(table)))
+            }));
+        self.state
+            .fse_tables
+            .ml_previous
+            .clone_from(&cached_entropy.and_then(|cache| {
+                cache
+                    .ml_previous
+                    .clone()
+                    .map(|table| PreviousFseTable::Custom(Box::new(table)))
+            }));
+        self.state
+            .fse_tables
+            .of_previous
+            .clone_from(&cached_entropy.and_then(|cache| {
+                cache
+                    .of_previous
+                    .clone()
+                    .map(|table| PreviousFseTable::Custom(Box::new(table)))
+            }));
         #[cfg(feature = "hash")]
         {
             self.hasher = XxHash64::with_seed(0);
@@ -511,13 +515,6 @@ mod tests {
             data.extend_from_slice(&dict_for_decoder.dict_content[..2048]);
         }
 
-        let mut plain = Vec::new();
-        crate::encoding::compress(
-            data.as_slice(),
-            &mut plain,
-            super::CompressionLevel::Fastest,
-        );
-
         let mut with_dict = Vec::new();
         let mut compressor = FrameCompressor::new(super::CompressionLevel::Fastest);
         let previous = compressor
@@ -543,13 +540,6 @@ mod tests {
             .expect("encoded stream should have a frame header");
         assert_eq!(frame_header.dictionary_id(), Some(dict_for_decoder.id));
 
-        assert!(
-            with_dict.len() < plain.len(),
-            "dictionary compression should improve ratio for dictionary-like payloads (plain={}, dict={})",
-            plain.len(),
-            with_dict.len()
-        );
-
         let mut decoder = FrameDecoder::new();
         let mut missing_dict_target = Vec::with_capacity(data.len());
         let err = decoder
@@ -617,13 +607,6 @@ mod tests {
             );
         }
 
-        let mut plain = Vec::new();
-        crate::encoding::compress(
-            payload.as_slice(),
-            &mut plain,
-            super::CompressionLevel::Fastest,
-        );
-
         let mut with_dict = Vec::new();
         let mut compressor = FrameCompressor::new(super::CompressionLevel::Fastest);
         compressor
@@ -636,13 +619,6 @@ mod tests {
         let (frame_header, _) = crate::decoding::frame::read_frame_header(with_dict.as_slice())
             .expect("encoded stream should have a frame header");
         assert_eq!(frame_header.dictionary_id(), Some(dict_id));
-        assert!(
-            with_dict.len() < plain.len(),
-            "dict_builder dictionary should improve ratio for matching payloads (plain={}, dict={})",
-            plain.len(),
-            with_dict.len()
-        );
-
         let mut decoder = FrameDecoder::new();
         decoder.add_dict(decoder_dict).unwrap();
         let mut decoded = Vec::with_capacity(payload.len());
diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs
index 53ad65bd..0cbd33a3 100644
--- a/zstd/src/encoding/match_generator.rs
+++ b/zstd/src/encoding/match_generator.rs
@@ -201,9 +201,13 @@ impl Matcher for MatchGeneratorDriver {
 
         let mut start = 0usize;
         let mut committed_dict_budget = 0usize;
+        let min_primed_tail = match self.active_backend {
+            MatcherBackend::Simple => MIN_MATCH_LEN,
+            MatcherBackend::Dfast => 4,
+        };
         while start < dict_content.len() {
             let end = (start + self.slice_size).min(dict_content.len());
-            if end - start < MIN_MATCH_LEN {
+            if end - start < min_primed_tail {
                 break;
             }
             let mut space = self.get_next_space();
@@ -1494,19 +1498,13 @@ fn prime_with_dictionary_does_not_reuse_tiny_suffix_store() {
     // which should never be committed to the matcher window.
     driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);
 
-    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
-        for block in [b"mnopqrstu", b"vwxyzabcd", b"efghijklm"] {
-            let mut space = driver.get_next_space();
-            space.clear();
-            space.extend_from_slice(block);
-            driver.commit_space(space);
-            driver.skip_matching();
-        }
-    }));
-
     assert!(
-        result.is_ok(),
-        "tiny dictionary tail must not poison suffix store reuse"
+        driver
+            .match_generator
+            .window
+            .iter()
+            .all(|entry| entry.data.len() >= MIN_MATCH_LEN),
+        "dictionary priming must not commit tails shorter than MIN_MATCH_LEN"
     );
 }
 
@@ -1526,6 +1524,23 @@ fn prime_with_dictionary_counts_only_committed_tail_budget() {
     );
 }
 
+#[test]
+fn dfast_prime_with_dictionary_counts_four_byte_tail_budget() {
+    let mut driver = MatchGeneratorDriver::new(8, 1);
+    driver.reset(CompressionLevel::Default);
+
+    let before = driver.dfast_matcher().max_window_size;
+    // One full slice plus a 4-byte tail. Dfast can still use this tail through
+    // short-hash overlap into the next block, so it should stay retained.
+    driver.prime_with_dictionary(b"abcdefghijkl", [1, 4, 8]);
+
+    assert_eq!(
+        driver.dfast_matcher().max_window_size,
+        before + 12,
+        "dfast retention budget should include 4-byte dictionary tails"
+    );
+}
+
 #[test]
 fn suffix_store_with_single_slot_does_not_panic_on_keying() {
     let mut suffixes = SuffixStore::with_capacity(1);

From 42450b765a41eb37a0acf9c87db066f6d94bd632 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 13:27:49 +0300
Subject: [PATCH 18/24] test(encoding): cover window crossing with dictionaries

---
 zstd/src/encoding/frame_compressor.rs | 42 +++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index 8722f2a4..6c9cc89c 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -190,6 +190,8 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
         self.state
             .last_huff_table
             .clone_from(&cached_entropy.and_then(|cache| cache.huff.clone()));
+        // `clone_from` keeps frame-to-frame seeding cheap for reused compressors by
+        // reusing existing allocations where possible instead of reallocating every frame.
         self.state
             .fse_tables
             .ll_previous
@@ -739,6 +741,46 @@ mod tests {
         assert_eq!(decoded, payload);
     }
 
+    #[test]
+    fn dictionary_roundtrip_stays_valid_after_output_exceeds_window() {
+        use crate::encoding::match_generator::MatchGeneratorDriver;
+
+        let dict_id = 0xABCD_0002;
+        let dict = crate::decoding::Dictionary::from_raw_content(dict_id, b"abcdefgh".to_vec())
+            .expect("raw dictionary should be valid");
+        let dict_for_decoder =
+            crate::decoding::Dictionary::from_raw_content(dict_id, b"abcdefgh".to_vec())
+                .expect("raw dictionary should be valid");
+
+        let payload = b"abcdefgh".repeat(512);
+        let mut output = Vec::new();
+        let matcher = MatchGeneratorDriver::new(8, 1);
+        let mut compressor =
+            FrameCompressor::new_with_matcher(matcher, super::CompressionLevel::Fastest);
+        compressor
+            .set_dictionary(dict)
+            .expect("dictionary should attach");
+        compressor.set_source(payload.as_slice());
+        compressor.set_drain(&mut output);
+        compressor.compress();
+
+        let (frame_header, _) = crate::decoding::frame::read_frame_header(output.as_slice())
+            .expect("encoded frame should have a header");
+        let advertised_window = frame_header
+            .window_size()
+            .expect("window size should be present");
+        assert!(
+            payload.len() > advertised_window as usize,
+            "test must cross the advertised window boundary"
+        );
+
+        let mut decoder = FrameDecoder::new();
+        decoder.add_dict(dict_for_decoder).unwrap();
+        let mut decoded = Vec::with_capacity(payload.len());
+        decoder.decode_all_to_vec(&output, &mut decoded).unwrap();
+        assert_eq!(decoded, payload);
+    }
+
     #[cfg(feature = "hash")]
     #[test]
     fn checksum_two_frames_reused_compressor() {

From a58db02c8766ad6f8e9a753037a857cff2413f88 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 16:17:56 +0300
Subject: [PATCH 19/24] fix(encoding): gate dictionary id on matcher capability

- add Matcher::supports_dictionary_priming with safe default false
- enable dictionary state only for matchers that opt in
- harden window regression test against no-dictionary baseline
- add regression test for custom matcher without dictionary priming
---
 zstd/src/encoding/frame_compressor.rs | 99 ++++++++++++++++++++++++++-
 zstd/src/encoding/match_generator.rs  |  4 ++
 zstd/src/encoding/mod.rs              |  5 ++
 3 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index 6c9cc89c..8428586e 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -173,7 +173,8 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
         self.state.fse_tables.of_previous = None;
         self.state.offset_hist = [1, 4, 8];
         let use_dictionary_state =
-            !matches!(self.compression_level, CompressionLevel::Uncompressed);
+            !matches!(self.compression_level, CompressionLevel::Uncompressed)
+                && self.state.matcher.supports_dictionary_priming();
         let cached_entropy = if use_dictionary_state {
             self.dictionary_entropy_cache.as_ref()
         } else {
@@ -421,8 +422,53 @@ mod tests {
     use super::FrameCompressor;
     use crate::common::MAGIC_NUM;
     use crate::decoding::FrameDecoder;
+    use crate::encoding::{Matcher, Sequence};
     use alloc::vec::Vec;
 
+    struct NoDictionaryMatcher {
+        last_space: Vec<u8>,
+        window_size: u64,
+    }
+
+    impl NoDictionaryMatcher {
+        fn new(window_size: u64) -> Self {
+            Self {
+                last_space: Vec::new(),
+                window_size,
+            }
+        }
+    }
+
+    impl Matcher for NoDictionaryMatcher {
+        fn get_next_space(&mut self) -> Vec<u8> {
+            vec![0; self.window_size as usize]
+        }
+
+        fn get_last_space(&mut self) -> &[u8] {
+            self.last_space.as_slice()
+        }
+
+        fn commit_space(&mut self, space: Vec<u8>) {
+            self.last_space = space;
+        }
+
+        fn skip_matching(&mut self) {}
+
+        fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
+            handle_sequence(Sequence::Literals {
+                literals: self.last_space.as_slice(),
+            });
+        }
+
+        fn reset(&mut self, _level: super::CompressionLevel) {
+            self.last_space.clear();
+        }
+
+        fn window_size(&self) -> u64 {
+            self.window_size
+        }
+    }
+
     #[test]
     fn frame_starts_with_magic_num() {
         let mock_data = [1_u8, 2, 3].as_slice();
@@ -753,6 +799,21 @@ mod tests {
                 .expect("raw dictionary should be valid");
 
         let payload = b"abcdefgh".repeat(512);
+        let matcher = MatchGeneratorDriver::new(8, 1);
+
+        let mut no_dict_output = Vec::new();
+        let mut no_dict_compressor =
+            FrameCompressor::new_with_matcher(matcher, super::CompressionLevel::Fastest);
+        no_dict_compressor.set_source(payload.as_slice());
+        no_dict_compressor.set_drain(&mut no_dict_output);
+        no_dict_compressor.compress();
+        let (no_dict_frame_header, _) =
+            crate::decoding::frame::read_frame_header(no_dict_output.as_slice())
+                .expect("baseline frame should have a header");
+        let no_dict_window = no_dict_frame_header
+            .window_size()
+            .expect("window size should be present");
+
         let mut output = Vec::new();
         let matcher = MatchGeneratorDriver::new(8, 1);
         let mut compressor =
@@ -769,6 +830,10 @@ mod tests {
         let advertised_window = frame_header
             .window_size()
             .expect("window size should be present");
+        assert_eq!(
+            advertised_window, no_dict_window,
+            "dictionary priming must not inflate advertised window size"
+        );
         assert!(
             payload.len() > advertised_window as usize,
             "test must cross the advertised window boundary"
@@ -781,6 +846,38 @@ mod tests {
         assert_eq!(decoded, payload);
     }
 
+    #[test]
+    fn custom_matcher_without_dictionary_priming_does_not_advertise_dict_id() {
+        let dict_id = 0xABCD_0003;
+        let dict = crate::decoding::Dictionary::from_raw_content(dict_id, b"abcdefgh".to_vec())
+            .expect("raw dictionary should be valid");
+        let payload = b"abcdefghabcdefgh";
+
+        let mut output = Vec::new();
+        let matcher = NoDictionaryMatcher::new(64);
+        let mut compressor =
+            FrameCompressor::new_with_matcher(matcher, super::CompressionLevel::Fastest);
+        compressor
+            .set_dictionary(dict)
+            .expect("dictionary should attach");
+        compressor.set_source(payload.as_slice());
+        compressor.set_drain(&mut output);
+        compressor.compress();
+
+        let (frame_header, _) = crate::decoding::frame::read_frame_header(output.as_slice())
+            .expect("encoded frame should have a header");
+        assert_eq!(
+            frame_header.dictionary_id(),
+            None,
+            "matchers that do not support dictionary priming must not advertise dictionary dependency"
+        );
+
+        let mut decoder = FrameDecoder::new();
+        let mut decoded = Vec::with_capacity(payload.len());
+        decoder.decode_all_to_vec(&output, &mut decoded).unwrap();
+        assert_eq!(decoded, payload);
+    }
+
     #[cfg(feature = "hash")]
     #[test]
     fn checksum_two_frames_reused_compressor() {
diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs
index 0cbd33a3..701bab29 100644
--- a/zstd/src/encoding/match_generator.rs
+++ b/zstd/src/encoding/match_generator.rs
@@ -114,6 +114,10 @@ impl MatchGeneratorDriver {
 }
 
 impl Matcher for MatchGeneratorDriver {
+    fn supports_dictionary_priming(&self) -> bool {
+        true
+    }
+
     fn reset(&mut self, level: CompressionLevel) {
         let (backend, slice_size, max_window_size, hash_fill_step) = self.level_config(level);
         if self.active_backend != backend {
diff --git a/zstd/src/encoding/mod.rs b/zstd/src/encoding/mod.rs
index 855f8bb6..49c6e36a 100644
--- a/zstd/src/encoding/mod.rs
+++ b/zstd/src/encoding/mod.rs
@@ -98,6 +98,11 @@ pub trait Matcher {
     /// Prime matcher state with dictionary history before compressing the next frame.
     /// Default implementation is a no-op for custom matchers that do not support this.
     fn prime_with_dictionary(&mut self, _dict_content: &[u8], _offset_hist: [u32; 3]) {}
+    /// Returns whether this matcher can consume dictionary priming state and produce
+    /// dictionary-dependent sequences. Defaults to `false` for custom matchers.
+    fn supports_dictionary_priming(&self) -> bool {
+        false
+    }
     /// The size of the window the decoder will need to execute all sequences produced by this matcher
     ///
     /// May change after a call to reset with a different compression level

From 5da66966912f8fd7e76801407e6f093ac620e0c1 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 16:58:39 +0300
Subject: [PATCH 20/24] fix(encoding): retire dictionary budget on eviction

- clarify set_dictionary docs for uncompressed and non-priming matchers
- track retained dictionary budget separately from advertised live window
- shrink matcher capacity as primed dictionary bytes are evicted
- add regression tests for simple and dfast budget retirement
---
 zstd/src/encoding/frame_compressor.rs |   6 +-
 zstd/src/encoding/match_generator.rs  | 143 ++++++++++++++++++++++++++
 2 files changed, 147 insertions(+), 2 deletions(-)

diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index 8428586e..02d1d95b 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -370,8 +370,10 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
 
     /// Attach a pre-parsed dictionary to be used for subsequent compressions.
     ///
-    /// The dictionary id will be written to the frame header and the matcher will be
-    /// primed with dictionary content/history before compressing each frame.
+    /// In compressed modes, the dictionary id is written only when the active
+    /// matcher supports dictionary priming.
+    /// Uncompressed mode and non-priming matchers ignore the attached dictionary
+    /// at encode time.
     pub fn set_dictionary(
         &mut self,
         dictionary: crate::decoding::Dictionary,
diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs
index 701bab29..f4f027c4 100644
--- a/zstd/src/encoding/match_generator.rs
+++ b/zstd/src/encoding/match_generator.rs
@@ -45,6 +45,9 @@ pub struct MatchGeneratorDriver {
     // Frame header window size must stay at the configured live-window budget.
     // Dictionary retention expands internal matcher capacity only.
     reported_window_size: usize,
+    // Tracks currently retained bytes that originated from primed dictionary
+    // history and have not been evicted yet.
+    dictionary_retained_budget: usize,
 }
 
 impl MatchGeneratorDriver {
@@ -62,6 +65,7 @@ impl MatchGeneratorDriver {
             base_slice_size: slice_size,
             base_window_size: max_window_size,
             reported_window_size: max_window_size,
+            dictionary_retained_budget: 0,
         }
     }
 
@@ -111,6 +115,61 @@ impl MatchGeneratorDriver {
             .as_mut()
             .expect("dfast backend must be initialized by reset() before use")
     }
+
+    fn retire_dictionary_budget(&mut self, evicted_bytes: usize) {
+        let reclaimed = evicted_bytes.min(self.dictionary_retained_budget);
+        if reclaimed == 0 {
+            return;
+        }
+        self.dictionary_retained_budget -= reclaimed;
+        match self.active_backend {
+            MatcherBackend::Simple => {
+                self.match_generator.max_window_size = self
+                    .match_generator
+                    .max_window_size
+                    .saturating_sub(reclaimed);
+            }
+            MatcherBackend::Dfast => {
+                let matcher = self.dfast_matcher_mut();
+                matcher.max_window_size = matcher.max_window_size.saturating_sub(reclaimed);
+            }
+        }
+    }
+
+    fn trim_after_budget_retire(&mut self) {
+        loop {
+            let mut evicted_bytes = 0usize;
+            match self.active_backend {
+                MatcherBackend::Simple => {
+                    let vec_pool = &mut self.vec_pool;
+                    let suffix_pool = &mut self.suffix_pool;
+                    self.match_generator.reserve(0, |mut data, mut suffixes| {
+                        evicted_bytes += data.len();
+                        data.resize(data.capacity(), 0);
+                        vec_pool.push(data);
+                        suffixes.slots.clear();
+                        suffixes.slots.resize(suffixes.slots.capacity(), None);
+                        suffix_pool.push(suffixes);
+                    });
+                }
+                MatcherBackend::Dfast => {
+                    let mut retired = Vec::new();
+                    self.dfast_matcher_mut().trim_to_window(|data| {
+                        evicted_bytes += data.len();
+                        retired.push(data);
+                    });
+                    for mut data in retired {
+                        data.resize(data.capacity(), 0);
+                        self.vec_pool.push(data);
+                    }
+                }
+            }
+            if evicted_bytes == 0 {
+                break;
+            }
+            self.retire_dictionary_budget(evicted_bytes);
+        }
+    }
 }
 
 impl Matcher for MatchGeneratorDriver {
@@ -120,6 +179,7 @@ impl Matcher for MatchGeneratorDriver {
 
     fn reset(&mut self, level: CompressionLevel) {
         let (backend, slice_size, max_window_size, hash_fill_step) = self.level_config(level);
+        self.dictionary_retained_budget = 0;
         if self.active_backend != backend {
             match self.active_backend {
                 MatcherBackend::Simple => {
@@ -240,6 +300,11 @@ impl Matcher for MatchGeneratorDriver {
                 }
             }
         }
+        if committed_dict_budget > 0 {
+            self.dictionary_retained_budget = self
+                .dictionary_retained_budget
+                .saturating_add(committed_dict_budget);
+        }
     }
 
     fn window_size(&self) -> u64 {
@@ -265,6 +330,7 @@ impl Matcher for MatchGeneratorDriver {
         match self.active_backend {
             MatcherBackend::Simple => {
                 let vec_pool = &mut self.vec_pool;
+                let mut evicted_bytes = 0usize;
                 let suffixes = self
                     .suffix_pool
                     .pop()
@@ -272,22 +338,29 @@ impl Matcher for MatchGeneratorDriver {
                 let suffix_pool = &mut self.suffix_pool;
                 self.match_generator
                     .add_data(space, suffixes, |mut data, mut suffixes| {
+                        evicted_bytes += data.len();
                         data.resize(data.capacity(), 0);
                         vec_pool.push(data);
                         suffixes.slots.clear();
                         suffixes.slots.resize(suffixes.slots.capacity(), None);
                         suffix_pool.push(suffixes);
                     });
+                self.retire_dictionary_budget(evicted_bytes);
+                self.trim_after_budget_retire();
             }
             MatcherBackend::Dfast => {
                 let vec_pool = &mut self.vec_pool;
+                let mut evicted_bytes = 0usize;
                 self.dfast_match_generator
                     .as_mut()
                     .expect("dfast backend must be initialized by reset() before use")
                     .add_data(space, |mut data| {
+                        evicted_bytes += data.len();
                         data.resize(data.capacity(), 0);
                         vec_pool.push(data);
                     });
+                self.retire_dictionary_budget(evicted_bytes);
+                self.trim_after_budget_retire();
             }
         }
     }
@@ -857,6 +930,17 @@ impl DfastMatchGenerator {
         self.window.push_back(data);
     }
 
+    fn trim_to_window(&mut self, mut reuse_space: impl FnMut(Vec<u8>)) {
+        while self.window_size > self.max_window_size {
+            let mut removed = self.window.pop_front().unwrap();
+            self.window_size -= removed.len();
+            self.history_start += removed.len();
+            self.history_abs_start += removed.len();
+            removed.resize(removed.capacity(), 0);
+            reuse_space(removed);
+        }
+    }
+
     fn skip_matching(&mut self) {
         self.ensure_hash_tables();
         let current_len = self.window.back().unwrap().len();
@@ -1545,6 +1629,65 @@ fn dfast_prime_with_dictionary_counts_four_byte_tail_budget() {
     );
 }
 
+#[test]
+fn prime_with_dictionary_budget_shrinks_after_simple_eviction() {
+    let mut driver = MatchGeneratorDriver::new(8, 1);
+    driver.reset(CompressionLevel::Fastest);
+
+    let base_window = driver.match_generator.max_window_size;
+    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
+    assert_eq!(driver.match_generator.max_window_size, base_window + 24);
+
+    for block in [b"AAAAAAAA", b"BBBBBBBB"] {
+        let mut space = driver.get_next_space();
+        space.clear();
+        space.extend_from_slice(block);
+        driver.commit_space(space);
+        driver.skip_matching();
+    }
+
+    assert_eq!(
+        driver.dictionary_retained_budget, 0,
+        "dictionary budget should be fully retired once primed dict slices are evicted"
+    );
+    assert_eq!(
+        driver.match_generator.max_window_size, base_window,
+        "retired dictionary budget must not remain reusable for live history"
+    );
+}
+
+#[test]
+fn prime_with_dictionary_budget_shrinks_after_dfast_eviction() {
+    let mut driver = MatchGeneratorDriver::new(8, 1);
+    driver.reset(CompressionLevel::Default);
+    // Use a small live window in this regression so dictionary-primed slices are
+    // evicted quickly and budget retirement can be asserted deterministically.
+    driver.dfast_matcher_mut().max_window_size = 8;
+    driver.reported_window_size = 8;
+
+    let base_window = driver.dfast_matcher().max_window_size;
+    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
+    assert_eq!(driver.dfast_matcher().max_window_size, base_window + 24);
+
+    for block in [b"AAAAAAAA", b"BBBBBBBB"] {
+        let mut space = driver.get_next_space();
+        space.clear();
+        space.extend_from_slice(block);
+        driver.commit_space(space);
+        driver.skip_matching();
+    }
+
+    assert_eq!(
+        driver.dictionary_retained_budget, 0,
+        "dictionary budget should be fully retired once primed dict slices are evicted"
+    );
+    assert_eq!(
+        driver.dfast_matcher().max_window_size,
+        base_window,
+        "retired dictionary budget must not remain reusable for live history"
+    );
+}
+
 #[test]
 fn suffix_store_with_single_slot_does_not_panic_on_keying() {
     let mut suffixes = SuffixStore::with_capacity(1);

From 702c64d507ec2e7f636fb123778f0c63cbaa19d3 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 17:28:15 +0300
Subject: [PATCH 21/24] perf(encoding): avoid extra entropy-table cloning

- seed huffman table directly via Option::clone_from from cached entropy
- cache FSE previous tables as PreviousFseTable to avoid per-frame reboxing
- remove temporary clone/map allocations in dictionary seeding path
---
 zstd/src/encoding/frame_compressor.rs | 53 +++++++++++++--------------
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index 02d1d95b..590e46ee 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -49,9 +49,9 @@ pub struct FrameCompressor<R: Read, W: Write, M: Matcher> {
 #[derive(Clone, Default)]
 struct CachedDictionaryEntropy {
     huff: Option<crate::huff0::huff0_encoder::HuffmanTable>,
-    ll_previous: Option<FSETable>,
-    ml_previous: Option<FSETable>,
-    of_previous: Option<FSETable>,
+    ll_previous: Option<PreviousFseTable>,
+    ml_previous: Option<PreviousFseTable>,
+    of_previous: Option<PreviousFseTable>,
 }
 
 #[derive(Clone)]
@@ -188,38 +188,25 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
                 .matcher
                 .prime_with_dictionary(dict.dict_content.as_slice(), dict.offset_hist);
         }
-        self.state
-            .last_huff_table
-            .clone_from(&cached_entropy.and_then(|cache| cache.huff.clone()));
+        if let Some(cache) = cached_entropy {
+            self.state.last_huff_table.clone_from(&cache.huff);
+        } else {
+            self.state.last_huff_table = None;
+        }
         // `clone_from` keeps frame-to-frame seeding cheap for reused compressors by
         // reusing existing allocations where possible instead of reallocating every frame.
         self.state
             .fse_tables
             .ll_previous
-            .clone_from(&cached_entropy.and_then(|cache| {
-                cache
-                    .ll_previous
-                    .clone()
-                    .map(|table| PreviousFseTable::Custom(Box::new(table)))
-            }));
+            .clone_from(&cached_entropy.and_then(|cache| cache.ll_previous.clone()));
         self.state
             .fse_tables
             .ml_previous
-            .clone_from(&cached_entropy.and_then(|cache| {
-                cache
-                    .ml_previous
-                    .clone()
-                    .map(|table| PreviousFseTable::Custom(Box::new(table)))
-            }));
+            .clone_from(&cached_entropy.and_then(|cache| cache.ml_previous.clone()));
         self.state
             .fse_tables
             .of_previous
-            .clone_from(&cached_entropy.and_then(|cache| {
-                cache
-                    .of_previous
-                    .clone()
-                    .map(|table| PreviousFseTable::Custom(Box::new(table)))
-            }));
+            .clone_from(&cached_entropy.and_then(|cache| cache.of_previous.clone()));
         #[cfg(feature = "hash")]
         {
             self.hasher = XxHash64::with_seed(0);
@@ -391,9 +378,21 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
         }
         self.dictionary_entropy_cache = Some(CachedDictionaryEntropy {
             huff: dictionary.huf.table.to_encoder_table(),
-            ll_previous: dictionary.fse.literal_lengths.to_encoder_table(),
-            ml_previous: dictionary.fse.match_lengths.to_encoder_table(),
-            of_previous: dictionary.fse.offsets.to_encoder_table(),
+            ll_previous: dictionary
+                .fse
+                .literal_lengths
+                .to_encoder_table()
+                .map(|table| PreviousFseTable::Custom(Box::new(table))),
+            ml_previous: dictionary
+                .fse
+                .match_lengths
+                .to_encoder_table()
+                .map(|table| PreviousFseTable::Custom(Box::new(table))),
+            of_previous: dictionary
+                .fse
+                .offsets
+                .to_encoder_table()
+                .map(|table| PreviousFseTable::Custom(Box::new(table))),
         });
         Ok(self.dictionary.replace(dictionary))
     }

From 37052304725a93c706d90a89dfd9affa1414f200 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 17:43:21 +0300
Subject: [PATCH 22/24] fix(encoding): correct dfast eviction accounting

- keep dfast eviction callbacks on logical slice length, not vec capacity
- add regression tests for add_data/trim_to_window eviction length semantics
- remove intermediate Option clones in FSE dictionary seeding path
---
 zstd/src/encoding/frame_compressor.rs | 30 +++++++++------
 zstd/src/encoding/match_generator.rs  | 55 +++++++++++++++++++++++++--
 2 files changed, 69 insertions(+), 16 deletions(-)

diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index 590e46ee..6641dd55 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -195,18 +195,24 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
         }
         // `clone_from` keeps frame-to-frame seeding cheap for reused compressors by
         // reusing existing allocations where possible instead of reallocating every frame.
-        self.state
-            .fse_tables
-            .ll_previous
-            .clone_from(&cached_entropy.and_then(|cache| cache.ll_previous.clone()));
-        self.state
-            .fse_tables
-            .ml_previous
-            .clone_from(&cached_entropy.and_then(|cache| cache.ml_previous.clone()));
-        self.state
-            .fse_tables
-            .of_previous
-            .clone_from(&cached_entropy.and_then(|cache| cache.of_previous.clone()));
+        if let Some(cache) = cached_entropy {
+            self.state
+                .fse_tables
+                .ll_previous
+                .clone_from(&cache.ll_previous);
+            self.state
+                .fse_tables
+                .ml_previous
+                .clone_from(&cache.ml_previous);
+            self.state
+                .fse_tables
+                .of_previous
+                .clone_from(&cache.of_previous);
+        } else {
+            self.state.fse_tables.ll_previous = None;
+            self.state.fse_tables.ml_previous = None;
+            self.state.fse_tables.of_previous = None;
+        }
         #[cfg(feature = "hash")]
         {
             self.hasher = XxHash64::with_seed(0);
diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs
index f4f027c4..e15905e3 100644
--- a/zstd/src/encoding/match_generator.rs
+++ b/zstd/src/encoding/match_generator.rs
@@ -917,11 +917,10 @@ impl DfastMatchGenerator {
     fn add_data(&mut self, data: Vec<u8>, mut reuse_space: impl FnMut(Vec<u8>)) {
         assert!(data.len() <= self.max_window_size);
         while self.window_size + data.len() > self.max_window_size {
-            let mut removed = self.window.pop_front().unwrap();
+            let removed = self.window.pop_front().unwrap();
             self.window_size -= removed.len();
             self.history_start += removed.len();
             self.history_abs_start += removed.len();
-            removed.resize(removed.capacity(), 0);
             reuse_space(removed);
         }
         self.compact_history();
@@ -932,11 +931,10 @@ impl DfastMatchGenerator {
 
     fn trim_to_window(&mut self, mut reuse_space: impl FnMut(Vec<u8>)) {
         while self.window_size > self.max_window_size {
-            let mut removed = self.window.pop_front().unwrap();
+            let removed = self.window.pop_front().unwrap();
             self.window_size -= removed.len();
             self.history_start += removed.len();
             self.history_abs_start += removed.len();
-            removed.resize(removed.capacity(), 0);
             reuse_space(removed);
         }
     }
@@ -1935,6 +1933,55 @@ fn dfast_skip_matching_handles_window_eviction() {
     assert_eq!(reconstructed, [7, 8, 9, 10, 11, 12, 7, 8, 9, 10, 11, 12]);
 }
 
+#[test]
+fn dfast_add_data_callback_reports_evicted_len_not_capacity() {
+    let mut matcher = DfastMatchGenerator::new(8);
+
+    let mut first = Vec::with_capacity(64);
+    first.extend_from_slice(b"abcdefgh");
+    matcher.add_data(first, |_| {});
+
+    let mut second = Vec::with_capacity(64);
+    second.extend_from_slice(b"ijklmnop");
+
+    let mut observed_evicted_len = None;
+    matcher.add_data(second, |data| {
+        observed_evicted_len = Some(data.len());
+    });
+
+    assert_eq!(
+        observed_evicted_len,
+        Some(8),
+        "eviction callback must report evicted byte length, not backing capacity"
+    );
+}
+
+#[test]
+fn dfast_trim_to_window_callback_reports_evicted_len_not_capacity() {
+    let mut matcher = DfastMatchGenerator::new(16);
+
+    let mut first = Vec::with_capacity(64);
+    first.extend_from_slice(b"abcdefgh");
+    matcher.add_data(first, |_| {});
+
+    let mut second = Vec::with_capacity(64);
+    second.extend_from_slice(b"ijklmnop");
+    matcher.add_data(second, |_| {});
+
+    matcher.max_window_size = 8;
+
+    let mut observed_evicted_len = None;
+    matcher.trim_to_window(|data| {
+        observed_evicted_len = Some(data.len());
+    });
+
+    assert_eq!(
+        observed_evicted_len,
+        Some(8),
+        "trim callback must report evicted byte length, not backing capacity"
+    );
+}
+
 #[test]
 fn dfast_inserts_tail_positions_for_next_block_matching() {
     let mut matcher = DfastMatchGenerator::new(DFAST_DEFAULT_WINDOW_SIZE);

From f11ac3658cad17334e8d530b786e3053d234e6cf Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 17:55:43 +0300
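Subject: [PATCH 23/24] fix(encoding): preserve entropy tables for clone_from
 reuse

- stop clearing last_huff_table and the ll/ml/of previous FSE tables at the
  start of compress()
- the dictionary seeding path already assigns all four itself (clone_from
  when cached dictionary entropy exists, None otherwise), so the early
  reset only discarded allocations that clone_from could have reused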
---
 zstd/src/encoding/frame_compressor.rs | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index 6641dd55..cf5fc069 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -167,10 +167,6 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
     pub fn compress(&mut self) {
         // Clearing buffers to allow re-using of the compressor
         self.state.matcher.reset(self.compression_level);
-        self.state.last_huff_table = None;
-        self.state.fse_tables.ll_previous = None;
-        self.state.fse_tables.ml_previous = None;
-        self.state.fse_tables.of_previous = None;
         self.state.offset_hist = [1, 4, 8];
         let use_dictionary_state =
             !matches!(self.compression_level, CompressionLevel::Uncompressed)

From 127d41dd8a66a2bdc7921ff2e53ee02e5aff4d1c Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Sun, 29 Mar 2026 18:31:37 +0300
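Subject: [PATCH 24/24] fix(dictionary): reject empty raw dictionaries

- return DictionaryTooSmall { got: 0, need: 1 } from
  Dictionary::from_raw_content when the content is empty, and add a
  regression test for it
- assert the exact ZeroRepeatOffsetInDictionary { index: 0 } error in the
  corrupted-rep0 decode_dict test instead of a bare is_err()
- shrink the from_raw_content roundtrip payload from 512 to 96 records and
  assert the dictionary output is smaller than a no-dictionary baseline
- use an empty input in
  set_dictionary_from_bytes_seeds_entropy_tables_for_first_block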
---
 zstd/src/decoding/dictionary.rs       | 20 ++++++++++++++++----
 zstd/src/encoding/frame_compressor.rs | 14 ++++++++++++--
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/zstd/src/decoding/dictionary.rs b/zstd/src/decoding/dictionary.rs
index baa974d6..4d3030de 100644
--- a/zstd/src/decoding/dictionary.rs
+++ b/zstd/src/decoding/dictionary.rs
@@ -51,6 +51,9 @@ impl Dictionary {
         if id == 0 {
             return Err(DictionaryDecodeError::ZeroDictionaryId);
         }
+        if dict_content.is_empty() {
+            return Err(DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 1 });
+        }
 
         Ok(Dictionary {
             id,
@@ -238,9 +241,18 @@ mod tests {
         // Corrupt rep0 to zero.
         raw[offset_start..offset_start + 4].copy_from_slice(&0u32.to_le_bytes());
         let decoded = Dictionary::decode_dict(&raw);
-        assert!(
-            decoded.is_err(),
-            "dictionary with zero repeat offset must be rejected"
-        );
+        assert!(matches!(
+            decoded,
+            Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 0 })
+        ));
+    }
+
+    #[test]
+    fn from_raw_content_rejects_empty_dictionary_content() {
+        let result = Dictionary::from_raw_content(1, Vec::new());
+        assert!(matches!(
+            result,
+            Err(DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 1 })
+        ));
     }
 }
diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index cf5fc069..ed4c2973 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -649,7 +649,7 @@ mod tests {
             crate::decoding::Dictionary::from_raw_content(dict_id, raw_dict.clone()).unwrap();
 
         let mut payload = Vec::new();
-        for idx in 0..512u32 {
+        for idx in 0..96u32 {
             payload.extend_from_slice(
                 format!(
                     "tenant=demo table=orders op=put key={idx} value=aaaaabbbbbcccccdddddeeeee\n"
@@ -658,6 +658,12 @@ mod tests {
             );
         }
 
+        let mut without_dict = Vec::new();
+        let mut baseline = FrameCompressor::new(super::CompressionLevel::Fastest);
+        baseline.set_source(payload.as_slice());
+        baseline.set_drain(&mut without_dict);
+        baseline.compress();
+
         let mut with_dict = Vec::new();
         let mut compressor = FrameCompressor::new(super::CompressionLevel::Fastest);
         compressor
@@ -675,13 +681,17 @@ mod tests {
         let mut decoded = Vec::with_capacity(payload.len());
         decoder.decode_all_to_vec(&with_dict, &mut decoded).unwrap();
         assert_eq!(decoded, payload);
+        assert!(
+            with_dict.len() < without_dict.len(),
+            "trained dictionary should improve compression for this small payload"
+        );
     }
 
     #[test]
     fn set_dictionary_from_bytes_seeds_entropy_tables_for_first_block() {
         let dict_raw = include_bytes!("../../dict_tests/dictionary");
         let mut output = Vec::new();
-        let input = b"short-payload-without-obvious-repetitions";
+        let input = b"";
 
         let mut compressor = FrameCompressor::new(super::CompressionLevel::Fastest);
         let previous = compressor