Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
f45a5a8
feat(encoding): add dictionary compression support
polaz Mar 28, 2026
373eef0
fix(encoding): harden dictionary input handling
polaz Mar 28, 2026
05a712c
style(tests): format dictionary panic regression asserts
polaz Mar 28, 2026
dc4ec49
fix(dict): align defaults and clarify dictionary priming
polaz Mar 28, 2026
9093834
test(packaging): handle must_use and trim crate payload
polaz Mar 28, 2026
f12d234
fix(dict): seed encoder entropy and reject zero repcodes
polaz Mar 28, 2026
0fd86e8
perf(encoding): cache dictionary entropy tables
polaz Mar 28, 2026
7d8a5f0
fix(encoding): harden dictionary priming suffix reuse
polaz Mar 28, 2026
253e788
fix(encoding): harden dictionary priming invariants
polaz Mar 28, 2026
c8c477b
fix(encoding): validate repcodes and retain dict history
polaz Mar 28, 2026
5770877
test(encoding): avoid moving frame decode error in assert
polaz Mar 28, 2026
29ac4e7
test(encoding): cover dfast priming and decouple header window
polaz Mar 28, 2026
b8ed722
chore(encoding): document len_log zero guard
polaz Mar 28, 2026
4701190
fix(encoding): cap dictionary retention to committed bytes
polaz Mar 28, 2026
2fb69e2
style(encoding): format tail budget subtraction
polaz Mar 28, 2026
80907a8
fix(encoding): skip dictionary state for raw frames
polaz Mar 29, 2026
a4cf2f0
fix(encoding): address renewed review threads
polaz Mar 29, 2026
42450b7
test(encoding): cover window crossing with dictionaries
polaz Mar 29, 2026
a58db02
fix(encoding): gate dictionary id on matcher capability
polaz Mar 29, 2026
5da6696
fix(encoding): retire dictionary budget on eviction
polaz Mar 29, 2026
702c64d
perf(encoding): avoid extra entropy-table cloning
polaz Mar 29, 2026
3705230
fix(encoding): correct dfast eviction accounting
polaz Mar 29, 2026
f11ac36
fix(encoding): preserve entropy tables for clone_from reuse
polaz Mar 29, 2026
127d41d
fix(dictionary): reject empty raw dictionaries
polaz Mar 29, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion zstd/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ license = "Apache-2.0"
homepage = "https://github.com/structured-world/structured-zstd"
repository = "https://github.com/structured-world/structured-zstd"
description = "Pure Rust zstd implementation — managed fork of ruzstd. Dictionary decompression, no FFI."
exclude = ["dict_tests/*", "fuzz_decodecorpus/*", "decodecorpus_files/*"]
exclude = ["fuzz_decodecorpus/*", "decodecorpus_files/*", "dict_tests/files/**"]
# Package metadata points at a crate-local symlink so the packaged crate and repo root README stay in sync.
readme = "README.md"
keywords = ["zstd", "zstandard", "decompression", "compression", "pure-rust"]
Expand Down
160 changes: 157 additions & 3 deletions zstd/src/decoding/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,50 @@ pub struct Dictionary {
pub const MAGIC_NUM: [u8; 4] = [0x37, 0xA4, 0x30, 0xEC];

impl Dictionary {
/// Parses the dictionary from `raw` and set the tables
/// it returns the dict_id for checking with the frame's `dict_id``
/// Build a dictionary from raw content bytes (without entropy table sections).
///
/// This is primarily intended for dictionaries produced by the `dict_builder`
/// module, which currently emits raw-content dictionaries.
///
/// The returned dictionary carries freshly constructed FSE and Huffman scratch
/// state and the repeat-offset history `[1, 4, 8]`; `dict_content` is stored as-is.
///
/// # Errors
///
/// - [`DictionaryDecodeError::ZeroDictionaryId`] if `id` is `0`.
/// - [`DictionaryDecodeError::DictionaryTooSmall`] (with `got: 0, need: 1`) if
///   `dict_content` is empty.
pub fn from_raw_content(
id: u32,
dict_content: Vec<u8>,
) -> Result<Dictionary, DictionaryDecodeError> {
// A zero id is rejected up front, mirroring the check in `decode_dict`.
if id == 0 {
return Err(DictionaryDecodeError::ZeroDictionaryId);
}
// At least one byte of content is required.
if dict_content.is_empty() {
return Err(DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 1 });
}

Ok(Dictionary {
id,
fse: FSEScratch::new(),
huf: HuffmanScratch::new(),
dict_content,
offset_hist: [1, 4, 8],
})
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}

/// Parses the dictionary from `raw`, initializes its tables,
/// and returns a fully constructed [`Dictionary`] whose `id` can be
/// checked against the frame's `dict_id`.
pub fn decode_dict(raw: &[u8]) -> Result<Dictionary, DictionaryDecodeError> {
const MIN_MAGIC_AND_ID_LEN: usize = 8;
const OFFSET_HISTORY_LEN: usize = 12;

if raw.len() < MIN_MAGIC_AND_ID_LEN {
return Err(DictionaryDecodeError::DictionaryTooSmall {
got: raw.len(),
need: MIN_MAGIC_AND_ID_LEN,
});
}

let mut new_dict = Dictionary {
id: 0,
fse: FSEScratch::new(),
huf: HuffmanScratch::new(),
dict_content: Vec::new(),
offset_hist: [2, 4, 8],
offset_hist: [1, 4, 8],
};

let magic_num: [u8; 4] = raw[..4].try_into().expect("optimized away");
Expand All @@ -58,6 +93,9 @@ impl Dictionary {

let dict_id = raw[4..8].try_into().expect("optimized away");
let dict_id = u32::from_le_bytes(dict_id);
if dict_id == 0 {
return Err(DictionaryDecodeError::ZeroDictionaryId);
}
Comment thread
polaz marked this conversation as resolved.
new_dict.id = dict_id;

let raw_tables = &raw[8..];
Expand All @@ -83,6 +121,13 @@ impl Dictionary {
)?;
let raw_tables = &raw_tables[ll_size..];

if raw_tables.len() < OFFSET_HISTORY_LEN {
return Err(DictionaryDecodeError::DictionaryTooSmall {
got: raw_tables.len(),
need: OFFSET_HISTORY_LEN,
});
}

let offset1 = raw_tables[0..4].try_into().expect("optimized away");
let offset1 = u32::from_le_bytes(offset1);

Expand All @@ -92,6 +137,16 @@ impl Dictionary {
let offset3 = raw_tables[8..12].try_into().expect("optimized away");
let offset3 = u32::from_le_bytes(offset3);

if offset1 == 0 {
return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 0 });
}
if offset2 == 0 {
return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 1 });
}
if offset3 == 0 {
return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 2 });
}

new_dict.offset_hist[0] = offset1;
new_dict.offset_hist[1] = offset2;
new_dict.offset_hist[2] = offset3;
Expand All @@ -102,3 +157,102 @@ impl Dictionary {
Ok(new_dict)
}
}

#[cfg(test)]
mod tests {
use super::*;

/// Returns the byte position at which the repeat-offset history begins in `raw`.
///
/// Starting after the 8-byte magic-number/id prefix, the Huffman table and then the
/// OF, ML and LL FSE tables are decoded in that order; each decoder reports how many
/// bytes it consumed, and the running total is the returned position.
///
/// Panics if any of the tables fails to decode.
fn offset_history_start(raw: &[u8]) -> usize {
    use crate::decoding::scratch::{FSEScratch, HuffmanScratch};
    use crate::decoding::sequence_section_decoder::{LL_MAX_LOG, ML_MAX_LOG, OF_MAX_LOG};

    // Magic number (4 bytes) plus dictionary id (4 bytes) precede the tables.
    const TABLES_START: usize = 8;

    let mut huffman = HuffmanScratch::new();
    let mut fse_tables = FSEScratch::new();

    let huffman_len = huffman
        .table
        .build_decoder(&raw[TABLES_START..])
        .expect("reference dictionary huffman table should decode") as usize;
    let of_start = TABLES_START + huffman_len;

    let of_len = fse_tables
        .offsets
        .build_decoder(&raw[of_start..], OF_MAX_LOG)
        .expect("reference dictionary OF table should decode");
    let ml_start = of_start + of_len;

    let ml_len = fse_tables
        .match_lengths
        .build_decoder(&raw[ml_start..], ML_MAX_LOG)
        .expect("reference dictionary ML table should decode");
    let ll_start = ml_start + ml_len;

    let ll_len = fse_tables
        .literal_lengths
        .build_decoder(&raw[ll_start..], LL_MAX_LOG)
        .expect("reference dictionary LL table should decode");

    ll_start + ll_len
}

/// An empty buffer cannot hold the magic number and dictionary id, so decoding must
/// fail with `DictionaryTooSmall` reporting the 8-byte minimum.
#[test]
fn decode_dict_rejects_short_buffer_before_magic_and_id() {
    // Going through `Option::expect` avoids requiring `Debug` on the success type.
    let failure = Dictionary::decode_dict(&[])
        .err()
        .expect("expected short dictionary to fail");

    assert!(matches!(
        failure,
        DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 8 }
    ));
}

/// A buffer with a valid magic number and non-zero id followed by seven zero bytes
/// must be rejected with an error rather than a panic.
#[test]
fn decode_dict_malformed_input_returns_error_instead_of_panicking() {
    // Layout: magic number, dictionary id 1 (little endian), then 7 zero bytes.
    let mut malformed = MAGIC_NUM.to_vec();
    malformed.extend_from_slice(&1u32.to_le_bytes());
    malformed.extend_from_slice(&[0u8; 7]);

    let outcome = std::panic::catch_unwind(|| Dictionary::decode_dict(&malformed));
    match outcome {
        Err(_) => panic!("decode_dict must not panic on malformed input"),
        Ok(decoded) => assert!(decoded.is_err(), "malformed dictionary must return error"),
    }
}

/// Zeroing the first repeat offset of the reference dictionary must make decoding
/// fail with `ZeroRepeatOffsetInDictionary { index: 0 }`.
#[test]
fn decode_dict_rejects_zero_repeat_offsets() {
    let mut corrupted = include_bytes!("../../dict_tests/dictionary").to_vec();

    // The first repeat offset is the 4-byte little-endian value right after the tables.
    let rep0_start = offset_history_start(&corrupted);
    let rep0_end = rep0_start + 4;
    corrupted[rep0_start..rep0_end].fill(0);

    assert!(matches!(
        Dictionary::decode_dict(&corrupted),
        Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 0 })
    ));
}

/// A raw-content dictionary with a valid id but no content bytes must be rejected
/// with `DictionaryTooSmall { got: 0, need: 1 }`.
#[test]
fn from_raw_content_rejects_empty_dictionary_content() {
    let empty_content: Vec<u8> = Vec::new();

    assert!(matches!(
        Dictionary::from_raw_content(1, empty_content),
        Err(DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 1 })
    ));
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}
15 changes: 15 additions & 0 deletions zstd/src/decoding/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,9 @@ impl core::fmt::Display for DecodeBufferError {
#[non_exhaustive]
pub enum DictionaryDecodeError {
/// The leading four bytes (`got`) were not the expected dictionary magic number.
BadMagicNum { got: [u8; 4] },
/// The input held fewer bytes (`got`) than the section being read requires (`need`).
DictionaryTooSmall { got: usize, need: usize },
/// The dictionary id was `0`, which is rejected.
ZeroDictionaryId,
/// The repeat offset at zero-based position `index` in the dictionary was `0`.
ZeroRepeatOffsetInDictionary { index: u8 },
/// Wraps an [`FSETableError`]; presumably raised while building one of the
/// dictionary's FSE tables (the conversion site is not visible here).
FSETableError(FSETableError),
/// Wraps a [`HuffmanTableError`]; presumably raised while building the
/// dictionary's Huffman table (the conversion site is not visible here).
HuffmanTableError(HuffmanTableError),
}
Expand All @@ -451,6 +454,18 @@ impl core::fmt::Display for DictionaryDecodeError {
crate::decoding::dictionary::MAGIC_NUM,
)
}
DictionaryDecodeError::DictionaryTooSmall { got, need } => {
write!(
f,
"Dictionary is too small: got {got} bytes, need at least {need} bytes",
)
}
DictionaryDecodeError::ZeroDictionaryId => {
write!(f, "Dictionary id must be non-zero")
}
DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index } => {
write!(f, "Dictionary repeat offset rep{index} must be non-zero")
}
DictionaryDecodeError::FSETableError(e) => write!(f, "{e:?}"),
DictionaryDecodeError::HuffmanTableError(e) => write!(f, "{e:?}"),
}
Expand Down
Loading
Loading