Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions zstd/src/encoding/frame_compressor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,87 @@ mod tests {
}
}

#[cfg(feature = "std")]
#[test]
fn source_size_hint_fastest_remains_ffi_compatible_small_input() {
    // 2047 bytes of a single repeated byte: just under the 2 KiB boundary.
    let payload = vec![0xAB; 2047];

    // Compress with the pure-Rust encoder, advertising the exact source size.
    let mut encoder = FrameCompressor::new(super::CompressionLevel::Fastest);
    encoder.set_source_size_hint(payload.len() as u64);
    encoder.set_source(payload.as_slice());
    let mut frame = Vec::new();
    encoder.set_drain(&mut frame);
    encoder.compress();

    // The reference C implementation must decode the hinted frame losslessly.
    let mut roundtripped = Vec::new();
    zstd::stream::copy_decode(frame.as_slice(), &mut roundtripped).unwrap();
    assert_eq!(roundtripped, payload);
}

#[cfg(feature = "std")]
#[test]
fn source_size_hint_levels_remain_ffi_compatible_small_inputs_matrix() {
    // Deterministic pseudo-random bytes via a 64-bit LCG (MMIX constants),
    // taking the high bits of each state for better distribution.
    fn pseudo_random_bytes(seed: u64, len: usize) -> Vec<u8> {
        let mut state = seed;
        (0..len)
            .map(|_| {
                state = state
                    .wrapping_mul(6364136223846793005)
                    .wrapping_add(1442695040888963407);
                (state >> 33) as u8
            })
            .collect()
    }

    // Cover named levels plus explicit numeric ones, including the negative
    // and the high end of the range.
    let levels = [
        super::CompressionLevel::Fastest,
        super::CompressionLevel::Default,
        super::CompressionLevel::Better,
        super::CompressionLevel::Best,
        super::CompressionLevel::Level(-1),
        super::CompressionLevel::Level(2),
        super::CompressionLevel::Level(3),
        super::CompressionLevel::Level(4),
        super::CompressionLevel::Level(11),
    ];
    // Sizes straddling power-of-two block/window boundaries.
    let sizes = [513usize, 1023, 1024, 1536, 2047, 2048, 4095, 4096, 8191];

    for (seed_idx, seed) in [11u64, 23, 41].into_iter().enumerate() {
        for &size in &sizes {
            let effective_seed = seed + seed_idx as u64;
            let data = pseudo_random_bytes(effective_seed, size);
            for &level in &levels {
                // Hinted pure-Rust compression of this (level, size, seed) cell.
                let mut compressor = FrameCompressor::new(level);
                compressor.set_source_size_hint(data.len() as u64);
                compressor.set_source(data.as_slice());
                let mut compressed = Vec::new();
                compressor.set_drain(&mut compressed);
                compressor.compress();

                // Decode through the C FFI; failures identify the exact cell.
                let mut decoded = Vec::new();
                zstd::stream::copy_decode(compressed.as_slice(), &mut decoded)
                    .unwrap_or_else(|e| {
                        panic!(
                            "ffi decode failed with source-size hint: level={level:?} size={size} seed={} err={e}",
                            effective_seed
                        )
                    });
                assert_eq!(
                    decoded, data,
                    "hinted ffi roundtrip mismatch: level={level:?} size={size} seed={}",
                    effective_seed
                );
            }
        }
    }
}

struct NoDictionaryMatcher {
last_space: Vec<u8>,
window_size: u64,
Expand Down
100 changes: 86 additions & 14 deletions zstd/src/encoding/match_generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,25 +145,33 @@ const LEVEL_TABLE: [LevelParams; 22] = [

/// Smallest window_log the encoder will use regardless of source size.
const MIN_WINDOW_LOG: u8 = 10;
/// Conservative floor for source-size-hinted window tuning.
///
/// Hinted windows below 16 KiB (`window_log < 14`) currently regress C-FFI
/// interoperability on certain compressed-block patterns. Keep hinted
/// windows at 16 KiB or larger until that compatibility gap is closed.
const MIN_HINTED_WINDOW_LOG: u8 = 14;

/// Adjust level parameters for a known source size.
///
/// This derives a cap from `ceil(log2(src_size))`, then clamps it to
/// [`MIN_WINDOW_LOG`]. A zero-byte size hint is treated as
/// [`MIN_WINDOW_LOG`]. This keeps tables bounded for
/// small inputs while preserving the encoder's minimum supported window.
/// [`MIN_HINTED_WINDOW_LOG`] (16 KiB). A zero-byte size hint is treated as
/// [`MIN_WINDOW_LOG`] for the raw ceil-log step and then promoted to the hinted
/// floor. This keeps tables bounded for small inputs while preserving the
/// encoder's baseline minimum supported window.
/// For the HC backend, `hash_log` and `chain_log` are reduced
/// proportionally.
fn adjust_params_for_source_size(mut params: LevelParams, src_size: u64) -> LevelParams {
// Derive a source-size-based cap from ceil(log2(src_size)), then
// clamp to MIN_WINDOW_LOG. For inputs smaller than 1 KiB (or zero) we keep the
// 1 KiB minimum window instead of shrinking below that floor.
// clamp first to MIN_WINDOW_LOG (baseline encoder minimum) and then to
// MIN_HINTED_WINDOW_LOG (16 KiB hinted floor). For tiny or zero hints we
// therefore keep a 16 KiB effective minimum window in hinted mode.
let src_log = if src_size == 0 {
MIN_WINDOW_LOG
} else {
(64 - (src_size - 1).leading_zeros()) as u8 // ceil_log2
};
let src_log = src_log.max(MIN_WINDOW_LOG);
let src_log = src_log.max(MIN_WINDOW_LOG).max(MIN_HINTED_WINDOW_LOG);
Comment thread
polaz marked this conversation as resolved.
if src_log < params.window_log {
params.window_log = src_log;
}
Expand Down Expand Up @@ -2767,8 +2775,8 @@ fn driver_small_source_hint_shrinks_dfast_hash_tables() {
driver.skip_matching();
let hinted_tables = driver.dfast_matcher().short_hash.len();

assert_eq!(driver.window_size(), 1 << MIN_WINDOW_LOG);
assert_eq!(hinted_tables, 1 << MIN_WINDOW_LOG);
assert_eq!(driver.window_size(), 1 << MIN_HINTED_WINDOW_LOG);
assert_eq!(hinted_tables, 1 << MIN_HINTED_WINDOW_LOG);
assert!(
hinted_tables < full_tables,
"tiny source hint should reduce dfast table footprint"
Expand Down Expand Up @@ -2797,8 +2805,11 @@ fn driver_small_source_hint_shrinks_row_hash_tables() {
driver.skip_matching();
let hinted_rows = driver.row_matcher().row_heads.len();

assert_eq!(driver.window_size(), 1 << MIN_WINDOW_LOG);
assert_eq!(hinted_rows, 1 << ((MIN_WINDOW_LOG as usize) - ROW_LOG));
assert_eq!(driver.window_size(), 1 << MIN_HINTED_WINDOW_LOG);
assert_eq!(
hinted_rows,
1 << ((MIN_HINTED_WINDOW_LOG as usize) - ROW_LOG)
);
assert!(
hinted_rows < full_rows,
"tiny source hint should reduce row hash table footprint"
Expand Down Expand Up @@ -3010,7 +3021,7 @@ fn source_hint_clamps_driver_slice_size_to_window() {
driver.reset(CompressionLevel::Default);

let window = driver.window_size() as usize;
assert_eq!(window, 1024);
assert_eq!(window, 1 << MIN_HINTED_WINDOW_LOG);
assert_eq!(driver.slice_size, window);

let space = driver.get_next_space();
Expand All @@ -3032,7 +3043,7 @@ fn pooled_space_keeps_capacity_when_slice_size_shrinks() {
driver.reset(CompressionLevel::Default);

let small = driver.get_next_space();
assert_eq!(small.len(), 1024);
assert_eq!(small.len(), 1 << MIN_HINTED_WINDOW_LOG);
assert!(
small.capacity() >= large_capacity,
"pooled buffer capacity should be preserved to avoid shrink/grow churn"
Expand Down Expand Up @@ -3432,11 +3443,11 @@ fn driver_reset_from_row_backend_tolerates_missing_row_matcher() {
}

#[test]
fn adjust_params_for_zero_source_size_uses_min_window_floor() {
fn adjust_params_for_zero_source_size_uses_min_hinted_window_floor() {
let mut params = resolve_level_params(CompressionLevel::Level(4), None);
params.window_log = 22;
let adjusted = adjust_params_for_source_size(params, 0);
assert_eq!(adjusted.window_log, MIN_WINDOW_LOG);
assert_eq!(adjusted.window_log, MIN_HINTED_WINDOW_LOG);
}

#[test]
Expand Down Expand Up @@ -4205,3 +4216,64 @@ fn dfast_inserts_tail_positions_for_next_block_matching() {
);
assert_eq!(history, b"012345bcdeabcdeabcdeab");
}

#[test]
fn fastest_hint_iteration_23_sequences_reconstruct_source() {
    // Deterministic pseudo-random bytes via a 64-bit LCG (MMIX constants),
    // mirroring the cross-validation fixture generator.
    fn pseudo_random_bytes(seed: u64, len: usize) -> Vec<u8> {
        let mut state = seed;
        (0..len)
            .map(|_| {
                state = state
                    .wrapping_mul(6364136223846793005)
                    .wrapping_add(1442695040888963407);
                (state >> 33) as u8
            })
            .collect()
    }

    // Reproduce iteration 23 of the sweep: len = 23 * 89 % 16384 bytes.
    let iteration = 23u64;
    let base_len = (iteration * 89 % 16384) as usize;
    let mut source = pseudo_random_bytes(iteration, base_len);
    // Duplicate a 128-byte slice twice so the fixture deterministically
    // exercises the match path (Sequence::Triple) instead of only literals.
    let echo = source[128..256].to_vec();
    source.extend_from_slice(&echo);
    source.extend_from_slice(&echo);

    // Drive the match generator directly with a hinted source size.
    let mut driver = MatchGeneratorDriver::new(1024 * 128, 1);
    driver.set_source_size_hint(source.len() as u64);
    driver.reset(CompressionLevel::Fastest);
    let mut space = driver.get_next_space();
    space[..source.len()].copy_from_slice(&source);
    space.truncate(source.len());
    driver.commit_space(space);

    // Replay the emitted sequences and rebuild the input from them.
    let mut rebuilt = Vec::with_capacity(source.len());
    let mut matched = false;
    driver.start_matching(|seq| match seq {
        Sequence::Literals { literals } => rebuilt.extend_from_slice(literals),
        Sequence::Triple {
            literals,
            offset,
            match_len,
        } => {
            matched = true;
            rebuilt.extend_from_slice(literals);
            assert!(offset > 0, "offset must be non-zero");
            assert!(
                offset <= rebuilt.len(),
                "offset must reference already-produced bytes: offset={} produced={}",
                offset,
                rebuilt.len()
            );
            // Copy byte-by-byte: a match may overlap its own output
            // (offset < match_len), so a bulk range copy is not equivalent.
            let start = rebuilt.len() - offset;
            for idx in 0..match_len {
                let byte = rebuilt[start + idx];
                rebuilt.push(byte);
            }
        }
    });

    assert!(matched, "fixture must emit at least one match");
    assert_eq!(rebuilt, source);
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
17 changes: 15 additions & 2 deletions zstd/src/encoding/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,28 @@ pub fn compress<R: Read, W: Write>(source: R, target: W, level: CompressionLevel
frame_enc.compress();
}

/// Convenience function to compress some source into a Vec without reusing any resources of the compressor
/// Convenience function to compress some source into a Vec without reusing any resources of the compressor.
///
/// This helper eagerly buffers the full input (`Read`) before compression so it
/// can provide a source-size hint to the one-shot encoder path. Peak memory can
/// therefore be roughly `input_size + output_size`. For very large payloads or
/// tighter memory budgets, prefer streaming APIs such as [`StreamingEncoder`].
/// ```rust
/// use structured_zstd::encoding::{compress_to_vec, CompressionLevel};
/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
/// let compressed = compress_to_vec(data, CompressionLevel::Fastest);
/// ```
pub fn compress_to_vec<R: Read>(source: R, level: CompressionLevel) -> Vec<u8> {
let mut source = source;
let mut input = Vec::new();
source.read_to_end(&mut input).unwrap();

Comment thread
polaz marked this conversation as resolved.
Comment thread
coderabbitai[bot] marked this conversation as resolved.
let mut vec = Vec::new();
compress(source, &mut vec, level);
let mut frame_enc = FrameCompressor::new(level);
frame_enc.set_source_size_hint(input.len() as u64);
frame_enc.set_source(input.as_slice());
frame_enc.set_drain(&mut vec);
frame_enc.compress();
vec
}

Expand Down
42 changes: 42 additions & 0 deletions zstd/src/tests/roundtrip_integrity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,48 @@ fn streaming_pledged_size_uses_source_hint() {
);
}

/// One-shot `compress_to_vec` should propagate source-size hint for Default
/// so tiny payloads avoid oversized dfast window/table sizing.
#[test]
fn compress_to_vec_default_small_input_uses_source_size_hint() {
    let payload = generate_compressible(9604, 4 * 1024); // 4 KiB

    // Path A: the one-shot helper, which should auto-hint the source size.
    let hinted = compress_to_vec(&payload[..], CompressionLevel::Default);

    // Path B: a manual compressor with no hint — the legacy baseline.
    let mut baseline_enc = FrameCompressor::new(CompressionLevel::Default);
    baseline_enc.set_source(payload.as_slice());
    let mut baseline = Vec::new();
    baseline_enc.set_drain(&mut baseline);
    baseline_enc.compress();

    // Read the window each frame header advertises.
    let hinted_window = crate::decoding::frame::read_frame_header(hinted.as_slice())
        .unwrap()
        .0
        .window_size()
        .unwrap();
    let baseline_window = crate::decoding::frame::read_frame_header(baseline.as_slice())
        .unwrap()
        .0
        .window_size()
        .unwrap();

    // The hinted frame must still decode back to the original payload.
    let mut decoder = StreamingDecoder::new(hinted.as_slice()).unwrap();
    let mut decoded = Vec::new();
    decoder.read_to_end(&mut decoded).unwrap();
    assert_eq!(decoded, payload);

    assert!(
        hinted_window < baseline_window,
        "compress_to_vec(default) should advertise a smaller window on tiny payloads: auto_hint={} no_hint={}",
        hinted_window,
        baseline_window
    );
}

/// All 22 levels produce valid output for a tiny (256 byte) input with size hint.
#[test]
fn all_levels_tiny_input_with_hint() {
Expand Down
33 changes: 31 additions & 2 deletions zstd/tests/cross_validation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
//! - C FFI compress → Pure Rust decompress

use structured_zstd::decoding::StreamingDecoder;
use structured_zstd::encoding::{CompressionLevel, compress_to_vec};
use structured_zstd::encoding::{CompressionLevel, FrameCompressor, compress_to_vec};
use structured_zstd::io::Read;

/// Generate deterministic pseudo-random data using a simple LCG.
Expand Down Expand Up @@ -42,14 +42,43 @@ fn cross_rust_compress_ffi_decompress_1000() {
let data = generate_data(i, len);

let compressed = compress_to_vec(&data[..], CompressionLevel::Fastest);
let result = zstd::decode_all(compressed.as_slice()).unwrap();
let result = zstd::decode_all(compressed.as_slice()).unwrap_or_else(|e| {
panic!("rust→ffi decode failed at iteration {i}, len={len}: {e}");
});
assert_eq!(
data, result,
"rust→ffi roundtrip failed at iteration {i}, len={len}"
);
}
}

#[test]
fn cross_rust_fastest_with_source_hint_ffi_decompress_iteration_23() {
    // Reproduce iteration 23 of the 1000-iteration sweep in isolation.
    let i = 23u64;
    let len = (i * 89 % 16384) as usize;
    let data = generate_data(i, len);

    // Pure-Rust compression with an exact source-size hint.
    let mut encoder = FrameCompressor::new(CompressionLevel::Fastest);
    encoder.set_source_size_hint(data.len() as u64);
    encoder.set_source(data.as_slice());
    let mut compressed = Vec::new();
    encoder.set_drain(&mut compressed);
    encoder.compress();

    // Our own streaming decoder must round-trip the hinted frame...
    let mut rust_decoder = StreamingDecoder::new(compressed.as_slice()).unwrap();
    let mut rust_result = Vec::new();
    rust_decoder.read_to_end(&mut rust_result).unwrap();
    assert_eq!(data, rust_result, "rust decoder must accept hinted stream");

    // ...and so must the reference C implementation.
    let result = zstd::decode_all(compressed.as_slice()).unwrap_or_else(|e| {
        panic!("hinted rust→ffi decode failed at iteration {i}, len={len}: {e}")
    });
    assert_eq!(data, result, "ffi decoder must accept hinted stream");
}

#[test]
fn cross_ffi_compress_rust_decompress_1000() {
for i in 0..1000u64 {
Expand Down
Loading