20 changes: 18 additions & 2 deletions zstd/src/encoding/blocks/compressed.rs
@@ -8,6 +8,9 @@ use crate::{
    huff0::huff0_encoder,
};

/// Compile-time guarantee that MAX_BLOCK_SIZE fits in the 18-bit size
/// format (2^18 - 1 = 262_143).
const _: () = assert!(crate::common::MAX_BLOCK_SIZE <= 262_143);

/// A block of [`crate::common::BlockType::Compressed`]
pub fn compress_block<M: Matcher>(state: &mut CompressState<M>, output: &mut Vec<u8>) {
    let mut literals_vec = Vec::new();
@@ -341,12 +344,25 @@ fn compress_literals(
        writer.write_bits(3u8, 2); // treeless compressed literals type
    }

    // RFC 8878 §3.1.1.3.1.1 Size_Format (spec limits):
    // 0b00: single stream, 10-bit (≤ 1023) | 0b01: 4 streams, 10-bit (≤ 1023)
    // 0b10: 4 streams, 14-bit (≤ 16383)    | 0b11: 4 streams, 18-bit (≤ 262143)
    //
    // The encoder currently only calls this function for literals > 1024 bytes
    // (smaller literals use raw_literals), so only formats 0b10 and 0b11 are
    // reachable in practice. The 0b00/0b01 arms are kept for completeness.
    //
    // Runtime: hard guard; truncated 18-bit writes would produce corrupt streams.
    // Note: format args omitted intentionally to avoid uncoverable dead code in coverage.
    assert!(
        literals.len() <= 262_143,
        "literals exceed RFC 8878 18-bit size limit (262143)"
    );
    let (size_format, size_bits) = match literals.len() {
        0..6 => (0b00u8, 10),
        6..1024 => (0b01, 10),
        1024..16384 => (0b10, 14),
        16384..262144 => (0b11, 18),
-       _ => unimplemented!("too many literals"),
+       _ => (0b11, 18),
    };

    writer.write_bits(size_format, 2);
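
The Size_Format mapping above is compact enough to restate as a standalone function. A minimal sketch (the name `literals_size_format` is hypothetical, not part of the crate):

```rust
/// Maps a literals length to (Size_Format, bits per size field) per
/// RFC 8878 §3.1.1.3.1.1. Exclusive range patterns need Rust 1.80+.
fn literals_size_format(len: usize) -> (u8, u32) {
    // Same hard guard as the diff: 2^18 - 1 = 262_143 is the largest
    // representable size, so anything bigger would corrupt the stream.
    assert!(len <= 262_143, "literals exceed RFC 8878 18-bit size limit");
    match len {
        0..6 => (0b00, 10),        // single stream, 10-bit sizes
        6..1024 => (0b01, 10),     // 4 streams, 10-bit sizes
        1024..16384 => (0b10, 14), // 4 streams, 14-bit sizes
        _ => (0b11, 18),           // 4 streams, 18-bit sizes
    }
}
```

After this selection the encoder writes the two-bit literals type and the two-bit `size_format` (as shown above); `size_bits` sizes the fields that follow, beyond the lines in this hunk. With the raw-literals cutoff at 1024 bytes, only the 0b10 and 0b11 arms are reachable here.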
57 changes: 57 additions & 0 deletions zstd/src/tests/roundtrip_integrity.rs
@@ -60,6 +60,21 @@ fn roundtrip_streaming(data: &[u8]) -> Vec<u8> {
    result
}

/// Generate data with a limited alphabet for better Huffman compressibility
/// but enough variety to avoid the RLE path.
fn generate_huffman_friendly(seed: u64, len: usize, alphabet_size: u8) -> Vec<u8> {
    assert!(alphabet_size > 0, "alphabet_size must be non-zero");
    let mut state = seed;
    let mut data = Vec::with_capacity(len);
    for _ in 0..len {
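        // Step a 64-bit LCG (Knuth's MMIX multiplier and increment); the byte
        // is taken from higher-order bits, which have longer periods in an LCG.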
        state = state
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        data.push(((state >> 33) as u8) % alphabet_size);
    }
    data
}

// Cross-validation tests (pure Rust ↔ C FFI) are in tests/cross_validation.rs
// because dev-dependencies (zstd) aren't available in library test modules.

@@ -130,3 +145,45 @@ fn roundtrip_edge_cases() {
    let rle = vec![0xABu8; 1_000_000];
    assert_eq!(roundtrip_simple(&rle), rle);
}
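
For context, the `roundtrip_simple` helper these tests call is defined earlier in this file, outside the diff. A plausible sketch of its shape, assuming the crate's `compress_to_vec` and `StreamingDecoder` APIs seen in tests/cross_validation.rs below:

```rust
// Hypothetical sketch, not the file's actual helper.
use std::io::Read;

fn roundtrip_simple_sketch(data: &[u8]) -> Vec<u8> {
    let compressed = compress_to_vec(data, CompressionLevel::Fastest);
    let mut decoder = StreamingDecoder::new(compressed.as_slice()).unwrap();
    let mut out = Vec::new();
    decoder.read_to_end(&mut out).unwrap();
    out
}
```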

/// Roundtrip tests with large inputs that produce large literal sections.
///
/// The encoder uses `compress_literals` (Huffman) for literals > 1024 bytes,
/// so these inputs exercise the 14-bit (0b10) and 18-bit (0b11) size formats.
/// The exact literals size depends on how many matches the encoder finds,
/// so we verify roundtrip correctness rather than specific format selection.
#[test]
fn roundtrip_large_literals() {
    // ~1KB input, just above the raw→Huffman threshold.
    let data_1025 = generate_huffman_friendly(42, 1025, 16);
    assert_eq!(roundtrip_simple(&data_1025), data_1025);
    assert_eq!(roundtrip_streaming(&data_1025), data_1025);

    // ~16KB input, near the 14-bit/18-bit boundary.
    let data_16383 = generate_huffman_friendly(43, 16383, 32);
    assert_eq!(roundtrip_simple(&data_16383), data_16383);

    let data_16384 = generate_huffman_friendly(44, 16384, 32);
    assert_eq!(roundtrip_simple(&data_16384), data_16384);
    assert_eq!(roundtrip_streaming(&data_16384), data_16384);

    // 64KB input, well within the 18-bit range.
    let data_64k = generate_huffman_friendly(45, 65536, 64);
    assert_eq!(roundtrip_simple(&data_64k), data_64k);

    // 128KB input: MAX_BLOCK_SIZE, the largest single block.
    let data_128k = generate_huffman_friendly(46, 128 * 1024, 64);
    assert_eq!(roundtrip_simple(&data_128k), data_128k);
    assert_eq!(roundtrip_streaming(&data_128k), data_128k);
}

/// Multi-block data larger than MAX_BLOCK_SIZE that exercises the 4-stream
/// Huffman encoding across multiple blocks, each with large literal sections.
#[test]
fn roundtrip_multi_block_large_literals() {
    // 512KB of Huffman-friendly data: it will be split into multiple 128KB
    // blocks, each exercising the 18-bit (0b11) size format with 4-stream encoding.
    let data = generate_huffman_friendly(100, 512 * 1024, 48);
    assert_eq!(roundtrip_simple(&data), data);
    assert_eq!(roundtrip_streaming(&data), data);
}
63 changes: 63 additions & 0 deletions zstd/tests/cross_validation.rs
@@ -21,6 +21,20 @@ fn generate_data(seed: u64, len: usize) -> Vec<u8> {
    data
}

/// Generate data with a limited alphabet for Huffman-friendly compression.
fn generate_huffman_friendly(seed: u64, len: usize, alphabet_size: u8) -> Vec<u8> {
    assert!(alphabet_size > 0, "alphabet_size must be non-zero");
    let mut state = seed;
    let mut data = Vec::with_capacity(len);
    for _ in 0..len {
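        // Step a 64-bit LCG (Knuth's MMIX multiplier and increment); the byte
        // is taken from higher-order bits, which have longer periods in an LCG.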
        state = state
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        data.push(((state >> 33) as u8) % alphabet_size);
    }
    data
}

#[test]
fn cross_rust_compress_ffi_decompress_1000() {
    for i in 0..1000u64 {
@@ -52,3 +66,52 @@ fn cross_ffi_compress_rust_decompress_1000() {
        );
    }
}

/// Cross-validate large inputs (1KB–512KB) that produce large literal sections,
/// verifying C zstd can decompress what our encoder produces.
#[test]
fn cross_rust_compress_ffi_decompress_large_blocks() {
    let sizes = [1025, 16384, 65536, 128 * 1024];
    for (i, &size) in sizes.iter().enumerate() {
        let data = generate_huffman_friendly(i as u64 + 200, size, 48);

        let compressed = compress_to_vec(&data[..], CompressionLevel::Fastest);
        let result = zstd::decode_all(compressed.as_slice()).unwrap();
        assert_eq!(
            data, result,
            "rust→ffi large block roundtrip failed at size={size}"
        );
    }

    // Multi-block: 512KB forces multiple blocks, each with large literals
    let data = generate_huffman_friendly(300, 512 * 1024, 48);
    let compressed = compress_to_vec(&data[..], CompressionLevel::Fastest);
    let result = zstd::decode_all(compressed.as_slice()).unwrap();
    assert_eq!(data, result, "rust→ffi multi-block roundtrip failed");
}

/// Cross-validate C FFI compress → Rust decompress for large blocks.
#[test]
fn cross_ffi_compress_rust_decompress_large_blocks() {
    let sizes = [1025, 16384, 65536, 128 * 1024];
    for (i, &size) in sizes.iter().enumerate() {
        let data = generate_huffman_friendly(i as u64 + 400, size, 48);

        let compressed = zstd::encode_all(&data[..], 1).unwrap();
        let mut decoder = StreamingDecoder::new(compressed.as_slice()).unwrap();
        let mut result = Vec::new();
        decoder.read_to_end(&mut result).unwrap();
        assert_eq!(
            data, result,
            "ffi→rust large block roundtrip failed at size={size}"
        );
    }

    // Multi-block: 512KB
    let data = generate_huffman_friendly(500, 512 * 1024, 48);
    let compressed = zstd::encode_all(&data[..], 1).unwrap();
    let mut decoder = StreamingDecoder::new(compressed.as_slice()).unwrap();
    let mut result = Vec::new();
    decoder.read_to_end(&mut result).unwrap();
    assert_eq!(data, result, "ffi→rust multi-block roundtrip failed");
}