diff --git a/zstd/src/encoding/blocks/compressed.rs b/zstd/src/encoding/blocks/compressed.rs index ad7f413c..1d0fbac2 100644 --- a/zstd/src/encoding/blocks/compressed.rs +++ b/zstd/src/encoding/blocks/compressed.rs @@ -8,6 +8,9 @@ use crate::{ huff0::huff0_encoder, }; +/// Compile-time guarantee that MAX_BLOCK_SIZE fits in the 18-bit size format. +const _: () = assert!(crate::common::MAX_BLOCK_SIZE <= 262_143); + /// A block of [`crate::common::BlockType::Compressed`] pub fn compress_block(state: &mut CompressState, output: &mut Vec<u8>) { let mut literals_vec = Vec::new(); @@ -341,12 +344,25 @@ fn compress_literals( writer.write_bits(3u8, 2); // treeless compressed literals type } + // RFC 8878 §3.1.1.3.1.1 Size_Format (spec limits): + // 0b00: single stream, 10-bit (≤ 1023) | 0b01: 4 streams, 10-bit (≤ 1023) + // 0b10: 4 streams, 14-bit (≤ 16383) | 0b11: 4 streams, 18-bit (≤ 262143) + // + // The encoder currently only calls this function for literals > 1024 bytes + // (smaller literals use raw_literals), so only formats 0b10 and 0b11 are + // reachable in practice. The 0b00/0b01 arms are kept for completeness. + // + // Runtime: hard guard — truncated 18-bit writes produce corrupt streams. + // Note: format args omitted intentionally to avoid uncoverable dead code in coverage. 
+ assert!( + literals.len() <= 262_143, + "literals exceed RFC 8878 18-bit size limit (262143)" + ); let (size_format, size_bits) = match literals.len() { 0..6 => (0b00u8, 10), 6..1024 => (0b01, 10), 1024..16384 => (0b10, 14), - 16384..262144 => (0b11, 18), - _ => unimplemented!("too many literals"), + _ => (0b11, 18), }; writer.write_bits(size_format, 2); diff --git a/zstd/src/tests/roundtrip_integrity.rs b/zstd/src/tests/roundtrip_integrity.rs index b246ced8..6aeb7350 100644 --- a/zstd/src/tests/roundtrip_integrity.rs +++ b/zstd/src/tests/roundtrip_integrity.rs @@ -60,6 +60,21 @@ fn roundtrip_streaming(data: &[u8]) -> Vec<u8> { result } +/// Generate data with limited alphabet for better Huffman compressibility +/// but enough variety to avoid RLE path. +fn generate_huffman_friendly(seed: u64, len: usize, alphabet_size: u8) -> Vec<u8> { + assert!(alphabet_size > 0, "alphabet_size must be non-zero"); + let mut state = seed; + let mut data = Vec::with_capacity(len); + for _ in 0..len { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + data.push(((state >> 33) as u8) % alphabet_size); + } + data +} + // Cross-validation tests (pure Rust ↔ C FFI) are in tests/cross_validation.rs // because dev-dependencies (zstd) aren't available in library test modules. @@ -130,3 +145,45 @@ fn roundtrip_edge_cases() { let rle = vec![0xABu8; 1_000_000]; assert_eq!(roundtrip_simple(&rle), rle); } + +/// Roundtrip tests with large inputs that produce large literal sections. +/// +/// The encoder uses `compress_literals` (Huffman) for literals > 1024 bytes, +/// so these inputs exercise the 14-bit (0b10) and 18-bit (0b11) size formats. +/// The exact literals size depends on how many matches the encoder finds, +/// so we verify roundtrip correctness rather than specific format selection. +#[test] +fn roundtrip_large_literals() { + // ~1KB input — just above the raw→Huffman threshold. 
+ let data_1025 = generate_huffman_friendly(42, 1025, 16); + assert_eq!(roundtrip_simple(&data_1025), data_1025); + assert_eq!(roundtrip_streaming(&data_1025), data_1025); + + // ~16KB input — near the 14-bit/18-bit boundary. + let data_16383 = generate_huffman_friendly(43, 16383, 32); + assert_eq!(roundtrip_simple(&data_16383), data_16383); + + let data_16384 = generate_huffman_friendly(44, 16384, 32); + assert_eq!(roundtrip_simple(&data_16384), data_16384); + assert_eq!(roundtrip_streaming(&data_16384), data_16384); + + // 64KB input — well within the 18-bit range. + let data_64k = generate_huffman_friendly(45, 65536, 64); + assert_eq!(roundtrip_simple(&data_64k), data_64k); + + // 128KB input — MAX_BLOCK_SIZE, the largest single block. + let data_128k = generate_huffman_friendly(46, 128 * 1024, 64); + assert_eq!(roundtrip_simple(&data_128k), data_128k); + assert_eq!(roundtrip_streaming(&data_128k), data_128k); +} + +/// Multi-block data larger than MAX_BLOCK_SIZE that exercises the 4-stream +/// Huffman encoding across multiple blocks, each with large literal sections. +#[test] +fn roundtrip_multi_block_large_literals() { + // 512KB of Huffman-friendly data — will be split into multiple 128KB blocks, + // each exercising the 18-bit (0b11) size format with 4-stream encoding. + let data = generate_huffman_friendly(100, 512 * 1024, 48); + assert_eq!(roundtrip_simple(&data), data); + assert_eq!(roundtrip_streaming(&data), data); +} diff --git a/zstd/tests/cross_validation.rs b/zstd/tests/cross_validation.rs index 619521d0..83b6e783 100644 --- a/zstd/tests/cross_validation.rs +++ b/zstd/tests/cross_validation.rs @@ -21,6 +21,20 @@ fn generate_data(seed: u64, len: usize) -> Vec<u8> { data } +/// Generate data with limited alphabet for Huffman-friendly compression. 
+fn generate_huffman_friendly(seed: u64, len: usize, alphabet_size: u8) -> Vec<u8> { + assert!(alphabet_size > 0, "alphabet_size must be non-zero"); + let mut state = seed; + let mut data = Vec::with_capacity(len); + for _ in 0..len { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + data.push(((state >> 33) as u8) % alphabet_size); + } + data +} + #[test] fn cross_rust_compress_ffi_decompress_1000() { for i in 0..1000u64 { @@ -52,3 +66,52 @@ fn cross_ffi_compress_rust_decompress_1000() { ); } } + +/// Cross-validate large inputs (1KB–512KB) that produce large literal sections, +/// verifying C zstd can decompress what our encoder produces. +#[test] +fn cross_rust_compress_ffi_decompress_large_blocks() { + let sizes = [1025, 16384, 65536, 128 * 1024]; + for (i, &size) in sizes.iter().enumerate() { + let data = generate_huffman_friendly(i as u64 + 200, size, 48); + + let compressed = compress_to_vec(&data[..], CompressionLevel::Fastest); + let result = zstd::decode_all(compressed.as_slice()).unwrap(); + assert_eq!( + data, result, + "rust→ffi large block roundtrip failed at size={size}" + ); + } + + // Multi-block: 512KB forces multiple blocks, each with large literals + let data = generate_huffman_friendly(300, 512 * 1024, 48); + let compressed = compress_to_vec(&data[..], CompressionLevel::Fastest); + let result = zstd::decode_all(compressed.as_slice()).unwrap(); + assert_eq!(data, result, "rust→ffi multi-block roundtrip failed"); +} + +/// Cross-validate C FFI compress → Rust decompress for large blocks. 
+#[test] +fn cross_ffi_compress_rust_decompress_large_blocks() { + let sizes = [1025, 16384, 65536, 128 * 1024]; + for (i, &size) in sizes.iter().enumerate() { + let data = generate_huffman_friendly(i as u64 + 400, size, 48); + + let compressed = zstd::encode_all(&data[..], 1).unwrap(); + let mut decoder = StreamingDecoder::new(compressed.as_slice()).unwrap(); + let mut result = Vec::new(); + decoder.read_to_end(&mut result).unwrap(); + assert_eq!( + data, result, + "ffi→rust large block roundtrip failed at size={size}" + ); + } + + // Multi-block: 512KB + let data = generate_huffman_friendly(500, 512 * 1024, 48); + let compressed = zstd::encode_all(&data[..], 1).unwrap(); + let mut decoder = StreamingDecoder::new(compressed.as_slice()).unwrap(); + let mut result = Vec::new(); + decoder.read_to_end(&mut result).unwrap(); + assert_eq!(data, result, "ffi→rust multi-block roundtrip failed"); +}