From 2f2d608dcadeb3a6a4a7b587d0129254f6d873c2 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 3 Apr 2026 10:46:56 +0300 Subject: [PATCH 1/5] perf(decoding): dual-state interleaved FSE sequence decoding - Add ensure_bits() and get_bits_unchecked() to BitReaderReversed for batched unchecked bit reads after a single refill check - Add update_state_fast() to FSEDecoder using unchecked reads - Restructure both sequence decode loops (with/without RLE) to use one ensure_bits() call covering all three FSE state updates per iteration, replacing three individual per-update refill checks - Fix pre-existing bench compile error (rand 0.10 Rng -> RngExt) Closes #11 --- zstd/benches/support/mod.rs | 2 +- zstd/src/bit_io/bit_reader_reverse.rs | 23 +++++++ zstd/src/decoding/sequence_section_decoder.rs | 63 +++++++++++-------- zstd/src/fse/fse_decoder.rs | 16 +++++ 4 files changed, 76 insertions(+), 28 deletions(-) diff --git a/zstd/benches/support/mod.rs b/zstd/benches/support/mod.rs index 08875d11..79466adf 100644 --- a/zstd/benches/support/mod.rs +++ b/zstd/benches/support/mod.rs @@ -1,6 +1,6 @@ // rand 0.10: SmallRng is available with default features (no `small_rng` flag needed). // Use Rng::fill() instead of RngCore::fill_bytes(); RngCore removed from rand's public root in 0.10. -use rand::{Rng, SeedableRng, rngs::SmallRng}; +use rand::{rngs::SmallRng, RngExt, SeedableRng}; use std::{collections::HashSet, env, fs, path::Path}; use structured_zstd::encoding::CompressionLevel; diff --git a/zstd/src/bit_io/bit_reader_reverse.rs b/zstd/src/bit_io/bit_reader_reverse.rs index b6a1de5c..677338cb 100644 --- a/zstd/src/bit_io/bit_reader_reverse.rs +++ b/zstd/src/bit_io/bit_reader_reverse.rs @@ -99,6 +99,29 @@ impl<'s> BitReaderReversed<'s> { value } + /// Ensure at least `n` bits are available for subsequent unchecked reads. + /// After calling this, it is safe to call [`get_bits_unchecked`](Self::get_bits_unchecked) + /// for a combined total of up to `n` bits without individual refill checks. + /// + /// `n` must be at most 56. + #[inline(always)] + pub fn ensure_bits(&mut self, n: u8) { + debug_assert!(n <= 56); + if self.bits_consumed + n > 64 { + self.refill(); + } + } + + /// Read `n` bits from the source **without** checking whether a refill is + /// needed. The caller **must** guarantee enough bits are available (e.g. via + /// a prior [`ensure_bits`](Self::ensure_bits) call). + #[inline(always)] + pub fn get_bits_unchecked(&mut self, n: u8) -> u64 { + let value = self.peek_bits(n); + self.consume(n); + value + } + /// Get the next `n` bits from the source without consuming them. /// Caller is responsible for making sure that `n` many bits have been refilled. #[inline(always)] diff --git a/zstd/src/decoding/sequence_section_decoder.rs b/zstd/src/decoding/sequence_section_decoder.rs index 2753fe0c..8a519dff 100644 --- a/zstd/src/decoding/sequence_section_decoder.rs +++ b/zstd/src/decoding/sequence_section_decoder.rs @@ -69,6 +69,21 @@ fn decode_sequences_with_rle( target.clear(); target.reserve(section.num_sequences as usize); + // Only non-RLE decoders need state updates; compute their combined worst-case. + let max_update_bits = if scratch.ll_rle.is_none() { + scratch.literal_lengths.accuracy_log + } else { + 0 + } + if scratch.ml_rle.is_none() { + scratch.match_lengths.accuracy_log + } else { + 0 + } + if scratch.of_rle.is_none() { + scratch.offsets.accuracy_log + } else { + 0 + }; + for _seq_idx in 0..section.num_sequences { //get the codes from either the RLE byte or from the decoder let ll_code = if let Some(ll_rle) = scratch.ll_rle { @@ -90,17 +105,6 @@ fn decode_sequences_with_rle( let (ll_value, ll_num_bits) = lookup_ll_code(ll_code); let (ml_value, ml_num_bits) = lookup_ml_code(ml_code); - //println!("Sequence: {}", i); - //println!("of stat: {}", of_dec.state); - //println!("of Code: {}", of_code); - //println!("ll stat: {}", ll_dec.state); - //println!("ll bits: {}", ll_num_bits); - //println!("ll Code: {}", ll_value); - //println!("ml stat: {}", ml_dec.state); - //println!("ml bits: {}", ml_num_bits); - //println!("ml Code: {}", ml_value); - //println!(""); - if of_code > MAX_OFFSET_CODE { return Err(DecodeSequenceError::UnsupportedOffset { offset_code: of_code, @@ -121,19 +125,18 @@ fn decode_sequences_with_rle( }); if target.len() < section.num_sequences as usize { - //println!( - // "Bits left: {} ({} bytes)", - // br.bits_remaining(), - // br.bits_remaining() / 8, - //); + // One refill for all non-RLE state updates (interleaved fast path). + if max_update_bits > 0 { + br.ensure_bits(max_update_bits); + } if scratch.ll_rle.is_none() { - ll_dec.update_state(br); + ll_dec.update_state_fast(br); } if scratch.ml_rle.is_none() { - ml_dec.update_state(br); + ml_dec.update_state_fast(br); } if scratch.of_rle.is_none() { - of_dec.update_state(br); + of_dec.update_state_fast(br); } } @@ -168,6 +171,15 @@ fn decode_sequences_without_rle( target.clear(); target.reserve(section.num_sequences as usize); + // Maximum bits consumed by the three state updates combined. + // LL and ML accuracy logs are at most 9, OF at most 8, so the ceiling is 26. + // A single ensure_bits call (which guarantees ≥56 bits after refill) replaces + // three individual per-update refill checks — the core of the dual-state + // interleaving optimisation from the C reference. + let max_update_bits = scratch.literal_lengths.accuracy_log + + scratch.match_lengths.accuracy_log + + scratch.offsets.accuracy_log; + for _seq_idx in 0..section.num_sequences { let ll_code = ll_dec.decode_symbol(); let ml_code = ml_dec.decode_symbol(); @@ -196,14 +208,11 @@ fn decode_sequences_without_rle( }); if target.len() < section.num_sequences as usize { - //println!( - // "Bits left: {} ({} bytes)", - // br.bits_remaining(), - // br.bits_remaining() / 8, - //); - ll_dec.update_state(br); - ml_dec.update_state(br); - of_dec.update_state(br); + // One refill for all three state updates (interleaved fast path). + br.ensure_bits(max_update_bits); + ll_dec.update_state_fast(br); + ml_dec.update_state_fast(br); + of_dec.update_state_fast(br); } if br.bits_remaining() < 0 { diff --git a/zstd/src/fse/fse_decoder.rs b/zstd/src/fse/fse_decoder.rs index 8d05e142..d39810d2 100644 --- a/zstd/src/fse/fse_decoder.rs +++ b/zstd/src/fse/fse_decoder.rs @@ -49,6 +49,22 @@ impl<'t> FSEDecoder<'t> { //println!("Update: {}, {} -> {}", base_line, add, self.state); } + + /// Advance the internal state **without** an individual refill check. + /// + /// The caller **must** guarantee that enough bits are available in the bit + /// reader (e.g. via [`BitReaderReversed::ensure_bits`] with a budget that + /// covers this and any other unchecked reads in the same batch). + /// + /// This is the "fast path" used in the interleaved sequence decode loop + /// where a single refill check covers all three FSE state updates. + #[inline(always)] + pub fn update_state_fast(&mut self, bits: &mut BitReaderReversed<'_>) { + let num_bits = self.state.num_bits; + let add = bits.get_bits_unchecked(num_bits); + let new_state = self.state.base_line + add as u32; + self.state = self.table.decode[new_state as usize]; + } } /// FSE decoding involves a decoding table that describes the probabilities of From 8225bc325e093418cd3d4e86389df34325930533 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 3 Apr 2026 10:55:37 +0300 Subject: [PATCH 2/5] style: fix rustfmt import ordering in bench support --- zstd/benches/support/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zstd/benches/support/mod.rs b/zstd/benches/support/mod.rs index 79466adf..b5ec26f1 100644 --- a/zstd/benches/support/mod.rs +++ b/zstd/benches/support/mod.rs @@ -1,6 +1,6 @@ // rand 0.10: SmallRng is available with default features (no `small_rng` flag needed). // Use Rng::fill() instead of RngCore::fill_bytes(); RngCore removed from rand's public root in 0.10. -use rand::{rngs::SmallRng, RngExt, SeedableRng}; +use rand::{RngExt, SeedableRng, rngs::SmallRng}; use std::{collections::HashSet, env, fs, path::Path}; use structured_zstd::encoding::CompressionLevel; From ea50a25f98e441926092da64ce521f8e47e0d856 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 3 Apr 2026 10:59:38 +0300 Subject: [PATCH 3/5] test(decoding): add debug assertions and fast-path bit reader test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add debug_assert!(n <= 56) to get_bits_unchecked - Add debug_assert!(max_update_bits <= 56) in both sequence decode loops - Add ensure_and_unchecked_match_get_bits test covering fast-path equivalence with get_bits across refill boundaries and n=0 edge case - Update bench rand 0.10 doc: Rng::fill() → RngExt::fill() --- zstd/benches/support/mod.rs | 2 +- zstd/src/bit_io/bit_reader_reverse.rs | 50 +++++++++++++++++++ zstd/src/decoding/sequence_section_decoder.rs | 8 +++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/zstd/benches/support/mod.rs b/zstd/benches/support/mod.rs index b5ec26f1..6305b6ab 100644 --- a/zstd/benches/support/mod.rs +++ b/zstd/benches/support/mod.rs @@ -1,5 +1,5 @@ // rand 0.10: SmallRng is available with default features (no `small_rng` flag needed). -// Use Rng::fill() instead of RngCore::fill_bytes(); RngCore removed from rand's public root in 0.10. +// Use RngExt::fill() instead of RngCore::fill_bytes(); RngCore removed from rand's public root in 0.10. use rand::{RngExt, SeedableRng, rngs::SmallRng}; use std::{collections::HashSet, env, fs, path::Path}; use structured_zstd::encoding::CompressionLevel; diff --git a/zstd/src/bit_io/bit_reader_reverse.rs b/zstd/src/bit_io/bit_reader_reverse.rs index 677338cb..b5f71361 100644 --- a/zstd/src/bit_io/bit_reader_reverse.rs +++ b/zstd/src/bit_io/bit_reader_reverse.rs @@ -117,6 +117,7 @@ impl<'s> BitReaderReversed<'s> { /// a prior [`ensure_bits`](Self::ensure_bits) call). #[inline(always)] pub fn get_bits_unchecked(&mut self, n: u8) -> u64 { + debug_assert!(n <= 56); let value = self.peek_bits(n); self.consume(n); value @@ -204,4 +205,53 @@ mod test { assert_eq!(br.get_bits(4), 0b0000); assert_eq!(br.bits_remaining(), -7); } + + /// Verify that `ensure_bits(n)` + `get_bits_unchecked(..)` returns the same + /// values as plain `get_bits(..)`, including across refill boundaries and + /// for edge cases like n=0. + #[test] + fn ensure_and_unchecked_match_get_bits() { + // 10 bytes = 80 bits — enough to force multiple refills + let data: [u8; 10] = [0xDE, 0xAD, 0xBE, 0xEF, 0x42, 0x13, 0x37, 0xCA, 0xFE, 0x01]; + + // Reference: read with get_bits + let mut ref_br = super::BitReaderReversed::new(&data); + let r1 = ref_br.get_bits(0); + let r2 = ref_br.get_bits(7); + let r3 = ref_br.get_bits(13); + let r4 = ref_br.get_bits(9); + let r5 = ref_br.get_bits(8); + // After 37 bits consumed, force a batched read of 26 (simulates 3 FSE updates) + let r6 = ref_br.get_bits(9); + let r7 = ref_br.get_bits(9); + let r8 = ref_br.get_bits(8); + + // Unchecked path: same reads via ensure_bits + get_bits_unchecked + let mut fast_br = super::BitReaderReversed::new(&data); + + // n=0 edge case + fast_br.ensure_bits(0); + assert_eq!(fast_br.get_bits_unchecked(0), r1); + + // Single reads + fast_br.ensure_bits(7); + assert_eq!(fast_br.get_bits_unchecked(7), r2); + + fast_br.ensure_bits(13); + assert_eq!(fast_br.get_bits_unchecked(13), r3); + + fast_br.ensure_bits(9); + assert_eq!(fast_br.get_bits_unchecked(9), r4); + + fast_br.ensure_bits(8); + assert_eq!(fast_br.get_bits_unchecked(8), r5); + + // Batched: one ensure covering 9+9+8 = 26 bits + fast_br.ensure_bits(26); + assert_eq!(fast_br.get_bits_unchecked(9), r6); + assert_eq!(fast_br.get_bits_unchecked(9), r7); + assert_eq!(fast_br.get_bits_unchecked(8), r8); + + assert_eq!(ref_br.bits_remaining(), fast_br.bits_remaining()); + } } diff --git a/zstd/src/decoding/sequence_section_decoder.rs b/zstd/src/decoding/sequence_section_decoder.rs index 8a519dff..9c84cb1f 100644 --- a/zstd/src/decoding/sequence_section_decoder.rs +++ b/zstd/src/decoding/sequence_section_decoder.rs @@ -83,6 +83,10 @@ fn decode_sequences_with_rle( } else { 0 }; + debug_assert!( + max_update_bits <= 56, + "sequence section update bits exceed 56-bit budget" + ); for _seq_idx in 0..section.num_sequences { //get the codes from either the RLE byte or from the decoder @@ -179,6 +183,10 @@ fn decode_sequences_without_rle( let max_update_bits = scratch.literal_lengths.accuracy_log + scratch.match_lengths.accuracy_log + scratch.offsets.accuracy_log; + debug_assert!( + max_update_bits <= 56, + "sequence section update bits exceed 56-bit budget" + ); for _seq_idx in 0..section.num_sequences { let ll_code = ll_dec.decode_symbol(); From 8826cdf52c5a14578f6ce1f2ee19ca6406e484b4 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 3 Apr 2026 11:16:28 +0300 Subject: [PATCH 4/5] docs(decoding): clarify batched refill comments in sequence decoder Remove misleading "dual-state interleaving" wording from comments. The optimization is a batched refill check covering three single-state FSE decoders, not a dual-state interleaving pattern. --- zstd/src/decoding/sequence_section_decoder.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/zstd/src/decoding/sequence_section_decoder.rs b/zstd/src/decoding/sequence_section_decoder.rs index 9c84cb1f..74a3baf7 100644 --- a/zstd/src/decoding/sequence_section_decoder.rs +++ b/zstd/src/decoding/sequence_section_decoder.rs @@ -129,7 +129,7 @@ fn decode_sequences_with_rle( }); if target.len() < section.num_sequences as usize { - // One refill for all non-RLE state updates (interleaved fast path). + // One refill check for all non-RLE state updates (batched fast path). if max_update_bits > 0 { br.ensure_bits(max_update_bits); } @@ -178,8 +178,8 @@ fn decode_sequences_without_rle( // Maximum bits consumed by the three state updates combined. // LL and ML accuracy logs are at most 9, OF at most 8, so the ceiling is 26. // A single ensure_bits call (which guarantees ≥56 bits after refill) replaces - // three individual per-update refill checks — the core of the dual-state - // interleaving optimisation from the C reference. + // three individual per-update refill checks, eliminating two branches per + // iteration on the hot decode path. let max_update_bits = scratch.literal_lengths.accuracy_log + scratch.match_lengths.accuracy_log + scratch.offsets.accuracy_log; @@ -216,7 +216,7 @@ fn decode_sequences_without_rle( }); if target.len() < section.num_sequences as usize { - // One refill for all three state updates (interleaved fast path). + // One refill check for all three state updates (batched fast path). br.ensure_bits(max_update_bits); ll_dec.update_state_fast(br); ml_dec.update_state_fast(br); From 61933850c6ab34ab37307200386c5891b6de5b85 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Fri, 3 Apr 2026 11:55:02 +0300 Subject: [PATCH 5/5] test(decoding): strengthen get_bits_unchecked precondition and test coverage - Add debug_assert!(bits_consumed + n <= 64) to get_bits_unchecked to catch caller violations in debug builds - Force real refill boundary in test: consume 39 bits before batched ensure_bits(26), triggering actual refill (39+26=65 > 64) --- zstd/src/bit_io/bit_reader_reverse.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/zstd/src/bit_io/bit_reader_reverse.rs b/zstd/src/bit_io/bit_reader_reverse.rs index b5f71361..99f5a1df 100644 --- a/zstd/src/bit_io/bit_reader_reverse.rs +++ b/zstd/src/bit_io/bit_reader_reverse.rs @@ -118,6 +118,12 @@ impl<'s> BitReaderReversed<'s> { #[inline(always)] pub fn get_bits_unchecked(&mut self, n: u8) -> u64 { debug_assert!(n <= 56); + debug_assert!( + self.bits_consumed + n <= 64, + "get_bits_unchecked: not enough bits (consumed={}, requested={})", + self.bits_consumed, + n + ); let value = self.peek_bits(n); self.consume(n); value @@ -221,7 +227,9 @@ mod test { let r3 = ref_br.get_bits(13); let r4 = ref_br.get_bits(9); let r5 = ref_br.get_bits(8); - // After 37 bits consumed, force a batched read of 26 (simulates 3 FSE updates) + let r5b = ref_br.get_bits(2); + // After 39 bits consumed, ensure_bits(26) triggers a real refill + // because 39 + 26 = 65 > 64. let r6 = ref_br.get_bits(9); let r7 = ref_br.get_bits(9); let r8 = ref_br.get_bits(8); @@ -246,7 +254,11 @@ mod test { fast_br.ensure_bits(8); assert_eq!(fast_br.get_bits_unchecked(8), r5); - // Batched: one ensure covering 9+9+8 = 26 bits + fast_br.ensure_bits(2); + assert_eq!(fast_br.get_bits_unchecked(2), r5b); + + // Batched: one ensure covering 9+9+8 = 26 bits. + // At 39 bits consumed, this forces a real refill (39+26=65 > 64). fast_br.ensure_bits(26); assert_eq!(fast_br.get_bits_unchecked(9), r6); assert_eq!(fast_br.get_bits_unchecked(9), r7);