From dea98dcc6331055425ea8c181488403e82140de1 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 11 Apr 2026 13:03:19 +0300 Subject: [PATCH 1/9] perf(encoding): gate hash mix with arm crc intrinsics - add aarch64 crc-enabled hash mixing path with runtime/static feature detection - keep multiplicative fallback for non-crc and non-arm targets - apply shared hash mix in dfast and row hash indexing - align row hash test with shared hash-mix contract --- zstd/src/encoding/match_generator.rs | 54 +++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index e1db399b..5daa6677 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -8,7 +8,9 @@ use alloc::collections::VecDeque; use alloc::vec::Vec; #[cfg(all(target_arch = "aarch64", target_endian = "little"))] -use core::arch::aarch64::{uint8x16_t, vceqq_u8, vgetq_lane_u64, vld1q_u8, vreinterpretq_u64_u8}; +use core::arch::aarch64::{ + __crc32d, uint8x16_t, vceqq_u8, vgetq_lane_u64, vld1q_u8, vreinterpretq_u64_u8, +}; #[cfg(target_arch = "x86")] use core::arch::x86::{ __m128i, __m256i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm256_cmpeq_epi8, @@ -59,6 +61,7 @@ const ROW_TARGET_LEN: usize = 48; const ROW_TAG_BITS: usize = 8; const ROW_EMPTY_SLOT: usize = usize::MAX; const ROW_HASH_KEY_LEN: usize = 4; +const HASH_MIX_PRIME: u64 = 0x9E37_79B1_85EB_CA87; const HC_HASH_LOG: usize = 20; const HC_CHAIN_LOG: usize = 19; @@ -73,6 +76,44 @@ const HC_EMPTY: u32 = 0; // fixed-length candidate array returned by chain_candidates(). const MAX_HC_SEARCH_DEPTH: usize = 32; +#[inline(always)] +fn hash_mix_u64(value: u64) -> u64 { + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + { + if crc_hash_available() { + // SAFETY: guarded by runtime/static `crc` feature detection. + return unsafe { hash_mix_u64_crc(value) }; + } + } + + value.wrapping_mul(HASH_MIX_PRIME) +} + +#[cfg(all(target_arch = "aarch64", target_endian = "little"))] +#[inline(always)] +fn crc_hash_available() -> bool { + #[cfg(feature = "std")] + { + static HAS_CRC: OnceLock = OnceLock::new(); + return *HAS_CRC.get_or_init(|| is_aarch64_feature_detected!("crc")); + } + + #[cfg(not(feature = "std"))] + { + cfg!(target_feature = "crc") + } +} + +#[cfg(all(target_arch = "aarch64", target_endian = "little"))] +#[target_feature(enable = "crc")] +unsafe fn hash_mix_u64_crc(value: u64) -> u64 { + // Feed the full 64-bit lane through ARM CRC32 and then mix back with a + // rotated copy of the source to keep dispersion in the upper bits used by + // hash table indexing. + let crc = __crc32d(0, value) as u64; + ((crc << 32) ^ value.rotate_left(17)).wrapping_mul(HASH_MIX_PRIME) +} + #[derive(Copy, Clone, Debug, Eq, PartialEq)] enum PrefixKernel { Scalar, @@ -2191,8 +2232,7 @@ impl DfastMatchGenerator { } fn hash_index(&self, value: u64) -> usize { - const PRIME: u64 = 0x9E37_79B1_85EB_CA87; - ((value.wrapping_mul(PRIME)) >> (64 - self.hash_bits)) as usize + (hash_mix_u64(value) >> (64 - self.hash_bits)) as usize } } @@ -2421,8 +2461,7 @@ impl RowMatchGenerator { } let value = u32::from_le_bytes(concat[idx..idx + ROW_HASH_KEY_LEN].try_into().unwrap()) as u64; - const PRIME: u64 = 0x9E37_79B1_85EB_CA87; - let hash = value.wrapping_mul(PRIME); + let hash = hash_mix_u64(value); let total_bits = self.row_hash_log + ROW_TAG_BITS; let combined = hash >> (u64::BITS as usize - total_bits); let row_mask = (1usize << self.row_hash_log) - 1; @@ -4170,7 +4209,7 @@ fn row_pick_lazy_depth2_keeps_best_when_next2_is_only_one_byte_better() { assert_eq!(chosen.match_len, best.match_len); } -/// Verifies row/tag extraction uses the high bits of the multiplicative hash. +/// Verifies row/tag extraction uses the shared hash mix bit-splitting contract. #[test] fn row_hash_and_row_extracts_high_bits() { let mut matcher = RowMatchGenerator::new(1 << 22); @@ -4192,8 +4231,7 @@ fn row_hash_and_row_extracts_high_bits() { let idx = pos - matcher.history_abs_start; let concat = matcher.live_history(); let value = u32::from_le_bytes(concat[idx..idx + ROW_HASH_KEY_LEN].try_into().unwrap()) as u64; - const PRIME: u64 = 0x9E37_79B1_85EB_CA87; - let hash = value.wrapping_mul(PRIME); + let hash = hash_mix_u64(value); let total_bits = matcher.row_hash_log + ROW_TAG_BITS; let combined = hash >> (u64::BITS as usize - total_bits); let expected_row = From 0e75cb06818f91f9e0917cd2e9bcb9755beb640c Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 11 Apr 2026 13:07:19 +0300 Subject: [PATCH 2/9] perf(encoding): add x86_64 sse4.2 crc hash path - extend hash mix with runtime-gated SSE4.2 CRC32 on x86_64 - keep scalar fallback and existing aarch64 CRC path - add CPU-gated determinism tests for crc hash paths --- zstd/src/encoding/match_generator.rs | 60 +++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index 5daa6677..9af00f14 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -18,8 +18,8 @@ use core::arch::x86::{ }; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::{ - __m128i, __m256i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm256_cmpeq_epi8, - _mm256_loadu_si256, _mm256_movemask_epi8, + __m128i, __m256i, _mm_cmpeq_epi8, _mm_crc32_u64, _mm_loadu_si128, _mm_movemask_epi8, + _mm256_cmpeq_epi8, _mm256_loadu_si256, _mm256_movemask_epi8, }; use core::convert::TryInto; use core::num::NonZeroUsize; @@ -78,6 +78,14 @@ const MAX_HC_SEARCH_DEPTH: usize = 32; #[inline(always)] fn hash_mix_u64(value: u64) -> u64 { + #[cfg(target_arch = "x86_64")] + { + if sse42_crc_hash_available() { + // SAFETY: guarded by runtime/static `sse4.2` feature detection. + return unsafe { hash_mix_u64_sse42(value) }; + } + } + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { if crc_hash_available() { @@ -89,6 +97,28 @@ fn hash_mix_u64(value: u64) -> u64 { value.wrapping_mul(HASH_MIX_PRIME) } +#[cfg(target_arch = "x86_64")] +#[inline(always)] +fn sse42_crc_hash_available() -> bool { + #[cfg(feature = "std")] + { + static HAS_SSE42: OnceLock = OnceLock::new(); + return *HAS_SSE42.get_or_init(|| is_x86_feature_detected!("sse4.2")); + } + + #[cfg(not(feature = "std"))] + { + cfg!(target_feature = "sse4.2") + } +} + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "sse4.2")] +unsafe fn hash_mix_u64_sse42(value: u64) -> u64 { + let crc = _mm_crc32_u64(0, value); + ((crc as u64) << 32 ^ value.rotate_left(13)).wrapping_mul(HASH_MIX_PRIME) +} + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] #[inline(always)] fn crc_hash_available() -> bool { @@ -4266,6 +4296,32 @@ fn row_repcode_returns_none_when_position_too_close_to_history_end() { assert!(matcher.repcode_candidate(4, 1).is_none()); } +#[cfg(all(feature = "std", target_arch = "x86_64"))] +#[test] +fn hash_mix_sse42_path_is_available_and_deterministic_when_supported() { + if !is_x86_feature_detected!("sse4.2") { + return; + } + + let v = 0x0123_4567_89AB_CDEFu64; + let a = hash_mix_u64(v); + let b = hash_mix_u64(v); + assert_eq!(a, b); +} + +#[cfg(all(feature = "std", target_arch = "aarch64", target_endian = "little"))] +#[test] +fn hash_mix_crc_path_is_available_and_deterministic_when_supported() { + if !is_aarch64_feature_detected!("crc") { + return; + } + + let v = 0x0123_4567_89AB_CDEFu64; + let a = hash_mix_u64(v); + let b = hash_mix_u64(v); + assert_eq!(a, b); +} + #[test] fn row_candidate_returns_none_when_abs_pos_near_end_of_history() { let mut matcher = RowMatchGenerator::new(1 << 22); From 66b90ab9d66181bf9b69bd5d013d3634c06b42cf Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 11 Apr 2026 13:13:18 +0300 Subject: [PATCH 3/9] fix(lint): resolve clippy warnings in crc hash gating - remove needless returns in feature-detection helpers - drop redundant u64 cast in sse4.2 crc mix path --- zstd/src/encoding/match_generator.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index 9af00f14..7a9bb085 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -103,7 +103,7 @@ fn sse42_crc_hash_available() -> bool { #[cfg(feature = "std")] { static HAS_SSE42: OnceLock = OnceLock::new(); - return *HAS_SSE42.get_or_init(|| is_x86_feature_detected!("sse4.2")); + *HAS_SSE42.get_or_init(|| is_x86_feature_detected!("sse4.2")) } #[cfg(not(feature = "std"))] @@ -116,7 +116,7 @@ fn sse42_crc_hash_available() -> bool { #[target_feature(enable = "sse4.2")] unsafe fn hash_mix_u64_sse42(value: u64) -> u64 { let crc = _mm_crc32_u64(0, value); - ((crc as u64) << 32 ^ value.rotate_left(13)).wrapping_mul(HASH_MIX_PRIME) + ((crc << 32) ^ value.rotate_left(13)).wrapping_mul(HASH_MIX_PRIME) } #[cfg(all(target_arch = "aarch64", target_endian = "little"))] @@ -125,7 +125,7 @@ fn crc_hash_available() -> bool { #[cfg(feature = "std")] { static HAS_CRC: OnceLock = OnceLock::new(); - return *HAS_CRC.get_or_init(|| is_aarch64_feature_detected!("crc")); + *HAS_CRC.get_or_init(|| is_aarch64_feature_detected!("crc")) } #[cfg(not(feature = "std"))] From 551776154b36aa58813e7d56fc76671f505b56b8 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 11 Apr 2026 13:20:11 +0300 Subject: [PATCH 4/9] test(encoding): validate crc-gated hash paths hit accelerated impl --- zstd/src/encoding/match_generator.rs | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index 7a9bb085..0cb2041c 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -102,8 +102,7 @@ fn hash_mix_u64(value: u64) -> u64 { fn sse42_crc_hash_available() -> bool { #[cfg(feature = "std")] { - static HAS_SSE42: OnceLock = OnceLock::new(); - *HAS_SSE42.get_or_init(|| is_x86_feature_detected!("sse4.2")) + is_x86_feature_detected!("sse4.2") } #[cfg(not(feature = "std"))] @@ -124,8 +123,7 @@ unsafe fn hash_mix_u64_sse42(value: u64) -> u64 { fn crc_hash_available() -> bool { #[cfg(feature = "std")] { - static HAS_CRC: OnceLock = OnceLock::new(); - *HAS_CRC.get_or_init(|| is_aarch64_feature_detected!("crc")) + is_aarch64_feature_detected!("crc") } #[cfg(not(feature = "std"))] @@ -4298,28 +4296,26 @@ fn row_repcode_returns_none_when_position_too_close_to_history_end() { #[cfg(all(feature = "std", target_arch = "x86_64"))] #[test] -fn hash_mix_sse42_path_is_available_and_deterministic_when_supported() { +fn hash_mix_sse42_path_is_available_and_matches_accelerated_impl_when_supported() { if !is_x86_feature_detected!("sse4.2") { return; } let v = 0x0123_4567_89AB_CDEFu64; - let a = hash_mix_u64(v); - let b = hash_mix_u64(v); - assert_eq!(a, b); + let accelerated = unsafe { hash_mix_u64_sse42(v) }; + assert_eq!(hash_mix_u64(v), accelerated); } #[cfg(all(feature = "std", target_arch = "aarch64", target_endian = "little"))] #[test] -fn hash_mix_crc_path_is_available_and_deterministic_when_supported() { +fn hash_mix_crc_path_is_available_and_matches_accelerated_impl_when_supported() { if !is_aarch64_feature_detected!("crc") { return; } let v = 0x0123_4567_89AB_CDEFu64; - let a = hash_mix_u64(v); - let b = hash_mix_u64(v); - assert_eq!(a, b); + let accelerated = unsafe { hash_mix_u64_crc(v) }; + assert_eq!(hash_mix_u64(v), accelerated); } #[test] From b0f4bb8ee18cb2047a0dad696a273c8362f4ec80 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 11 Apr 2026 13:36:27 +0300 Subject: [PATCH 5/9] perf(encoding): cache hash-mix kernel selection in matcher hot path --- zstd/src/encoding/match_generator.rs | 138 ++++++++++++++++++++------- 1 file changed, 106 insertions(+), 32 deletions(-) diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index 0cb2041c..311f0feb 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -23,6 +23,8 @@ use core::arch::x86_64::{ }; use core::convert::TryInto; use core::num::NonZeroUsize; +#[cfg(feature = "std")] +use core::sync::atomic::{AtomicU8, Ordering}; use super::BETTER_WINDOW_LOG; use super::CompressionLevel; @@ -76,60 +78,106 @@ const HC_EMPTY: u32 = 0; // fixed-length candidate array returned by chain_candidates(). const MAX_HC_SEARCH_DEPTH: usize = 32; +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[repr(u8)] +enum HashMixKernel { + Scalar = 0, + #[cfg(target_arch = "x86_64")] + X86Sse42 = 1, + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + Aarch64Crc = 2, +} + +#[cfg(feature = "std")] +const HASH_MIX_KERNEL_UNINIT: u8 = u8::MAX; + +#[cfg(feature = "std")] +static HASH_MIX_KERNEL: AtomicU8 = AtomicU8::new(HASH_MIX_KERNEL_UNINIT); + #[inline(always)] fn hash_mix_u64(value: u64) -> u64 { - #[cfg(target_arch = "x86_64")] - { - if sse42_crc_hash_available() { - // SAFETY: guarded by runtime/static `sse4.2` feature detection. - return unsafe { hash_mix_u64_sse42(value) }; + match selected_hash_mix_kernel() { + HashMixKernel::Scalar => value.wrapping_mul(HASH_MIX_PRIME), + #[cfg(target_arch = "x86_64")] + HashMixKernel::X86Sse42 => { + // SAFETY: runtime/static detection selected this kernel. + unsafe { hash_mix_u64_sse42(value) } } - } - - #[cfg(all(target_arch = "aarch64", target_endian = "little"))] - { - if crc_hash_available() { - // SAFETY: guarded by runtime/static `crc` feature detection. - return unsafe { hash_mix_u64_crc(value) }; + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + HashMixKernel::Aarch64Crc => { + // SAFETY: runtime/static detection selected this kernel. + unsafe { hash_mix_u64_crc(value) } } } - - value.wrapping_mul(HASH_MIX_PRIME) } -#[cfg(target_arch = "x86_64")] #[inline(always)] -fn sse42_crc_hash_available() -> bool { +fn selected_hash_mix_kernel() -> HashMixKernel { #[cfg(feature = "std")] { - is_x86_feature_detected!("sse4.2") + let cached = HASH_MIX_KERNEL.load(Ordering::Relaxed); + if cached != HASH_MIX_KERNEL_UNINIT { + return hash_mix_kernel_from_u8(cached); + } + + let detected = detect_hash_mix_kernel(); + HASH_MIX_KERNEL.store(detected as u8, Ordering::Relaxed); + detected } #[cfg(not(feature = "std"))] { - cfg!(target_feature = "sse4.2") + detect_hash_mix_kernel() } } -#[cfg(target_arch = "x86_64")] -#[target_feature(enable = "sse4.2")] -unsafe fn hash_mix_u64_sse42(value: u64) -> u64 { - let crc = _mm_crc32_u64(0, value); - ((crc << 32) ^ value.rotate_left(13)).wrapping_mul(HASH_MIX_PRIME) +#[inline(always)] +fn detect_hash_mix_kernel() -> HashMixKernel { + #[cfg(all(feature = "std", target_arch = "x86_64"))] + if is_x86_feature_detected!("sse4.2") { + return HashMixKernel::X86Sse42; + } + + #[cfg(all(feature = "std", target_arch = "aarch64", target_endian = "little"))] + if is_aarch64_feature_detected!("crc") { + return HashMixKernel::Aarch64Crc; + } + + #[cfg(all(not(feature = "std"), target_arch = "x86_64"))] + if cfg!(target_feature = "sse4.2") { + return HashMixKernel::X86Sse42; + } + + #[cfg(all( + not(feature = "std"), + target_arch = "aarch64", + target_endian = "little" + ))] + if cfg!(target_feature = "crc") { + return HashMixKernel::Aarch64Crc; + } + + HashMixKernel::Scalar } -#[cfg(all(target_arch = "aarch64", target_endian = "little"))] +#[cfg(feature = "std")] #[inline(always)] -fn crc_hash_available() -> bool { - #[cfg(feature = "std")] - { - is_aarch64_feature_detected!("crc") +fn hash_mix_kernel_from_u8(raw: u8) -> HashMixKernel { + match raw { + x if x == HashMixKernel::Scalar as u8 => HashMixKernel::Scalar, + #[cfg(target_arch = "x86_64")] + x if x == HashMixKernel::X86Sse42 as u8 => HashMixKernel::X86Sse42, + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + x if x == HashMixKernel::Aarch64Crc as u8 => HashMixKernel::Aarch64Crc, + _ => HashMixKernel::Scalar, } +} - #[cfg(not(feature = "std"))] - { - cfg!(target_feature = "crc") - } +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "sse4.2")] +unsafe fn hash_mix_u64_sse42(value: u64) -> u64 { + let crc = _mm_crc32_u64(0, value); + ((crc << 32) ^ value.rotate_left(13)).wrapping_mul(HASH_MIX_PRIME) } #[cfg(all(target_arch = "aarch64", target_endian = "little"))] @@ -142,6 +190,14 @@ unsafe fn hash_mix_u64_crc(value: u64) -> u64 { ((crc << 32) ^ value.rotate_left(17)).wrapping_mul(HASH_MIX_PRIME) } +#[cfg(all(test, feature = "std"))] +fn with_forced_hash_mix_kernel(kernel: HashMixKernel, f: impl FnOnce() -> T) -> T { + let prev = HASH_MIX_KERNEL.swap(kernel as u8, Ordering::Relaxed); + let out = f(); + HASH_MIX_KERNEL.store(prev, Ordering::Relaxed); + out +} + #[derive(Copy, Clone, Debug, Eq, PartialEq)] enum PrefixKernel { Scalar, @@ -4306,6 +4362,15 @@ fn hash_mix_sse42_path_is_available_and_matches_accelerated_impl_when_supported( assert_eq!(hash_mix_u64(v), accelerated); } +#[cfg(all(feature = "std", target_arch = "x86_64"))] +#[test] +fn hash_mix_scalar_path_can_be_forced_for_coverage_and_matches_formula() { + let v = 0x0123_4567_89AB_CDEFu64; + let expected = v.wrapping_mul(HASH_MIX_PRIME); + let mixed = with_forced_hash_mix_kernel(HashMixKernel::Scalar, || hash_mix_u64(v)); + assert_eq!(mixed, expected); +} + #[cfg(all(feature = "std", target_arch = "aarch64", target_endian = "little"))] #[test] fn hash_mix_crc_path_is_available_and_matches_accelerated_impl_when_supported() { @@ -4318,6 +4383,15 @@ fn hash_mix_crc_path_is_available_and_matches_accelerated_impl_when_supported() assert_eq!(hash_mix_u64(v), accelerated); } +#[cfg(all(feature = "std", target_arch = "aarch64", target_endian = "little"))] +#[test] +fn hash_mix_scalar_path_can_be_forced_on_aarch64_and_matches_formula() { + let v = 0x0123_4567_89AB_CDEFu64; + let expected = v.wrapping_mul(HASH_MIX_PRIME); + let mixed = with_forced_hash_mix_kernel(HashMixKernel::Scalar, || hash_mix_u64(v)); + assert_eq!(mixed, expected); +} + #[test] fn row_candidate_returns_none_when_abs_pos_near_end_of_history() { let mut matcher = RowMatchGenerator::new(1 << 22); From c4ea535616779a9e9086402408421151891667e3 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 11 Apr 2026 14:04:50 +0300 Subject: [PATCH 6/9] test(encoding): serialize and panic-proof hash kernel override --- zstd/src/encoding/match_generator.rs | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index 311f0feb..1cd769be 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -190,12 +190,25 @@ unsafe fn hash_mix_u64_crc(value: u64) -> u64 { ((crc << 32) ^ value.rotate_left(17)).wrapping_mul(HASH_MIX_PRIME) } +#[cfg(all(test, feature = "std"))] +static HASH_MIX_KERNEL_TEST_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + #[cfg(all(test, feature = "std"))] fn with_forced_hash_mix_kernel(kernel: HashMixKernel, f: impl FnOnce() -> T) -> T { + let _lock = HASH_MIX_KERNEL_TEST_LOCK + .lock() + .expect("hash mix test lock poisoned"); + + struct RestoreHashMixKernel(u8); + impl Drop for RestoreHashMixKernel { + fn drop(&mut self) { + HASH_MIX_KERNEL.store(self.0, Ordering::Relaxed); + } + } + let prev = HASH_MIX_KERNEL.swap(kernel as u8, Ordering::Relaxed); - let out = f(); - HASH_MIX_KERNEL.store(prev, Ordering::Relaxed); - out + let _restore = RestoreHashMixKernel(prev); + f() } #[derive(Copy, Clone, Debug, Eq, PartialEq)] @@ -4357,6 +4370,9 @@ fn hash_mix_sse42_path_is_available_and_matches_accelerated_impl_when_supported( return; } + let _lock = HASH_MIX_KERNEL_TEST_LOCK + .lock() + .expect("hash mix test lock poisoned"); let v = 0x0123_4567_89AB_CDEFu64; let accelerated = unsafe { hash_mix_u64_sse42(v) }; assert_eq!(hash_mix_u64(v), accelerated); @@ -4378,6 +4394,9 @@ fn hash_mix_crc_path_is_available_and_matches_accelerated_impl_when_supported() return; } + let _lock = HASH_MIX_KERNEL_TEST_LOCK + .lock() + .expect("hash mix test lock poisoned"); let v = 0x0123_4567_89AB_CDEFu64; let accelerated = unsafe { hash_mix_u64_crc(v) }; assert_eq!(hash_mix_u64(v), accelerated); From 904d2b0a72c7a9229c212134028292acfe14173b Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 11 Apr 2026 14:30:46 +0300 Subject: [PATCH 7/9] test(encoding): lock row hash extraction test against kernel override --- zstd/src/encoding/match_generator.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index 1cd769be..a094c926 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -4309,6 +4309,11 @@ fn row_pick_lazy_depth2_keeps_best_when_next2_is_only_one_byte_better() { /// Verifies row/tag extraction uses the shared hash mix bit-splitting contract. #[test] fn row_hash_and_row_extracts_high_bits() { + #[cfg(feature = "std")] + let _lock = HASH_MIX_KERNEL_TEST_LOCK + .lock() + .expect("hash mix test lock poisoned"); + let mut matcher = RowMatchGenerator::new(1 << 22); matcher.configure(ROW_CONFIG); matcher.add_data( From 46927db310ffd2ee640010edf45d3a61e1972379 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 11 Apr 2026 14:53:58 +0300 Subject: [PATCH 8/9] perf(encoding): move hash kernel dispatch into matcher state --- zstd/src/encoding/match_generator.rs | 101 +++++---------------------- 1 file changed, 19 insertions(+), 82 deletions(-) diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index a094c926..02452538 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -23,8 +23,6 @@ use core::arch::x86_64::{ }; use core::convert::TryInto; use core::num::NonZeroUsize; -#[cfg(feature = "std")] -use core::sync::atomic::{AtomicU8, Ordering}; use super::BETTER_WINDOW_LOG; use super::CompressionLevel; @@ -88,15 +86,9 @@ enum HashMixKernel { Aarch64Crc = 2, } -#[cfg(feature = "std")] -const HASH_MIX_KERNEL_UNINIT: u8 = u8::MAX; - -#[cfg(feature = "std")] -static HASH_MIX_KERNEL: AtomicU8 = AtomicU8::new(HASH_MIX_KERNEL_UNINIT); - #[inline(always)] -fn hash_mix_u64(value: u64) -> u64 { - match selected_hash_mix_kernel() { +fn hash_mix_u64_with_kernel(value: u64, kernel: HashMixKernel) -> u64 { + match kernel { HashMixKernel::Scalar => value.wrapping_mul(HASH_MIX_PRIME), #[cfg(target_arch = "x86_64")] HashMixKernel::X86Sse42 => { @@ -111,26 +103,6 @@ fn hash_mix_u64(value: u64) -> u64 { } } -#[inline(always)] -fn selected_hash_mix_kernel() -> HashMixKernel { - #[cfg(feature = "std")] - { - let cached = HASH_MIX_KERNEL.load(Ordering::Relaxed); - if cached != HASH_MIX_KERNEL_UNINIT { - return hash_mix_kernel_from_u8(cached); - } - - let detected = detect_hash_mix_kernel(); - HASH_MIX_KERNEL.store(detected as u8, Ordering::Relaxed); - detected - } - - #[cfg(not(feature = "std"))] - { - detect_hash_mix_kernel() - } -} - #[inline(always)] fn detect_hash_mix_kernel() -> HashMixKernel { #[cfg(all(feature = "std", target_arch = "x86_64"))] @@ -160,19 +132,6 @@ fn detect_hash_mix_kernel() -> HashMixKernel { HashMixKernel::Scalar } -#[cfg(feature = "std")] -#[inline(always)] -fn hash_mix_kernel_from_u8(raw: u8) -> HashMixKernel { - match raw { - x if x == HashMixKernel::Scalar as u8 => HashMixKernel::Scalar, - #[cfg(target_arch = "x86_64")] - x if x == HashMixKernel::X86Sse42 as u8 => HashMixKernel::X86Sse42, - #[cfg(all(target_arch = "aarch64", target_endian = "little"))] - x if x == HashMixKernel::Aarch64Crc as u8 => HashMixKernel::Aarch64Crc, - _ => HashMixKernel::Scalar, - } -} - #[cfg(target_arch = "x86_64")] #[target_feature(enable = "sse4.2")] unsafe fn hash_mix_u64_sse42(value: u64) -> u64 { @@ -190,27 +149,6 @@ unsafe fn hash_mix_u64_crc(value: u64) -> u64 { ((crc << 32) ^ value.rotate_left(17)).wrapping_mul(HASH_MIX_PRIME) } -#[cfg(all(test, feature = "std"))] -static HASH_MIX_KERNEL_TEST_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); - -#[cfg(all(test, feature = "std"))] -fn with_forced_hash_mix_kernel(kernel: HashMixKernel, f: impl FnOnce() -> T) -> T { - let _lock = HASH_MIX_KERNEL_TEST_LOCK - .lock() - .expect("hash mix test lock poisoned"); - - struct RestoreHashMixKernel(u8); - impl Drop for RestoreHashMixKernel { - fn drop(&mut self) { - HASH_MIX_KERNEL.store(self.0, Ordering::Relaxed); - } - } - - let prev = HASH_MIX_KERNEL.swap(kernel as u8, Ordering::Relaxed); - let _restore = RestoreHashMixKernel(prev); - f() -} - #[derive(Copy, Clone, Debug, Eq, PartialEq)] enum PrefixKernel { Scalar, @@ -1607,6 +1545,7 @@ struct DfastMatchGenerator { short_hash: Vec<[usize; DFAST_SEARCH_DEPTH]>, long_hash: Vec<[usize; DFAST_SEARCH_DEPTH]>, hash_bits: usize, + hash_mix_kernel: HashMixKernel, use_fast_loop: bool, // Lazy match lookahead depth (internal tuning parameter). lazy_depth: u8, @@ -1778,6 +1717,7 @@ impl DfastMatchGenerator { short_hash: Vec::new(), long_hash: Vec::new(), hash_bits: DFAST_HASH_BITS, + hash_mix_kernel: detect_hash_mix_kernel(), use_fast_loop: false, lazy_depth: 1, } @@ -2329,7 +2269,7 @@ impl DfastMatchGenerator { } fn hash_index(&self, value: u64) -> usize { - (hash_mix_u64(value) >> (64 - self.hash_bits)) as usize + (hash_mix_u64_with_kernel(value, self.hash_mix_kernel) >> (64 - self.hash_bits)) as usize } } @@ -2346,6 +2286,7 @@ struct RowMatchGenerator { search_depth: usize, target_len: usize, lazy_depth: u8, + hash_mix_kernel: HashMixKernel, row_heads: Vec, row_positions: Vec, row_tags: Vec, @@ -2366,6 +2307,7 @@ impl RowMatchGenerator { search_depth: ROW_SEARCH_DEPTH, target_len: ROW_TARGET_LEN, lazy_depth: 1, + hash_mix_kernel: detect_hash_mix_kernel(), row_heads: Vec::new(), row_positions: Vec::new(), row_tags: Vec::new(), @@ -2558,7 +2500,7 @@ impl RowMatchGenerator { } let value = u32::from_le_bytes(concat[idx..idx + ROW_HASH_KEY_LEN].try_into().unwrap()) as u64; - let hash = hash_mix_u64(value); + let hash = hash_mix_u64_with_kernel(value, self.hash_mix_kernel); let total_bits = self.row_hash_log + ROW_TAG_BITS; let combined = hash >> (u64::BITS as usize - total_bits); let row_mask = (1usize << self.row_hash_log) - 1; @@ -4309,11 +4251,6 @@ fn row_pick_lazy_depth2_keeps_best_when_next2_is_only_one_byte_better() { /// Verifies row/tag extraction uses the shared hash mix bit-splitting contract. #[test] fn row_hash_and_row_extracts_high_bits() { - #[cfg(feature = "std")] - let _lock = HASH_MIX_KERNEL_TEST_LOCK - .lock() - .expect("hash mix test lock poisoned"); - let mut matcher = RowMatchGenerator::new(1 << 22); matcher.configure(ROW_CONFIG); matcher.add_data( @@ -4333,7 +4270,7 @@ fn row_hash_and_row_extracts_high_bits() { let idx = pos - matcher.history_abs_start; let concat = matcher.live_history(); let value = u32::from_le_bytes(concat[idx..idx + ROW_HASH_KEY_LEN].try_into().unwrap()) as u64; - let hash = hash_mix_u64(value); + let hash = hash_mix_u64_with_kernel(value, matcher.hash_mix_kernel); let total_bits = matcher.row_hash_log + ROW_TAG_BITS; let combined = hash >> (u64::BITS as usize - total_bits); let expected_row = @@ -4375,12 +4312,12 @@ fn hash_mix_sse42_path_is_available_and_matches_accelerated_impl_when_supported( return; } - let _lock = HASH_MIX_KERNEL_TEST_LOCK - .lock() - .expect("hash mix test lock poisoned"); let v = 0x0123_4567_89AB_CDEFu64; let accelerated = unsafe { hash_mix_u64_sse42(v) }; - assert_eq!(hash_mix_u64(v), accelerated); + assert_eq!( + hash_mix_u64_with_kernel(v, HashMixKernel::X86Sse42), + accelerated + ); } #[cfg(all(feature = "std", target_arch = "x86_64"))] @@ -4388,7 +4325,7 @@ fn hash_mix_sse42_path_is_available_and_matches_accelerated_impl_when_supported( fn hash_mix_scalar_path_can_be_forced_for_coverage_and_matches_formula() { let v = 0x0123_4567_89AB_CDEFu64; let expected = v.wrapping_mul(HASH_MIX_PRIME); - let mixed = with_forced_hash_mix_kernel(HashMixKernel::Scalar, || hash_mix_u64(v)); + let mixed = hash_mix_u64_with_kernel(v, HashMixKernel::Scalar); assert_eq!(mixed, expected); } @@ -4399,12 +4336,12 @@ fn hash_mix_crc_path_is_available_and_matches_accelerated_impl_when_supported() return; } - let _lock = HASH_MIX_KERNEL_TEST_LOCK - .lock() - .expect("hash mix test lock poisoned"); let v = 0x0123_4567_89AB_CDEFu64; let accelerated = unsafe { hash_mix_u64_crc(v) }; - assert_eq!(hash_mix_u64(v), accelerated); + assert_eq!( + hash_mix_u64_with_kernel(v, HashMixKernel::Aarch64Crc), + accelerated + ); } #[cfg(all(feature = "std", target_arch = "aarch64", target_endian = "little"))] @@ -4412,7 +4349,7 @@ fn hash_mix_crc_path_is_available_and_matches_accelerated_impl_when_supported() fn hash_mix_scalar_path_can_be_forced_on_aarch64_and_matches_formula() { let v = 0x0123_4567_89AB_CDEFu64; let expected = v.wrapping_mul(HASH_MIX_PRIME); - let mixed = with_forced_hash_mix_kernel(HashMixKernel::Scalar, || hash_mix_u64(v)); + let mixed = hash_mix_u64_with_kernel(v, HashMixKernel::Scalar); assert_eq!(mixed, expected); } From 33b482ff339a57d36311864a3ae57918971ebd72 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 11 Apr 2026 15:10:48 +0300 Subject: [PATCH 9/9] test(encoding): tighten kernel dispatch assertions --- zstd/src/encoding/match_generator.rs | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index 02452538..3bcd8796 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -2872,8 +2872,7 @@ impl HcMatchGenerator { fn hash_position(&self, data: &[u8]) -> usize { let value = u32::from_le_bytes(data[..4].try_into().unwrap()) as u64; - const PRIME: u64 = 0x9E37_79B1_85EB_CA87; - ((value.wrapping_mul(PRIME)) >> (64 - self.hash_log)) as usize + ((value.wrapping_mul(HASH_MIX_PRIME)) >> (64 - self.hash_log)) as usize } fn relative_position(&self, abs_pos: usize) -> Option { @@ -4312,12 +4311,11 @@ fn hash_mix_sse42_path_is_available_and_matches_accelerated_impl_when_supported( return; } + let kernel = detect_hash_mix_kernel(); + assert_eq!(kernel, HashMixKernel::X86Sse42); let v = 0x0123_4567_89AB_CDEFu64; let accelerated = unsafe { hash_mix_u64_sse42(v) }; - assert_eq!( - hash_mix_u64_with_kernel(v, HashMixKernel::X86Sse42), - accelerated - ); + assert_eq!(hash_mix_u64_with_kernel(v, kernel), accelerated); } #[cfg(all(feature = "std", target_arch = "x86_64"))] @@ -4336,12 +4334,11 @@ fn hash_mix_crc_path_is_available_and_matches_accelerated_impl_when_supported() return; } + let kernel = detect_hash_mix_kernel(); + assert_eq!(kernel, HashMixKernel::Aarch64Crc); let v = 0x0123_4567_89AB_CDEFu64; let accelerated = unsafe { hash_mix_u64_crc(v) }; - assert_eq!( - hash_mix_u64_with_kernel(v, HashMixKernel::Aarch64Crc), - accelerated - ); + assert_eq!(hash_mix_u64_with_kernel(v, kernel), accelerated); } #[cfg(all(feature = "std", target_arch = "aarch64", target_endian = "little"))]