From 0af1bf71ee9a0fd9b7dd5d09e312c4d336fd135f Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 5 Apr 2026 19:24:45 +0300 Subject: [PATCH 01/14] feat(dict): add ffi fastcover training and finalization --- README.md | 3 +- zstd/Cargo.toml | 2 + zstd/src/dictionary/ffi_builder.rs | 410 +++++++++++++++++++++++++++++ zstd/src/dictionary/mod.rs | 5 + 4 files changed, 419 insertions(+), 1 deletion(-) create mode 100644 zstd/src/dictionary/ffi_builder.rs diff --git a/README.md b/README.md index 606164d9..31e2abe0 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,8 @@ Complete RFC 8878 implementation. Performance: ~1.4-3.5x slower than C zstd depe ### Dictionary Generation -When the `dict_builder` feature is enabled, the `dictionary` module can create raw content dictionaries. Within 0.2% of the official implementation on the `github-users` sample set. +When the `dict_builder` feature is enabled, the `dictionary` module can create raw content dictionaries. +When `dict_builder_ffi` is also enabled, it additionally exposes C-zstd-backed COVER/FastCOVER training (including parameter optimization) and raw-content dictionary finalization into full zstd dictionary format. ## Benchmarking diff --git a/zstd/Cargo.toml b/zstd/Cargo.toml index 786aa118..4d11eb73 100644 --- a/zstd/Cargo.toml +++ b/zstd/Cargo.toml @@ -21,6 +21,7 @@ categories = ["compression"] # Locked behind the `hash` feature flag twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"], optional = true } fastrand = {version = "2.3.0", optional = true } +zstd-sys = { version = "2.0.16", default-features = false, features = ["std", "zdict_builder", "experimental"], optional = true } # Internal feature, only used when building as part of libstd, not part of the # stable interface of this crate. @@ -36,6 +37,7 @@ zstd = { version = "0.13.3", features = ["zdict_builder"] } [features] default = ["hash", "std"] dict_builder = ["std", "dep:fastrand"] +dict_builder_ffi = ["dict_builder", "dep:zstd-sys"] hash = ["dep:twox-hash"] bench_internals = [] fuzz_exports = [] diff --git a/zstd/src/dictionary/ffi_builder.rs b/zstd/src/dictionary/ffi_builder.rs new file mode 100644 index 00000000..97bf52db --- /dev/null +++ b/zstd/src/dictionary/ffi_builder.rs @@ -0,0 +1,410 @@ +use std::ffi::CStr; +use std::io::{self, Write}; +use std::string::String; +use std::vec; +use std::vec::Vec; + +use zstd_sys::{ + ZDICT_DICTSIZE_MIN, ZDICT_fastCover_params_t, ZDICT_finalizeDictionary, + ZDICT_getDictID, ZDICT_getErrorName, ZDICT_isError, + ZDICT_optimizeTrainFromBuffer_cover, ZDICT_optimizeTrainFromBuffer_fastCover, + ZDICT_params_t, ZDICT_trainFromBuffer_cover, ZDICT_trainFromBuffer_fastCover, + ZDICT_cover_params_t, +}; + +/// Dictionary training algorithm used by the zstd C dictionary builder. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TrainingAlgorithm { + Cover, + FastCover, +} + +/// Builder parameters used for COVER/FastCOVER dictionary training. +#[derive(Debug, Clone, Copy)] +pub struct TrainingParams { + pub algorithm: TrainingAlgorithm, + pub optimize: bool, + pub k: u32, + pub d: u32, + pub f: u32, + pub steps: u32, + pub accel: u32, + pub nb_threads: u32, + pub split_point: f64, + pub compression_level: i32, + pub dict_id: Option, + pub notification_level: u32, +} + +impl Default for TrainingParams { + fn default() -> Self { + Self { + algorithm: TrainingAlgorithm::FastCover, + optimize: true, + k: 0, + d: 0, + f: 0, + steps: 0, + accel: 0, + nb_threads: 0, + split_point: 0.0, + compression_level: 0, + dict_id: None, + notification_level: 0, + } + } +} + +/// Parameters for finalizing a raw-content dictionary into full zstd dictionary format. +#[derive(Debug, Clone, Copy, Default)] +pub struct FinalizeParams { + pub compression_level: i32, + pub dict_id: Option, + pub notification_level: u32, +} + +/// Result metadata for FFI dictionary training. +#[derive(Debug, Clone, Copy)] +pub struct TrainedDictionaryInfo { + pub dict_size: usize, + pub dict_id: u32, + pub selected_k: u32, + pub selected_d: u32, + pub selected_f: u32, + pub selected_steps: u32, + pub selected_accel: u32, +} + +fn validate_samples(sample_data: &[u8], sample_sizes: &[usize]) -> io::Result<()> { + if sample_sizes.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "dictionary training requires at least one sample", + )); + } + if sample_sizes.iter().sum::() != sample_data.len() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "sample sizes must sum to sample_data length", + )); + } + Ok(()) +} + +fn parse_zdict_code(code: usize) -> io::Result { + // SAFETY: ZDICT_isError and ZDICT_getErrorName are pure functions over the code value. + let is_err = unsafe { ZDICT_isError(code) } != 0; + if !is_err { + return Ok(code); + } + // SAFETY: When ZDICT_isError(code) is non-zero, zstd returns a static error string. + let msg = unsafe { + let ptr = ZDICT_getErrorName(code); + if ptr.is_null() { + String::from("zstd dictionary builder error") + } else { + CStr::from_ptr(ptr).to_string_lossy().into_owned() + } + }; + Err(io::Error::other(msg)) +} + +fn zparams(compression_level: i32, dict_id: Option, notification_level: u32) -> ZDICT_params_t { + ZDICT_params_t { + compressionLevel: compression_level, + notificationLevel: notification_level, + dictID: dict_id.unwrap_or(0), + } +} + +/// Train a finalized zstd dictionary from contiguous sample data with explicit sample sizes. +pub fn train_dictionary_ffi( + sample_data: &[u8], + sample_sizes: &[usize], + output: &mut W, + dict_size: usize, + params: TrainingParams, +) -> io::Result { + validate_samples(sample_data, sample_sizes)?; + if !params.optimize && (params.k == 0 || params.d == 0) { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "k and d must be set when optimize=false", + )); + } + + let max_size = dict_size.max(ZDICT_DICTSIZE_MIN as usize); + let mut dict_buf = vec![0u8; max_size]; + let z_params = zparams( + params.compression_level, + params.dict_id, + params.notification_level, + ); + + let (written, selected_k, selected_d, selected_f, selected_steps, selected_accel) = + match params.algorithm { + TrainingAlgorithm::Cover => { + let mut cover = ZDICT_cover_params_t { + k: params.k, + d: params.d, + steps: params.steps, + nbThreads: params.nb_threads, + splitPoint: params.split_point, + shrinkDict: 0, + shrinkDictMaxRegression: 0, + zParams: z_params, + }; + let rc = if params.optimize { + // SAFETY: All pointers remain valid for the duration of the call. + unsafe { + ZDICT_optimizeTrainFromBuffer_cover( + dict_buf.as_mut_ptr().cast(), + dict_buf.len(), + sample_data.as_ptr().cast(), + sample_sizes.as_ptr(), + sample_sizes.len() as u32, + &mut cover, + ) + } + } else { + // SAFETY: All pointers remain valid for the duration of the call. + unsafe { + ZDICT_trainFromBuffer_cover( + dict_buf.as_mut_ptr().cast(), + dict_buf.len(), + sample_data.as_ptr().cast(), + sample_sizes.as_ptr(), + sample_sizes.len() as u32, + cover, + ) + } + }; + (parse_zdict_code(rc)?, cover.k, cover.d, 0, cover.steps, 0) + } + TrainingAlgorithm::FastCover => { + let mut fast = ZDICT_fastCover_params_t { + k: params.k, + d: params.d, + f: params.f, + steps: params.steps, + nbThreads: params.nb_threads, + splitPoint: params.split_point, + accel: params.accel, + shrinkDict: 0, + shrinkDictMaxRegression: 0, + zParams: z_params, + }; + let rc = if params.optimize { + // SAFETY: All pointers remain valid for the duration of the call. + unsafe { + ZDICT_optimizeTrainFromBuffer_fastCover( + dict_buf.as_mut_ptr().cast(), + dict_buf.len(), + sample_data.as_ptr().cast(), + sample_sizes.as_ptr(), + sample_sizes.len() as u32, + &mut fast, + ) + } + } else { + // SAFETY: All pointers remain valid for the duration of the call. + unsafe { + ZDICT_trainFromBuffer_fastCover( + dict_buf.as_mut_ptr().cast(), + dict_buf.len(), + sample_data.as_ptr().cast(), + sample_sizes.as_ptr(), + sample_sizes.len() as u32, + fast, + ) + } + }; + ( + parse_zdict_code(rc)?, + fast.k, + fast.d, + fast.f, + fast.steps, + fast.accel, + ) + } + }; + output.write_all(&dict_buf[..written])?; + // SAFETY: dictionary slice is valid and length is from zstd return code. + let dict_id = unsafe { ZDICT_getDictID(dict_buf.as_ptr().cast(), written) }; + Ok(TrainedDictionaryInfo { + dict_size: written, + dict_id, + selected_k, + selected_d, + selected_f, + selected_steps, + selected_accel, + }) +} + +/// Convenience helper: train dictionary from individual samples. +pub fn train_dictionary_from_samples_ffi, W: Write>( + samples: &[S], + output: &mut W, + dict_size: usize, + params: TrainingParams, +) -> io::Result { + let total_len = samples.iter().map(|s| s.as_ref().len()).sum(); + let mut data = Vec::with_capacity(total_len); + let mut sizes = Vec::with_capacity(samples.len()); + for sample in samples { + let sample = sample.as_ref(); + sizes.push(sample.len()); + data.extend_from_slice(sample); + } + train_dictionary_ffi(&data, &sizes, output, dict_size, params) +} + +/// Finalize a raw-content dictionary into full zstd dictionary format with entropy tables. +pub fn finalize_raw_dictionary_ffi( + raw_content: &[u8], + sample_data: &[u8], + sample_sizes: &[usize], + output: &mut W, + dict_size: usize, + params: FinalizeParams, +) -> io::Result { + validate_samples(sample_data, sample_sizes)?; + if raw_content.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "raw dictionary content must not be empty", + )); + } + + let max_size = dict_size + .max(raw_content.len()) + .max(ZDICT_DICTSIZE_MIN as usize); + let mut dict_buf = vec![0u8; max_size]; + let z_params = zparams( + params.compression_level, + params.dict_id, + params.notification_level, + ); + + // SAFETY: All pointers remain valid for the duration of the call. + let rc = unsafe { + ZDICT_finalizeDictionary( + dict_buf.as_mut_ptr().cast(), + dict_buf.len(), + raw_content.as_ptr().cast(), + raw_content.len(), + sample_data.as_ptr().cast(), + sample_sizes.as_ptr(), + sample_sizes.len() as u32, + z_params, + ) + }; + let written = parse_zdict_code(rc)?; + output.write_all(&dict_buf[..written])?; + // SAFETY: dictionary slice is valid and length is from zstd return code. + let dict_id = unsafe { ZDICT_getDictID(dict_buf.as_ptr().cast(), written) }; + Ok(TrainedDictionaryInfo { + dict_size: written, + dict_id, + selected_k: 0, + selected_d: 0, + selected_f: 0, + selected_steps: 0, + selected_accel: 0, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::decoding::Dictionary; + use crate::encoding::{CompressionLevel, FrameCompressor}; + use std::format; + use std::io::Cursor; + + fn sample_corpus() -> (Vec, Vec) { + let mut sizes = Vec::new(); + let mut data = Vec::new(); + for i in 0..400u32 { + let sample = format!("tenant=demo table=orders key={i} region=eu value=aaaaabbbbbccccc\n"); + sizes.push(sample.len()); + data.extend_from_slice(sample.as_bytes()); + } + (data, sizes) + } + + #[test] + fn fastcover_training_produces_decodeable_dictionary() { + let (sample_data, sample_sizes) = sample_corpus(); + let mut dict = Vec::new(); + let info = train_dictionary_ffi( + &sample_data, + &sample_sizes, + &mut dict, + 4096, + TrainingParams::default(), + ) + .expect("fastcover training should succeed"); + assert!(info.dict_size > 0); + let parsed = Dictionary::decode_dict(dict.as_slice()).expect("trained dict should decode"); + assert_eq!(parsed.id, info.dict_id); + assert!(!parsed.dict_content.is_empty()); + } + + #[test] + fn finalize_raw_dict_is_compatible_with_ffi_decoder() { + let (sample_data, sample_sizes) = sample_corpus(); + + let mut raw = Vec::new(); + crate::dictionary::create_raw_dict_from_source( + Cursor::new(sample_data.as_slice()), + sample_data.len(), + &mut raw, + 4096, + ); + assert!(!raw.is_empty(), "raw dictionary must not be empty"); + + let mut finalized = Vec::new(); + let info = finalize_raw_dictionary_ffi( + raw.as_slice(), + &sample_data, + &sample_sizes, + &mut finalized, + 4096, + FinalizeParams::default(), + ) + .expect("finalization should succeed"); + assert!(info.dict_size > 0); + + let parsed = Dictionary::decode_dict(finalized.as_slice()) + .expect("finalized dictionary should decode"); + + let mut payload = Vec::new(); + for idx in 0..128u32 { + payload.extend_from_slice( + format!("tenant=demo table=orders op=put key={idx} value=aaaaabbbbbcccccdddddeeeee\n") + .as_bytes(), + ); + } + + let mut compressed = Vec::new(); + let mut compressor = FrameCompressor::new(CompressionLevel::Fastest); + compressor + .set_dictionary(parsed) + .expect("dictionary should attach"); + compressor.set_source(payload.as_slice()); + compressor.set_drain(&mut compressed); + compressor.compress(); + + let mut ffi_decoder = zstd::bulk::Decompressor::with_dictionary(finalized.as_slice()) + .expect("ffi decoder should accept finalized dictionary"); + let mut decoded = Vec::with_capacity(payload.len()); + let written = ffi_decoder + .decompress_to_buffer(compressed.as_slice(), &mut decoded) + .expect("ffi decoder should decode payload"); + assert_eq!(written, payload.len()); + assert_eq!(decoded, payload); + } +} diff --git a/zstd/src/dictionary/mod.rs b/zstd/src/dictionary/mod.rs index 117bfe7a..4ae7ba14 100644 --- a/zstd/src/dictionary/mod.rs +++ b/zstd/src/dictionary/mod.rs @@ -23,6 +23,8 @@ // the frequency of w in the reservoir using a rolling karp-rabin hash // - The score of a segment is the sum of `f(w)` called on every kmer within the segment mod cover; +#[cfg(feature = "dict_builder_ffi")] +mod ffi_builder; mod frequency; mod reservoir; @@ -39,6 +41,9 @@ use std::{ vec::Vec, }; +#[cfg(feature = "dict_builder_ffi")] +pub use ffi_builder::*; + /// A set of values that are used during dictionary construction. /// /// Changing these values can improve the resulting dictionary size for certain datasets. From 8ea0b6fdac23dc841058ebc4805f0724f608feb7 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 5 Apr 2026 19:27:18 +0300 Subject: [PATCH 02/14] Revert "feat(dict): add ffi fastcover training and finalization" This reverts commit 5f1535330fa13c95b8576e7f3e67ecbe481a9127. --- README.md | 3 +- zstd/Cargo.toml | 2 - zstd/src/dictionary/ffi_builder.rs | 410 ----------------------------- zstd/src/dictionary/mod.rs | 5 - 4 files changed, 1 insertion(+), 419 deletions(-) delete mode 100644 zstd/src/dictionary/ffi_builder.rs diff --git a/README.md b/README.md index 31e2abe0..606164d9 100644 --- a/README.md +++ b/README.md @@ -54,8 +54,7 @@ Complete RFC 8878 implementation. Performance: ~1.4-3.5x slower than C zstd depe ### Dictionary Generation -When the `dict_builder` feature is enabled, the `dictionary` module can create raw content dictionaries. -When `dict_builder_ffi` is also enabled, it additionally exposes C-zstd-backed COVER/FastCOVER training (including parameter optimization) and raw-content dictionary finalization into full zstd dictionary format. +When the `dict_builder` feature is enabled, the `dictionary` module can create raw content dictionaries. Within 0.2% of the official implementation on the `github-users` sample set. ## Benchmarking diff --git a/zstd/Cargo.toml b/zstd/Cargo.toml index 4d11eb73..786aa118 100644 --- a/zstd/Cargo.toml +++ b/zstd/Cargo.toml @@ -21,7 +21,6 @@ categories = ["compression"] # Locked behind the `hash` feature flag twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"], optional = true } fastrand = {version = "2.3.0", optional = true } -zstd-sys = { version = "2.0.16", default-features = false, features = ["std", "zdict_builder", "experimental"], optional = true } # Internal feature, only used when building as part of libstd, not part of the # stable interface of this crate. @@ -37,7 +36,6 @@ zstd = { version = "0.13.3", features = ["zdict_builder"] } [features] default = ["hash", "std"] dict_builder = ["std", "dep:fastrand"] -dict_builder_ffi = ["dict_builder", "dep:zstd-sys"] hash = ["dep:twox-hash"] bench_internals = [] fuzz_exports = [] diff --git a/zstd/src/dictionary/ffi_builder.rs b/zstd/src/dictionary/ffi_builder.rs deleted file mode 100644 index 97bf52db..00000000 --- a/zstd/src/dictionary/ffi_builder.rs +++ /dev/null @@ -1,410 +0,0 @@ -use std::ffi::CStr; -use std::io::{self, Write}; -use std::string::String; -use std::vec; -use std::vec::Vec; - -use zstd_sys::{ - ZDICT_DICTSIZE_MIN, ZDICT_fastCover_params_t, ZDICT_finalizeDictionary, - ZDICT_getDictID, ZDICT_getErrorName, ZDICT_isError, - ZDICT_optimizeTrainFromBuffer_cover, ZDICT_optimizeTrainFromBuffer_fastCover, - ZDICT_params_t, ZDICT_trainFromBuffer_cover, ZDICT_trainFromBuffer_fastCover, - ZDICT_cover_params_t, -}; - -/// Dictionary training algorithm used by the zstd C dictionary builder. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum TrainingAlgorithm { - Cover, - FastCover, -} - -/// Builder parameters used for COVER/FastCOVER dictionary training. -#[derive(Debug, Clone, Copy)] -pub struct TrainingParams { - pub algorithm: TrainingAlgorithm, - pub optimize: bool, - pub k: u32, - pub d: u32, - pub f: u32, - pub steps: u32, - pub accel: u32, - pub nb_threads: u32, - pub split_point: f64, - pub compression_level: i32, - pub dict_id: Option, - pub notification_level: u32, -} - -impl Default for TrainingParams { - fn default() -> Self { - Self { - algorithm: TrainingAlgorithm::FastCover, - optimize: true, - k: 0, - d: 0, - f: 0, - steps: 0, - accel: 0, - nb_threads: 0, - split_point: 0.0, - compression_level: 0, - dict_id: None, - notification_level: 0, - } - } -} - -/// Parameters for finalizing a raw-content dictionary into full zstd dictionary format. -#[derive(Debug, Clone, Copy, Default)] -pub struct FinalizeParams { - pub compression_level: i32, - pub dict_id: Option, - pub notification_level: u32, -} - -/// Result metadata for FFI dictionary training. -#[derive(Debug, Clone, Copy)] -pub struct TrainedDictionaryInfo { - pub dict_size: usize, - pub dict_id: u32, - pub selected_k: u32, - pub selected_d: u32, - pub selected_f: u32, - pub selected_steps: u32, - pub selected_accel: u32, -} - -fn validate_samples(sample_data: &[u8], sample_sizes: &[usize]) -> io::Result<()> { - if sample_sizes.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "dictionary training requires at least one sample", - )); - } - if sample_sizes.iter().sum::() != sample_data.len() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "sample sizes must sum to sample_data length", - )); - } - Ok(()) -} - -fn parse_zdict_code(code: usize) -> io::Result { - // SAFETY: ZDICT_isError and ZDICT_getErrorName are pure functions over the code value. - let is_err = unsafe { ZDICT_isError(code) } != 0; - if !is_err { - return Ok(code); - } - // SAFETY: When ZDICT_isError(code) is non-zero, zstd returns a static error string. - let msg = unsafe { - let ptr = ZDICT_getErrorName(code); - if ptr.is_null() { - String::from("zstd dictionary builder error") - } else { - CStr::from_ptr(ptr).to_string_lossy().into_owned() - } - }; - Err(io::Error::other(msg)) -} - -fn zparams(compression_level: i32, dict_id: Option, notification_level: u32) -> ZDICT_params_t { - ZDICT_params_t { - compressionLevel: compression_level, - notificationLevel: notification_level, - dictID: dict_id.unwrap_or(0), - } -} - -/// Train a finalized zstd dictionary from contiguous sample data with explicit sample sizes. -pub fn train_dictionary_ffi( - sample_data: &[u8], - sample_sizes: &[usize], - output: &mut W, - dict_size: usize, - params: TrainingParams, -) -> io::Result { - validate_samples(sample_data, sample_sizes)?; - if !params.optimize && (params.k == 0 || params.d == 0) { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "k and d must be set when optimize=false", - )); - } - - let max_size = dict_size.max(ZDICT_DICTSIZE_MIN as usize); - let mut dict_buf = vec![0u8; max_size]; - let z_params = zparams( - params.compression_level, - params.dict_id, - params.notification_level, - ); - - let (written, selected_k, selected_d, selected_f, selected_steps, selected_accel) = - match params.algorithm { - TrainingAlgorithm::Cover => { - let mut cover = ZDICT_cover_params_t { - k: params.k, - d: params.d, - steps: params.steps, - nbThreads: params.nb_threads, - splitPoint: params.split_point, - shrinkDict: 0, - shrinkDictMaxRegression: 0, - zParams: z_params, - }; - let rc = if params.optimize { - // SAFETY: All pointers remain valid for the duration of the call. - unsafe { - ZDICT_optimizeTrainFromBuffer_cover( - dict_buf.as_mut_ptr().cast(), - dict_buf.len(), - sample_data.as_ptr().cast(), - sample_sizes.as_ptr(), - sample_sizes.len() as u32, - &mut cover, - ) - } - } else { - // SAFETY: All pointers remain valid for the duration of the call. - unsafe { - ZDICT_trainFromBuffer_cover( - dict_buf.as_mut_ptr().cast(), - dict_buf.len(), - sample_data.as_ptr().cast(), - sample_sizes.as_ptr(), - sample_sizes.len() as u32, - cover, - ) - } - }; - (parse_zdict_code(rc)?, cover.k, cover.d, 0, cover.steps, 0) - } - TrainingAlgorithm::FastCover => { - let mut fast = ZDICT_fastCover_params_t { - k: params.k, - d: params.d, - f: params.f, - steps: params.steps, - nbThreads: params.nb_threads, - splitPoint: params.split_point, - accel: params.accel, - shrinkDict: 0, - shrinkDictMaxRegression: 0, - zParams: z_params, - }; - let rc = if params.optimize { - // SAFETY: All pointers remain valid for the duration of the call. - unsafe { - ZDICT_optimizeTrainFromBuffer_fastCover( - dict_buf.as_mut_ptr().cast(), - dict_buf.len(), - sample_data.as_ptr().cast(), - sample_sizes.as_ptr(), - sample_sizes.len() as u32, - &mut fast, - ) - } - } else { - // SAFETY: All pointers remain valid for the duration of the call. - unsafe { - ZDICT_trainFromBuffer_fastCover( - dict_buf.as_mut_ptr().cast(), - dict_buf.len(), - sample_data.as_ptr().cast(), - sample_sizes.as_ptr(), - sample_sizes.len() as u32, - fast, - ) - } - }; - ( - parse_zdict_code(rc)?, - fast.k, - fast.d, - fast.f, - fast.steps, - fast.accel, - ) - } - }; - output.write_all(&dict_buf[..written])?; - // SAFETY: dictionary slice is valid and length is from zstd return code. - let dict_id = unsafe { ZDICT_getDictID(dict_buf.as_ptr().cast(), written) }; - Ok(TrainedDictionaryInfo { - dict_size: written, - dict_id, - selected_k, - selected_d, - selected_f, - selected_steps, - selected_accel, - }) -} - -/// Convenience helper: train dictionary from individual samples. -pub fn train_dictionary_from_samples_ffi, W: Write>( - samples: &[S], - output: &mut W, - dict_size: usize, - params: TrainingParams, -) -> io::Result { - let total_len = samples.iter().map(|s| s.as_ref().len()).sum(); - let mut data = Vec::with_capacity(total_len); - let mut sizes = Vec::with_capacity(samples.len()); - for sample in samples { - let sample = sample.as_ref(); - sizes.push(sample.len()); - data.extend_from_slice(sample); - } - train_dictionary_ffi(&data, &sizes, output, dict_size, params) -} - -/// Finalize a raw-content dictionary into full zstd dictionary format with entropy tables. -pub fn finalize_raw_dictionary_ffi( - raw_content: &[u8], - sample_data: &[u8], - sample_sizes: &[usize], - output: &mut W, - dict_size: usize, - params: FinalizeParams, -) -> io::Result { - validate_samples(sample_data, sample_sizes)?; - if raw_content.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "raw dictionary content must not be empty", - )); - } - - let max_size = dict_size - .max(raw_content.len()) - .max(ZDICT_DICTSIZE_MIN as usize); - let mut dict_buf = vec![0u8; max_size]; - let z_params = zparams( - params.compression_level, - params.dict_id, - params.notification_level, - ); - - // SAFETY: All pointers remain valid for the duration of the call. - let rc = unsafe { - ZDICT_finalizeDictionary( - dict_buf.as_mut_ptr().cast(), - dict_buf.len(), - raw_content.as_ptr().cast(), - raw_content.len(), - sample_data.as_ptr().cast(), - sample_sizes.as_ptr(), - sample_sizes.len() as u32, - z_params, - ) - }; - let written = parse_zdict_code(rc)?; - output.write_all(&dict_buf[..written])?; - // SAFETY: dictionary slice is valid and length is from zstd return code. - let dict_id = unsafe { ZDICT_getDictID(dict_buf.as_ptr().cast(), written) }; - Ok(TrainedDictionaryInfo { - dict_size: written, - dict_id, - selected_k: 0, - selected_d: 0, - selected_f: 0, - selected_steps: 0, - selected_accel: 0, - }) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::decoding::Dictionary; - use crate::encoding::{CompressionLevel, FrameCompressor}; - use std::format; - use std::io::Cursor; - - fn sample_corpus() -> (Vec, Vec) { - let mut sizes = Vec::new(); - let mut data = Vec::new(); - for i in 0..400u32 { - let sample = format!("tenant=demo table=orders key={i} region=eu value=aaaaabbbbbccccc\n"); - sizes.push(sample.len()); - data.extend_from_slice(sample.as_bytes()); - } - (data, sizes) - } - - #[test] - fn fastcover_training_produces_decodeable_dictionary() { - let (sample_data, sample_sizes) = sample_corpus(); - let mut dict = Vec::new(); - let info = train_dictionary_ffi( - &sample_data, - &sample_sizes, - &mut dict, - 4096, - TrainingParams::default(), - ) - .expect("fastcover training should succeed"); - assert!(info.dict_size > 0); - let parsed = Dictionary::decode_dict(dict.as_slice()).expect("trained dict should decode"); - assert_eq!(parsed.id, info.dict_id); - assert!(!parsed.dict_content.is_empty()); - } - - #[test] - fn finalize_raw_dict_is_compatible_with_ffi_decoder() { - let (sample_data, sample_sizes) = sample_corpus(); - - let mut raw = Vec::new(); - crate::dictionary::create_raw_dict_from_source( - Cursor::new(sample_data.as_slice()), - sample_data.len(), - &mut raw, - 4096, - ); - assert!(!raw.is_empty(), "raw dictionary must not be empty"); - - let mut finalized = Vec::new(); - let info = finalize_raw_dictionary_ffi( - raw.as_slice(), - &sample_data, - &sample_sizes, - &mut finalized, - 4096, - FinalizeParams::default(), - ) - .expect("finalization should succeed"); - assert!(info.dict_size > 0); - - let parsed = Dictionary::decode_dict(finalized.as_slice()) - .expect("finalized dictionary should decode"); - - let mut payload = Vec::new(); - for idx in 0..128u32 { - payload.extend_from_slice( - format!("tenant=demo table=orders op=put key={idx} value=aaaaabbbbbcccccdddddeeeee\n") - .as_bytes(), - ); - } - - let mut compressed = Vec::new(); - let mut compressor = FrameCompressor::new(CompressionLevel::Fastest); - compressor - .set_dictionary(parsed) - .expect("dictionary should attach"); - compressor.set_source(payload.as_slice()); - compressor.set_drain(&mut compressed); - compressor.compress(); - - let mut ffi_decoder = zstd::bulk::Decompressor::with_dictionary(finalized.as_slice()) - .expect("ffi decoder should accept finalized dictionary"); - let mut decoded = Vec::with_capacity(payload.len()); - let written = ffi_decoder - .decompress_to_buffer(compressed.as_slice(), &mut decoded) - .expect("ffi decoder should decode payload"); - assert_eq!(written, payload.len()); - assert_eq!(decoded, payload); - } -} diff --git a/zstd/src/dictionary/mod.rs b/zstd/src/dictionary/mod.rs index 4ae7ba14..117bfe7a 100644 --- a/zstd/src/dictionary/mod.rs +++ b/zstd/src/dictionary/mod.rs @@ -23,8 +23,6 @@ // the frequency of w in the reservoir using a rolling karp-rabin hash // - The score of a segment is the sum of `f(w)` called on every kmer within the segment mod cover; -#[cfg(feature = "dict_builder_ffi")] -mod ffi_builder; mod frequency; mod reservoir; @@ -41,9 +39,6 @@ use std::{ vec::Vec, }; -#[cfg(feature = "dict_builder_ffi")] -pub use ffi_builder::*; - /// A set of values that are used during dictionary construction. /// /// Changing these values can improve the resulting dictionary size for certain datasets. From 7dedf72b97a108850603c1a00ce50793c29d05e8 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 5 Apr 2026 19:46:36 +0300 Subject: [PATCH 03/14] feat(dict): implement pure-rust fastcover and finalize --- README.md | 6 +- zstd/Cargo.toml | 5 + zstd/benches/dict_builder_fastcover.rs | 76 +++++ zstd/src/dictionary/cover.rs | 8 +- zstd/src/dictionary/fastcover.rs | 236 ++++++++++++++++ zstd/src/dictionary/mod.rs | 367 ++++++++++++++++++++++++- 6 files changed, 680 insertions(+), 18 deletions(-) create mode 100644 zstd/benches/dict_builder_fastcover.rs create mode 100644 zstd/src/dictionary/fastcover.rs diff --git a/README.md b/README.md index 606164d9..ddd7ead4 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,11 @@ Complete RFC 8878 implementation. Performance: ~1.4-3.5x slower than C zstd depe ### Dictionary Generation -When the `dict_builder` feature is enabled, the `dictionary` module can create raw content dictionaries. Within 0.2% of the official implementation on the `github-users` sample set. +When the `dict_builder` feature is enabled, the `dictionary` module can: +- build raw dictionaries with COVER (`create_raw_dict_from_source`) +- build raw dictionaries with FastCOVER (`create_fastcover_raw_dict_from_source`) +- finalize raw content into full zstd dictionary format (`finalize_raw_dict`) +- train+finalize in one pure-Rust flow (`create_fastcover_dict_from_source`) ## Benchmarking diff --git a/zstd/Cargo.toml b/zstd/Cargo.toml index 786aa118..a0a6c4b5 100644 --- a/zstd/Cargo.toml +++ b/zstd/Cargo.toml @@ -57,3 +57,8 @@ harness = false name = "bitstream" harness = false required-features = ["bench_internals"] + +[[bench]] +name = "dict_builder_fastcover" +harness = false +required-features = ["dict_builder"] diff --git a/zstd/benches/dict_builder_fastcover.rs b/zstd/benches/dict_builder_fastcover.rs new file mode 100644 index 00000000..fbd050bf --- /dev/null +++ b/zstd/benches/dict_builder_fastcover.rs @@ -0,0 +1,76 @@ +use criterion::{Criterion, criterion_group, criterion_main}; +use std::hint::black_box; +use std::io::Cursor; +use structured_zstd::dictionary::{ + FastCoverOptions, create_fastcover_raw_dict_from_source, create_raw_dict_from_source, +}; + +fn corpus() -> Vec { + let mut data = Vec::new(); + for i in 0..2_000u32 { + data.extend_from_slice( + format!( + "tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbcccccdddddeeeeefffff\n" + ) + .as_bytes(), + ); + } + data +} + +fn bench_dict_builder(c: &mut Criterion) { + let data = corpus(); + let dict_size = 8 * 1024; + + c.bench_function("dict_builder/cover_raw", |b| { + b.iter(|| { + let mut out = Vec::new(); + create_raw_dict_from_source( + Cursor::new(data.as_slice()), + data.len(), + &mut out, + black_box(dict_size), + ); + black_box(out.len()); + }) + }); + + c.bench_function("dict_builder/fastcover_raw_opt", |b| { + b.iter(|| { + let mut out = Vec::new(); + let tuned = create_fastcover_raw_dict_from_source( + Cursor::new(data.as_slice()), + &mut out, + black_box(dict_size), + &FastCoverOptions::default(), + ) + .expect("fastcover training should succeed"); + black_box((out.len(), tuned.score)); + }) + }); + + c.bench_function("dict_builder/fastcover_raw_fixed", |b| { + b.iter(|| { + let mut out = Vec::new(); + let opts = FastCoverOptions { + optimize: false, + accel: 4, + k: 256, + d: 8, + f: 20, + ..FastCoverOptions::default() + }; + let tuned = create_fastcover_raw_dict_from_source( + Cursor::new(data.as_slice()), + &mut out, + black_box(dict_size), + &opts, + ) + .expect("fastcover training should succeed"); + black_box((out.len(), tuned.score)); + }) + }); +} + +criterion_group!(benches, bench_dict_builder); +criterion_main!(benches); diff --git a/zstd/src/dictionary/cover.rs b/zstd/src/dictionary/cover.rs index fa448a2d..fbcd3660 100644 --- a/zstd/src/dictionary/cover.rs +++ b/zstd/src/dictionary/cover.rs @@ -67,9 +67,10 @@ pub struct Context { pub fn pick_best_segment( params: &DictParams, ctx: &mut Context, + epoch: &'_ [u8], collection_sample: &'_ [u8], ) -> Segment { - let mut segments = collection_sample + let mut segments = epoch .chunks(params.segment_size as usize) .peekable(); let mut best_segment: &[u8] = segments.peek().expect("at least one segment"); @@ -93,9 +94,12 @@ pub fn pick_best_segment( /// /// `score_segment` modifies `ctx.frequencies`. fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> usize { + if segment.len() < K { + return 0; + } let mut segment_score = 0; // Determine the score of each overlapping k-mer - for i in 0..(segment.len() - K - 1) { + for i in 0..=(segment.len() - K) { let kmer: &KMer = (&segment[i..i + K]) .try_into() .expect("Failed to make kmer"); diff --git a/zstd/src/dictionary/fastcover.rs b/zstd/src/dictionary/fastcover.rs new file mode 100644 index 00000000..ea4dffc4 --- /dev/null +++ b/zstd/src/dictionary/fastcover.rs @@ -0,0 +1,236 @@ +use alloc::vec; +use alloc::vec::Vec; + +#[derive(Debug, Clone, Copy)] +pub struct FastCoverParams { + pub k: usize, + pub d: usize, + pub f: u32, + pub accel: usize, +} + +#[derive(Debug, Clone, Copy)] +pub struct FastCoverTuned { + pub k: usize, + pub d: usize, + pub f: u32, + pub accel: usize, + pub score: usize, +} + +pub const DEFAULT_K_CANDIDATES: &[usize] = &[64, 128, 256, 512, 1024, 2048]; +pub const DEFAULT_D_CANDIDATES: &[usize] = &[6, 8, 12, 16]; +pub const DEFAULT_F_CANDIDATES: &[u32] = &[16, 18, 20]; + +fn hash_dmer(dmer: &[u8]) -> u64 { + // 64-bit FNV-1a, deterministic and cheap for d-mer hashing. + let mut h = 0xcbf29ce484222325u64; + for &b in dmer { + h ^= u64::from(b); + h = h.wrapping_mul(0x100000001b3); + } + h +} + +fn clamp_table_bits(f: u32) -> u32 { + f.clamp(8, 20) +} + +fn build_frequency_table(sample: &[u8], d: usize, f: u32, accel: usize) -> Vec { + let bits = clamp_table_bits(f); + let size = 1usize << bits; + let mask = size - 1; + let step = accel.max(1); + let mut table = vec![0u32; size]; + + if sample.len() < d || d == 0 { + return table; + } + + let mut i = 0usize; + while i + d <= sample.len() { + let slot = (hash_dmer(&sample[i..i + d]) as usize) & mask; + table[slot] = table[slot].saturating_add(1); + i += step; + } + table +} + +fn score_segment(segment: &[u8], d: usize, mask: usize, table: &[u32]) -> usize { + if segment.len() < d || d == 0 { + return 0; + } + let mut score = 0usize; + for i in 0..=(segment.len() - d) { + let slot = (hash_dmer(&segment[i..i + d]) as usize) & mask; + score += table[slot] as usize; + } + score +} + +fn build_raw_dict(sample: &[u8], dict_size: usize, params: FastCoverParams) -> Vec { + if sample.is_empty() || dict_size == 0 { + return Vec::new(); + } + + let k = params.k.max(params.d).max(16); + let d = params.d.clamp(4, 32); + let table = build_frequency_table(sample, d, params.f, params.accel); + let mask = table.len().saturating_sub(1); + + let mut segments: Vec<(usize, &[u8])> = sample + .chunks(k) + .filter(|seg| seg.len() >= d) + .map(|seg| (score_segment(seg, d, mask, &table), seg)) + .collect(); + segments.sort_by(|a, b| b.0.cmp(&a.0)); + + let mut out = Vec::with_capacity(dict_size); + for (_, seg) in segments { + if out.len() >= dict_size { + break; + } + let remaining = dict_size - out.len(); + if seg.len() <= remaining { + out.extend_from_slice(seg); + } else { + out.extend_from_slice(&seg[..remaining]); + } + } + out +} + +fn coverage_score(dict: &[u8], eval: &[u8], d: usize, accel: usize) -> usize { + if dict.len() < d || eval.len() < d || d == 0 { + return 0; + } + let mut seen = std::collections::HashSet::with_capacity(dict.len() / d + 1); + for i in 0..=(dict.len() - d) { + seen.insert(hash_dmer(&dict[i..i + d])); + } + + let mut hits = 0usize; + let step = accel.max(1); + let mut i = 0usize; + while i + d <= eval.len() { + if seen.contains(&hash_dmer(&eval[i..i + d])) { + hits += 1; + } + i += step; + } + hits +} + +pub fn train_fastcover_raw( + sample: &[u8], + dict_size: usize, + params: FastCoverParams, +) -> Vec { + build_raw_dict(sample, dict_size, params) +} + +pub fn optimize_fastcover_raw( + sample: &[u8], + dict_size: usize, + split_point: f64, + accel: usize, + d_candidates: &[usize], + f_candidates: &[u32], + k_values: &[usize], +) -> (Vec, FastCoverTuned) { + let split = split_point.clamp(0.1, 0.95); + let split_idx = ((sample.len() as f64) * split) as usize; + let split_idx = split_idx.clamp(1, sample.len().saturating_sub(1)); + let (train, eval) = sample.split_at(split_idx); + + let d_values = if d_candidates.is_empty() { + DEFAULT_D_CANDIDATES + } else { + d_candidates + }; + let f_values = if f_candidates.is_empty() { + DEFAULT_F_CANDIDATES + } else { + f_candidates + }; + + let mut best_dict = Vec::new(); + let mut best = FastCoverTuned { + k: 0, + d: 0, + f: 0, + accel, + score: 0, + }; + + for &f in f_values { + for &d in d_values { + for &k in k_values { + let params = FastCoverParams { k, d, f, accel }; + let dict = build_raw_dict(train, dict_size, params); + let score = coverage_score(dict.as_slice(), eval, d, accel); + if score > best.score { + best.score = score; + best.k = k; + best.d = d; + best.f = f; + best_dict = dict; + } + } + } + } + + (best_dict, best) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::format; + + fn corpus() -> Vec { + let mut data = Vec::new(); + for i in 0..500u32 { + data.extend_from_slice( + format!("tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbccccdddd\n") + .as_bytes(), + ); + } + data + } + + #[test] + fn fastcover_raw_produces_non_empty_dict() { + let sample = corpus(); + let dict = train_fastcover_raw( + sample.as_slice(), + 4096, + FastCoverParams { + k: 256, + d: 8, + f: 20, + accel: 1, + }, + ); + assert!(!dict.is_empty()); + assert!(dict.len() <= 4096); + } + + #[test] + fn fastcover_optimizer_selects_valid_params() { + let sample = corpus(); + let (dict, tuned) = optimize_fastcover_raw( + sample.as_slice(), + 4096, + 0.75, + 1, + &[6, 8], + &[18, 20], + &[128, 256], + ); + assert!(!dict.is_empty()); + assert!([6, 8].contains(&tuned.d)); + assert!([18, 20].contains(&tuned.f)); + assert!([128, 256].contains(&tuned.k)); + } +} diff --git a/zstd/src/dictionary/mod.rs b/zstd/src/dictionary/mod.rs index 117bfe7a..535820ec 100644 --- a/zstd/src/dictionary/mod.rs +++ b/zstd/src/dictionary/mod.rs @@ -23,22 +23,67 @@ // the frequency of w in the reservoir using a rolling karp-rabin hash // - The score of a segment is the sum of `f(w)` called on every kmer within the segment mod cover; +mod fastcover; mod frequency; mod reservoir; +use crate::bit_io::BitWriter; +use crate::decoding::dictionary::MAGIC_NUM as DICT_MAGIC_NUM; +use crate::fse::fse_encoder; +use crate::huff0::HuffmanTable as HuffmanDecoderTable; +use crate::huff0::huff0_encoder::{HuffmanEncoder, HuffmanTable as HuffmanEncoderTable}; use crate::dictionary::reservoir::create_sample; -use alloc::vec; use core::cmp::Reverse; use cover::*; +pub use fastcover::{ + DEFAULT_D_CANDIDATES, DEFAULT_F_CANDIDATES, DEFAULT_K_CANDIDATES, FastCoverParams, + FastCoverTuned, +}; use std::{ boxed::Box, collections::{BinaryHeap, HashMap}, + format, fs::{self, File}, - io::{self, BufReader, Read}, + io::{self, Read}, path::{Path, PathBuf}, vec::Vec, }; +/// Tuning knobs for pure-Rust FastCOVER training. +#[derive(Debug, Clone)] +pub struct FastCoverOptions { + pub optimize: bool, + pub split_point: f64, + pub accel: usize, + pub k: usize, + pub d: usize, + pub f: u32, + pub k_candidates: Vec, + pub d_candidates: Vec, + pub f_candidates: Vec, +} + +impl Default for FastCoverOptions { + fn default() -> Self { + Self { + optimize: true, + split_point: 0.75, + accel: 1, + k: 256, + d: 8, + f: 20, + k_candidates: DEFAULT_K_CANDIDATES.to_vec(), + d_candidates: DEFAULT_D_CANDIDATES.to_vec(), + f_candidates: DEFAULT_F_CANDIDATES.to_vec(), + } + } +} + +#[derive(Debug, Clone, Copy, Default)] +pub struct FinalizeOptions { + pub dict_id: Option, +} + /// A set of values that are used during dictionary construction. /// /// Changing these values can improve the resulting dictionary size for certain datasets. @@ -125,22 +170,45 @@ pub fn create_raw_dict_from_dir, W: io::Write>( /// /// This function uses `BufRead` internally, the provided reader need not be buffered. pub fn create_raw_dict_from_source( - source: R, + mut source: R, source_size: usize, output: &mut W, dict_size: usize, ) { + if source_size == 0 || dict_size == 0 { + return; + } + let mut all = Vec::with_capacity(source_size); + source + .read_to_end(&mut all) + .expect("can read full source for dictionary training"); + if all.is_empty() { + return; + } + + if all.len() < K { + let keep = usize::min(all.len(), dict_size); + output + .write_all(&all[all.len() - keep..]) + .expect("can write tiny dictionary"); + return; + } + + let source_size = all.len(); vprintln!("create_dict: creating {dict_size} byte dict from {source_size} byte source"); - let mut buffered_source = BufReader::with_capacity(128_000, source); let params = DictParams { segment_size: 2048 }; - let num_segments = source_size / params.segment_size as usize; + let num_segments = usize::max(1, source_size / params.segment_size as usize); // According to 4. Experiments - Varying Reservoir Sampler Thresholds, // setting reservoir size to collection size / min{collection size / (2 * number of segments), // 256} was effective - let sample_size = source_size / usize::min(source_size / (2 * num_segments), 256); + let denom = usize::max(1, source_size / (2 * num_segments)); + let sample_scale = usize::max(1, usize::min(denom, 256)); + let mut sample_size = source_size / sample_scale; + sample_size = usize::max(sample_size, usize::min(source_size, 16)); vprintln!("create_dict: creating {sample_size} byte sample of collection"); - let collection_sample = create_sample(&mut buffered_source, sample_size); + let mut sample_reader = all.as_slice(); + let collection_sample = create_sample(&mut sample_reader, sample_size); // A collection of segments to be used in the final dictionary. // @@ -151,21 +219,15 @@ pub fn create_raw_dict_from_source( let (_, epoch_size) = compute_epoch_info(¶ms, dict_size, source_size / K); let num_epochs = source_size / epoch_size; vprintln!("create_dict: computed epoch info, using {num_epochs} epochs of {epoch_size} bytes"); - //let mut current_epoch = vec![0; epoch_size]; - let mut current_epoch = vec![0; 100]; let mut epoch_counter = 0; let mut ctx = Context { frequencies: HashMap::with_capacity(epoch_size / K), }; // Score each segment in the epoch and select the highest scoring segment // for the pool - while buffered_source - .read(&mut current_epoch) - .expect("can read input") - != 0 - { + for epoch in all.chunks(epoch_size) { epoch_counter += 1; - let best_segment = pick_best_segment(¶ms, &mut ctx, &collection_sample); + let best_segment = pick_best_segment(¶ms, &mut ctx, epoch, &collection_sample); vprintln!( "\tcreate_dict: epoch {epoch_counter}/{num_epochs} has best segment score {}", best_segment.score @@ -186,3 +248,278 @@ pub fn create_raw_dict_from_source( .expect("can write to output"); } } + +fn serialize_huffman_table(sample_data: &[u8], raw_content: &[u8]) -> io::Result> { + let mut stats = if sample_data.len() >= 2 { + sample_data.to_vec() + } else { + raw_content.to_vec() + }; + if stats.len() < 2 || stats.iter().all(|b| *b == stats[0]) { + stats = (0u8..=255).collect(); + } + + let table = HuffmanEncoderTable::build_from_data(stats.as_slice()); + let mut writer = BitWriter::new(); + let mut encoder = HuffmanEncoder::new(&table, &mut writer); + encoder.encode(&[stats[0]], true); + let encoded = writer.dump(); + + let mut decoder = HuffmanDecoderTable::new(); + let table_size = decoder + .build_decoder(encoded.as_slice()) + .map_err(|e| io::Error::other(format!("failed to decode generated huffman table: {e}")))?; + Ok(encoded[..table_size as usize].to_vec()) +} + +fn serialize_fse_table(table: &fse_encoder::FSETable) -> Vec { + let mut writer = BitWriter::new(); + table.write_table(&mut writer); + writer.dump() +} + +fn derive_dict_id(raw_content: &[u8]) -> u32 { + let mut h = 0xcbf29ce484222325u64; + for &b in raw_content { + h ^= u64::from(b); + h = h.wrapping_mul(0x100000001b3); + } + let compliant = (h % ((1u64 << 31) - 32768)) + 32768; + compliant as u32 +} + +/// Finalize raw dictionary content into a full zstd dictionary binary +/// (`magic + dict_id + entropy tables + offset history + content`). +pub fn finalize_raw_dict( + raw_content: &[u8], + sample_data: &[u8], + dict_size: usize, + options: FinalizeOptions, +) -> io::Result> { + if raw_content.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "raw dictionary content must not be empty", + )); + } + let mut out = Vec::with_capacity(dict_size.max(256)); + out.extend_from_slice(&DICT_MAGIC_NUM); + let dict_id = options.dict_id.unwrap_or_else(|| derive_dict_id(raw_content)); + out.extend_from_slice(&dict_id.to_le_bytes()); + out.extend_from_slice(serialize_huffman_table(sample_data, raw_content)?.as_slice()); + out.extend_from_slice(serialize_fse_table(&fse_encoder::default_of_table()).as_slice()); + out.extend_from_slice(serialize_fse_table(&fse_encoder::default_ml_table()).as_slice()); + out.extend_from_slice(serialize_fse_table(&fse_encoder::default_ll_table()).as_slice()); + + // Repeat offsets: keep default bootstrap history. + out.extend_from_slice(&1u32.to_le_bytes()); + out.extend_from_slice(&4u32.to_le_bytes()); + out.extend_from_slice(&8u32.to_le_bytes()); + + let min_content_size = 8usize; + let max_content_budget = dict_size.saturating_sub(out.len()); + if max_content_budget < min_content_size { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "dictionary size too small to fit header and offset history", + )); + } + + let content = if raw_content.len() > max_content_budget { + &raw_content[raw_content.len() - max_content_budget..] + } else { + raw_content + }; + if content.len() < min_content_size { + out.resize(out.len() + (min_content_size - content.len()), 0); + } + out.extend_from_slice(content); + Ok(out) +} + +/// Train a raw FastCOVER dictionary from a source stream. +pub fn create_fastcover_raw_dict_from_source( + mut source: R, + output: &mut W, + dict_size: usize, + options: &FastCoverOptions, +) -> io::Result { + let mut sample = Vec::new(); + source.read_to_end(&mut sample)?; + if sample.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "source stream is empty", + )); + } + + let (dict, tuned) = if options.optimize { + let (dict, tuned) = fastcover::optimize_fastcover_raw( + sample.as_slice(), + dict_size, + options.split_point, + options.accel, + options.d_candidates.as_slice(), + options.f_candidates.as_slice(), + options.k_candidates.as_slice(), + ); + (dict, tuned) + } else { + let params = FastCoverParams { + k: options.k, + d: options.d, + f: options.f, + accel: options.accel, + }; + let dict = fastcover::train_fastcover_raw(sample.as_slice(), dict_size, params); + ( + dict, + FastCoverTuned { + k: options.k, + d: options.d, + f: options.f, + accel: options.accel, + score: 0, + }, + ) + }; + output.write_all(dict.as_slice())?; + Ok(tuned) +} + +/// Train and finalize a FastCOVER dictionary in pure Rust. +pub fn create_fastcover_dict_from_source( + mut source: R, + output: &mut W, + dict_size: usize, + fastcover: &FastCoverOptions, + finalize: FinalizeOptions, +) -> io::Result { + let mut sample = Vec::new(); + source.read_to_end(&mut sample)?; + if sample.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "source stream is empty", + )); + } + + let (raw_dict, tuned) = if fastcover.optimize { + fastcover::optimize_fastcover_raw( + sample.as_slice(), + dict_size, + fastcover.split_point, + fastcover.accel, + fastcover.d_candidates.as_slice(), + fastcover.f_candidates.as_slice(), + fastcover.k_candidates.as_slice(), + ) + } else { + let params = FastCoverParams { + k: fastcover.k, + d: fastcover.d, + f: fastcover.f, + accel: fastcover.accel, + }; + ( + fastcover::train_fastcover_raw(sample.as_slice(), dict_size, params), + FastCoverTuned { + k: fastcover.k, + d: fastcover.d, + f: fastcover.f, + accel: fastcover.accel, + score: 0, + }, + ) + }; + + let finalized = finalize_raw_dict(raw_dict.as_slice(), sample.as_slice(), dict_size, finalize)?; + output.write_all(finalized.as_slice())?; + Ok(tuned) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::decoding::Dictionary; + use crate::encoding::{CompressionLevel, FrameCompressor}; + use std::io::Cursor; + + fn training_data() -> Vec { + let mut data = Vec::new(); + for i in 0..512u32 { + data.extend_from_slice( + format!("tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbcccccdddddeeeee\n") + .as_bytes(), + ); + } + data + } + + #[test] + fn finalize_raw_dict_roundtrips_with_ffi_decoder() { + let sample = training_data(); + let raw = fastcover::train_fastcover_raw( + sample.as_slice(), + 4096, + FastCoverParams { + k: 256, + d: 8, + f: 20, + accel: 1, + }, + ); + let finalized = finalize_raw_dict( + raw.as_slice(), + sample.as_slice(), + 4096, + FinalizeOptions::default(), + ) + .expect("finalization should succeed"); + let parsed = Dictionary::decode_dict(finalized.as_slice()) + .expect("finalized dictionary should parse"); + assert!(!parsed.dict_content.is_empty()); + + let mut payload = Vec::new(); + for idx in 0..96u32 { + payload.extend_from_slice( + format!("tenant=demo op=put key={idx} value=aaaaabbbbbcccccdddddeeeee\n").as_bytes(), + ); + } + + let mut compressed = Vec::new(); + let mut compressor = FrameCompressor::new(CompressionLevel::Fastest); + compressor + .set_dictionary(parsed) + .expect("dictionary should attach"); + compressor.set_source(payload.as_slice()); + compressor.set_drain(&mut compressed); + compressor.compress(); + + let mut ffi_decoder = zstd::bulk::Decompressor::with_dictionary(finalized.as_slice()) + .expect("ffi decoder should accept finalized dictionary"); + let mut decoded = Vec::with_capacity(payload.len()); + let written = ffi_decoder + .decompress_to_buffer(compressed.as_slice(), &mut decoded) + .expect("ffi decoder should decode payload"); + assert_eq!(written, payload.len()); + assert_eq!(decoded, payload); + } + + #[test] + fn create_fastcover_dict_from_source_writes_non_empty_output() { + let sample = training_data(); + let mut out = Vec::new(); + let tuned = create_fastcover_dict_from_source( + Cursor::new(sample.as_slice()), + &mut out, + 4096, + &FastCoverOptions::default(), + FinalizeOptions::default(), + ) + .expect("fastcover+finalize should succeed"); + assert!(!out.is_empty()); + assert!(tuned.k > 0); + assert!(tuned.d > 0); + } +} From c6b3fdf82dc1451207f5385f22d982dcad21954d Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 5 Apr 2026 20:40:11 +0300 Subject: [PATCH 04/14] perf(bench): add rust fastcover vs ffi training delta - extend compare_ffi with dict-train stage and REPORT_DICT_TRAIN lines - parse and render dictionary training section in benchmark reports - require dict_builder feature for compare_ffi benchmark and pipeline command - document new benchmark stage and speed-delta fallback behavior --- .github/scripts/run-benchmarks.sh | 79 ++++++++++++++++++-- BENCHMARKS.md | 14 ++-- zstd/Cargo.toml | 1 + zstd/benches/compare_ffi.rs | 116 ++++++++++++++++++++++++++++-- 4 files changed, 194 insertions(+), 16 deletions(-) diff --git a/.github/scripts/run-benchmarks.sh b/.github/scripts/run-benchmarks.sh index a467b2a1..f3079579 100755 --- a/.github/scripts/run-benchmarks.sh +++ b/.github/scripts/run-benchmarks.sh @@ -17,7 +17,7 @@ BENCH_RAW_FILE="$(mktemp -t structured-zstd-bench-raw.XXXXXX)" trap 'rm -f "$BENCH_RAW_FILE"' EXIT export STRUCTURED_ZSTD_EMIT_REPORT=1 -cargo bench --bench compare_ffi -p structured-zstd -- --output-format bencher | tee "$BENCH_RAW_FILE" +cargo bench --bench compare_ffi -p structured-zstd --features dict_builder -- --output-format bencher | tee "$BENCH_RAW_FILE" echo "Parsing results..." >&2 @@ -38,6 +38,9 @@ MEM_RE = re.compile( DICT_RE = re.compile( r'^REPORT_DICT scenario=(\S+) label="((?:[^"\\]|\\.)+)" level=(\S+) dict_bytes=(\d+) train_ms=([0-9.]+) ffi_no_dict_bytes=(\d+) ffi_with_dict_bytes=(\d+) ffi_no_dict_ratio=([0-9.]+) ffi_with_dict_ratio=([0-9.]+)$' ) +DICT_TRAIN_RE = re.compile( + r'^REPORT_DICT_TRAIN scenario=(\S+) label="((?:[^"\\]|\\.)+)" dict_bytes_requested=(\d+) rust_train_ms=([0-9.]+) ffi_train_ms=([0-9.]+) rust_dict_bytes=(\d+) ffi_dict_bytes=(\d+) rust_fastcover_score=(\d+)$' +) def unescape_report_label(value): output = [] @@ -71,6 +74,7 @@ timings = [] ratios = [] memory_rows = [] dictionary_rows = [] +dictionary_training_rows = [] timing_rows = [] scenario_input_bytes = {} raw_path = os.environ["BENCH_RAW_FILE"] @@ -104,6 +108,14 @@ def parse_benchmark_name(name): "source": None, "implementation": parts[4], } + if len(parts) == 5 and parts[0] == "dict-train" and parts[3] == "matrix": + return { + "stage": "dict-train", + "level": parts[1], + "scenario": parts[2], + "source": None, + "implementation": parts[4], + } raise ValueError(f"Unsupported benchmark name format: {name} (parts={parts})") def canonical_key(stage, scenario, level, source): @@ -227,6 +239,38 @@ with open(raw_path) as f: "ffi_no_dict_ratio": float(ffi_no_dict_ratio), "ffi_with_dict_ratio": float(ffi_with_dict_ratio), }) + continue + + dict_train_match = DICT_TRAIN_RE.match(line) + if dict_train_match: + ( + scenario, + label, + dict_bytes_requested, + rust_train_ms, + ffi_train_ms, + rust_dict_bytes, + ffi_dict_bytes, + rust_fastcover_score, + ) = dict_train_match.groups() + label = unescape_report_label(label) + delta = None + rust_train_ms_float = float(rust_train_ms) + ffi_train_ms_float = float(ffi_train_ms) + if rust_train_ms_float > 0.0: + delta = ffi_train_ms_float / rust_train_ms_float + dictionary_training_rows.append({ + "scenario": scenario, + "label": label, + "dict_bytes_requested": int(dict_bytes_requested), + "rust_train_ms": rust_train_ms_float, + "ffi_train_ms": ffi_train_ms_float, + "rust_dict_bytes": int(rust_dict_bytes), + "ffi_dict_bytes": int(ffi_dict_bytes), + "rust_fastcover_score": int(rust_fastcover_score), + "delta_rust_over_ffi": delta, + "status": classify_speed_delta(delta), + }) if not benchmark_results: print("ERROR: No benchmark results parsed!", file=sys.stderr) @@ -246,6 +290,12 @@ if not memory_rows: if not dictionary_rows: print("WARN: No REPORT_DICT lines parsed; dictionary section will be empty.", file=sys.stderr) +if not dictionary_training_rows: + print( + "WARN: No REPORT_DICT_TRAIN lines parsed; dictionary training section will be empty.", + file=sys.stderr, + ) + with open("benchmark-results.json", "w") as f: json.dump(benchmark_results, f, indent=2) @@ -325,7 +375,11 @@ for key in all_keys: speed_delta = ( rust_bps / ffi_bps if (rust_bps is not None and ffi_bps is not None and ffi_bps > 0.0) - else None + else ( + ffi_ms / rust_ms + if (rust_ms is not None and ffi_ms is not None and rust_ms > 0.0) + else None + ) ) has_comparable_ratio = ( @@ -368,7 +422,7 @@ for key in all_keys: "delta_low": DELTA_LOW, "delta_high": DELTA_HIGH, }, - "interpretation": "delta>1 means Rust faster than FFI; delta<1 means slower", + "interpretation": "delta>1 means Rust faster than FFI; throughput ratio uses rust_bytes_per_sec/ffi_bytes_per_sec when available, otherwise fallback is ffi_ms_per_iter/rust_ms_per_iter", }, } ) @@ -421,6 +475,22 @@ for row in sorted(dictionary_rows, key=lambda item: (item["scenario"], item["lev f'| {row["scenario"]} | {label} | {row["level"]} | {row["dict_bytes"]} | {row["train_ms"]:.3f} | {row["ffi_no_dict_bytes"]} | {row["ffi_with_dict_bytes"]} | {row["ffi_no_dict_ratio"]:.4f} | {row["ffi_with_dict_ratio"]:.4f} |' ) +lines.extend([ + "", + "## Dictionary Training (Rust FastCOVER vs C FFI)", + "", + "| Scenario | Label | Dict bytes (requested) | Rust train ms | C train ms | Rust dict bytes | C dict bytes | Rust FastCOVER score | Delta (C/Rust) | Status |", + "| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |", +]) + +for row in sorted(dictionary_training_rows, key=lambda item: item["scenario"]): + label = markdown_table_escape(row["label"]) + delta = row["delta_rust_over_ffi"] + delta_cell = f"{delta:.4f}" if delta is not None else "n/a" + lines.append( + f'| {row["scenario"]} | {label} | {row["dict_bytes_requested"]} | {row["rust_train_ms"]:.3f} | {row["ffi_train_ms"]:.3f} | {row["rust_dict_bytes"]} | {row["ffi_dict_bytes"]} | {row["rust_fastcover_score"]} | {delta_cell} | {row["status"]} |' + ) + lines.extend([ "", "## Timing Metrics", @@ -502,7 +572,7 @@ delta_lines.extend( "", "## Speed pack", "", - "Interpretation: higher speed is better (`rust_bytes_per_sec / ffi_bytes_per_sec`).", + "Interpretation: higher speed is better; delta uses `rust_bytes_per_sec / ffi_bytes_per_sec` when throughput exists, otherwise fallback is `ffi_ms_per_iter / rust_ms_per_iter`.", "", "### Rust speed", "", @@ -564,6 +634,7 @@ print(f"Wrote {len(benchmark_results)} timing results to benchmark-results.json" print(f"Wrote {len(ratios)} ratio rows to benchmark-report.md", file=sys.stderr) print(f"Wrote {len(memory_rows)} memory rows to benchmark-report.md", file=sys.stderr) print(f"Wrote {len(dictionary_rows)} dictionary rows to benchmark-report.md", file=sys.stderr) +print(f"Wrote {len(dictionary_training_rows)} dictionary training rows to benchmark-report.md", file=sys.stderr) print(f"Wrote {len(delta_rows)} canonical rows to benchmark-delta.json", file=sys.stderr) print(f"Wrote {len(delta_rows)} canonical rows to benchmark-delta.md", file=sys.stderr) PYEOF diff --git a/BENCHMARKS.md b/BENCHMARKS.md index cd6a6093..23ef5fa5 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -38,9 +38,10 @@ encoder: - `structured-zstd::Better` vs `zstd` level `7` - `structured-zstd::Best` vs `zstd` level `11` -Dictionary benchmarks are tracked separately with C FFI `with_dict` vs `without_dict` runs, using a -dictionary trained from scenario samples. Pure Rust dictionary compression is still pending and is -therefore not part of the pure-Rust-vs-C timing matrix yet. +Dictionary benchmarks currently include: + +- C FFI `with_dict` vs `without_dict` compression runs +- dictionary training timing comparison (`dict-train`) between Rust FastCOVER and C FFI trainer ## Issue #24 Acceptance Mapping @@ -55,7 +56,7 @@ therefore not part of the pure-Rust-vs-C timing matrix yet. Run the full Criterion matrix: ```bash -cargo bench --bench compare_ffi -p structured-zstd -- --output-format bencher +cargo bench --bench compare_ffi -p structured-zstd --features dict_builder -- --output-format bencher ``` Generate the CI-style JSON and markdown report locally: @@ -85,6 +86,7 @@ bash scripts/bench-flamegraph.sh decompress/default/decodecorpus-z000033/rust_st - compression ratio tables (`REPORT`) - input+output buffer size estimate tables (`REPORT_MEM`) - dictionary compression tables (`REPORT_DICT`) + - dictionary training comparison tables (`REPORT_DICT_TRAIN`) - timing rows for all benchmark functions - `benchmark-delta.json` with canonical `(scenario + params)` rows including: - raw Rust/FFI ratio values and `rust/ffi` ratio delta @@ -96,7 +98,9 @@ bash scripts/bench-flamegraph.sh decompress/default/decodecorpus-z000033/rust_st Delta interpretation (direct same-run comparison on the same environment): - **Ratio delta** (`rust_ratio / ffi_ratio`): lower is better for Rust -- **Speed delta** (`rust_bytes_per_sec / ffi_bytes_per_sec`): higher is better for Rust +- **Speed delta**: higher is better for Rust + - throughput form: `rust_bytes_per_sec / ffi_bytes_per_sec` + - fallback form (when throughput is unavailable): `ffi_ms_per_iter / rust_ms_per_iter` Status labels in `benchmark-delta` are derived directly from the same-run deltas (no environment calibration/pre-test coefficients): diff --git a/zstd/Cargo.toml b/zstd/Cargo.toml index a0a6c4b5..d52c7021 100644 --- a/zstd/Cargo.toml +++ b/zstd/Cargo.toml @@ -52,6 +52,7 @@ harness = false [[bench]] name = "compare_ffi" harness = false +required-features = ["dict_builder"] [[bench]] name = "bitstream" diff --git a/zstd/benches/compare_ffi.rs b/zstd/benches/compare_ffi.rs index 2ed5caaa..2c4a5255 100644 --- a/zstd/benches/compare_ffi.rs +++ b/zstd/benches/compare_ffi.rs @@ -13,9 +13,11 @@ mod support; use criterion::{Criterion, SamplingMode, Throughput, criterion_group, criterion_main}; use std::hint::black_box; +use std::io::Cursor; use std::sync::OnceLock; use std::time::{Duration, Instant}; use structured_zstd::decoding::FrameDecoder; +use structured_zstd::dictionary::{FastCoverOptions, create_fastcover_raw_dict_from_source}; use support::{LevelConfig, Scenario, ScenarioClass, benchmark_scenarios, supported_levels}; static BENCHMARK_SCENARIOS: OnceLock> = OnceLock::new(); @@ -195,12 +197,34 @@ fn bench_dictionary(c: &mut Criterion) { let training_samples = split_training_samples(&scenario.bytes); let sample_refs: Vec<&[u8]> = training_samples.iter().map(Vec::as_slice).collect(); + let training_blob: Vec = training_samples.concat(); let total_training_bytes = sample_refs.iter().map(|sample| sample.len()).sum::(); let dict_size = dictionary_size_for(scenario.len()) .min(total_training_bytes.saturating_sub(64)) .max(256); - let train_started = Instant::now(); - let Ok(dictionary) = zstd::dict::from_samples(&sample_refs, dict_size) else { + let fastcover_options = fastcover_fixed_options(); + + let rust_train_started = Instant::now(); + let mut rust_dictionary = Vec::new(); + let Ok(rust_tuned) = create_fastcover_raw_dict_from_source( + Cursor::new(training_blob.as_slice()), + &mut rust_dictionary, + dict_size, + &fastcover_options, + ) else { + eprintln!( + "BENCH_WARN skipping Rust FastCOVER dictionary benchmark for {} (samples={}, total_training_bytes={}, dict_size={})", + scenario.id, + sample_refs.len(), + total_training_bytes, + dict_size + ); + continue; + }; + let rust_train_ms = rust_train_started.elapsed().as_secs_f64() * 1_000.0; + + let ffi_train_started = Instant::now(); + let Ok(ffi_dictionary) = zstd::dict::from_samples(&sample_refs, dict_size) else { eprintln!( "BENCH_WARN skipping dictionary benchmark for {} (samples={}, total_training_bytes={}, dict_size={})", scenario.id, @@ -210,20 +234,63 @@ fn bench_dictionary(c: &mut Criterion) { ); continue; }; - let train_ms = train_started.elapsed().as_secs_f64() * 1_000.0; + let ffi_train_ms = ffi_train_started.elapsed().as_secs_f64() * 1_000.0; + + if emit_reports { + emit_dictionary_training_report( + scenario, + dict_size, + rust_train_ms, + ffi_train_ms, + rust_dictionary.len(), + ffi_dictionary.len(), + rust_tuned.score, + ); + } + + let benchmark_name = format!("dict-train/na/{}/{}", scenario.id, "matrix"); + let mut group = c.benchmark_group(benchmark_name); + configure_group(&mut group, scenario); + group.throughput(Throughput::Bytes(total_training_bytes as u64)); + + group.bench_function("pure_rust", |b| { + b.iter(|| { + let mut out = Vec::new(); + let tuned = create_fastcover_raw_dict_from_source( + Cursor::new(training_blob.as_slice()), + &mut out, + dict_size, + &fastcover_options, + ) + .expect("fastcover training should succeed"); + black_box((out.len(), tuned.score)); + }) + }); + + group.bench_function("c_ffi", |b| { + b.iter(|| { + black_box( + zstd::dict::from_samples(&sample_refs, dict_size) + .expect("ffi dictionary training should succeed") + .len(), + ) + }) + }); + + group.finish(); for level in supported_levels() { let mut no_dict = zstd::bulk::Compressor::new(level.ffi_level).unwrap(); let mut with_dict = - zstd::bulk::Compressor::with_dictionary(level.ffi_level, &dictionary).unwrap(); + zstd::bulk::Compressor::with_dictionary(level.ffi_level, &ffi_dictionary).unwrap(); let no_dict_bytes = no_dict.compress(&scenario.bytes).unwrap(); let with_dict_bytes = with_dict.compress(&scenario.bytes).unwrap(); if emit_reports { emit_dictionary_report( scenario, level, - dictionary.len(), - train_ms, + ffi_dictionary.len(), + ffi_train_ms, &no_dict_bytes, &with_dict_bytes, ); @@ -242,7 +309,8 @@ fn bench_dictionary(c: &mut Criterion) { group.bench_function("c_ffi_with_dict", |b| { let mut compressor = - zstd::bulk::Compressor::with_dictionary(level.ffi_level, &dictionary).unwrap(); + zstd::bulk::Compressor::with_dictionary(level.ffi_level, &ffi_dictionary) + .unwrap(); b.iter(|| black_box(compressor.compress(&scenario.bytes).unwrap())) }); @@ -355,6 +423,29 @@ fn emit_dictionary_report( ); } +fn emit_dictionary_training_report( + scenario: &Scenario, + dict_bytes_requested: usize, + rust_train_ms: f64, + ffi_train_ms: f64, + rust_dict_bytes: usize, + ffi_dict_bytes: usize, + rust_fastcover_score: usize, +) { + let escaped_label = escape_report_label(&scenario.label); + println!( + "REPORT_DICT_TRAIN scenario={} label=\"{}\" dict_bytes_requested={} rust_train_ms={:.3} ffi_train_ms={:.3} rust_dict_bytes={} ffi_dict_bytes={} rust_fastcover_score={}", + scenario.id, + escaped_label, + dict_bytes_requested, + rust_train_ms, + ffi_train_ms, + rust_dict_bytes, + ffi_dict_bytes, + rust_fastcover_score + ); +} + fn split_training_samples(source: &[u8]) -> Vec> { let sample_size = source.len().div_ceil(16).clamp(256, 8192); let mut samples: Vec> = source @@ -384,6 +475,17 @@ fn dictionary_size_for(input_len: usize) -> usize { input_len.div_ceil(8).clamp(256, 16 * 1024) } +fn fastcover_fixed_options() -> FastCoverOptions { + FastCoverOptions { + optimize: false, + accel: 4, + k: 256, + d: 8, + f: 20, + ..FastCoverOptions::default() + } +} + fn escape_report_label(label: &str) -> String { label.replace('\\', "\\\\").replace('\"', "\\\"") } From 29988b0b2576ad52a92aa40e0dcedba015b69f7f Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 5 Apr 2026 21:17:42 +0300 Subject: [PATCH 05/14] fix(dict): align fastcover benchmarks and thread feedback - treat source_size as hint and document in-memory training behavior - fix epoch sizing units, bound huffman stats sampling, and share fastcover training helper - add k-candidate fallback + regression tests - rename dict-train delta field to match C/Rust ratio semantics --- .github/scripts/run-benchmarks.sh | 4 +- zstd/benches/compare_ffi.rs | 21 ++-- zstd/src/dictionary/cover.rs | 4 +- zstd/src/dictionary/fastcover.rs | 22 +++-- zstd/src/dictionary/mod.rs | 153 ++++++++++++++++-------------- 5 files changed, 110 insertions(+), 94 deletions(-) diff --git a/.github/scripts/run-benchmarks.sh b/.github/scripts/run-benchmarks.sh index f3079579..ef330f0d 100755 --- a/.github/scripts/run-benchmarks.sh +++ b/.github/scripts/run-benchmarks.sh @@ -268,7 +268,7 @@ with open(raw_path) as f: "rust_dict_bytes": int(rust_dict_bytes), "ffi_dict_bytes": int(ffi_dict_bytes), "rust_fastcover_score": int(rust_fastcover_score), - "delta_rust_over_ffi": delta, + "delta_ffi_over_rust": delta, "status": classify_speed_delta(delta), }) @@ -485,7 +485,7 @@ lines.extend([ for row in sorted(dictionary_training_rows, key=lambda item: item["scenario"]): label = markdown_table_escape(row["label"]) - delta = row["delta_rust_over_ffi"] + delta = row["delta_ffi_over_rust"] delta_cell = f"{delta:.4f}" if delta is not None else "n/a" lines.append( f'| {row["scenario"]} | {label} | {row["dict_bytes_requested"]} | {row["rust_train_ms"]:.3f} | {row["ffi_train_ms"]:.3f} | {row["rust_dict_bytes"]} | {row["ffi_dict_bytes"]} | {row["rust_fastcover_score"]} | {delta_cell} | {row["status"]} |' diff --git a/zstd/benches/compare_ffi.rs b/zstd/benches/compare_ffi.rs index 2c4a5255..b9c290f7 100644 --- a/zstd/benches/compare_ffi.rs +++ b/zstd/benches/compare_ffi.rs @@ -13,11 +13,10 @@ mod support; use criterion::{Criterion, SamplingMode, Throughput, criterion_group, criterion_main}; use std::hint::black_box; -use std::io::Cursor; use std::sync::OnceLock; use std::time::{Duration, Instant}; use structured_zstd::decoding::FrameDecoder; -use structured_zstd::dictionary::{FastCoverOptions, create_fastcover_raw_dict_from_source}; +use structured_zstd::dictionary::{FastCoverOptions, train_fastcover_raw_from_slice}; use support::{LevelConfig, Scenario, ScenarioClass, benchmark_scenarios, supported_levels}; static BENCHMARK_SCENARIOS: OnceLock> = OnceLock::new(); @@ -205,13 +204,9 @@ fn bench_dictionary(c: &mut Criterion) { let fastcover_options = fastcover_fixed_options(); let rust_train_started = Instant::now(); - let mut rust_dictionary = Vec::new(); - let Ok(rust_tuned) = create_fastcover_raw_dict_from_source( - Cursor::new(training_blob.as_slice()), - &mut rust_dictionary, - dict_size, - &fastcover_options, - ) else { + let Ok((rust_dictionary, rust_tuned)) = + train_fastcover_raw_from_slice(training_blob.as_slice(), dict_size, &fastcover_options) + else { eprintln!( "BENCH_WARN skipping Rust FastCOVER dictionary benchmark for {} (samples={}, total_training_bytes={}, dict_size={})", scenario.id, @@ -255,15 +250,13 @@ fn bench_dictionary(c: &mut Criterion) { group.bench_function("pure_rust", |b| { b.iter(|| { - let mut out = Vec::new(); - let tuned = create_fastcover_raw_dict_from_source( - Cursor::new(training_blob.as_slice()), - &mut out, + let (dict, tuned) = train_fastcover_raw_from_slice( + training_blob.as_slice(), dict_size, &fastcover_options, ) .expect("fastcover training should succeed"); - black_box((out.len(), tuned.score)); + black_box((dict.len(), tuned.score)); }) }); diff --git a/zstd/src/dictionary/cover.rs b/zstd/src/dictionary/cover.rs index fbcd3660..e2794ffa 100644 --- a/zstd/src/dictionary/cover.rs +++ b/zstd/src/dictionary/cover.rs @@ -70,9 +70,7 @@ pub fn pick_best_segment( epoch: &'_ [u8], collection_sample: &'_ [u8], ) -> Segment { - let mut segments = epoch - .chunks(params.segment_size as usize) - .peekable(); + let mut segments = epoch.chunks(params.segment_size as usize).peekable(); let mut best_segment: &[u8] = segments.peek().expect("at least one segment"); let mut top_segment_score: usize = 0; // Iterate over segments and score each segment, keeping track of the best segment diff --git a/zstd/src/dictionary/fastcover.rs b/zstd/src/dictionary/fastcover.rs index ea4dffc4..0da5c631 100644 --- a/zstd/src/dictionary/fastcover.rs +++ b/zstd/src/dictionary/fastcover.rs @@ -121,11 +121,7 @@ fn coverage_score(dict: &[u8], eval: &[u8], d: usize, accel: usize) -> usize { hits } -pub fn train_fastcover_raw( - sample: &[u8], - dict_size: usize, - params: FastCoverParams, -) -> Vec { +pub fn train_fastcover_raw(sample: &[u8], dict_size: usize, params: FastCoverParams) -> Vec { build_raw_dict(sample, dict_size, params) } @@ -153,6 +149,11 @@ pub fn optimize_fastcover_raw( } else { f_candidates }; + let k_candidates = if k_values.is_empty() { + DEFAULT_K_CANDIDATES + } else { + k_values + }; let mut best_dict = Vec::new(); let mut best = FastCoverTuned { @@ -165,7 +166,7 @@ pub fn optimize_fastcover_raw( for &f in f_values { for &d in d_values { - for &k in k_values { + for &k in k_candidates { let params = FastCoverParams { k, d, f, accel }; let dict = build_raw_dict(train, dict_size, params); let score = coverage_score(dict.as_slice(), eval, d, accel); @@ -233,4 +234,13 @@ mod tests { assert!([18, 20].contains(&tuned.f)); assert!([128, 256].contains(&tuned.k)); } + + #[test] + fn fastcover_optimizer_falls_back_when_k_candidates_empty() { + let sample = corpus(); + let (dict, tuned) = + optimize_fastcover_raw(sample.as_slice(), 4096, 0.75, 1, &[6, 8], &[18, 20], &[]); + assert!(!dict.is_empty()); + assert!(DEFAULT_K_CANDIDATES.contains(&tuned.k)); + } } diff --git a/zstd/src/dictionary/mod.rs b/zstd/src/dictionary/mod.rs index 535820ec..274bf541 100644 --- a/zstd/src/dictionary/mod.rs +++ b/zstd/src/dictionary/mod.rs @@ -29,10 +29,10 @@ mod reservoir; use crate::bit_io::BitWriter; use crate::decoding::dictionary::MAGIC_NUM as DICT_MAGIC_NUM; +use crate::dictionary::reservoir::create_sample; use crate::fse::fse_encoder; use crate::huff0::HuffmanTable as HuffmanDecoderTable; use crate::huff0::huff0_encoder::{HuffmanEncoder, HuffmanTable as HuffmanEncoderTable}; -use crate::dictionary::reservoir::create_sample; use core::cmp::Reverse; use cover::*; pub use fastcover::{ @@ -49,6 +49,9 @@ use std::{ vec::Vec, }; +const MAX_TRAINING_PREALLOC_BYTES: usize = 8 * 1024 * 1024; +const MAX_HUFFMAN_STATS_BYTES: usize = 64 * 1024; + /// Tuning knobs for pure-Rust FastCOVER training. #[derive(Debug, Clone)] pub struct FastCoverOptions { @@ -168,17 +171,20 @@ pub fn create_raw_dict_from_dir, W: io::Write>( /// - `dict_size` determines how large the complete dictionary should be. The completed /// dictionary will be this size or smaller. /// -/// This function uses `BufRead` internally, the provided reader need not be buffered. +/// This function reads the entire `source` into an in-memory `Vec` before building +/// the dictionary. The provided reader need not be buffered, but callers should avoid +/// sources too large to fit comfortably in memory. pub fn create_raw_dict_from_source( mut source: R, source_size: usize, output: &mut W, dict_size: usize, ) { - if source_size == 0 || dict_size == 0 { + if dict_size == 0 { return; } - let mut all = Vec::with_capacity(source_size); + let prealloc = source_size.min(MAX_TRAINING_PREALLOC_BYTES); + let mut all = Vec::with_capacity(prealloc); source .read_to_end(&mut all) .expect("can read full source for dictionary training"); @@ -216,8 +222,8 @@ pub fn create_raw_dict_from_source( // Reverse is used because we want a min heap, where // the lowest scoring items come first let mut pool: BinaryHeap> = BinaryHeap::new(); - let (_, epoch_size) = compute_epoch_info(¶ms, dict_size, source_size / K); - let num_epochs = source_size / epoch_size; + let (num_epochs, epoch_size_kmers) = compute_epoch_info(¶ms, dict_size, source_size / K); + let epoch_size = usize::max(K, epoch_size_kmers.saturating_mul(K)); vprintln!("create_dict: computed epoch info, using {num_epochs} epochs of {epoch_size} bytes"); let mut epoch_counter = 0; let mut ctx = Context { @@ -250,11 +256,25 @@ pub fn create_raw_dict_from_source( } fn serialize_huffman_table(sample_data: &[u8], raw_content: &[u8]) -> io::Result> { - let mut stats = if sample_data.len() >= 2 { - sample_data.to_vec() + fn bounded_huffman_stats(data: &[u8]) -> Vec { + if data.len() <= MAX_HUFFMAN_STATS_BYTES { + return data.to_vec(); + } + + let mut stats = Vec::with_capacity(MAX_HUFFMAN_STATS_BYTES); + for i in 0..MAX_HUFFMAN_STATS_BYTES { + let idx = i * data.len() / MAX_HUFFMAN_STATS_BYTES; + stats.push(data[idx]); + } + stats + } + + let source = if sample_data.len() >= 2 { + sample_data } else { - raw_content.to_vec() + raw_content }; + let mut stats = bounded_huffman_stats(source); if stats.len() < 2 || stats.iter().all(|b| *b == stats[0]) { stats = (0u8..=255).collect(); } @@ -304,7 +324,9 @@ pub fn finalize_raw_dict( } let mut out = Vec::with_capacity(dict_size.max(256)); out.extend_from_slice(&DICT_MAGIC_NUM); - let dict_id = options.dict_id.unwrap_or_else(|| derive_dict_id(raw_content)); + let dict_id = options + .dict_id + .unwrap_or_else(|| derive_dict_id(raw_content)); out.extend_from_slice(&dict_id.to_le_bytes()); out.extend_from_slice(serialize_huffman_table(sample_data, raw_content)?.as_slice()); out.extend_from_slice(serialize_fse_table(&fse_encoder::default_of_table()).as_slice()); @@ -338,32 +360,21 @@ pub fn finalize_raw_dict( } /// Train a raw FastCOVER dictionary from a source stream. -pub fn create_fastcover_raw_dict_from_source( - mut source: R, - output: &mut W, +fn train_fastcover_internal( + sample: &[u8], dict_size: usize, options: &FastCoverOptions, -) -> io::Result { - let mut sample = Vec::new(); - source.read_to_end(&mut sample)?; - if sample.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "source stream is empty", - )); - } - - let (dict, tuned) = if options.optimize { - let (dict, tuned) = fastcover::optimize_fastcover_raw( - sample.as_slice(), +) -> (Vec, FastCoverTuned) { + if options.optimize { + fastcover::optimize_fastcover_raw( + sample, dict_size, options.split_point, options.accel, options.d_candidates.as_slice(), options.f_candidates.as_slice(), options.k_candidates.as_slice(), - ); - (dict, tuned) + ) } else { let params = FastCoverParams { k: options.k, @@ -371,9 +382,8 @@ pub fn create_fastcover_raw_dict_from_source( f: options.f, accel: options.accel, }; - let dict = fastcover::train_fastcover_raw(sample.as_slice(), dict_size, params); ( - dict, + fastcover::train_fastcover_raw(sample, dict_size, params), FastCoverTuned { k: options.k, d: options.d, @@ -382,7 +392,34 @@ pub fn create_fastcover_raw_dict_from_source( score: 0, }, ) - }; + } +} + +/// Train a raw FastCOVER dictionary directly from an in-memory sample. +pub fn train_fastcover_raw_from_slice( + sample: &[u8], + dict_size: usize, + options: &FastCoverOptions, +) -> io::Result<(Vec, FastCoverTuned)> { + if sample.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "source stream is empty", + )); + } + Ok(train_fastcover_internal(sample, dict_size, options)) +} + +/// Train a raw FastCOVER dictionary from a source stream. +pub fn create_fastcover_raw_dict_from_source( + mut source: R, + output: &mut W, + dict_size: usize, + options: &FastCoverOptions, +) -> io::Result { + let mut sample = Vec::new(); + source.read_to_end(&mut sample)?; + let (dict, tuned) = train_fastcover_raw_from_slice(sample.as_slice(), dict_size, options)?; output.write_all(dict.as_slice())?; Ok(tuned) } @@ -397,41 +434,8 @@ pub fn create_fastcover_dict_from_source( ) -> io::Result { let mut sample = Vec::new(); source.read_to_end(&mut sample)?; - if sample.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "source stream is empty", - )); - } - - let (raw_dict, tuned) = if fastcover.optimize { - fastcover::optimize_fastcover_raw( - sample.as_slice(), - dict_size, - fastcover.split_point, - fastcover.accel, - fastcover.d_candidates.as_slice(), - fastcover.f_candidates.as_slice(), - fastcover.k_candidates.as_slice(), - ) - } else { - let params = FastCoverParams { - k: fastcover.k, - d: fastcover.d, - f: fastcover.f, - accel: fastcover.accel, - }; - ( - fastcover::train_fastcover_raw(sample.as_slice(), dict_size, params), - FastCoverTuned { - k: fastcover.k, - d: fastcover.d, - f: fastcover.f, - accel: fastcover.accel, - score: 0, - }, - ) - }; + let (raw_dict, tuned) = + train_fastcover_raw_from_slice(sample.as_slice(), dict_size, fastcover)?; let finalized = finalize_raw_dict(raw_dict.as_slice(), sample.as_slice(), dict_size, finalize)?; output.write_all(finalized.as_slice())?; @@ -449,8 +453,10 @@ mod tests { let mut data = Vec::new(); for i in 0..512u32 { data.extend_from_slice( - format!("tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbcccccdddddeeeee\n") - .as_bytes(), + format!( + "tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbcccccdddddeeeee\n" + ) + .as_bytes(), ); } data @@ -483,7 +489,8 @@ mod tests { let mut payload = Vec::new(); for idx in 0..96u32 { payload.extend_from_slice( - format!("tenant=demo op=put key={idx} value=aaaaabbbbbcccccdddddeeeee\n").as_bytes(), + format!("tenant=demo op=put key={idx} value=aaaaabbbbbcccccdddddeeeee\n") + .as_bytes(), ); } @@ -522,4 +529,12 @@ mod tests { assert!(tuned.k > 0); assert!(tuned.d > 0); } + + #[test] + fn create_raw_dict_from_source_treats_source_size_as_hint() { + let sample = training_data(); + let mut out = Vec::new(); + create_raw_dict_from_source(Cursor::new(sample.as_slice()), 0, &mut out, 1024); + assert!(!out.is_empty()); + } } From be47b2df0bb3637103482c21ed8038d2efb775f4 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 5 Apr 2026 23:28:14 +0300 Subject: [PATCH 06/14] fix(dict): address remaining coderabbit findings --- .github/scripts/run-benchmarks.sh | 2 +- zstd/src/dictionary/fastcover.rs | 58 +++++++++++++++++++++++++++---- zstd/src/dictionary/mod.rs | 22 ++++++++++++ 3 files changed, 75 insertions(+), 7 deletions(-) diff --git a/.github/scripts/run-benchmarks.sh b/.github/scripts/run-benchmarks.sh index ef330f0d..9b16184a 100755 --- a/.github/scripts/run-benchmarks.sh +++ b/.github/scripts/run-benchmarks.sh @@ -352,7 +352,7 @@ for key in all_keys: scenario = meta["scenario"] if meta else key.split(" + ")[0] level = meta["level"] if meta else "unknown" source = meta["source"] if meta else None - input_bytes = scenario_input_bytes.get(scenario) + input_bytes = None if stage == "dict-train" else scenario_input_bytes.get(scenario) speed_series = {} for impl_name, impl_row in speed_index.get(key, {}).items(): diff --git a/zstd/src/dictionary/fastcover.rs b/zstd/src/dictionary/fastcover.rs index 0da5c631..a8c381da 100644 --- a/zstd/src/dictionary/fastcover.rs +++ b/zstd/src/dictionary/fastcover.rs @@ -134,11 +134,6 @@ pub fn optimize_fastcover_raw( f_candidates: &[u32], k_values: &[usize], ) -> (Vec, FastCoverTuned) { - let split = split_point.clamp(0.1, 0.95); - let split_idx = ((sample.len() as f64) * split) as usize; - let split_idx = split_idx.clamp(1, sample.len().saturating_sub(1)); - let (train, eval) = sample.split_at(split_idx); - let d_values = if d_candidates.is_empty() { DEFAULT_D_CANDIDATES } else { @@ -155,6 +150,35 @@ pub fn optimize_fastcover_raw( k_values }; + if sample.len() < 2 { + let params = FastCoverParams { + k: k_candidates[0], + d: d_values[0], + f: f_values[0], + accel, + }; + let mut dict = build_raw_dict(sample, dict_size, params); + if dict.is_empty() && dict_size > 0 { + let take = sample.len().min(dict_size); + dict.extend_from_slice(&sample[..take]); + } + return ( + dict, + FastCoverTuned { + k: params.k, + d: params.d, + f: params.f, + accel, + score: 0, + }, + ); + } + + let split = split_point.clamp(0.1, 0.95); + let split_idx = ((sample.len() as f64) * split) as usize; + let split_idx = split_idx.clamp(1, sample.len().saturating_sub(1)); + let (train, eval) = sample.split_at(split_idx); + let mut best_dict = Vec::new(); let mut best = FastCoverTuned { k: 0, @@ -170,7 +194,7 @@ pub fn optimize_fastcover_raw( let params = FastCoverParams { k, d, f, accel }; let dict = build_raw_dict(train, dict_size, params); let score = coverage_score(dict.as_slice(), eval, d, accel); - if score > best.score { + if best_dict.is_empty() || score > best.score { best.score = score; best.k = k; best.d = d; @@ -243,4 +267,26 @@ mod tests { assert!(!dict.is_empty()); assert!(DEFAULT_K_CANDIDATES.contains(&tuned.k)); } + + #[test] + fn fastcover_optimizer_handles_one_byte_sample_without_panic() { + let sample = [0xAB]; + let (dict, tuned) = optimize_fastcover_raw(&sample, 16, 0.75, 1, &[], &[], &[]); + assert!(!dict.is_empty()); + assert!(dict.len() <= 16); + assert!(DEFAULT_K_CANDIDATES.contains(&tuned.k)); + assert!(DEFAULT_D_CANDIDATES.contains(&tuned.d)); + assert!(DEFAULT_F_CANDIDATES.contains(&tuned.f)); + } + + #[test] + fn fastcover_optimizer_seeds_winner_when_all_scores_are_zero() { + let sample = b"abcdefghijklmnopqrst"; + let (dict, tuned) = optimize_fastcover_raw(sample, 16, 0.9, 1, &[6], &[16], &[8]); + assert!(!dict.is_empty()); + assert_eq!(tuned.k, 8); + assert_eq!(tuned.d, 6); + assert_eq!(tuned.f, 16); + assert_eq!(tuned.score, 0); + } } diff --git a/zstd/src/dictionary/mod.rs b/zstd/src/dictionary/mod.rs index 274bf541..c3bbc4ad 100644 --- a/zstd/src/dictionary/mod.rs +++ b/zstd/src/dictionary/mod.rs @@ -327,6 +327,12 @@ pub fn finalize_raw_dict( let dict_id = options .dict_id .unwrap_or_else(|| derive_dict_id(raw_content)); + if dict_id == 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "dictionary id must be non-zero", + )); + } out.extend_from_slice(&dict_id.to_le_bytes()); out.extend_from_slice(serialize_huffman_table(sample_data, raw_content)?.as_slice()); out.extend_from_slice(serialize_fse_table(&fse_encoder::default_of_table()).as_slice()); @@ -448,6 +454,7 @@ mod tests { use crate::decoding::Dictionary; use crate::encoding::{CompressionLevel, FrameCompressor}; use std::io::Cursor; + use std::string::ToString; fn training_data() -> Vec { let mut data = Vec::new(); @@ -537,4 +544,19 @@ mod tests { create_raw_dict_from_source(Cursor::new(sample.as_slice()), 0, &mut out, 1024); assert!(!out.is_empty()); } + + #[test] + fn finalize_raw_dict_rejects_zero_dict_id() { + let sample = training_data(); + let raw = b"raw-fastcover-bytes"; + let err = finalize_raw_dict( + raw, + sample.as_slice(), + 4096, + FinalizeOptions { dict_id: Some(0) }, + ) + .expect_err("dict_id=0 must be rejected"); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert_eq!(err.to_string(), "dictionary id must be non-zero"); + } } From 33972312e45fdae7a54c80dc74f3e85de17c8cb8 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 5 Apr 2026 23:34:48 +0300 Subject: [PATCH 07/14] test(dict): cover fastcover and finalize edge paths --- zstd/src/dictionary/fastcover.rs | 44 ++++++++++++++ zstd/src/dictionary/mod.rs | 101 +++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+) diff --git a/zstd/src/dictionary/fastcover.rs b/zstd/src/dictionary/fastcover.rs index a8c381da..f7a77f82 100644 --- a/zstd/src/dictionary/fastcover.rs +++ b/zstd/src/dictionary/fastcover.rs @@ -241,6 +241,19 @@ mod tests { assert!(dict.len() <= 4096); } + #[test] + fn fastcover_raw_returns_empty_for_empty_or_zero_budget() { + let sample = corpus(); + let params = FastCoverParams { + k: 256, + d: 8, + f: 20, + accel: 1, + }; + assert!(train_fastcover_raw(&[], 1024, params).is_empty()); + assert!(train_fastcover_raw(sample.as_slice(), 0, params).is_empty()); + } + #[test] fn fastcover_optimizer_selects_valid_params() { let sample = corpus(); @@ -289,4 +302,35 @@ mod tests { assert_eq!(tuned.f, 16); assert_eq!(tuned.score, 0); } + + #[test] + fn fastcover_optimizer_handles_zero_dict_budget() { + let sample = corpus(); + let (dict, tuned) = optimize_fastcover_raw( + sample.as_slice(), + 0, + 0.75, + 1, + &[6, 8], + &[18, 20], + &[128, 256], + ); + assert!(dict.is_empty()); + assert!([6, 8].contains(&tuned.d)); + assert!([18, 20].contains(&tuned.f)); + assert!([128, 256].contains(&tuned.k)); + } + + #[test] + fn fastcover_optimizer_clamps_extreme_split_points() { + let sample = corpus(); + let (dict_low, tuned_low) = + optimize_fastcover_raw(sample.as_slice(), 2048, 0.0, 1, &[6], &[18], &[128]); + let (dict_high, tuned_high) = + optimize_fastcover_raw(sample.as_slice(), 2048, 1.0, 1, &[6], &[18], &[128]); + assert!(!dict_low.is_empty()); + assert!(!dict_high.is_empty()); + assert_eq!(tuned_low.k, 128); + assert_eq!(tuned_high.k, 128); + } } diff --git a/zstd/src/dictionary/mod.rs b/zstd/src/dictionary/mod.rs index c3bbc4ad..ca6c3126 100644 --- a/zstd/src/dictionary/mod.rs +++ b/zstd/src/dictionary/mod.rs @@ -537,6 +537,43 @@ mod tests { assert!(tuned.d > 0); } + #[test] + fn create_fastcover_raw_dict_from_source_rejects_empty_source() { + let mut out = Vec::new(); + let err = create_fastcover_raw_dict_from_source( + Cursor::new(Vec::::new()), + &mut out, + 1024, + &FastCoverOptions::default(), + ) + .expect_err("empty source must be rejected"); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + } + + #[test] + fn create_fastcover_dict_from_source_propagates_finalize_error() { + let sample = training_data(); + let mut out = Vec::new(); + let err = create_fastcover_dict_from_source( + Cursor::new(sample.as_slice()), + &mut out, + 32, + &FastCoverOptions::default(), + FinalizeOptions::default(), + ) + .expect_err("too-small dictionary budget must fail during finalize"); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("dictionary size too small")); + } + + #[test] + fn create_raw_dict_from_source_early_returns_on_zero_dict_size() { + let sample = training_data(); + let mut out = Vec::new(); + create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 0); + assert!(out.is_empty()); + } + #[test] fn create_raw_dict_from_source_treats_source_size_as_hint() { let sample = training_data(); @@ -545,6 +582,70 @@ mod tests { assert!(!out.is_empty()); } + #[test] + fn create_raw_dict_from_source_handles_tiny_source_without_epochs() { + let sample = b"short"; + let mut out = Vec::new(); + create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 3); + assert_eq!(out, b"ort"); + } + + #[test] + fn train_fastcover_raw_from_slice_rejects_empty_sample() { + let err = train_fastcover_raw_from_slice(&[], 1024, &FastCoverOptions::default()) + .expect_err("empty sample must be rejected"); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + } + + #[test] + fn train_fastcover_raw_from_slice_supports_non_optimized_params() { + let sample = training_data(); + let options = FastCoverOptions { + optimize: false, + k: 128, + d: 6, + f: 18, + ..FastCoverOptions::default() + }; + let (dict, tuned) = + train_fastcover_raw_from_slice(sample.as_slice(), 2048, &options).expect("must train"); + assert!(!dict.is_empty()); + assert!(dict.len() <= 2048); + assert_eq!(tuned.k, 128); + assert_eq!(tuned.d, 6); + assert_eq!(tuned.f, 18); + assert_eq!(tuned.score, 0); + } + + #[test] + fn finalize_raw_dict_rejects_empty_raw_content() { + let sample = training_data(); + let err = finalize_raw_dict(&[], sample.as_slice(), 4096, FinalizeOptions::default()) + .expect_err("empty raw dictionary must be rejected"); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + } + + #[test] + fn finalize_raw_dict_rejects_too_small_budget() { + let sample = training_data(); + let raw = b"some-raw-bytes"; + let err = finalize_raw_dict(raw, sample.as_slice(), 32, FinalizeOptions::default()) + .expect_err("tiny dict_size must fail"); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert!(err.to_string().contains("dictionary size too small")); + } + + #[test] + fn finalize_raw_dict_pads_to_minimum_content_size() { + let sample = training_data(); + let raw = b"x"; + let finalized = finalize_raw_dict(raw, sample.as_slice(), 4096, FinalizeOptions::default()) + .expect("finalize should pad small raw content"); + let parsed = Dictionary::decode_dict(finalized.as_slice()).expect("finalized dict parses"); + assert!(parsed.dict_content.len() >= 8); + assert_eq!(parsed.dict_content.last(), Some(&b'x')); + } + #[test] fn finalize_raw_dict_rejects_zero_dict_id() { let sample = training_data(); From f83dc71045c1305fb973ae35d267e7ce4c134687 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Mon, 6 Apr 2026 00:29:31 +0300 Subject: [PATCH 08/14] fix(dict): align fastcover params and tiny-sample guards --- .github/scripts/run-benchmarks.sh | 3 +- zstd/src/dictionary/fastcover.rs | 39 ++++++++++++++++++------- zstd/src/dictionary/mod.rs | 48 +++++++++++++++++++++++++++---- 3 files changed, 73 insertions(+), 17 deletions(-) diff --git a/.github/scripts/run-benchmarks.sh b/.github/scripts/run-benchmarks.sh index 9b16184a..b14457b7 100755 --- a/.github/scripts/run-benchmarks.sh +++ b/.github/scripts/run-benchmarks.sh @@ -292,9 +292,10 @@ if not dictionary_rows: if not dictionary_training_rows: print( - "WARN: No REPORT_DICT_TRAIN lines parsed; dictionary training section will be empty.", + "ERROR: No REPORT_DICT_TRAIN lines parsed; dictionary training section would be empty.", file=sys.stderr, ) + sys.exit(1) with open("benchmark-results.json", "w") as f: json.dump(benchmark_results, f, indent=2) diff --git a/zstd/src/dictionary/fastcover.rs b/zstd/src/dictionary/fastcover.rs index f7a77f82..8b6c646e 100644 --- a/zstd/src/dictionary/fastcover.rs +++ b/zstd/src/dictionary/fastcover.rs @@ -36,6 +36,13 @@ fn clamp_table_bits(f: u32) -> u32 { f.clamp(8, 20) } +pub(crate) fn normalize_fastcover_params(mut params: FastCoverParams) -> FastCoverParams { + params.d = params.d.clamp(4, 32); + params.k = params.k.max(params.d).max(16); + params.f = clamp_table_bits(params.f); + params +} + fn build_frequency_table(sample: &[u8], d: usize, f: u32, accel: usize) -> Vec { let bits = clamp_table_bits(f); let size = 1usize << bits; @@ -73,8 +80,9 @@ fn build_raw_dict(sample: &[u8], dict_size: usize, params: FastCoverParams) -> V return Vec::new(); } - let k = params.k.max(params.d).max(16); - let d = params.d.clamp(4, 32); + let params = normalize_fastcover_params(params); + let k = params.k; + let d = params.d; let table = build_frequency_table(sample, d, params.f, params.accel); let mask = table.len().saturating_sub(1); @@ -151,12 +159,12 @@ pub fn optimize_fastcover_raw( }; if sample.len() < 2 { - let params = FastCoverParams { + let params = normalize_fastcover_params(FastCoverParams { k: k_candidates[0], d: d_values[0], f: f_values[0], accel, - }; + }); let mut dict = build_raw_dict(sample, dict_size, params); if dict.is_empty() && dict_size > 0 { let take = sample.len().min(dict_size); @@ -191,14 +199,14 @@ pub fn optimize_fastcover_raw( for &f in f_values { for &d in d_values { for &k in k_candidates { - let params = FastCoverParams { k, d, f, accel }; + let params = normalize_fastcover_params(FastCoverParams { k, d, f, accel }); let dict = build_raw_dict(train, dict_size, params); - let score = coverage_score(dict.as_slice(), eval, d, accel); + let score = coverage_score(dict.as_slice(), eval, params.d, accel); if best_dict.is_empty() || score > best.score { best.score = score; - best.k = k; - best.d = d; - best.f = f; + best.k = params.k; + best.d = params.d; + best.f = params.f; best_dict = dict; } } @@ -297,7 +305,7 @@ mod tests { let sample = b"abcdefghijklmnopqrst"; let (dict, tuned) = optimize_fastcover_raw(sample, 16, 0.9, 1, &[6], &[16], &[8]); assert!(!dict.is_empty()); - assert_eq!(tuned.k, 8); + assert_eq!(tuned.k, 16); assert_eq!(tuned.d, 6); assert_eq!(tuned.f, 16); assert_eq!(tuned.score, 0); @@ -333,4 +341,15 @@ mod tests { assert_eq!(tuned_low.k, 128); assert_eq!(tuned_high.k, 128); } + + #[test] + fn fastcover_optimizer_reports_normalized_params() { + let sample = corpus(); + let (dict, tuned) = + optimize_fastcover_raw(sample.as_slice(), 1024, 0.75, 1, &[64], &[42], &[8]); + assert!(!dict.is_empty()); + assert_eq!(tuned.d, 32); + assert_eq!(tuned.f, 20); + assert_eq!(tuned.k, 32); + } } diff --git a/zstd/src/dictionary/mod.rs b/zstd/src/dictionary/mod.rs index ca6c3126..3e17787a 100644 --- a/zstd/src/dictionary/mod.rs +++ b/zstd/src/dictionary/mod.rs @@ -382,18 +382,18 @@ fn train_fastcover_internal( options.k_candidates.as_slice(), ) } else { - let params = FastCoverParams { + let params = fastcover::normalize_fastcover_params(FastCoverParams { k: options.k, d: options.d, f: options.f, accel: options.accel, - }; + }); ( fastcover::train_fastcover_raw(sample, dict_size, params), FastCoverTuned { - k: options.k, - d: options.d, - f: options.f, + k: params.k, + d: params.d, + f: params.f, accel: options.accel, score: 0, }, @@ -413,7 +413,14 @@ pub fn train_fastcover_raw_from_slice( "source stream is empty", )); } - Ok(train_fastcover_internal(sample, dict_size, options)) + let (dict, tuned) = train_fastcover_internal(sample, dict_size, options); + if dict.is_empty() && dict_size > 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "training sample is too small for FastCOVER", + )); + } + Ok((dict, tuned)) } /// Train a raw FastCOVER dictionary from a source stream. @@ -617,6 +624,35 @@ mod tests { assert_eq!(tuned.score, 0); } + #[test] + fn train_fastcover_raw_from_slice_rejects_tiny_sample_with_empty_dict() { + let sample = b"tiny"; + let err = train_fastcover_raw_from_slice(sample, 1024, &FastCoverOptions::default()) + .expect_err("tiny sample should not produce an empty dictionary successfully"); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + assert_eq!( + err.to_string(), + "training sample is too small for FastCOVER" + ); + } + + #[test] + fn train_fastcover_raw_from_slice_normalizes_non_optimized_params() { + let sample = training_data(); + let options = FastCoverOptions { + optimize: false, + k: 8, + d: 64, + f: 42, + ..FastCoverOptions::default() + }; + let (_, tuned) = + train_fastcover_raw_from_slice(sample.as_slice(), 2048, &options).expect("must train"); + assert_eq!(tuned.k, 32); + assert_eq!(tuned.d, 32); + assert_eq!(tuned.f, 20); + } + #[test] fn finalize_raw_dict_rejects_empty_raw_content() { let sample = training_data(); From c30e9bad779325acf47f5c9c3e91db591a05343a Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Mon, 6 Apr 2026 00:52:49 +0300 Subject: [PATCH 09/14] fix(dict): address benchmark and finalization review feedback --- .github/scripts/run-benchmarks.sh | 11 ++- zstd/benches/compare_ffi.rs | 4 +- zstd/src/dictionary/mod.rs | 117 ++++++++++++++++++++++++++++-- 3 files changed, 121 insertions(+), 11 deletions(-) diff --git a/.github/scripts/run-benchmarks.sh b/.github/scripts/run-benchmarks.sh index b14457b7..cfd00e5a 100755 --- a/.github/scripts/run-benchmarks.sh +++ b/.github/scripts/run-benchmarks.sh @@ -39,7 +39,7 @@ DICT_RE = re.compile( r'^REPORT_DICT scenario=(\S+) label="((?:[^"\\]|\\.)+)" level=(\S+) dict_bytes=(\d+) train_ms=([0-9.]+) ffi_no_dict_bytes=(\d+) ffi_with_dict_bytes=(\d+) ffi_no_dict_ratio=([0-9.]+) ffi_with_dict_ratio=([0-9.]+)$' ) DICT_TRAIN_RE = re.compile( - r'^REPORT_DICT_TRAIN scenario=(\S+) label="((?:[^"\\]|\\.)+)" dict_bytes_requested=(\d+) rust_train_ms=([0-9.]+) ffi_train_ms=([0-9.]+) rust_dict_bytes=(\d+) ffi_dict_bytes=(\d+) rust_fastcover_score=(\d+)$' + r'^REPORT_DICT_TRAIN scenario=(\S+) label="((?:[^"\\]|\\.)+)" training_bytes=(\d+) dict_bytes_requested=(\d+) rust_train_ms=([0-9.]+) ffi_train_ms=([0-9.]+) rust_dict_bytes=(\d+) ffi_dict_bytes=(\d+) rust_fastcover_score=(\d+)$' ) def unescape_report_label(value): @@ -77,6 +77,7 @@ dictionary_rows = [] dictionary_training_rows = [] timing_rows = [] scenario_input_bytes = {} +scenario_training_bytes = {} raw_path = os.environ["BENCH_RAW_FILE"] DELTA_LOW = 0.99 @@ -246,6 +247,7 @@ with open(raw_path) as f: ( scenario, label, + training_bytes, dict_bytes_requested, rust_train_ms, ffi_train_ms, @@ -262,6 +264,7 @@ with open(raw_path) as f: dictionary_training_rows.append({ "scenario": scenario, "label": label, + "training_bytes": int(training_bytes), "dict_bytes_requested": int(dict_bytes_requested), "rust_train_ms": rust_train_ms_float, "ffi_train_ms": ffi_train_ms_float, @@ -271,6 +274,7 @@ with open(raw_path) as f: "delta_ffi_over_rust": delta, "status": classify_speed_delta(delta), }) + scenario_training_bytes[scenario] = int(training_bytes) if not benchmark_results: print("ERROR: No benchmark results parsed!", file=sys.stderr) @@ -353,7 +357,10 @@ for key in all_keys: scenario = meta["scenario"] if meta else key.split(" + ")[0] level = meta["level"] if meta else "unknown" source = meta["source"] if meta else None - input_bytes = None if stage == "dict-train" else scenario_input_bytes.get(scenario) + if stage == "dict-train": + input_bytes = scenario_training_bytes.get(scenario) + else: + input_bytes = scenario_input_bytes.get(scenario) speed_series = {} for impl_name, impl_row in speed_index.get(key, {}).items(): diff --git a/zstd/benches/compare_ffi.rs b/zstd/benches/compare_ffi.rs index b9c290f7..c4ac9d11 100644 --- a/zstd/benches/compare_ffi.rs +++ b/zstd/benches/compare_ffi.rs @@ -426,10 +426,12 @@ fn emit_dictionary_training_report( rust_fastcover_score: usize, ) { let escaped_label = escape_report_label(&scenario.label); + let training_bytes = scenario.len(); println!( - "REPORT_DICT_TRAIN scenario={} label=\"{}\" dict_bytes_requested={} rust_train_ms={:.3} ffi_train_ms={:.3} rust_dict_bytes={} ffi_dict_bytes={} rust_fastcover_score={}", + "REPORT_DICT_TRAIN scenario={} label=\"{}\" training_bytes={} dict_bytes_requested={} rust_train_ms={:.3} ffi_train_ms={:.3} rust_dict_bytes={} ffi_dict_bytes={} rust_fastcover_score={}", scenario.id, escaped_label, + training_bytes, dict_bytes_requested, rust_train_ms, ffi_train_ms, diff --git a/zstd/src/dictionary/mod.rs b/zstd/src/dictionary/mod.rs index 3e17787a..a91505ce 100644 --- a/zstd/src/dictionary/mod.rs +++ b/zstd/src/dictionary/mod.rs @@ -28,9 +28,13 @@ mod frequency; mod reservoir; use crate::bit_io::BitWriter; +use crate::blocks::sequence_section::{ + MAX_LITERAL_LENGTH_CODE, MAX_MATCH_LENGTH_CODE, MAX_OFFSET_CODE, +}; use crate::decoding::dictionary::MAGIC_NUM as DICT_MAGIC_NUM; +use crate::decoding::sequence_section_decoder::{LL_MAX_LOG, ML_MAX_LOG, OF_MAX_LOG}; use crate::dictionary::reservoir::create_sample; -use crate::fse::fse_encoder; +use crate::fse::fse_encoder::{self, build_table_from_data}; use crate::huff0::HuffmanTable as HuffmanDecoderTable; use crate::huff0::huff0_encoder::{HuffmanEncoder, HuffmanTable as HuffmanEncoderTable}; use core::cmp::Reverse; @@ -164,9 +168,8 @@ pub fn create_raw_dict_from_dir, W: io::Write>( /// The completed dictionary is written to `output`. /// /// - `source` will be used as training data for the entire dictionary. -/// - `source_size` influences how the data is divided and sampled and is measured -/// in bytes. While this does not need to be exact, estimates should attempt to be -/// larger than the actual collection size. +/// - `source_size` is used only as a preallocation hint before reading `source` and +/// does not affect sampling once all data has been buffered. /// - `output` is where the completed dictionary will be written. /// - `dict_size` determines how large the complete dictionary should be. The completed /// dictionary will be this size or smaller. @@ -298,6 +301,78 @@ fn serialize_fse_table(table: &fse_encoder::FSETable) -> Vec { writer.dump() } +fn bounded_fse_symbols(data: &[u8], max_symbol: u8) -> Vec { + let modulo = u16::from(max_symbol) + 1; + if data.is_empty() { + return Vec::from([0u8]); + } + if data.len() <= MAX_HUFFMAN_STATS_BYTES { + return data + .iter() + .map(|b| (u16::from(*b) % modulo) as u8) + .collect(); + } + + let mut out = Vec::with_capacity(MAX_HUFFMAN_STATS_BYTES); + for i in 0..MAX_HUFFMAN_STATS_BYTES { + let idx = i * data.len() / MAX_HUFFMAN_STATS_BYTES; + out.push((u16::from(data[idx]) % modulo) as u8); + } + out +} + +fn serialize_fse_table_from_corpus( + sample_data: &[u8], + raw_content: &[u8], + max_symbol: u8, + max_log: u8, +) -> Vec { + let source = if sample_data.is_empty() { + raw_content + } else { + sample_data + }; + let symbols = bounded_fse_symbols(source, max_symbol); + let table = build_table_from_data(symbols.into_iter(), max_log, false); + serialize_fse_table(&table) +} + +fn finalized_content_budget( + sample_data: &[u8], + raw_fallback: &[u8], + dict_size: usize, +) -> io::Result { + let min_content_size = 8usize; + let huf_len = serialize_huffman_table(sample_data, raw_fallback)?.len(); + let of_len = + serialize_fse_table_from_corpus(sample_data, raw_fallback, MAX_OFFSET_CODE, OF_MAX_LOG) + .len(); + let ml_len = serialize_fse_table_from_corpus( + sample_data, + raw_fallback, + MAX_MATCH_LENGTH_CODE, + ML_MAX_LOG, + ) + .len(); + let ll_len = serialize_fse_table_from_corpus( + sample_data, + raw_fallback, + MAX_LITERAL_LENGTH_CODE, + LL_MAX_LOG, + ) + .len(); + + let header_len = DICT_MAGIC_NUM.len() + 4 + huf_len + of_len + ml_len + ll_len + 12; + let max_content_budget = dict_size.saturating_sub(header_len); + if max_content_budget < min_content_size { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "dictionary size too small to fit header and offset history", + )); + } + Ok(max_content_budget) +} + fn derive_dict_id(raw_content: &[u8]) -> u32 { let mut h = 0xcbf29ce484222325u64; for &b in raw_content { @@ -335,9 +410,28 @@ pub fn finalize_raw_dict( } out.extend_from_slice(&dict_id.to_le_bytes()); out.extend_from_slice(serialize_huffman_table(sample_data, raw_content)?.as_slice()); - out.extend_from_slice(serialize_fse_table(&fse_encoder::default_of_table()).as_slice()); - out.extend_from_slice(serialize_fse_table(&fse_encoder::default_ml_table()).as_slice()); - out.extend_from_slice(serialize_fse_table(&fse_encoder::default_ll_table()).as_slice()); + out.extend_from_slice( + serialize_fse_table_from_corpus(sample_data, raw_content, MAX_OFFSET_CODE, OF_MAX_LOG) + .as_slice(), + ); + out.extend_from_slice( + serialize_fse_table_from_corpus( + sample_data, + raw_content, + MAX_MATCH_LENGTH_CODE, + ML_MAX_LOG, + ) + .as_slice(), + ); + out.extend_from_slice( + serialize_fse_table_from_corpus( + sample_data, + raw_content, + MAX_LITERAL_LENGTH_CODE, + LL_MAX_LOG, + ) + .as_slice(), + ); // Repeat offsets: keep default bootstrap history. out.extend_from_slice(&1u32.to_le_bytes()); @@ -424,6 +518,9 @@ pub fn train_fastcover_raw_from_slice( } /// Train a raw FastCOVER dictionary from a source stream. +/// +/// This function fully buffers the entire training corpus into memory via +/// `read_to_end`, which can consume significant RAM for large inputs. pub fn create_fastcover_raw_dict_from_source( mut source: R, output: &mut W, @@ -438,6 +535,9 @@ pub fn create_fastcover_raw_dict_from_source( } /// Train and finalize a FastCOVER dictionary in pure Rust. +/// +/// This function fully buffers the entire training corpus into memory via +/// `read_to_end`, which can consume significant RAM for large inputs. pub fn create_fastcover_dict_from_source( mut source: R, output: &mut W, @@ -447,8 +547,9 @@ pub fn create_fastcover_dict_from_source( ) -> io::Result { let mut sample = Vec::new(); source.read_to_end(&mut sample)?; + let content_budget = finalized_content_budget(sample.as_slice(), sample.as_slice(), dict_size)?; let (raw_dict, tuned) = - train_fastcover_raw_from_slice(sample.as_slice(), dict_size, fastcover)?; + train_fastcover_raw_from_slice(sample.as_slice(), content_budget, fastcover)?; let finalized = finalize_raw_dict(raw_dict.as_slice(), sample.as_slice(), dict_size, finalize)?; output.write_all(finalized.as_slice())?; From dcd9185ce09dd9d0f15f3329f9558ead905cdbd4 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Mon, 6 Apr 2026 01:27:08 +0300 Subject: [PATCH 10/14] fix(dict): normalize accel and align training benchmark inputs --- zstd/benches/compare_ffi.rs | 62 +++++++++++---------- zstd/benches/dict_builder_fastcover.rs | 3 +- zstd/src/dictionary/fastcover.rs | 8 +-- zstd/src/dictionary/mod.rs | 74 +++++++++++++++++++------- zstd/src/encoding/frame_compressor.rs | 3 +- 5 files changed, 97 insertions(+), 53 deletions(-) diff --git a/zstd/benches/compare_ffi.rs b/zstd/benches/compare_ffi.rs index c4ac9d11..7b70b501 100644 --- a/zstd/benches/compare_ffi.rs +++ b/zstd/benches/compare_ffi.rs @@ -195,9 +195,9 @@ fn bench_dictionary(c: &mut Criterion) { } let training_samples = split_training_samples(&scenario.bytes); - let sample_refs: Vec<&[u8]> = training_samples.iter().map(Vec::as_slice).collect(); let training_blob: Vec = training_samples.concat(); - let total_training_bytes = sample_refs.iter().map(|sample| sample.len()).sum::(); + let total_training_bytes = training_blob.len(); + let ffi_samples = [training_blob.as_slice()]; let dict_size = dictionary_size_for(scenario.len()) .min(total_training_bytes.saturating_sub(64)) .max(256); @@ -210,7 +210,7 @@ fn bench_dictionary(c: &mut Criterion) { eprintln!( "BENCH_WARN skipping Rust FastCOVER dictionary benchmark for {} (samples={}, total_training_bytes={}, dict_size={})", scenario.id, - sample_refs.len(), + training_samples.len(), total_training_bytes, dict_size ); @@ -219,11 +219,11 @@ fn bench_dictionary(c: &mut Criterion) { let rust_train_ms = rust_train_started.elapsed().as_secs_f64() * 1_000.0; let ffi_train_started = Instant::now(); - let Ok(ffi_dictionary) = zstd::dict::from_samples(&sample_refs, dict_size) else { + let Ok(ffi_dictionary) = zstd::dict::from_samples(&ffi_samples, dict_size) else { eprintln!( "BENCH_WARN skipping dictionary benchmark for {} (samples={}, total_training_bytes={}, dict_size={})", scenario.id, - sample_refs.len(), + ffi_samples.len(), total_training_bytes, dict_size ); @@ -234,12 +234,15 @@ fn bench_dictionary(c: &mut Criterion) { if emit_reports { emit_dictionary_training_report( scenario, - dict_size, - rust_train_ms, - ffi_train_ms, - rust_dictionary.len(), - ffi_dictionary.len(), - rust_tuned.score, + DictTrainingMetrics { + training_bytes: total_training_bytes, + dict_bytes_requested: dict_size, + rust_train_ms, + ffi_train_ms, + rust_dict_bytes: rust_dictionary.len(), + ffi_dict_bytes: ffi_dictionary.len(), + rust_fastcover_score: rust_tuned.score, + }, ); } @@ -263,7 +266,7 @@ fn bench_dictionary(c: &mut Criterion) { group.bench_function("c_ffi", |b| { b.iter(|| { black_box( - zstd::dict::from_samples(&sample_refs, dict_size) + zstd::dict::from_samples(&ffi_samples, dict_size) .expect("ffi dictionary training should succeed") .len(), ) @@ -416,31 +419,32 @@ fn emit_dictionary_report( ); } -fn emit_dictionary_training_report( - scenario: &Scenario, - dict_bytes_requested: usize, - rust_train_ms: f64, - ffi_train_ms: f64, - rust_dict_bytes: usize, - ffi_dict_bytes: usize, - rust_fastcover_score: usize, -) { +fn emit_dictionary_training_report(scenario: &Scenario, metrics: DictTrainingMetrics) { let escaped_label = escape_report_label(&scenario.label); - let training_bytes = scenario.len(); println!( "REPORT_DICT_TRAIN scenario={} label=\"{}\" training_bytes={} dict_bytes_requested={} rust_train_ms={:.3} ffi_train_ms={:.3} rust_dict_bytes={} ffi_dict_bytes={} rust_fastcover_score={}", scenario.id, escaped_label, - training_bytes, - dict_bytes_requested, - rust_train_ms, - ffi_train_ms, - rust_dict_bytes, - ffi_dict_bytes, - rust_fastcover_score + metrics.training_bytes, + metrics.dict_bytes_requested, + metrics.rust_train_ms, + metrics.ffi_train_ms, + metrics.rust_dict_bytes, + metrics.ffi_dict_bytes, + metrics.rust_fastcover_score ); } +struct DictTrainingMetrics { + training_bytes: usize, + dict_bytes_requested: usize, + rust_train_ms: f64, + ffi_train_ms: f64, + rust_dict_bytes: usize, + ffi_dict_bytes: usize, + rust_fastcover_score: usize, +} + fn split_training_samples(source: &[u8]) -> Vec> { let sample_size = source.len().div_ceil(16).clamp(256, 8192); let mut samples: Vec> = source diff --git a/zstd/benches/dict_builder_fastcover.rs b/zstd/benches/dict_builder_fastcover.rs index fbd050bf..2423fe18 100644 --- a/zstd/benches/dict_builder_fastcover.rs +++ b/zstd/benches/dict_builder_fastcover.rs @@ -30,7 +30,8 @@ fn bench_dict_builder(c: &mut Criterion) { data.len(), &mut out, black_box(dict_size), - ); + ) + .expect("cover training should succeed"); black_box(out.len()); }) }); diff --git a/zstd/src/dictionary/fastcover.rs b/zstd/src/dictionary/fastcover.rs index 8b6c646e..940d6cdb 100644 --- a/zstd/src/dictionary/fastcover.rs +++ b/zstd/src/dictionary/fastcover.rs @@ -40,6 +40,7 @@ pub(crate) fn normalize_fastcover_params(mut params: FastCoverParams) -> FastCov params.d = params.d.clamp(4, 32); params.k = params.k.max(params.d).max(16); params.f = clamp_table_bits(params.f); + params.accel = params.accel.clamp(1, 10); params } @@ -176,7 +177,7 @@ pub fn optimize_fastcover_raw( k: params.k, d: params.d, f: params.f, - accel, + accel: params.accel, score: 0, }, ); @@ -192,7 +193,7 @@ pub fn optimize_fastcover_raw( k: 0, d: 0, f: 0, - accel, + accel: accel.clamp(1, 10), score: 0, }; @@ -201,12 +202,13 @@ pub fn optimize_fastcover_raw( for &k in k_candidates { let params = normalize_fastcover_params(FastCoverParams { k, d, f, accel }); let dict = build_raw_dict(train, dict_size, params); - let score = coverage_score(dict.as_slice(), eval, params.d, accel); + let score = coverage_score(dict.as_slice(), eval, params.d, params.accel); if best_dict.is_empty() || score > best.score { best.score = score; best.k = params.k; best.d = params.d; best.f = params.f; + best.accel = params.accel; best_dict = dict; } } diff --git a/zstd/src/dictionary/mod.rs b/zstd/src/dictionary/mod.rs index a91505ce..13bac2aa 100644 --- a/zstd/src/dictionary/mod.rs +++ b/zstd/src/dictionary/mod.rs @@ -115,7 +115,8 @@ pub(super) struct DictParams { /// /// # Errors /// This function returns `Ok(())` if the dictionary was created successfully, and an -/// `Err(io::Error)` if an error was encountered reading the input directory. +/// `Err(io::Error)` if an error was encountered reading the input directory or +/// writing dictionary bytes to `output`. /// /// # Examples /// ```no_run @@ -160,7 +161,7 @@ pub fn create_raw_dict_from_dir, W: io::Write>( .fold(empty_reader, |acc, reader| Box::new(acc.chain(reader))); // Create a dict using the new reader - create_raw_dict_from_source(chained_files, total_file_len as usize, output, dict_size); + create_raw_dict_from_source(chained_files, total_file_len as usize, output, dict_size)?; Ok(()) } @@ -182,25 +183,21 @@ pub fn create_raw_dict_from_source( source_size: usize, output: &mut W, dict_size: usize, -) { +) -> io::Result<()> { if dict_size == 0 { - return; + return Ok(()); } let prealloc = source_size.min(MAX_TRAINING_PREALLOC_BYTES); let mut all = Vec::with_capacity(prealloc); - source - .read_to_end(&mut all) - .expect("can read full source for dictionary training"); + source.read_to_end(&mut all)?; if all.is_empty() { - return; + return Ok(()); } if all.len() < K { let keep = usize::min(all.len(), dict_size); - output - .write_all(&all[all.len() - keep..]) - .expect("can write tiny dictionary"); - return; + output.write_all(&all[all.len() - keep..])?; + return Ok(()); } let source_size = all.len(); @@ -252,10 +249,9 @@ pub fn create_raw_dict_from_source( // Write the dictionary with the highest scoring segment last because // closer items can be represented with a smaller offset while let Some(segment) = pool.pop() { - output - .write_all(&segment.0.raw) - .expect("can write to output"); + output.write_all(&segment.0.raw)?; } + Ok(()) } fn serialize_huffman_table(sample_data: &[u8], raw_content: &[u8]) -> io::Result> { @@ -488,7 +484,7 @@ fn train_fastcover_internal( k: params.k, d: params.d, f: params.f, - accel: options.accel, + accel: params.accel, score: 0, }, ) @@ -678,7 +674,8 @@ mod tests { fn create_raw_dict_from_source_early_returns_on_zero_dict_size() { let sample = training_data(); let mut out = Vec::new(); - create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 0); + create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 0) + .expect("zero dict size should no-op"); assert!(out.is_empty()); } @@ -686,7 +683,8 @@ mod tests { fn create_raw_dict_from_source_treats_source_size_as_hint() { let sample = training_data(); let mut out = Vec::new(); - create_raw_dict_from_source(Cursor::new(sample.as_slice()), 0, &mut out, 1024); + create_raw_dict_from_source(Cursor::new(sample.as_slice()), 0, &mut out, 1024) + .expect("raw dictionary training should succeed"); assert!(!out.is_empty()); } @@ -694,10 +692,48 @@ mod tests { fn create_raw_dict_from_source_handles_tiny_source_without_epochs() { let sample = b"short"; let mut out = Vec::new(); - create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 3); + create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 3) + .expect("tiny source path should succeed"); assert_eq!(out, b"ort"); } + #[test] + fn create_raw_dict_from_source_propagates_read_error() { + struct FailingReader; + impl io::Read for FailingReader { + fn read(&mut self, _buf: &mut [u8]) -> io::Result { + Err(io::Error::other("read failed")) + } + } + + let mut out = Vec::new(); + let err = create_raw_dict_from_source(FailingReader, 1024, &mut out, 1024) + .expect_err("read failures must be returned"); + assert_eq!(err.kind(), io::ErrorKind::Other); + assert_eq!(err.to_string(), "read failed"); + } + + #[test] + fn create_raw_dict_from_source_propagates_write_error() { + struct FailingWriter; + impl io::Write for FailingWriter { + fn write(&mut self, _buf: &[u8]) -> io::Result { + Err(io::Error::other("write failed")) + } + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } + } + + let sample = b"short"; + let mut out = FailingWriter; + let err = + create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 3) + .expect_err("write failures must be returned"); + assert_eq!(err.kind(), io::ErrorKind::Other); + assert_eq!(err.to_string(), "write failed"); + } + #[test] fn train_fastcover_raw_from_slice_rejects_empty_sample() { let err = train_fastcover_raw_from_slice(&[], 1024, &FastCoverOptions::default()) diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs index c6c11c5a..2e3304a9 100644 --- a/zstd/src/encoding/frame_compressor.rs +++ b/zstd/src/encoding/frame_compressor.rs @@ -730,7 +730,8 @@ mod tests { training.len(), &mut raw_dict, 4096, - ); + ) + .expect("dict_builder training should succeed"); assert!( !raw_dict.is_empty(), "dict_builder produced an empty dictionary" From 9625c35fbcee2e2aacd2997257bf58ce5027ff05 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Mon, 6 Apr 2026 02:01:16 +0300 Subject: [PATCH 11/14] fix(bench): align dict-train artifacts and hoist options --- zstd/benches/compare_ffi.rs | 30 +++++++++++++++++++++++--- zstd/benches/dict_builder_fastcover.rs | 21 +++++++++--------- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/zstd/benches/compare_ffi.rs b/zstd/benches/compare_ffi.rs index 7b70b501..fb8d74e7 100644 --- a/zstd/benches/compare_ffi.rs +++ b/zstd/benches/compare_ffi.rs @@ -16,7 +16,9 @@ use std::hint::black_box; use std::sync::OnceLock; use std::time::{Duration, Instant}; use structured_zstd::decoding::FrameDecoder; -use structured_zstd::dictionary::{FastCoverOptions, train_fastcover_raw_from_slice}; +use structured_zstd::dictionary::{ + FastCoverOptions, FinalizeOptions, finalize_raw_dict, train_fastcover_raw_from_slice, +}; use support::{LevelConfig, Scenario, ScenarioClass, benchmark_scenarios, supported_levels}; static BENCHMARK_SCENARIOS: OnceLock> = OnceLock::new(); @@ -204,7 +206,7 @@ fn bench_dictionary(c: &mut Criterion) { let fastcover_options = fastcover_fixed_options(); let rust_train_started = Instant::now(); - let Ok((rust_dictionary, rust_tuned)) = + let Ok((rust_raw_dictionary, rust_tuned)) = train_fastcover_raw_from_slice(training_blob.as_slice(), dict_size, &fastcover_options) else { eprintln!( @@ -216,6 +218,21 @@ fn bench_dictionary(c: &mut Criterion) { ); continue; }; + let Ok(rust_dictionary) = finalize_raw_dict( + rust_raw_dictionary.as_slice(), + training_blob.as_slice(), + dict_size, + FinalizeOptions::default(), + ) else { + eprintln!( + "BENCH_WARN skipping Rust FastCOVER finalization benchmark for {} (samples={}, total_training_bytes={}, dict_size={})", + scenario.id, + training_samples.len(), + total_training_bytes, + dict_size + ); + continue; + }; let rust_train_ms = rust_train_started.elapsed().as_secs_f64() * 1_000.0; let ffi_train_started = Instant::now(); @@ -253,12 +270,19 @@ fn bench_dictionary(c: &mut Criterion) { group.bench_function("pure_rust", |b| { b.iter(|| { - let (dict, tuned) = train_fastcover_raw_from_slice( + let (raw_dict, tuned) = train_fastcover_raw_from_slice( training_blob.as_slice(), dict_size, &fastcover_options, ) .expect("fastcover training should succeed"); + let dict = finalize_raw_dict( + raw_dict.as_slice(), + training_blob.as_slice(), + dict_size, + FinalizeOptions::default(), + ) + .expect("fastcover dictionary finalization should succeed"); black_box((dict.len(), tuned.score)); }) }); diff --git a/zstd/benches/dict_builder_fastcover.rs b/zstd/benches/dict_builder_fastcover.rs index 2423fe18..4d7c41c3 100644 --- a/zstd/benches/dict_builder_fastcover.rs +++ b/zstd/benches/dict_builder_fastcover.rs @@ -21,6 +21,15 @@ fn corpus() -> Vec { fn bench_dict_builder(c: &mut Criterion) { let data = corpus(); let dict_size = 8 * 1024; + let fastcover_opt = FastCoverOptions::default(); + let fastcover_fixed = FastCoverOptions { + optimize: false, + accel: 4, + k: 256, + d: 8, + f: 20, + ..FastCoverOptions::default() + }; c.bench_function("dict_builder/cover_raw", |b| { b.iter(|| { @@ -43,7 +52,7 @@ fn bench_dict_builder(c: &mut Criterion) { Cursor::new(data.as_slice()), &mut out, black_box(dict_size), - &FastCoverOptions::default(), + &fastcover_opt, ) .expect("fastcover training should succeed"); black_box((out.len(), tuned.score)); @@ -53,19 +62,11 @@ fn bench_dict_builder(c: &mut Criterion) { c.bench_function("dict_builder/fastcover_raw_fixed", |b| { b.iter(|| { let mut out = Vec::new(); - let opts = FastCoverOptions { - optimize: false, - accel: 4, - k: 256, - d: 8, - f: 20, - ..FastCoverOptions::default() - }; let tuned = create_fastcover_raw_dict_from_source( Cursor::new(data.as_slice()), &mut out, black_box(dict_size), - &opts, + &fastcover_fixed, ) .expect("fastcover training should succeed"); black_box((out.len(), tuned.score)); From dbe0b38f98411bd4468bef934b8f4990d20812a3 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Mon, 6 Apr 2026 03:29:25 +0300 Subject: [PATCH 12/14] fix(bench): resolve remaining dict-train review threads --- zstd/benches/compare_ffi.rs | 59 ++++++++++++++++--------------------- zstd/src/dictionary/mod.rs | 3 +- 2 files changed, 28 insertions(+), 34 deletions(-) diff --git a/zstd/benches/compare_ffi.rs b/zstd/benches/compare_ffi.rs index fb8d74e7..bcfe764b 100644 --- a/zstd/benches/compare_ffi.rs +++ b/zstd/benches/compare_ffi.rs @@ -196,40 +196,36 @@ fn bench_dictionary(c: &mut Criterion) { continue; } - let training_samples = split_training_samples(&scenario.bytes); - let training_blob: Vec = training_samples.concat(); - let total_training_bytes = training_blob.len(); - let ffi_samples = [training_blob.as_slice()]; + let sample_count = training_sample_count(&scenario.bytes); + let total_training_bytes = scenario.bytes.len(); + let ffi_samples = [scenario.bytes.as_slice()]; + let max_dict_size = total_training_bytes.saturating_sub(64); let dict_size = dictionary_size_for(scenario.len()) - .min(total_training_bytes.saturating_sub(64)) - .max(256); + .max(256) + .min(max_dict_size); let fastcover_options = fastcover_fixed_options(); let rust_train_started = Instant::now(); - let Ok((rust_raw_dictionary, rust_tuned)) = - train_fastcover_raw_from_slice(training_blob.as_slice(), dict_size, &fastcover_options) - else { + let Ok((rust_raw_dictionary, _rust_tuned)) = train_fastcover_raw_from_slice( + scenario.bytes.as_slice(), + dict_size, + &fastcover_options, + ) else { eprintln!( "BENCH_WARN skipping Rust FastCOVER dictionary benchmark for {} (samples={}, total_training_bytes={}, dict_size={})", - scenario.id, - training_samples.len(), - total_training_bytes, - dict_size + scenario.id, sample_count, total_training_bytes, dict_size ); continue; }; let Ok(rust_dictionary) = finalize_raw_dict( rust_raw_dictionary.as_slice(), - training_blob.as_slice(), + scenario.bytes.as_slice(), dict_size, FinalizeOptions::default(), ) else { eprintln!( "BENCH_WARN skipping Rust FastCOVER finalization benchmark for {} (samples={}, total_training_bytes={}, dict_size={})", - scenario.id, - training_samples.len(), - total_training_bytes, - dict_size + scenario.id, sample_count, total_training_bytes, dict_size ); continue; }; @@ -258,7 +254,6 @@ fn bench_dictionary(c: &mut Criterion) { ffi_train_ms, rust_dict_bytes: rust_dictionary.len(), ffi_dict_bytes: ffi_dictionary.len(), - rust_fastcover_score: rust_tuned.score, }, ); } @@ -271,14 +266,14 @@ fn bench_dictionary(c: &mut Criterion) { group.bench_function("pure_rust", |b| { b.iter(|| { let (raw_dict, tuned) = train_fastcover_raw_from_slice( - training_blob.as_slice(), + scenario.bytes.as_slice(), dict_size, &fastcover_options, ) .expect("fastcover training should succeed"); let dict = finalize_raw_dict( raw_dict.as_slice(), - training_blob.as_slice(), + scenario.bytes.as_slice(), dict_size, FinalizeOptions::default(), ) @@ -446,7 +441,7 @@ fn emit_dictionary_report( fn emit_dictionary_training_report(scenario: &Scenario, metrics: DictTrainingMetrics) { let escaped_label = escape_report_label(&scenario.label); println!( - "REPORT_DICT_TRAIN scenario={} label=\"{}\" training_bytes={} dict_bytes_requested={} rust_train_ms={:.3} ffi_train_ms={:.3} rust_dict_bytes={} ffi_dict_bytes={} rust_fastcover_score={}", + "REPORT_DICT_TRAIN scenario={} label=\"{}\" training_bytes={} dict_bytes_requested={} rust_train_ms={:.3} ffi_train_ms={:.3} rust_dict_bytes={} ffi_dict_bytes={}", scenario.id, escaped_label, metrics.training_bytes, @@ -454,8 +449,7 @@ fn emit_dictionary_training_report(scenario: &Scenario, metrics: DictTrainingMet metrics.rust_train_ms, metrics.ffi_train_ms, metrics.rust_dict_bytes, - metrics.ffi_dict_bytes, - metrics.rust_fastcover_score + metrics.ffi_dict_bytes ); } @@ -466,32 +460,31 @@ struct DictTrainingMetrics { ffi_train_ms: f64, rust_dict_bytes: usize, ffi_dict_bytes: usize, - rust_fastcover_score: usize, } -fn split_training_samples(source: &[u8]) -> Vec> { +fn training_sample_count(source: &[u8]) -> usize { let sample_size = source.len().div_ceil(16).clamp(256, 8192); - let mut samples: Vec> = source + let samples = source .chunks(sample_size) .take(64) .filter(|chunk| chunk.len() >= 64) - .map(|chunk| chunk.to_vec()) - .collect(); - if samples.len() < 2 { + .count(); + if samples < 2 { let midpoint = source.len() / 2; let left = &source[..midpoint]; let right = &source[midpoint..]; if left.len() >= 64 && right.len() >= 64 { - samples = vec![left.to_vec(), right.to_vec()]; + 2 } else { eprintln!( "BENCH_WARN tiny dictionary training input ({} bytes), using a single sample fallback", source.len() ); - samples = vec![source.to_vec()]; + 1 } + } else { + samples } - samples } fn dictionary_size_for(input_len: usize) -> usize { diff --git a/zstd/src/dictionary/mod.rs b/zstd/src/dictionary/mod.rs index 13bac2aa..257d6af8 100644 --- a/zstd/src/dictionary/mod.rs +++ b/zstd/src/dictionary/mod.rs @@ -124,7 +124,8 @@ pub(super) struct DictParams { /// // Create a roughly 1mb dictionary, training off of file in `sample_files` /// let input_folder = "sample_files/"; /// let mut output = File::create("output.dict").unwrap(); -/// structured_zstd::dictionary::create_raw_dict_from_dir(input_folder, &mut output, 1_000_000); +/// structured_zstd::dictionary::create_raw_dict_from_dir(input_folder, &mut output, 1_000_000) +/// .expect("dictionary training from sample_files should succeed"); /// ``` pub fn create_raw_dict_from_dir, W: io::Write>( path: P, From 8622344e9d188c26e7ee17a653996064d34438ba Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Mon, 6 Apr 2026 04:38:40 +0300 Subject: [PATCH 13/14] fix(dict): cap epochs and align dict-train reporting --- zstd/benches/compare_ffi.rs | 34 +++++++++++++++++++++++++++++----- zstd/src/dictionary/mod.rs | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 8 deletions(-) diff --git a/zstd/benches/compare_ffi.rs b/zstd/benches/compare_ffi.rs index bcfe764b..064b7704 100644 --- a/zstd/benches/compare_ffi.rs +++ b/zstd/benches/compare_ffi.rs @@ -203,12 +203,21 @@ fn bench_dictionary(c: &mut Criterion) { let dict_size = dictionary_size_for(scenario.len()) .max(256) .min(max_dict_size); + let Ok(rust_content_budget) = + finalized_training_content_budget(scenario.bytes.as_slice(), dict_size) + else { + eprintln!( + "BENCH_WARN skipping Rust FastCOVER dictionary benchmark for {} (samples={}, total_training_bytes={}, dict_size={}) due to finalized content budget error", + scenario.id, sample_count, total_training_bytes, dict_size + ); + continue; + }; let fastcover_options = fastcover_fixed_options(); let rust_train_started = Instant::now(); - let Ok((rust_raw_dictionary, _rust_tuned)) = train_fastcover_raw_from_slice( + let Ok((rust_raw_dictionary, rust_tuned)) = train_fastcover_raw_from_slice( scenario.bytes.as_slice(), - dict_size, + rust_content_budget, &fastcover_options, ) else { eprintln!( @@ -254,6 +263,7 @@ fn bench_dictionary(c: &mut Criterion) { ffi_train_ms, rust_dict_bytes: rust_dictionary.len(), ffi_dict_bytes: ffi_dictionary.len(), + rust_fastcover_score: rust_tuned.score, }, ); } @@ -267,7 +277,7 @@ fn bench_dictionary(c: &mut Criterion) { b.iter(|| { let (raw_dict, tuned) = train_fastcover_raw_from_slice( scenario.bytes.as_slice(), - dict_size, + rust_content_budget, &fastcover_options, ) .expect("fastcover training should succeed"); @@ -441,7 +451,7 @@ fn emit_dictionary_report( fn emit_dictionary_training_report(scenario: &Scenario, metrics: DictTrainingMetrics) { let escaped_label = escape_report_label(&scenario.label); println!( - "REPORT_DICT_TRAIN scenario={} label=\"{}\" training_bytes={} dict_bytes_requested={} rust_train_ms={:.3} ffi_train_ms={:.3} rust_dict_bytes={} ffi_dict_bytes={}", + "REPORT_DICT_TRAIN scenario={} label=\"{}\" training_bytes={} dict_bytes_requested={} rust_train_ms={:.3} ffi_train_ms={:.3} rust_dict_bytes={} ffi_dict_bytes={} rust_fastcover_score={}", scenario.id, escaped_label, metrics.training_bytes, @@ -449,7 +459,8 @@ fn emit_dictionary_training_report(scenario: &Scenario, metrics: DictTrainingMet metrics.rust_train_ms, metrics.ffi_train_ms, metrics.rust_dict_bytes, - metrics.ffi_dict_bytes + metrics.ffi_dict_bytes, + metrics.rust_fastcover_score ); } @@ -460,6 +471,19 @@ struct DictTrainingMetrics { ffi_train_ms: f64, rust_dict_bytes: usize, ffi_dict_bytes: usize, + rust_fastcover_score: usize, +} + +fn finalized_training_content_budget(sample: &[u8], dict_size: usize) -> std::io::Result { + let probe = [0u8; 8]; + let finalized = finalize_raw_dict( + probe.as_slice(), + sample, + dict_size, + FinalizeOptions::default(), + )?; + let header_bytes = finalized.len().saturating_sub(probe.len()); + Ok(dict_size.saturating_sub(header_bytes)) } fn training_sample_count(source: &[u8]) -> usize { diff --git a/zstd/src/dictionary/mod.rs b/zstd/src/dictionary/mod.rs index 257d6af8..af4aab24 100644 --- a/zstd/src/dictionary/mod.rs +++ b/zstd/src/dictionary/mod.rs @@ -230,9 +230,20 @@ pub fn create_raw_dict_from_source( let mut ctx = Context { frequencies: HashMap::with_capacity(epoch_size / K), }; - // Score each segment in the epoch and select the highest scoring segment - // for the pool - for epoch in all.chunks(epoch_size) { + // Score each segment in each planned epoch and select the highest-scoring + // segment for the pool. Keep exactly `num_epochs` windows to avoid + // emitting more segments than the requested dictionary budget allows. + for epoch_idx in 0..num_epochs { + let start = epoch_idx.saturating_mul(epoch_size); + if start >= all.len() { + break; + } + let end = if epoch_idx + 1 == num_epochs { + all.len() + } else { + usize::min(start.saturating_add(epoch_size), all.len()) + }; + let epoch = &all[start..end]; epoch_counter += 1; let best_segment = pick_best_segment(¶ms, &mut ctx, epoch, &collection_sample); vprintln!( @@ -735,6 +746,26 @@ mod tests { assert_eq!(err.to_string(), "write failed"); } + #[test] + fn create_raw_dict_from_source_never_exceeds_requested_size() { + let dict_size = 4096usize; + let source: Vec = core::iter::repeat_n(b'a', 320_001).collect(); + let mut out = Vec::new(); + create_raw_dict_from_source( + Cursor::new(source.as_slice()), + source.len(), + &mut out, + dict_size, + ) + .expect("raw dictionary training should succeed"); + assert!( + out.len() <= dict_size, + "raw dictionary exceeded requested size: {} > {}", + out.len(), + dict_size + ); + } + #[test] fn train_fastcover_raw_from_slice_rejects_empty_sample() { let err = train_fastcover_raw_from_slice(&[], 1024, &FastCoverOptions::default()) From b1627049b849a3e05bbf74dca364d36c367d85cd Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Mon, 6 Apr 2026 10:52:47 +0300 Subject: [PATCH 14/14] docs(dict): clarify result semantics and align finalize test --- README.md | 1 + zstd/src/dictionary/mod.rs | 13 ++++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ddd7ead4..e47d6fa4 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ When the `dict_builder` feature is enabled, the `dictionary` module can: - build raw dictionaries with FastCOVER (`create_fastcover_raw_dict_from_source`) - finalize raw content into full zstd dictionary format (`finalize_raw_dict`) - train+finalize in one pure-Rust flow (`create_fastcover_dict_from_source`) +- propagate I/O failures from dictionary-building APIs via `io::Result` return values ## Benchmarking diff --git a/zstd/src/dictionary/mod.rs b/zstd/src/dictionary/mod.rs index af4aab24..1a9cebb6 100644 --- a/zstd/src/dictionary/mod.rs +++ b/zstd/src/dictionary/mod.rs @@ -111,7 +111,7 @@ pub(super) struct DictParams { /// Creates a "raw content" dictionary, training off of every file in this directory and all /// sub-directories. /// -/// The resulting dictionary will be approxamitely `dict_size` or less, and written to `output`. +/// The resulting dictionary will be approximately `dict_size` or less, and written to `output`. /// /// # Errors /// This function returns `Ok(())` if the dictionary was created successfully, and an @@ -179,6 +179,9 @@ pub fn create_raw_dict_from_dir, W: io::Write>( /// This function reads the entire `source` into an in-memory `Vec` before building /// the dictionary. The provided reader need not be buffered, but callers should avoid /// sources too large to fit comfortably in memory. +/// +/// # API note +/// This public API returns `io::Result<()>` and propagates source/output I/O failures. pub fn create_raw_dict_from_source( mut source: R, source_size: usize, @@ -588,9 +591,13 @@ mod tests { #[test] fn finalize_raw_dict_roundtrips_with_ffi_decoder() { let sample = training_data(); + let dict_size = 4096usize; + let content_budget = + finalized_content_budget(sample.as_slice(), sample.as_slice(), dict_size) + .expect("content budget should be computable"); let raw = fastcover::train_fastcover_raw( sample.as_slice(), - 4096, + content_budget, FastCoverParams { k: 256, d: 8, @@ -601,7 +608,7 @@ mod tests { let finalized = finalize_raw_dict( raw.as_slice(), sample.as_slice(), - 4096, + dict_size, FinalizeOptions::default(), ) .expect("finalization should succeed");