Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
a40e317
feat(encoding): numeric compression levels (1-22) API
polaz Apr 4, 2026
bd152e0
fix(cli): align level 0 with library API and prevent i32::MIN overflow
polaz Apr 4, 2026
e209495
feat(encoding): source-size-aware parameter selection and review fixes
polaz Apr 4, 2026
625c1f0
fix(encoding): honor source-size hints in matcher and cli
polaz Apr 4, 2026
ec9a61a
test(cli): clarify --store level validation behavior
polaz Apr 4, 2026
70dd7f6
test(cli): derive parse boundary inputs from level constants
polaz Apr 4, 2026
eba6cbb
docs(encoding): align source-size docs with runtime behavior
polaz Apr 4, 2026
42e6ecf
test(roundtrip): replace brittle monotonic ratio assertion
polaz Apr 4, 2026
d7f0a79
fix(encoding): clamp zero source-size hint to minimum window
polaz Apr 4, 2026
8bb19c8
test(encoding): tighten level roundtrip checks and pool sizing
polaz Apr 4, 2026
899a8f0
docs(cli): sync numeric defaults and source-hint coverage
polaz Apr 4, 2026
c37f6cf
fix(cli): document clap i64 range bounds for level parser
polaz Apr 4, 2026
6aa7618
docs(encoding): note semver impact of level variant
polaz Apr 4, 2026
672ac26
fix(encoding): canonicalize named levels in from_level
polaz Apr 4, 2026
fec2b3f
test(encoding): cover direct Level(0/3) default equivalence
polaz Apr 4, 2026
5d8703d
docs(encoding): scope clamping guarantee to default matcher
polaz Apr 4, 2026
dcbdbd1
perf(encoding): avoid eager zero-fill in pooled block buffers
polaz Apr 4, 2026
734f748
docs(encoding): clarify source-size hint scope
polaz Apr 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ This is a **maintained fork** of [KillingSpark/zstd-rs](https://github.com/Killi
**Fork goals:**
- Dictionary compression improvements (critical for per-label trained dictionaries in LSM-tree)
- Performance parity with C zstd for decompression (currently 1.4-3.5x slower)
- Additional compression levels (Fastest, Default, Better, and Best are all implemented)
- Full numeric compression levels (0 = default, 1–22, plus negative ultra-fast levels; C zstd-compatible numbering)
- No FFI — pure `cargo build`, no cmake/system libraries (ADR-013 compliance)

**Upstream relationship:** We periodically sync with upstream but maintain an independent development trajectory focused on CoordiNode requirements.
Expand All @@ -46,6 +46,8 @@ Complete RFC 8878 implementation. Performance: ~1.4-3.5x slower than C zstd depe
- [x] Default (roughly level 3)
- [x] Better (roughly level 7)
- [x] Best (roughly level 11)
- [x] Numeric levels `0` (default), `1–22`, and negative ultra-fast levels via `CompressionLevel::from_level(n)` (C zstd compatible numbering)
- [x] Negative levels for ultra-fast compression
- [x] Checksums
- [x] Frame Content Size — `FrameCompressor` writes FCS automatically; `StreamingEncoder` requires `set_pledged_content_size()` before first write
- [x] Dictionary compression
Expand All @@ -67,7 +69,10 @@ Performance tracking lives in [BENCHMARKS.md](BENCHMARKS.md). The suite compares
use structured_zstd::encoding::{compress, compress_to_vec, CompressionLevel};

let data: &[u8] = b"hello world";
// Named level
let compressed = compress_to_vec(data, CompressionLevel::Fastest);
// Numeric level (C zstd compatible: 0 = default, 1-22, negative for ultra-fast)
let compressed = compress_to_vec(data, CompressionLevel::from_level(7));
```

```rust,no_run
Expand Down
113 changes: 87 additions & 26 deletions cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,40 @@ enum Commands {
/// Where the compressed file is written
/// [default: <INPUT_FILE>.zst]
output_file: Option<PathBuf>,
/// How thoroughly the file should be compressed. A higher level will take
/// more time to compress but result in a smaller file, and vice versa.
/// Compression level (higher = smaller, slower).
///
/// - 0: Uncompressed
/// - 1: Fastest
/// - 2: Default
/// - 3: Better (lazy2, ~zstd level 7)
/// - 4: Best (deep lazy2, ~zstd level 11)
/// Numeric levels follow the zstd convention where 0 means
/// "use the default level" (currently 3).
///
/// - 0: Default (same as 3)
/// - 1: Fastest (fast hash, ~zstd level 1)
/// - 3: Default (dfast, ~zstd level 3)
/// - 7: Better (lazy2, ~zstd level 7)
/// - 11: Best (deep lazy2, ~zstd level 11)
/// - Negative: ultra-fast modes (less compression, more speed)
/// - 12-22: progressively higher ratio (capped at lazy2 backend)
///
/// Use --store to write an uncompressed zstd frame.
#[arg(
short,
long,
value_name = "COMPRESSION_LEVEL",
default_value_t = 2,
value_parser = clap::value_parser!(u8).range(0..=4),
verbatim_doc_comment
value_name = "LEVEL",
default_value_t = CompressionLevel::DEFAULT_LEVEL,
// clap's ranged parser expects i64 bounds here (RangedI64ValueParser),
// even though the target value type is i32.
value_parser = clap::value_parser!(i32).range(
(CompressionLevel::MIN_LEVEL as i64)..=(CompressionLevel::MAX_LEVEL as i64)
),
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.
verbatim_doc_comment,
allow_hyphen_values = true,
)]
level: u8,
level: i32,
/// Write an uncompressed zstd frame (no compression).
///
/// When set, compression itself ignores `--level` and writes a raw
/// zstd frame. The CLI still validates `--level` range at parse time.
#[arg(long)]
store: bool,
Comment thread
polaz marked this conversation as resolved.
},
Decompress {
/// .zst archive to decompress
Expand Down Expand Up @@ -81,9 +98,10 @@ fn main() -> color_eyre::Result<()> {
input_file,
output_file,
level,
store,
} => {
let output_file = output_file.unwrap_or_else(|| add_extension(&input_file, ".zst"));
compress(input_file, output_file, level)?;
compress(input_file, output_file, level, store)?;
}
Commands::Decompress {
input_file,
Expand All @@ -101,15 +119,12 @@ fn main() -> color_eyre::Result<()> {
Ok(())
}

fn compress(input: PathBuf, output: PathBuf, level: u8) -> color_eyre::Result<()> {
fn compress(input: PathBuf, output: PathBuf, level: i32, store: bool) -> color_eyre::Result<()> {
info!("compressing {input:?} to {output:?}");
let compression_level: structured_zstd::encoding::CompressionLevel = match level {
0 => CompressionLevel::Uncompressed,
1 => CompressionLevel::Fastest,
2 => CompressionLevel::Default,
3 => CompressionLevel::Better,
4 => CompressionLevel::Best,
_ => return Err(eyre!("unsupported compression level: {level}")),
let compression_level = if store {
CompressionLevel::Uncompressed
} else {
CompressionLevel::from_level(level)
};
Comment thread
polaz marked this conversation as resolved.
ensure_distinct_paths(&input, &output)?;
ensure_regular_output_destination(&output)?;
Expand All @@ -128,6 +143,9 @@ fn compress(input: PathBuf, output: PathBuf, level: u8) -> color_eyre::Result<()
let compression_result: color_eyre::Result<File> = (|| {
let mut encoder =
structured_zstd::encoding::StreamingEncoder::new(temporary_output, compression_level);
encoder
.set_source_size_hint(source_size as u64)
.wrap_err("failed to configure source size hint")?;
std::io::copy(&mut encoder_input, &mut encoder).wrap_err("streaming compression failed")?;
encoder.finish().wrap_err("failed to finalize zstd frame")
})();
Expand Down Expand Up @@ -402,7 +420,50 @@ mod tests {

// A level one above MAX_LEVEL must be rejected by clap's ranged parser
// before any compression code runs.
#[test]
fn cli_rejects_unsupported_compression_level_at_parse_time() {
// NOTE(review): the next line looks like pre-diff residue from the scraped
// view (the old hard-coded "5" check); it is immediately shadowed by the
// rebuilt `parse` below, so it has no effect on the assertion — confirm
// against the real file and drop it if so.
let parse = Cli::try_parse_from(["structured-zstd", "compress", "in.bin", "--level", "5"]);
// Derive the out-of-range input from the library constant so the test
// keeps tracking MAX_LEVEL if it ever changes.
let too_high =
(structured_zstd::encoding::CompressionLevel::MAX_LEVEL as i64 + 1).to_string();
let parse = Cli::try_parse_from([
"structured-zstd",
"compress",
"in.bin",
"--level",
too_high.as_str(),
]);
assert!(parse.is_err());
}

// Negative (ultra-fast) levels must survive clap parsing; the arg is
// declared with `allow_hyphen_values` so "-3" is a value, not a flag.
#[test]
fn cli_accepts_negative_compression_level() {
    let args = ["structured-zstd", "compress", "in.bin", "--level", "-3"];
    assert!(Cli::try_parse_from(args).is_ok());
}

// A level one below MIN_LEVEL must fail at parse time; derive the input
// from the library constant so the test tracks MIN_LEVEL automatically.
#[test]
fn cli_rejects_too_negative_compression_level() {
    let below_min = structured_zstd::encoding::CompressionLevel::MIN_LEVEL as i64 - 1;
    let level_arg = below_min.to_string();
    let args = [
        "structured-zstd",
        "compress",
        "in.bin",
        "--level",
        level_arg.as_str(),
    ];
    let parsed = Cli::try_parse_from(args);
    assert!(parsed.is_err());
}

// `--store` bypasses the level at compression time, but the CLI still
// validates the `--level` range during parsing — an out-of-range value
// must error even when `--store` is present.
#[test]
fn cli_store_still_validates_level_range_at_parse_time() {
    let above_max = structured_zstd::encoding::CompressionLevel::MAX_LEVEL as i64 + 1;
    let level_arg = above_max.to_string();
    let args = [
        "structured-zstd",
        "compress",
        "in.bin",
        "--store",
        "--level",
        level_arg.as_str(),
    ];
    let parsed = Cli::try_parse_from(args);
    assert!(parsed.is_err());
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

Expand All @@ -415,7 +476,7 @@ mod tests {
let input = std::env::temp_dir().join(format!("structured-zstd-cli-alias-{unique}.txt"));
fs::write(&input, b"streaming-cli-alias-check").unwrap();

let err = compress(input.clone(), input.clone(), 2).unwrap_err();
let err = compress(input.clone(), input.clone(), 3, false).unwrap_err();
let message = format!("{err:#}");
assert!(
message.contains("input and output"),
Expand All @@ -434,7 +495,7 @@ mod tests {
fs::write(&input, b"streaming-cli-hardlink-check").unwrap();
fs::hard_link(&input, &output).unwrap();

let err = compress(input.clone(), output.clone(), 2).unwrap_err();
let err = compress(input.clone(), output.clone(), 3, false).unwrap_err();
let message = format!("{err:#}");
assert!(
message.contains("input and output"),
Expand All @@ -455,7 +516,7 @@ mod tests {
let output =
std::env::temp_dir().join(format!("structured-zstd-cli-missing-output-{unique}.zst"));

let err = compress(missing_input, output.clone(), 2).unwrap_err();
let err = compress(missing_input, output.clone(), 3, false).unwrap_err();
let message = format!("{err:#}");
assert!(
message.contains("failed to open input file"),
Expand All @@ -473,7 +534,7 @@ mod tests {
let output = dir.join("existing-dir");
fs::create_dir(&output).unwrap();

let err = compress(input, output.clone(), 2).unwrap_err();
let err = compress(input, output.clone(), 3, false).unwrap_err();
let message = format!("{err:#}");
assert!(
message.contains("not a regular file"),
Expand Down
24 changes: 18 additions & 6 deletions zstd/src/encoding/frame_compressor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,15 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
self.compressed_data.replace(compressed_data)
}

/// Hint the total uncompressed size of the next frame.
///
/// The hint is forwarded to the matcher, which can pick smaller hash
/// tables and windows for small inputs (matching C zstd's
/// source-size-class behavior). Call this before
/// [`compress`](Self::compress).
pub fn set_source_size_hint(&mut self, size: u64) {
    // Delegate directly to the matcher; it owns parameter selection.
    let matcher = &mut self.state.matcher;
    matcher.set_source_size_hint(size);
}

/// Compress the uncompressed data from the provided source as one Zstd frame and write it to the provided drain
///
/// This will repeatedly call [Read::read] on the source to fill up blocks until the source returns 0 on the read call.
Expand Down Expand Up @@ -274,7 +283,8 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
CompressionLevel::Fastest
| CompressionLevel::Default
| CompressionLevel::Better
| CompressionLevel::Best => compress_block_encoded(
| CompressionLevel::Best
| CompressionLevel::Level(_) => compress_block_encoded(
&mut self.state,
last_block,
uncompressed_data,
Expand Down Expand Up @@ -476,7 +486,7 @@ mod tests {
data.len() as u64,
"FCS mismatch for len={} level={:?}",
data.len(),
level as u8,
level,
);
// Confirm the FCS field is actually present in the header
// (not just the decoder returning 0 for absent FCS).
Expand All @@ -485,7 +495,7 @@ mod tests {
0,
"FCS field must be present for len={} level={:?}",
data.len(),
level as u8,
level,
);
// Verify C zstd can decompress
let mut decoded = Vec::new();
Expand Down Expand Up @@ -883,8 +893,10 @@ mod tests {
crate::decoding::Dictionary::from_raw_content(dict_id, b"abcdefgh".to_vec())
.expect("raw dictionary should be valid");

let payload = b"abcdefgh".repeat(512);
let matcher = MatchGeneratorDriver::new(8, 1);
// Payload must exceed the encoder's advertised window (128 KiB for
// Fastest) so the test actually exercises cross-window-boundary behavior.
let payload = b"abcdefgh".repeat(128 * 1024 / 8 + 64);
let matcher = MatchGeneratorDriver::new(1024, 1);

let mut no_dict_output = Vec::new();
let mut no_dict_compressor =
Expand All @@ -900,7 +912,7 @@ mod tests {
.expect("window size should be present");

let mut output = Vec::new();
let matcher = MatchGeneratorDriver::new(8, 1);
let matcher = MatchGeneratorDriver::new(1024, 1);
let mut compressor =
FrameCompressor::new_with_matcher(matcher, super::CompressionLevel::Fastest);
compressor
Expand Down
Loading
Loading