diff --git a/blobby/CHANGELOG.md b/blobby/CHANGELOG.md index b81c2b10..18ef58d6 100644 --- a/blobby/CHANGELOG.md +++ b/blobby/CHANGELOG.md @@ -7,8 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## 0.4.0 (unreleased) ### Changed - Edition changed to 2024 and MSRV bumped to 1.85 ([#1149]) +- Replaced iterators with `const fn` parsing ([#1187]) +- Format of the file. File header now contains total number of stored blobs. ([#1207]) [#1149]: https://github.com/RustCrypto/utils/pull/1149 +[#1187]: https://github.com/RustCrypto/utils/pull/1187 +[#1207]: https://github.com/RustCrypto/utils/pull/1207 ## 0.3.1 (2021-12-07) ### Added diff --git a/blobby/README.md b/blobby/README.md index ecc31865..2310dc35 100644 --- a/blobby/README.md +++ b/blobby/README.md @@ -7,13 +7,14 @@ ![Rust Version][rustc-image] [![Project Chat][chat-image]][chat-link] -Iterators over a simple binary blob storage. +An encoding and decoding library for the Blobby (`blb`) file format, which serves as a simple, +deduplicated storage format for a sequence of binary blobs. ## Examples ``` // We recommend to save blobby data into separate files and // use the `include_bytes!` macro -static BLOBBY_DATA: &[u8] = b"\x02\x05hello\x06world!\x01\x02 \x00\x03\x06:::\x03\x01\x00"; +static BLOBBY_DATA: &[u8; 27] = b"\x08\x02\x05hello\x06world!\x01\x02 \x00\x03\x06:::\x03\x01\x00"; static SLICE: &[&[u8]] = blobby::parse_into_slice!(BLOBBY_DATA); @@ -54,7 +55,7 @@ assert_eq!( assert_eq!(ITEMS.len(), 2); ``` -## Encoding and decoding +## Encoding and decoding utilities This crate provides encoding and decoding utilities for converting between the blobby format and text file with hex-encoded strings. @@ -97,9 +98,7 @@ This file can be converted to the Blobby format by running the following command cargo run --release --features alloc --bin encode -- /path/to/input.txt /path/to/output.blb ``` -This will create a file which can be read using `blobby::Blob2Iterator`. 
- -To see contents of an existing Blobby file you can use the following command: +To inspect contents of an existing Blobby file you can use the following command: ```sh cargo run --release --features alloc --bin decode -- /path/to/input.blb /path/to/output.txt ``` @@ -109,20 +108,22 @@ in the input file. ## Storage format Storage format represents a sequence of binary blobs. The format uses -git-flavored [variable-length quantity][0] (VLQ) for encoding unsigned +git-flavored [variable-length quantity][VLQ] (VLQ) for encoding unsigned numbers. -File starts with a number of de-duplicated blobs `d`. It followed by `d` -entries. Each entry starts with an integer `m`, immediately followed by `m` +Blobby files start with two numbers: total number of blobs in the file `n` and +number of de-duplicated blobs `d`. The numbers are followed by `d` entries. +Each entry starts with an integer `m`, immediately followed by `m` bytes representing de-duplicated binary blob. -Next follows unspecified number of entries representing sequence of stored -blobs. Each entry starts with an unsigned integer `n`. The least significant +Next follow `n` entries representing sequence of stored blobs. +Each entry starts with an unsigned integer `l`. The least significant bit of this integer is used as a flag. If the flag is equal to 0, then the number is followed by `n >> 1` bytes, representing a stored binary blob. -Otherwise the entry references a de-duplicated entry number `n >> 1`. +Otherwise the entry references a de-duplicated entry number `l >> 1` +which should be smaller than `d`. -[0]: https://en.wikipedia.org/wiki/Variable-length_quantity +[VLQ]: https://en.wikipedia.org/wiki/Variable-length_quantity ## License diff --git a/blobby/src/decode.rs b/blobby/src/decode.rs index 707b0fb6..b40d7dfb 100644 --- a/blobby/src/decode.rs +++ b/blobby/src/decode.rs @@ -50,51 +50,38 @@ macro_rules! 
try_read_vlq { }; } -pub const fn parse_dedup_len(mut data: &[u8]) -> Result { - read_vlq(&mut data) +/// Blobby file header +pub struct Header { + /// Number of blobs stored in the file + pub items_len: usize, + /// Number of deduplicated blobs + pub dedup_len: usize, } -pub const fn parse_items_len(mut data: &[u8]) -> Result { - let dedup_index_len = try_read_vlq!(data); - - let mut i = 0; - while i < dedup_index_len { - let m = try_read_vlq!(data); - let split = data.split_at(m); - data = split.1; - i += 1; - } - - let mut i = 0; - loop { - if data.is_empty() { - return Ok(i); +impl Header { + /// Parse blobby header + pub const fn parse(data: &mut &[u8]) -> Result { + match (read_vlq(data), read_vlq(data)) { + (Ok(items_len), Ok(dedup_len)) => Ok(Header { + items_len, + dedup_len, + }), + (Err(err), _) | (Ok(_), Err(err)) => Err(err), } - let val = try_read_vlq!(data); - // the least significant bit is used as a flag - let is_ref = (val & 1) != 0; - let val = val >> 1; - if is_ref { - if val >= dedup_index_len { - return Err(Error::InvalidIndex); - } - } else { - if val > data.len() { - return Err(Error::UnexpectedEnd); - } - let split = data.split_at(val); - data = split.1; - }; - i += 1; } } /// Parse blobby data into an array. 
-pub const fn parse_into_array( +pub const fn parse_into_array( mut data: &[u8], -) -> Result<[&[u8]; ITEMS], Error> { - if try_read_vlq!(data) != DEDUP_LEN { - return Err(Error::BadArrayLen); +) -> Result<[&[u8]; ITEMS_LEN], Error> { + match Header::parse(&mut data) { + Ok(header) => { + if header.items_len != ITEMS_LEN || header.dedup_len != DEDUP_LEN { + return Err(Error::BadArrayLen); + } + } + Err(err) => return Err(err), } let mut dedup_index: [&[u8]; DEDUP_LEN] = [&[]; DEDUP_LEN]; @@ -108,7 +95,7 @@ pub const fn parse_into_array( i += 1; } - let mut res: [&[u8]; ITEMS] = [&[]; ITEMS]; + let mut res: [&[u8]; ITEMS_LEN] = [&[]; ITEMS_LEN]; let mut i = 0; while i < res.len() { @@ -144,7 +131,10 @@ pub const fn parse_into_array( pub fn parse_into_vec(mut data: &[u8]) -> Result, Error> { use alloc::{vec, vec::Vec}; - let dedup_len = try_read_vlq!(data); + let Header { + items_len, + dedup_len, + } = Header::parse(&mut data)?; let mut dedup_index: Vec<&[u8]> = vec![&[]; dedup_len]; @@ -157,7 +147,6 @@ pub fn parse_into_vec(mut data: &[u8]) -> Result, Error> i += 1; } - let items_len = parse_items_len(data)?; let mut res: Vec<&[u8]> = vec![&[]; items_len]; let mut i = 0; @@ -189,20 +178,15 @@ pub fn parse_into_vec(mut data: &[u8]) -> Result, Error> #[macro_export] macro_rules! 
parse_into_slice { ($data:expr) => {{ - const ITEMS_LEN: usize = { - match $crate::parse_items_len($data) { + const HEADER: $crate::Header = { + let mut data: &[u8] = $data; + match $crate::Header::parse(&mut data) { Ok(v) => v, Err(_) => panic!("Failed to parse items len"), } }; - const DEDUP_LEN: usize = { - match $crate::parse_dedup_len($data) { - Ok(v) => v, - Err(_) => panic!("Failed to parse dedup len"), - } - }; - const ITEMS: [&[u8]; ITEMS_LEN] = { - match $crate::parse_into_array::($data) { + const ITEMS: [&[u8]; { HEADER.items_len }] = { + match $crate::parse_into_array::<{ HEADER.items_len }, { HEADER.dedup_len }>($data) { Ok(v) => v, Err(_) => panic!("Failed to parse items"), } diff --git a/blobby/src/encode.rs b/blobby/src/encode.rs index 786dbf14..0f0efcd9 100644 --- a/blobby/src/encode.rs +++ b/blobby/src/encode.rs @@ -30,59 +30,66 @@ fn encode_vlq(mut val: usize, buf: &mut [u8; 4]) -> &[u8] { /// Returns the encoded data together with a count of the number of blobs included in the index. 
/// /// The encoded file format is: -/// - count of index entries=N -/// - N x index entries, each encoded as: +/// - number of blobs in the file = N +/// - number of deduplicated index entries = M +/// - M x index entries encoded as: /// - size L of index entry (VLQ) /// - index blob contents (L bytes) -/// - repeating encoded blobs, each encoded as: +/// - N x blobs encoded as: /// - VLQ value that is either: /// - (J << 1) & 0x01: indicates this blob is index entry J /// - (L << 1) & 0x00: indicates an explicit blob of len L /// - (in the latter case) explicit blob contents (L bytes) -pub fn encode_blobs<'a, I, T>(blobs: &'a I) -> (alloc::vec::Vec, usize) +pub fn encode_blobs(blobs: &[T]) -> (alloc::vec::Vec, usize) where - &'a I: IntoIterator, - T: AsRef<[u8]> + 'a, + T: AsRef<[u8]>, { use alloc::{collections::BTreeMap, vec::Vec}; - let mut idx_map = BTreeMap::new(); + let mut dedup_map = BTreeMap::new(); blobs - .into_iter() + .iter() .map(|v| v.as_ref()) .filter(|blob| !blob.is_empty()) .for_each(|blob| { - let v = idx_map.entry(blob.as_ref()).or_insert(0); + let v = dedup_map.entry(blob.as_ref()).or_insert(0); *v += 1; }); - let mut idx: Vec<&[u8]> = idx_map + let mut dedup_list: Vec<&[u8]> = dedup_map .iter() .filter(|&(_, &v)| v > 1) .map(|(&k, _)| k) .collect(); - idx.sort_by_key(|e| { + dedup_list.sort_by_key(|e| { let k = match e { [0] => 2, [1] => 1, _ => 0, }; - (k, idx_map.get(e).unwrap()) + (k, dedup_map.get(e).unwrap()) }); - idx.reverse(); - let idx_len = idx.len(); + dedup_list.reverse(); + let idx_len = dedup_list.len(); - let rev_idx: BTreeMap<&[u8], usize> = idx.iter().enumerate().map(|(i, &e)| (e, i)).collect(); + let rev_idx: BTreeMap<&[u8], usize> = dedup_list + .iter() + .enumerate() + .map(|(i, &e)| (e, i)) + .collect(); let mut out_buf = Vec::new(); let mut buf = [0u8; 4]; - out_buf.extend_from_slice(encode_vlq(idx.len(), &mut buf)); - for e in idx { + + out_buf.extend_from_slice(encode_vlq(blobs.len(), &mut buf)); + 
out_buf.extend_from_slice(encode_vlq(dedup_list.len(), &mut buf)); + + for e in dedup_list { out_buf.extend_from_slice(encode_vlq(e.len(), &mut buf)); out_buf.extend_from_slice(e); } - for blob in blobs.into_iter().map(|v| v.as_ref()) { + for blob in blobs.iter().map(|v| v.as_ref()) { if let Some(dup_pos) = rev_idx.get(blob) { let n = (dup_pos << 1) + 1usize; out_buf.extend_from_slice(encode_vlq(n, &mut buf)); diff --git a/blobby/src/lib.rs b/blobby/src/lib.rs index b5516646..2457e35b 100644 --- a/blobby/src/lib.rs +++ b/blobby/src/lib.rs @@ -13,7 +13,7 @@ extern crate alloc; pub(crate) mod decode; #[cfg(feature = "alloc")] pub use decode::parse_into_vec; -pub use decode::{parse_dedup_len, parse_into_array, parse_items_len}; +pub use decode::{Header, parse_into_array}; #[cfg(feature = "alloc")] mod encode; diff --git a/blobby/tests/mod.rs b/blobby/tests/mod.rs new file mode 100644 index 00000000..c28d8a74 --- /dev/null +++ b/blobby/tests/mod.rs @@ -0,0 +1,31 @@ +#![cfg(feature = "alloc")] + +const ITEMS_LEN: usize = 10; +const DEDUP_LEN: usize = 3; +const TEST_BLOBS: &[&[u8]; ITEMS_LEN] = &[ + b"1", + b"12", + b"1", + b"1", + b"123", + &[42; 100_000], + &[42; 100_000], + &[13; 7_000], + &[13; 7_000], + &[13; 5_000], +]; + +#[test] +fn blobby_rondtrip_test() -> Result<(), blobby::Error> { + let (blobby_data, dedup_len) = blobby::encode_blobs(TEST_BLOBS); + assert_eq!(dedup_len, DEDUP_LEN); + assert_eq!(blobby_data.len(), 112_025); + + let decoded_blobs = blobby::parse_into_array::(&blobby_data)?; + assert_eq!(decoded_blobs, TEST_BLOBS[..]); + + let decoded_blobs = blobby::parse_into_vec(&blobby_data)?; + assert_eq!(decoded_blobs, TEST_BLOBS[..]); + + Ok(()) +}