Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions blobby/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## 0.4.0 (unreleased)
### Changed
- Edition changed to 2024 and MSRV bumped to 1.85 ([#1149])
- Replaced iterators with `const fn` parsing ([#1187])
- File format: the file header now contains the total number of stored blobs. ([#1207])

[#1149]: https://github.com/RustCrypto/utils/pull/1149
[#1187]: https://github.com/RustCrypto/utils/pull/1187
[#1207]: https://github.com/RustCrypto/utils/pull/1207

## 0.3.1 (2021-12-07)
### Added
Expand Down
27 changes: 14 additions & 13 deletions blobby/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@
![Rust Version][rustc-image]
[![Project Chat][chat-image]][chat-link]

Iterators over a simple binary blob storage.
An encoding and decoding library for the Blobby (`blb`) file format, which serves as a simple,
deduplicated storage format for a sequence of binary blobs.

## Examples
```
// We recommend to save blobby data into separate files and
// use the `include_bytes!` macro
static BLOBBY_DATA: &[u8] = b"\x02\x05hello\x06world!\x01\x02 \x00\x03\x06:::\x03\x01\x00";
static BLOBBY_DATA: &[u8; 27] = b"\x08\x02\x05hello\x06world!\x01\x02 \x00\x03\x06:::\x03\x01\x00";

static SLICE: &[&[u8]] = blobby::parse_into_slice!(BLOBBY_DATA);

Expand Down Expand Up @@ -54,7 +55,7 @@ assert_eq!(
assert_eq!(ITEMS.len(), 2);
```

## Encoding and decoding
## Encoding and decoding utilities

This crate provides encoding and decoding utilities for converting between
the blobby format and text file with hex-encoded strings.
Expand Down Expand Up @@ -97,9 +98,7 @@ This file can be converted to the Blobby format by running the following command
cargo run --release --features alloc --bin encode -- /path/to/input.txt /path/to/output.blb
```

This will create a file which can be read using `blobby::Blob2Iterator`.

To see contents of an existing Blobby file you can use the following command:
To inspect contents of an existing Blobby file you can use the following command:
```sh
cargo run --release --features alloc --bin decode -- /path/to/input.blb /path/to/output.txt
```
Expand All @@ -109,20 +108,22 @@ in the input file.
## Storage format

Storage format represents a sequence of binary blobs. The format uses
git-flavored [variable-length quantity][0] (VLQ) for encoding unsigned
git-flavored [variable-length quantity][VLQ] (VLQ) for encoding unsigned
numbers.

File starts with a number of de-duplicated blobs `d`. It followed by `d`
entries. Each entry starts with an integer `m`, immediately followed by `m`
Blobby files start with two numbers: total number of blobs in the file `n` and
number of de-duplicated blobs `d`. The numbers are followed by `d` entries.
Each entry starts with an integer `m`, immediately followed by `m`
bytes representing de-duplicated binary blob.

Next follows unspecified number of entries representing sequence of stored
blobs. Each entry starts with an unsigned integer `n`. The least significant
Next follow `n` entries representing the sequence of stored blobs.
Each entry starts with an unsigned integer `l`. The least significant
bit of this integer is used as a flag. If the flag is equal to 0, then the
number is followed by `l >> 1` bytes, representing a stored binary blob.
Otherwise the entry references a de-duplicated entry number `n >> 1`.
Otherwise the entry references a de-duplicated entry number `l >> 1`
which should be smaller than `d`.

[0]: https://en.wikipedia.org/wiki/Variable-length_quantity
[VLQ]: https://en.wikipedia.org/wiki/Variable-length_quantity

## License

Expand Down
84 changes: 34 additions & 50 deletions blobby/src/decode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,51 +50,38 @@ macro_rules! try_read_vlq {
};
}

pub const fn parse_dedup_len(mut data: &[u8]) -> Result<usize, Error> {
read_vlq(&mut data)
/// Blobby file header
pub struct Header {
/// Number of blobs stored in the file
pub items_len: usize,
/// Number of deduplicated blobs
pub dedup_len: usize,
}

pub const fn parse_items_len(mut data: &[u8]) -> Result<usize, Error> {
let dedup_index_len = try_read_vlq!(data);

let mut i = 0;
while i < dedup_index_len {
let m = try_read_vlq!(data);
let split = data.split_at(m);
data = split.1;
i += 1;
}

let mut i = 0;
loop {
if data.is_empty() {
return Ok(i);
impl Header {
/// Parse blobby header
pub const fn parse(data: &mut &[u8]) -> Result<Self, Error> {
match (read_vlq(data), read_vlq(data)) {
(Ok(items_len), Ok(dedup_len)) => Ok(Header {
items_len,
dedup_len,
}),
(Err(err), _) | (Ok(_), Err(err)) => Err(err),
}
let val = try_read_vlq!(data);
// the least significant bit is used as a flag
let is_ref = (val & 1) != 0;
let val = val >> 1;
if is_ref {
if val >= dedup_index_len {
return Err(Error::InvalidIndex);
}
} else {
if val > data.len() {
return Err(Error::UnexpectedEnd);
}
let split = data.split_at(val);
data = split.1;
};
i += 1;
}
}

/// Parse blobby data into an array.
pub const fn parse_into_array<const ITEMS: usize, const DEDUP_LEN: usize>(
pub const fn parse_into_array<const ITEMS_LEN: usize, const DEDUP_LEN: usize>(
mut data: &[u8],
) -> Result<[&[u8]; ITEMS], Error> {
if try_read_vlq!(data) != DEDUP_LEN {
return Err(Error::BadArrayLen);
) -> Result<[&[u8]; ITEMS_LEN], Error> {
match Header::parse(&mut data) {
Ok(header) => {
if header.items_len != ITEMS_LEN || header.dedup_len != DEDUP_LEN {
return Err(Error::BadArrayLen);
}
}
Err(err) => return Err(err),
}

let mut dedup_index: [&[u8]; DEDUP_LEN] = [&[]; DEDUP_LEN];
Expand All @@ -108,7 +95,7 @@ pub const fn parse_into_array<const ITEMS: usize, const DEDUP_LEN: usize>(
i += 1;
}

let mut res: [&[u8]; ITEMS] = [&[]; ITEMS];
let mut res: [&[u8]; ITEMS_LEN] = [&[]; ITEMS_LEN];

let mut i = 0;
while i < res.len() {
Expand Down Expand Up @@ -144,7 +131,10 @@ pub const fn parse_into_array<const ITEMS: usize, const DEDUP_LEN: usize>(
pub fn parse_into_vec(mut data: &[u8]) -> Result<alloc::vec::Vec<&[u8]>, Error> {
use alloc::{vec, vec::Vec};

let dedup_len = try_read_vlq!(data);
let Header {
items_len,
dedup_len,
} = Header::parse(&mut data)?;

let mut dedup_index: Vec<&[u8]> = vec![&[]; dedup_len];

Expand All @@ -157,7 +147,6 @@ pub fn parse_into_vec(mut data: &[u8]) -> Result<alloc::vec::Vec<&[u8]>, Error>
i += 1;
}

let items_len = parse_items_len(data)?;
let mut res: Vec<&[u8]> = vec![&[]; items_len];

let mut i = 0;
Expand Down Expand Up @@ -189,20 +178,15 @@ pub fn parse_into_vec(mut data: &[u8]) -> Result<alloc::vec::Vec<&[u8]>, Error>
#[macro_export]
macro_rules! parse_into_slice {
($data:expr) => {{
const ITEMS_LEN: usize = {
match $crate::parse_items_len($data) {
const HEADER: $crate::Header = {
let mut data: &[u8] = $data;
match $crate::Header::parse(&mut data) {
Ok(v) => v,
Err(_) => panic!("Failed to parse items len"),
}
};
const DEDUP_LEN: usize = {
match $crate::parse_dedup_len($data) {
Ok(v) => v,
Err(_) => panic!("Failed to parse dedup len"),
}
};
const ITEMS: [&[u8]; ITEMS_LEN] = {
match $crate::parse_into_array::<ITEMS_LEN, DEDUP_LEN>($data) {
const ITEMS: [&[u8]; { HEADER.items_len }] = {
match $crate::parse_into_array::<{ HEADER.items_len }, { HEADER.dedup_len }>($data) {
Ok(v) => v,
Err(_) => panic!("Failed to parse items"),
}
Expand Down
43 changes: 25 additions & 18 deletions blobby/src/encode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,59 +30,66 @@ fn encode_vlq(mut val: usize, buf: &mut [u8; 4]) -> &[u8] {
/// Returns the encoded data together with a count of the number of blobs included in the index.
///
/// The encoded file format is:
/// - count of index entries=N
/// - N x index entries, each encoded as:
/// - number of blobs in the file = N
/// - number of deduplicated index entries = M
/// - M x index entries encoded as:
/// - size L of index entry (VLQ)
/// - index blob contents (L bytes)
/// - repeating encoded blobs, each encoded as:
/// - N x blobs encoded as:
/// - VLQ value that is either:
/// - (J << 1) | 0x01: indicates this blob is index entry J
/// - (L << 1) | 0x00: indicates an explicit blob of len L
/// - (in the latter case) explicit blob contents (L bytes)
pub fn encode_blobs<'a, I, T>(blobs: &'a I) -> (alloc::vec::Vec<u8>, usize)
pub fn encode_blobs<T>(blobs: &[T]) -> (alloc::vec::Vec<u8>, usize)
where
&'a I: IntoIterator<Item = &'a T>,
T: AsRef<[u8]> + 'a,
T: AsRef<[u8]>,
{
use alloc::{collections::BTreeMap, vec::Vec};

let mut idx_map = BTreeMap::new();
let mut dedup_map = BTreeMap::new();
blobs
.into_iter()
.iter()
.map(|v| v.as_ref())
.filter(|blob| !blob.is_empty())
.for_each(|blob| {
let v = idx_map.entry(blob.as_ref()).or_insert(0);
let v = dedup_map.entry(blob.as_ref()).or_insert(0);
*v += 1;
});

let mut idx: Vec<&[u8]> = idx_map
let mut dedup_list: Vec<&[u8]> = dedup_map
.iter()
.filter(|&(_, &v)| v > 1)
.map(|(&k, _)| k)
.collect();
idx.sort_by_key(|e| {
dedup_list.sort_by_key(|e| {
let k = match e {
[0] => 2,
[1] => 1,
_ => 0,
};
(k, idx_map.get(e).unwrap())
(k, dedup_map.get(e).unwrap())
});
idx.reverse();
let idx_len = idx.len();
dedup_list.reverse();
let idx_len = dedup_list.len();

let rev_idx: BTreeMap<&[u8], usize> = idx.iter().enumerate().map(|(i, &e)| (e, i)).collect();
let rev_idx: BTreeMap<&[u8], usize> = dedup_list
.iter()
.enumerate()
.map(|(i, &e)| (e, i))
.collect();

let mut out_buf = Vec::new();
let mut buf = [0u8; 4];
out_buf.extend_from_slice(encode_vlq(idx.len(), &mut buf));
for e in idx {

out_buf.extend_from_slice(encode_vlq(blobs.len(), &mut buf));
out_buf.extend_from_slice(encode_vlq(dedup_list.len(), &mut buf));

for e in dedup_list {
out_buf.extend_from_slice(encode_vlq(e.len(), &mut buf));
out_buf.extend_from_slice(e);
}

for blob in blobs.into_iter().map(|v| v.as_ref()) {
for blob in blobs.iter().map(|v| v.as_ref()) {
if let Some(dup_pos) = rev_idx.get(blob) {
let n = (dup_pos << 1) + 1usize;
out_buf.extend_from_slice(encode_vlq(n, &mut buf));
Expand Down
2 changes: 1 addition & 1 deletion blobby/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ extern crate alloc;
pub(crate) mod decode;
#[cfg(feature = "alloc")]
pub use decode::parse_into_vec;
pub use decode::{parse_dedup_len, parse_into_array, parse_items_len};
pub use decode::{Header, parse_into_array};

#[cfg(feature = "alloc")]
mod encode;
Expand Down
31 changes: 31 additions & 0 deletions blobby/tests/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#![cfg(feature = "alloc")]

// Total number of blobs stored in `TEST_BLOBS`.
const ITEMS_LEN: usize = 10;
// Expected number of deduplicated blobs, i.e. distinct blobs that occur
// more than once in `TEST_BLOBS` ("1", [42; 100_000], and [13; 7_000]).
const DEDUP_LEN: usize = 3;
// Test data mixes short literals with large repeated blobs so that both the
// deduplicated-index path and the inline-blob path of the encoder are
// exercised.
const TEST_BLOBS: &[&[u8]; ITEMS_LEN] = &[
    b"1",
    b"12",
    b"1",
    b"1",
    b"123",
    &[42; 100_000],
    &[42; 100_000],
    &[13; 7_000],
    &[13; 7_000],
    &[13; 5_000],
];

/// Round-trip test: encode `TEST_BLOBS`, then decode the result with both
/// the `const`-friendly array parser and the alloc-based vector parser,
/// asserting the decoded blobs match the original input exactly.
#[test]
fn blobby_roundtrip_test() -> Result<(), blobby::Error> {
    let (blobby_data, dedup_len) = blobby::encode_blobs(TEST_BLOBS);
    assert_eq!(dedup_len, DEDUP_LEN);
    // The encoded size is deterministic for fixed input; pin it so that
    // accidental format changes are caught by this test.
    assert_eq!(blobby_data.len(), 112_025);

    let decoded_blobs = blobby::parse_into_array::<ITEMS_LEN, DEDUP_LEN>(&blobby_data)?;
    assert_eq!(decoded_blobs, TEST_BLOBS[..]);

    let decoded_blobs = blobby::parse_into_vec(&blobby_data)?;
    assert_eq!(decoded_blobs, TEST_BLOBS[..]);

    Ok(())
}