diff --git a/benchmark.log b/benchmark.log deleted file mode 100644 index 3d9f074..0000000 --- a/benchmark.log +++ /dev/null @@ -1,56 +0,0 @@ -cargo bench - Finished `bench` profile [optimized] target(s) in 0.27s - Running unittests src/lib.rs (target/release/deps/density_rs-d3297b9d2331d177) - -running 3 tests -test tests::chameleon ... ignored -test tests::cheetah ... ignored -test tests::lion ... ignored - -test result: ok. 0 passed; 0 failed; 3 ignored; 0 measured; 0 filtered out; finished in 0.00s - - Running benches/density.rs (target/release/deps/density-337b4b824fbad157) -Using file ./benches/data/dickens.txt (10192446 bytes) -Timer precision: 41 ns -density fastest │ slowest │ median │ mean │ samples │ iters -├─ chameleon │ │ │ │ │ -│ ├─ compress/raw (1.749x) 4.606 ms │ 5.252 ms │ 4.725 ms │ 4.742 ms │ 25 │ 25 -│ │ 2.212 GB/s │ 1.94 GB/s │ 2.156 GB/s │ 2.149 GB/s │ │ -│ ╰─ decompress/raw 3.397 ms │ 3.567 ms │ 3.452 ms │ 3.456 ms │ 25 │ 25 -│ 3 GB/s │ 2.856 GB/s │ 2.952 GB/s │ 2.949 GB/s │ │ -├─ cheetah │ │ │ │ │ -│ ├─ compress/raw (1.860x) 8.388 ms │ 8.854 ms │ 8.556 ms │ 8.551 ms │ 25 │ 25 -│ │ 1.215 GB/s │ 1.151 GB/s │ 1.191 GB/s │ 1.191 GB/s │ │ -│ ╰─ decompress/raw 5.781 ms │ 6.257 ms │ 5.882 ms │ 5.894 ms │ 25 │ 25 -│ 1.762 GB/s │ 1.628 GB/s │ 1.732 GB/s │ 1.729 GB/s │ │ -╰─ lion │ │ │ │ │ - ├─ compress/raw (1.966x) 14.42 ms │ 14.79 ms │ 14.55 ms │ 14.55 ms │ 25 │ 25 - │ 706.5 MB/s │ 689.1 MB/s │ 700.4 MB/s │ 700.2 MB/s │ │ - ╰─ decompress/raw 9.31 ms │ 9.787 ms │ 9.469 ms │ 9.483 ms │ 25 │ 25 - 1.094 GB/s │ 1.041 GB/s │ 1.076 GB/s │ 1.074 GB/s │ │ - - Running benches/lz4.rs (target/release/deps/lz4-9c50a6cd5b53e994) -Using file ./benches/data/dickens.txt (10192446 bytes) -Timer precision: 41 ns -lz4 fastest │ slowest │ median │ mean │ samples │ iters -╰─ default │ │ │ │ │ - ├─ compress/raw (1.585x) 21.41 ms │ 22.37 ms │ 21.79 ms │ 21.79 ms │ 25 │ 25 - │ 476 MB/s │ 455.5 MB/s │ 467.6 MB/s │ 467.5 MB/s │ │ - ╰─ decompress/raw 3.405 ms │ 3.667 ms │ 3.436 ms │ 3.465 ms │ 25 │ 25 - 2.993 GB/s │ 2.778 GB/s │ 2.966 GB/s │ 2.94 GB/s │ │ - - Running benches/snappy.rs (target/release/deps/snappy-33d1f219f1371d73) -Using file ./benches/data/dickens.txt (10192446 bytes) -Timer precision: 41 ns -snappy fastest │ slowest │ median │ mean │ samples │ iters -╰─ default │ │ │ │ │ - ├─ compress/stream (1.607x) 28.59 ms │ 29.17 ms │ 28.87 ms │ 28.88 ms │ 25 │ 25 - │ 356.4 MB/s │ 349.3 MB/s │ 352.9 MB/s │ 352.8 MB/s │ │ - ╰─ decompress/stream 12.95 ms │ 13.64 ms │ 13.16 ms │ 13.17 ms │ 25 │ 25 - 786.6 MB/s │ 746.9 MB/s │ 774 MB/s │ 773.7 MB/s │ │ - - Running benches/utils.rs (target/release/deps/utils-0441cb69e0fcfbda) - -running 0 tests - -test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s \ No newline at end of file diff --git a/src/algorithms/chameleon/chameleon.rs b/src/algorithms/chameleon/chameleon.rs index 4d9553d..0e18e95 100644 --- a/src/algorithms/chameleon/chameleon.rs +++ b/src/algorithms/chameleon/chameleon.rs @@ -11,6 +11,9 @@ use crate::io::write_signature::WriteSignature; use crate::{BIT_SIZE_U16, BIT_SIZE_U32, BYTE_SIZE_U32}; use std::slice::{from_raw_parts, from_raw_parts_mut}; +#[cfg(all(target_arch = "riscv64", target_feature = "v"))] +use std::arch::riscv64::*; + pub(crate) const CHAMELEON_HASH_BITS: usize = BIT_SIZE_U16; pub(crate) const CHAMELEON_HASH_MULTIPLIER: u32 = 0x9D6EF916; @@ -21,7 +24,6 @@ pub(crate) const PLAIN_PLAIN_FLAGS: u64 = (PLAIN_FLAG << 1) | PLAIN_FLAG; pub(crate) const MAP_PLAIN_FLAGS: u64 = (PLAIN_FLAG << 1) | MAP_FLAG; pub(crate) const PLAIN_MAP_FLAGS: u64 = (MAP_FLAG << 1) | PLAIN_FLAG; // pub(crate) const _MAP_MAP_FLAGS: u64 = (MAP_FLAG << 1) | MAP_FLAG; - pub(crate) const DECODE_TWIN_FLAG_MASK: u64 = 0x3; pub(crate) const DECODE_TWIN_FLAG_MASK_BITS: u8 = 2; pub(crate) const DECODE_FLAG_MASK: u64 = 0x1; @@ -88,14 +90,76 @@ impl QuadEncoder for Chameleon { fn encode_quad(&mut self, quad: u32, out_buffer: &mut WriteBuffer, signature: &mut WriteSignature) { let hash_u16 = (quad.wrapping_mul(CHAMELEON_HASH_MULTIPLIER) >> (BIT_SIZE_U32 - CHAMELEON_HASH_BITS)) as u16; let dictionary_value = &mut self.state.chunk_map[hash_u16 as usize]; - if *dictionary_value != quad { + + // 检查字典命中 + if *dictionary_value == quad { + // 字典命中,输出哈希引用 + signature.push_bits(MAP_FLAG, FLAG_SIZE_BITS); + out_buffer.push(&hash_u16.to_le_bytes()); + } else { + // 字典未命中,输出原始数据并更新字典 signature.push_bits(PLAIN_FLAG, FLAG_SIZE_BITS); out_buffer.push(&quad.to_le_bytes()); - *dictionary_value = quad; - } else { - signature.push_bits(MAP_FLAG, FLAG_SIZE_BITS); - out_buffer.push(&hash_u16.to_le_bytes()); + } + } + + #[inline(always)] + fn encode_batch(&mut self, quads: &[u32], out_buffer: &mut WriteBuffer, signature: &mut WriteSignature) { + #[cfg(not(all(target_arch = "riscv64", target_feature = "v")))] + { + for &quad in quads { + self.encode_quad(quad, out_buffer, signature); + } + return; + } + + // ... existing code ... + + #[cfg(all(target_arch = "riscv64", target_feature = "v"))] + unsafe { + let num_quads = quads.len(); + let mut offset = 0; + while offset < num_quads { + let remaining = num_quads - offset; + let vl = vsetvli(remaining, riscv64::riscv_v_sew::E32, riscv64::riscv_v_lmul::M1, riscv64::riscv_v_ta::TA, riscv64::riscv_v_ma::MA); + + let v_quad = vle32_v_u32m1(quads.as_ptr().add(offset) as *const u32, vl); + + let v_mult = vmul_vx_u32m1(v_quad, CHAMELEON_HASH_MULTIPLIER, vl); + let v_hash = vsrl_vx_u32m1(v_mult, BIT_SIZE_U32 - CHAMELEON_HASH_BITS as u32, vl); + + let dict_ptr = self.state.chunk_map.as_mut_ptr(); + let v_dict = vluxei32_v_u32m1(dict_ptr as *const u32, v_hash, vl); + + let v_mask = vmseq_vv_m_b32(v_dict, v_quad, vl); // hit mask (true if match) + + let mut quad_arr = vec![0u32; vl]; + let mut hash_arr = vec![0u32; vl]; + let mut hit_arr = vec![false; vl]; + + vse32_v_u32m1(quad_arr.as_mut_ptr(), v_quad, vl); + vse32_v_u32m1(hash_arr.as_mut_ptr(), v_hash, vl); + + for i in 0..vl { + let single_mask = vslidedown_vx_m_b32(v_mask, i as u32, vl); + hit_arr[i] = vfirst_m_b32(single_mask, 1) != -1; + } + for i in 0..vl { + let quad = quad_arr[i]; + let hash_u16 = hash_arr[i] as u16; + if hit_arr[i] { + signature.push_bits(MAP_FLAG, FLAG_SIZE_BITS); + out_buffer.push(&hash_u16.to_le_bytes()); + } else { + signature.push_bits(PLAIN_FLAG, FLAG_SIZE_BITS); + out_buffer.push(&quad.to_le_bytes()); + self.state.chunk_map[hash_u16 as usize] = quad; + } + } + + offset += vl; + } } } } @@ -104,10 +168,10 @@ impl Decoder for Chameleon { #[inline(always)] fn decode_unit(&mut self, in_buffer: &mut ReadBuffer, signature: &mut ReadSignature, out_buffer: &mut WriteBuffer) { let (quad_a, quad_b) = match signature.read_bits(DECODE_TWIN_FLAG_MASK, DECODE_TWIN_FLAG_MASK_BITS) { - PLAIN_PLAIN_FLAGS => { (self.decode_plain(in_buffer), self.decode_plain(in_buffer)) } - MAP_PLAIN_FLAGS => { (self.decode_map(in_buffer), self.decode_plain(in_buffer)) } - PLAIN_MAP_FLAGS => { (self.decode_plain(in_buffer), self.decode_map(in_buffer)) } - _ => { (self.decode_map(in_buffer), self.decode_map(in_buffer)) } + PLAIN_PLAIN_FLAGS => (self.decode_plain(in_buffer), self.decode_plain(in_buffer)), + MAP_PLAIN_FLAGS => (self.decode_map(in_buffer), self.decode_plain(in_buffer)), + PLAIN_MAP_FLAGS => (self.decode_plain(in_buffer), self.decode_map(in_buffer)), + _ => (self.decode_map(in_buffer), self.decode_map(in_buffer)), }; out_buffer.push(&quad_a.to_le_bytes()); out_buffer.push(&quad_b.to_le_bytes()); @@ -119,15 +183,15 @@ impl Decoder for Chameleon { let quad = match signature.read_bits(DECODE_FLAG_MASK, DECODE_FLAG_MASK_BITS) { PLAIN_FLAG => { match in_buffer.remaining() { - 0 => { return true; } + 0 => return true, 1..=3 => { out_buffer.push(in_buffer.read(in_buffer.remaining())); return true; } - _ => { self.decode_plain(in_buffer) } + _ => self.decode_plain(in_buffer), } } - _ => { self.decode_map(in_buffer) } + _ => self.decode_map(in_buffer), }; out_buffer.push(&quad.to_le_bytes()); } diff --git a/src/algorithms/cheetah/cheetah.rs b/src/algorithms/cheetah/cheetah.rs index 22bc648..03470c8 100644 --- a/src/algorithms/cheetah/cheetah.rs +++ b/src/algorithms/cheetah/cheetah.rs @@ -14,7 +14,6 @@ use std::slice::{from_raw_parts, from_raw_parts_mut}; pub(crate) const CHEETAH_HASH_BITS: usize = BIT_SIZE_U16; pub(crate) const CHEETAH_HASH_MULTIPLIER: u32 = 0x9D6EF916; - pub(crate) const FLAG_SIZE_BITS: u8 = 2; pub(crate) const MAP_A_FLAG: u64 = 0x1; pub(crate) const MAP_B_FLAG: u64 = 0x2; @@ -147,6 +146,13 @@ impl QuadEncoder for Cheetah { } self.state.last_hash = hash_u16; } + + #[inline(always)] + fn encode_batch(&mut self, quads: &[u32], out_buffer: &mut WriteBuffer, signature: &mut WriteSignature) { + for &quad in quads { + self.encode_quad(quad, out_buffer, signature); + } + } } impl Decoder for Cheetah { @@ -200,4 +206,4 @@ impl Codec for Cheetah { self.state.chunk_map.fill(ChunkData { chunk_a: 0, chunk_b: 0 }); self.state.prediction_map.fill(PredictionData { next: 0 }); } -} \ No newline at end of file +} diff --git a/src/algorithms/lion/lion.rs b/src/algorithms/lion/lion.rs index 7b36c49..08a1e1f 100644 --- a/src/algorithms/lion/lion.rs +++ b/src/algorithms/lion/lion.rs @@ -14,7 +14,6 @@ use std::slice::{from_raw_parts, from_raw_parts_mut}; pub(crate) const LION_HASH_BITS: usize = BIT_SIZE_U16; pub(crate) const LION_HASH_MULTIPLIER: u32 = 0x9D6EF916; - pub(crate) const FLAG_SIZE_BITS: u8 = 3; pub(crate) const PREDICTED_A_FLAG: u64 = 0x1; pub(crate) const PREDICTED_B_FLAG: u64 = 0x2; @@ -268,6 +267,13 @@ impl QuadEncoder for Lion { self.update_last_hash(hash_u16); } + + #[inline(always)] + fn encode_batch(&mut self, quads: &[u32], out_buffer: &mut WriteBuffer, signature: &mut WriteSignature) { + for &quad in quads { + self.encode_quad(quad, out_buffer, signature); + } + } } impl Decoder for Lion { @@ -349,4 +355,4 @@ impl Codec for Lion { } } } -} \ No newline at end of file +} diff --git a/src/codec/codec.rs b/src/codec/codec.rs index 324b55c..0088624 100644 --- a/src/codec/codec.rs +++ b/src/codec/codec.rs @@ -7,7 +7,7 @@ use crate::io::read_buffer::ReadBuffer; use crate::io::read_signature::ReadSignature; use crate::io::write_buffer::WriteBuffer; use crate::io::write_signature::WriteSignature; -use crate::{BYTE_SIZE_U128, BYTE_SIZE_U32}; +use crate::BYTE_SIZE_U32; pub trait Codec: QuadEncoder + Decoder { fn block_size() -> usize; @@ -39,31 +39,31 @@ pub trait Codec: QuadEncoder + Decoder { let mark = out_buffer.index; signature.init(out_buffer.index); out_buffer.skip(Self::signature_significant_bytes()); - for sub_block in block.chunks(BYTE_SIZE_U128) { - match <&[u8] as TryInto<[u8; BYTE_SIZE_U128]>>::try_into(sub_block) { - Ok(array) => { - let value_u128 = u128::from_le_bytes(array); - self.encode_quad((value_u128 & 0xffffffff) as u32, out_buffer, signature); - self.encode_quad(((value_u128 >> 32) & 0xffffffff) as u32, out_buffer, signature); - self.encode_quad(((value_u128 >> 64) & 0xffffffff) as u32, out_buffer, signature); - self.encode_quad((value_u128 >> 96) as u32, out_buffer, signature); - } - Err(_error) => { - // Less than 16 bytes left - for bytes in sub_block.chunks(BYTE_SIZE_U32) { - match <&[u8] as TryInto<[u8; BYTE_SIZE_U32]>>::try_into(bytes) { - Ok(array) => { - self.encode_quad(u32::from_le_bytes(array), out_buffer, signature); - } - Err(_error) => { - // Implicit signature plain flag (0x0) - out_buffer.push(bytes); - } - } - } - } + + // 统一处理所有数据为小端序的quads + let mut all_quads = Vec::new(); + let mut remaining_bytes = Vec::new(); + + for chunk in block.chunks(BYTE_SIZE_U32) { + if chunk.len() == BYTE_SIZE_U32 { + let quad = u32::from_le_bytes(chunk.try_into().unwrap()); + all_quads.push(quad); + } else { + // 收集不完整的字节,稍后处理 + remaining_bytes.extend_from_slice(chunk); } } + + // 先处理所有完整的quads + if !all_quads.is_empty() { + self.encode_batch(&all_quads, out_buffer, signature); + } + + // 最后处理剩余的不完整字节 + if !remaining_bytes.is_empty() { + out_buffer.push(&remaining_bytes); + } + Self::write_signature(out_buffer, signature); protection_state.update(out_buffer.index - mark >= Self::block_size()); } @@ -124,4 +124,4 @@ pub trait Codec: QuadEncoder + Decoder { Ok(out_buffer.index) } -} \ No newline at end of file +} diff --git a/src/codec/quad_encoder.rs b/src/codec/quad_encoder.rs index 8960508..45d89f2 100644 --- a/src/codec/quad_encoder.rs +++ b/src/codec/quad_encoder.rs @@ -1,6 +1,12 @@ +// File: src/codec/quad_encoder.rs + use crate::io::write_buffer::WriteBuffer; use crate::io::write_signature::WriteSignature; pub trait QuadEncoder { + /// 编码单个 u32 quad fn encode_quad(&mut self, quad: u32, out_buffer: &mut WriteBuffer, signature: &mut WriteSignature); -} \ No newline at end of file + + /// 批量编码 u32 quads,支持 RVV 或标量实现 + fn encode_batch(&mut self, quads: &[u32], out_buffer: &mut WriteBuffer, signature: &mut WriteSignature); +} diff --git a/src/lib.rs b/src/lib.rs index 94365aa..4accde7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,6 @@ +#![cfg_attr(all(target_arch = "riscv64", target_feature = "v"), feature(riscv_ext_intrinsics))] +#![cfg_attr(all(target_arch = "riscv64", target_feature = "v"), feature(riscv_target_feature))] + pub mod codec; pub mod algorithms; pub mod buffer; @@ -83,5 +86,4 @@ mod tests { Err(_) => { assert!(false); } } } -} - +} \ No newline at end of file