diff --git a/rust/lance-encoding/src/encodings/logical/primitive.rs b/rust/lance-encoding/src/encodings/logical/primitive.rs index 2daf74502cd..a1131cc827a 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive.rs @@ -130,7 +130,7 @@ struct ChunkMeta { } /// A mini-block chunk that has been decoded and decompressed -#[derive(Debug)] +#[derive(Debug, Clone)] struct DecodedMiniBlockChunk { rep: Option>, def: Option>, @@ -530,13 +530,42 @@ impl DecodePageTask for DecodeMiniBlockTask { // We need to keep track of the offset into repbuf/defbuf that we are building up let mut level_offset = 0; + + // Pre-compute caching needs for each chunk by checking if the next chunk is the same + let needs_caching: Vec = self + .instructions + .windows(2) + .map(|w| w[0].1.chunk_idx == w[1].1.chunk_idx) + .chain(std::iter::once(false)) // the last one never needs caching + .collect(); + + // Cache for storing decoded chunks when beneficial + let mut chunk_cache: Option<(usize, DecodedMiniBlockChunk)> = None; + // Now we iterate through each instruction and process it - for (instructions, chunk) in self.instructions.iter() { - // TODO: It's very possible that we have duplicate `buf` in self.instructions and we - // don't want to decode the buf again and again on the same thread. + for (idx, (instructions, chunk)) in self.instructions.iter().enumerate() { + let should_cache_this_chunk = needs_caching[idx]; + + let decoded_chunk = match &chunk_cache { + Some((cached_chunk_idx, ref cached_chunk)) + if *cached_chunk_idx == chunk.chunk_idx => + { + // Clone only when we have a cache hit (much cheaper than decoding) + cached_chunk.clone() + } + _ => { + // Cache miss, need to decode + let decoded = self.decode_miniblock_chunk(&chunk.data, chunk.items_in_chunk)?; + + // Only update cache if this chunk will benefit the next access + if should_cache_this_chunk { + chunk_cache = Some((chunk.chunk_idx, decoded.clone())); + } + decoded + } + }; - let DecodedMiniBlockChunk { rep, def, values } = - self.decode_miniblock_chunk(&chunk.data, chunk.items_in_chunk)?; + let DecodedMiniBlockChunk { rep, def, values } = decoded_chunk; // Our instructions tell us which rows we want to take from this chunk let row_range_start =