From b8119290162e2f42e1e13b9b558afc6f0d4ada1b Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Tue, 6 Jan 2026 18:54:53 +0800 Subject: [PATCH 1/5] Add inverted index optimization notes --- inverted_query_optimizations.md | 68 +++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 inverted_query_optimizations.md diff --git a/inverted_query_optimizations.md b/inverted_query_optimizations.md new file mode 100644 index 00000000000..9b74b49455d --- /dev/null +++ b/inverted_query_optimizations.md @@ -0,0 +1,68 @@ +# Inverted Index Query: Potential Performance Optimizations + +Below are candidate optimizations in the current inverted-index query path (IO, CPU, and WAND/BMW logic). I focused on the hot paths in: +- `rust/lance/src/io/exec/fts.rs` +- `rust/lance-index/src/scalar/inverted/index.rs` +- `rust/lance-index/src/scalar/inverted/wand.rs` +- `rust/lance-index/src/scalar/inverted/encoding.rs` + +These are suggestions to explore; none are implemented here. + +## IO / Storage + +1. **Batch posting-list reads for multi‑term queries** + - Today `PostingListReader::posting_list` reads a single token row at a time (`read_range(token_id..token_id+1)`), and `InvertedPartition::load_posting_lists` does this for each token. For fuzzy expansions or long queries this becomes many small I/Os. Consider batching contiguous token IDs (single `read_range`) or a multi‑row read API, then slicing in memory. + +2. **Avoid double reads for phrase queries** + - For phrase queries `posting_list(..., is_phrase_query=true)` first loads postings without positions, then `read_positions` does a second read. When phrase query is requested, read `POSTING_COL + POSITION_COL` in one call and cache it with a distinct key (e.g., include `with_position` in the cache key) to avoid a second I/O. + +3. 
**Lazy / block‑level loading for very large posting lists** + - Compressed posting lists are stored as a single row containing all blocks, so WAND still loads *all* blocks even when early termination is possible. Consider storing block offsets (or each block as a row) to enable on‑demand block reads, especially for very frequent tokens where many blocks are skipped. + +4. **Partition‑level pruning before loading posting lists** + - `InvertedIndex::bm25_search` loads posting lists for all partitions before per‑partition search. If the `RowAddrMask` (prefilter) can be mapped to partitions/fragments, you can skip partitions that cannot match any row IDs, avoiding the posting‑list reads entirely. + +5. **Positions: keep compressed and decode lazily** + - `PostingIterator::positions` fully decompresses positions into a `Vec` each time. For phrase queries with many candidates, this becomes expensive. A lazy iterator over compressed blocks (or caching per‑doc positions once) could reduce I/O and CPU for position checks. + +## CPU / Scoring + +1. **Precompute query weights (IDF) once per query** + - In `InvertedIndex::bm25_search`, each candidate doc recomputes `query_weight` via `IndexBM25Scorer::num_docs_containing_token`, which scans partitions per call. For large candidate sets this is very costly. Precompute `idf` per term once (per query) and use it directly during scoring. + +2. **Avoid repeated String cloning for tokens** + - `tokens_by_position` is rebuilt per partition with cloned `String`s. `PostingIterator` also stores `String`. Consider using `Arc` or storing indices into the original `Tokens` vector to reduce allocations and cache pressure (especially with fuzzy expansions). + +3. **Phrase query position checks allocate each time** + - `Wand::check_positions` allocates `Vec` and sorts it for *every* candidate doc. You can precompute the query‑term order once, reuse a small fixed buffer, and avoid resorting on every candidate. + +4. 
**Reduce repeated decompression in tight loops** + - `PostingIterator::doc()` is called frequently, and for compressed lists it may decompress a block repeatedly in inner loops. Consider caching the last `(block_idx, block_offset)` and avoiding redundant `doc()` calls in `next`, `check_pivot_aligned`, and `check_block_max` paths. + +5. **Use token IDs for fuzzy expansions** + - `expand_fuzzy` gets FST matches as strings, then re‑maps to token IDs. The FST already stores the token ID; use it directly to avoid string materialization and lookups for large expansions. + +## WAND / BMW Algorithm + +1. **Tighten block upper bounds** + - `DocSet::calculate_block_max_scores` already multiplies by `idf * (K1+1)` when writing. In `Wand::block_max_score`, the compressed path multiplies by `(K1+1)` again. If this is redundant, bounds become too loose, reducing pruning effectiveness. Verify and remove extra scaling if safe. + +2. **Avoid full sort on every `move_preceding`** + - `Wand::move_preceding` does `postings.sort_unstable()` each time a candidate is rejected. For many terms (fuzzy OR queries) this is expensive. Maintain a heap or insertion‑sorted vector to reduce per‑candidate sort cost. + +3. **Optimize pivot selection cost** + - `find_pivot_term` recomputes a linear prefix sum of `approximate_upper_bound` for every candidate. For large term counts, keep prefix sums or update incrementally when postings move to reduce O(n) per iteration. + +4. **Enable BMW on legacy / plain lists** + - `PostingList::Plain` uses `approximate_upper_bound` for every block because there is no per‑block max. If legacy indexes are still important, consider storing block max scores there too (or compressing legacy postings on load) to enable block‑max pruning. + +5. **Use global BM25 stats earlier to prune more** + - Per‑partition WAND uses local `avgdl` and IDF. Global rescoring then discards many candidates. 
If you can precompute global IDF for query terms and supply it to per‑partition WAND (with correct upper bounds), you can reduce per‑partition candidate volume and CPU. + +## Quick Targets (likely high ROI) + +- Batch posting‑list reads for queries with many tokens (IO). +- Precompute query IDF weights once per query (CPU). +- Remove redundant scaling in `block_max_score` if verified (WAND pruning). +- Avoid per‑candidate allocation/sort in `check_positions` (phrase queries). + From bc199067a89acade734bf2365b8c470a33b9413f Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Tue, 6 Jan 2026 19:06:53 +0800 Subject: [PATCH 2/5] Optimize compressed posting skips --- rust/lance-index/src/scalar/inverted/wand.rs | 50 ++++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index 61786bdb23a..fecf1025ff8 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -242,9 +242,30 @@ impl PostingIterator { block_idx += 1; } self.index = self.index.max(block_idx * BLOCK_SIZE); - let length = self.list.len(); - while self.index < length && (self.doc().unwrap().doc_id() as u32) < least_id { - self.index += 1; + let length = list.length as usize; + while self.index < length { + let block_idx = self.index / BLOCK_SIZE; + let block_offset = self.index % BLOCK_SIZE; + let compressed = unsafe { + let compressed = self.compressed.as_ref().unwrap(); + &mut *compressed.get() + }; + if compressed.block_idx != block_idx || compressed.doc_ids.is_empty() { + let block = list.blocks.value(block_idx); + compressed.decompress(block, block_idx, list.blocks.len(), list.length); + } + let in_block = &compressed.doc_ids[block_offset..]; + let offset_in_block = in_block.partition_point(|&doc_id| doc_id < least_id); + let new_offset = block_offset + offset_in_block; + if new_offset < compressed.doc_ids.len() { + self.index = block_idx * 
BLOCK_SIZE + new_offset; + break; + } + if block_idx + 1 >= list.blocks.len() { + self.index = length; + break; + } + self.index = (block_idx + 1) * BLOCK_SIZE; } self.block_idx = self.index / BLOCK_SIZE; } @@ -952,6 +973,29 @@ mod tests { assert_eq!(result.len(), 0); // Should not panic } + #[test] + fn test_posting_iterator_next_compressed_partition_point() { + let mut docs = DocSet::default(); + let num_docs = (BLOCK_SIZE * 2 + 5) as u32; + for i in 0..num_docs { + docs.append(i as u64, 1); + } + + let doc_ids = (0..num_docs).collect::<Vec<_>>(); + let posting = generate_posting_list(doc_ids, 1.0, None, true); + let mut iter = PostingIterator::new(String::from("term"), 0, 0, posting, docs.len()); + + iter.next(10); + assert_eq!(iter.doc().unwrap().doc_id(), 10); + + let target = BLOCK_SIZE as u64 + 3; + iter.next(target); + assert_eq!(iter.doc().unwrap().doc_id(), target); + + iter.next(num_docs as u64 + 10); + assert!(iter.doc().is_none()); + } + #[test] fn test_wand_skip_to_next_block() { let mut docs = DocSet::default(); From ede5d2dbb68b3cc12e5386f5d43f2383ecd47eec Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Tue, 6 Jan 2026 19:39:42 +0800 Subject: [PATCH 3/5] Remove inverted query notes from repo --- inverted_query_optimizations.md | 68 --------------------------------- 1 file changed, 68 deletions(-) delete mode 100644 inverted_query_optimizations.md diff --git a/inverted_query_optimizations.md b/inverted_query_optimizations.md deleted file mode 100644 index 9b74b49455d..00000000000 --- a/inverted_query_optimizations.md +++ /dev/null @@ -1,68 +0,0 @@ -# Inverted Index Query: Potential Performance Optimizations - -Below are candidate optimizations in the current inverted-index query path (IO, CPU, and WAND/BMW logic).
I focused on the hot paths in: -- `rust/lance/src/io/exec/fts.rs` -- `rust/lance-index/src/scalar/inverted/index.rs` -- `rust/lance-index/src/scalar/inverted/wand.rs` -- `rust/lance-index/src/scalar/inverted/encoding.rs` - -These are suggestions to explore; none are implemented here. - -## IO / Storage - -1. **Batch posting-list reads for multi‑term queries** - - Today `PostingListReader::posting_list` reads a single token row at a time (`read_range(token_id..token_id+1)`), and `InvertedPartition::load_posting_lists` does this for each token. For fuzzy expansions or long queries this becomes many small I/Os. Consider batching contiguous token IDs (single `read_range`) or a multi‑row read API, then slicing in memory. - -2. **Avoid double reads for phrase queries** - - For phrase queries `posting_list(..., is_phrase_query=true)` first loads postings without positions, then `read_positions` does a second read. When phrase query is requested, read `POSTING_COL + POSITION_COL` in one call and cache it with a distinct key (e.g., include `with_position` in the cache key) to avoid a second I/O. - -3. **Lazy / block‑level loading for very large posting lists** - - Compressed posting lists are stored as a single row containing all blocks, so WAND still loads *all* blocks even when early termination is possible. Consider storing block offsets (or each block as a row) to enable on‑demand block reads, especially for very frequent tokens where many blocks are skipped. - -4. **Partition‑level pruning before loading posting lists** - - `InvertedIndex::bm25_search` loads posting lists for all partitions before per‑partition search. If the `RowAddrMask` (prefilter) can be mapped to partitions/fragments, you can skip partitions that cannot match any row IDs, avoiding the posting‑list reads entirely. - -5. **Positions: keep compressed and decode lazily** - - `PostingIterator::positions` fully decompresses positions into a `Vec` each time. 
For phrase queries with many candidates, this becomes expensive. A lazy iterator over compressed blocks (or caching per‑doc positions once) could reduce I/O and CPU for position checks. - -## CPU / Scoring - -1. **Precompute query weights (IDF) once per query** - - In `InvertedIndex::bm25_search`, each candidate doc recomputes `query_weight` via `IndexBM25Scorer::num_docs_containing_token`, which scans partitions per call. For large candidate sets this is very costly. Precompute `idf` per term once (per query) and use it directly during scoring. - -2. **Avoid repeated String cloning for tokens** - - `tokens_by_position` is rebuilt per partition with cloned `String`s. `PostingIterator` also stores `String`. Consider using `Arc` or storing indices into the original `Tokens` vector to reduce allocations and cache pressure (especially with fuzzy expansions). - -3. **Phrase query position checks allocate each time** - - `Wand::check_positions` allocates `Vec` and sorts it for *every* candidate doc. You can precompute the query‑term order once, reuse a small fixed buffer, and avoid resorting on every candidate. - -4. **Reduce repeated decompression in tight loops** - - `PostingIterator::doc()` is called frequently, and for compressed lists it may decompress a block repeatedly in inner loops. Consider caching the last `(block_idx, block_offset)` and avoiding redundant `doc()` calls in `next`, `check_pivot_aligned`, and `check_block_max` paths. - -5. **Use token IDs for fuzzy expansions** - - `expand_fuzzy` gets FST matches as strings, then re‑maps to token IDs. The FST already stores the token ID; use it directly to avoid string materialization and lookups for large expansions. - -## WAND / BMW Algorithm - -1. **Tighten block upper bounds** - - `DocSet::calculate_block_max_scores` already multiplies by `idf * (K1+1)` when writing. In `Wand::block_max_score`, the compressed path multiplies by `(K1+1)` again. 
If this is redundant, bounds become too loose, reducing pruning effectiveness. Verify and remove extra scaling if safe. - -2. **Avoid full sort on every `move_preceding`** - - `Wand::move_preceding` does `postings.sort_unstable()` each time a candidate is rejected. For many terms (fuzzy OR queries) this is expensive. Maintain a heap or insertion‑sorted vector to reduce per‑candidate sort cost. - -3. **Optimize pivot selection cost** - - `find_pivot_term` recomputes a linear prefix sum of `approximate_upper_bound` for every candidate. For large term counts, keep prefix sums or update incrementally when postings move to reduce O(n) per iteration. - -4. **Enable BMW on legacy / plain lists** - - `PostingList::Plain` uses `approximate_upper_bound` for every block because there is no per‑block max. If legacy indexes are still important, consider storing block max scores there too (or compressing legacy postings on load) to enable block‑max pruning. - -5. **Use global BM25 stats earlier to prune more** - - Per‑partition WAND uses local `avgdl` and IDF. Global rescoring then discards many candidates. If you can precompute global IDF for query terms and supply it to per‑partition WAND (with correct upper bounds), you can reduce per‑partition candidate volume and CPU. - -## Quick Targets (likely high ROI) - -- Batch posting‑list reads for queries with many tokens (IO). -- Precompute query IDF weights once per query (CPU). -- Remove redundant scaling in `block_max_score` if verified (WAND pruning). -- Avoid per‑candidate allocation/sort in `check_positions` (phrase queries). 
- From 28ac0b53b9443601e804dd4fe60284af795179b9 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Thu, 8 Jan 2026 16:50:08 +0800 Subject: [PATCH 4/5] Refactor posting iterator block loading --- rust/lance-index/src/scalar/inverted/wand.rs | 51 +++++++++++--------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index fecf1025ff8..2037949f5a5 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -22,7 +22,7 @@ use super::{ encoding::{decompress_positions, decompress_posting_block, decompress_posting_remainder}, query::FtsSearchParams, scorer::Scorer, - DocSet, PostingList, RawDocInfo, + CompressedPostingList, DocSet, PostingList, RawDocInfo, }; use super::{builder::BLOCK_SIZE, DocInfo}; use super::{ @@ -140,6 +140,28 @@ impl Ord for PostingIterator { } impl PostingIterator { + #[inline] + fn compressed_state_ptr(&self) -> *mut CompressedState { + debug_assert!(self.compressed.is_some()); + // this method is called very frequently, so we prefer to use `UnsafeCell` instead of + // `RefCell` to avoid the overhead of runtime borrow checking + self.compressed.as_ref().unwrap().get() + } + + #[inline] + fn ensure_compressed_block_ptr( + &self, + list: &CompressedPostingList, + block_idx: usize, + ) -> *mut CompressedState { + let compressed = unsafe { &mut *self.compressed_state_ptr() }; + if compressed.block_idx != block_idx || compressed.doc_ids.is_empty() { + let block = list.blocks.value(block_idx); + compressed.decompress(block, block_idx, list.blocks.len(), list.length); + } + compressed as *mut CompressedState + } + pub(crate) fn new( token: String, token_id: u32, @@ -194,19 +216,10 @@ impl PostingIterator { match self.list { PostingList::Compressed(ref list) => { - debug_assert!(self.compressed.is_some()); - // this method is called very frequently, so we prefer to use `UnsafeCell` instead of `RefCell` - 
// to avoid the overhead of runtime borrow checking - let compressed = unsafe { - let compressed = self.compressed.as_ref().unwrap(); - &mut *compressed.get() - }; let block_idx = self.index / BLOCK_SIZE; let block_offset = self.index % BLOCK_SIZE; - if compressed.block_idx != block_idx || compressed.doc_ids.is_empty() { - let block = list.blocks.value(block_idx); - compressed.decompress(block, block_idx, list.blocks.len(), list.length); - } + let compressed = + unsafe { &mut *self.ensure_compressed_block_ptr(list, block_idx) }; // Read from the decompressed block let doc_id = compressed.doc_ids[block_offset]; @@ -232,7 +245,7 @@ impl PostingIterator { // move to the next doc id that is greater than or equal to least_id fn next(&mut self, least_id: u64) { match self.list { - PostingList::Compressed(ref mut list) => { + PostingList::Compressed(ref list) => { debug_assert!(least_id <= u32::MAX as u64); let least_id = least_id as u32; let mut block_idx = self.index / BLOCK_SIZE; @@ -246,14 +259,8 @@ impl PostingIterator { while self.index < length { let block_idx = self.index / BLOCK_SIZE; let block_offset = self.index % BLOCK_SIZE; - let compressed = unsafe { - let compressed = self.compressed.as_ref().unwrap(); - &mut *compressed.get() - }; - if compressed.block_idx != block_idx || compressed.doc_ids.is_empty() { - let block = list.blocks.value(block_idx); - compressed.decompress(block, block_idx, list.blocks.len(), list.length); - } + let compressed = + unsafe { &mut *self.ensure_compressed_block_ptr(list, block_idx) }; let in_block = &compressed.doc_ids[block_offset..]; let offset_in_block = in_block.partition_point(|&doc_id| doc_id < least_id); let new_offset = block_offset + offset_in_block; @@ -277,7 +284,7 @@ impl PostingIterator { fn shallow_next(&mut self, least_id: u64) { match self.list { - PostingList::Compressed(ref mut list) => { + PostingList::Compressed(ref list) => { debug_assert!(least_id <= u32::MAX as u64); let least_id = least_id as u32; while 
self.block_idx + 1 < list.blocks.len() From 1289160baaf56f81c4cc8159ecf9c27c2641d74e Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Thu, 8 Jan 2026 16:55:05 +0800 Subject: [PATCH 5/5] format Signed-off-by: BubbleCal --- rust/lance-index/src/scalar/inverted/wand.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index 2037949f5a5..1e378a26eec 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -218,8 +218,7 @@ impl PostingIterator { PostingList::Compressed(ref list) => { let block_idx = self.index / BLOCK_SIZE; let block_offset = self.index % BLOCK_SIZE; - let compressed = - unsafe { &mut *self.ensure_compressed_block_ptr(list, block_idx) }; + let compressed = unsafe { &mut *self.ensure_compressed_block_ptr(list, block_idx) }; // Read from the decompressed block let doc_id = compressed.doc_ids[block_offset];