From 2ab62b346212497cfd0dce37969fb771e1bc1502 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Tue, 30 Dec 2025 16:05:32 +0800 Subject: [PATCH 1/5] Optimize SQ distance and u8 dot --- rust/lance-index/src/vector/sq/storage.rs | 16 +++++++++-- rust/lance-linalg/src/distance/dot.rs | 34 +++++++++++++++++++---- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/rust/lance-index/src/vector/sq/storage.rs b/rust/lance-index/src/vector/sq/storage.rs index c3ef4c96345..384087c168f 100644 --- a/rust/lance-index/src/vector/sq/storage.rs +++ b/rust/lance-index/src/vector/sq/storage.rs @@ -387,17 +387,27 @@ impl VectorStore for ScalarQuantizationStorage { fn dist_calculator_from_id(&self, id: u32) -> Self::DistanceCalculator<'_> { let (offset, chunk) = self.chunk(id); let query_sq_code = chunk.sq_code_slice(id - offset).to_vec(); + let bounds = self.quantizer.bounds(); + let scale = sq_distance_scale(&bounds); SQDistCalculator { query_sq_code, - bounds: self.quantizer.bounds(), + bounds, + scale, storage: self, } } } +#[inline] +fn sq_distance_scale(bounds: &Range) -> f32 { + let range = (bounds.end - bounds.start) as f32; + (range * range) / (255.0_f32 * 255.0_f32) +} + pub struct SQDistCalculator<'a> { query_sq_code: Vec, bounds: Range, + scale: f32, storage: &'a ScalarQuantizationStorage, } @@ -421,9 +431,11 @@ impl<'a> SQDistCalculator<'a> { panic!("Unsupported data type for ScalarQuantizationStorage"); } }; + let scale = sq_distance_scale(&bounds); Self { query_sq_code, bounds, + scale, storage, } } @@ -440,7 +452,7 @@ impl DistCalculator for SQDistCalculator<'_> { DistanceType::Dot => dot_distance(sq_code, &self.query_sq_code), _ => panic!("We should not reach here: sq distance can only be L2 or Dot"), }; - inverse_scalar_dist(std::iter::once(dist), &self.bounds)[0] + dist * self.scale } fn distance_all(&self, _k_hint: usize) -> Vec { diff --git a/rust/lance-linalg/src/distance/dot.rs b/rust/lance-linalg/src/distance/dot.rs index 92e65c1fe68..f90bfcdec2e 100644 --- a/rust/lance-linalg/src/distance/dot.rs +++ b/rust/lance-linalg/src/distance/dot.rs @@ -57,6 +57,25 @@ fn dot_scalar< sum + sums.iter().copied().sum::() } +#[inline] +fn dot_u8_scalar(from: &[u8], to: &[u8]) -> u32 { + let x_chunks = to.chunks_exact(LANES); + let y_chunks = from.chunks_exact(LANES); + let remainder_sum = x_chunks + .remainder() + .iter() + .zip(y_chunks.remainder().iter()) + .map(|(&x, &y)| x as u32 * y as u32) + .sum::(); + let mut sums = [0_u32; LANES]; + for (x, y) in x_chunks.zip(y_chunks) { + for i in 0..LANES { + sums[i] += x[i] as u32 * y[i] as u32; + } + } + remainder_sum + sums.iter().copied().sum::() +} + /// Dot product. #[inline] pub fn dot(from: &[T], to: &[T]) -> f32 { @@ -152,11 +171,7 @@ impl Dot for f64 { impl Dot for u8 { #[inline] fn dot(x: &[Self], y: &[Self]) -> f32 { - // TODO: this is not optimized for auto vectorization yet. - x.iter() - .zip(y.iter()) - .map(|(&x_i, &y_i)| x_i as u32 * y_i as u32) - .sum::() as f32 + dot_u8_scalar::<16>(x, y) as f32 } } @@ -257,6 +272,15 @@ mod tests { assert_eq!(f32::dot(&x, &y), dot(&x, &y)); + let x: Vec = (0..20).map(|v| v as u8).collect(); + let y: Vec = (100..120).map(|v| v as u8).collect(); + let expected = x + .iter() + .zip(y.iter()) + .map(|(&x, &y)| x as u32 * y as u32) + .sum::() as f32; + assert_eq!(expected, dot(&x, &y)); + let x: Vec = (0..512).map(|v| v as f32).collect(); let y: Vec = (100..612).map(|v| v as f32).collect(); From 37a6df0a88289a1fea57fb2a18a74d121d69866e Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Tue, 30 Dec 2025 18:56:42 +0800 Subject: [PATCH 2/5] Add SQ HNSW benchmark --- rust/lance-index/benches/hnsw.rs | 103 ++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 3 deletions(-) diff --git a/rust/lance-index/benches/hnsw.rs b/rust/lance-index/benches/hnsw.rs index 967b2e67b67..030a349e663 100644 --- a/rust/lance-index/benches/hnsw.rs +++ b/rust/lance-index/benches/hnsw.rs @@ -7,7 +7,8 @@ use std::{collections::HashSet, sync::Arc, time::Duration}; -use arrow_array::{types::Float32Type, FixedSizeListArray}; +use arrow_array::{types::Float32Type, FixedSizeListArray, RecordBatch, UInt64Array}; +use arrow_schema::{DataType, Field, Schema}; use criterion::{criterion_group, criterion_main, Criterion}; use lance_arrow::FixedSizeListArrayExt; use lance_index::vector::v3::subindex::IvfSubIndex; @@ -17,7 +18,11 @@ use pprof::criterion::{Output, PProfProfiler}; use lance_index::vector::{ flat::storage::FlatFloatStorage, hnsw::builder::{HnswBuildParams, HnswQueryParams, HNSW}, + quantizer::Quantization, + sq::{builder::SQBuildParams, ScalarQuantizer}, + storage::StorageBuilder, }; +use lance_core::ROW_ID_FIELD; use lance_linalg::distance::DistanceType; use lance_testing::datagen::generate_random_array_with_seed; @@ -85,6 +90,98 @@ fn bench_hnsw(c: &mut Criterion) { }); } +fn bench_hnsw_sq(c: &mut Criterion) { + const DIMENSION: usize = 128; + const TOTAL: usize = 100_000; + const SEED: [u8; 32] = [42; 32]; + const K: usize = 100; + + let rt = tokio::runtime::Runtime::new().unwrap(); + + let data = generate_random_array_with_seed::(TOTAL * DIMENSION, SEED); + let fsl = FixedSizeListArray::try_new_from_values(data, DIMENSION as i32).unwrap(); + let quantizer = ::build( + &fsl, + DistanceType::L2, + &SQBuildParams::default(), + ) + .unwrap(); + + let schema = Arc::new(Schema::new(vec![ + Field::new( + "vector", + DataType::FixedSizeList( + Field::new_list_field(DataType::Float32, true).into(), + DIMENSION as i32, + ), + true, + ), + ROW_ID_FIELD.clone(), + ])); + let row_ids = UInt64Array::from_iter_values((0..TOTAL).map(|v| v as u64)); + let batch = RecordBatch::try_new( + schema, + vec![Arc::new(fsl.clone()), Arc::new(row_ids)], + ) + .unwrap(); + let sq_storage = StorageBuilder::new("vector".to_owned(), DistanceType::L2, quantizer, None) + .unwrap() + .build(vec![batch]) + .unwrap(); + let vectors = Arc::new(sq_storage); + + let query = fsl.value(0); + c.bench_function(format!("create_hnsw_sq({TOTAL}x{DIMENSION})").as_str(), |b| { + b.to_async(&rt).iter(|| async { + let hnsw = HNSW::index_vectors(vectors.as_ref(), HnswBuildParams::default()).unwrap(); + let uids: HashSet = hnsw + .search_basic( + query.clone(), + K, + &HnswQueryParams { + ef: 300, + lower_bound: None, + upper_bound: None, + dist_q_c: 0.0, + }, + None, + vectors.as_ref(), + ) + .unwrap() + .iter() + .map(|node| node.id) + .collect(); + + assert_eq!(uids.len(), K); + }) + }); + + let hnsw = HNSW::index_vectors(vectors.as_ref(), HnswBuildParams::default()).unwrap(); + c.bench_function(format!("search_hnsw_sq{TOTAL}x{DIMENSION}").as_str(), |b| { + b.to_async(&rt).iter(|| async { + let uids: HashSet = hnsw + .search_basic( + query.clone(), + K, + &HnswQueryParams { + ef: 300, + lower_bound: None, + upper_bound: None, + dist_q_c: 0.0, + }, + None, + vectors.as_ref(), + ) + .unwrap() + .iter() + .map(|node| node.id) + .collect(); + + assert_eq!(uids.len(), K); + }) + }); +} + #[cfg(target_os = "linux")] criterion_group!( name=benches; @@ -92,7 +189,7 @@ criterion_group!( .measurement_time(Duration::from_secs(10)) .sample_size(10) .with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); - targets = bench_hnsw); + targets = bench_hnsw, bench_hnsw_sq); // Non-linux version does not support pprof. #[cfg(not(target_os = "linux"))] @@ -101,6 +198,6 @@ criterion_group!( config = Criterion::default() .measurement_time(Duration::from_secs(10)) .sample_size(10); - targets = bench_hnsw); + targets = bench_hnsw, bench_hnsw_sq); criterion_main!(benches); From a02b95de0a2e679af5d23d992d40ea066ca784ef Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Tue, 30 Dec 2025 19:25:06 +0800 Subject: [PATCH 3/5] Revert u8 dot changes --- rust/lance-linalg/src/distance/dot.rs | 34 ++++----------------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/rust/lance-linalg/src/distance/dot.rs b/rust/lance-linalg/src/distance/dot.rs index f90bfcdec2e..92e65c1fe68 100644 --- a/rust/lance-linalg/src/distance/dot.rs +++ b/rust/lance-linalg/src/distance/dot.rs @@ -57,25 +57,6 @@ fn dot_scalar< sum + sums.iter().copied().sum::() } -#[inline] -fn dot_u8_scalar(from: &[u8], to: &[u8]) -> u32 { - let x_chunks = to.chunks_exact(LANES); - let y_chunks = from.chunks_exact(LANES); - let remainder_sum = x_chunks - .remainder() - .iter() - .zip(y_chunks.remainder().iter()) - .map(|(&x, &y)| x as u32 * y as u32) - .sum::(); - let mut sums = [0_u32; LANES]; - for (x, y) in x_chunks.zip(y_chunks) { - for i in 0..LANES { - sums[i] += x[i] as u32 * y[i] as u32; - } - } - remainder_sum + sums.iter().copied().sum::() -} - /// Dot product. #[inline] pub fn dot(from: &[T], to: &[T]) -> f32 { @@ -171,7 +152,11 @@ impl Dot for f64 { impl Dot for u8 { #[inline] fn dot(x: &[Self], y: &[Self]) -> f32 { - dot_u8_scalar::<16>(x, y) as f32 + // TODO: this is not optimized for auto vectorization yet. + x.iter() + .zip(y.iter()) + .map(|(&x_i, &y_i)| x_i as u32 * y_i as u32) + .sum::() as f32 } } @@ -272,15 +257,6 @@ mod tests { assert_eq!(f32::dot(&x, &y), dot(&x, &y)); - let x: Vec = (0..20).map(|v| v as u8).collect(); - let y: Vec = (100..120).map(|v| v as u8).collect(); - let expected = x - .iter() - .zip(y.iter()) - .map(|(&x, &y)| x as u32 * y as u32) - .sum::() as f32; - assert_eq!(expected, dot(&x, &y)); - let x: Vec = (0..512).map(|v| v as f32).collect(); let y: Vec = (100..612).map(|v| v as f32).collect(); From f5ca19b211061c055ed70af8f22cbd3d8b6bc0d8 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Tue, 30 Dec 2025 19:31:36 +0800 Subject: [PATCH 4/5] Optimize SQ distance_all scaling --- rust/lance-index/src/vector/sq.rs | 9 ------ rust/lance-index/src/vector/sq/storage.rs | 37 ++++++++++++----------- 2 files changed, 19 insertions(+), 27 deletions(-) diff --git a/rust/lance-index/src/vector/sq.rs b/rust/lance-index/src/vector/sq.rs index 6ac382bb347..520ed3fc212 100644 --- a/rust/lance-index/src/vector/sq.rs +++ b/rust/lance-index/src/vector/sq.rs @@ -276,15 +276,6 @@ pub(crate) fn scale_to_u8(values: &[T::Native], bounds: &Rang .collect_vec() } -pub(crate) fn inverse_scalar_dist( - values: impl Iterator, - bounds: &Range, -) -> Vec { - let range = (bounds.end - bounds.start) as f32; - values - .map(|v| v * range.powi(2) / 255.0.powi(2)) - .collect_vec() -} #[cfg(test)] mod tests { use arrow::datatypes::{Float16Type, Float32Type, Float64Type}; diff --git a/rust/lance-index/src/vector/sq/storage.rs b/rust/lance-index/src/vector/sq/storage.rs index 384087c168f..13c916aa657 100644 --- a/rust/lance-index/src/vector/sq/storage.rs +++ b/rust/lance-index/src/vector/sq/storage.rs @@ -23,7 +23,7 @@ use serde::{Deserialize, Serialize}; use snafu::location; use std::sync::Arc; -use super::{inverse_scalar_dist, scale_to_u8, ScalarQuantizer}; +use super::{scale_to_u8, ScalarQuantizer}; use crate::frag_reuse::FragReuseIndex; use crate::{ vector::{ @@ -388,11 +388,9 @@ impl VectorStore for ScalarQuantizationStorage { let (offset, chunk) = self.chunk(id); let query_sq_code = chunk.sq_code_slice(id - offset).to_vec(); let bounds = self.quantizer.bounds(); - let scale = sq_distance_scale(&bounds); SQDistCalculator { query_sq_code, - bounds, - scale, + scale: sq_distance_scale(&bounds), storage: self, } } @@ -406,7 +404,6 @@ fn sq_distance_scale(bounds: &Range) -> f32 { pub struct SQDistCalculator<'a> { query_sq_code: Vec, - bounds: Range, scale: f32, storage: &'a ScalarQuantizationStorage, } @@ -431,11 +428,9 @@ impl<'a> SQDistCalculator<'a> { panic!("Unsupported data type for ScalarQuantizationStorage"); } }; - let scale = sq_distance_scale(&bounds); Self { query_sq_code, - bounds, - scale, + scale: sq_distance_scale(&bounds), storage, } } @@ -457,24 +452,30 @@ impl DistCalculator for SQDistCalculator<'_> { fn distance_all(&self, _k_hint: usize) -> Vec { match self.storage.distance_type { - DistanceType::L2 | DistanceType::Cosine => inverse_scalar_dist( - self.storage.chunks.iter().flat_map(|c| { + DistanceType::L2 | DistanceType::Cosine => self + .storage + .chunks + .iter() + .flat_map(|c| { c.sq_codes .values() .chunks_exact(c.dim()) .map(|sq_codes| l2_distance_uint_scalar(sq_codes, &self.query_sq_code)) - }), - &self.bounds, - ), - DistanceType::Dot => inverse_scalar_dist( - self.storage.chunks.iter().flat_map(|c| { + }) + .map(|dist| dist * self.scale) + .collect(), + DistanceType::Dot => self + .storage + .chunks + .iter() + .flat_map(|c| { c.sq_codes .values() .chunks_exact(c.dim()) .map(|sq_codes| dot_distance(sq_codes, &self.query_sq_code)) - }), - &self.bounds, - ), + }) + .map(|dist| dist * self.scale) + .collect(), _ => panic!("We should not reach here: sq distance can only be L2 or Dot"), } } From ae1ca699a6d53aa1fa965c99390095fbecedf317 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Tue, 30 Dec 2025 20:27:39 +0800 Subject: [PATCH 5/5] fmt Signed-off-by: BubbleCal --- rust/lance-index/benches/hnsw.rs | 70 ++++++++++++++++---------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/rust/lance-index/benches/hnsw.rs b/rust/lance-index/benches/hnsw.rs index 030a349e663..5339074eb37 100644 --- a/rust/lance-index/benches/hnsw.rs +++ b/rust/lance-index/benches/hnsw.rs @@ -15,6 +15,7 @@ use lance_index::vector::v3::subindex::IvfSubIndex; #[cfg(target_os = "linux")] use pprof::criterion::{Output, PProfProfiler}; +use lance_core::ROW_ID_FIELD; use lance_index::vector::{ flat::storage::FlatFloatStorage, hnsw::builder::{HnswBuildParams, HnswQueryParams, HNSW}, @@ -22,7 +23,6 @@ use lance_index::vector::{ sq::{builder::SQBuildParams, ScalarQuantizer}, storage::StorageBuilder, }; -use lance_core::ROW_ID_FIELD; use lance_linalg::distance::DistanceType; use lance_testing::datagen::generate_random_array_with_seed; @@ -100,12 +100,9 @@ fn bench_hnsw_sq(c: &mut Criterion) { let data = generate_random_array_with_seed::(TOTAL * DIMENSION, SEED); let fsl = FixedSizeListArray::try_new_from_values(data, DIMENSION as i32).unwrap(); - let quantizer = ::build( - &fsl, - DistanceType::L2, - &SQBuildParams::default(), - ) - .unwrap(); + let quantizer = + ::build(&fsl, DistanceType::L2, &SQBuildParams::default()) + .unwrap(); let schema = Arc::new(Schema::new(vec![ Field::new( @@ -119,11 +116,8 @@ fn bench_hnsw_sq(c: &mut Criterion) { ROW_ID_FIELD.clone(), ])); let row_ids = UInt64Array::from_iter_values((0..TOTAL).map(|v| v as u64)); - let batch = RecordBatch::try_new( - schema, - vec![Arc::new(fsl.clone()), Arc::new(row_ids)], - ) - .unwrap(); + let batch = + RecordBatch::try_new(schema, vec![Arc::new(fsl.clone()), Arc::new(row_ids)]).unwrap(); let sq_storage = StorageBuilder::new("vector".to_owned(), DistanceType::L2, quantizer, None) .unwrap() .build(vec![batch]) @@ -131,30 +125,34 @@ fn bench_hnsw_sq(c: &mut Criterion) { let vectors = Arc::new(sq_storage); let query = fsl.value(0); - c.bench_function(format!("create_hnsw_sq({TOTAL}x{DIMENSION})").as_str(), |b| { - b.to_async(&rt).iter(|| async { - let hnsw = HNSW::index_vectors(vectors.as_ref(), HnswBuildParams::default()).unwrap(); - let uids: HashSet = hnsw - .search_basic( - query.clone(), - K, - &HnswQueryParams { - ef: 300, - lower_bound: None, - upper_bound: None, - dist_q_c: 0.0, - }, - None, - vectors.as_ref(), - ) - .unwrap() - .iter() - .map(|node| node.id) - .collect(); - - assert_eq!(uids.len(), K); - }) - }); + c.bench_function( + format!("create_hnsw_sq({TOTAL}x{DIMENSION})").as_str(), + |b| { + b.to_async(&rt).iter(|| async { + let hnsw = + HNSW::index_vectors(vectors.as_ref(), HnswBuildParams::default()).unwrap(); + let uids: HashSet = hnsw + .search_basic( + query.clone(), + K, + &HnswQueryParams { + ef: 300, + lower_bound: None, + upper_bound: None, + dist_q_c: 0.0, + }, + None, + vectors.as_ref(), + ) + .unwrap() + .iter() + .map(|node| node.id) + .collect(); + + assert_eq!(uids.len(), K); + }) + }, + ); let hnsw = HNSW::index_vectors(vectors.as_ref(), HnswBuildParams::default()).unwrap(); c.bench_function(format!("search_hnsw_sq{TOTAL}x{DIMENSION}").as_str(), |b| {