From cd4651e5943483aaefeb0af945cb7e7cfd601c0a Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Wed, 4 Dec 2024 17:32:05 +0800 Subject: [PATCH 01/13] feat: support hamming distance & binary vector Signed-off-by: BubbleCal --- rust/lance-index/benches/hnsw.rs | 4 +- rust/lance-index/src/vector/flat/index.rs | 82 ++++++- rust/lance-index/src/vector/flat/storage.rs | 201 ++++++++++++++++-- rust/lance-index/src/vector/hnsw/builder.rs | 8 +- rust/lance-index/src/vector/ivf.rs | 2 +- rust/lance-index/src/vector/quantizer.rs | 8 +- rust/lance-index/src/vector/residual.rs | 24 ++- .../lance-linalg/benches/compute_partition.rs | 13 +- rust/lance-linalg/src/distance/hamming.rs | 39 ++++ rust/lance-linalg/src/kmeans.rs | 110 +++++++--- rust/lance/examples/hnsw.rs | 4 +- rust/lance/src/dataset/scanner.rs | 24 ++- rust/lance/src/index.rs | 12 +- rust/lance/src/index/append.rs | 11 +- rust/lance/src/index/vector.rs | 70 ++++-- rust/lance/src/index/vector/builder.rs | 4 +- rust/lance/src/index/vector/ivf.rs | 14 +- rust/lance/src/index/vector/ivf/io.rs | 1 + rust/lance/src/index/vector/ivf/v2.rs | 82 ++++--- rust/lance/src/io/exec/knn.rs | 2 +- 20 files changed, 594 insertions(+), 121 deletions(-) diff --git a/rust/lance-index/benches/hnsw.rs b/rust/lance-index/benches/hnsw.rs index b51d75d7469..e250dfffd83 100644 --- a/rust/lance-index/benches/hnsw.rs +++ b/rust/lance-index/benches/hnsw.rs @@ -15,7 +15,7 @@ use lance_index::vector::v3::subindex::IvfSubIndex; use pprof::criterion::{Output, PProfProfiler}; use lance_index::vector::{ - flat::storage::FlatStorage, + flat::storage::FlatFloatStorage, hnsw::builder::{HnswBuildParams, HNSW}, }; use lance_linalg::distance::DistanceType; @@ -31,7 +31,7 @@ fn bench_hnsw(c: &mut Criterion) { let data = generate_random_array_with_seed::(TOTAL * DIMENSION, SEED); let fsl = FixedSizeListArray::try_new_from_values(data, DIMENSION as i32).unwrap(); - let vectors = Arc::new(FlatStorage::new(fsl.clone(), DistanceType::L2)); + let vectors = Arc::new(FlatFloatStorage::new(fsl.clone(), DistanceType::L2)); let query = fsl.value(0); c.bench_function( diff --git a/rust/lance-index/src/vector/flat/index.rs b/rust/lance-index/src/vector/flat/index.rs index f50e995e4cb..bc26fd5620f 100644 --- a/rust/lance-index/src/vector/flat/index.rs +++ b/rust/lance-index/src/vector/flat/index.rs @@ -28,7 +28,7 @@ use crate::{ }, }; -use super::storage::{FlatStorage, FLAT_COLUMN}; +use super::storage::{FlatBinStorage, FlatFloatStorage, FLAT_COLUMN}; /// A Flat index is any index that stores no metadata, and /// during query, it simply scans over the storage and returns the top k results @@ -166,7 +166,7 @@ impl FlatQuantizer { impl Quantization for FlatQuantizer { type BuildParams = (); type Metadata = FlatMetadata; - type Storage = FlatStorage; + type Storage = FlatFloatStorage; fn build(data: &dyn Array, distance_type: DistanceType, _: &Self::BuildParams) -> Result { let dim = data.as_fixed_size_list().value_length(); @@ -228,3 +228,81 @@ impl TryFrom for FlatQuantizer { } } } + +#[derive(Debug, Clone, DeepSizeOf)] +pub struct FlatBinQuantizer { + dim: usize, + distance_type: DistanceType, +} + +impl FlatBinQuantizer { + pub fn new(dim: usize, distance_type: DistanceType) -> Self { + Self { dim, distance_type } + } +} + +impl Quantization for FlatBinQuantizer { + type BuildParams = (); + type Metadata = FlatMetadata; + type Storage = FlatBinStorage; + + fn build(data: &dyn Array, distance_type: DistanceType, _: &Self::BuildParams) -> Result { + let dim = data.as_fixed_size_list().value_length(); + Ok(Self::new(dim as usize, distance_type)) + } + + fn code_dim(&self) -> usize { + self.dim + } + + fn column(&self) -> &'static str { + FLAT_COLUMN + } + + fn from_metadata(metadata: &Self::Metadata, distance_type: DistanceType) -> Result { + Ok(Quantizer::FlatBin(Self { + dim: metadata.dim, + distance_type, + })) + } + + fn metadata( + &self, + _: Option, + ) -> Result { + let metadata = FlatMetadata { dim: self.dim }; + Ok(serde_json::to_value(metadata)?) + } + + fn metadata_key() -> &'static str { + "flat" + } + + fn quantization_type() -> QuantizationType { + QuantizationType::Flat + } + + fn quantize(&self, vectors: &dyn Array) -> Result { + Ok(vectors.slice(0, vectors.len())) + } +} + +impl From for Quantizer { + fn from(value: FlatBinQuantizer) -> Self { + Self::FlatBin(value) + } +} + +impl TryFrom for FlatBinQuantizer { + type Error = Error; + + fn try_from(value: Quantizer) -> Result { + match value { + Quantizer::FlatBin(quantizer) => Ok(quantizer), + _ => Err(Error::invalid_input( + "quantizer is not FlatBinQuantizer", + location!(), + )), + } + } +} diff --git a/rust/lance-index/src/vector/flat/storage.rs b/rust/lance-index/src/vector/flat/storage.rs index b3bb11d02a0..05c42c3f12b 100644 --- a/rust/lance-index/src/vector/flat/storage.rs +++ b/rust/lance-index/src/vector/flat/storage.rs @@ -10,6 +10,8 @@ use crate::vector::storage::{DistCalculator, VectorStore}; use crate::vector::utils::do_prefetch; use arrow::array::AsArray; use arrow::compute::concat_batches; +use arrow::datatypes::UInt8Type; +use arrow_array::ArrowPrimitiveType; use arrow_array::{ types::{Float32Type, UInt64Type}, Array, ArrayRef, FixedSizeListArray, RecordBatch, UInt64Array, @@ -18,6 +20,7 @@ use arrow_schema::{DataType, SchemaRef}; use deepsize::DeepSizeOf; use lance_core::{Error, Result, ROW_ID}; use lance_file::reader::FileReader; +use lance_linalg::distance::hamming::hamming; use lance_linalg::distance::DistanceType; use snafu::{location, Location}; @@ -27,7 +30,7 @@ pub const FLAT_COLUMN: &str = "flat"; /// All data are stored in memory #[derive(Debug, Clone)] -pub struct FlatStorage { +pub struct FlatFloatStorage { batch: RecordBatch, distance_type: DistanceType, @@ -36,14 +39,14 @@ pub struct FlatStorage { vectors: Arc, } -impl DeepSizeOf for FlatStorage { +impl DeepSizeOf for FlatFloatStorage { fn deep_size_of_children(&self, _: &mut deepsize::Context) -> usize { self.batch.get_array_memory_size() } } #[async_trait::async_trait] -impl QuantizerStorage for FlatStorage { +impl QuantizerStorage for FlatFloatStorage { type Metadata = FlatMetadata; async fn load_partition( _: &FileReader, @@ -55,7 +58,7 @@ impl QuantizerStorage for FlatStorage { } } -impl FlatStorage { +impl FlatFloatStorage { // deprecated, use `try_from_batch` instead pub fn new(vectors: FixedSizeListArray, distance_type: DistanceType) -> Self { let row_ids = Arc::new(UInt64Array::from_iter_values(0..vectors.len() as u64)); @@ -80,8 +83,8 @@ impl FlatStorage { } } -impl VectorStore for FlatStorage { - type DistanceCalculator<'a> = FlatDistanceCal<'a>; +impl VectorStore for FlatFloatStorage { + type DistanceCalculator<'a> = FlatDistanceCal<'a, Float32Type>; fn try_from_batch(batch: RecordBatch, distance_type: DistanceType) -> Result { let row_ids = Arc::new( @@ -149,11 +152,11 @@ impl VectorStore for FlatStorage { } fn dist_calculator(&self, query: ArrayRef) -> Self::DistanceCalculator<'_> { - FlatDistanceCal::new(self.vectors.as_ref(), query, self.distance_type) + Self::DistanceCalculator::new(self.vectors.as_ref(), query, self.distance_type) } fn dist_calculator_from_id(&self, id: u32) -> Self::DistanceCalculator<'_> { - FlatDistanceCal::new( + Self::DistanceCalculator::new( self.vectors.as_ref(), self.vectors.value(id as usize), self.distance_type, @@ -176,14 +179,165 @@ impl VectorStore for FlatStorage { } } -pub struct FlatDistanceCal<'a> { - vectors: &'a [f32], - query: Vec, +/// All data are stored in memory +#[derive(Debug, Clone)] +pub struct FlatBinStorage { + batch: RecordBatch, + distance_type: DistanceType, + + // helper fields + pub(super) row_ids: Arc, + vectors: Arc, +} + +impl DeepSizeOf for FlatBinStorage { + fn deep_size_of_children(&self, _: &mut deepsize::Context) -> usize { + self.batch.get_array_memory_size() + } +} + +#[async_trait::async_trait] +impl QuantizerStorage for FlatBinStorage { + type Metadata = FlatMetadata; + async fn load_partition( + _: &FileReader, + _: std::ops::Range, + _: DistanceType, + _: &Self::Metadata, + ) -> Result { + unimplemented!("Flat will be used in new index builder which doesn't require this") + } +} + +impl FlatBinStorage { + // deprecated, use `try_from_batch` instead + pub fn new(vectors: FixedSizeListArray, distance_type: DistanceType) -> Self { + let row_ids = Arc::new(UInt64Array::from_iter_values(0..vectors.len() as u64)); + let vectors = Arc::new(vectors); + + let batch = RecordBatch::try_from_iter_with_nullable(vec![ + (ROW_ID, row_ids.clone() as ArrayRef, true), + (FLAT_COLUMN, vectors.clone() as ArrayRef, true), + ]) + .unwrap(); + + Self { + batch, + distance_type, + row_ids, + vectors, + } + } + + pub fn vector(&self, id: u32) -> ArrayRef { + self.vectors.value(id as usize) + } +} + +impl VectorStore for FlatBinStorage { + type DistanceCalculator<'a> = FlatDistanceCal<'a, UInt8Type>; + + fn try_from_batch(batch: RecordBatch, distance_type: DistanceType) -> Result { + let row_ids = Arc::new( + batch + .column_by_name(ROW_ID) + .ok_or(Error::Schema { + message: format!("column {} not found", ROW_ID), + location: location!(), + })? + .as_primitive::() + .clone(), + ); + let vectors = Arc::new( + batch + .column_by_name(FLAT_COLUMN) + .ok_or(Error::Schema { + message: "column flat not found".to_string(), + location: location!(), + })? + .as_fixed_size_list() + .clone(), + ); + Ok(Self { + batch, + distance_type, + row_ids, + vectors, + }) + } + + fn to_batches(&self) -> Result> { + Ok([self.batch.clone()].into_iter()) + } + + fn append_batch(&self, batch: RecordBatch, _vector_column: &str) -> Result { + // TODO: use chunked storage + let new_batch = concat_batches(&batch.schema(), vec![&self.batch, &batch].into_iter())?; + let mut storage = self.clone(); + storage.batch = new_batch; + Ok(storage) + } + + fn schema(&self) -> &SchemaRef { + self.batch.schema_ref() + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn len(&self) -> usize { + self.vectors.len() + } + + fn distance_type(&self) -> DistanceType { + self.distance_type + } + + fn row_id(&self, id: u32) -> u64 { + self.row_ids.values()[id as usize] + } + + fn row_ids(&self) -> impl Iterator { + self.row_ids.values().iter() + } + + fn dist_calculator(&self, query: ArrayRef) -> Self::DistanceCalculator<'_> { + Self::DistanceCalculator::new(self.vectors.as_ref(), query, self.distance_type) + } + + fn dist_calculator_from_id(&self, id: u32) -> Self::DistanceCalculator<'_> { + Self::DistanceCalculator::new( + self.vectors.as_ref(), + self.vectors.value(id as usize), + self.distance_type, + ) + } + + /// Distance between two vectors. + fn distance_between(&self, a: u32, b: u32) -> f32 { + match self.vectors.value_type() { + DataType::Float32 => { + let vector1 = self.vectors.value(a as usize); + let vector2 = self.vectors.value(b as usize); + self.distance_type.func()( + vector1.as_primitive::().values(), + vector2.as_primitive::().values(), + ) + } + _ => unimplemented!(), + } + } +} + +pub struct FlatDistanceCal<'a, T: ArrowPrimitiveType> { + vectors: &'a [T::Native], + query: Vec, dimension: usize, - distance_fn: fn(&[f32], &[f32]) -> f32, + distance_fn: fn(&[T::Native], &[T::Native]) -> f32, } -impl<'a> FlatDistanceCal<'a> { +impl<'a> FlatDistanceCal<'a, Float32Type> { fn new(vectors: &'a FixedSizeListArray, query: ArrayRef, distance_type: DistanceType) -> Self { // Gained significant performance improvement by using strong typed primitive slice. // TODO: to support other data types other than `f32`, make FlatDistanceCal a generic struct. @@ -196,14 +350,31 @@ impl<'a> FlatDistanceCal<'a> { distance_fn: distance_type.func(), } } +} + +impl<'a> FlatDistanceCal<'a, UInt8Type> { + fn new(vectors: &'a FixedSizeListArray, query: ArrayRef, _distance_type: DistanceType) -> Self { + // Gained significant performance improvement by using strong typed primitive slice. + // TODO: to support other data types other than `f32`, make FlatDistanceCal a generic struct. + let flat_array = vectors.values().as_primitive::(); + let dimension = vectors.value_length() as usize; + Self { + vectors: flat_array.values(), + query: query.as_primitive::().values().to_vec(), + dimension, + distance_fn: hamming, + } + } +} +impl<'a, T: ArrowPrimitiveType> FlatDistanceCal<'a, T> { #[inline] - fn get_vector(&self, id: u32) -> &[f32] { + fn get_vector(&self, id: u32) -> &[T::Native] { &self.vectors[self.dimension * id as usize..self.dimension * (id + 1) as usize] } } -impl DistCalculator for FlatDistanceCal<'_> { +impl DistCalculator for FlatDistanceCal<'_, T> { #[inline] fn distance(&self, id: u32) -> f32 { let vector = self.get_vector(id); diff --git a/rust/lance-index/src/vector/hnsw/builder.rs b/rust/lance-index/src/vector/hnsw/builder.rs index fc5e43a1b86..420f37fde26 100644 --- a/rust/lance-index/src/vector/hnsw/builder.rs +++ b/rust/lance-index/src/vector/hnsw/builder.rs @@ -31,7 +31,7 @@ use serde::{Deserialize, Serialize}; use super::super::graph::beam_search; use super::{select_neighbors_heuristic, HnswMetadata, HNSW_TYPE, VECTOR_ID_COL, VECTOR_ID_FIELD}; use crate::prefilter::PreFilter; -use crate::vector::flat::storage::FlatStorage; +use crate::vector::flat::storage::FlatFloatStorage; use crate::vector::graph::builder::GraphBuilderNode; use crate::vector::graph::{greedy_search, Visited}; use crate::vector::graph::{ @@ -100,7 +100,7 @@ impl HnswBuildParams { /// - `data`: A FixedSizeList to build the HNSW. /// - `distance_type`: The distance type to use. pub async fn build(self, data: ArrayRef, distance_type: DistanceType) -> Result { - let vec_store = Arc::new(FlatStorage::new( + let vec_store = Arc::new(FlatFloatStorage::new( data.as_fixed_size_list().clone(), distance_type, )); @@ -819,7 +819,7 @@ mod tests { use crate::scalar::IndexWriter; use crate::vector::v3::subindex::IvfSubIndex; use crate::vector::{ - flat::storage::FlatStorage, + flat::storage::FlatFloatStorage, graph::{DISTS_FIELD, NEIGHBORS_FIELD}, hnsw::{builder::HnswBuildParams, HNSW, VECTOR_ID_FIELD}, }; @@ -831,7 +831,7 @@ mod tests { const NUM_EDGES: usize = 20; let data = generate_random_array(TOTAL * DIM); let fsl = FixedSizeListArray::try_new_from_values(data, DIM as i32).unwrap(); - let store = Arc::new(FlatStorage::new(fsl.clone(), DistanceType::L2)); + let store = Arc::new(FlatFloatStorage::new(fsl.clone(), DistanceType::L2)); let builder = HNSW::index_vectors( store.as_ref(), HnswBuildParams::default() diff --git a/rust/lance-index/src/vector/ivf.rs b/rust/lance-index/src/vector/ivf.rs index 55bfc641732..ab3a685718b 100644 --- a/rust/lance-index/src/vector/ivf.rs +++ b/rust/lance-index/src/vector/ivf.rs @@ -54,7 +54,7 @@ pub fn new_ivf_transformer_with_quantizer( range: Option>, ) -> Result { match quantizer { - Quantizer::Flat(_) => Ok(IvfTransformer::new_flat( + Quantizer::Flat(_) | Quantizer::FlatBin(_) => Ok(IvfTransformer::new_flat( centroids, metric_type, vector_column, diff --git a/rust/lance-index/src/vector/quantizer.rs b/rust/lance-index/src/vector/quantizer.rs index 1290a0f07b2..110e438df0a 100644 --- a/rust/lance-index/src/vector/quantizer.rs +++ b/rust/lance-index/src/vector/quantizer.rs @@ -19,7 +19,7 @@ use snafu::{location, Location}; use crate::{IndexMetadata, INDEX_METADATA_SCHEMA_KEY}; -use super::flat::index::FlatQuantizer; +use super::flat::index::{FlatBinQuantizer, FlatQuantizer}; use super::pq::ProductQuantizer; use super::{ivf::storage::IvfModel, sq::ScalarQuantizer, storage::VectorStore}; @@ -98,6 +98,7 @@ impl QuantizerBuildParams for () { #[derive(Debug, Clone, DeepSizeOf)] pub enum Quantizer { Flat(FlatQuantizer), + FlatBin(FlatBinQuantizer), Product(ProductQuantizer), Scalar(ScalarQuantizer), } @@ -106,6 +107,7 @@ impl Quantizer { pub fn code_dim(&self) -> usize { match self { Self::Flat(fq) => fq.code_dim(), + Self::FlatBin(fq) => fq.code_dim(), Self::Product(pq) => pq.code_dim(), Self::Scalar(sq) => sq.code_dim(), } @@ -114,6 +116,7 @@ impl Quantizer { pub fn column(&self) -> &'static str { match self { Self::Flat(fq) => fq.column(), + Self::FlatBin(fq) => fq.column(), Self::Product(pq) => pq.column(), Self::Scalar(sq) => sq.column(), } @@ -122,6 +125,7 @@ impl Quantizer { pub fn metadata_key(&self) -> &'static str { match self { Self::Flat(_) => FlatQuantizer::metadata_key(), + Self::FlatBin(_) => FlatBinQuantizer::metadata_key(), Self::Product(_) => ProductQuantizer::metadata_key(), Self::Scalar(_) => ScalarQuantizer::metadata_key(), } @@ -130,6 +134,7 @@ impl Quantizer { pub fn quantization_type(&self) -> QuantizationType { match self { Self::Flat(_) => QuantizationType::Flat, + Self::FlatBin(_) => QuantizationType::Flat, Self::Product(_) => QuantizationType::Product, Self::Scalar(_) => QuantizationType::Scalar, } @@ -138,6 +143,7 @@ impl Quantizer { pub fn metadata(&self, args: Option) -> Result { match self { Self::Flat(fq) => fq.metadata(args), + Self::FlatBin(fq) => fq.metadata(args), Self::Product(pq) => pq.metadata(args), Self::Scalar(sq) => sq.metadata(args), } diff --git a/rust/lance-index/src/vector/residual.rs b/rust/lance-index/src/vector/residual.rs index b094e43d114..90730529b41 100644 --- a/rust/lance-index/src/vector/residual.rs +++ b/rust/lance-index/src/vector/residual.rs @@ -1,19 +1,21 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +use std::ops::{AddAssign, DivAssign}; use std::sync::Arc; +use arrow_array::ArrowNumericType; use arrow_array::{ cast::AsArray, - types::{ArrowPrimitiveType, Float16Type, Float32Type, Float64Type, UInt32Type}, + types::{Float16Type, Float32Type, Float64Type, UInt32Type}, Array, FixedSizeListArray, PrimitiveArray, RecordBatch, UInt32Array, }; use arrow_schema::DataType; use lance_arrow::{FixedSizeListArrayExt, RecordBatchExt}; use lance_core::{Error, Result}; use lance_linalg::distance::{DistanceType, Dot, L2}; -use lance_linalg::kmeans::compute_partitions; -use num_traits::Float; +use lance_linalg::kmeans::{compute_partitions, KMeansAlgoFloat}; +use num_traits::{Float, FromPrimitive, Num}; use snafu::{location, Location}; use tracing::instrument; @@ -53,29 +55,31 @@ impl ResidualTransform { } } -fn do_compute_residual( +fn do_compute_residual( centroids: &FixedSizeListArray, vectors: &FixedSizeListArray, distance_type: Option, partitions: Option<&UInt32Array>, ) -> Result where - T::Native: Float + L2 + Dot, + T::Native: Num + Float + L2 + Dot + DivAssign + AddAssign + FromPrimitive, { let dimension = centroids.value_length() as usize; - let centroids_slice = centroids.values().as_primitive::().values(); - let vectors_slice = vectors.values().as_primitive::().values(); + let centroids = centroids.values().as_primitive::(); + let vectors = vectors.values().as_primitive::(); let part_ids = partitions.cloned().unwrap_or_else(|| { - compute_partitions( - centroids_slice, - vectors_slice, + compute_partitions::>( + centroids, + vectors, dimension, distance_type.expect("provide either partitions or distance type"), ) .into() }); + let vectors_slice = vectors.values(); + let centroids_slice = centroids.values(); let residuals = vectors_slice .chunks_exact(dimension) .enumerate() diff --git a/rust/lance-linalg/benches/compute_partition.rs b/rust/lance-linalg/benches/compute_partition.rs index 7b155a9aa5b..5cdda57158a 100644 --- a/rust/lance-linalg/benches/compute_partition.rs +++ b/rust/lance-linalg/benches/compute_partition.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use arrow_array::types::Float32Type; use criterion::{criterion_group, criterion_main, Criterion}; +use lance_linalg::kmeans::KMeansAlgoFloat; use lance_linalg::{distance::MetricType, kmeans::compute_partitions}; use lance_testing::datagen::generate_random_array_with_seed; #[cfg(target_os = "linux")] @@ -24,9 +25,9 @@ fn bench_compute_partitions(c: &mut Criterion) { c.bench_function("compute_centroids(L2)", |b| { b.iter(|| { - compute_partitions( - centroids.values(), - input.values(), + compute_partitions::>( + centroids.as_ref(), + &input, DIMENSION, MetricType::L2, ) @@ -35,9 +36,9 @@ fn bench_compute_partitions(c: &mut Criterion) { c.bench_function("compute_centroids(Cosine)", |b| { b.iter(|| { - compute_partitions( - centroids.values(), - input.values(), + compute_partitions::>( + centroids.as_ref(), + &input, DIMENSION, MetricType::Cosine, ) diff --git a/rust/lance-linalg/src/distance/hamming.rs b/rust/lance-linalg/src/distance/hamming.rs index 0b94f867bc0..80e03088318 100644 --- a/rust/lance-linalg/src/distance/hamming.rs +++ b/rust/lance-linalg/src/distance/hamming.rs @@ -3,6 +3,14 @@ //! Hamming distance. +use std::sync::Arc; + +use crate::{Error, Result}; +use arrow_array::cast::AsArray; +use arrow_array::types::UInt8Type; +use arrow_array::{Array, Float32Array}; +use arrow_schema::DataType; + pub trait Hamming { /// Hamming distance between two vectors. fn hamming(x: &[u8], y: &[u8]) -> f32; @@ -44,6 +52,37 @@ pub fn hamming_scalar(x: &[u8], y: &[u8]) -> f32 { .sum::() as f32 } +pub fn hamming_distance_batch<'a>( + from: &'a [u8], + to: &'a [u8], + dimension: usize, +) -> Box + 'a> { + debug_assert_eq!(from.len(), dimension); + debug_assert_eq!(to.len() % dimension, 0); + Box::new(to.chunks_exact(dimension).map(|v| hamming(from, v))) +} + +pub fn hamming_distance_arrow_batch(from: &dyn Array, to: &dyn Array) -> Result> { + let dists = match *from.data_type() { + DataType::UInt8 => hamming_distance_batch( + from.as_primitive::().values(), + to.as_primitive::().values(), + from.len(), + ), + _ => { + return Err(Error::InvalidArgumentError(format!( + "Unsupported data type: {:?}", + from.data_type() + ))) + } + }; + + Ok(Arc::new(Float32Array::new( + dists.collect(), + to.nulls().cloned(), + ))) +} + #[cfg(test)] mod tests { use super::*; diff --git a/rust/lance-linalg/src/kmeans.rs b/rust/lance-linalg/src/kmeans.rs index 57c8f16839a..a318a92b6cc 100644 --- a/rust/lance-linalg/src/kmeans.rs +++ b/rust/lance-linalg/src/kmeans.rs @@ -28,7 +28,7 @@ use num_traits::{AsPrimitive, Float, FromPrimitive, Num, Zero}; use rand::prelude::*; use rayon::prelude::*; -use crate::distance::hamming::hamming; +use crate::distance::hamming::{hamming, hamming_distance_batch}; use crate::distance::{dot_distance_batch, DistanceType}; use crate::kernels::{argmax, argmin_value_float}; use crate::{ @@ -170,7 +170,7 @@ fn hist_stddev(k: usize, membership: &[Option]) -> f32 { .sqrt() } -trait KMeansAlgo { +pub trait KMeansAlgo { /// Recompute the membership of each vector. /// /// Parameters: @@ -194,7 +194,7 @@ trait KMeansAlgo { ) -> KMeans; } -struct KMeansAlgoFloat +pub struct KMeansAlgoFloat where T::Native: Float + Num, { @@ -596,6 +596,12 @@ pub fn kmeans_find_partitions_arrow_array( nprobes, distance_type, )?), + (DataType::UInt8, DataType::UInt8) => kmeans_find_partitions_binary( + centroids.values().as_primitive::().values(), + query.as_primitive::().values(), + nprobes, + distance_type, + ), _ => Err(ArrowError::InvalidArgumentError(format!( "Centroids and vectors have different types: {} != {}", centroids.value_type(), @@ -637,6 +643,27 @@ pub fn kmeans_find_partitions( sort_to_indices(&dists_arr, None, Some(nprobes)) } +pub fn kmeans_find_partitions_binary( + centroids: &[u8], + query: &[u8], + nprobes: usize, + distance_type: DistanceType, +) -> Result { + let dists: Vec = match distance_type { + DistanceType::Hamming => hamming_distance_batch(query, centroids, query.len()).collect(), + _ => { + panic!( + "KMeans::find_partitions: {} is not supported", + distance_type + ); + } + }; + + // TODO: use heap to just keep nprobes smallest values. + let dists_arr = Float32Array::from(dists); + sort_to_indices(&dists_arr, None, Some(nprobes)) +} + /// Compute partitions from Arrow FixedSizeListArray. pub fn compute_partitions_arrow_array( centroids: &FixedSizeListArray, @@ -649,21 +676,36 @@ pub fn compute_partitions_arrow_array( )); } match (centroids.value_type(), vectors.value_type()) { - (DataType::Float16, DataType::Float16) => Ok(compute_partitions( - centroids.values().as_primitive::().values(), - vectors.values().as_primitive::().values(), + (DataType::Float16, DataType::Float16) => Ok(compute_partitions::< + Float16Type, + KMeansAlgoFloat, + >( + centroids.values().as_primitive(), + vectors.values().as_primitive(), centroids.value_length(), distance_type, )), - (DataType::Float32, DataType::Float32) => Ok(compute_partitions( - centroids.values().as_primitive::().values(), - vectors.values().as_primitive::().values(), + (DataType::Float32, DataType::Float32) => Ok(compute_partitions::< + Float32Type, + KMeansAlgoFloat, + >( + centroids.values().as_primitive(), + vectors.values().as_primitive(), centroids.value_length(), distance_type, )), - (DataType::Float64, DataType::Float64) => Ok(compute_partitions( - centroids.values().as_primitive::().values(), - vectors.values().as_primitive::().values(), + (DataType::Float64, DataType::Float64) => Ok(compute_partitions::< + Float64Type, + KMeansAlgoFloat, + >( + centroids.values().as_primitive(), + vectors.values().as_primitive(), + centroids.value_length(), + distance_type, + )), + (DataType::UInt8, DataType::UInt8) => Ok(compute_partitions::( + centroids.values().as_primitive(), + vectors.values().as_primitive(), centroids.value_length(), distance_type, )), @@ -676,17 +718,23 @@ pub fn compute_partitions_arrow_array( /// Compute partition ID of each vector in the KMeans. /// /// If returns `None`, means the vector is not valid, i.e., all `NaN`. -pub fn compute_partitions( - centroids: &[T], - vectors: &[T], +pub fn compute_partitions>( + centroids: &PrimitiveArray, + vectors: &PrimitiveArray, dimension: impl AsPrimitive, distance_type: DistanceType, -) -> Vec> { +) -> Vec> +where + T::Native: Num, +{ let dimension = dimension.as_(); - vectors - .par_chunks(dimension) - .map(|vec| compute_partition(centroids, vec, distance_type)) - .collect::>() + let (membership, _) = K::compute_membership_and_loss( + centroids.values(), + vectors.values(), + dimension, + distance_type, + ); + membership } #[inline] @@ -752,7 +800,12 @@ mod tests { ) }) .collect::>(); - let actual = compute_partitions(centroids.values(), data.values(), DIM, DistanceType::L2); + let actual = compute_partitions::>( + ¢roids, + &data, + DIM, + DistanceType::L2, + ); assert_eq!(expected, actual); } @@ -782,11 +835,16 @@ mod tests { let centroids = generate_random_array(DIM * NUM_CENTROIDS); let values = Float32Array::from_iter_values(repeat(f32::NAN).take(DIM * K)); - compute_partitions::(centroids.values(), values.values(), DIM, DistanceType::L2) - .iter() - .for_each(|cd| { - assert!(cd.is_none()); - }); + compute_partitions::>( + ¢roids, + &values, + DIM, + DistanceType::L2, + ) + .iter() + .for_each(|cd| { + assert!(cd.is_none()); + }); } #[tokio::test] diff --git a/rust/lance/examples/hnsw.rs b/rust/lance/examples/hnsw.rs index 414038167fa..9c8b9d558ae 100644 --- a/rust/lance/examples/hnsw.rs +++ b/rust/lance/examples/hnsw.rs @@ -16,7 +16,7 @@ use futures::StreamExt; use lance::Dataset; use lance_index::vector::v3::subindex::IvfSubIndex; use lance_index::vector::{ - flat::storage::FlatStorage, + flat::storage::FlatFloatStorage, hnsw::{builder::HnswBuildParams, HNSW}, }; use lance_linalg::distance::DistanceType; @@ -79,7 +79,7 @@ async fn main() { let fsl = concat(&arrs).unwrap().as_fixed_size_list().clone(); println!("Loaded {:?} batches", fsl.len()); - let vector_store = Arc::new(FlatStorage::new(fsl.clone(), DistanceType::L2)); + let vector_store = Arc::new(FlatFloatStorage::new(fsl.clone(), DistanceType::L2)); let q = fsl.value(0); let k = 10; diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 6e0cf9c47a2..8b801c5b5f9 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -7,7 +7,9 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use arrow_array::{Array, Float32Array, Int64Array, RecordBatch}; +use arrow_array::{ + Array, ArrowPrimitiveType, Float32Array, Int64Array, PrimitiveArray, RecordBatch, +}; use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema, SchemaRef, SortOptions}; use arrow_select::concat::concat_batches; use async_recursion::async_recursion; @@ -632,7 +634,12 @@ impl Scanner { } /// Find k-nearest neighbor within the vector column. - pub fn nearest(&mut self, column: &str, q: &Float32Array, k: usize) -> Result<&mut Self> { + pub fn nearest( + &mut self, + column: &str, + q: &PrimitiveArray, + k: usize, + ) -> Result<&mut Self> { if !self.prefilter { // We can allow fragment scan if the input to nearest is a prefilter. // The fragment scan will be performed by the prefilter. @@ -662,8 +669,13 @@ impl Scanner { ))?; let key = match field.data_type() { DataType::FixedSizeList(dt, _) => { - if dt.data_type().is_floating() { - coerce_float_vector(q, FloatType::try_from(dt.data_type())?)? + if dt.data_type() == q.data_type() { + Box::new(q.clone()) + } else if dt.data_type().is_floating() && *q.data_type() == DataType::Float32 { + coerce_float_vector( + q.as_any().downcast_ref::().unwrap(), + FloatType::try_from(dt.data_type())?, + )? } else { return Err(Error::invalid_input( format!( @@ -1576,7 +1588,9 @@ impl Scanner { let schema = self.dataset.schema(); if let Some(field) = schema.field(&q.column) { match field.data_type() { - DataType::FixedSizeList(subfield, _) if subfield.data_type().is_floating() => {} + DataType::FixedSizeList(subfield, _) + if subfield.data_type().is_floating() + || *subfield.data_type() == DataType::UInt8 => {} _ => { return Err(Error::invalid_input( format!( diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index ace9906d5cc..f3543b8ba0b 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -21,7 +21,7 @@ use lance_index::scalar::expression::{ }; use lance_index::scalar::lance_format::LanceIndexStore; use lance_index::scalar::{InvertedIndexParams, ScalarIndex, ScalarIndexType}; -use lance_index::vector::flat::index::{FlatIndex, FlatQuantizer}; +use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex, FlatQuantizer}; use lance_index::vector::hnsw::HNSW; use lance_index::vector::pq::ProductQuantizer; use lance_index::vector::sq::ScalarQuantizer; @@ -757,6 +757,16 @@ impl DatasetIndexInternalExt for Dataset { .await?; Ok(Arc::new(ivf) as Arc) } + DataType::UInt8 => { + let ivf = IVFIndex::::try_new( + self.object_store.clone(), + self.indices_dir(), + uuid.to_owned(), + Arc::downgrade(&self.session), + ) + .await?; + Ok(Arc::new(ivf) as Arc) + } _ => Err(Error::Index { message: format!( "the field type {} is not supported for FLAT index", diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index 3c6a377dd5a..eb64030e103 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -152,6 +152,7 @@ pub async fn merge_indices<'a>( mod tests { use super::*; + use arrow::datatypes::Float32Type; use arrow_array::cast::AsArray; use arrow_array::types::UInt32Type; use arrow_array::{FixedSizeListArray, RecordBatch, RecordBatchIterator, UInt32Array}; @@ -225,7 +226,9 @@ mod tests { let q = array.value(5); let mut scanner = dataset.scan(); - scanner.nearest("vector", q.as_primitive(), 10).unwrap(); + scanner + .nearest("vector", q.as_primitive::(), 10) + .unwrap(); let results = scanner .try_into_stream() .await @@ -257,7 +260,9 @@ mod tests { assert_eq!(index_dirs.len(), 2); let mut scanner = dataset.scan(); - scanner.nearest("vector", q.as_primitive(), 10).unwrap(); + scanner + .nearest("vector", q.as_primitive::(), 10) + .unwrap(); let results = scanner .try_into_stream() .await @@ -385,7 +390,7 @@ mod tests { .scan() .project(&["id"]) .unwrap() - .nearest("vector", array.value(0).as_primitive(), 2) + .nearest("vector", array.value(0).as_primitive::(), 2) .unwrap() .refine(1) .try_into_batch() diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index bd05fcc6436..122889807e6 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -15,9 +15,10 @@ mod utils; #[cfg(test)] mod fixture_test; +use arrow_schema::DataType; use builder::IvfIndexBuilder; use lance_file::reader::FileReader; -use lance_index::vector::flat::index::{FlatIndex, FlatQuantizer}; +use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex, FlatQuantizer}; use lance_index::vector::hnsw::HNSW; use lance_index::vector::ivf::storage::IvfModel; use lance_index::vector::pq::ProductQuantizer; @@ -252,18 +253,61 @@ pub(crate) async fn build_vector_index( let temp_dir_path = Path::from_filesystem_path(temp_dir.path())?; let shuffler = IvfShuffler::new(temp_dir_path, ivf_params.num_partitions); if is_ivf_flat(stages) { - IvfIndexBuilder::::new( - dataset.clone(), - column.to_owned(), - dataset.indices_dir().child(uuid), - params.metric_type, - Box::new(shuffler), - Some(ivf_params.clone()), - Some(()), - (), - )? - .build() - .await?; + let data_type = dataset + .schema() + .field(column) + .ok_or(Error::Schema { + message: format!("Column {} not found in schema", column), + location: location!(), + })? + .data_type(); + match data_type { + DataType::FixedSizeList(f, _) => match f.data_type() { + DataType::Float16 | DataType::Float32 | DataType::Float64 => { + IvfIndexBuilder::::new( + dataset.clone(), + column.to_owned(), + dataset.indices_dir().child(uuid), + params.metric_type, + Box::new(shuffler), + Some(ivf_params.clone()), + Some(()), + (), + )? + .build() + .await?; + } + DataType::UInt8 => { + IvfIndexBuilder::::new( + dataset.clone(), + column.to_owned(), + dataset.indices_dir().child(uuid), + params.metric_type, + Box::new(shuffler), + Some(ivf_params.clone()), + Some(()), + (), + )? + .build() + .await?; + } + _ => { + return Err(Error::Index { + message: format!( + "Build Vector Index: invalid data type: {:?}", + f.data_type() + ), + location: location!(), + }); + } + }, + _ => { + return Err(Error::Index { + message: format!("Build Vector Index: invalid data type: {:?}", data_type), + location: location!(), + }); + } + } } else if is_ivf_pq(stages) { let len = stages.len(); let StageParams::PQ(pq_params) = &stages[len - 1] else { diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index c4c22265c4a..c79fcf45b45 100644 --- a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -14,7 +14,7 @@ use lance_core::{Error, Result, ROW_ID_FIELD}; use lance_encoding::decoder::{DecoderPlugins, FilterExpression}; use lance_file::v2::reader::FileReaderOptions; use lance_file::v2::{reader::FileReader, writer::FileWriter}; -use lance_index::vector::flat::storage::FlatStorage; +use lance_index::vector::flat::storage::FlatFloatStorage; use lance_index::vector::ivf::storage::IvfModel; use lance_index::vector::quantizer::{ QuantizationMetadata, QuantizationType, QuantizerBuildParams, @@ -434,7 +434,7 @@ impl IvfIndexBuilde // build the sub index, with in-memory storage let index_len = { let vectors = batch[&self.column].as_fixed_size_list(); - let flat_storage = FlatStorage::new(vectors.clone(), self.distance_type); + let flat_storage = FlatFloatStorage::new(vectors.clone(), self.distance_type); let sub_index = S::index_vectors(&flat_storage, self.sub_index_params.clone())?; let path = self.temp_dir.child(format!("index_part{}", part_id)); let writer = object_store.create(&path).await?; diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index 25dfee8b364..8b7fd6b62ac 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -9,6 +9,7 @@ use std::{ sync::{Arc, Weak}, }; +use arrow::datatypes::UInt8Type; use arrow_arith::numeric::sub; use arrow_array::{ cast::{as_struct_array, AsArray}, @@ -638,6 +639,7 @@ async fn optimize_ivf_hnsw_indices( // Write the metadata of quantizer let quantization_metadata = match &quantizer { Quantizer::Flat(_) => None, + Quantizer::FlatBin(_) => None, Quantizer::Product(pq) => { let codebook_tensor = pb::Tensor::try_from(&pq.codebook)?; let codebook_pos = aux_writer.tell().await?; @@ -1604,6 +1606,7 @@ async fn write_ivf_hnsw_file( // For PQ, we need to store the codebook let quantization_metadata = match &quantizer { Quantizer::Flat(_) => None, + Quantizer::FlatBin(_) => None, Quantizer::Product(pq) => { let codebook_tensor = pb::Tensor::try_from(&pq.codebook)?; let codebook_pos = aux_writer.tell().await?; @@ -1731,6 +1734,15 @@ async fn train_ivf_model( ) .await } + (DataType::UInt8, DistanceType::Hamming) => { + do_train_ivf_model::( + values.as_primitive::().values(), + dim, + distance_type, + params, + ) + .await + } _ => Err(Error::Index { message: "Unsupported data type".to_string(), location: location!(), @@ -2750,7 +2762,7 @@ mod tests { true, )])); - let arr = generate_random_array_with_range(1000 * DIM, 1000.0..1001.0); + let arr = generate_random_array_with_range::(1000 * DIM, 1000.0..1001.0); let fsl = FixedSizeListArray::try_new_from_values(arr.clone(), DIM as i32).unwrap(); let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(fsl)]).unwrap(); let batches = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema.clone()); diff --git a/rust/lance/src/index/vector/ivf/io.rs b/rust/lance/src/index/vector/ivf/io.rs index 8290f88ab26..3fe89b74a82 100644 --- a/rust/lance/src/index/vector/ivf/io.rs +++ b/rust/lance/src/index/vector/ivf/io.rs @@ -320,6 +320,7 @@ pub(super) async fn write_hnsw_quantization_index_partitions( let code_column = match &quantizer { Quantizer::Flat(_) => None, + Quantizer::FlatBin(_) => None, Quantizer::Product(pq) => Some(pq.column()), Quantizer::Scalar(_) => None, }; diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index 727f50ecea7..a20282842cf 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -518,9 +518,11 @@ mod tests { use std::collections::HashSet; use std::{collections::HashMap, ops::Range, sync::Arc}; - use arrow::datatypes::UInt64Type; + use arrow::datatypes::{UInt64Type, UInt8Type}; use arrow::{array::AsArray, datatypes::Float32Type}; - use arrow_array::{Array, FixedSizeListArray, RecordBatch, RecordBatchIterator}; + use arrow_array::{ + Array, ArrowPrimitiveType, FixedSizeListArray, RecordBatch, RecordBatchIterator, + }; use arrow_schema::{DataType, Field, Schema}; use lance_arrow::FixedSizeListArrayExt; @@ -531,8 +533,10 @@ mod tests { use lance_index::vector::sq::builder::SQBuildParams; use lance_index::vector::DIST_COL; use lance_index::{DatasetIndexExt, IndexType}; + use lance_linalg::distance::hamming::hamming; use lance_linalg::distance::DistanceType; use lance_testing::datagen::generate_random_array_with_range; + use rand::distributions::uniform::SampleUniform; use rstest::rstest; use tempfile::tempdir; @@ -540,27 +544,32 @@ mod tests { const DIM: usize = 32; - async fn generate_test_dataset( + async fn generate_test_dataset( test_uri: &str, - range: Range, - ) -> (Dataset, Arc) { - let vectors = generate_random_array_with_range::(1000 * DIM, range); + range: Range, + ) -> (Dataset, Arc) + where + T::Native: SampleUniform, + { + let vectors = generate_random_array_with_range::(1000 * DIM, range); let metadata: HashMap = vec![("test".to_string(), "ivf_pq".to_string())] .into_iter() .collect(); - + let data_type = vectors.data_type().clone(); let schema: Arc<_> = Schema::new(vec![Field::new( "vector", DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Float32, true)), + Arc::new(Field::new("item", data_type.clone(), true)), DIM as i32, ), true, )]) .with_metadata(metadata) .into(); - let fsl = FixedSizeListArray::try_new_from_values(vectors, DIM as i32).unwrap(); - let fsl = lance_linalg::kernels::normalize_fsl(&fsl).unwrap(); + let mut fsl = FixedSizeListArray::try_new_from_values(vectors, DIM as i32).unwrap(); + if data_type != DataType::UInt8 { + fsl = lance_linalg::kernels::normalize_fsl(&fsl).unwrap(); + } let array = Arc::new(fsl); let batch = RecordBatch::try_new(schema.clone(), vec![array.clone()]).unwrap(); @@ -572,16 +581,22 @@ mod tests { #[allow(dead_code)] fn ground_truth( vectors: &FixedSizeListArray, - query: &[f32], + query: &dyn Array, k: usize, distance_type: DistanceType, ) -> Vec<(f32, u64)> { let mut dists = vec![]; for i in 0..vectors.len() { - let dist = distance_type.func()( - query, - vectors.value(i).as_primitive::().values(), - ); + let dist = match distance_type { + DistanceType::Hamming => hamming( + query.as_primitive::().values(), + vectors.value(i).as_primitive::().values(), + ), + _ => distance_type.func()( + query.as_primitive::().values(), + vectors.value(i).as_primitive::().values(), + ), + }; dists.push((dist, i as u64)); } dists.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); @@ -590,12 +605,31 @@ mod tests { } async fn test_index(params: VectorIndexParams, nlist: usize, recall_requirement: f32) { + match params.metric_type { + DistanceType::Hamming => { + test_index_impl::(params, nlist, recall_requirement, 0..2).await; + } + _ => { + test_index_impl::(params, nlist, recall_requirement, 0.0..1.0).await; + } + } + } + + async fn test_index_impl( + params: VectorIndexParams, + nlist: usize, + recall_requirement: f32, + range: Range, + ) where + T::Native: SampleUniform, + { let test_dir = tempdir().unwrap(); let test_uri = test_dir.path().to_str().unwrap(); - let (mut dataset, vectors) = generate_test_dataset(test_uri, 0.0..1.0).await; + let (mut dataset, vectors) = generate_test_dataset::(test_uri, range).await; + let vector_column = "vector"; dataset - .create_index(&["vector"], IndexType::Vector, None, ¶ms, true) + .create_index(&[vector_column], IndexType::Vector, None, ¶ms, true) .await .unwrap(); @@ -603,7 +637,7 @@ mod tests { let k = 100; let result = dataset .scan() - .nearest("vector", query.as_primitive::(), k) + .nearest(vector_column, query.as_primitive::(), k) .unwrap() .nprobs(nlist) .with_row_id() @@ -625,12 +659,7 @@ mod tests { .collect::>(); let row_ids = results.iter().map(|(_, id)| *id).collect::>(); - let gt = ground_truth( - &vectors, - query.as_primitive::().values(), - k, - params.metric_type, - ); + let gt = ground_truth(&vectors, query.as_ref(), k, params.metric_type); let gt_set = gt.iter().map(|r| r.1).collect::>(); let recall = row_ids.intersection(>_set).count() as f32 / k as f32; @@ -647,6 +676,7 @@ mod tests { #[case(4, DistanceType::L2, 1.0)] #[case(4, DistanceType::Cosine, 1.0)] #[case(4, DistanceType::Dot, 1.0)] + #[case(4, DistanceType::Hamming, 0.9)] #[tokio::test] async fn test_build_ivf_flat( #[case] nlist: usize, @@ -781,7 +811,7 @@ mod tests { let test_uri = test_dir.path().to_str().unwrap(); let nlist = 4; - let (mut dataset, _) = generate_test_dataset(test_uri, 0.0..1.0).await; + let (mut dataset, _) = generate_test_dataset::(test_uri, 0.0..1.0).await; let ivf_params = IvfBuildParams::new(nlist); let sq_params = SQBuildParams::default(); @@ -824,7 +854,7 @@ mod tests { let test_uri = test_dir.path().to_str().unwrap(); let nlist = 1000; - let (mut dataset, _) = generate_test_dataset(test_uri, 0.0..1.0).await; + let (mut dataset, _) = generate_test_dataset::(test_uri, 0.0..1.0).await; let ivf_params = IvfBuildParams::new(nlist); let sq_params = SQBuildParams::default(); diff --git a/rust/lance/src/io/exec/knn.rs b/rust/lance/src/io/exec/knn.rs index a09aa2f1331..96a017c706b 100644 --- a/rust/lance/src/io/exec/knn.rs +++ b/rust/lance/src/io/exec/knn.rs @@ -737,7 +737,7 @@ mod tests { let dataset = Dataset::open(test_uri).await.unwrap(); let stream = dataset .scan() - .nearest("vector", q.as_primitive(), 10) + .nearest("vector", q.as_primitive::(), 10) .unwrap() .try_into_stream() .await From a2af388f85d0ee838a1894fa1696f76086ed22e7 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Wed, 4 Dec 2024 17:45:58 +0800 Subject: [PATCH 02/13] make clippy happy Signed-off-by: BubbleCal --- rust/lance-index/src/vector/flat/storage.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/lance-index/src/vector/flat/storage.rs b/rust/lance-index/src/vector/flat/storage.rs index 05c42c3f12b..8337027dbc2 100644 --- a/rust/lance-index/src/vector/flat/storage.rs +++ b/rust/lance-index/src/vector/flat/storage.rs @@ -334,6 +334,7 @@ pub struct FlatDistanceCal<'a, T: ArrowPrimitiveType> { vectors: &'a [T::Native], query: Vec, dimension: usize, + #[allow(clippy::type_complexity)] distance_fn: fn(&[T::Native], &[T::Native]) -> f32, } From aa206a2b5a32f60849bcf6976e00dd8a34eeb546 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Thu, 5 Dec 2024 14:16:12 +0800 Subject: [PATCH 03/13] fix Signed-off-by: BubbleCal --- python/src/utils.rs | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/python/src/utils.rs b/python/src/utils.rs index 9b8420e781b..9f53c90772b 100644 --- a/python/src/utils.rs +++ b/python/src/utils.rs @@ -15,6 +15,7 @@ use std::sync::Arc; use arrow::compute::concat; +use arrow::datatypes::Float32Type; use arrow::pyarrow::{FromPyArrow, ToPyArrow}; use arrow_array::{cast::AsArray, Array, FixedSizeListArray, Float32Array, UInt32Array}; use arrow_data::ArrayData; @@ -26,7 +27,7 @@ use lance_file::writer::FileWriter; use lance_index::scalar::IndexWriter; use lance_index::vector::hnsw::{builder::HnswBuildParams, HNSW}; use lance_index::vector::v3::subindex::IvfSubIndex; -use lance_linalg::kmeans::compute_partitions; +use lance_linalg::kmeans::{compute_partitions, KMeansAlgoFloat}; use lance_linalg::{ distance::DistanceType, kmeans::{KMeans as LanceKMeans, KMeansParams}, @@ -132,14 +133,15 @@ impl KMeans { if !matches!(fixed_size_arr.value_type(), DataType::Float32) { return Err(PyValueError::new_err("Must be a FixedSizeList of Float32")); }; - let values: Arc = fixed_size_arr.values().as_primitive().clone().into(); - let centroids: &Float32Array = kmeans.centroids.as_primitive(); - let cluster_ids = UInt32Array::from(compute_partitions( - centroids.values(), - values.values(), - kmeans.dimension, - kmeans.distance_type, - )); + let values = fixed_size_arr.values().as_primitive(); + let centroids = kmeans.centroids.as_primitive(); + let cluster_ids = + UInt32Array::from(compute_partitions::< + Float32Type, + KMeansAlgoFloat, + >( + centroids, values, kmeans.dimension, kmeans.distance_type + )); cluster_ids.into_data().to_pyarrow(py) } From 4f279eaf21351120796d68f78fe1f9decabcffd8 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Thu, 5 Dec 2024 14:31:06 +0800 Subject: [PATCH 04/13] fix Signed-off-by: BubbleCal --- python/Cargo.lock | 26 ++++++++++----------- rust/lance-index/src/vector/flat/storage.rs | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python/Cargo.lock b/python/Cargo.lock index d971675f4f3..3341e597a9c 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -1945,7 +1945,7 @@ dependencies = [ [[package]] name = "fsst" -version = "0.20.0" +version = "0.20.1" dependencies = [ "rand", ] @@ -2710,7 +2710,7 @@ dependencies = [ [[package]] name = "lance" -version = "0.20.0" +version = "0.20.1" dependencies = [ "arrow", "arrow-arith", @@ -2772,7 +2772,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "0.20.0" +version = "0.20.1" dependencies = [ "arrow-array", "arrow-buffer", @@ -2789,7 +2789,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "0.20.0" +version = "0.20.1" dependencies = [ "arrow-array", "arrow-buffer", @@ -2825,7 +2825,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "0.20.0" +version = "0.20.1" dependencies = [ "arrow", "arrow-array", @@ -2851,7 +2851,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "0.20.0" +version = "0.20.1" dependencies = [ "arrow", "arrow-array", @@ -2866,7 +2866,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "0.20.0" +version = "0.20.1" dependencies = [ "arrayref", "arrow", @@ -2904,7 +2904,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "0.20.0" +version = "0.20.1" dependencies = [ "arrow-arith", "arrow-array", @@ -2938,7 +2938,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "0.20.0" +version = "0.20.1" dependencies = [ "arrow", "arrow-array", @@ -2989,7 +2989,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "0.20.0" +version = "0.20.1" dependencies = [ "arrow", "arrow-arith", @@ -3028,7 +3028,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "0.20.0" +version = "0.20.1" dependencies = [ "arrow-array", "arrow-ord", @@ -3051,7 +3051,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "0.20.0" +version = "0.20.1" dependencies = [ "arrow", "arrow-array", @@ -4080,7 +4080,7 @@ dependencies = [ [[package]] name = "pylance" -version = "0.20.0" +version = "0.20.1" dependencies = [ "arrow", "arrow-array", diff --git a/rust/lance-index/src/vector/flat/storage.rs b/rust/lance-index/src/vector/flat/storage.rs index 8337027dbc2..2ceef42605f 100644 --- a/rust/lance-index/src/vector/flat/storage.rs +++ b/rust/lance-index/src/vector/flat/storage.rs @@ -368,7 +368,7 @@ impl<'a> FlatDistanceCal<'a, UInt8Type> { } } -impl<'a, T: ArrowPrimitiveType> FlatDistanceCal<'a, T> { +impl FlatDistanceCal<'_, T> { #[inline] fn get_vector(&self, id: u32) -> &[T::Native] { &self.vectors[self.dimension * id as usize..self.dimension * (id + 1) as usize] From 085db49ce0e52ef54eca0f02420fc22bbebc537a Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Thu, 5 Dec 2024 16:26:34 +0800 Subject: [PATCH 05/13] update Cargo.lock Signed-off-by: BubbleCal --- python/Cargo.lock | 564 ++++++++++++++++++++++++++-------------------- 1 file changed, 316 insertions(+), 248 deletions(-) diff --git a/python/Cargo.lock b/python/Cargo.lock index 3341e597a9c..543d12523e4 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -57,9 +57,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.18" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" [[package]] name = "android-tzdata" @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.92" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74f37166d7d48a0284b99dd824694c26119c700b53bf0d1540cdb147dbdaaf13" +checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7" [[package]] name = "arc-swap" @@ -150,7 +150,7 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.15.1", + "hashbrown 0.15.2", "num", ] @@ -347,9 +347,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.17" +version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cb8f1d480b0ea3783ab015936d2a55c87e219676f0c0b7dec61494043f21857" +checksum = "df895a515f70646414f4b45c0b79082783b80552b373a68283012928df56f522" dependencies = [ "bzip2", "flate2", @@ -393,9 +393,9 @@ dependencies = [ [[package]] name = "async-io" -version = "2.3.4" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "444b0228950ee6501b3568d3c93bf1176a1fdbc3b758dcd9475046d30f4dc7e8" +checksum = "43a2b323ccce0a1d90b449fd71f2a06ca7faa7c54c2751f06c9bd851fc061059" dependencies = [ "async-lock", "cfg-if", @@ -438,7 +438,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -481,7 +481,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -513,9 +513,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "aws-config" -version = "1.5.9" +version = "1.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d6448cfb224dd6a9b9ac734f58622dd0d4751f3589f3b777345745f46b2eb14" +checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" dependencies = [ "aws-credential-types", "aws-runtime", @@ -524,7 +524,7 @@ dependencies = [ "aws-sdk-sts", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.60.7", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -555,9 +555,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.4.3" +version = "1.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a10d5c055aa540164d9561a0e2e74ad30f0dcf7393c3a92f6733ddf9c5762468" +checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -580,15 +580,15 @@ dependencies = [ [[package]] name = "aws-sdk-dynamodb" -version = "1.52.0" +version = "1.55.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "473aa619c2a3581ab00d9000e66a11982f6354d0150797518b8d459c7f9a6b5c" +checksum = "a18e18b3cf6b75c1fcb15e677f6dbd2a6d8dfe4d168e0a36721f7a6167c6c829" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -603,15 +603,15 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.48.0" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ded855583fa1d22e88fe39fd6062b062376e50a8211989e07cf5e38d52eb3453" +checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -625,15 +625,15 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.49.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9177ea1192e6601ae16c7273385690d88a7ed386a00b74a6bc894d12103cd933" +checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -647,15 +647,15 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.48.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "823ef553cf36713c97453e2ddff1eb8f62be7f4523544e2a5db64caf80100f0a" +checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -670,9 +670,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.5" +version = "1.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5619742a0d8f253be760bfbb8e8e8368c69e3587e4637af5754e488a611499b1" +checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -683,7 +683,7 @@ dependencies = [ "hex", "hmac", "http 0.2.12", - "http 1.1.0", + "http 1.2.0", "once_cell", "percent-encoding", "sha2", @@ -731,6 +731,15 @@ dependencies = [ "aws-smithy-types", ] +[[package]] +name = "aws-smithy-json" +version = "0.61.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095" +dependencies = [ + "aws-smithy-types", +] + [[package]] name = "aws-smithy-query" version = "0.60.7" @@ -743,9 +752,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.3" +version = "1.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be28bd063fa91fd871d131fc8b68d7cd4c5fa0869bea68daca50dcb1cbd76be2" +checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -770,15 +779,15 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.7.2" +version = "1.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e086682a53d3aa241192aa110fa8dfce98f2f5ac2ead0de84d41582c7e8fdb96" +checksum = "92165296a47a812b267b4f41032ff8069ab7ff783696d217f0994a0d7ab585cd" dependencies = [ "aws-smithy-async", "aws-smithy-types", "bytes", "http 0.2.12", - "http 1.1.0", + "http 1.2.0", "pin-project-lite", "tokio", "tracing", @@ -787,16 +796,16 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.8" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07c9cdc179e6afbf5d391ab08c85eac817b51c87e1892a5edb5f7bbdc64314b4" +checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" dependencies = [ "base64-simd", "bytes", "bytes-utils", "futures-core", "http 0.2.12", - "http 1.1.0", + "http 1.2.0", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -915,9 +924,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.4" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" +checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" dependencies = [ "arrayref", "arrayvec", @@ -989,9 +998,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" [[package]] name = "bytes-utils" @@ -1026,9 +1035,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.34" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b9470d453346108f93a59222a9a1a5724db32d0a4727b7ab7ace4b4d822dc9" +checksum = "f34d93e62b03caf570cccc334cbc6c2fceca82f39211051345108adcba3eebdc" dependencies = [ "jobserver", "libc", @@ -1091,9 +1100,9 @@ dependencies = [ [[package]] name = "comfy-table" -version = "7.1.1" +version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" +checksum = "24f165e7b643266ea80cb858aed492ad9280e3e05ce24d4a99d7d7b889b6a4d9" dependencies = [ "strum", "strum_macros", @@ -1145,6 +1154,16 @@ dependencies = [ "libc", ] +[[package]] +name = "core-foundation" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b55271e5c8c478ad3f38ad24ef34923091e0548492a266d19b3c0b4d82574c63" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -1153,9 +1172,9 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "cpufeatures" -version = "0.2.14" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0" +checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" dependencies = [ "libc", ] @@ -1245,9 +1264,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" dependencies = [ "csv-core", "itoa", @@ -1768,7 +1787,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -1816,12 +1835,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.9" +version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -1854,9 +1873,9 @@ dependencies = [ [[package]] name = "event-listener-strategy" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f214dc438f977e6d4e3500aaa277f5ad94ca83fbbd9b1a15713ce2344ccc5a1" +checksum = "3c3e4e0dd3673c1139bf041f3008816d9cf2946bbfac2945c09e523b8d7b05b2" dependencies = [ "event-listener 5.3.1", "pin-project-lite", @@ -1870,9 +1889,9 @@ checksum = "9afc2bd4d5a73106dd53d10d73d3401c2f32730ba2c0b93ddb888a8983680471" [[package]] name = "fastrand" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" +checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" [[package]] name = "filetime" @@ -1904,9 +1923,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.34" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", "miniz_oxide", @@ -2006,9 +2025,9 @@ checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-lite" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f1fa2f9765705486b33fd2acf1577f8ec449c2ba1f318ae5447697b7c08d210" +checksum = "cef40d21ae2c515b51041df9ed313ed21e572df340ea58a922a0aefe7e8891a1" dependencies = [ "fastrand", "futures-core", @@ -2025,7 +2044,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -2126,16 +2145,16 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205" +checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e" dependencies = [ "atomic-waker", "bytes", "fnv", "futures-core", "futures-sink", - "http 1.1.0", + "http 1.2.0", "indexmap", "slab", "tokio", @@ -2166,9 +2185,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.1" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" dependencies = [ "allocator-api2", "equivalent", @@ -2253,9 +2272,9 @@ dependencies = [ [[package]] name = "http" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" dependencies = [ "bytes", "fnv", @@ -2280,7 +2299,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.1.0", + "http 1.2.0", ] [[package]] @@ -2291,7 +2310,7 @@ checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "pin-project-lite", ] @@ -2340,15 +2359,15 @@ dependencies = [ [[package]] name = "hyper" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbbff0a806a4728c99295b254c8838933b5b082d75e3cb70c8dab21fdfbcfa9a" +checksum = "97818827ef4f364230e16705d4706e2897df2bb60617d6ca15d598025a3c481f" dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.6", - "http 1.1.0", + "h2 0.4.7", + "http 1.2.0", "http-body 1.0.1", "httparse", "itoa", @@ -2381,11 +2400,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" dependencies = [ "futures-util", - "http 1.1.0", - "hyper 1.5.0", + "http 1.2.0", + "hyper 1.5.1", "hyper-util", - "rustls 0.23.16", - "rustls-native-certs 0.8.0", + "rustls 0.23.19", + "rustls-native-certs 0.8.1", "rustls-pki-types", "tokio", "tokio-rustls 0.26.0", @@ -2401,9 +2420,9 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", - "hyper 1.5.0", + "hyper 1.5.1", "pin-project-lite", "socket2", "tokio", @@ -2558,7 +2577,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -2584,12 +2603,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", - "hashbrown 0.15.1", + "hashbrown 0.15.2", ] [[package]] @@ -2677,9 +2696,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" [[package]] name = "jobserver" @@ -2692,10 +2711,11 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.72" +version = "0.3.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" +checksum = "a865e038f7f6ed956f788f0d7d60c541fff74c7bd74272c5d4cf15c63743e705" dependencies = [ + "once_cell", "wasm-bindgen", ] @@ -3166,9 +3186,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.161" +version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" +checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" [[package]] name = "libm" @@ -3195,9 +3215,9 @@ checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "litemap" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" [[package]] name = "lock_api" @@ -3224,7 +3244,7 @@ version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" dependencies = [ - "hashbrown 0.15.1", + "hashbrown 0.15.2", ] [[package]] @@ -3320,11 +3340,10 @@ dependencies = [ [[package]] name = "mio" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ - "hermit-abi 0.3.9", "libc", "wasi", "windows-sys 0.52.0", @@ -3358,7 +3377,7 @@ dependencies = [ "rustc_version", "smallvec", "tagptr", - "thiserror", + "thiserror 1.0.69", "triomphe", "uuid", ] @@ -3521,7 +3540,7 @@ dependencies = [ "chrono", "futures", "humantime", - "hyper 1.5.0", + "hyper 1.5.1", "itertools 0.13.0", "md-5", "parking_lot", @@ -3665,7 +3684,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.15.1", + "hashbrown 0.15.2", "lz4_flex", "num", "num-bigint", @@ -3821,7 +3840,7 @@ checksum = "3c0f5fad0874fc7abcd4d750e76917eaebbecaa2c20bde22e1dbeeba8beb758c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -3855,9 +3874,9 @@ checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" [[package]] name = "polling" -version = "3.7.3" +version = "3.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc2790cd301dec6cd3b7a025e4815cf825724a51c98dccfe6a3e55f05ffb6511" +checksum = "a604568c3202727d1507653cb121dbd627a58684eb09a820fd746bee38b4442f" dependencies = [ "cfg-if", "concurrent-queue", @@ -3870,9 +3889,9 @@ dependencies = [ [[package]] name = "portable-atomic" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" [[package]] name = "powerfmt" @@ -3906,14 +3925,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033" dependencies = [ "proc-macro2", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] name = "proc-macro2" -version = "1.0.89" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -3987,7 +4006,7 @@ dependencies = [ "prost 0.12.6", "prost-types 0.12.6", "regex", - "syn 2.0.87", + "syn 2.0.90", "tempfile", ] @@ -4008,7 +4027,7 @@ dependencies = [ "prost 0.13.3", "prost-types 0.13.3", "regex", - "syn 2.0.87", + "syn 2.0.90", "tempfile", ] @@ -4035,7 +4054,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -4048,7 +4067,7 @@ dependencies = [ "itertools 0.13.0", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -4168,7 +4187,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -4181,7 +4200,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -4211,44 +4230,47 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684" +checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef" dependencies = [ "bytes", "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash 2.0.0", - "rustls 0.23.16", + "rustc-hash 2.1.0", + "rustls 0.23.19", "socket2", - "thiserror", + "thiserror 2.0.4", "tokio", "tracing", ] [[package]] name = "quinn-proto" -version = "0.11.8" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6" +checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" dependencies = [ "bytes", + "getrandom", "rand", "ring", - "rustc-hash 2.0.0", - "rustls 0.23.16", + "rustc-hash 2.1.0", + "rustls 0.23.19", + "rustls-pki-types", "slab", - "thiserror", + "thiserror 2.0.4", "tinyvec", "tracing", + "web-time", ] [[package]] name = "quinn-udp" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e346e016eacfff12233c243718197ca12f148c84e1e84268a896699b41c71780" +checksum = "7d5a626c6807713b15cac82a6acaccd6043c9a5408c24baae07611fec3f243da" dependencies = [ "cfg_aliases", "libc", @@ -4374,7 +4396,7 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -4391,9 +4413,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -4432,11 +4454,11 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2 0.4.6", - "http 1.1.0", + "h2 0.4.7", + "http 1.2.0", "http-body 1.0.1", "http-body-util", - "hyper 1.5.0", + "hyper 1.5.1", "hyper-rustls 0.27.3", "hyper-util", "ipnet", @@ -4447,8 +4469,8 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.16", - "rustls-native-certs 0.8.0", + "rustls 0.23.19", + "rustls-native-certs 0.8.1", "rustls-pemfile 2.2.0", "rustls-pki-types", "serde", @@ -4484,9 +4506,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.6" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f4b84ba6e838ceb47b41de5194a60244fac43d9fe03b71dbe8c5a201081d6d1" +checksum = "f81dc953b2244ddd5e7860cb0bb2a790494b898ef321d4aff8e260efab60cc88" dependencies = [ "bytemuck", "byteorder", @@ -4516,9 +4538,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustc-hash" -version = "2.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" +checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" [[package]] name = "rustc_version" @@ -4531,9 +4553,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.39" +version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "375116bee2be9ed569afe2154ea6a99dfdffd257f533f187498c2a8f5feaf4ee" +checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ "bitflags 2.6.0", "errno", @@ -4556,9 +4578,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.16" +version = "0.23.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e" +checksum = "934b404430bb06b3fae2cba809eb45a1ab1aecd64491213d7c3301b88393f8d1" dependencies = [ "log", "once_cell", @@ -4578,20 +4600,19 @@ dependencies = [ "openssl-probe", "rustls-pemfile 1.0.4", "schannel", - "security-framework", + "security-framework 2.11.1", ] [[package]] name = "rustls-native-certs" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a" +checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" dependencies = [ "openssl-probe", - "rustls-pemfile 2.2.0", "rustls-pki-types", "schannel", - "security-framework", + "security-framework 3.0.1", ] [[package]] @@ -4617,6 +4638,9 @@ name = "rustls-pki-types" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" +dependencies = [ + "web-time", +] [[package]] name = "rustls-webpki" @@ -4662,9 +4686,9 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1" +checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" dependencies = [ "windows-sys 0.59.0", ] @@ -4690,7 +4714,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -4716,7 +4740,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ "bitflags 2.6.0", - "core-foundation", + "core-foundation 0.9.4", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1415a607e92bec364ea2cf9264646dcce0f91e6d65281bd6f2819cca3bf39c8" +dependencies = [ + "bitflags 2.6.0", + "core-foundation 0.10.0", "core-foundation-sys", "libc", "security-framework-sys", @@ -4724,9 +4761,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.12.0" +version = "2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea4a292869320c0272d7bc55a5a6aafaff59b4f63404a003887b679a2e05b4b6" +checksum = "fa39c7303dc58b5543c94d22c1766b0d31f2ee58306363ea622b10bbc075eaa2" dependencies = [ "core-foundation-sys", "libc", @@ -4749,22 +4786,22 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.214" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.214" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -4775,14 +4812,14 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] name = "serde_json" -version = "1.0.132" +version = "1.0.133" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" dependencies = [ "itoa", "memchr", @@ -4799,7 +4836,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -4941,7 +4978,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -4952,9 +4989,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "socket2" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" dependencies = [ "libc", "windows-sys 0.52.0", @@ -4984,7 +5021,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -5030,7 +5067,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -5052,7 +5089,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.87", + "syn 2.0.90", "typify", "walkdir", ] @@ -5076,9 +5113,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.87" +version = "2.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" dependencies = [ "proc-macro2", "quote", @@ -5087,9 +5124,9 @@ dependencies = [ [[package]] name = "sync_wrapper" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" dependencies = [ "futures-core", ] @@ -5102,7 +5139,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -5156,7 +5193,7 @@ dependencies = [ "tantivy-stacker", "tantivy-tokenizer-api", "tempfile", - "thiserror", + "thiserror 1.0.69", "time", "uuid", "winapi", @@ -5277,9 +5314,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.13.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b" +checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" dependencies = [ "cfg-if", "fastrand", @@ -5322,28 +5359,48 @@ dependencies = [ "prost 0.12.6", "prost-build 0.12.6", "tar", - "thiserror", + "thiserror 1.0.69", "ureq", ] [[package]] name = "thiserror" -version = "1.0.68" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f49a1853cf82743e3b7950f77e0f4d622ca36cf4317cba00c767838bac8d490" +dependencies = [ + "thiserror-impl 2.0.4", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02dd99dc800bbb97186339685293e1cc5d9df1f8fae2d0aecd9ff1c77efea892" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ - "thiserror-impl", + "proc-macro2", + "quote", + "syn 2.0.90", ] [[package]] name = "thiserror-impl" -version = "1.0.68" +version = "2.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7c61ec9a6f64d2793d8a45faba21efbe3ced62a886d44c36a009b2b519b4c7e" +checksum = "8381894bb3efe0c4acac3ded651301ceee58a15d47c2e34885ed1908ad667061" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -5369,9 +5426,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.36" +version = "0.3.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" +checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21" dependencies = [ "deranged", "itoa", @@ -5390,9 +5447,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" +checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de" dependencies = [ "num-conv", "time-core", @@ -5434,9 +5491,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.41.0" +version = "1.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb" +checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551" dependencies = [ "backtrace", "bytes", @@ -5457,7 +5514,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -5476,7 +5533,7 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ - "rustls 0.23.16", + "rustls 0.23.19", "rustls-pki-types", "tokio", ] @@ -5494,9 +5551,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.12" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" dependencies = [ "bytes", "futures-core", @@ -5513,9 +5570,9 @@ checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -5524,13 +5581,13 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -5546,9 +5603,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", "valuable", @@ -5567,9 +5624,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.18" +version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" dependencies = [ "nu-ansi-term", "sharded-slab", @@ -5632,8 +5689,8 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.87", - "thiserror", + "syn 2.0.90", + "thiserror 1.0.69", "unicode-ident", ] @@ -5650,15 +5707,15 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.87", + "syn 2.0.90", "typify-impl", ] [[package]] name = "unicode-ident" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "unicode-segmentation" @@ -5668,9 +5725,9 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" -version = "0.1.14" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" [[package]] name = "unindent" @@ -5692,15 +5749,15 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "2.10.1" +version = "2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b74fc6b57825be3373f7054754755f03ac3a8f5d70015ccad699ba2029956f4a" +checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" dependencies = [ "base64 0.22.1", "flate2", "log", "once_cell", - "rustls 0.23.16", + "rustls 0.23.19", "rustls-pki-types", "url", "webpki-roots", @@ -5708,9 +5765,9 @@ dependencies = [ [[package]] name = "url" -version = "2.5.3" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", "idna", @@ -5802,9 +5859,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" +checksum = "d15e63b4482863c109d70a7b8706c1e364eb6ea449b201a76c5b89cedcec2d5c" dependencies = [ "cfg-if", "once_cell", @@ -5813,36 +5870,37 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" +checksum = "8d36ef12e3aaca16ddd3f67922bc63e48e953f126de60bd33ccc0101ef9998cd" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.45" +version = "0.4.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b" +checksum = "9dfaf8f50e5f293737ee323940c7d8b08a66a95a419223d9f41610ca08b0833d" dependencies = [ "cfg-if", "js-sys", + "once_cell", "wasm-bindgen", "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" +checksum = "705440e08b42d3e4b36de7d66c944be628d579796b8090bfa3471478a2260051" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -5850,22 +5908,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" +checksum = "98c9ae5a76e46f4deecd0f0255cc223cfa18dc9b261213b8aa0c7b36f61b3f1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" +checksum = "6ee99da9c5ba11bd675621338ef6fa52296b76b83305e9b6e5c77d4c286d6d49" [[package]] name = "wasm-streams" @@ -5882,9 +5940,19 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.72" +version = "0.3.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a98bc3c33f0fe7e59ad7cd041b89034fa82a7c2d4365ca538dda6cdaf513863c" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" dependencies = [ "js-sys", "wasm-bindgen", @@ -5892,9 +5960,9 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "0.26.6" +version = "0.26.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841c67bff177718f1d4dfefde8d8f0e78f9b6589319ba88312f567fc5841a958" +checksum = "5d642ff16b7e79272ae451b7322067cdc17cadf68c23264be9d94a32319efe7e" dependencies = [ "rustls-pki-types", ] @@ -6178,9 +6246,9 @@ dependencies = [ [[package]] name = "yoke" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" dependencies = [ "serde", "stable_deref_trait", @@ -6190,13 +6258,13 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", "synstructure", ] @@ -6218,27 +6286,27 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] name = "zerofrom" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", "synstructure", ] @@ -6267,7 +6335,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -6290,9 +6358,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.12+zstd.1.5.6" +version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a4e40c320c3cb459d9a9ff6de98cff88f4751ee9275d140e2be94a2b74e4c13" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = [ "cc", "pkg-config", From 1a2893be1b4a463ce0c1f7f0cf99d64ee8754768 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Thu, 5 Dec 2024 16:43:53 +0800 Subject: [PATCH 06/13] fix stack overflow Signed-off-by: BubbleCal --- rust/lance/src/index/vector/utils.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/rust/lance/src/index/vector/utils.rs b/rust/lance/src/index/vector/utils.rs index 661877ed539..c0f4d141961 100644 --- a/rust/lance/src/index/vector/utils.rs +++ b/rust/lance/src/index/vector/utils.rs @@ -49,12 +49,7 @@ pub async fn maybe_sample_training_data( } else { let mut scanner = dataset.scan(); scanner.project(&[column])?; - let batches = scanner - .try_into_stream() - .await? - .try_collect::>() - .await?; - concat_batches(&Arc::new(ArrowSchema::from(&projection)), &batches)? + scanner.try_into_batch().await? }; let array = batch.column_by_name(column).ok_or(Error::Index { From 844fa3c0cc7dba470d6c7549752475ee322e7ac5 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Thu, 5 Dec 2024 16:49:07 +0800 Subject: [PATCH 07/13] fix stack overflow Signed-off-by: BubbleCal --- rust/lance/src/index/vector/utils.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/rust/lance/src/index/vector/utils.rs b/rust/lance/src/index/vector/utils.rs index c0f4d141961..9629747555b 100644 --- a/rust/lance/src/index/vector/utils.rs +++ b/rust/lance/src/index/vector/utils.rs @@ -4,9 +4,6 @@ use std::sync::Arc; use arrow_array::{cast::AsArray, FixedSizeListArray}; -use arrow_schema::Schema as ArrowSchema; -use arrow_select::concat::concat_batches; -use futures::stream::TryStreamExt; use snafu::{location, Location}; use tokio::sync::Mutex; From b52af6d8dbfad5af3a6b570121f4d5c932f337ee Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Thu, 5 Dec 2024 17:08:37 +0800 Subject: [PATCH 08/13] fix Signed-off-by: BubbleCal --- rust/lance/src/dataset/optimize.rs | 2 +- rust/lance/src/dataset/scanner.rs | 2 +- rust/lance/src/index/vector/utils.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index 034168c26c0..a1e8b82ea2d 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -1673,7 +1673,7 @@ mod tests { let mut scanner = dataset.scan(); scanner - .nearest("vec", &vec![0.0; 128].into(), 10) + .nearest("vec", &vec![0.0f32; 128].into(), 10) .unwrap() .project(&["i"]) .unwrap(); diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index d98894a65df..00e686c84f3 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -670,7 +670,7 @@ impl Scanner { DataType::FixedSizeList(dt, _) => { if dt.data_type() == q.data_type() { Box::new(q.clone()) - } else if dt.data_type().is_floating() && *q.data_type() == DataType::Float32 { + } else if dt.data_type().is_floating() { coerce_float_vector( q.as_any().downcast_ref::().unwrap(), FloatType::try_from(dt.data_type())?, diff --git a/rust/lance/src/index/vector/utils.rs b/rust/lance/src/index/vector/utils.rs index 9629747555b..b3c5f5b44c6 100644 --- a/rust/lance/src/index/vector/utils.rs +++ b/rust/lance/src/index/vector/utils.rs @@ -40,8 +40,8 @@ pub async fn maybe_sample_training_data( sample_size_hint: usize, ) -> Result { let num_rows = dataset.count_rows(None).await?; - let projection = dataset.schema().project(&[column])?; let batch = if num_rows > sample_size_hint { + let projection = dataset.schema().project(&[column])?; dataset.sample(sample_size_hint, &projection).await? } else { let mut scanner = dataset.scan(); From d951bede53d3b51c2808378033185832a9fa8b1a Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Thu, 5 Dec 2024 19:56:22 +0800 Subject: [PATCH 09/13] fix stackoverflow Signed-off-by: BubbleCal --- rust/lance/src/index/vector/utils.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/lance/src/index/vector/utils.rs b/rust/lance/src/index/vector/utils.rs index b3c5f5b44c6..179a68a9ba3 100644 --- a/rust/lance/src/index/vector/utils.rs +++ b/rust/lance/src/index/vector/utils.rs @@ -46,7 +46,7 @@ pub async fn maybe_sample_training_data( } else { let mut scanner = dataset.scan(); scanner.project(&[column])?; - scanner.try_into_batch().await? + Box::pin(scanner.try_into_batch()).await? }; let array = batch.column_by_name(column).ok_or(Error::Index { From 1c9561dfd6c08360089b1cdd06475f8504ab70d5 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Fri, 6 Dec 2024 12:06:37 +0800 Subject: [PATCH 10/13] move large future to heap Signed-off-by: BubbleCal --- rust/lance/src/index.rs | 11 +++++++++-- rust/lance/src/index/vector/utils.rs | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index f3543b8ba0b..8dc5d966228 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -270,8 +270,15 @@ impl DatasetIndexExt for Dataset { location: location!(), })?; - build_vector_index(self, column, &index_name, &index_id.to_string(), vec_params) - .await?; + // this is a large future so move it to heap + Box::pin(build_vector_index( + self, + column, + &index_name, + &index_id.to_string(), + vec_params, + )) + .await?; vector_index_details() } // Can't use if let Some(...) here because it's not stable yet. diff --git a/rust/lance/src/index/vector/utils.rs b/rust/lance/src/index/vector/utils.rs index 179a68a9ba3..b3c5f5b44c6 100644 --- a/rust/lance/src/index/vector/utils.rs +++ b/rust/lance/src/index/vector/utils.rs @@ -46,7 +46,7 @@ pub async fn maybe_sample_training_data( } else { let mut scanner = dataset.scan(); scanner.project(&[column])?; - Box::pin(scanner.try_into_batch()).await? + scanner.try_into_batch().await? }; let array = batch.column_by_name(column).ok_or(Error::Index { From 4ace2d0825b39678a8e83ea2d5dfd55518f5c930 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Fri, 6 Dec 2024 12:19:27 +0800 Subject: [PATCH 11/13] fix error message Signed-off-by: BubbleCal --- rust/lance-index/src/vector/flat/storage.rs | 19 ------------------- rust/lance/src/dataset/scanner.rs | 5 +++-- 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/rust/lance-index/src/vector/flat/storage.rs b/rust/lance-index/src/vector/flat/storage.rs index 2ceef42605f..9fece3b3f8d 100644 --- a/rust/lance-index/src/vector/flat/storage.rs +++ b/rust/lance-index/src/vector/flat/storage.rs @@ -210,25 +210,6 @@ impl QuantizerStorage for FlatBinStorage { } impl FlatBinStorage { - // deprecated, use `try_from_batch` instead - pub fn new(vectors: FixedSizeListArray, distance_type: DistanceType) -> Self { - let row_ids = Arc::new(UInt64Array::from_iter_values(0..vectors.len() as u64)); - let vectors = Arc::new(vectors); - - let batch = RecordBatch::try_from_iter_with_nullable(vec![ - (ROW_ID, row_ids.clone() as ArrayRef, true), - (FLAT_COLUMN, vectors.clone() as ArrayRef, true), - ]) - .unwrap(); - - Self { - batch, - distance_type, - row_ids, - vectors, - } - } - pub fn vector(&self, id: u32) -> ArrayRef { self.vectors.value(id as usize) } diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 00e686c84f3..b813c633f03 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -678,9 +678,10 @@ impl Scanner { } else { return Err(Error::invalid_input( format!( - "Column {} is not a vector column (type: {})", + "Column {} has element type {} and the query vector is {}", column, - field.data_type() + dt.data_type(), + q.data_type(), ), location!(), )); From 07e219737817126b6a59c0f51e926d636a679725 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Fri, 6 Dec 2024 12:22:05 +0800 Subject: [PATCH 12/13] bump version Signed-off-by: BubbleCal --- Cargo.toml | 2 +- python/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 94405a5c925..0bcffa87816 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ exclude = ["python"] resolver = "2" [workspace.package] -version = "0.20.1" +version = "0.21.0" edition = "2021" authors = ["Lance Devs "] license = "Apache-2.0" diff --git a/python/Cargo.toml b/python/Cargo.toml index a56a87cba14..e9e9f867c4d 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "0.20.1" +version = "0.21.0" edition = "2021" authors = ["Lance Devs "] rust-version = "1.65" From bd2774725aea7d2f9df4c71f2b18d6037771304b Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Fri, 6 Dec 2024 12:24:21 +0800 Subject: [PATCH 13/13] fix version Signed-off-by: BubbleCal --- Cargo.lock | 32 ++++++++++++++++---------------- Cargo.toml | 32 ++++++++++++++++---------------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2e68786d957..9f26e238541 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2302,7 +2302,7 @@ dependencies = [ [[package]] name = "fsst" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-array", "lance-datagen", @@ -3002,7 +3002,7 @@ dependencies = [ [[package]] name = "lance" -version = "0.20.1" +version = "0.21.0" dependencies = [ "all_asserts", "approx", @@ -3082,7 +3082,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -3099,7 +3099,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -3138,7 +3138,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-array", @@ -3166,7 +3166,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-array", @@ -3183,7 +3183,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrayref", "arrow", @@ -3229,7 +3229,7 @@ dependencies = [ [[package]] name = "lance-encoding-datafusion" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -3261,7 +3261,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-arith", "arrow-array", @@ -3303,7 +3303,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "0.20.1" +version = "0.21.0" dependencies = [ "approx", "arrow", @@ -3362,7 +3362,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-arith", @@ -3407,7 +3407,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-schema", @@ -3428,7 +3428,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "0.20.1" +version = "0.21.0" dependencies = [ "approx", "arrow-arith", @@ -3457,7 +3457,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-array", @@ -3501,7 +3501,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "0.20.1" +version = "0.21.0" dependencies = [ "proc-macro2", "quote", @@ -3510,7 +3510,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-array", "arrow-schema", diff --git a/Cargo.toml b/Cargo.toml index 0bcffa87816..84c183579c2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,21 +44,21 @@ categories = [ rust-version = "1.78" [workspace.dependencies] -lance = { version = "=0.20.1", path = "./rust/lance" } -lance-arrow = { version = "=0.20.1", path = "./rust/lance-arrow" } -lance-core = { version = "=0.20.1", path = "./rust/lance-core" } -lance-datafusion = { version = "=0.20.1", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=0.20.1", path = "./rust/lance-datagen" } -lance-encoding = { version = "=0.20.1", path = "./rust/lance-encoding" } -lance-encoding-datafusion = { version = "=0.20.1", path = "./rust/lance-encoding-datafusion" } -lance-file = { version = "=0.20.1", path = "./rust/lance-file" } -lance-index = { version = "=0.20.1", path = "./rust/lance-index" } -lance-io = { version = "=0.20.1", path = "./rust/lance-io" } -lance-jni = { version = "=0.20.1", path = "./java/core/lance-jni" } -lance-linalg = { version = "=0.20.1", path = "./rust/lance-linalg" } -lance-table = { version = "=0.20.1", path = "./rust/lance-table" } -lance-test-macros = { version = "=0.20.1", path = "./rust/lance-test-macros" } -lance-testing = { version = "=0.20.1", path = "./rust/lance-testing" } +lance = { version = "=0.21.0", path = "./rust/lance" } +lance-arrow = { version = "=0.21.0", path = "./rust/lance-arrow" } +lance-core = { version = "=0.21.0", path = "./rust/lance-core" } +lance-datafusion = { version = "=0.21.0", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=0.21.0", path = "./rust/lance-datagen" } +lance-encoding = { version = "=0.21.0", path = "./rust/lance-encoding" } +lance-encoding-datafusion = { version = "=0.21.0", path = "./rust/lance-encoding-datafusion" } +lance-file = { version = "=0.21.0", path = "./rust/lance-file" } +lance-index = { version = "=0.21.0", path = "./rust/lance-index" } +lance-io = { version = "=0.21.0", path = "./rust/lance-io" } +lance-jni = { version = "=0.21.0", path = "./java/core/lance-jni" } +lance-linalg = { version = "=0.21.0", path = "./rust/lance-linalg" } +lance-table = { version = "=0.21.0", path = "./rust/lance-table" } +lance-test-macros = { version = "=0.21.0", path = "./rust/lance-test-macros" } +lance-testing = { version = "=0.21.0", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "53.2", optional = false, features = ["prettyprint"] } @@ -111,7 +111,7 @@ datafusion-physical-expr = { version = "42.0", features = [ ] } deepsize = "0.2.0" either = "1.0" -fsst = { version = "=0.20.1", path = "./rust/lance-encoding/src/compression_algo/fsst" } +fsst = { version = "=0.21.0", path = "./rust/lance-encoding/src/compression_algo/fsst" } futures = "0.3" http = "1.1.0" hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }