diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index 203ffcdffa8..e705c0eda1d 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -38,6 +38,7 @@ use lance::dataset::{ ColumnAlteration, CommitBuilder, Dataset, NewColumnTransform, ProjectionRequest, ReadParams, Version, WriteParams, }; +use lance::index::{DatasetIndexExt, IndexSegment}; use lance::io::commit::namespace_manifest::LanceNamespaceExternalManifestStore; use lance::io::{ObjectStore, ObjectStoreParams}; use lance::session::Session as LanceSession; @@ -45,11 +46,10 @@ use lance::table::format::IndexMetadata; use lance::table::format::{BasePath, Fragment}; use lance_core::datatypes::Schema as LanceSchema; use lance_file::version::LanceFileVersion; -use lance_index::DatasetIndexExt; use lance_index::IndexCriteria as RustIndexCriteria; use lance_index::optimize::OptimizeOptions; use lance_index::scalar::btree::BTreeParameters; -use lance_index::{IndexParams, IndexSegment, IndexType}; +use lance_index::{IndexParams, IndexType}; use lance_io::object_store::ObjectStoreRegistry; use lance_io::object_store::StorageOptionsProvider; use lance_namespace::LanceNamespace; diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 695d8b317c5..402bc010268 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -55,7 +55,7 @@ use lance::dataset::{ transaction::{Operation, Transaction}, }; use lance::index::vector::utils::get_vector_type; -use lance::index::{DatasetIndexInternalExt, vector::VectorIndexParams}; +use lance::index::{DatasetIndexExt, DatasetIndexInternalExt, vector::VectorIndexParams}; use lance::{dataset::builder::DatasetBuilder, index::vector::IndexFileVersion}; use lance_arrow::as_fixed_size_list_array; use lance_core::Error; @@ -67,7 +67,7 @@ use lance_index::scalar::inverted::query::{ BooleanQuery, BoostQuery, FtsQuery, MatchQuery, MultiMatchQuery, Operator, PhraseQuery, }; use lance_index::{ - DatasetIndexExt, IndexParams, IndexType, + IndexParams, IndexType, optimize::OptimizeOptions, scalar::{FullTextSearchQuery, InvertedIndexParams, ScalarIndexParams}, vector::{ diff --git a/python/src/indices.rs b/python/src/indices.rs index f1d42918962..9651c6cc00e 100644 --- a/python/src/indices.rs +++ b/python/src/indices.rs @@ -11,6 +11,7 @@ use arrow_data::ArrayData; use chrono::{DateTime, Utc}; use lance::dataset::Dataset as LanceDataset; use lance::index::vector::ivf::builder::write_vector_storage; +use lance::index::{DatasetIndexExt, IndexSegment, IndexSegmentPlan}; use lance::io::ObjectStore; use lance_index::progress::NoopIndexBuildProgress; use lance_index::vector::ivf::shuffler::{IvfShuffler, shuffle_vectors}; @@ -37,7 +38,7 @@ use crate::{ dataset::Dataset, error::PythonErrorExt, file::object_store_from_uri_or_path_no_options, rt, }; use lance::index::vector::ivf::write_ivf_pq_file_from_existing_index; -use lance_index::{DatasetIndexExt, IndexDescription, IndexSegment, IndexSegmentPlan, IndexType}; +use lance_index::{IndexDescription, IndexType}; use uuid::Uuid; #[pyclass(name = "IndexConfig", module = "lance.indices", get_all)] diff --git a/python/src/lib.rs b/python/src/lib.rs index f7d233cf117..9730f2ba1c5 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -34,6 +34,7 @@ use ::arrow::pyarrow::PyArrowType; use ::arrow_schema::Schema as ArrowSchema; use ::lance::arrow::json::ArrowJsonExt; use ::lance::datafusion::LanceTableProvider; +use ::lance::index::DatasetIndexExt; use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec; use datafusion_ffi::table_provider::FFI_TableProvider; #[cfg(feature = "datagen")] @@ -52,7 +53,6 @@ use file::{ LanceBufferDescriptor, LanceColumnMetadata, LanceFileMetadata, LanceFileReader, LanceFileStatistics, LanceFileWriter, LancePageMetadata, stable_version, }; -use lance_index::DatasetIndexExt; use log::Level; use pyo3::exceptions::PyIOError; use pyo3::prelude::*; diff --git a/rust/examples/src/full_text_search.rs b/rust/examples/src/full_text_search.rs index 22e701c5863..8269f590ee8 100644 --- a/rust/examples/src/full_text_search.rs +++ b/rust/examples/src/full_text_search.rs @@ -15,8 +15,8 @@ use arrow::datatypes::UInt64Type; use arrow_schema::{DataType, Field, Schema}; use itertools::Itertools; use lance::Dataset; +use lance::index::DatasetIndexExt; use lance_datagen::{RowCount, array}; -use lance_index::DatasetIndexExt; use lance_index::scalar::inverted::flat_full_text_search; use lance_index::scalar::{FullTextSearchQuery, InvertedIndexParams}; use object_store::path::Path; diff --git a/rust/examples/src/ivf_hnsw.rs b/rust/examples/src/ivf_hnsw.rs index 296c03b19d6..c1898e10682 100644 --- a/rust/examples/src/ivf_hnsw.rs +++ b/rust/examples/src/ivf_hnsw.rs @@ -11,11 +11,12 @@ use clap::Parser; use futures::TryStreamExt; use lance::Dataset; use lance::dataset::ProjectionRequest; +use lance::index::DatasetIndexExt; use lance::index::vector::VectorIndexParams; +use lance_index::IndexType; use lance_index::vector::hnsw::builder::HnswBuildParams; use lance_index::vector::ivf::IvfBuildParams; use lance_index::vector::sq::builder::SQBuildParams; -use lance_index::{DatasetIndexExt, IndexType}; use lance_linalg::distance::MetricType; #[derive(Parser, Debug)] diff --git a/rust/lance-index/src/lib.rs b/rust/lance-index/src/lib.rs index bb00a716173..62ae68414a6 100644 --- a/rust/lance-index/src/lib.rs +++ b/rust/lance-index/src/lib.rs @@ -29,11 +29,9 @@ pub mod progress; pub mod registry; pub mod scalar; pub mod traits; -pub mod types; pub mod vector; pub use crate::traits::*; -pub use crate::types::{IndexSegment, IndexSegmentPlan}; pub const INDEX_FILE_NAME: &str = "index.idx"; /// The name of the auxiliary index file. diff --git a/rust/lance-index/src/traits.rs b/rust/lance-index/src/traits.rs index a883c3ce494..130e59cad81 100644 --- a/rust/lance-index/src/traits.rs +++ b/rust/lance-index/src/traits.rs @@ -1,13 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::sync::Arc; +use lance_core::Result; -use async_trait::async_trait; -use datafusion::execution::SendableRecordBatchStream; -use lance_core::{Error, Result}; - -use crate::{IndexParams, IndexType, optimize::OptimizeOptions, types::IndexSegment}; use lance_table::format::IndexMetadata; /// A set of criteria used to filter potential indices to use for a query @@ -121,191 +116,3 @@ pub trait IndexDescription: Send + Sync { /// (for backward compatibility with indices created before file tracking was added). fn total_size_bytes(&self) -> Option; } - -// Extends Lance Dataset with secondary index. -#[async_trait] -pub trait DatasetIndexExt { - type IndexBuilder<'a> - where - Self: 'a; - type IndexSegmentBuilder<'a> - where - Self: 'a; - - /// Create a builder for creating an index on columns. - /// - /// This returns a builder that can be configured with additional options - /// like `name()`, `replace()`, and `train()` before awaiting to execute. - /// - /// # Parameters - /// - `columns`: the columns to build the indices on. - /// - `index_type`: specify [`IndexType`]. - /// - `params`: index parameters. - fn create_index_builder<'a>( - &'a mut self, - columns: &'a [&'a str], - index_type: IndexType, - params: &'a dyn IndexParams, - ) -> Self::IndexBuilder<'a>; - - /// Create a builder for building physical index segments from uncommitted - /// vector index outputs. - /// - /// The caller supplies the uncommitted index metadata returned by - /// `execute_uncommitted()` so the builder can plan segment grouping without - /// rediscovering fragment coverage. - /// - /// This is the canonical entry point for distributed vector segment build. - /// After building the physical segments, publish them as a - /// logical index with [`Self::commit_existing_index_segments`]. - fn create_index_segment_builder<'a>(&'a self) -> Self::IndexSegmentBuilder<'a>; - - /// Create indices on columns. - /// - /// Upon finish, a new dataset version is generated. - /// - /// Parameters: - /// - /// - `columns`: the columns to build the indices on. - /// - `index_type`: specify [`IndexType`]. - /// - `name`: optional index name. Must be unique in the dataset. - /// if not provided, it will auto-generate one. - /// - `params`: index parameters. - /// - `replace`: replace the existing index if it exists. - /// - /// Returns the metadata of the created index. - async fn create_index( - &mut self, - columns: &[&str], - index_type: IndexType, - name: Option, - params: &dyn IndexParams, - replace: bool, - ) -> Result; - - /// Drop indices by name. - /// - /// Upon finish, a new dataset version is generated. - /// - /// Parameters: - /// - /// - `name`: the name of the index to drop. - async fn drop_index(&mut self, name: &str) -> Result<()>; - - /// Prewarm an index by name. - /// - /// This will load the index into memory and cache it. - /// - /// Generally, this should only be called when it is known the entire index will - /// fit into the index cache. - /// - /// This is a hint that is not enforced by all indices today. Some indices may choose - /// to ignore this hint. - async fn prewarm_index(&self, name: &str) -> Result<()>; - - /// Read all indices of this Dataset version. - /// - /// The indices are lazy loaded and cached in memory within the `Dataset` instance. - /// The cache is invalidated when the dataset version (Manifest) is changed. - async fn load_indices(&self) -> Result>>; - - /// Loads all the indies of a given UUID. - /// - /// Note that it is possible to have multiple indices with the same UUID, - /// as they are the deltas of the same index. - async fn load_index(&self, uuid: &str) -> Result> { - self.load_indices().await.map(|indices| { - indices - .iter() - .find(|idx| idx.uuid.to_string() == uuid) - .cloned() - }) - } - - /// Loads a specific index with the given index name - /// - /// Returns - /// ------- - /// - `Ok(indices)`: if the index exists, returns the index. - /// - `Ok(vec![])`: if the index does not exist. - /// - `Err(e)`: if there is an error loading indices. - /// - async fn load_indices_by_name(&self, name: &str) -> Result> { - self.load_indices().await.map(|indices| { - indices - .iter() - .filter(|idx| idx.name == name) - .cloned() - .collect() - }) - } - - /// Loads a specific index with the given index name. - /// This function only works for indices that are unique. - /// If there are multiple indices sharing the same name, please use [`Self::load_indices_by_name`] - /// - /// Returns - /// ------- - /// - `Ok(Some(index))`: if the index exists, returns the index. - /// - `Ok(None)`: if the index does not exist. - /// - `Err(e)`: Index error if there are multiple indexes sharing the same name. - /// - async fn load_index_by_name(&self, name: &str) -> Result> { - let indices = self.load_indices_by_name(name).await?; - if indices.is_empty() { - Ok(None) - } else if indices.len() == 1 { - Ok(Some(indices[0].clone())) - } else { - Err(Error::index(format!( - "Found multiple indices of the same name: {:?}, please use load_indices_by_name", - indices.iter().map(|idx| &idx.name).collect::>() - ))) - } - } - - /// Describes indexes in a dataset - /// - /// This method should only access the index metadata and should not load the index into memory. - /// - /// More detailed information may be available from `index_statistics` but that will require - /// loading the index into memory. - async fn describe_indices<'a, 'b>( - &'a self, - criteria: Option>, - ) -> Result>>; - - /// Loads a specific index with the given index name. - async fn load_scalar_index<'a, 'b>( - &'a self, - criteria: IndexCriteria<'b>, - ) -> Result>; - - /// Optimize indices. - async fn optimize_indices(&mut self, options: &OptimizeOptions) -> Result<()>; - - /// Find index with a given index_name and return its serialized statistics. - /// - /// If the index does not exist, return Error. - async fn index_statistics(&self, index_name: &str) -> Result; - - /// Commit one or more existing physical index segments as a logical index. - /// - /// This publishes already-built physical segments. It does not build - /// or merge index data; callers should first build segments with - /// [`Self::create_index_segment_builder`] or another index-specific build - /// path and then pass the resulting segments here. - async fn commit_existing_index_segments( - &mut self, - index_name: &str, - column: &str, - segments: Vec, - ) -> Result<()>; - - async fn read_index_partition( - &self, - index_name: &str, - partition_id: usize, - with_vector: bool, - ) -> Result; -} diff --git a/rust/lance-index/src/types.rs b/rust/lance-index/src/types.rs deleted file mode 100644 index 4d653f21f13..00000000000 --- a/rust/lance-index/src/types.rs +++ /dev/null @@ -1,124 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -use std::sync::Arc; - -use crate::IndexType; -use lance_table::format::IndexMetadata; -use roaring::RoaringBitmap; -use uuid::Uuid; - -/// A single physical segment of a logical index. -/// -/// Each segment is stored independently and will become one manifest entry when committed. -/// The logical index identity (name / target column / dataset version) is provided separately -/// by the commit API. -#[derive(Debug, Clone, PartialEq)] -pub struct IndexSegment { - /// Unique ID of the physical segment. - uuid: Uuid, - /// The fragments covered by this segment. - fragment_bitmap: RoaringBitmap, - /// Metadata specific to the index type. - index_details: Arc, - /// The on-disk index version for this segment. - index_version: i32, -} - -impl IndexSegment { - /// Create a fully described segment with the given UUID, fragment coverage, and index - /// metadata. - pub fn new( - uuid: Uuid, - fragment_bitmap: I, - index_details: Arc, - index_version: i32, - ) -> Self - where - I: IntoIterator, - { - Self { - uuid, - fragment_bitmap: fragment_bitmap.into_iter().collect(), - index_details, - index_version, - } - } - - /// Return the UUID of this segment. - pub fn uuid(&self) -> Uuid { - self.uuid - } - - /// Return the fragment coverage of this segment. - pub fn fragment_bitmap(&self) -> &RoaringBitmap { - &self.fragment_bitmap - } - - /// Return the serialized index details for this segment. - pub fn index_details(&self) -> &Arc { - &self.index_details - } - - /// Return the on-disk index version for this segment. - pub fn index_version(&self) -> i32 { - self.index_version - } - - /// Consume the segment and return its component parts. - pub fn into_parts(self) -> (Uuid, RoaringBitmap, Arc, i32) { - ( - self.uuid, - self.fragment_bitmap, - self.index_details, - self.index_version, - ) - } -} - -/// A plan for building one physical segment from one or more existing -/// vector index segments. -#[derive(Debug, Clone, PartialEq)] -pub struct IndexSegmentPlan { - segment: IndexSegment, - segments: Vec, - estimated_bytes: u64, - requested_index_type: Option, -} - -impl IndexSegmentPlan { - /// Create a plan for one built segment. - pub fn new( - segment: IndexSegment, - segments: Vec, - estimated_bytes: u64, - requested_index_type: Option, - ) -> Self { - Self { - segment, - segments, - estimated_bytes, - requested_index_type, - } - } - - /// Return the segment metadata that should be committed after this plan is built. - pub fn segment(&self) -> &IndexSegment { - &self.segment - } - - /// Return the input segment metadata that should be combined into the segment. - pub fn segments(&self) -> &[IndexMetadata] { - &self.segments - } - - /// Return the estimated number of bytes covered by this plan. - pub fn estimated_bytes(&self) -> u64 { - self.estimated_bytes - } - - /// Return the requested logical index type, if one was supplied to the planner. - pub fn requested_index_type(&self) -> Option { - self.requested_index_type - } -} diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index 902c5462acc..9c8bf337712 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -18,6 +18,7 @@ use lance::dataset::{ DeleteBuilder, MergeInsertBuilder, ReadParams, WhenMatched, WhenNotMatched, WriteParams, builder::DatasetBuilder, }; +use lance::index::DatasetIndexExt; use lance::session::Session; use lance::{Dataset, dataset::scanner::Scanner}; use lance_core::Error as LanceError; @@ -26,7 +27,6 @@ use lance_core::{Error, Result, box_error}; use lance_index::IndexType; use lance_index::optimize::OptimizeOptions; use lance_index::scalar::{BuiltinIndexType, ScalarIndexParams}; -use lance_index::traits::DatasetIndexExt; use lance_io::object_store::{ObjectStore, ObjectStoreParams}; use lance_namespace::LanceNamespace; use lance_namespace::error::NamespaceError; diff --git a/rust/lance/benches/ivf_pq.rs b/rust/lance/benches/ivf_pq.rs index ebe70a32d4d..ae92b406168 100644 --- a/rust/lance/benches/ivf_pq.rs +++ b/rust/lance/benches/ivf_pq.rs @@ -9,13 +9,14 @@ use arrow_array::{FixedSizeListArray, RecordBatch, RecordBatchIterator}; use arrow_schema::{DataType, Field, FieldRef, Schema}; use criterion::{Criterion, criterion_group, criterion_main}; +use lance::index::DatasetIndexExt; use lance::{ Dataset, dataset::{WriteMode, WriteParams}, index::vector::VectorIndexParams, }; use lance_arrow::*; -use lance_index::{DatasetIndexExt, IndexType}; +use lance_index::IndexType; use lance_linalg::distance::MetricType; use lance_testing::datagen::generate_random_array; #[cfg(target_os = "linux")] diff --git a/rust/lance/benches/mem_wal_write.rs b/rust/lance/benches/mem_wal_write.rs index 363767b519e..31f855fd0ad 100644 --- a/rust/lance/benches/mem_wal_write.rs +++ b/rust/lance/benches/mem_wal_write.rs @@ -54,12 +54,13 @@ use arrow_schema::{DataType, Field, Schema as ArrowSchema}; use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; use lance::dataset::mem_wal::{DatasetMemWalExt, MemWalConfig, RegionWriterConfig}; use lance::dataset::{Dataset, WriteParams}; +use lance::index::DatasetIndexExt; use lance::index::vector::VectorIndexParams; use lance_arrow::FixedSizeListArrayExt; +use lance_index::IndexType; use lance_index::scalar::ScalarIndexParams; use lance_index::vector::ivf::IvfBuildParams; use lance_index::vector::pq::PQBuildParams; -use lance_index::{DatasetIndexExt, IndexType}; use lance_linalg::distance::DistanceType; #[cfg(target_os = "linux")] use pprof::criterion::{Output, PProfProfiler}; diff --git a/rust/lance/benches/memtable_read.rs b/rust/lance/benches/memtable_read.rs index 240c90294e9..efd16dbb62a 100644 --- a/rust/lance/benches/memtable_read.rs +++ b/rust/lance/benches/memtable_read.rs @@ -38,15 +38,16 @@ use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_m use futures::TryStreamExt; use lance::dataset::mem_wal::write::{CacheConfig, IndexStore, MemTable}; use lance::dataset::{Dataset, WriteParams}; +use lance::index::DatasetIndexExt; use lance::index::vector::VectorIndexParams; use lance_arrow::FixedSizeListArrayExt; +use lance_index::IndexType; use lance_index::scalar::FullTextSearchQuery; use lance_index::scalar::inverted::tokenizer::InvertedIndexParams; use lance_index::vector::ivf::IvfBuildParams; use lance_index::vector::ivf::storage::IvfModel; use lance_index::vector::kmeans::{KMeansParams, train_kmeans}; use lance_index::vector::pq::builder::PQBuildParams; -use lance_index::{DatasetIndexExt, IndexType}; use lance_linalg::distance::{DistanceType, MetricType}; #[cfg(target_os = "linux")] use pprof::criterion::{Output, PProfProfiler}; diff --git a/rust/lance/benches/vector_index.rs b/rust/lance/benches/vector_index.rs index cf4106db8b4..21c9aa4e4aa 100644 --- a/rust/lance/benches/vector_index.rs +++ b/rust/lance/benches/vector_index.rs @@ -15,10 +15,11 @@ use pprof::criterion::{Output, PProfProfiler}; use rand::Rng; use lance::dataset::{Dataset, WriteMode, WriteParams, builder::DatasetBuilder}; +use lance::index::DatasetIndexExt; use lance::index::vector::VectorIndexParams; use lance_arrow::{FixedSizeListArrayExt, as_fixed_size_list_array}; use lance_index::{ - DatasetIndexExt, IndexType, + IndexType, vector::{ivf::IvfBuildParams, pq::PQBuildParams}, }; use lance_linalg::distance::MetricType; diff --git a/rust/lance/benches/vector_throughput.rs b/rust/lance/benches/vector_throughput.rs index ff24c7771ba..9a04971684b 100644 --- a/rust/lance/benches/vector_throughput.rs +++ b/rust/lance/benches/vector_throughput.rs @@ -19,10 +19,11 @@ use pprof::criterion::{Output, PProfProfiler}; use rand::Rng; use lance::dataset::{Dataset, WriteMode, WriteParams}; +use lance::index::DatasetIndexExt; use lance::index::vector::VectorIndexParams; use lance_arrow::FixedSizeListArrayExt; use lance_index::{ - DatasetIndexExt, IndexType, + IndexType, vector::{ivf::IvfBuildParams, pq::PQBuildParams}, }; use lance_linalg::distance::MetricType; diff --git a/rust/lance/src/bin/lq.rs b/rust/lance/src/bin/lq.rs index d28121bbf55..afaa15c7f10 100644 --- a/rust/lance/src/bin/lq.rs +++ b/rust/lance/src/bin/lq.rs @@ -10,9 +10,9 @@ use futures::TryStreamExt; use futures::stream::StreamExt; use lance::dataset::Dataset; +use lance::index::DatasetIndexExt; use lance::index::vector::VectorIndexParams; use lance::{Error, Result}; -use lance_index::DatasetIndexExt; use lance_linalg::distance::MetricType; #[derive(Parser)] diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 02f4b28e047..8f3deb2bef6 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -15,6 +15,7 @@ use futures::{FutureExt, Stream}; use crate::dataset::metadata::UpdateFieldMetadataBuilder; use crate::dataset::transaction::translate_schema_metadata_updates; +use crate::index::DatasetIndexExt; use crate::session::caches::{DSMetadataCache, ManifestKey, TransactionKey}; use crate::session::index_caches::DSIndexCache; use itertools::Itertools; @@ -30,7 +31,7 @@ use lance_datafusion::projection::ProjectionPlan; use lance_file::datatypes::populate_schema_dictionary; use lance_file::reader::FileReaderOptions; use lance_file::version::LanceFileVersion; -use lance_index::{DatasetIndexExt, IndexType}; +use lance_index::IndexType; use lance_io::object_store::{ LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, StorageOptions, StorageOptionsAccessor, StorageOptionsProvider, diff --git a/rust/lance/src/dataset/cleanup.rs b/rust/lance/src/dataset/cleanup.rs index 18380458eeb..f5162bba191 100644 --- a/rust/lance/src/dataset/cleanup.rs +++ b/rust/lance/src/dataset/cleanup.rs @@ -1180,6 +1180,7 @@ mod tests { use super::*; use crate::blob::{BlobArrayBuilder, blob_field}; + use crate::index::DatasetIndexExt; use crate::{ dataset::{ReadParams, WriteMode, WriteParams, builder::DatasetBuilder}, index::vector::VectorIndexParams, @@ -1193,7 +1194,7 @@ mod tests { use datafusion::common::assert_contains; use lance_core::utils::tempfile::TempStrDir; use lance_core::utils::testing::{ProxyObjectStore, ProxyObjectStorePolicy}; - use lance_index::{DatasetIndexExt, IndexType}; + use lance_index::IndexType; use lance_io::object_store::{ ObjectStore, ObjectStoreParams, ObjectStoreRegistry, WrappingObjectStore, }; @@ -2642,8 +2643,9 @@ mod tests { // Create a full-text index (Inverted) on the "text" column once. // We only create this on main during dataset creation. Branches inherit the index configuration. async fn create_text_index(&mut self) -> Result<()> { + use crate::index::DatasetIndexExt; + use lance_index::IndexType; use lance_index::scalar::InvertedIndexParams; - use lance_index::{DatasetIndexExt, IndexType}; let params = InvertedIndexParams::default(); self.dataset .create_index(&["text"], IndexType::Inverted, None, ¶ms, true) diff --git a/rust/lance/src/dataset/index.rs b/rust/lance/src/dataset/index.rs index d4de4d0be6f..91c535468e5 100644 --- a/rust/lance/src/dataset/index.rs +++ b/rust/lance/src/dataset/index.rs @@ -9,13 +9,13 @@ use std::sync::Arc; use crate::Dataset; use crate::dataset::optimize::RemappedIndex; use crate::dataset::optimize::remapping::RemapResult; +use crate::index::DatasetIndexExt; use crate::index::remap_index; use crate::index::scalar::infer_scalar_index_details; use arrow_schema::DataType; use async_trait::async_trait; use lance_core::{Error, Result}; use lance_encoding::version::LanceFileVersion; -use lance_index::DatasetIndexExt; use lance_index::frag_reuse::FRAG_REUSE_INDEX_NAME; use lance_index::scalar::lance_format::LanceIndexStore; use lance_table::format::IndexMetadata; diff --git a/rust/lance/src/dataset/index/frag_reuse.rs b/rust/lance/src/dataset/index/frag_reuse.rs index cc7b845313d..4fbefcd4725 100644 --- a/rust/lance/src/dataset/index/frag_reuse.rs +++ b/rust/lance/src/dataset/index/frag_reuse.rs @@ -149,12 +149,13 @@ fn is_index_remap_caught_up( mod tests { use super::*; use crate::dataset::optimize::{CompactionOptions, compact_files, remapping}; + use crate::index::DatasetIndexExt; use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; use all_asserts::{assert_false, assert_true}; use arrow_array::types::{Float32Type, Int32Type}; use lance_datagen::Dimension; + use lance_index::IndexType; use lance_index::scalar::ScalarIndexParams; - use lance_index::{DatasetIndexExt, IndexType}; #[tokio::test] async fn test_cleanup_frag_reuse_index() { diff --git a/rust/lance/src/dataset/mem_wal/api.rs b/rust/lance/src/dataset/mem_wal/api.rs index 99bb415d6a0..30c6a10811a 100644 --- a/rust/lance/src/dataset/mem_wal/api.rs +++ b/rust/lance/src/dataset/mem_wal/api.rs @@ -8,9 +8,9 @@ use std::sync::Arc; +use crate::index::DatasetIndexExt; use async_trait::async_trait; use lance_core::{Error, Result}; -use lance_index::DatasetIndexExt; use lance_index::mem_wal::{MEM_WAL_INDEX_NAME, MemWalIndexDetails, RegionSpec}; use lance_index::vector::ivf::storage::IvfModel; use lance_index::vector::pq::ProductQuantizer; diff --git a/rust/lance/src/dataset/mem_wal/memtable/flush.rs b/rust/lance/src/dataset/mem_wal/memtable/flush.rs index c3b955fc265..9ff133413ba 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/flush.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/flush.rs @@ -962,7 +962,7 @@ mod tests { #[tokio::test] async fn test_flusher_with_btree_index() { use super::super::super::index::{BTreeIndexConfig, IndexStore}; - use lance_index::DatasetIndexExt; + use crate::index::DatasetIndexExt; let (store, base_path, base_uri, _temp_dir) = create_local_store().await; let region_id = Uuid::new_v4(); @@ -1060,9 +1060,9 @@ mod tests { #[tokio::test] async fn test_flusher_with_ivf_pq_index() { use super::super::super::index::{IndexStore, IvfPqIndexConfig}; + use crate::index::DatasetIndexExt; use arrow_array::{FixedSizeListArray, Float32Array}; use lance_arrow::FixedSizeListArrayExt; - use lance_index::DatasetIndexExt; use lance_index::vector::ivf::storage::IvfModel; use lance_index::vector::kmeans::{KMeansParams, train_kmeans}; use lance_index::vector::pq::PQBuildParams; @@ -1284,9 +1284,9 @@ mod tests { #[tokio::test] async fn test_flusher_with_fts_index() { use super::super::super::index::{FtsIndexConfig, IndexStore}; + use crate::index::DatasetIndexExt; use arrow_array::StringArray; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; - use lance_index::DatasetIndexExt; use std::sync::Arc; let (store, base_path, base_uri, _temp_dir) = create_local_store().await; diff --git a/rust/lance/src/dataset/mem_wal/write.rs b/rust/lance/src/dataset/mem_wal/write.rs index 65aa1d97586..dda06a7c9d9 100644 --- a/rust/lance/src/dataset/mem_wal/write.rs +++ b/rust/lance/src/dataset/mem_wal/write.rs @@ -2128,16 +2128,17 @@ mod tests { mod region_writer_tests { use std::sync::Arc; + use crate::index::DatasetIndexExt; use arrow_array::{ FixedSizeListArray, Float32Array, Int64Array, RecordBatch, RecordBatchIterator, StringArray, }; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; use lance_arrow::FixedSizeListArrayExt; + use lance_index::IndexType; use lance_index::scalar::ScalarIndexParams; use lance_index::scalar::inverted::InvertedIndexParams; use lance_index::vector::ivf::IvfBuildParams; use lance_index::vector::pq::builder::PQBuildParams; - use lance_index::{DatasetIndexExt, IndexType}; use lance_linalg::distance::MetricType; use uuid::Uuid; diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index 7cec534e171..c70eb93bbcd 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -97,6 +97,7 @@ use super::{WriteMode, WriteParams, write_fragments_internal}; use crate::Dataset; use crate::Result; use crate::dataset::utils::CapturedRowIds; +use crate::index::DatasetIndexExt; use crate::io::commit::{commit_transaction, migrate_fragments}; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; @@ -105,7 +106,6 @@ use lance_core::Error; use lance_core::datatypes::BlobHandling; use lance_core::utils::tokio::get_num_compute_intensive_cpus; use lance_core::utils::tracing::{DATASET_COMPACTING_EVENT, TRACE_DATASET_EVENTS}; -use lance_index::DatasetIndexExt; use lance_index::frag_reuse::FragReuseGroup; use lance_table::format::{Fragment, RowIdMeta}; use roaring::{RoaringBitmap, RoaringTreemap}; diff --git a/rust/lance/src/dataset/optimize/remapping.rs b/rust/lance/src/dataset/optimize/remapping.rs index 253bd63ff15..dab62bf6166 100644 --- a/rust/lance/src/dataset/optimize/remapping.rs +++ b/rust/lance/src/dataset/optimize/remapping.rs @@ -6,12 +6,12 @@ use crate::Result; use crate::dataset::transaction::{Operation, Transaction}; +use crate::index::DatasetIndexExt; use crate::index::frag_reuse::{load_frag_reuse_index_details, open_frag_reuse_index}; use crate::{Dataset, index}; use async_trait::async_trait; use lance_core::Error; use lance_core::utils::address::RowAddress; -use lance_index::DatasetIndexExt; use lance_index::frag_reuse::{FRAG_REUSE_INDEX_NAME, FragDigest}; use lance_table::format::{Fragment, IndexFile, IndexMetadata}; use lance_table::io::manifest::read_manifest_indexes; diff --git a/rust/lance/src/dataset/rowids.rs b/rust/lance/src/dataset/rowids.rs index b6b4b106798..447868fca99 100644 --- a/rust/lance/src/dataset/rowids.rs +++ b/rust/lance/src/dataset/rowids.rs @@ -130,6 +130,7 @@ mod test { use super::*; use crate::dataset::optimize::{CompactionOptions, compact_files}; + use crate::index::DatasetIndexExt; use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; use arrow_array::cast::AsArray; use arrow_array::types::{Float32Type, Int32Type, UInt64Type}; @@ -139,7 +140,7 @@ mod test { use lance_core::datatypes::Schema; use lance_core::{ROW_ADDR, ROW_ID, utils::address::RowAddress}; use lance_datagen::Dimension; - use lance_index::{DatasetIndexExt, IndexType, scalar::ScalarIndexParams}; + use lance_index::{IndexType, scalar::ScalarIndexParams}; use std::collections::HashMap; use std::collections::HashSet; diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 63bd7884879..34483bfd847 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -6,6 +6,7 @@ use std::pin::Pin; use std::sync::{Arc, LazyLock}; use std::task::{Context, Poll}; +use crate::index::DatasetIndexExt; use arrow::array::AsArray; use arrow_array::{Array, Float32Array, Int64Array, RecordBatch}; use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema, SchemaRef, SortOptions}; @@ -60,13 +61,13 @@ use lance_datafusion::projection::ProjectionPlan; use lance_file::reader::FileReaderOptions; use lance_index::IndexCriteria; use lance_index::scalar::FullTextSearchQuery; +use lance_index::scalar::expression::ScalarIndexExpr; use lance_index::scalar::expression::{INDEX_EXPR_RESULT_SCHEMA, IndexExprResult, PlannerIndexExt}; use lance_index::scalar::inverted::query::{ FtsQuery, FtsQueryNode, FtsSearchParams, MatchQuery, PhraseQuery, fill_fts_query_column, }; use lance_index::scalar::inverted::{SCORE_COL, SCORE_FIELD}; use lance_index::vector::{DIST_COL, Query}; -use lance_index::{DatasetIndexExt, scalar::expression::ScalarIndexExpr}; use lance_index::{metrics::NoOpMetricsCollector, scalar::inverted::FTS_SCHEMA}; use lance_io::stream::RecordBatchStream; use lance_linalg::distance::MetricType; diff --git a/rust/lance/src/dataset/schema_evolution.rs b/rust/lance/src/dataset/schema_evolution.rs index b79cb283956..269093fc98b 100644 --- a/rust/lance/src/dataset/schema_evolution.rs +++ b/rust/lance/src/dataset/schema_evolution.rs @@ -1783,11 +1783,12 @@ mod test { ) -> Result<()> { // Create a table with 2 scalar columns, 1 vector column + use crate::index::DatasetIndexExt; use arrow::datatypes::{Int32Type, Int64Type}; use arrow_array::{Float16Array, Float32Array, Int64Array, ListArray}; use half::f16; use lance_arrow::FixedSizeListArrayExt; - use lance_index::{DatasetIndexExt, IndexType, scalar::ScalarIndexParams}; + use lance_index::{IndexType, scalar::ScalarIndexParams}; use lance_linalg::distance::MetricType; use lance_testing::datagen::generate_random_array; diff --git a/rust/lance/src/dataset/tests/dataset_aggregate.rs b/rust/lance/src/dataset/tests/dataset_aggregate.rs index cdc954a9997..ef2a90e6315 100644 --- a/rust/lance/src/dataset/tests/dataset_aggregate.rs +++ b/rust/lance/src/dataset/tests/dataset_aggregate.rs @@ -38,12 +38,13 @@ use tempfile::tempdir; use crate::Dataset; use crate::dataset::scanner::AggregateExpr; +use crate::index::DatasetIndexExt; use crate::index::vector::VectorIndexParams; use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount, assert_plan_node_equals}; use lance_arrow::FixedSizeListArrayExt; +use lance_index::IndexType; use lance_index::scalar::FullTextSearchQuery; use lance_index::scalar::inverted::InvertedIndexParams; -use lance_index::{DatasetIndexExt, IndexType}; use lance_linalg::distance::MetricType; /// Helper to create a field reference expression for a column index diff --git a/rust/lance/src/dataset/tests/dataset_concurrency_store.rs b/rust/lance/src/dataset/tests/dataset_concurrency_store.rs index 7ce57a4c0f4..a9c2aa44c38 100644 --- a/rust/lance/src/dataset/tests/dataset_concurrency_store.rs +++ b/rust/lance/src/dataset/tests/dataset_concurrency_store.rs @@ -8,12 +8,12 @@ use crate::dataset::WriteDestination; use crate::{Dataset, Error, Result}; use crate::dataset::write::{WriteMode, WriteParams}; +use crate::index::DatasetIndexExt; use arrow_array::RecordBatch; use arrow_array::{Int32Array, RecordBatchIterator}; use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; use futures::TryStreamExt; use lance_core::utils::tempfile::TempStrDir; -use lance_index::DatasetIndexExt; use lance_index::{IndexType, scalar::ScalarIndexParams}; #[tokio::test] diff --git a/rust/lance/src/dataset/tests/dataset_geo.rs b/rust/lance/src/dataset/tests/dataset_geo.rs index dc1e79dc455..a43718dd7d4 100644 --- a/rust/lance/src/dataset/tests/dataset_geo.rs +++ b/rust/lance/src/dataset/tests/dataset_geo.rs @@ -7,6 +7,7 @@ use std::vec; use crate::Dataset; use crate::dataset::tests::dataset_transactions::execute_sql; +use crate::index::DatasetIndexExt; use arrow_array::RecordBatch; use arrow_array::RecordBatchIterator; use arrow_array::cast::AsArray; @@ -19,8 +20,8 @@ use geoarrow_array::{ }; use geoarrow_schema::{Dimension, LineStringType, PointType, PolygonType}; use lance_core::utils::tempfile::TempStrDir; +use lance_index::IndexType; use lance_index::scalar::ScalarIndexParams; -use lance_index::{DatasetIndexExt, IndexType}; #[tokio::test] async fn test_geo_types() { diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs index c3aac4493d4..a2d4889dd06 100644 --- a/rust/lance/src/dataset/tests/dataset_index.rs +++ b/rust/lance/src/dataset/tests/dataset_index.rs @@ -13,6 +13,7 @@ use crate::{Dataset, Error, Result}; use lance_arrow::FixedSizeListArrayExt; use crate::dataset::write::{WriteMode, WriteParams}; +use crate::index::DatasetIndexExt; use arrow::array::{AsArray, GenericListBuilder, GenericStringBuilder}; use arrow::datatypes::UInt64Type; use arrow_array::RecordBatch; @@ -31,7 +32,6 @@ use lance_core::utils::tempfile::TempStrDir; use lance_datagen::{BatchCount, Dimension, RowCount, array, gen_batch}; use lance_file::reader::{FileReader, FileReaderOptions}; use lance_file::version::LanceFileVersion; -use lance_index::DatasetIndexExt; use lance_index::scalar::FullTextSearchQuery; use lance_index::scalar::inverted::{ query::{BooleanQuery, MatchQuery, Occur, Operator, PhraseQuery}, diff --git a/rust/lance/src/dataset/tests/dataset_io.rs b/rust/lance/src/dataset/tests/dataset_io.rs index e438e0801ea..86ae416b1be 100644 --- a/rust/lance/src/dataset/tests/dataset_io.rs +++ b/rust/lance/src/dataset/tests/dataset_io.rs @@ -36,9 +36,10 @@ use lance_file::version::LanceFileVersion; use lance_io::assert_io_eq; use lance_table::feature_flags; +use crate::index::DatasetIndexExt; use futures::TryStreamExt; +use lance_index::IndexType; use lance_index::scalar::ScalarIndexParams; -use lance_index::{DatasetIndexExt, IndexType}; use lance_io::object_store::{ObjectStore, ObjectStoreParams}; use lance_io::utils::tracking_store::IOTracker; use lance_table::io::manifest::read_manifest; diff --git a/rust/lance/src/dataset/tests/dataset_merge_update.rs b/rust/lance/src/dataset/tests/dataset_merge_update.rs index ef64773cc3c..1bdee1227b1 100644 --- a/rust/lance/src/dataset/tests/dataset_merge_update.rs +++ b/rust/lance/src/dataset/tests/dataset_merge_update.rs @@ -9,11 +9,12 @@ use crate::dataset::WriteDestination; use crate::dataset::optimize::{CompactionOptions, compact_files}; use crate::dataset::transaction::{DataReplacementGroup, Operation}; use crate::dataset::{AutoCleanupParams, MergeInsertBuilder, ProjectionRequest}; +use crate::index::DatasetIndexExt; use crate::{Dataset, Error}; use lance_core::ROW_ADDR; +use lance_index::IndexType; use lance_index::optimize::OptimizeOptions; use lance_index::scalar::ScalarIndexParams; -use lance_index::{DatasetIndexExt, IndexType}; use mock_instant::thread_local::MockClock; use crate::dataset::write::{InsertBuilder, WriteMode, WriteParams}; diff --git a/rust/lance/src/dataset/tests/dataset_migrations.rs b/rust/lance/src/dataset/tests/dataset_migrations.rs index ecbffa717ba..d71a65bfa69 100644 --- a/rust/lance/src/dataset/tests/dataset_migrations.rs +++ b/rust/lance/src/dataset/tests/dataset_migrations.rs @@ -11,12 +11,12 @@ use crate::{Dataset, Result}; use lance_table::format::IndexMetadata; use crate::dataset::write::{WriteMode, WriteParams}; +use crate::index::DatasetIndexExt; use arrow::compute::concat_batches; use arrow_array::RecordBatch; use arrow_array::{Float32Array, Int64Array, RecordBatchIterator}; use arrow_schema::Schema as ArrowSchema; use lance_file::version::LanceFileVersion; -use lance_index::DatasetIndexExt; use futures::{StreamExt, TryStreamExt}; use rstest::rstest; diff --git a/rust/lance/src/dataset/tests/dataset_scanner.rs b/rust/lance/src/dataset/tests/dataset_scanner.rs index e5e72b48f84..d5dac4c8562 100644 --- a/rust/lance/src/dataset/tests/dataset_scanner.rs +++ b/rust/lance/src/dataset/tests/dataset_scanner.rs @@ -9,6 +9,7 @@ use crate::index::vector::VectorIndexParams; use lance_arrow::FixedSizeListArrayExt; use lance_arrow::json::{JsonArray, is_arrow_json_field, json_field}; +use crate::index::DatasetIndexExt; use arrow::compute::concat_batches; use arrow_array::UInt64Array; use arrow_array::{Array, FixedSizeListArray}; @@ -24,7 +25,7 @@ use lance_index::scalar::FullTextSearchQuery; use lance_index::scalar::inverted::{ SCORE_FIELD, query::PhraseQuery, tokenizer::InvertedIndexParams, }; -use lance_index::{DatasetIndexExt, IndexType, vector::DIST_COL}; +use lance_index::{IndexType, vector::DIST_COL}; use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; use lance_io::utils::CachedFileSize; use lance_linalg::distance::MetricType; diff --git a/rust/lance/src/dataset/tests/dataset_transactions.rs b/rust/lance/src/dataset/tests/dataset_transactions.rs index 1f522800a44..2b49d0963e1 100644 --- a/rust/lance/src/dataset/tests/dataset_transactions.rs +++ b/rust/lance/src/dataset/tests/dataset_transactions.rs @@ -14,13 +14,13 @@ use crate::{Dataset, Result}; use lance_table::io::commit::ManifestNamingScheme; use crate::dataset::write::{CommitBuilder, InsertBuilder, WriteMode, WriteParams}; +use crate::index::DatasetIndexExt; use arrow_array::Array; use arrow_array::RecordBatch; use arrow_array::{Int32Array, RecordBatchIterator, StringArray, types::Int32Type}; use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; use lance_core::utils::tempfile::{TempDir, TempStrDir}; use lance_datagen::{BatchCount, RowCount, array}; -use lance_index::DatasetIndexExt; use crate::datafusion::LanceTableProvider; use datafusion::prelude::SessionContext; diff --git a/rust/lance/src/dataset/udtf.rs b/rust/lance/src/dataset/udtf.rs index d25aec45009..b39586c2777 100644 --- a/rust/lance/src/dataset/udtf.rs +++ b/rust/lance/src/dataset/udtf.rs @@ -259,13 +259,14 @@ impl FtsQueryUDTFBuilder { pub mod tests { use crate::Dataset; use crate::dataset::udtf::FtsQueryUDTFBuilder; + use crate::index::DatasetIndexExt; use arrow_array::{ Array, Int32Array, RecordBatch, RecordBatchIterator, StringArray, UInt64Array, }; use arrow_schema::{DataType, Field}; use datafusion::prelude::SessionContext; + use lance_index::IndexType; use lance_index::scalar::InvertedIndexParams; - use lance_index::{DatasetIndexExt, IndexType}; use std::sync::Arc; #[tokio::test] diff --git a/rust/lance/src/dataset/write/delete.rs b/rust/lance/src/dataset/write/delete.rs index 18916b087f5..2f7314925e6 100644 --- a/rust/lance/src/dataset/write/delete.rs +++ b/rust/lance/src/dataset/write/delete.rs @@ -288,6 +288,7 @@ mod tests { use super::*; use crate::dataset::{InsertBuilder, UpdateBuilder}; use crate::dataset::{WriteMode, WriteParams}; + use crate::index::DatasetIndexExt; use crate::utils::test::TestDatasetGenerator; use arrow::array::AsArray; use arrow::datatypes::UInt32Type; @@ -296,7 +297,7 @@ mod tests { use futures::TryStreamExt; use lance_core::utils::tempfile::TempStrDir; use lance_file::version::LanceFileVersion; - use lance_index::{DatasetIndexExt, IndexType, scalar::ScalarIndexParams}; + use lance_index::{IndexType, scalar::ScalarIndexParams}; use rstest::rstest; use std::collections::HashSet; use std::ops::Range; diff --git a/rust/lance/src/dataset/write/merge_insert.rs b/rust/lance/src/dataset/write/merge_insert.rs index cfc1e8f0dca..2c56d8349fc 100644 --- a/rust/lance/src/dataset/write/merge_insert.rs +++ b/rust/lance/src/dataset/write/merge_insert.rs @@ -29,6 +29,7 @@ use super::{CommitBuilder, WriteParams, write_fragments_internal}; use crate::dataset::rowids::get_row_id_index; use crate::dataset::transaction::UpdateMode::{RewriteColumns, RewriteRows}; use crate::dataset::utils::CapturedRowIds; +use crate::index::DatasetIndexExt; use crate::{ Dataset, datafusion::dataframe::SessionContextExt, @@ -94,8 +95,8 @@ use lance_datafusion::{ utils::StreamingWriteSource, }; use lance_file::version::LanceFileVersion; +use lance_index::IndexCriteria; use lance_index::mem_wal::MergedGeneration; -use lance_index::{DatasetIndexExt, IndexCriteria}; use lance_table::format::{Fragment, IndexMetadata, RowIdMeta}; use log::info; use roaring::RoaringTreemap; diff --git a/rust/lance/src/dataset/write/update.rs b/rust/lance/src/dataset/write/update.rs index 855d83018f4..ec34000642d 100644 --- a/rust/lance/src/dataset/write/update.rs +++ b/rust/lance/src/dataset/write/update.rs @@ -483,6 +483,7 @@ mod tests { use super::*; use crate::dataset::{WriteDestination, WriteMode}; + use crate::index::DatasetIndexExt; use crate::index::vector::VectorIndexParams; use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; use arrow::{array::AsArray, datatypes::UInt32Type}; @@ -495,7 +496,6 @@ mod tests { use lance_core::utils::tempfile::TempStrDir; use lance_datagen::{Dimension, RowCount}; use lance_file::version::LanceFileVersion; - use lance_index::DatasetIndexExt; use lance_index::IndexType; use lance_index::scalar::ScalarIndexParams; use lance_io::object_store::ObjectStoreParams; diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 28da8f61e1b..843fc8c7740 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -24,6 +24,7 @@ use lance_core::utils::tracing::{ }; use lance_file::previous::reader::FileReader as PreviousFileReader; use lance_file::reader::FileReaderOptions; +use lance_index::INDEX_METADATA_SCHEMA_KEY; pub use lance_index::IndexParams; use lance_index::frag_reuse::{FRAG_REUSE_INDEX_NAME, FragReuseIndex}; use lance_index::mem_wal::{MEM_WAL_INDEX_NAME, MemWalIndex}; @@ -42,7 +43,6 @@ use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex, FlatQuantize use lance_index::vector::hnsw::HNSW; use lance_index::vector::pq::ProductQuantizer; use lance_index::vector::sq::ScalarQuantizer; -use lance_index::{DatasetIndexExt, INDEX_METADATA_SCHEMA_KEY, IndexDescription, IndexSegment}; use lance_index::{INDEX_FILE_NAME, Index, IndexType, pb, vector::VectorIndex}; use lance_index::{ IndexCriteria, is_system_index, @@ -65,6 +65,7 @@ use uuid::Uuid; use vector::ivf::v2::IVFIndex; use vector::utils::get_vector_type; +mod api; pub(crate) mod append; mod create; pub mod frag_reuse; @@ -79,6 +80,7 @@ use crate::dataset::index::LanceIndexStoreExt; use crate::dataset::optimize::RemappedIndex; use crate::dataset::optimize::remapping::RemapResult; use crate::dataset::transaction::{Operation, Transaction, TransactionBuilder}; +pub use crate::index::api::{DatasetIndexExt, IndexSegment, IndexSegmentPlan}; use crate::index::frag_reuse::{load_frag_reuse_index_details, open_frag_reuse_index}; use crate::index::mem_wal::open_mem_wal_index; pub use crate::index::prefilter::{FilterLoader, PreFilter}; @@ -86,6 +88,7 @@ use crate::index::scalar::{IndexDetails, fetch_index_details, load_training_data use crate::session::index_caches::{FragReuseIndexKey, IndexMetadataKey}; use crate::{Error, Result, dataset::Dataset}; pub use create::CreateIndexBuilder; +pub use lance_index::IndexDescription; fn validate_index_segments(index_name: &str, segments: &[IndexSegment]) -> Result<()> { if segments.is_empty() { @@ -645,7 +648,8 @@ impl DatasetIndexExt for Dataset { /// Create a scalar BTREE index: /// ``` /// # use lance::{Dataset, Result}; - /// # use lance_index::{DatasetIndexExt, IndexType, scalar::ScalarIndexParams}; + /// # use lance::index::DatasetIndexExt; + /// # use lance_index::{IndexType, scalar::ScalarIndexParams}; /// # async fn example(dataset: &mut Dataset) -> Result<()> { /// let params = ScalarIndexParams::default(); /// dataset @@ -659,7 +663,8 @@ impl DatasetIndexExt for Dataset { /// Create an empty index that will be populated later: /// ``` /// # use lance::{Dataset, Result}; - /// # use lance_index::{DatasetIndexExt, IndexType, scalar::ScalarIndexParams}; + /// # use lance::index::DatasetIndexExt; + /// # use lance_index::{IndexType, scalar::ScalarIndexParams}; /// # async fn example(dataset: &mut Dataset) -> Result<()> { /// let params = ScalarIndexParams::default(); /// dataset diff --git a/rust/lance/src/index/api.rs b/rust/lance/src/index/api.rs new file mode 100644 index 00000000000..f8e7ee7d012 --- /dev/null +++ b/rust/lance/src/index/api.rs @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::execution::SendableRecordBatchStream; +use lance_index::{IndexParams, IndexType, optimize::OptimizeOptions}; +use lance_table::format::IndexMetadata; +use roaring::RoaringBitmap; +use uuid::Uuid; + +use crate::{Error, Result}; + +/// A single physical segment of a logical index. +/// +/// Each segment is stored independently and will become one manifest entry when committed. +/// The logical index identity (name / target column / dataset version) is provided separately +/// by the commit API. +#[derive(Debug, Clone, PartialEq)] +pub struct IndexSegment { + /// Unique ID of the physical segment. + uuid: Uuid, + /// The fragments covered by this segment. + fragment_bitmap: RoaringBitmap, + /// Metadata specific to the index type. + index_details: Arc, + /// The on-disk index version for this segment. + index_version: i32, +} + +impl IndexSegment { + /// Create a fully described segment with the given UUID, fragment coverage, and index + /// metadata. + pub fn new( + uuid: Uuid, + fragment_bitmap: I, + index_details: Arc, + index_version: i32, + ) -> Self + where + I: IntoIterator, + { + Self { + uuid, + fragment_bitmap: fragment_bitmap.into_iter().collect(), + index_details, + index_version, + } + } + + /// Return the UUID of this segment. + pub fn uuid(&self) -> Uuid { + self.uuid + } + + /// Return the fragment coverage of this segment. + pub fn fragment_bitmap(&self) -> &RoaringBitmap { + &self.fragment_bitmap + } + + /// Return the serialized index details for this segment. + pub fn index_details(&self) -> &Arc { + &self.index_details + } + + /// Return the on-disk index version for this segment. + pub fn index_version(&self) -> i32 { + self.index_version + } + + /// Consume the segment and return its component parts. + pub fn into_parts(self) -> (Uuid, RoaringBitmap, Arc, i32) { + ( + self.uuid, + self.fragment_bitmap, + self.index_details, + self.index_version, + ) + } +} + +/// A plan for building one physical segment from one or more existing +/// vector index segments. +#[derive(Debug, Clone, PartialEq)] +pub struct IndexSegmentPlan { + segment: IndexSegment, + segments: Vec, + estimated_bytes: u64, + requested_index_type: Option, +} + +impl IndexSegmentPlan { + /// Create a plan for one built segment. + pub fn new( + segment: IndexSegment, + segments: Vec, + estimated_bytes: u64, + requested_index_type: Option, + ) -> Self { + Self { + segment, + segments, + estimated_bytes, + requested_index_type, + } + } + + /// Return the segment metadata that should be committed after this plan is built. + pub fn segment(&self) -> &IndexSegment { + &self.segment + } + + /// Return the input segment metadata that should be combined into the segment. + pub fn segments(&self) -> &[IndexMetadata] { + &self.segments + } + + /// Return the estimated number of bytes covered by this plan. + pub fn estimated_bytes(&self) -> u64 { + self.estimated_bytes + } + + /// Return the requested logical index type, if one was supplied to the planner. + pub fn requested_index_type(&self) -> Option { + self.requested_index_type + } +} + +/// Extends [`crate::Dataset`] with secondary index APIs. +#[async_trait] +pub trait DatasetIndexExt { + type IndexBuilder<'a> + where + Self: 'a; + type IndexSegmentBuilder<'a> + where + Self: 'a; + + /// Create a builder for creating an index on columns. + /// + /// This returns a builder that can be configured with additional options + /// like `name()`, `replace()`, and `train()` before awaiting to execute. + fn create_index_builder<'a>( + &'a mut self, + columns: &'a [&'a str], + index_type: IndexType, + params: &'a dyn IndexParams, + ) -> Self::IndexBuilder<'a>; + + /// Create a builder for building physical index segments from uncommitted + /// vector index outputs. + /// + /// The caller supplies the uncommitted index metadata returned by + /// `execute_uncommitted()` so the builder can plan segment grouping without + /// rediscovering fragment coverage. + /// + /// This is the canonical entry point for distributed vector segment build. + /// After building the physical segments, publish them as a + /// logical index with [`Self::commit_existing_index_segments`]. + fn create_index_segment_builder<'a>(&'a self) -> Self::IndexSegmentBuilder<'a>; + + /// Create indices on columns. + /// + /// Upon finish, a new dataset version is generated. + async fn create_index( + &mut self, + columns: &[&str], + index_type: IndexType, + name: Option, + params: &dyn IndexParams, + replace: bool, + ) -> Result; + + /// Drop indices by name. + /// + /// Upon finish, a new dataset version is generated. + async fn drop_index(&mut self, name: &str) -> Result<()>; + + /// Prewarm an index by name. + /// + /// This will load the index into memory and cache it. + async fn prewarm_index(&self, name: &str) -> Result<()>; + + /// Read all indices of this Dataset version. + /// + /// The indices are lazy loaded and cached in memory within the `Dataset` instance. + /// The cache is invalidated when the dataset version (Manifest) is changed. + async fn load_indices(&self) -> Result>>; + + /// Loads all the indices of a given UUID. + /// + /// Note that it is possible to have multiple indices with the same UUID, + /// as they are the deltas of the same index. + async fn load_index(&self, uuid: &str) -> Result> { + self.load_indices().await.map(|indices| { + indices + .iter() + .find(|idx| idx.uuid.to_string() == uuid) + .cloned() + }) + } + + /// Loads a specific index with the given index name. + /// + /// Returns `Ok(vec![])` if the index does not exist. + async fn load_indices_by_name(&self, name: &str) -> Result> { + self.load_indices().await.map(|indices| { + indices + .iter() + .filter(|idx| idx.name == name) + .cloned() + .collect() + }) + } + + /// Loads a specific index with the given index name. + /// This function only works for indices that are unique. + /// If there are multiple indices sharing the same name, please use [`Self::load_indices_by_name`]. + async fn load_index_by_name(&self, name: &str) -> Result> { + let indices = self.load_indices_by_name(name).await?; + if indices.is_empty() { + Ok(None) + } else if indices.len() == 1 { + Ok(Some(indices[0].clone())) + } else { + Err(Error::index(format!( + "Found multiple indices of the same name: {:?}, please use load_indices_by_name", + indices.iter().map(|idx| &idx.name).collect::>() + ))) + } + } + + /// Describes indexes in a dataset. + /// + /// This method should only access the index metadata and should not load the index into memory. + async fn describe_indices<'a, 'b>( + &'a self, + criteria: Option>, + ) -> Result>>; + + /// Loads a specific scalar index using the provided criteria. + async fn load_scalar_index<'a, 'b>( + &'a self, + criteria: lance_index::IndexCriteria<'b>, + ) -> Result>; + + /// Optimize indices. + async fn optimize_indices(&mut self, options: &OptimizeOptions) -> Result<()>; + + /// Find an index with the given name and return its serialized statistics. + async fn index_statistics(&self, index_name: &str) -> Result; + + /// Commit one or more existing physical index segments as a logical index. + async fn commit_existing_index_segments( + &mut self, + index_name: &str, + column: &str, + segments: Vec, + ) -> Result<()>; + + async fn read_index_partition( + &self, + index_name: &str, + partition_id: usize, + with_vector: bool, + ) -> Result; +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::{IndexSegment, IndexSegmentPlan}; + use lance_index::IndexType; + use uuid::Uuid; + + #[test] + fn test_index_segment_plan_accessors() { + let uuid = Uuid::new_v4(); + let segment = IndexSegment::new(uuid, [1_u32, 3], Arc::new(prost_types::Any::default()), 7); + let plan = IndexSegmentPlan::new(segment.clone(), vec![], 128, Some(IndexType::BTree)); + + assert_eq!(segment.uuid(), uuid); + assert_eq!( + segment.fragment_bitmap().iter().collect::>(), + vec![1, 3] + ); + assert_eq!(segment.index_version(), 7); + assert_eq!(plan.segment().uuid(), uuid); + assert_eq!(plan.estimated_bytes(), 128); + assert_eq!(plan.requested_index_type(), Some(IndexType::BTree)); + } +} diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index b9944b9e656..d80b9bf710e 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -310,6 +310,7 @@ pub async fn merge_indices_with_unindexed_frags<'a>( mod tests { use super::*; + use crate::index::DatasetIndexExt; use arrow::datatypes::{Float32Type, UInt32Type}; use arrow_array::cast::AsArray; use arrow_array::{ @@ -324,7 +325,7 @@ mod tests { use lance_index::vector::hnsw::builder::HnswBuildParams; use lance_index::vector::sq::builder::SQBuildParams; use lance_index::{ - DatasetIndexExt, IndexType, + IndexType, scalar::ScalarIndexParams, vector::{ivf::IvfBuildParams, pq::PQBuildParams}, }; diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index a394f52258e..0bf9fdd283c 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -20,7 +20,7 @@ use crate::{ use futures::future::{BoxFuture, try_join_all}; use lance_core::datatypes::format_field_path; use lance_index::progress::{IndexBuildProgress, NoopIndexBuildProgress}; -use lance_index::{IndexParams, IndexSegment, IndexSegmentPlan, IndexType, scalar::CreatedIndex}; +use lance_index::{IndexParams, IndexType, scalar::CreatedIndex}; use lance_index::{ metrics::NoOpMetricsCollector, scalar::{LANCE_SCALAR_INDEX, ScalarIndexParams, inverted::tokenizer::InvertedIndexParams}, @@ -32,6 +32,8 @@ use uuid::Uuid; use arrow_array::RecordBatchReader; +use super::{IndexSegment, IndexSegmentPlan}; + /// Generate default index name from field path. /// /// Joins field names with `.` to create the base index name. diff --git a/rust/lance/src/index/frag_reuse.rs b/rust/lance/src/index/frag_reuse.rs index e9e66f972b6..e5f63514d86 100644 --- a/rust/lance/src/index/frag_reuse.rs +++ b/rust/lance/src/index/frag_reuse.rs @@ -3,8 +3,8 @@ use crate::Dataset; use crate::dataset::optimize::remapping::transpose_row_ids_from_digest; +use crate::index::DatasetIndexExt; use lance_core::Error; -use lance_index::DatasetIndexExt; use lance_index::frag_reuse::{ FRAG_REUSE_DETAILS_FILE_NAME, FRAG_REUSE_INDEX_NAME, FragReuseGroup, FragReuseIndex, FragReuseIndexDetails, FragReuseVersion, diff --git a/rust/lance/src/index/mem_wal.rs b/rust/lance/src/index/mem_wal.rs index c979ab42fe2..66d3a9aaca2 100644 --- a/rust/lance/src/index/mem_wal.rs +++ b/rust/lance/src/index/mem_wal.rs @@ -122,9 +122,9 @@ mod tests { use std::sync::Arc; + use crate::index::DatasetIndexExt; use arrow_array::{Int32Array, RecordBatch}; use arrow_schema::{DataType, Field, Schema}; - use lance_index::DatasetIndexExt; use crate::dataset::transaction::{Operation, Transaction}; use crate::dataset::{CommitBuilder, InsertBuilder, WriteParams}; diff --git a/rust/lance/src/index/scalar.rs b/rust/lance/src/index/scalar.rs index f2c9cfd8969..44739454bec 100644 --- a/rust/lance/src/index/scalar.rs +++ b/rust/lance/src/index/scalar.rs @@ -6,6 +6,7 @@ use std::sync::{Arc, LazyLock}; +use crate::index::DatasetIndexExt; use crate::index::DatasetIndexInternalExt; use crate::session::index_caches::ProstAny; use crate::{ @@ -39,7 +40,7 @@ use lance_index::scalar::{ ScalarIndex, ScalarIndexParams, bitmap::BITMAP_LOOKUP_NAME, inverted::INVERT_LIST_FILE, lance_format::LanceIndexStore, }; -use lance_index::{DatasetIndexExt, IndexCriteria, IndexType}; +use lance_index::{IndexCriteria, IndexType}; use lance_table::format::{Fragment, IndexMetadata}; use log::info; use tracing::instrument; @@ -819,9 +820,9 @@ mod tests { #[tokio::test] async fn test_initialize_scalar_index_btree() { use crate::dataset::Dataset; + use crate::index::DatasetIndexExt; use arrow_array::types::Float32Type; use lance_datagen::{BatchCount, RowCount, array}; - use lance_index::DatasetIndexExt; use lance_index::metrics::NoOpMetricsCollector; use lance_index::scalar::ScalarIndexParams; @@ -925,9 +926,9 @@ mod tests { #[tokio::test] async fn test_optimize_scalar_index_btree() { use crate::dataset::Dataset; + use crate::index::DatasetIndexExt; use arrow_array::types::Float32Type; use lance_datagen::{BatchCount, RowCount, array}; - use lance_index::DatasetIndexExt; use lance_index::metrics::NoOpMetricsCollector; use lance_index::scalar::ScalarIndexParams; @@ -1043,9 +1044,9 @@ mod tests { #[tokio::test] async fn test_initialize_scalar_index_bitmap() { use crate::dataset::Dataset; + use crate::index::DatasetIndexExt; use arrow_array::types::Float32Type; use lance_datagen::{BatchCount, RowCount, array}; - use lance_index::DatasetIndexExt; use lance_index::scalar::ScalarIndexParams; let test_dir = TempStrDir::default(); @@ -1123,8 +1124,8 @@ mod tests { #[tokio::test] async fn test_initialize_scalar_index_inverted() { use crate::dataset::Dataset; + use crate::index::DatasetIndexExt; use lance_datagen::{BatchCount, ByteCount, RowCount, array}; - use lance_index::DatasetIndexExt; use lance_index::metrics::NoOpMetricsCollector; use lance_index::scalar::inverted::tokenizer::InvertedIndexParams; @@ -1262,9 +1263,9 @@ mod tests { #[tokio::test] async fn test_initialize_scalar_index_zonemap() { use crate::dataset::Dataset; + use crate::index::DatasetIndexExt; use arrow_array::types::Float32Type; use lance_datagen::{BatchCount, RowCount, array}; - use lance_index::DatasetIndexExt; use lance_index::metrics::NoOpMetricsCollector; use lance_index::scalar::ScalarIndexParams; use lance_index::scalar::zonemap::ZoneMapIndexBuilderParams; diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index a19f472a9bf..4b5b113cae9 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -47,9 +47,7 @@ use lance_index::vector::{ pq::PQBuildParams, sq::{ScalarQuantizer, builder::SQBuildParams}, }; -use lance_index::{ - DatasetIndexExt, INDEX_AUXILIARY_FILE_NAME, INDEX_METADATA_SCHEMA_KEY, IndexType, -}; +use lance_index::{INDEX_AUXILIARY_FILE_NAME, INDEX_METADATA_SCHEMA_KEY, IndexType}; use lance_io::traits::Reader; use lance_linalg::distance::*; use lance_table::format::{IndexMetadata, list_index_files_with_sizes}; @@ -58,7 +56,7 @@ use tracing::instrument; use utils::get_vector_type; use uuid::Uuid; -use super::{DatasetIndexInternalExt, IndexParams, pb, vector_index_details}; +use super::{DatasetIndexExt, DatasetIndexInternalExt, IndexParams, pb, vector_index_details}; use crate::dataset::index::dataset_format_version; use crate::dataset::transaction::{Operation, Transaction}; use crate::{Error, Result, dataset::Dataset, index::pb::vector_index_stage::Stage}; @@ -1688,6 +1686,7 @@ fn derive_hnsw_params(source_index: &dyn VectorIndex) -> HnswBuildParams { mod tests { use super::*; use crate::dataset::Dataset; + use crate::index::DatasetIndexExt; use arrow_array::Array; use arrow_array::RecordBatch; use arrow_array::types::{Float32Type, Int32Type}; @@ -1695,7 +1694,6 @@ mod tests { use lance_core::utils::tempfile::TempStrDir; use lance_datagen::{BatchCount, RowCount, array}; use lance_file::writer::FileWriterOptions; - use lance_index::DatasetIndexExt; use lance_index::metrics::NoOpMetricsCollector; use lance_linalg::distance::MetricType; diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index 8e47b77cdfd..90864154094 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -64,8 +64,7 @@ use lance_index::vector::quantizer::QuantizationType; use lance_index::vector::v3::shuffler::create_ivf_shuffler; use lance_index::vector::v3::subindex::{IvfSubIndex, SubIndexType}; use lance_index::{ - INDEX_AUXILIARY_FILE_NAME, INDEX_METADATA_SCHEMA_KEY, Index, IndexMetadata, IndexSegment, - IndexSegmentPlan, IndexType, + INDEX_AUXILIARY_FILE_NAME, INDEX_METADATA_SCHEMA_KEY, Index, IndexMetadata, IndexType, optimize::OptimizeOptions, vector::{ Query, VectorIndex, @@ -103,6 +102,8 @@ use tokio::sync::mpsc; use tracing::instrument; use uuid::Uuid; +use crate::index::{IndexSegment, IndexSegmentPlan}; + pub mod builder; pub mod io; pub mod v2; diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index 92b0549f43b..70ee762f59d 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -626,7 +626,9 @@ mod tests { use lance_index::vector::storage::VectorStore; use crate::dataset::{InsertBuilder, UpdateBuilder, WriteMode, WriteParams}; + use crate::index::DatasetIndexExt; use crate::index::DatasetIndexInternalExt; + use crate::index::IndexSegment; use crate::index::vector::ivf::v2::IvfPq; use crate::index::vector::ivf::{build_segment, plan_segments}; use crate::utils::test::copy_test_data_to_tmp; @@ -644,6 +646,7 @@ mod tests { use lance_encoding::decoder::DecoderPlugins; use lance_file::reader::{FileReader, FileReaderOptions}; use lance_file::writer::FileWriter; + use lance_index::IndexType; use lance_index::vector::DIST_COL; use lance_index::vector::ivf::IvfBuildParams; use lance_index::vector::kmeans::{KMeansParams, train_kmeans}; @@ -653,7 +656,6 @@ mod tests { use lance_index::vector::{ pq::storage::ProductQuantizationMetadata, storage::STORAGE_METADATA_KEY, }; - use lance_index::{DatasetIndexExt, IndexSegment, IndexType}; use lance_index::{INDEX_AUXILIARY_FILE_NAME, metrics::NoOpMetricsCollector}; use lance_index::{optimize::OptimizeOptions, scalar::IndexReader}; use lance_index::{scalar::IndexWriter, vector::hnsw::builder::HnswBuildParams}; diff --git a/rust/lance/src/io/commit.rs b/rust/lance/src/io/commit.rs index 7a7b9a5a77a..2e61054d28f 100644 --- a/rust/lance/src/io/commit.rs +++ b/rust/lance/src/io/commit.rs @@ -48,6 +48,7 @@ use crate::dataset::{ ManifestWriteConfig, NewTransactionResult, TRANSACTIONS_DIR, load_new_transactions, write_manifest_file, }; +use crate::index::DatasetIndexExt; use crate::index::DatasetIndexInternalExt; use crate::io::deletion::read_dataset_deletion_file; use crate::session::Session; @@ -56,7 +57,7 @@ use crate::session::index_caches::IndexMetadataKey; use futures::future::Either; use futures::{StreamExt, TryFutureExt, TryStreamExt}; use lance_core::{Error, Result}; -use lance_index::{DatasetIndexExt, is_system_index}; +use lance_index::is_system_index; use lance_io::object_store::ObjectStoreRegistry; use log; use object_store::path::Path; diff --git a/rust/lance/src/io/commit/conflict_resolver.rs b/rust/lance/src/io/commit/conflict_resolver.rs index 3765b87eec7..8949cb383cc 100644 --- a/rust/lance/src/io/commit/conflict_resolver.rs +++ b/rust/lance/src/io/commit/conflict_resolver.rs @@ -1,6 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +use crate::index::DatasetIndexExt; use crate::index::frag_reuse::{build_frag_reuse_index_metadata, load_frag_reuse_index_details}; use crate::index::mem_wal::{load_mem_wal_index_details, new_mem_wal_index_meta}; use crate::io::deletion::read_dataset_deletion_file; @@ -14,7 +15,6 @@ use lance_core::{ Error, Result, utils::{deletion::DeletionVector, mask::RowAddrTreeMap}, }; -use lance_index::DatasetIndexExt; use lance_index::frag_reuse::FRAG_REUSE_INDEX_NAME; use lance_index::mem_wal::{MEM_WAL_INDEX_NAME, MergedGeneration}; use lance_table::format::IndexMetadata; diff --git a/rust/lance/src/io/exec/filtered_read.rs b/rust/lance/src/io/exec/filtered_read.rs index 35fd486a283..e3837f9ce4f 100644 --- a/rust/lance/src/io/exec/filtered_read.rs +++ b/rust/lance/src/io/exec/filtered_read.rs @@ -2052,6 +2052,7 @@ impl ExecutionPlan for FilteredReadExec { mod tests { use std::collections::HashSet; + use crate::index::DatasetIndexExt; use arrow::{ compute::concat_batches, datatypes::{Float32Type, UInt32Type, UInt64Type}, @@ -2064,7 +2065,7 @@ mod tests { use lance_core::utils::tempfile::TempStrDir; use lance_datagen::{BatchCount, Dimension, RowCount, array, gen_batch}; use lance_index::{ - DatasetIndexExt, IndexType, + IndexType, optimize::OptimizeOptions, scalar::{ScalarIndexParams, expression::PlannerIndexExt}, }; diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs index 6e129841c76..0858c294753 100644 --- a/rust/lance/src/io/exec/fts.rs +++ b/rust/lance/src/io/exec/fts.rs @@ -25,7 +25,9 @@ use lance_datafusion::utils::{ExecutionPlanMetricsSetExt, MetricsExt, PARTITIONS use super::PreFilterSource; use super::utils::{IndexMetrics, InstrumentedRecordBatchStreamAdapter, build_prefilter}; +use crate::index::DatasetIndexExt; use crate::{Dataset, index::DatasetIndexInternalExt}; +use lance_index::IndexCriteria; use lance_index::metrics::MetricsCollector; use lance_index::scalar::inverted::builder::document_input; use lance_index::scalar::inverted::lance_tokenizer::{DocType, JsonTokenizer, LanceTokenizer}; @@ -37,7 +39,6 @@ use lance_index::scalar::inverted::tokenizer::lance_tokenizer::TextTokenizer; use lance_index::scalar::inverted::{ FTS_SCHEMA, InvertedIndex, SCORE_COL, flat_bm25_search_stream, }; -use lance_index::{DatasetIndexExt, IndexCriteria}; use lance_index::{prefilter::PreFilter, scalar::inverted::query::BooleanQuery}; use tracing::instrument; @@ -1384,6 +1385,7 @@ impl ExecutionPlan for BooleanQueryExec { pub mod tests { use std::sync::{Arc, Mutex}; + use crate::index::DatasetIndexExt; use datafusion::{execution::TaskContext, physical_plan::ExecutionPlan}; use lance_datafusion::datagen::DatafusionDatagenExt; use lance_datafusion::exec::{ExecutionStatsCallback, ExecutionSummaryCounts}; @@ -1396,7 +1398,7 @@ pub mod tests { PhraseQuery, }; use lance_index::scalar::{FullTextSearchQuery, InvertedIndexParams}; - use lance_index::{DatasetIndexExt, IndexCriteria, IndexType}; + use lance_index::{IndexCriteria, IndexType}; use crate::{ index::DatasetIndexInternalExt, diff --git a/rust/lance/src/io/exec/knn.rs b/rust/lance/src/io/exec/knn.rs index ab7589ce1a8..83240b19741 100644 --- a/rust/lance/src/io/exec/knn.rs +++ b/rust/lance/src/io/exec/knn.rs @@ -1344,6 +1344,7 @@ impl ExecutionPlan for MultivectorScoringExec { mod tests { use super::*; + use crate::index::DatasetIndexExt; use arrow::compute::{concat_batches, sort_to_indices, take_record_batch}; use arrow::datatypes::Float32Type; use arrow_array::{ @@ -1353,10 +1354,10 @@ mod tests { use lance_core::utils::tempfile::TempStrDir; use lance_datafusion::exec::{ExecutionStatsCallback, ExecutionSummaryCounts}; use lance_datagen::{BatchCount, RowCount, array}; + use lance_index::IndexType; use lance_index::optimize::OptimizeOptions; use lance_index::vector::ivf::IvfBuildParams; use lance_index::vector::pq::PQBuildParams; - use lance_index::{DatasetIndexExt, IndexType}; use lance_linalg::distance::MetricType; use lance_testing::datagen::generate_random_array; use rstest::rstest; diff --git a/rust/lance/src/io/exec/scalar_index.rs b/rust/lance/src/io/exec/scalar_index.rs index aafd3c8d2eb..f587ec22a91 100644 --- a/rust/lance/src/io/exec/scalar_index.rs +++ b/rust/lance/src/io/exec/scalar_index.rs @@ -7,7 +7,7 @@ use super::utils::{IndexMetrics, InstrumentedRecordBatchStreamAdapter}; use crate::{ Dataset, dataset::rowids::load_row_id_sequences, - index::{DatasetIndexInternalExt, prefilter::DatasetPreFilter}, + index::{DatasetIndexExt, DatasetIndexInternalExt, prefilter::DatasetPreFilter}, }; use arrow_array::{Array, RecordBatch, UInt64Array}; use arrow_schema::{Schema, SchemaRef}; @@ -40,7 +40,7 @@ use lance_datafusion::{ }, }; use lance_index::{ - DatasetIndexExt, IndexCriteria, + IndexCriteria, metrics::MetricsCollector, scalar::{ SargableQuery, ScalarIndex, @@ -733,6 +733,7 @@ impl ExecutionPlan for MaterializeIndexExec { mod tests { use std::{ops::Bound, sync::Arc}; + use crate::index::DatasetIndexExt; use arrow::datatypes::UInt64Type; use datafusion::{ execution::TaskContext, physical_plan::ExecutionPlan, prelude::SessionConfig, @@ -742,7 +743,7 @@ mod tests { use lance_core::utils::tempfile::TempStrDir; use lance_datagen::gen_batch; use lance_index::{ - DatasetIndexExt, IndexType, + IndexType, scalar::{ SargableQuery, ScalarIndexParams, expression::{ScalarIndexExpr, ScalarIndexSearch}, diff --git a/rust/lance/src/session/index_extension.rs b/rust/lance/src/session/index_extension.rs index b500d52086c..2055f64e340 100644 --- a/rust/lance/src/session/index_extension.rs +++ b/rust/lance/src/session/index_extension.rs @@ -65,6 +65,7 @@ mod test { sync::{Arc, atomic::AtomicBool}, }; + use crate::index::DatasetIndexExt; use arrow_array::{Float32Array, RecordBatch, UInt32Array}; use arrow_schema::Schema; use datafusion::execution::SendableRecordBatchStream; @@ -75,8 +76,7 @@ mod test { use lance_file::version::LanceFileVersion; use lance_index::vector::v3::subindex::SubIndexType; use lance_index::{ - DatasetIndexExt, INDEX_FILE_NAME, INDEX_METADATA_SCHEMA_KEY, Index, IndexMetadata, - IndexType, + INDEX_FILE_NAME, INDEX_METADATA_SCHEMA_KEY, Index, IndexMetadata, IndexType, vector::{Query, hnsw::VECTOR_ID_FIELD}, }; use lance_index::{ diff --git a/rust/lance/tests/query/inverted.rs b/rust/lance/tests/query/inverted.rs index 63b36b6a823..c9ce1231d92 100644 --- a/rust/lance/tests/query/inverted.rs +++ b/rust/lance/tests/query/inverted.rs @@ -7,9 +7,10 @@ use arrow_array::{ArrayRef, Int32Array, RecordBatch, StringArray, UInt32Array}; use lance::Dataset; use lance::dataset::scanner::ColumnOrdering; use lance::dataset::{InsertBuilder, WriteParams}; +use lance::index::DatasetIndexExt; +use lance_index::IndexType; use lance_index::scalar::inverted::query::{FtsQuery, PhraseQuery}; use lance_index::scalar::{FullTextSearchQuery, InvertedIndexParams}; -use lance_index::{DatasetIndexExt, IndexType}; use tantivy::tokenizer::Language; use super::{strip_score_column, test_fts, test_scan, test_take}; diff --git a/rust/lance/tests/query/primitives.rs b/rust/lance/tests/query/primitives.rs index 6c72f66bd23..65fa6f4e4d3 100644 --- a/rust/lance/tests/query/primitives.rs +++ b/rust/lance/tests/query/primitives.rs @@ -13,8 +13,9 @@ use lance::Dataset; use lance::dataset::WriteParams; use lance::dataset::optimize::{CompactionOptions, compact_files}; +use lance::index::DatasetIndexExt; use lance_datagen::{ArrayGeneratorExt, RowCount, array, gen_batch}; -use lance_index::{DatasetIndexExt, IndexType}; +use lance_index::IndexType; use super::{test_filter, test_scan, test_take}; use crate::utils::DatasetTestCases; diff --git a/rust/lance/tests/utils/mod.rs b/rust/lance/tests/utils/mod.rs index c407c62c03b..b8a034a50b0 100644 --- a/rust/lance/tests/utils/mod.rs +++ b/rust/lance/tests/utils/mod.rs @@ -7,6 +7,7 @@ use std::sync::Arc; use arrow_array::{ArrayRef, Int32Array, RecordBatch}; use futures::FutureExt; +use lance::index::DatasetIndexExt; use lance::index::vector::VectorIndexParams; use lance::{ Dataset, @@ -17,7 +18,7 @@ use lance_index::vector::hnsw::builder::HnswBuildParams; use lance_index::vector::ivf::IvfBuildParams; use lance_index::vector::pq::PQBuildParams; use lance_index::vector::sq::builder::SQBuildParams; -use lance_index::{DatasetIndexExt, IndexParams, IndexType}; +use lance_index::{IndexParams, IndexType}; use lance_linalg::distance::{DistanceType, MetricType}; #[derive(Clone, Copy, Debug)]