Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions java/lance-jni/src/blocking_dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,18 +38,18 @@ use lance::dataset::{
ColumnAlteration, CommitBuilder, Dataset, NewColumnTransform, ProjectionRequest, ReadParams,
Version, WriteParams,
};
use lance::index::{DatasetIndexExt, IndexSegment};
use lance::io::commit::namespace_manifest::LanceNamespaceExternalManifestStore;
use lance::io::{ObjectStore, ObjectStoreParams};
use lance::session::Session as LanceSession;
use lance::table::format::IndexMetadata;
use lance::table::format::{BasePath, Fragment};
use lance_core::datatypes::Schema as LanceSchema;
use lance_file::version::LanceFileVersion;
use lance_index::DatasetIndexExt;
use lance_index::IndexCriteria as RustIndexCriteria;
use lance_index::optimize::OptimizeOptions;
use lance_index::scalar::btree::BTreeParameters;
use lance_index::{IndexParams, IndexSegment, IndexType};
use lance_index::{IndexParams, IndexType};
use lance_io::object_store::ObjectStoreRegistry;
use lance_io::object_store::StorageOptionsProvider;
use lance_namespace::LanceNamespace;
Expand Down
4 changes: 2 additions & 2 deletions python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ use lance::dataset::{
transaction::{Operation, Transaction},
};
use lance::index::vector::utils::get_vector_type;
use lance::index::{DatasetIndexInternalExt, vector::VectorIndexParams};
use lance::index::{DatasetIndexExt, DatasetIndexInternalExt, vector::VectorIndexParams};
use lance::{dataset::builder::DatasetBuilder, index::vector::IndexFileVersion};
use lance_arrow::as_fixed_size_list_array;
use lance_core::Error;
Expand All @@ -67,7 +67,7 @@ use lance_index::scalar::inverted::query::{
BooleanQuery, BoostQuery, FtsQuery, MatchQuery, MultiMatchQuery, Operator, PhraseQuery,
};
use lance_index::{
DatasetIndexExt, IndexParams, IndexType,
IndexParams, IndexType,
optimize::OptimizeOptions,
scalar::{FullTextSearchQuery, InvertedIndexParams, ScalarIndexParams},
vector::{
Expand Down
3 changes: 2 additions & 1 deletion python/src/indices.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use arrow_data::ArrayData;
use chrono::{DateTime, Utc};
use lance::dataset::Dataset as LanceDataset;
use lance::index::vector::ivf::builder::write_vector_storage;
use lance::index::{DatasetIndexExt, IndexSegment, IndexSegmentPlan};
use lance::io::ObjectStore;
use lance_index::progress::NoopIndexBuildProgress;
use lance_index::vector::ivf::shuffler::{IvfShuffler, shuffle_vectors};
Expand All @@ -37,7 +38,7 @@ use crate::{
dataset::Dataset, error::PythonErrorExt, file::object_store_from_uri_or_path_no_options, rt,
};
use lance::index::vector::ivf::write_ivf_pq_file_from_existing_index;
use lance_index::{DatasetIndexExt, IndexDescription, IndexSegment, IndexSegmentPlan, IndexType};
use lance_index::{IndexDescription, IndexType};
use uuid::Uuid;

#[pyclass(name = "IndexConfig", module = "lance.indices", get_all)]
Expand Down
2 changes: 1 addition & 1 deletion python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ use ::arrow::pyarrow::PyArrowType;
use ::arrow_schema::Schema as ArrowSchema;
use ::lance::arrow::json::ArrowJsonExt;
use ::lance::datafusion::LanceTableProvider;
use ::lance::index::DatasetIndexExt;
use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
use datafusion_ffi::table_provider::FFI_TableProvider;
#[cfg(feature = "datagen")]
Expand All @@ -52,7 +53,6 @@ use file::{
LanceBufferDescriptor, LanceColumnMetadata, LanceFileMetadata, LanceFileReader,
LanceFileStatistics, LanceFileWriter, LancePageMetadata, stable_version,
};
use lance_index::DatasetIndexExt;
use log::Level;
use pyo3::exceptions::PyIOError;
use pyo3::prelude::*;
Expand Down
2 changes: 1 addition & 1 deletion rust/examples/src/full_text_search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ use arrow::datatypes::UInt64Type;
use arrow_schema::{DataType, Field, Schema};
use itertools::Itertools;
use lance::Dataset;
use lance::index::DatasetIndexExt;
use lance_datagen::{RowCount, array};
use lance_index::DatasetIndexExt;
use lance_index::scalar::inverted::flat_full_text_search;
use lance_index::scalar::{FullTextSearchQuery, InvertedIndexParams};
use object_store::path::Path;
Expand Down
3 changes: 2 additions & 1 deletion rust/examples/src/ivf_hnsw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@ use clap::Parser;
use futures::TryStreamExt;
use lance::Dataset;
use lance::dataset::ProjectionRequest;
use lance::index::DatasetIndexExt;
use lance::index::vector::VectorIndexParams;
use lance_index::IndexType;
use lance_index::vector::hnsw::builder::HnswBuildParams;
use lance_index::vector::ivf::IvfBuildParams;
use lance_index::vector::sq::builder::SQBuildParams;
use lance_index::{DatasetIndexExt, IndexType};
use lance_linalg::distance::MetricType;

#[derive(Parser, Debug)]
Expand Down
2 changes: 0 additions & 2 deletions rust/lance-index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,9 @@ pub mod progress;
pub mod registry;
pub mod scalar;
pub mod traits;
pub mod types;
pub mod vector;

pub use crate::traits::*;
pub use crate::types::{IndexSegment, IndexSegmentPlan};

pub const INDEX_FILE_NAME: &str = "index.idx";
/// The name of the auxiliary index file.
Expand Down
195 changes: 1 addition & 194 deletions rust/lance-index/src/traits.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,8 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use std::sync::Arc;
use lance_core::Result;

use async_trait::async_trait;
use datafusion::execution::SendableRecordBatchStream;
use lance_core::{Error, Result};

use crate::{IndexParams, IndexType, optimize::OptimizeOptions, types::IndexSegment};
use lance_table::format::IndexMetadata;

/// A set of criteria used to filter potential indices to use for a query
Expand Down Expand Up @@ -121,191 +116,3 @@ pub trait IndexDescription: Send + Sync {
/// (for backward compatibility with indices created before file tracking was added).
fn total_size_bytes(&self) -> Option<u64>;
}

/// Extends a Lance `Dataset` with secondary-index operations.
#[async_trait]
pub trait DatasetIndexExt {
    /// Builder type returned by [`Self::create_index_builder`].
    type IndexBuilder<'a>
    where
        Self: 'a;
    /// Builder type returned by [`Self::create_index_segment_builder`].
    type IndexSegmentBuilder<'a>
    where
        Self: 'a;

    /// Create a builder for creating an index on columns.
    ///
    /// This returns a builder that can be configured with additional options
    /// like `name()`, `replace()`, and `train()` before awaiting to execute.
    ///
    /// # Parameters
    /// - `columns`: the columns to build the indices on.
    /// - `index_type`: specify [`IndexType`].
    /// - `params`: index parameters.
    fn create_index_builder<'a>(
        &'a mut self,
        columns: &'a [&'a str],
        index_type: IndexType,
        params: &'a dyn IndexParams,
    ) -> Self::IndexBuilder<'a>;

    /// Create a builder for building physical index segments from uncommitted
    /// vector index outputs.
    ///
    /// The caller supplies the uncommitted index metadata returned by
    /// `execute_uncommitted()` so the builder can plan segment grouping without
    /// rediscovering fragment coverage.
    ///
    /// This is the canonical entry point for distributed vector segment build.
    /// After building the physical segments, publish them as a
    /// logical index with [`Self::commit_existing_index_segments`].
    fn create_index_segment_builder<'a>(&'a self) -> Self::IndexSegmentBuilder<'a>;

    /// Create indices on columns.
    ///
    /// Upon finish, a new dataset version is generated.
    ///
    /// Parameters:
    ///
    /// - `columns`: the columns to build the indices on.
    /// - `index_type`: specify [`IndexType`].
    /// - `name`: optional index name. Must be unique in the dataset.
    ///   If not provided, it will auto-generate one.
    /// - `params`: index parameters.
    /// - `replace`: replace the existing index if it exists.
    ///
    /// Returns the metadata of the created index.
    async fn create_index(
        &mut self,
        columns: &[&str],
        index_type: IndexType,
        name: Option<String>,
        params: &dyn IndexParams,
        replace: bool,
    ) -> Result<IndexMetadata>;

    /// Drop indices by name.
    ///
    /// Upon finish, a new dataset version is generated.
    ///
    /// Parameters:
    ///
    /// - `name`: the name of the index to drop.
    async fn drop_index(&mut self, name: &str) -> Result<()>;

    /// Prewarm an index by name.
    ///
    /// This will load the index into memory and cache it.
    ///
    /// Generally, this should only be called when it is known the entire index will
    /// fit into the index cache.
    ///
    /// This is a hint that is not enforced by all indices today. Some indices may choose
    /// to ignore this hint.
    async fn prewarm_index(&self, name: &str) -> Result<()>;

    /// Read all indices of this Dataset version.
    ///
    /// The indices are lazy loaded and cached in memory within the `Dataset` instance.
    /// The cache is invalidated when the dataset version (Manifest) is changed.
    async fn load_indices(&self) -> Result<Arc<Vec<IndexMetadata>>>;

    /// Loads all the indices of a given UUID.
    ///
    /// Note that it is possible to have multiple indices with the same UUID,
    /// as they are the deltas of the same index.
    async fn load_index(&self, uuid: &str) -> Result<Option<IndexMetadata>> {
        // Linear scan over the (cached) metadata list; only the first match is
        // returned even though multiple deltas may share a UUID.
        self.load_indices().await.map(|indices| {
            indices
                .iter()
                .find(|idx| idx.uuid.to_string() == uuid)
                .cloned()
        })
    }

    /// Loads a specific index with the given index name
    ///
    /// Returns
    /// -------
    /// - `Ok(indices)`: if the index exists, returns the index.
    /// - `Ok(vec![])`: if the index does not exist.
    /// - `Err(e)`: if there is an error loading indices.
    ///
    async fn load_indices_by_name(&self, name: &str) -> Result<Vec<IndexMetadata>> {
        // Name is not unique: delta indices share a name, so this collects all matches.
        self.load_indices().await.map(|indices| {
            indices
                .iter()
                .filter(|idx| idx.name == name)
                .cloned()
                .collect()
        })
    }

    /// Loads a specific index with the given index name.
    /// This function only works for indices that are unique.
    /// If there are multiple indices sharing the same name, please use [`Self::load_indices_by_name`]
    ///
    /// Returns
    /// -------
    /// - `Ok(Some(index))`: if the index exists, returns the index.
    /// - `Ok(None)`: if the index does not exist.
    /// - `Err(e)`: Index error if there are multiple indexes sharing the same name.
    ///
    async fn load_index_by_name(&self, name: &str) -> Result<Option<IndexMetadata>> {
        let indices = self.load_indices_by_name(name).await?;
        if indices.is_empty() {
            Ok(None)
        } else if indices.len() == 1 {
            Ok(Some(indices[0].clone()))
        } else {
            // Ambiguous lookup is an error rather than a silent pick of one delta.
            Err(Error::index(format!(
                "Found multiple indices of the same name: {:?}, please use load_indices_by_name",
                indices.iter().map(|idx| &idx.name).collect::<Vec<_>>()
            )))
        }
    }

    /// Describes indexes in a dataset
    ///
    /// This method should only access the index metadata and should not load the index into memory.
    ///
    /// More detailed information may be available from `index_statistics` but that will require
    /// loading the index into memory.
    ///
    /// # Parameters
    /// - `criteria`: optional filter restricting which indices are described;
    ///   `None` describes all indices.
    async fn describe_indices<'a, 'b>(
        &'a self,
        criteria: Option<IndexCriteria<'b>>,
    ) -> Result<Vec<Arc<dyn IndexDescription>>>;

    /// Loads a specific index with the given index name.
    ///
    /// Returns `Ok(None)` when no scalar index matches `criteria`.
    async fn load_scalar_index<'a, 'b>(
        &'a self,
        criteria: IndexCriteria<'b>,
    ) -> Result<Option<IndexMetadata>>;

    /// Optimize indices.
    async fn optimize_indices(&mut self, options: &OptimizeOptions) -> Result<()>;

    /// Find index with a given index_name and return its serialized statistics.
    ///
    /// If the index does not exist, return Error.
    async fn index_statistics(&self, index_name: &str) -> Result<String>;

    /// Commit one or more existing physical index segments as a logical index.
    ///
    /// This publishes already-built physical segments. It does not build
    /// or merge index data; callers should first build segments with
    /// [`Self::create_index_segment_builder`] or another index-specific build
    /// path and then pass the resulting segments here.
    ///
    /// # Parameters
    /// - `index_name`: name of the logical index to publish.
    /// - `column`: the column the segments index.
    /// - `segments`: the already-built physical segments to commit.
    async fn commit_existing_index_segments(
        &mut self,
        index_name: &str,
        column: &str,
        segments: Vec<IndexSegment>,
    ) -> Result<()>;

    /// Read a single partition of the named index as a stream of record batches.
    ///
    /// NOTE(review): `with_vector` presumably controls whether vector data is
    /// included in the returned stream — confirm against the implementation,
    /// which is not visible here.
    async fn read_index_partition(
        &self,
        index_name: &str,
        partition_id: usize,
        with_vector: bool,
    ) -> Result<SendableRecordBatchStream>;
}
Loading
Loading