Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions protos/table.proto
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,15 @@ message IndexMetadata {
//
// Indices should avoid putting large amounts of information in this field, as it will
// bloat the manifest.
//
// Indexes are plugins, and so the format of the details message is flexible and not fully
// defined by the table format. However, there are some conventions that should be followed:
//
// - When Lance APIs refer to indexes they will use the type URL of the index details as the
// identifier for the index type. If a user provides a simple string identifier like
// "btree" then it will be converted to "/lance.table.BTreeIndexDetails".
// - Type URL comparisons are case-insensitive. Therefore, an index must have a unique type
// URL ignoring case.
google.protobuf.Any index_details = 6;

// The minimum lance version that this index is compatible with.
Expand Down
20 changes: 20 additions & 0 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@

from .commit import CommitLock
from .io import StorageOptionsProvider
from .lance.indices import IndexDescription
from .progress import FragmentWriteProgress
from .types import ReaderLike

Expand Down Expand Up @@ -645,8 +646,27 @@ def checkout_latest(self):
self._ds.checkout_latest()

def list_indices(self) -> List[Index]:
    """
    Return index information for every index in the dataset.

    Deprecated: producing this list requires loading the statistics of each
    index, which can be a very expensive operation. Prefer
    describe_indices() to list index information, and index_statistics() to
    fetch the statistics of the individual indexes you care about.
    """
    # TODO: https://github.com/lancedb/lance/issues/5237 deprecate this method.
    # Until then the warning stays disabled; when it is enabled it should read:
    # warnings.warn(
    #     "The 'list_indices' method is deprecated. It may be removed in a future "
    #     "version. Use describe_indices() instead.",
    #     DeprecationWarning,
    # )
    loaded = self._ds.load_indices()
    return loaded

def describe_indices(self) -> List[IndexDescription]:
    """
    List a description of every index in the dataset.

    This simply forwards to ``describe_indices`` on the wrapped native
    dataset (``self._ds``) and returns its result unchanged.
    """
    descriptions = self._ds.describe_indices()
    return descriptions

def index_statistics(self, index_name: str) -> Dict[str, Any]:
warnings.warn(
"LanceDataset.index_statistics() is deprecated, "
Expand Down
2 changes: 2 additions & 0 deletions python/python/lance/lance/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ from .fragment import (
from .fragment import (
RowIdMeta as RowIdMeta,
)
from .indices import IndexDescription as IndexDescription
from .optimize import (
Compaction as Compaction,
)
Expand Down Expand Up @@ -215,6 +216,7 @@ class _Dataset:
# Statistics for the index named `index_name`, returned as a string at this level.
def index_statistics(self, index_name: str) -> str: ...
# The dataset manifest in serialized (bytes) form.
def serialized_manifest(self) -> bytes: ...
# Index information for the dataset; LanceDataset.list_indices() returns this directly.
def load_indices(self) -> List[Index]: ...
# One IndexDescription per index; LanceDataset.describe_indices() returns this directly.
def describe_indices(self) -> List[IndexDescription]: ...
def scanner(
self,
columns: Optional[List[str]] = None,
Expand Down
24 changes: 24 additions & 0 deletions python/python/lance/lance/indices/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from datetime import datetime
from typing import Optional

import pyarrow as pa

class IndexConfig:
Expand Down Expand Up @@ -47,3 +50,24 @@ def transform_vectors(
pq_codebook: pa.Array,
dst_uri: str,
): ...

class IndexSegmentDescription:
    """Type stub describing a single segment of an index.

    Instances are reached through ``IndexDescription.segments``.
    """

    # Identifier of this segment (presumably a UUID rendered as text - confirm).
    uuid: str
    # Dataset version recorded for the segment's most recent update (per the
    # field name; exact semantics are defined by the native implementation).
    dataset_version_at_last_update: int
    # Fragment ids associated with this segment, as a set (e.g. {0}).
    fragment_ids: set[int]
    # Version number of the index format/implementation for this segment
    # (NOTE(review): meaning inferred from the name - confirm).
    index_version: int
    # Creation timestamp; typed Optional, so callers must handle None.
    created_at: Optional[datetime]

    def __repr__(self) -> str: ...

class IndexDescription:
    """Type stub for the per-index description returned by ``describe_indices()``."""

    # Name of the index, e.g. "text_idx".
    name: str
    # Type URL of the index details message, e.g. "/lance.table.BTreeIndexDetails".
    type_url: str
    # Short index type name, e.g. "BTree" or "Inverted"; may be "Unknown"
    # when the type cannot be determined.
    index_type: str
    # Number of rows currently covered by the index.
    num_rows_indexed: int
    # Ids of the indexed fields (presumably parallel to `field_names` - confirm).
    fields: list[int]
    # Names of the indexed fields, e.g. ["text"].
    field_names: list[str]
    # The segments that make up this index.
    segments: list[IndexSegmentDescription]
    # Index-type-specific details as a plain dict; may be empty ({}).
    details: dict

    def __repr__(self) -> str: ...
121 changes: 121 additions & 0 deletions python/python/tests/test_scalar_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4006,3 +4006,124 @@ def test_json_inverted_match_query(tmp_path):
full_text_query=MatchQuery("Author,str,tolkien", "json_col")
)
assert results.num_rows == 1


def test_describe_indices(tmp_path):
    """Exercise ``describe_indices()`` across the scalar index types.

    The test runs in three stages:

    1. Create an inverted index on ``text`` and check every field of its
       description, including the tokenizer settings in ``details``.
    2. Create one index of each remaining scalar type and compare each
       description against a table of expected values.
    3. Delete half of the rows and check that ``num_rows_indexed`` reflects
       the deletion for every index.
    """
    data = pa.table(
        {
            "id": range(100),
            "text": [f"document {i} about lance database" for i in range(100)],
            "bitmap": range(100),
            "bloomfilter": range(100),
            "btree": range(100),
            "json": pa.array(
                [json.dumps({"key": f"value_{i}"}) for i in range(100)], pa.json_()
            ),
            "ngram": [f"document {i}" for i in range(100)],
            "zonemap": range(100),
        }
    )
    ds = lance.write_dataset(data, tmp_path)

    # Stage 1: a single inverted index, checked in full.
    ds.create_scalar_index("text", index_type="INVERTED")
    indices = ds.describe_indices()
    assert len(indices) == 1

    text_index = indices[0]
    assert text_index.name == "text_idx"
    assert text_index.type_url == "/lance.table.InvertedIndexDetails"
    assert text_index.index_type == "Inverted"
    assert text_index.num_rows_indexed == 100
    assert text_index.fields == [1]
    assert text_index.field_names == ["text"]
    assert len(text_index.segments) == 1

    text_segment = text_index.segments[0]
    assert text_segment.uuid is not None
    assert text_segment.fragment_ids == {0}
    assert text_segment.dataset_version_at_last_update == 1
    assert text_segment.index_version == 1
    assert text_segment.created_at is not None
    assert isinstance(text_segment.created_at, datetime)

    text_details = text_index.details
    assert text_details is not None and len(text_details) > 0
    assert text_details["lance_tokenizer"] is None
    assert text_details["base_tokenizer"] == "simple"
    assert text_details["language"] == "English"
    assert not text_details["with_position"]
    assert text_details["max_token_length"] == 40
    assert text_details["lower_case"]
    assert text_details["stem"]
    assert text_details["remove_stop_words"]
    assert text_details["custom_stop_words"] is None
    assert text_details["ascii_folding"]
    assert text_details["min_ngram_length"] == 3
    assert text_details["max_ngram_length"] == 3
    assert not text_details["prefix_only"]

    # Stage 2: one index for each of the remaining scalar index types.
    ds.create_scalar_index("bitmap", index_type="BITMAP")
    ds.create_scalar_index("bloomfilter", index_type="BLOOMFILTER")
    ds.create_scalar_index("btree", index_type="BTREE")
    ds.create_scalar_index(
        "json",
        IndexConfig(
            index_type="json", parameters={"target_index_type": "btree", "path": "x"}
        ),
    )
    ds.create_scalar_index("ngram", index_type="NGRAM")
    ds.create_scalar_index("zonemap", index_type="ZONEMAP")

    indices = ds.describe_indices()
    # Skip text index since it is already asserted above
    indices = [index for index in indices if index.name != "text_idx"]
    indices.sort(key=lambda x: x.name)

    # The expectation lists below are parallel and ordered by index name.
    names = [
        "bitmap_idx",
        "bloomfilter_idx",
        "btree_idx",
        "json_idx",
        "ngram_idx",
        "zonemap_idx",
    ]
    types_urls = [
        "/lance.table.BitmapIndexDetails",
        "/lance.index.pb.BloomFilterIndexDetails",
        "/lance.table.BTreeIndexDetails",
        "/lance.index.pb.JsonIndexDetails",
        "/lance.table.NGramIndexDetails",
        "/lance.table.ZoneMapIndexDetails",
    ]
    index_types = [
        "Bitmap",
        "BloomFilter",
        "BTree",
        "Json",
        "NGram",
        "ZoneMap",
    ]
    expected_details = [
        "{}",
        "{}",
        "{}",
        '{"path":"x","target_details":{}}',
        "{}",
        "{}",
    ]

    # Compare the full list of names up front: the loop below only visits the
    # descriptions that were actually returned, so on its own it would pass
    # even if some index were missing from the result.
    assert [index.name for index in indices] == names

    rows = zip(indices, types_urls, index_types, expected_details)
    for i, (index, type_url, index_type, details_json) in enumerate(rows):
        # "id" and "text" occupy the first two columns, so the i-th index in
        # name order sits on column i + 2.
        field_id = i + 2
        assert index.type_url == type_url
        assert index.index_type == index_type
        assert index.num_rows_indexed == 100
        assert index.fields == [field_id]
        assert index.field_names == [data.column_names[field_id]]
        assert len(index.segments) == 1

        segment = index.segments[0]
        assert segment.fragment_ids == {0}
        # The indexes were created in the same order as `names`, so the
        # expected version grows by one per index, starting at 2.
        assert segment.dataset_version_at_last_update == i + 2
        assert segment.index_version == 0
        assert segment.created_at is not None
        assert isinstance(segment.created_at, datetime)
        assert index.details == json.loads(details_json)

    # Stage 3: deleting rows must be reflected in every index description.
    ds.delete("id < 50")
    indices = ds.describe_indices()
    # Guard against the loop passing vacuously: the text index plus the six
    # indexes above must all still be described.
    assert len(indices) == len(names) + 1
    for index in indices:
        assert index.num_rows_indexed == 50
19 changes: 18 additions & 1 deletion python/python/tests/test_vector_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import pyarrow as pa
import pyarrow.compute as pc
import pytest
from lance import LanceFragment
from lance import LanceDataset, LanceFragment
from lance.dataset import VectorIndexReader
from lance.indices import IndexFileVersion
from lance.util import validate_vector_index # noqa: E402
Expand Down Expand Up @@ -1391,6 +1391,23 @@ def test_load_indices(dataset):
assert len(indices) == 1


def test_describe_vector_index(indexed_dataset: LanceDataset):
    """Check the description that ``describe_indices()`` reports for a vector index."""
    info = indexed_dataset.describe_indices()[0]

    assert info.name == "vector_idx"
    assert info.type_url == "/lance.table.VectorIndexDetails"
    # This is currently Unknown because vector indices are not yet handled by plugins.
    # It is a known regression relative to list_indices(), which can determine the
    # type of a vector index; that approach appears to involve opening the index,
    # whereas the goal here is to derive the type from the details / manifest only.
    assert info.index_type == "Unknown"
    assert info.num_rows_indexed == 1000
    assert info.fields == [0]
    assert info.field_names == ["vector"]
    assert len(info.segments) == 1

    segment = info.segments[0]
    assert segment.fragment_ids == {0}
    assert segment.dataset_version_at_last_update == 1
    assert segment.index_version == 1
    assert segment.created_at is not None


def test_optimize_indices(indexed_dataset):
data = create_table()
indexed_dataset = lance.write_dataset(data, indexed_dataset.uri, mode="append")
Expand Down
14 changes: 13 additions & 1 deletion python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ use lance_table::io::commit::CommitHandler;
use crate::error::PythonErrorExt;
use crate::file::object_store_from_uri_or_path;
use crate::fragment::FileFragment;
use crate::indices::PyIndexConfig;
use crate::indices::{PyIndexConfig, PyIndexDescription};
use crate::rt;
use crate::scanner::ScanStatistics;
use crate::schema::{logical_schema_from_lance, LanceSchema};
Expand Down Expand Up @@ -2599,6 +2599,18 @@ impl Dataset {
let builder = self.ds.sql(&sql);
Ok(SqlQueryBuilder { builder })
}

// Python-facing `describe_indices()`: asks the underlying dataset for its index
// descriptions and wraps each one in a `PyIndexDescription`.
// (Plain `//` comments are used so the Python-visible docstring is left untouched.)
#[pyo3(signature=())]
fn describe_indices(&self, py: Python<'_>) -> PyResult<Vec<PyIndexDescription>> {
    // Work on a clone of the inner dataset rather than borrowing `self.ds` for the call.
    let new_self = self.ds.as_ref().clone();
    // Run the call to completion on the runtime returned by `rt()`. The first `?`
    // propagates a failure reported by `block_on` itself; `infer_error()?` presumably
    // converts the library error into a Python exception (see `PythonErrorExt`) - confirm.
    // NOTE(review): the meaning of the `None` argument is not visible from this hunk.
    let indices = rt()
        .block_on(Some(py), new_self.describe_indices(None))?
        .infer_error()?;
    // Build one Python wrapper per description; the constructor also receives the dataset.
    Ok(indices
        .into_iter()
        .map(|desc| PyIndexDescription::new(desc.as_ref(), self.ds.as_ref()))
        .collect())
}
}

#[pyclass(name = "SqlQuery", module = "_lib", subclass)]
Expand Down
Loading
Loading