Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion java/src/main/java/org/lance/Dataset.java
Original file line number Diff line number Diff line change
Expand Up @@ -1257,7 +1257,11 @@ public List<String> listIndexes() {
/**
* Get all indexes with full metadata.
*
* @return list of Index objects with complete metadata including index type and fragment coverage
* <p>Each returned {@link Index} is a physical index segment from the manifest. Use {@link
* #describeIndices()} for the logical-index view.
*
* @return list of Index objects with complete segment metadata, including index type and fragment
* coverage
*/
public List<Index> getIndexes() {
try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) {
Expand Down
9 changes: 9 additions & 0 deletions java/src/main/java/org/lance/index/IndexDescription.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,15 @@ public List<Index> getMetadata() {
return metadata;
}

/**
 * Physical index segments for this logical index.
 *
 * <p>This is an alias for {@link #getMetadata()} with a less ambiguous name. It forwards to that
 * accessor, so both methods hand back the same list.
 *
 * @return the list of {@link Index} entries also returned by {@link #getMetadata()}
 */
public List<Index> getSegments() {
  return getMetadata();
}

/**
* JSON representation of index-specific details.
*
Expand Down
9 changes: 9 additions & 0 deletions java/src/test/java/org/lance/DatasetTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1924,14 +1924,23 @@ public void testDescribeIndicesByName(@TempDir Path tempDir) throws Exception {
assertTrue(desc.getRowsIndexed() > 0, "rowsIndexed should be positive");
assertNotNull(desc.getMetadata(), "Metadata list should not be null");
assertFalse(desc.getMetadata().isEmpty(), "Metadata list should not be empty");
assertEquals(
desc.getMetadata(), desc.getSegments(), "segments alias should match metadata");
assertNotNull(desc.getDetailsJson(), "Details JSON should not be null");

assertEquals(1, desc.getSegments().size(), "Expected exactly one physical segment");
assertEquals("index1", desc.getSegments().get(0).name());

descriptions = dataset.describeIndices();
assertEquals(2, descriptions.size(), "Expected exactly one matching index");
for (IndexDescription indexDesc : descriptions) {
assertTrue(indexDesc.getRowsIndexed() > 0, "rowsIndexed should be positive");
assertNotNull(indexDesc.getMetadata(), "Metadata list should not be null");
assertFalse(indexDesc.getMetadata().isEmpty(), "Metadata list should not be empty");
assertEquals(
indexDesc.getMetadata(),
indexDesc.getSegments(),
"segments alias should match metadata");
assertNotNull(indexDesc.getDetailsJson(), "Details JSON should not be null");
}
}
Expand Down
11 changes: 6 additions & 5 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,12 +641,13 @@ def checkout_latest(self):

def list_indices(self) -> List[Index]:
"""
Returns index information for all indices in the dataset.
Returns physical index segment information for all indices in the dataset.

This method is deprecated as it requires loading the statistics for each index
which can be a very expensive operation. Instead use describe_indices() to
list index information and index_statistics() to get the statistics for
individual indexes of interest.
which can be a very expensive operation. It also exposes physical index
segments directly. Instead use describe_indices() for logical index
descriptions and index_statistics() to get the statistics for individual
indexes of interest.
"""
warnings.warn(
"The 'list_indices' method is deprecated. It may be removed in a future "
Expand All @@ -657,7 +658,7 @@ def list_indices(self) -> List[Index]:
return self._ds.load_indices()

def describe_indices(self) -> List[IndexDescription]:
    """Return logical index information for all indices in the dataset.

    Each description is aggregated across all segments of the index. The
    call is forwarded unchanged to the underlying dataset handle.
    """
    descriptions = self._ds.describe_indices()
    return descriptions

def index_statistics(self, index_name: str) -> Dict[str, Any]:
Expand Down
2 changes: 2 additions & 0 deletions python/python/lance/indices/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .pq import PqModel

IndexSegment = _lance.indices.IndexSegment
IndexSegmentDescription = _lance.indices.IndexSegmentDescription
IndexSegmentPlan = _lance.indices.IndexSegmentPlan

__all__ = [
Expand All @@ -18,6 +19,7 @@
"IvfModel",
"IndexFileVersion",
"IndexSegment",
"IndexSegmentDescription",
"IndexSegmentPlan",
]

Expand Down
1 change: 1 addition & 0 deletions python/python/lance/lance/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ from .fragment import (
)
from .indices import IndexDescription as IndexDescription
from .indices import IndexSegment as IndexSegment
from .indices import IndexSegmentDescription as IndexSegmentDescription
from .indices import IndexSegmentPlan as IndexSegmentPlan
from .lance import PySearchFilter
from .optimize import (
Expand Down
16 changes: 16 additions & 0 deletions python/python/tests/test_vector_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1643,6 +1643,22 @@ def test_optimize_indices(indexed_dataset):
assert stats["num_indices"] == 2


def test_logical_and_physical_index_views(indexed_dataset):
    """Check that logical descriptions and index stats agree on segments.

    New rows are appended and indices are optimized with
    ``num_indices_to_merge=0``; afterwards the single logical index
    ``vector_idx`` is expected to report two segments, and the ``segments``
    statistics keys are expected to mirror the ``indices`` keys.
    """
    appended = lance.write_dataset(create_table(), indexed_dataset.uri, mode="append")
    appended.optimize.optimize_indices(num_indices_to_merge=0)

    # Logical view: one description that fans out into two segments.
    descriptions = appended.describe_indices()
    assert len(descriptions) == 1
    vector_index = descriptions[0]
    assert vector_index.name == "vector_idx"
    assert len(vector_index.segments) == 2
    for segment in vector_index.segments:
        assert segment.fragment_ids

    # Statistics view: the segment-named keys alias the index-named keys.
    index_stats = appended.stats.index_stats("vector_idx")
    assert index_stats["num_segments"] == index_stats["num_indices"] == 2
    assert index_stats["segments"] == index_stats["indices"]


@pytest.mark.skip(reason="retrain is deprecated")
def test_retrain_indices(indexed_dataset):
data = create_table()
Expand Down
35 changes: 19 additions & 16 deletions python/src/indices.rs
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,24 @@ pub struct PyIndexSegmentDescription {
}

impl PyIndexSegmentDescription {
/// Builds a segment description from a single `IndexMetadata` entry.
///
/// Fragment ids are collected from the segment's fragment bitmap; when the
/// segment carries no bitmap the resulting set is empty.
pub fn from_metadata(segment: &lance_table::format::IndexMetadata) -> Self {
    let fragment_ids = match segment.fragment_bitmap.as_ref() {
        Some(bitmap) => bitmap.iter().collect::<HashSet<_>>(),
        None => HashSet::new(),
    };

    Self {
        uuid: segment.uuid.to_string(),
        dataset_version_at_last_update: segment.dataset_version,
        fragment_ids,
        index_version: segment.index_version,
        created_at: segment.created_at,
        size_bytes: segment.total_size_bytes(),
    }
}

pub fn __repr__(&self) -> String {
format!(
"IndexSegmentDescription(uuid={}, dataset_version_at_last_update={}, fragment_ids={:?}, index_version={}, created_at={:?}, size_bytes={:?})",
Expand Down Expand Up @@ -633,22 +651,7 @@ impl PyIndexDescription {
let segments = index
.metadata()
.iter()
.map(|segment| {
let fragment_ids = segment
.fragment_bitmap
.as_ref()
.map(|bitmap| bitmap.iter().collect::<HashSet<_>>())
.unwrap_or_default();
let size_bytes = segment.total_size_bytes();
PyIndexSegmentDescription {
uuid: segment.uuid.to_string(),
dataset_version_at_last_update: segment.dataset_version,
fragment_ids,
index_version: segment.index_version,
created_at: segment.created_at,
size_bytes,
}
})
.map(PyIndexSegmentDescription::from_metadata)
.collect();

let details = index.details().unwrap_or_else(|_| "{}".to_string());
Expand Down
7 changes: 7 additions & 0 deletions rust/lance-index/src/traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,13 @@ pub trait IndexDescription: Send + Sync {
/// IndexMetadata for each segment of the index.
fn metadata(&self) -> &[IndexMetadata];

/// Returns the physical index segments that make up this logical index.
///
/// This is an alias for [`Self::metadata`] with a less ambiguous name.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

praise: +1 I like explicitly calling it segments.

fn segments(&self) -> &[IndexMetadata] {
self.metadata()
}

/// Returns the index type URL
///
/// This is extracted from the type url of the index details
Expand Down
15 changes: 14 additions & 1 deletion rust/lance/src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1160,7 +1160,9 @@ async fn index_statistics_scalar(
"index_type": index_type,
"name": index_name,
"num_indices": num_indices,
"indices": indices_stats,
"num_segments": num_indices,
"indices": indices_stats.clone(),
"segments": indices_stats,
"num_indexed_fragments": num_indexed_fragments,
"num_indexed_rows": num_indexed_rows,
"num_unindexed_fragments": num_unindexed_fragments,
Expand Down Expand Up @@ -2408,8 +2410,13 @@ mod tests {
// Lists the fragment ids recorded in the segment's bitmap; panics if the
// segment has no bitmap.
fn get_bitmap(meta: &IndexMetadata) -> Vec<u32> {
    let bitmap = meta.fragment_bitmap.as_ref().unwrap();
    bitmap.iter().collect()
}
// Checks that the segment-named statistics keys carry the same values as the
// index-named keys they alias.
fn assert_segment_aliases(stats: &serde_json::Value) {
    let (num_segments, num_indices) = (&stats["num_segments"], &stats["num_indices"]);
    assert_eq!(num_segments, num_indices);

    let (segments, indices) = (&stats["segments"], &stats["indices"]);
    assert_eq!(segments, indices);
}

let stats = get_stats(&dataset, "vec_idx").await;
assert_segment_aliases(&stats);
assert_eq!(stats["num_unindexed_rows"], 0);
assert_eq!(stats["num_indexed_rows"], 512);
assert_eq!(stats["num_indexed_fragments"], 1);
Expand All @@ -2422,6 +2429,7 @@ mod tests {
RecordBatchIterator::new(vec![record_batch].into_iter().map(Ok), schema.clone());
dataset.append(reader, None).await.unwrap();
let stats = get_stats(&dataset, "vec_idx").await;
assert_segment_aliases(&stats);
assert_eq!(stats["num_unindexed_rows"], 512);
assert_eq!(stats["num_indexed_rows"], 512);
assert_eq!(stats["num_indexed_fragments"], 1);
Expand All @@ -2436,6 +2444,7 @@ mod tests {
.await
.unwrap();
let stats = get_stats(&dataset, "vec_idx").await;
assert_segment_aliases(&stats);
assert_eq!(stats["num_unindexed_rows"], 512);
assert_eq!(stats["num_indexed_rows"], 512);
assert_eq!(stats["num_indexed_fragments"], 1);
Expand All @@ -2453,6 +2462,7 @@ mod tests {
.await
.unwrap();
let stats = get_stats(&dataset, "vec_idx").await;
assert_segment_aliases(&stats);
assert_eq!(stats["num_unindexed_rows"], 512);
assert_eq!(stats["num_indexed_rows"], 512);
assert_eq!(stats["num_indexed_fragments"], 1);
Expand All @@ -2463,6 +2473,7 @@ mod tests {
assert_eq!(get_bitmap(&meta[0]), vec![0]);

let stats = get_stats(&dataset, "other_vec_idx").await;
assert_segment_aliases(&stats);
assert_eq!(stats["num_unindexed_rows"], 0);
assert_eq!(stats["num_indexed_rows"], 1024);
assert_eq!(stats["num_indexed_fragments"], 2);
Expand All @@ -2479,6 +2490,7 @@ mod tests {
.unwrap();

let stats = get_stats(&dataset, "vec_idx").await;
assert_segment_aliases(&stats);
assert_eq!(stats["num_unindexed_rows"], 0);
assert_eq!(stats["num_indexed_rows"], 1024);
assert_eq!(stats["num_indexed_fragments"], 2);
Expand All @@ -2493,6 +2505,7 @@ mod tests {
.await
.unwrap();
let stats = get_stats(&dataset, "other_vec_idx").await;
assert_segment_aliases(&stats);
assert_eq!(stats["num_unindexed_rows"], 0);
assert_eq!(stats["num_indexed_rows"], 1024);
assert_eq!(stats["num_indexed_fragments"], 2);
Expand Down
Loading