diff --git a/java/src/main/java/org/lance/Dataset.java b/java/src/main/java/org/lance/Dataset.java index b0517ca1bb8..d9c58e9b54a 100644 --- a/java/src/main/java/org/lance/Dataset.java +++ b/java/src/main/java/org/lance/Dataset.java @@ -1257,7 +1257,11 @@ public List listIndexes() { /** * Get all indexes with full metadata. * - * @return list of Index objects with complete metadata including index type and fragment coverage + * <p>
Each returned {@link Index} is a physical index segment from the manifest. Use {@link + * #describeIndices()} for the logical-index view. + * + * @return list of Index objects with complete segment metadata, including index type and fragment + * coverage */ public List getIndexes() { try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { diff --git a/java/src/main/java/org/lance/index/IndexDescription.java b/java/src/main/java/org/lance/index/IndexDescription.java index d17782eb531..1b5e5a3a8f8 100755 --- a/java/src/main/java/org/lance/index/IndexDescription.java +++ b/java/src/main/java/org/lance/index/IndexDescription.java @@ -83,6 +83,15 @@ public List getMetadata() { return metadata; } + /** + * Physical index segments for this logical index. + * + * <p>
This is an alias for {@link #getMetadata()} with a less ambiguous name. + */ + public List getSegments() { + return metadata; + } + /** * JSON representation of index-specific details. * diff --git a/java/src/test/java/org/lance/DatasetTest.java b/java/src/test/java/org/lance/DatasetTest.java index 00e5d930323..a707b4f4a3c 100644 --- a/java/src/test/java/org/lance/DatasetTest.java +++ b/java/src/test/java/org/lance/DatasetTest.java @@ -1924,14 +1924,23 @@ public void testDescribeIndicesByName(@TempDir Path tempDir) throws Exception { assertTrue(desc.getRowsIndexed() > 0, "rowsIndexed should be positive"); assertNotNull(desc.getMetadata(), "Metadata list should not be null"); assertFalse(desc.getMetadata().isEmpty(), "Metadata list should not be empty"); + assertEquals( + desc.getMetadata(), desc.getSegments(), "segments alias should match metadata"); assertNotNull(desc.getDetailsJson(), "Details JSON should not be null"); + assertEquals(1, desc.getSegments().size(), "Expected exactly one physical segment"); + assertEquals("index1", desc.getSegments().get(0).name()); + descriptions = dataset.describeIndices(); assertEquals(2, descriptions.size(), "Expected exactly one matching index"); for (IndexDescription indexDesc : descriptions) { assertTrue(indexDesc.getRowsIndexed() > 0, "rowsIndexed should be positive"); assertNotNull(indexDesc.getMetadata(), "Metadata list should not be null"); assertFalse(indexDesc.getMetadata().isEmpty(), "Metadata list should not be empty"); + assertEquals( + indexDesc.getMetadata(), + indexDesc.getSegments(), + "segments alias should match metadata"); assertNotNull(indexDesc.getDetailsJson(), "Details JSON should not be null"); } } diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index fd2db9de351..7496746285a 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -641,12 +641,13 @@ def checkout_latest(self): def list_indices(self) -> List[Index]: """ - Returns index information 
for all indices in the dataset. + Returns physical index segment information for all indices in the dataset. This method is deprecated as it requires loading the statistics for each index - which can be a very expensive operation. Instead use describe_indices() to - list index information and index_statistics() to get the statistics for - individual indexes of interest. + which can be a very expensive operation. It also exposes physical index + segments directly. Instead use describe_indices() for logical index + descriptions and index_statistics() to get the statistics for individual + indexes of interest. """ warnings.warn( "The 'list_indices' method is deprecated. It may be removed in a future " @@ -657,7 +658,7 @@ def list_indices(self) -> List[Index]: return self._ds.load_indices() def describe_indices(self) -> List[IndexDescription]: - """Returns index information for all indices in the dataset.""" + """Returns logical index information aggregated across all segments.""" return self._ds.describe_indices() def index_statistics(self, index_name: str) -> Dict[str, Any]: diff --git a/python/python/lance/indices/__init__.py b/python/python/lance/indices/__init__.py index 8dfb1148345..b35e5d5b174 100644 --- a/python/python/lance/indices/__init__.py +++ b/python/python/lance/indices/__init__.py @@ -9,6 +9,7 @@ from .pq import PqModel IndexSegment = _lance.indices.IndexSegment +IndexSegmentDescription = _lance.indices.IndexSegmentDescription IndexSegmentPlan = _lance.indices.IndexSegmentPlan __all__ = [ @@ -18,6 +19,7 @@ "IvfModel", "IndexFileVersion", "IndexSegment", + "IndexSegmentDescription", "IndexSegmentPlan", ] diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index e2f70a853a1..f0be29f39ca 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -62,6 +62,7 @@ from .fragment import ( ) from .indices import IndexDescription as IndexDescription from .indices import 
IndexSegment as IndexSegment +from .indices import IndexSegmentDescription as IndexSegmentDescription from .indices import IndexSegmentPlan as IndexSegmentPlan from .lance import PySearchFilter from .optimize import ( diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 2451f4684b4..0c1d5ab9ed4 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -1643,6 +1643,22 @@ def test_optimize_indices(indexed_dataset): assert stats["num_indices"] == 2 +def test_logical_and_physical_index_views(indexed_dataset): + data = create_table() + indexed_dataset = lance.write_dataset(data, indexed_dataset.uri, mode="append") + indexed_dataset.optimize.optimize_indices(num_indices_to_merge=0) + + logical_indices = indexed_dataset.describe_indices() + assert len(logical_indices) == 1 + assert logical_indices[0].name == "vector_idx" + assert len(logical_indices[0].segments) == 2 + assert all(segment.fragment_ids for segment in logical_indices[0].segments) + + stats = indexed_dataset.stats.index_stats("vector_idx") + assert stats["num_segments"] == stats["num_indices"] == 2 + assert stats["segments"] == stats["indices"] + + @pytest.mark.skip(reason="retrain is deprecated") def test_retrain_indices(indexed_dataset): data = create_table() diff --git a/python/src/indices.rs b/python/src/indices.rs index 9651c6cc00e..c811c49f831 100644 --- a/python/src/indices.rs +++ b/python/src/indices.rs @@ -580,6 +580,24 @@ pub struct PyIndexSegmentDescription { } impl PyIndexSegmentDescription { + pub fn from_metadata(segment: &lance_table::format::IndexMetadata) -> Self { + let fragment_ids = segment + .fragment_bitmap + .as_ref() + .map(|bitmap| bitmap.iter().collect::>()) + .unwrap_or_default(); + let size_bytes = segment.total_size_bytes(); + + Self { + uuid: segment.uuid.to_string(), + dataset_version_at_last_update: segment.dataset_version, + fragment_ids, + index_version: segment.index_version, + 
created_at: segment.created_at, + size_bytes, + } + } + pub fn __repr__(&self) -> String { format!( "IndexSegmentDescription(uuid={}, dataset_version_at_last_update={}, fragment_ids={:?}, index_version={}, created_at={:?}, size_bytes={:?})", @@ -633,22 +651,7 @@ impl PyIndexDescription { let segments = index .metadata() .iter() - .map(|segment| { - let fragment_ids = segment - .fragment_bitmap - .as_ref() - .map(|bitmap| bitmap.iter().collect::>()) - .unwrap_or_default(); - let size_bytes = segment.total_size_bytes(); - PyIndexSegmentDescription { - uuid: segment.uuid.to_string(), - dataset_version_at_last_update: segment.dataset_version, - fragment_ids, - index_version: segment.index_version, - created_at: segment.created_at, - size_bytes, - } - }) + .map(PyIndexSegmentDescription::from_metadata) .collect(); let details = index.details().unwrap_or_else(|_| "{}".to_string()); diff --git a/rust/lance-index/src/traits.rs b/rust/lance-index/src/traits.rs index 130e59cad81..0b99954a727 100644 --- a/rust/lance-index/src/traits.rs +++ b/rust/lance-index/src/traits.rs @@ -72,6 +72,13 @@ pub trait IndexDescription: Send + Sync { /// IndexMetadata for each segment of the index. fn metadata(&self) -> &[IndexMetadata]; + /// Returns the physical index segments that make up this logical index. + /// + /// This is an alias for [`Self::metadata`] with a less ambiguous name. 
+ fn segments(&self) -> &[IndexMetadata] { + self.metadata() + } + /// Returns the index type URL /// /// This is extracted from the type url of the index details diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 843fc8c7740..c18cce66c6e 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -1160,7 +1160,9 @@ async fn index_statistics_scalar( "index_type": index_type, "name": index_name, "num_indices": num_indices, - "indices": indices_stats, + "num_segments": num_indices, + "indices": indices_stats.clone(), + "segments": indices_stats, "num_indexed_fragments": num_indexed_fragments, "num_indexed_rows": num_indexed_rows, "num_unindexed_fragments": num_unindexed_fragments, @@ -2408,8 +2410,13 @@ mod tests { fn get_bitmap(meta: &IndexMetadata) -> Vec { meta.fragment_bitmap.as_ref().unwrap().iter().collect() } + fn assert_segment_aliases(stats: &serde_json::Value) { + assert_eq!(stats["num_segments"], stats["num_indices"]); + assert_eq!(stats["segments"], stats["indices"]); + } let stats = get_stats(&dataset, "vec_idx").await; + assert_segment_aliases(&stats); assert_eq!(stats["num_unindexed_rows"], 0); assert_eq!(stats["num_indexed_rows"], 512); assert_eq!(stats["num_indexed_fragments"], 1); @@ -2422,6 +2429,7 @@ mod tests { RecordBatchIterator::new(vec![record_batch].into_iter().map(Ok), schema.clone()); dataset.append(reader, None).await.unwrap(); let stats = get_stats(&dataset, "vec_idx").await; + assert_segment_aliases(&stats); assert_eq!(stats["num_unindexed_rows"], 512); assert_eq!(stats["num_indexed_rows"], 512); assert_eq!(stats["num_indexed_fragments"], 1); @@ -2436,6 +2444,7 @@ mod tests { .await .unwrap(); let stats = get_stats(&dataset, "vec_idx").await; + assert_segment_aliases(&stats); assert_eq!(stats["num_unindexed_rows"], 512); assert_eq!(stats["num_indexed_rows"], 512); assert_eq!(stats["num_indexed_fragments"], 1); @@ -2453,6 +2462,7 @@ mod tests { .await .unwrap(); let stats = get_stats(&dataset, 
"vec_idx").await; + assert_segment_aliases(&stats); assert_eq!(stats["num_unindexed_rows"], 512); assert_eq!(stats["num_indexed_rows"], 512); assert_eq!(stats["num_indexed_fragments"], 1); @@ -2463,6 +2473,7 @@ mod tests { assert_eq!(get_bitmap(&meta[0]), vec![0]); let stats = get_stats(&dataset, "other_vec_idx").await; + assert_segment_aliases(&stats); assert_eq!(stats["num_unindexed_rows"], 0); assert_eq!(stats["num_indexed_rows"], 1024); assert_eq!(stats["num_indexed_fragments"], 2); @@ -2479,6 +2490,7 @@ mod tests { .unwrap(); let stats = get_stats(&dataset, "vec_idx").await; + assert_segment_aliases(&stats); assert_eq!(stats["num_unindexed_rows"], 0); assert_eq!(stats["num_indexed_rows"], 1024); assert_eq!(stats["num_indexed_fragments"], 2); @@ -2493,6 +2505,7 @@ mod tests { .await .unwrap(); let stats = get_stats(&dataset, "other_vec_idx").await; + assert_segment_aliases(&stats); assert_eq!(stats["num_unindexed_rows"], 0); assert_eq!(stats["num_indexed_rows"], 1024); assert_eq!(stats["num_indexed_fragments"], 2);