diff --git a/python/python/ci_benchmarks/datagen/basic.py b/python/python/ci_benchmarks/datagen/basic.py index 9629ac09509..b24193907b7 100644 --- a/python/python/ci_benchmarks/datagen/basic.py +++ b/python/python/ci_benchmarks/datagen/basic.py @@ -72,7 +72,7 @@ def _create(dataset_uri: str): schema=SCHEMA, mode="create", ) - if ds.list_indices() == []: + if not ds.describe_indices(): ds.create_scalar_index("row_number", "BTREE") ds.create_scalar_index("row_number_bitmap", "BITMAP") diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index e0344b64ca3..ada146b0299 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -647,12 +647,11 @@ def list_indices(self) -> List[Index]: list index information and index_statistics() to get the statistics for individual indexes of interest. """ - # TODO: https://github.com/lancedb/lance/issues/5237 deprecate this method - # warnings.warn( - # "The 'list_indices' method is deprecated. It may be removed in a future" - # "version. Use describe_indices() instead.", - # DeprecationWarning, - # ) + warnings.warn( + "The 'list_indices' method is deprecated. It may be removed in a future " + "version. Use describe_indices() instead.", + DeprecationWarning, + ) return self._ds.load_indices() @@ -670,7 +669,7 @@ def index_statistics(self, index_name: str) -> Dict[str, Any]: @property def has_index(self): - return len(self.list_indices()) > 0 + return len(self.describe_indices()) > 0 def _apply_default_scan_options(self, builder: ScannerBuilder): if self._default_scan_options: @@ -3298,8 +3297,8 @@ def drop_index(self, name: str): Note: Indices are dropped by "index name". This is not the same as the field name. If you did not specify a name when you created the index then a name was - generated for you. You can use the `list_indices` method to get the names of - the indices. + generated for you. You can use the `describe_indices` method to get the names + of the indices. """ return self._ds.drop_index(name) @@ -3931,9 +3930,19 @@ def _default_vector_index_for_column(self, column: str) -> str: Raises KeyError if no such index exists. """ - for meta in self.list_indices(): - if column in meta["fields"] and meta["type"].startswith("IVF"): - return meta["name"] + # Resolve column path to field id for describe_indices matching. + lance_field = self._ds.lance_schema.field_case_insensitive(column) + if lance_field is None: + raise KeyError(f"No IVF index for column '{column}'") + field_id = lance_field.id() + + indices = self.describe_indices() + for idx in indices: + if field_id in idx.fields: + # Use index_stats to get the concrete IVF subtype. + index_type = self.stats.index_stats(idx.name).get("index_type", "") + if index_type.startswith("IVF"): + return idx.name raise KeyError(f"No IVF index for column '{column}'") def centroids( diff --git a/python/python/tests/test_column_names.py b/python/python/tests/test_column_names.py index 5ec38085896..f7b5962b523 100644 --- a/python/python/tests/test_column_names.py +++ b/python/python/tests/test_column_names.py @@ -84,10 +84,10 @@ def test_scalar_index_with_mixed_case(self, mixed_case_dataset): """Scalar index creation should work with mixed-case column names.""" mixed_case_dataset.create_scalar_index("userId", index_type="BTREE") - indices = mixed_case_dataset.list_indices() + indices = mixed_case_dataset.describe_indices() assert len(indices) == 1 - assert indices[0]["fields"] == ["userId"] - assert indices[0]["name"] == "userId_idx" + assert indices[0].field_names == ["userId"] + assert indices[0].name == "userId_idx" # Query using the indexed column result = mixed_case_dataset.to_table(filter="userId = 50") @@ -206,7 +206,7 @@ def test_scalar_index_on_each_case_variant(self, tmp_path, case_variant_table): # Create separate datasets for each test to avoid index conflicts ds1 = lance.write_dataset(case_variant_table, tmp_path / "ds1") ds1.create_scalar_index("camelCase", index_type="BTREE") - assert ds1.list_indices()[0]["fields"] == ["camelCase"] + assert ds1.describe_indices()[0].field_names == ["camelCase"] # Query camelCase=50 should return row 50 (where CamelCase=49, CAMELCASE=0) result = ds1.to_table(filter="camelCase = 50") @@ -221,7 +221,7 @@ def test_scalar_index_on_each_case_variant(self, tmp_path, case_variant_table): # Test CamelCase index ds2 = lance.write_dataset(case_variant_table, tmp_path / "ds2") ds2.create_scalar_index("CamelCase", index_type="BTREE") - assert ds2.list_indices()[0]["fields"] == ["CamelCase"] + assert ds2.describe_indices()[0].field_names == ["CamelCase"] # Query CamelCase=50 should return row 49 (where camelCase=49, CAMELCASE=99) result = ds2.to_table(filter="CamelCase = 50") @@ -236,7 +236,7 @@ def test_scalar_index_on_each_case_variant(self, tmp_path, case_variant_table): # Test CAMELCASE index ds3 = lance.write_dataset(case_variant_table, tmp_path / "ds3") ds3.create_scalar_index("CAMELCASE", index_type="BTREE") - assert ds3.list_indices()[0]["fields"] == ["CAMELCASE"] + assert ds3.describe_indices()[0].field_names == ["CAMELCASE"] # Query CAMELCASE=50 should return row 0 (where camelCase=0, CamelCase=99) result = ds3.to_table(filter="CAMELCASE = 50") @@ -347,11 +347,10 @@ def test_scalar_index_with_special_chars(self, special_char_dataset): # Column name is used directly without SQL parsing special_char_dataset.create_scalar_index("user-id", index_type="BTREE") - indices = special_char_dataset.list_indices() + indices = special_char_dataset.describe_indices() assert len(indices) == 1 - # Field with special chars is returned in quoted format for SQL compatibility - assert indices[0]["fields"] == ["`user-id`"] - assert indices[0]["name"] == "user-id_idx" + assert indices[0].field_names == ["user-id"] + assert indices[0].name == "user-id_idx" # Query using the indexed column (requires backticks in filter) result = special_char_dataset.to_table(filter="`user-id` = 50") @@ -460,10 +459,10 @@ def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset): "MetaData.userId", index_type="BTREE" ) - indices = nested_mixed_case_dataset.list_indices() + indices = nested_mixed_case_dataset.describe_indices() assert len(indices) == 1 - assert indices[0]["fields"] == ["MetaData.userId"] - assert indices[0]["name"] == "MetaData.userId_idx" + assert indices[0].name == "MetaData.userId_idx" + assert indices[0].field_names == ["userId"] # Query using the indexed column result = nested_mixed_case_dataset.to_table(filter="MetaData.userId = 50") @@ -482,10 +481,10 @@ def test_scalar_index_on_top_level_mixed_case(self, nested_mixed_case_dataset): """Scalar index on top-level mixed-case column works.""" nested_mixed_case_dataset.create_scalar_index("rowId", index_type="BTREE") - indices = nested_mixed_case_dataset.list_indices() + indices = nested_mixed_case_dataset.describe_indices() assert len(indices) == 1 - assert indices[0]["fields"] == ["rowId"] - assert indices[0]["name"] == "rowId_idx" + assert indices[0].name == "rowId_idx" + assert indices[0].field_names == ["rowId"] result = nested_mixed_case_dataset.to_table(filter="rowId = 50") assert result.num_rows == 1 @@ -509,10 +508,11 @@ def test_scalar_index_with_lowercased_nested_path(self, nested_mixed_case_datase "metadata.userid", index_type="BTREE" ) - indices = nested_mixed_case_dataset.list_indices() + indices = nested_mixed_case_dataset.describe_indices() assert len(indices) == 1 # Should store with correct case from schema - assert indices[0]["fields"] == ["MetaData.userId"] + assert indices[0].name == "MetaData.userId_idx" + assert indices[0].field_names == ["userId"] # Query should also work with correct case result = nested_mixed_case_dataset.to_table(filter="MetaData.userId = 50") @@ -574,11 +574,10 @@ def test_scalar_index_with_nested_special_chars(self, nested_special_char_datase "`meta-data`.`user-id`", index_type="BTREE" ) - indices = nested_special_char_dataset.list_indices() + indices = nested_special_char_dataset.describe_indices() assert len(indices) == 1 - # Fields with special chars are returned in quoted format for SQL compatibility - assert indices[0]["fields"] == ["`meta-data`.`user-id`"] - assert indices[0]["name"] == "meta-data.user-id_idx" + assert indices[0].field_names == ["user-id"] + assert indices[0].name == "meta-data.user-id_idx" # Query using the indexed column (backticks required in filter) result = nested_special_char_dataset.to_table( @@ -599,10 +598,9 @@ def test_scalar_index_on_top_level_special_chars(self, nested_special_char_datas """Scalar index on top-level special char column works.""" nested_special_char_dataset.create_scalar_index("`row-id`", index_type="BTREE") - indices = nested_special_char_dataset.list_indices() + indices = nested_special_char_dataset.describe_indices() assert len(indices) == 1 - # Field with special chars is returned in quoted format for SQL compatibility - assert indices[0]["fields"] == ["`row-id`"] + assert indices[0].field_names == ["row-id"] result = nested_special_char_dataset.to_table(filter="`row-id` = 50") assert result.num_rows == 1 diff --git a/python/python/tests/test_commit_index.py b/python/python/tests/test_commit_index.py index c5d4f3ca9d1..f7471d39175 100644 --- a/python/python/tests/test_commit_index.py +++ b/python/python/tests/test_commit_index.py @@ -52,7 +52,7 @@ def _get_field_id_by_name(lance_schema, field_name): def test_commit_index(dataset_with_index, test_table, tmp_path): from lance.dataset import Index - index_id = dataset_with_index.list_indices()[0]["uuid"] + index_id = dataset_with_index.describe_indices()[0].segments[0].uuid # Create a new dataset without index dataset_without_index = lance.write_dataset( @@ -90,13 +90,13 @@ def test_commit_index(dataset_with_index, test_table, tmp_path): read_version=dataset_without_index.version, ) - # Verify that both datasets have the index - assert len(dataset_with_index.list_indices()) == 1 - assert len(dataset_without_index.list_indices()) == 1 + # Verify the manually committed index matches the original index stats + stats_with = dataset_with_index.stats.index_stats("meta_idx") + stats_without = dataset_without_index.stats.index_stats("meta_idx") - assert ( - dataset_without_index.list_indices()[0] == dataset_with_index.list_indices()[0] - ) + assert stats_without["name"] == stats_with["name"] + assert stats_without["index_type"] == stats_with["index_type"] + assert stats_without["num_indexed_rows"] == stats_with["num_indexed_rows"] # Check if the index is used in scans for dataset in [dataset_with_index, dataset_without_index]: diff --git a/python/python/tests/test_create_empty_index.py b/python/python/tests/test_create_empty_index.py index 047cbb16e59..77d4ab034c9 100644 --- a/python/python/tests/test_create_empty_index.py +++ b/python/python/tests/test_create_empty_index.py @@ -16,10 +16,10 @@ def test_create_empty_scalar_index(): dataset.create_scalar_index("id", "BTREE", train=False) # Verify index exists and has correct stats - indices = dataset.list_indices() + indices = dataset.describe_indices() assert len(indices) == 1 - assert indices[0]["type"] == "BTree" - stats = dataset.stats.index_stats(indices[0]["name"]) + assert indices[0].index_type == "BTree" + stats = dataset.stats.index_stats(indices[0].name) assert stats["num_indexed_rows"] == 0 assert stats["num_unindexed_rows"] == dataset.count_rows() diff --git a/python/python/tests/test_indices.py b/python/python/tests/test_indices.py index 26ab6e99162..e29f02705e2 100644 --- a/python/python/tests/test_indices.py +++ b/python/python/tests/test_indices.py @@ -347,6 +347,6 @@ def test_load_shuffled_vectors( ) final_ds = lance.dataset(str(tmpdir / "dataset")) - assert final_ds.has_index - assert final_ds.list_indices()[0]["fields"] == ["vectors"] - assert len(final_ds.list_indices()[0]["fragment_ids"]) == NUM_FRAGMENTS + stats = final_ds.stats.index_stats("vectors_idx") + assert stats["name"] == "vectors_idx" + assert stats["num_indexed_fragments"] == NUM_FRAGMENTS diff --git a/python/python/tests/test_memory_leaks.py b/python/python/tests/test_memory_leaks.py index 9a0d8356882..29907089ba0 100644 --- a/python/python/tests/test_memory_leaks.py +++ b/python/python/tests/test_memory_leaks.py @@ -87,9 +87,8 @@ def test_index_statistics_no_leak(self, tmp_path) -> None: def access_index_stats() -> None: d = lance.dataset(dataset_path) - for idx in d.list_indices(): - if name := idx.get("name"): - d.stats.index_stats(name) + for idx in d.describe_indices(): + d.stats.index_stats(idx.name) assert_noleaks( access_index_stats, iterations=1000, threshold_mb=2.0, check_interval=25 diff --git a/python/python/tests/test_optimize.py b/python/python/tests/test_optimize.py index 1f23f3bac48..72239bb8cc9 100644 --- a/python/python/tests/test_optimize.py +++ b/python/python/tests/test_optimize.py @@ -296,8 +296,8 @@ def test_index_remapping_multiple_rewrite_tasks(tmp_path: Path): fragments = list(ds.get_fragments()) assert len(fragments) == 2 - index = ds.list_indices()[0] - index_frag_ids = list(index["fragment_ids"]) + index = ds.describe_indices()[0] + index_frag_ids = list(index.segments[0].fragment_ids) frag_ids = [frag.fragment_id for frag in fragments] assert len(index_frag_ids) == 1 diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index ce6a66944c0..271ca5ccb36 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -185,9 +185,9 @@ def btree_comparison_datasets(tmp_path): def test_load_indices(indexed_dataset: lance.LanceDataset): - indices = indexed_dataset.list_indices() - vec_idx = next(idx for idx in indices if idx["type"] == "IVF_PQ") - scalar_idx = next(idx for idx in indices if idx["type"] == "BTree") + indices = indexed_dataset.describe_indices() + vec_idx = next(idx for idx in indices if "VectorIndex" in idx.type_url) + scalar_idx = next(idx for idx in indices if idx.index_type == "BTree") assert vec_idx is not None assert scalar_idx is not None @@ -665,7 +665,7 @@ def test_filter_with_fts_index(dataset): def test_create_scalar_index_fts_alias(dataset): dataset.create_scalar_index("doc", index_type="FTS", with_position=False) - assert any(idx["type"] == "Inverted" for idx in dataset.list_indices()) + assert any(idx.index_type == "Inverted" for idx in dataset.describe_indices()) def test_multi_index_create(tmp_path): @@ -677,24 +677,23 @@ def test_multi_index_create(tmp_path): "ints", index_type="BITMAP", name="ints_bitmap_idx", replace=True ) - indices = dataset.list_indices() + indices = dataset.describe_indices() assert len(indices) == 2 - assert indices[0]["name"] == "ints_idx" - assert indices[0]["type"] == "BTree" - assert indices[1]["name"] == "ints_bitmap_idx" - assert indices[1]["type"] == "Bitmap" + idx_by_name = {idx.name: idx for idx in indices} + assert idx_by_name["ints_idx"].index_type == "BTree" + assert idx_by_name["ints_bitmap_idx"].index_type == "Bitmap" # Test that we can drop one of the indices dataset.drop_index("ints_idx") - indices = dataset.list_indices() + indices = dataset.describe_indices() assert len(indices) == 1 - assert indices[0]["name"] == "ints_bitmap_idx" - assert indices[0]["type"] == "Bitmap" + assert indices[0].name == "ints_bitmap_idx" + assert indices[0].index_type == "Bitmap" # Test that we can drop the last index dataset.drop_index("ints_bitmap_idx") - indices = dataset.list_indices() + indices = dataset.describe_indices() assert len(indices) == 0 @@ -1549,9 +1548,9 @@ def test_bitmap_index(tmp_path: Path): ) dataset = lance.write_dataset(tbl, tmp_path / "dataset") dataset.create_scalar_index("a", index_type="BITMAP") - indices = dataset.list_indices() + indices = dataset.describe_indices() assert len(indices) == 1 - assert indices[0]["type"] == "Bitmap" + assert indices[0].index_type == "Bitmap" def test_bitmap_empty_range(tmp_path: Path): @@ -1629,9 +1628,9 @@ def test_ngram_index(tmp_path: Path): def test_with(tbl: pa.Table): dataset = lance.write_dataset(tbl, tmp_path / "dataset", mode="overwrite") dataset.create_scalar_index("words", index_type="NGRAM") - indices = dataset.list_indices() + indices = dataset.describe_indices() assert len(indices) == 1 - assert indices[0]["type"] == "NGram" + assert indices[0].index_type == "NGram" scan_plan = dataset.scanner(filter="contains(words, 'apple')").explain_plan( True @@ -1683,7 +1682,7 @@ def test_zonemap_index(tmp_path: Path): tbl = pa.Table.from_arrays([pa.array([i for i in range(8193)])], names=["values"]) dataset = lance.write_dataset(tbl, tmp_path / "dataset") dataset.create_scalar_index("values", index_type="ZONEMAP") - indices = dataset.list_indices() + indices = dataset.describe_indices() assert len(indices) == 1 # Get detailed index statistics @@ -1779,9 +1778,9 @@ def test_zonemap_index_remapping(tmp_path: Path): # Train a zone map index dataset.create_scalar_index("values", index_type="ZONEMAP") - indices = dataset.list_indices() + indices = dataset.describe_indices() assert len(indices) == 1 - assert indices[0]["type"] == "ZoneMap" + assert indices[0].index_type == "ZoneMap" # Confirm the zone map index is used if you search the dataset scanner = dataset.scanner(filter="values > 2500", prefilter=True) @@ -1828,7 +1827,7 @@ def test_bloomfilter_index(tmp_path: Path): tbl = pa.Table.from_arrays([pa.array([i for i in range(10000)])], names=["values"]) dataset = lance.write_dataset(tbl, tmp_path / "dataset") dataset.create_scalar_index("values", index_type="BLOOMFILTER") - indices = dataset.list_indices() + indices = dataset.describe_indices() assert len(indices) == 1 # Get detailed index statistics @@ -2016,9 +2015,9 @@ def test_label_list_index(tmp_path: Path): tbl = pa.Table.from_arrays([tag_list], names=["tags"]) dataset = lance.write_dataset(tbl, tmp_path / "dataset") dataset.create_scalar_index("tags", index_type="LABEL_LIST") - indices = dataset.list_indices() + indices = dataset.describe_indices() assert len(indices) == 1 - assert indices[0]["type"] == "LabelList" + assert indices[0].index_type == "LabelList" def test_label_list_index_array_contains(tmp_path: Path): @@ -2215,8 +2214,8 @@ def test_searches(): test_searches() # Make sure fetching index stats on empty index is ok - for idx in ds.list_indices(): - ds.stats.index_stats(idx["name"]) + for idx in ds.describe_indices(): + ds.stats.index_stats(idx.name) # Make sure updating empty indices is ok ds.optimize.optimize_indices() @@ -2286,17 +2285,17 @@ def test_drop_index(tmp_path): ds.create_scalar_index("fts", index_type="INVERTED") ds.create_scalar_index("ngram", index_type="NGRAM") - assert len(ds.list_indices()) == 4 + assert len(ds.describe_indices()) == 4 # Attempt to drop index (name does not exist) with pytest.raises(RuntimeError, match="index not found"): ds.drop_index("nonexistent_name") - for idx in ds.list_indices(): - idx_name = idx["name"] + for idx in ds.describe_indices(): + idx_name = idx.name ds.drop_index(idx_name) - assert len(ds.list_indices()) == 0 + assert len(ds.describe_indices()) == 0 # Ensure we can still search columns assert ds.to_table(filter="btree = 1").num_rows == 1 @@ -2964,20 +2963,10 @@ def test_build_distributed_fts_index_basic(tmp_path): ) # Verify the index was created - indices = distributed_ds.list_indices() - assert len(indices) > 0, "No indices found after distributed index creation" - - # Find our distributed index - distributed_index = None - for idx in indices: - if "distributed" in idx["name"]: - distributed_index = idx - break - - assert distributed_index is not None, "Distributed index not found" - assert distributed_index["type"] == "Inverted", ( - f"Expected Inverted index, got {distributed_index['type']}" - ) + index_name = "text_distributed_idx" + stats = distributed_ds.stats.index_stats(index_name) + assert stats["name"] == index_name + assert stats["index_type"] == "Inverted" # Test that the index works for searching results = distributed_ds.scanner( @@ -3392,19 +3381,9 @@ def test_distribute_fts_index_build(tmp_path): ) # Verify the index was created and is functional - indices = ds_committed.list_indices() - assert len(indices) > 0, "No indices found after commit" - - # Find our index - our_index = None - for idx in indices: - if idx["name"] == index_name: - our_index = idx - break - assert our_index is not None, f"Index '{index_name}' not found in indices list" - assert our_index["type"] == "Inverted", ( - f"Expected Inverted index, got {our_index['type']}" - ) + stats = ds_committed.stats.index_stats(index_name) + assert stats["name"] == index_name + assert stats["index_type"] == "Inverted" # Test that the index works for searching # Get a sample text from the dataset to search for @@ -3472,10 +3451,10 @@ def test_backward_compatibility_no_fragment_ids(tmp_path): ) # Verify the index was created - indices = ds.list_indices() + indices = ds.describe_indices() assert len(indices) == 1 - assert indices[0]["name"] == "full_dataset_idx" - assert indices[0]["type"] == "Inverted" + assert indices[0].name == "full_dataset_idx" + assert indices[0].index_type == "Inverted" # Test that the index works sample_data = ds.take([0], columns=["text"]) @@ -3496,10 +3475,10 @@ def test_backward_compatibility_changed_index_protos(tmp_path): shutil.copytree(path, tmp_path, dirs_exist_ok=True) ds = lance.dataset(tmp_path) - indices = ds.list_indices() + indices = ds.describe_indices() assert len(indices) == 1 - assert indices[0]["name"] == "x_idx" - assert indices[0]["type"] == "BTree" + assert indices[0].name == "x_idx" + assert indices[0].index_type == "BTree" results = ds.scanner(filter="x = 100").to_table() assert results.num_rows == 1 @@ -3583,20 +3562,9 @@ def test_distribute_btree_index_build(tmp_path): ) # Verify the index was created and is functional - indices = ds_committed.list_indices() - assert len(indices) > 0, "No indices found after commit" - - # Find our index - our_index = None - for idx in indices: - if idx["name"] == index_name: - our_index = idx - break - - assert our_index is not None, f"Index '{index_name}' not found in indices list" - assert our_index["type"] == "BTree", ( - f"Expected BTree index, got {our_index['type']}" - ) + stats = ds_committed.stats.index_stats(index_name) + assert stats["name"] == index_name + assert stats["index_type"] == "BTree" # Test that the index works for searching # Test exact equality queries @@ -3984,10 +3952,10 @@ def test_nested_field_btree_index(tmp_path): dataset.create_scalar_index(column="meta.lang", index_type="BTREE") # Verify index was created - indices = dataset.list_indices() + indices = dataset.describe_indices() assert len(indices) == 1 - assert indices[0]["fields"] == ["meta.lang"] - assert indices[0]["type"] == "BTree" + assert indices[0].field_names == ["lang"] + assert indices[0].index_type == "BTree" # Test query using the index - filter for English language result = dataset.scanner(filter="meta.lang = 'en'").to_table() @@ -4085,10 +4053,10 @@ def test_nested_field_fts_index(tmp_path): ds.create_scalar_index("data.text", index_type="INVERTED", with_position=False) # Verify index was created - indices = ds.list_indices() + indices = ds.describe_indices() assert len(indices) == 1 - assert indices[0]["fields"] == ["data.text"] - assert indices[0]["type"] == "Inverted" + assert indices[0].field_names == ["text"] + assert indices[0].index_type == "Inverted" # Test full text search on nested field results = ds.to_table(full_text_query="lance") @@ -4159,10 +4127,10 @@ def test_nested_field_bitmap_index(tmp_path): ds.create_scalar_index("attributes.color", index_type="BITMAP") # Verify index was created - indices = ds.list_indices() + indices = ds.describe_indices() assert len(indices) == 1 - assert indices[0]["fields"] == ["attributes.color"] - assert indices[0]["type"] == "Bitmap" + assert indices[0].field_names == ["color"] + assert indices[0].index_type == "Bitmap" # Test equality query results = ds.to_table(filter="attributes.color = 'red'", prefilter=True) diff --git a/python/python/tests/test_schema_evolution.py b/python/python/tests/test_schema_evolution.py index 6560d8c7e7d..205aaa4fa66 100644 --- a/python/python/tests/test_schema_evolution.py +++ b/python/python/tests/test_schema_evolution.py @@ -37,12 +37,12 @@ def test_drop_columns(tmp_path: Path): "c": pa.int64(), } ) - assert len(dataset.list_indices()) == 1 + assert len(dataset.describe_indices()) == 1 # Drop vector column, index is dropped dataset.drop_columns(["a"]) assert dataset.schema == pa.schema({"c": pa.int64()}) - assert len(dataset.list_indices()) == 0 + assert len(dataset.describe_indices()) == 0 # Can't drop all columns with pytest.raises(ValueError): diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 087338a9a21..dcdf88ee84d 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -497,9 +497,8 @@ def test_create_index_accelerator_fallback(tmp_path, caplog): accelerator="cuda", ) - indices = dataset.list_indices() - assert len(indices) == 1 - assert indices[0]["type"] == "IVF_HNSW_SQ" + stats = dataset.stats.index_stats("vector_idx") + assert stats["index_type"] == "IVF_HNSW_SQ" assert any( "does not support GPU acceleration; falling back to CPU" in record.message for record in caplog.records @@ -561,7 +560,7 @@ def test_has_index(dataset, tmp_path): ) assert ann_ds.has_index - assert ann_ds.list_indices()[0]["fields"] == ["vector"] + assert ann_ds.describe_indices()[0].field_names == ["vector"] def test_index_type(dataset, tmp_path): @@ -574,7 +573,8 @@ def test_index_type(dataset, tmp_path): num_sub_vectors=16, replace=True, ) - assert ann_ds.list_indices()[0]["type"] == "IVF_PQ" + stats = ann_ds.stats.index_stats("vector_idx") + assert stats["index_type"] == "IVF_PQ" ann_ds = ann_ds.create_index( "vector", @@ -583,7 +583,8 @@ def test_index_type(dataset, tmp_path): num_sub_vectors=16, replace=True, ) - assert ann_ds.list_indices()[0]["type"] == "IVF_HNSW_SQ" + stats = ann_ds.stats.index_stats("vector_idx") + assert stats["index_type"] == "IVF_HNSW_SQ" ann_ds = ann_ds.create_index( "vector", @@ -592,7 +593,8 @@ def test_index_type(dataset, tmp_path): num_sub_vectors=16, replace=True, ) - assert ann_ds.list_indices()[0]["type"] == "IVF_HNSW_PQ" + stats = ann_ds.stats.index_stats("vector_idx") + assert stats["index_type"] == "IVF_HNSW_PQ" def test_create_dot_index(dataset, tmp_path): @@ -791,7 +793,7 @@ def test_create_ivf_sq_index(dataset, tmp_path): index_type="IVF_SQ", num_partitions=4, ) - assert ann_ds.list_indices()[0]["fields"] == ["vector"] + assert ann_ds.describe_indices()[0].field_names == ["vector"] def test_create_ivf_rq_index(): @@ -802,7 +804,7 @@ def test_create_ivf_rq_index(): num_partitions=4, num_bits=1, ) - assert ds.list_indices()[0]["fields"] == ["vector"] + assert ds.describe_indices()[0].field_names == ["vector"] with pytest.raises( NotImplementedError, @@ -850,7 +852,7 @@ def test_create_ivf_hnsw_pq_index(dataset, tmp_path): num_partitions=4, num_sub_vectors=16, ) - assert ann_ds.list_indices()[0]["fields"] == ["vector"] + assert ann_ds.describe_indices()[0].field_names == ["vector"] def test_create_ivf_hnsw_sq_index(dataset, tmp_path): @@ -862,7 +864,7 @@ def test_create_ivf_hnsw_sq_index(dataset, tmp_path): num_partitions=4, num_sub_vectors=16, ) - assert ann_ds.list_indices()[0]["fields"] == ["vector"] + assert ann_ds.describe_indices()[0].field_names == ["vector"] def test_create_ivf_hnsw_flat_index(dataset, tmp_path): @@ -874,7 +876,7 @@ def test_create_ivf_hnsw_flat_index(dataset, tmp_path): num_partitions=4, num_sub_vectors=16, ) - assert ann_ds.list_indices()[0]["fields"] == ["vector"] + assert ann_ds.describe_indices()[0].field_names == ["vector"] def test_multivec_ann(indexed_multivec_dataset: lance.LanceDataset): @@ -940,10 +942,10 @@ def test_pre_populated_ivf_centroids(dataset, tmp_path: Path): )["id"].to_numpy() assert len(actual) == 10 - index_meta = dataset_with_index.list_indices()[0] - index_uuid = index_meta["uuid"] + index_meta = dataset_with_index.describe_indices()[0] + index_uuid = index_meta.segments[0].uuid assert len(index_uuid) == 36 - assert index_meta["fragment_ids"] == {0} + assert index_meta.segments[0].fragment_ids == {0} expected_filepath = str(tmp_path / "_indices" / index_uuid / "index.idx") if platform.system() == "Windows": @@ -1426,7 +1428,7 @@ def test_index_cast_centroids(tmp_path): ) # Get the centroids - index_name = dataset.list_indices()[0]["name"] + index_name = dataset.describe_indices()[0].name index_stats = dataset.stats.index_stats(index_name) centroids = index_stats["indices"][0]["centroids"] values = pa.array([x for arr in centroids for x in arr], pa.float32()) @@ -1508,13 +1510,13 @@ def test_fragment_scan_disallowed_on_ann_with_index_scan_prefilter(tmp_path): def test_load_indices(dataset): - indices = dataset.list_indices() + indices = dataset.describe_indices() assert len(indices) == 0 dataset.create_index( "vector", index_type="IVF_PQ", num_partitions=4, num_sub_vectors=16 ) - indices = dataset.list_indices() + indices = dataset.describe_indices() assert len(indices) == 1 @@ -1538,23 +1540,23 @@ def test_describe_vector_index(indexed_dataset: LanceDataset): def test_optimize_indices(indexed_dataset): data = create_table() indexed_dataset = lance.write_dataset(data, indexed_dataset.uri, mode="append") - indices = indexed_dataset.list_indices() - assert len(indices) == 1 + stats = indexed_dataset.stats.index_stats("vector_idx") + assert stats["num_indices"] == 1 indexed_dataset.optimize.optimize_indices(num_indices_to_merge=0) - indices = indexed_dataset.list_indices() - assert len(indices) == 2 + stats = indexed_dataset.stats.index_stats("vector_idx") + assert stats["num_indices"] == 2 @pytest.mark.skip(reason="retrain is deprecated") def test_retrain_indices(indexed_dataset): data = create_table() indexed_dataset = lance.write_dataset(data, indexed_dataset.uri, mode="append") - indices = indexed_dataset.list_indices() - assert len(indices) == 1 + stats = indexed_dataset.stats.index_stats("vector_idx") + assert stats["num_indices"] == 1 indexed_dataset.optimize.optimize_indices(num_indices_to_merge=0) - indices = indexed_dataset.list_indices() - assert len(indices) == 2 + stats = indexed_dataset.stats.index_stats("vector_idx") + assert stats["num_indices"] == 2 stats = indexed_dataset.stats.index_stats("vector_idx") centroids = stats["indices"][0]["centroids"] @@ -1565,8 +1567,8 @@ def test_retrain_indices(indexed_dataset): new_centroids = indexed_dataset.stats.index_stats("vector_idx")["indices"][0][ "centroids" ] - indices = indexed_dataset.list_indices() - assert len(indices) == 1 + stats = indexed_dataset.stats.index_stats("vector_idx") + assert stats["num_indices"] == 1 assert centroids != new_centroids @@ -1584,10 +1586,10 @@ def test_no_include_deleted_rows(indexed_dataset): def test_drop_indices(indexed_dataset): - idx_name = indexed_dataset.list_indices()[0]["name"] + idx_name = indexed_dataset.describe_indices()[0].name indexed_dataset.drop_index(idx_name) - indices = indexed_dataset.list_indices() + indices = indexed_dataset.describe_indices() assert len(indices) == 0 test_vec = ( @@ -1608,7 +1610,7 @@ def test_drop_indices(indexed_dataset): def test_read_partition(indexed_dataset): - idx_name = indexed_dataset.list_indices()[0]["name"] + idx_name = indexed_dataset.describe_indices()[0].name reader = VectorIndexReader(indexed_dataset, idx_name) num_rows = indexed_dataset.count_rows() @@ -1790,9 +1792,9 @@ def test_nested_field_vector_index(tmp_path): ) # Verify index was created - indices = dataset.list_indices() + indices = dataset.describe_indices() assert len(indices) == 1 - assert indices[0]["fields"] == ["data.embedding"] + assert indices[0].field_names == ["embedding"] # Test querying with the index query_vec = vectors[0]