Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/python/ci_benchmarks/datagen/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def _create(dataset_uri: str):
schema=SCHEMA,
mode="create",
)
if ds.list_indices() == []:
if not ds.describe_indices():
ds.create_scalar_index("row_number", "BTREE")
ds.create_scalar_index("row_number_bitmap", "BITMAP")

Expand Down
33 changes: 21 additions & 12 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,12 +647,11 @@ def list_indices(self) -> List[Index]:
list index information and index_statistics() to get the statistics for
individual indexes of interest.
"""
# TODO: https://github.com/lancedb/lance/issues/5237 deprecate this method
# warnings.warn(
# "The 'list_indices' method is deprecated. It may be removed in a future"
# "version. Use describe_indices() instead.",
# DeprecationWarning,
# )
warnings.warn(
"The 'list_indices' method is deprecated. It may be removed in a future "
"version. Use describe_indices() instead.",
DeprecationWarning,
)

return self._ds.load_indices()

Expand All @@ -670,7 +669,7 @@ def index_statistics(self, index_name: str) -> Dict[str, Any]:

@property
def has_index(self):
return len(self.list_indices()) > 0
return len(self.describe_indices()) > 0

def _apply_default_scan_options(self, builder: ScannerBuilder):
if self._default_scan_options:
Expand Down Expand Up @@ -3298,8 +3297,8 @@ def drop_index(self, name: str):

Note: Indices are dropped by "index name". This is not the same as the field
name. If you did not specify a name when you created the index then a name was
generated for you. You can use the `list_indices` method to get the names of
the indices.
generated for you. You can use the `describe_indices` method to get the names
of the indices.
"""
return self._ds.drop_index(name)

Expand Down Expand Up @@ -3931,9 +3930,19 @@ def _default_vector_index_for_column(self, column: str) -> str:

Raises KeyError if no such index exists.
"""
for meta in self.list_indices():
if column in meta["fields"] and meta["type"].startswith("IVF"):
return meta["name"]
# Resolve column path to field id for describe_indices matching.
lance_field = self._ds.lance_schema.field_case_insensitive(column)
if lance_field is None:
raise KeyError(f"No IVF index for column '{column}'")
field_id = lance_field.id()

indices = self.describe_indices()
for idx in indices:
if field_id in idx.fields:
# Use index_stats to get the concrete IVF subtype.
index_type = self.stats.index_stats(idx.name).get("index_type", "")
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`index_stats` may be an expensive call, though I'm not certain. Could we refine this further so that it only considers columns whose data type is a list or fixed-size list? Either way, this is an improvement over the previous behavior, so I think it's OK.

if index_type.startswith("IVF"):
return idx.name
raise KeyError(f"No IVF index for column '{column}'")

def centroids(
Expand Down
48 changes: 23 additions & 25 deletions python/python/tests/test_column_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,10 @@ def test_scalar_index_with_mixed_case(self, mixed_case_dataset):
"""Scalar index creation should work with mixed-case column names."""
mixed_case_dataset.create_scalar_index("userId", index_type="BTREE")

indices = mixed_case_dataset.list_indices()
indices = mixed_case_dataset.describe_indices()
assert len(indices) == 1
assert indices[0]["fields"] == ["userId"]
assert indices[0]["name"] == "userId_idx"
assert indices[0].field_names == ["userId"]
assert indices[0].name == "userId_idx"

# Query using the indexed column
result = mixed_case_dataset.to_table(filter="userId = 50")
Expand Down Expand Up @@ -206,7 +206,7 @@ def test_scalar_index_on_each_case_variant(self, tmp_path, case_variant_table):
# Create separate datasets for each test to avoid index conflicts
ds1 = lance.write_dataset(case_variant_table, tmp_path / "ds1")
ds1.create_scalar_index("camelCase", index_type="BTREE")
assert ds1.list_indices()[0]["fields"] == ["camelCase"]
assert ds1.describe_indices()[0].field_names == ["camelCase"]

# Query camelCase=50 should return row 50 (where CamelCase=49, CAMELCASE=0)
result = ds1.to_table(filter="camelCase = 50")
Expand All @@ -221,7 +221,7 @@ def test_scalar_index_on_each_case_variant(self, tmp_path, case_variant_table):
# Test CamelCase index
ds2 = lance.write_dataset(case_variant_table, tmp_path / "ds2")
ds2.create_scalar_index("CamelCase", index_type="BTREE")
assert ds2.list_indices()[0]["fields"] == ["CamelCase"]
assert ds2.describe_indices()[0].field_names == ["CamelCase"]

# Query CamelCase=50 should return row 49 (where camelCase=49, CAMELCASE=99)
result = ds2.to_table(filter="CamelCase = 50")
Expand All @@ -236,7 +236,7 @@ def test_scalar_index_on_each_case_variant(self, tmp_path, case_variant_table):
# Test CAMELCASE index
ds3 = lance.write_dataset(case_variant_table, tmp_path / "ds3")
ds3.create_scalar_index("CAMELCASE", index_type="BTREE")
assert ds3.list_indices()[0]["fields"] == ["CAMELCASE"]
assert ds3.describe_indices()[0].field_names == ["CAMELCASE"]

# Query CAMELCASE=50 should return row 0 (where camelCase=0, CamelCase=99)
result = ds3.to_table(filter="CAMELCASE = 50")
Expand Down Expand Up @@ -347,11 +347,10 @@ def test_scalar_index_with_special_chars(self, special_char_dataset):
# Column name is used directly without SQL parsing
special_char_dataset.create_scalar_index("user-id", index_type="BTREE")

indices = special_char_dataset.list_indices()
indices = special_char_dataset.describe_indices()
assert len(indices) == 1
# Field with special chars is returned in quoted format for SQL compatibility
assert indices[0]["fields"] == ["`user-id`"]
assert indices[0]["name"] == "user-id_idx"
assert indices[0].field_names == ["user-id"]
assert indices[0].name == "user-id_idx"

# Query using the indexed column (requires backticks in filter)
result = special_char_dataset.to_table(filter="`user-id` = 50")
Expand Down Expand Up @@ -460,10 +459,10 @@ def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset):
"MetaData.userId", index_type="BTREE"
)

indices = nested_mixed_case_dataset.list_indices()
indices = nested_mixed_case_dataset.describe_indices()
assert len(indices) == 1
assert indices[0]["fields"] == ["MetaData.userId"]
assert indices[0]["name"] == "MetaData.userId_idx"
assert indices[0].name == "MetaData.userId_idx"
assert indices[0].field_names == ["userId"]

# Query using the indexed column
result = nested_mixed_case_dataset.to_table(filter="MetaData.userId = 50")
Expand All @@ -482,10 +481,10 @@ def test_scalar_index_on_top_level_mixed_case(self, nested_mixed_case_dataset):
"""Scalar index on top-level mixed-case column works."""
nested_mixed_case_dataset.create_scalar_index("rowId", index_type="BTREE")

indices = nested_mixed_case_dataset.list_indices()
indices = nested_mixed_case_dataset.describe_indices()
assert len(indices) == 1
assert indices[0]["fields"] == ["rowId"]
assert indices[0]["name"] == "rowId_idx"
assert indices[0].name == "rowId_idx"
assert indices[0].field_names == ["rowId"]

result = nested_mixed_case_dataset.to_table(filter="rowId = 50")
assert result.num_rows == 1
Expand All @@ -509,10 +508,11 @@ def test_scalar_index_with_lowercased_nested_path(self, nested_mixed_case_datase
"metadata.userid", index_type="BTREE"
)

indices = nested_mixed_case_dataset.list_indices()
indices = nested_mixed_case_dataset.describe_indices()
assert len(indices) == 1
# Should store with correct case from schema
assert indices[0]["fields"] == ["MetaData.userId"]
assert indices[0].name == "MetaData.userId_idx"
assert indices[0].field_names == ["userId"]

# Query should also work with correct case
result = nested_mixed_case_dataset.to_table(filter="MetaData.userId = 50")
Expand Down Expand Up @@ -574,11 +574,10 @@ def test_scalar_index_with_nested_special_chars(self, nested_special_char_datase
"`meta-data`.`user-id`", index_type="BTREE"
)

indices = nested_special_char_dataset.list_indices()
indices = nested_special_char_dataset.describe_indices()
assert len(indices) == 1
# Fields with special chars are returned in quoted format for SQL compatibility
assert indices[0]["fields"] == ["`meta-data`.`user-id`"]
assert indices[0]["name"] == "meta-data.user-id_idx"
assert indices[0].field_names == ["user-id"]
assert indices[0].name == "meta-data.user-id_idx"

# Query using the indexed column (backticks required in filter)
result = nested_special_char_dataset.to_table(
Expand All @@ -599,10 +598,9 @@ def test_scalar_index_on_top_level_special_chars(self, nested_special_char_datas
"""Scalar index on top-level special char column works."""
nested_special_char_dataset.create_scalar_index("`row-id`", index_type="BTREE")

indices = nested_special_char_dataset.list_indices()
indices = nested_special_char_dataset.describe_indices()
assert len(indices) == 1
# Field with special chars is returned in quoted format for SQL compatibility
assert indices[0]["fields"] == ["`row-id`"]
assert indices[0].field_names == ["row-id"]

result = nested_special_char_dataset.to_table(filter="`row-id` = 50")
assert result.num_rows == 1
Expand Down
14 changes: 7 additions & 7 deletions python/python/tests/test_commit_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def _get_field_id_by_name(lance_schema, field_name):
def test_commit_index(dataset_with_index, test_table, tmp_path):
from lance.dataset import Index

index_id = dataset_with_index.list_indices()[0]["uuid"]
index_id = dataset_with_index.describe_indices()[0].segments[0].uuid

# Create a new dataset without index
dataset_without_index = lance.write_dataset(
Expand Down Expand Up @@ -90,13 +90,13 @@ def test_commit_index(dataset_with_index, test_table, tmp_path):
read_version=dataset_without_index.version,
)

# Verify that both datasets have the index
assert len(dataset_with_index.list_indices()) == 1
assert len(dataset_without_index.list_indices()) == 1
# Verify the manually committed index matches the original index stats
stats_with = dataset_with_index.stats.index_stats("meta_idx")
stats_without = dataset_without_index.stats.index_stats("meta_idx")

assert (
dataset_without_index.list_indices()[0] == dataset_with_index.list_indices()[0]
)
assert stats_without["name"] == stats_with["name"]
assert stats_without["index_type"] == stats_with["index_type"]
assert stats_without["num_indexed_rows"] == stats_with["num_indexed_rows"]

# Check if the index is used in scans
for dataset in [dataset_with_index, dataset_without_index]:
Expand Down
6 changes: 3 additions & 3 deletions python/python/tests/test_create_empty_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ def test_create_empty_scalar_index():
dataset.create_scalar_index("id", "BTREE", train=False)

# Verify index exists and has correct stats
indices = dataset.list_indices()
indices = dataset.describe_indices()
assert len(indices) == 1
assert indices[0]["type"] == "BTree"
stats = dataset.stats.index_stats(indices[0]["name"])
assert indices[0].index_type == "BTree"
stats = dataset.stats.index_stats(indices[0].name)
assert stats["num_indexed_rows"] == 0
assert stats["num_unindexed_rows"] == dataset.count_rows()

Expand Down
6 changes: 3 additions & 3 deletions python/python/tests/test_indices.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,6 @@ def test_load_shuffled_vectors(
)

final_ds = lance.dataset(str(tmpdir / "dataset"))
assert final_ds.has_index
assert final_ds.list_indices()[0]["fields"] == ["vectors"]
assert len(final_ds.list_indices()[0]["fragment_ids"]) == NUM_FRAGMENTS
stats = final_ds.stats.index_stats("vectors_idx")
assert stats["name"] == "vectors_idx"
assert stats["num_indexed_fragments"] == NUM_FRAGMENTS
5 changes: 2 additions & 3 deletions python/python/tests/test_memory_leaks.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,8 @@ def test_index_statistics_no_leak(self, tmp_path) -> None:

def access_index_stats() -> None:
d = lance.dataset(dataset_path)
for idx in d.list_indices():
if name := idx.get("name"):
d.stats.index_stats(name)
for idx in d.describe_indices():
d.stats.index_stats(idx.name)

assert_noleaks(
access_index_stats, iterations=1000, threshold_mb=2.0, check_interval=25
Expand Down
4 changes: 2 additions & 2 deletions python/python/tests/test_optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,8 +296,8 @@ def test_index_remapping_multiple_rewrite_tasks(tmp_path: Path):
fragments = list(ds.get_fragments())
assert len(fragments) == 2

index = ds.list_indices()[0]
index_frag_ids = list(index["fragment_ids"])
index = ds.describe_indices()[0]
index_frag_ids = list(index.segments[0].fragment_ids)
frag_ids = [frag.fragment_id for frag in fragments]

assert len(index_frag_ids) == 1
Expand Down
Loading
Loading