lance-format · westonpace · Nov 18, 2025 · Nov 12, 2025 · Nov 13, 2025 · Nov 13, 2025
diff --git a/protos/table.proto b/protos/table.proto
@@ -257,6 +257,15 @@ message IndexMetadata {
   //
   // Indices should avoid putting large amounts of information in this field, as it will
   // bloat the manifest.
+  //
+  // Indexes are plugins, and so the format of the details message is flexible and not fully
+  // defined by the table format.  However, there are some conventions that should be followed:
+  //
+  // - When Lance APIs refer to indexes they will use the type URL of the index details as the
+  //   identifier for the index type.  If a user provides a simple string identifier like
+  //   "btree" then it will be converted to "/lance.table.BTreeIndexDetails"
+  // - Type URL comparisons are case-insensitive.  Therefore an index must have a unique type
+  //   URL ignoring case.
   google.protobuf.Any index_details = 6;
 
   // The minimum lance version that this index is compatible with.

diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
@@ -75,6 +75,7 @@
 
     from .commit import CommitLock
     from .io import StorageOptionsProvider
+    from .lance.indices import IndexDescription
     from .progress import FragmentWriteProgress
     from .types import ReaderLike
 
@@ -645,8 +646,27 @@ def checkout_latest(self):
         self._ds.checkout_latest()
 
     def list_indices(self) -> List[Index]:
+        """
+        Returns index information for all indices in the dataset.
+
+        This method is slated for deprecation (no warning is emitted yet) because it
+        requires loading the statistics for each index, which can be a very expensive
+        operation.  Instead use describe_indices() to list index information and
+        index_statistics() to get the statistics for individual indexes of interest.
+        """
+        # TODO: https://github.com/lancedb/lance/issues/5237 deprecate this method
+        # warnings.warn(
+        #     "The 'list_indices' method is deprecated.  It may be removed in a future"
+        #     " version.  Use describe_indices() instead.",
+        #     DeprecationWarning,
+        # )
+
         return self._ds.load_indices()
 
+    def describe_indices(self) -> List[IndexDescription]:
+        """Returns a description (type, fields, segments, details) of each index."""
+        return self._ds.describe_indices()
+
     def index_statistics(self, index_name: str) -> Dict[str, Any]:
         warnings.warn(
             "LanceDataset.index_statistics() is deprecated, "

diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi
@@ -60,6 +60,7 @@ from .fragment import (
 from .fragment import (
     RowIdMeta as RowIdMeta,
 )
+from .indices import IndexDescription as IndexDescription
 from .optimize import (
     Compaction as Compaction,
 )
@@ -215,6 +216,7 @@ class _Dataset:
     def index_statistics(self, index_name: str) -> str: ...
     def serialized_manifest(self) -> bytes: ...
     def load_indices(self) -> List[Index]: ...
+    def describe_indices(self) -> List[IndexDescription]: ...
     def scanner(
         self,
         columns: Optional[List[str]] = None,

diff --git a/python/python/lance/lance/indices/__init__.pyi b/python/python/lance/lance/indices/__init__.pyi
@@ -12,6 +12,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+from datetime import datetime
+from typing import Optional
+
 import pyarrow as pa
 
 class IndexConfig:
@@ -47,3 +50,24 @@ def transform_vectors(
     pq_codebook: pa.Array,
     dst_uri: str,
 ): ...
+
+class IndexSegmentDescription:  # describes one segment of an index
+    uuid: str  # UUID identifying this segment
+    dataset_version_at_last_update: int  # presumably the dataset version indexed
+    fragment_ids: set[int]  # presumably ids of the fragments covered
+    index_version: int
+    created_at: Optional[datetime]  # None when unavailable (assumed)
+
+    def __repr__(self) -> str: ...
+
+class IndexDescription:  # summary of one index, as returned by describe_indices()
+    name: str  # index name, e.g. "text_idx"
+    type_url: str  # type URL of the index details message
+    index_type: str  # short type name, e.g. "BTree"
+    num_rows_indexed: int
+    fields: list[int]  # presumably the field ids of the indexed columns
+    field_names: list[str]  # names of the indexed columns
+    segments: list[IndexSegmentDescription]
+    details: dict  # index details as a dict; contents vary by index type
+
+    def __repr__(self) -> str: ...
diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py
@@ -4006,3 +4006,124 @@ def test_json_inverted_match_query(tmp_path):
         full_text_query=MatchQuery("Author,str,tolkien", "json_col")
     )
     assert results.num_rows == 1
+
+
+def test_describe_indices(tmp_path):
+    data = pa.table(
+        {
+            "id": range(100),
+            "text": [f"document {i} about lance database" for i in range(100)],
+            "bitmap": range(100),
+            "bloomfilter": range(100),
+            "btree": range(100),
+            "json": pa.array(
+                [json.dumps({"key": f"value_{i}"}) for i in range(100)], pa.json_()
+            ),
+            "ngram": [f"document {i}" for i in range(100)],
+            "zonemap": range(100),
+        }
+    )
+    ds = lance.write_dataset(data, tmp_path)
+    ds.create_scalar_index("text", index_type="INVERTED")
+    indices = ds.describe_indices()
+    assert len(indices) == 1
+
+    assert indices[0].name == "text_idx"
+    assert indices[0].type_url == "/lance.table.InvertedIndexDetails"
+    assert indices[0].index_type == "Inverted"
+    assert indices[0].num_rows_indexed == 100
+    assert indices[0].fields == [1]
+    assert indices[0].field_names == ["text"]
+    assert len(indices[0].segments) == 1
+    assert indices[0].segments[0].uuid is not None
+    assert indices[0].segments[0].fragment_ids == {0}
+    assert indices[0].segments[0].dataset_version_at_last_update == 1
+    assert indices[0].segments[0].index_version == 1
+    assert indices[0].segments[0].created_at is not None
+    assert isinstance(indices[0].segments[0].created_at, datetime)
+
+    details = indices[0].details
+    assert details is not None and len(details) > 0
+    assert details["lance_tokenizer"] is None
+    assert details["base_tokenizer"] == "simple"
+    assert details["language"] == "English"
+    assert not details["with_position"]
+    assert details["max_token_length"] == 40
+    assert details["lower_case"]
+    assert details["stem"]
+    assert details["remove_stop_words"]
+    assert details["custom_stop_words"] is None
+    assert details["ascii_folding"]
+    assert details["min_ngram_length"] == 3
+    assert details["max_ngram_length"] == 3
+    assert not details["prefix_only"]
+
+    ds.create_scalar_index("bitmap", index_type="BITMAP")
+    ds.create_scalar_index("bloomfilter", index_type="BLOOMFILTER")
+    ds.create_scalar_index("btree", index_type="BTREE")
+    ds.create_scalar_index(
+        "json",
+        IndexConfig(
+            index_type="json", parameters={"target_index_type": "btree", "path": "x"}
+        ),
+    )
+    ds.create_scalar_index("ngram", index_type="NGRAM")
+    ds.create_scalar_index("zonemap", index_type="ZONEMAP")
+
+    indices = ds.describe_indices()
+    # Skip text index since it is already asserted above
+    indices = [index for index in indices if index.name != "text_idx"]
+    indices.sort(key=lambda x: x.name)
+
+    names = [
+        "bitmap_idx",
+        "bloomfilter_idx",
+        "btree_idx",
+        "json_idx",
+        "ngram_idx",
+        "zonemap_idx",
+    ]
+    types_urls = [
+        "/lance.table.BitmapIndexDetails",
+        "/lance.index.pb.BloomFilterIndexDetails",
+        "/lance.table.BTreeIndexDetails",
+        "/lance.index.pb.JsonIndexDetails",
+        "/lance.table.NGramIndexDetails",
+        "/lance.table.ZoneMapIndexDetails",
+    ]
+    index_types = [
+        "Bitmap",
+        "BloomFilter",
+        "BTree",
+        "Json",
+        "NGram",
+        "ZoneMap",
+    ]
+    details = [
+        "{}",
+        "{}",
+        "{}",
+        '{"path":"x","target_details":{}}',
+        "{}",
+        "{}",
+    ]
+
+    for i in range(len(indices)):
+        assert indices[i].name == names[i]
+        assert indices[i].type_url == types_urls[i]
+        assert indices[i].index_type == index_types[i]
+        assert indices[i].num_rows_indexed == 100
+        assert indices[i].fields == [i + 2]
+        assert indices[i].field_names == [data.column_names[i + 2]]
+        assert len(indices[i].segments) == 1
+        assert indices[i].segments[0].fragment_ids == {0}
+        assert indices[i].segments[0].dataset_version_at_last_update == i + 2
+        assert indices[i].segments[0].index_version == 0
+        assert indices[i].segments[0].created_at is not None
+        assert isinstance(indices[i].segments[0].created_at, datetime)
+        assert indices[i].details == json.loads(details[i])
+
+    ds.delete("id < 50")
+    indices = ds.describe_indices()
+    for index in indices:
+        assert index.num_rows_indexed == 50
diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py
@@ -12,7 +12,7 @@
 import pyarrow as pa
 import pyarrow.compute as pc
 import pytest
-from lance import LanceFragment
+from lance import LanceDataset, LanceFragment
 from lance.dataset import VectorIndexReader
 from lance.indices import IndexFileVersion
 from lance.util import validate_vector_index  # noqa: E402
@@ -1391,6 +1391,23 @@ def test_load_indices(dataset):
     assert len(indices) == 1
 
 
+def test_describe_vector_index(indexed_dataset: LanceDataset):
+    info = indexed_dataset.describe_indices()[0]
+
+    assert info.name == "vector_idx"
+    assert info.type_url == "/lance.table.VectorIndexDetails"
+    # This is currently Unknown because vector indices are not yet handled by plugins
+    assert info.index_type == "Unknown"
+    assert info.num_rows_indexed == 1000
+    assert info.fields == [0]
+    assert info.field_names == ["vector"]
+    assert len(info.segments) == 1
+    assert info.segments[0].fragment_ids == {0}
+    assert info.segments[0].dataset_version_at_last_update == 1
+    assert info.segments[0].index_version == 1
+    assert info.segments[0].created_at is not None
+
+
 def test_optimize_indices(indexed_dataset):
     data = create_table()
     indexed_dataset = lance.write_dataset(data, indexed_dataset.uri, mode="append")

diff --git a/python/src/dataset.rs b/python/src/dataset.rs
@@ -86,7 +86,7 @@ use lance_table::io::commit::CommitHandler;
 use crate::error::PythonErrorExt;
 use crate::file::object_store_from_uri_or_path;
 use crate::fragment::FileFragment;
-use crate::indices::PyIndexConfig;
+use crate::indices::{PyIndexConfig, PyIndexDescription};
 use crate::rt;
 use crate::scanner::ScanStatistics;
 use crate::schema::{logical_schema_from_lance, LanceSchema};
@@ -2599,6 +2599,18 @@ impl Dataset {
         let builder = self.ds.sql(&sql);
         Ok(SqlQueryBuilder { builder })
     }
+
+    /// Describes every index in the dataset as a list of `PyIndexDescription`.
+    #[pyo3(signature=())]
+    fn describe_indices(&self, py: Python<'_>) -> PyResult<Vec<PyIndexDescription>> {
+        let snapshot = self.ds.as_ref().clone();
+        let pending = snapshot.describe_indices(None);
+        let found = rt().block_on(Some(py), pending)?.infer_error()?;
+        Ok(found
+            .into_iter()
+            .map(|desc| PyIndexDescription::new(desc.as_ref(), self.ds.as_ref()))
+            .collect())
+    }
 }
 
 #[pyclass(name = "SqlQuery", module = "_lib", subclass)]