From d826391280ce14ca2b1ddb275311098f4fd660e3 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Sun, 11 Jan 2026 00:30:54 +0800 Subject: [PATCH 1/4] lance-index: use LABEL_LIST index for array_contains --- rust/lance-index/src/scalar/expression.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index edf9b055721..603e7852fd0 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -490,6 +490,24 @@ impl ScalarQueryParser for LabelListQueryParser { if args.len() != 2 { return None; } + if func.name() == "array_has" { + let inner_type = match data_type { + DataType::List(field) | DataType::LargeList(field) => field.data_type(), + _ => return None, + }; + let scalar = maybe_scalar(&args[1], inner_type)?; + // Do not push down NULL needles. + if scalar.is_null() { + return None; + } + let query = LabelListQuery::HasAnyLabel(vec![scalar]); + return Some(IndexedExpression::index_query( + column.to_string(), + self.index_name.clone(), + Arc::new(query), + )); + } + let label_list = maybe_scalar(&args[1], data_type)?; if let ScalarValue::List(list_arr) = label_list { let list_values = list_arr.values(); @@ -1651,6 +1669,7 @@ fn visit_node( } match expr { Expr::Between(between) => Ok(visit_between(between, index_info)), + Expr::Alias(alias) => visit_node(alias.expr.as_ref(), index_info, depth), Expr::Column(_) => Ok(visit_column(expr, index_info)), Expr::InList(in_list) => Ok(visit_in_list(in_list, index_info)), Expr::IsFalse(expr) => Ok(visit_is_bool(expr.as_ref(), index_info, false)), From cd0790065e67444d8e93094079ea26e7909530af Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Sun, 11 Jan 2026 00:32:17 +0800 Subject: [PATCH 2/4] python: test LABEL_LIST index with array_contains --- python/python/tests/test_scalar_index.py | 33 ++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git 
a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 443172db13e..859eb737d82 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -1977,6 +1977,39 @@ def test_label_list_index(tmp_path: Path): assert indices[0]["type"] == "LabelList" +def test_label_list_index_array_contains(tmp_path: Path): + # Include lists with NULL items to ensure NULL needle behavior matches + # non-index execution. + tbl = pa.table( + {"labels": [["foo", "bar"], ["bar"], ["baz"], ["qux", None], [None], []]} + ) + dataset = lance.write_dataset(tbl, tmp_path / "dataset") + expected_null_rows = dataset.to_table( + filter="array_contains(labels, NULL)" + ).num_rows + + dataset.create_scalar_index("labels", index_type="LABEL_LIST") + + result = dataset.to_table(filter="array_contains(labels, 'foo')") + assert result.num_rows == 1 + + result = dataset.to_table(filter="array_contains(labels, 'bar')") + assert result.num_rows == 2 + + result = dataset.to_table(filter="array_contains(labels, 'oof')") + assert result.num_rows == 0 + + explain = dataset.scanner(filter="array_contains(labels, 'foo')").explain_plan() + assert "ScalarIndexQuery" in explain + + # NULL needle: preserve semantics (must match pre-index execution) and avoid + # using the LABEL_LIST index. + actual_null_rows = dataset.to_table(filter="array_contains(labels, NULL)").num_rows + assert actual_null_rows == expected_null_rows + explain = dataset.scanner(filter="array_contains(labels, NULL)").explain_plan() + assert "ScalarIndexQuery" not in explain + + def test_create_index_empty_dataset(tmp_path: Path): # Creating an index on an empty dataset is (currently) not terribly useful but # we shouldn't return strange errors. 
From 61c096882574e5f9574393c91d0c024bcc1f5075 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Sun, 11 Jan 2026 00:32:42 +0800 Subject: [PATCH 3/4] docs: clarify LABEL_LIST membership filters --- docs/src/format/table/index/scalar/label_list.md | 9 +++++---- python/python/lance/dataset.py | 5 +++-- rust/lance-index/src/scalar/label_list.rs | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/src/format/table/index/scalar/label_list.md b/docs/src/format/table/index/scalar/label_list.md index 8d50f2638b0..1c5cb5cdaa1 100644 --- a/docs/src/format/table/index/scalar/label_list.md +++ b/docs/src/format/table/index/scalar/label_list.md @@ -26,7 +26,8 @@ The label list index uses a bitmap index internally and stores its data in: The label list index provides exact results for the following query types: -| Query Type | Description | Operation | Result Type | -|----------------------|----------------------------------------|---------------------------------------------|-------------| -| **array_has_all** | Array contains all specified values | Intersects bitmaps for all specified labels | Exact | -| **array_has_any** | Array contains any of specified values | Unions bitmaps for all specified labels | Exact | \ No newline at end of file +| Query Type | Description | Operation | Result Type | +|-------------------------------------|----------------------------------------|---------------------------------------------|-------------| +| **array_has / array_contains** | Array contains the specified value | Bitmap lookup for a single label | Exact | +| **array_has_all** | Array contains all specified values | Intersects bitmaps for all specified labels | Exact | +| **array_has_any** | Array contains any of specified values | Unions bitmaps for all specified labels | Exact | diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 942654d5a7b..a5e430b92f8 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py 
@@ -2398,8 +2398,9 @@ def create_scalar_index( * ``LABEL_LIST``. A special index that is used to index list columns whose values have small cardinality. For example, a column that contains lists of tags (e.g. ``["tag1", "tag2", "tag3"]``) can be indexed - with a ``LABEL_LIST`` index. This index can only speedup queries with - ``array_has_any`` or ``array_has_all`` filters. + with a ``LABEL_LIST`` index. This index can speed up list membership + filters such as ``array_has_any``, ``array_has_all``, and + ``array_has`` / ``array_contains``. * ``NGRAM``. A special index that is used to index string columns. This index creates a bitmap for each ngram in the string. By default we use trigrams. This index can currently speed up queries using the ``contains`` function diff --git a/rust/lance-index/src/scalar/label_list.rs b/rust/lance-index/src/scalar/label_list.rs index b9850b3c01c..0cfd00d4866 100644 --- a/rust/lance-index/src/scalar/label_list.rs +++ b/rust/lance-index/src/scalar/label_list.rs @@ -57,8 +57,8 @@ trait LabelListSubIndex: ScalarIndex + DeepSizeOf { impl<T: ScalarIndex + DeepSizeOf> LabelListSubIndex for T {} /// A scalar index that can be used on `List` columns to -/// support queries with array_contains_all and array_contains_any -/// using an underlying bitmap index. +/// accelerate list membership filters such as `array_has_all`, `array_has_any`, +/// and `array_has` / `array_contains`, using an underlying bitmap index. 
#[derive(Clone, Debug, DeepSizeOf)] pub struct LabelListIndex { values_index: Arc, From 33558d583c65c1534fb97a3e135b539b835bf68a Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Mon, 12 Jan 2026 23:31:55 +0800 Subject: [PATCH 4/4] test: add top-level NULL coverage and clarify scalar semantics --- python/python/tests/test_scalar_index.py | 2 +- rust/lance-index/src/scalar/expression.rs | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 859eb737d82..d81c89b0dff 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -1981,7 +1981,7 @@ def test_label_list_index_array_contains(tmp_path: Path): # Include lists with NULL items to ensure NULL needle behavior matches # non-index execution. tbl = pa.table( - {"labels": [["foo", "bar"], ["bar"], ["baz"], ["qux", None], [None], []]} + {"labels": [["foo", "bar"], ["bar"], ["baz"], ["qux", None], [None], [], None]} ) dataset = lance.write_dataset(tbl, tmp_path / "dataset") expected_null_rows = dataset.to_table( diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index 603e7852fd0..38f30034739 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -490,13 +490,15 @@ impl ScalarQueryParser for LabelListQueryParser { if args.len() != 2 { return None; } + // DataFusion normalizes array_contains to array_has if func.name() == "array_has" { let inner_type = match data_type { DataType::List(field) | DataType::LargeList(field) => field.data_type(), _ => return None, }; let scalar = maybe_scalar(&args[1], inner_type)?; - // Do not push down NULL needles. + // array_has(..., NULL) returns no matches in DataFusion, but the index would + // match rows containing NULL. Fall back to match DataFusion behavior. if scalar.is_null() { return None; }