From 625924cbc6de40622a1485f1d46118660ab84c04 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Sun, 1 Feb 2026 02:08:43 +0800 Subject: [PATCH 1/5] Fix label list explain for NULL literals --- python/python/tests/test_scalar_index.py | 16 ++++++++++++++++ rust/lance-index/src/scalar.rs | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 75ec01d9a82..f43e59b1f46 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -2054,6 +2054,22 @@ def test_label_list_index_array_contains(tmp_path: Path): assert "ScalarIndexQuery" not in explain +def test_label_list_index_explain_null_literals(tmp_path: Path): + tbl = pa.table({"labels": [["foo", None], ["foo"]]}) + dataset = lance.write_dataset(tbl, tmp_path / "dataset") + dataset.create_scalar_index("labels", index_type="LABEL_LIST") + + # explain_plan should not panic when list literals include NULLs. + for expr in [ + "array_has_any(labels, [NULL])", + "array_has_all(labels, [NULL])", + "array_has_any(labels, ['foo', NULL])", + "array_has_all(labels, ['foo', NULL])", + ]: + explain = dataset.scanner(filter=expr).explain_plan() + assert isinstance(explain, str) + + def test_create_index_empty_dataset(tmp_path: Path): # Creating an index on an empty dataset is (currently) not terribly useful but # we shouldn't return strange errors. diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index 6d07b5b8218..98aebe96e8c 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -549,7 +549,7 @@ impl AnyQuery for LabelListQuery { let offsets_buffer = OffsetBuffer::new(ScalarBuffer::::from(vec![0, labels_arr.len() as i32])); let labels_list = ListArray::try_new( - Arc::new(Field::new("item", labels_arr.data_type().clone(), false)), + Arc::new(Field::new("item", labels_arr.data_type().clone(), true)), offsets_buffer, labels_arr, None, @@ -569,7 +569,7 @@ impl AnyQuery for LabelListQuery { let offsets_buffer = OffsetBuffer::new(ScalarBuffer::::from(vec![0, labels_arr.len() as i32])); let labels_list = ListArray::try_new( - Arc::new(Field::new("item", labels_arr.data_type().clone(), false)), + Arc::new(Field::new("item", labels_arr.data_type().clone(), true)), offsets_buffer, labels_arr, None, From b3b5b46f35a8375997f427db6eb6f5c8f84476a9 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Sun, 1 Feb 2026 02:46:09 +0800 Subject: [PATCH 2/5] Fix label list NULL overlap in bitmap index --- python/python/tests/test_scalar_index.py | 22 ++++++++++++++++++++++ rust/lance-index/src/scalar/bitmap.rs | 7 ++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index f43e59b1f46..fa1922a4cfd 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -2054,6 +2054,28 @@ def test_label_list_index_array_contains(tmp_path: Path): assert "ScalarIndexQuery" not in explain +def test_label_list_index_null_element_match(tmp_path: Path): + """Ensure LABEL_LIST index keeps scan semantics when lists contain NULLs.""" + tbl = pa.table({"labels": [["foo", None], ["foo"], None]}) + dataset = lance.write_dataset(tbl, tmp_path / "dataset") + + filters = [ + "array_has_any(labels, ['foo'])", + "array_has_all(labels, ['foo'])", + "array_contains(labels, 'foo')", + ] + expected = { + f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters + } + + dataset.create_scalar_index("labels", index_type="LABEL_LIST") + + actual = { + f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters + } + assert actual == expected + + def test_label_list_index_explain_null_literals(tmp_path: Path): tbl = pa.table({"labels": [["foo", None], ["foo"]]}) dataset = lance.write_dataset(tbl, tmp_path / "dataset") diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index 4fb9fc3334c..66d749da0df 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -546,7 +546,12 @@ impl ScalarIndex for BitmapIndex { } }; - let selection = NullableRowAddrSet::new(row_ids, null_row_ids.unwrap_or_default()); + let mut null_rows = null_row_ids.unwrap_or_default(); + if !null_rows.is_empty() { + // A row can be both TRUE and NULL after list flattening; treat it as TRUE. + null_rows -= &row_ids; + } + let selection = NullableRowAddrSet::new(row_ids, null_rows); Ok(SearchResult::Exact(selection)) } From d04587c7988b013cb25f9e95908e99df850fbacc Mon Sep 17 00:00:00 2001 From: fenfeng9 <36840213+fenfeng9@users.noreply.github.com> Date: Fri, 6 Feb 2026 13:27:22 +0800 Subject: [PATCH 3/5] Update python/python/tests/test_scalar_index.py Co-authored-by: Will Jones --- python/python/tests/test_scalar_index.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index fa1922a4cfd..741d7128203 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -2063,6 +2063,9 @@ def test_label_list_index_null_element_match(tmp_path: Path): "array_has_any(labels, ['foo'])", "array_has_all(labels, ['foo'])", "array_contains(labels, 'foo')", + "NOT array_has_any(labels, ['foo'])", + "NOT array_has_all(labels, ['foo'])", + "NOT array_contains(labels, 'foo')", ] expected = { f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters From bc7897bdd9db1a66a5d2d0baf0d0b44553f0c154 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Sat, 7 Feb 2026 15:10:05 +0800 Subject: [PATCH 4/5] test: update label index test case --- python/python/tests/test_scalar_index.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 741d7128203..39e14f2ce3c 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -2056,16 +2056,19 @@ def test_label_list_index_array_contains(tmp_path: Path): def test_label_list_index_null_element_match(tmp_path: Path): """Ensure LABEL_LIST index keeps scan semantics when lists contain NULLs.""" - tbl = pa.table({"labels": [["foo", None], ["foo"], None]}) + tbl = pa.table( + {"labels": [["foo", None], ["foo"], ["bar", None], ["bar"], None, []]} + ) dataset = lance.write_dataset(tbl, tmp_path / "dataset") filters = [ "array_has_any(labels, ['foo'])", "array_has_all(labels, ['foo'])", "array_contains(labels, 'foo')", - "NOT array_has_any(labels, ['foo'])", - "NOT array_has_all(labels, ['foo'])", - "NOT array_contains(labels, 'foo')", + # TODO(issue #5904): Enable after fixing NOT filters with NULL lists/elements + # "NOT array_has_any(labels, ['foo'])", + # "NOT array_has_all(labels, ['foo'])", + # "NOT array_contains(labels, 'foo')", ] expected = { f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters From 690fe230386db13665d7482c3ea60f609608c558 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Sat, 7 Feb 2026 19:47:23 +0800 Subject: [PATCH 5/5] fix(lance-index): ignore null elements in label_list matching - Clear element-level nulls in label_list searches - Update null-handling tests for label_list --- python/python/tests/test_scalar_index.py | 58 +++++++++++++++++++-- rust/lance-index/src/scalar/bitmap.rs | 7 +-- rust/lance-index/src/scalar/label_list.rs | 9 +++- rust/lance-index/src/scalar/lance_format.rs | 14 ++--- 4 files changed, 66 insertions(+), 22 deletions(-) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 39e14f2ce3c..bc4aa803399 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -2055,9 +2055,9 @@ def test_label_list_index_array_contains(tmp_path: Path): def test_label_list_index_null_element_match(tmp_path: Path): - """Ensure LABEL_LIST index keeps scan semantics when lists contain NULLs.""" + """Covers NULL elements inside non-NULL lists (list itself is never NULL).""" tbl = pa.table( - {"labels": [["foo", None], ["foo"], ["bar", None], ["bar"], None, []]} + {"labels": [["foo", None], ["foo"], ["bar", None], [None], ["bar"], []]} ) dataset = lance.write_dataset(tbl, tmp_path / "dataset") @@ -2065,7 +2065,32 @@ def test_label_list_index_null_element_match(tmp_path: Path): "array_has_any(labels, ['foo'])", "array_has_all(labels, ['foo'])", "array_contains(labels, 'foo')", - # TODO(issue #5904): Enable after fixing NOT filters with NULL lists/elements + "NOT array_has_any(labels, ['foo'])", + "NOT array_has_all(labels, ['foo'])", + "NOT array_contains(labels, 'foo')", + ] + expected = { + f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters + } + + dataset.create_scalar_index("labels", index_type="LABEL_LIST") + + actual = { + f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters + } + assert actual == expected + + +def test_label_list_index_null_list_match(tmp_path: Path): + """Covers NULL lists (list itself is NULL, elements are not NULL).""" + tbl = pa.table({"labels": [["foo"], ["bar"], None, []]}) + dataset = lance.write_dataset(tbl, tmp_path / "dataset") + + filters = [ + "array_has_any(labels, ['foo'])", + "array_has_all(labels, ['foo'])", + "array_contains(labels, 'foo')", + # TODO(issue #5904): Enable after fixing NOT filters with whole-list NULLs # "NOT array_has_any(labels, ['foo'])", # "NOT array_has_all(labels, ['foo'])", # "NOT array_contains(labels, 'foo')", @@ -2082,6 +2107,33 @@ def test_label_list_index_null_element_match(tmp_path: Path): assert actual == expected +def test_label_list_index_null_literal_filters(tmp_path: Path): + """Ensure filters with NULL literal needles produce consistent results with scan.""" + tbl = pa.table( + {"labels": [["foo", None], ["bar", None], [None], ["foo"], ["bar"], []]} + ) + dataset = lance.write_dataset(tbl, tmp_path / "dataset") + + filters = [ + "array_has_any(labels, [NULL])", + "array_has_all(labels, [NULL])", + "array_contains(labels, NULL)", + "NOT array_has_any(labels, [NULL])", + "NOT array_has_all(labels, [NULL])", + "NOT array_contains(labels, NULL)", + ] + expected = { + f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters + } + + dataset.create_scalar_index("labels", index_type="LABEL_LIST") + + actual = { + f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters + } + assert actual == expected + + def test_label_list_index_explain_null_literals(tmp_path: Path): tbl = pa.table({"labels": [["foo", None], ["foo"]]}) dataset = lance.write_dataset(tbl, tmp_path / "dataset") diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index 66d749da0df..4fb9fc3334c 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -546,12 +546,7 @@ impl ScalarIndex for BitmapIndex { } }; - let mut null_rows = null_row_ids.unwrap_or_default(); - if !null_rows.is_empty() { - // A row can be both TRUE and NULL after list flattening; treat it as TRUE. - null_rows -= &row_ids; - } - let selection = NullableRowAddrSet::new(row_ids, null_rows); + let selection = NullableRowAddrSet::new(row_ids, null_row_ids.unwrap_or_default()); Ok(SearchResult::Exact(selection)) } diff --git a/rust/lance-index/src/scalar/label_list.rs b/rust/lance-index/src/scalar/label_list.rs index 0cfd00d4866..e971c45fa97 100644 --- a/rust/lance-index/src/scalar/label_list.rs +++ b/rust/lance-index/src/scalar/label_list.rs @@ -13,7 +13,7 @@ use datafusion_common::ScalarValue; use deepsize::DeepSizeOf; use futures::{stream::BoxStream, StreamExt, TryStream, TryStreamExt}; use lance_core::cache::LanceCache; -use lance_core::utils::mask::NullableRowAddrSet; +use lance_core::utils::mask::{NullableRowAddrSet, RowAddrTreeMap}; use lance_core::{Error, Result}; use roaring::RoaringBitmap; use snafu::location; @@ -45,7 +45,12 @@ trait LabelListSubIndex: ScalarIndex + DeepSizeOf { ) -> Result { let result = self.search(query, metrics).await?; match result { - SearchResult::Exact(row_ids) => Ok(row_ids), + SearchResult::Exact(row_ids) => { + // Label list semantics treat NULL elements as non-matches, so only TRUE/FALSE + // results should remain for array_has_any/array_has_all when the list itself + // is non-NULL. Clear nulls to avoid propagating element-level NULLs. + Ok(row_ids.with_nulls(RowAddrTreeMap::new())) + } _ => Err(Error::Internal { message: "Label list sub-index should return exact results".to_string(), location: location!(), diff --git a/rust/lance-index/src/scalar/lance_format.rs b/rust/lance-index/src/scalar/lance_format.rs index 817fb803c64..cdb3f73db84 100644 --- a/rust/lance-index/src/scalar/lance_format.rs +++ b/rust/lance-index/src/scalar/lance_format.rs @@ -1551,7 +1551,7 @@ pub mod tests { // Test: Search for lists containing value 1 // Row 0: [1, 2] - contains 1 → TRUE - // Row 1: [3, null] - has null item, unknown if it matches → NULL + // Row 1: [3, null] - null elements are ignored → FALSE // Row 2: [4] - doesn't contain 1 → FALSE let query = LabelListQuery::HasAnyLabel(vec![ScalarValue::UInt8(Some(1))]); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); @@ -1570,17 +1570,9 @@ pub mod tests { "Should find row 0 where list contains 1" ); - let null_row_ids = row_ids.null_rows(); assert!( - !null_row_ids.is_empty(), - "null_row_ids should not be empty - row 1 has null item" - ); - let null_rows: Vec = - null_row_ids.row_addrs().unwrap().map(u64::from).collect(); - assert_eq!( - null_rows, - vec![1], - "Should report row 1 as null because it contains a null item" + row_ids.null_rows().is_empty(), + "null_row_ids should be empty when null elements are ignored" ); } _ => panic!("Expected Exact search result"),