diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 75ec01d9a82..bc4aa803399 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -2054,6 +2054,102 @@ def test_label_list_index_array_contains(tmp_path: Path): assert "ScalarIndexQuery" not in explain +def test_label_list_index_null_element_match(tmp_path: Path): + """Covers NULL elements inside non-NULL lists (list itself is never NULL).""" + tbl = pa.table( + {"labels": [["foo", None], ["foo"], ["bar", None], [None], ["bar"], []]} + ) + dataset = lance.write_dataset(tbl, tmp_path / "dataset") + + filters = [ + "array_has_any(labels, ['foo'])", + "array_has_all(labels, ['foo'])", + "array_contains(labels, 'foo')", + "NOT array_has_any(labels, ['foo'])", + "NOT array_has_all(labels, ['foo'])", + "NOT array_contains(labels, 'foo')", + ] + expected = { + f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters + } + + dataset.create_scalar_index("labels", index_type="LABEL_LIST") + + actual = { + f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters + } + assert actual == expected + + +def test_label_list_index_null_list_match(tmp_path: Path): + """Covers NULL lists (list itself is NULL, elements are not NULL).""" + tbl = pa.table({"labels": [["foo"], ["bar"], None, []]}) + dataset = lance.write_dataset(tbl, tmp_path / "dataset") + + filters = [ + "array_has_any(labels, ['foo'])", + "array_has_all(labels, ['foo'])", + "array_contains(labels, 'foo')", + # TODO(issue #5904): Enable after fixing NOT filters with whole-list NULLs + # "NOT array_has_any(labels, ['foo'])", + # "NOT array_has_all(labels, ['foo'])", + # "NOT array_contains(labels, 'foo')", + ] + expected = { + f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters + } + + dataset.create_scalar_index("labels", index_type="LABEL_LIST") + + actual = { + f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters + } + assert actual == expected + + +def test_label_list_index_null_literal_filters(tmp_path: Path): + """Ensure filters with NULL literal needles produce consistent results with scan.""" + tbl = pa.table( + {"labels": [["foo", None], ["bar", None], [None], ["foo"], ["bar"], []]} + ) + dataset = lance.write_dataset(tbl, tmp_path / "dataset") + + filters = [ + "array_has_any(labels, [NULL])", + "array_has_all(labels, [NULL])", + "array_contains(labels, NULL)", + "NOT array_has_any(labels, [NULL])", + "NOT array_has_all(labels, [NULL])", + "NOT array_contains(labels, NULL)", + ] + expected = { + f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters + } + + dataset.create_scalar_index("labels", index_type="LABEL_LIST") + + actual = { + f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters + } + assert actual == expected + + +def test_label_list_index_explain_null_literals(tmp_path: Path): + tbl = pa.table({"labels": [["foo", None], ["foo"]]}) + dataset = lance.write_dataset(tbl, tmp_path / "dataset") + dataset.create_scalar_index("labels", index_type="LABEL_LIST") + + # explain_plan should not panic when list literals include NULLs. + for expr in [ + "array_has_any(labels, [NULL])", + "array_has_all(labels, [NULL])", + "array_has_any(labels, ['foo', NULL])", + "array_has_all(labels, ['foo', NULL])", + ]: + explain = dataset.scanner(filter=expr).explain_plan() + assert isinstance(explain, str) + + def test_create_index_empty_dataset(tmp_path: Path): # Creating an index on an empty dataset is (currently) not terribly useful but # we shouldn't return strange errors. diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index 6d07b5b8218..98aebe96e8c 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -549,7 +549,7 @@ impl AnyQuery for LabelListQuery { let offsets_buffer = OffsetBuffer::new(ScalarBuffer::::from(vec![0, labels_arr.len() as i32])); let labels_list = ListArray::try_new( - Arc::new(Field::new("item", labels_arr.data_type().clone(), false)), + Arc::new(Field::new("item", labels_arr.data_type().clone(), true)), offsets_buffer, labels_arr, None, @@ -569,7 +569,7 @@ impl AnyQuery for LabelListQuery { let offsets_buffer = OffsetBuffer::new(ScalarBuffer::::from(vec![0, labels_arr.len() as i32])); let labels_list = ListArray::try_new( - Arc::new(Field::new("item", labels_arr.data_type().clone(), false)), + Arc::new(Field::new("item", labels_arr.data_type().clone(), true)), offsets_buffer, labels_arr, None, diff --git a/rust/lance-index/src/scalar/label_list.rs b/rust/lance-index/src/scalar/label_list.rs index 0cfd00d4866..e971c45fa97 100644 --- a/rust/lance-index/src/scalar/label_list.rs +++ b/rust/lance-index/src/scalar/label_list.rs @@ -13,7 +13,7 @@ use datafusion_common::ScalarValue; use deepsize::DeepSizeOf; use futures::{stream::BoxStream, StreamExt, TryStream, TryStreamExt}; use lance_core::cache::LanceCache; -use lance_core::utils::mask::NullableRowAddrSet; +use lance_core::utils::mask::{NullableRowAddrSet, RowAddrTreeMap}; use lance_core::{Error, Result}; use roaring::RoaringBitmap; use snafu::location; @@ -45,7 +45,12 @@ trait LabelListSubIndex: ScalarIndex + DeepSizeOf { ) -> Result { let result = self.search(query, metrics).await?; match result { - SearchResult::Exact(row_ids) => Ok(row_ids), + SearchResult::Exact(row_ids) => { + // Label list semantics treat NULL elements as non-matches, so only TRUE/FALSE + // results should remain for array_has_any/array_has_all when the list itself + // is non-NULL. Clear nulls to avoid propagating element-level NULLs. + Ok(row_ids.with_nulls(RowAddrTreeMap::new())) + } _ => Err(Error::Internal { message: "Label list sub-index should return exact results".to_string(), location: location!(), diff --git a/rust/lance-index/src/scalar/lance_format.rs b/rust/lance-index/src/scalar/lance_format.rs index 817fb803c64..cdb3f73db84 100644 --- a/rust/lance-index/src/scalar/lance_format.rs +++ b/rust/lance-index/src/scalar/lance_format.rs @@ -1551,7 +1551,7 @@ pub mod tests { // Test: Search for lists containing value 1 // Row 0: [1, 2] - contains 1 → TRUE - // Row 1: [3, null] - has null item, unknown if it matches → NULL + // Row 1: [3, null] - null elements are ignored → FALSE // Row 2: [4] - doesn't contain 1 → FALSE let query = LabelListQuery::HasAnyLabel(vec![ScalarValue::UInt8(Some(1))]); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); @@ -1570,17 +1570,9 @@ pub mod tests { "Should find row 0 where list contains 1" ); - let null_row_ids = row_ids.null_rows(); assert!( - !null_row_ids.is_empty(), - "null_row_ids should not be empty - row 1 has null item" - ); - let null_rows: Vec = - null_row_ids.row_addrs().unwrap().map(u64::from).collect(); - assert_eq!( - null_rows, - vec![1], - "Should report row 1 as null because it contains a null item" + row_ids.null_rows().is_empty(), + "null_row_ids should be empty when null elements are ignored" ); } _ => panic!("Expected Exact search result"),