Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions python/python/tests/test_scalar_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2054,6 +2054,102 @@ def test_label_list_index_array_contains(tmp_path: Path):
assert "ScalarIndexQuery" not in explain


def test_label_list_index_null_element_match(tmp_path: Path):
    """Compare LABEL_LIST index results with a plain scan for NULL elements.

    Covers NULL elements inside non-NULL lists (the list itself is never NULL).
    Each filter is evaluated once before the index exists and once after it is
    created; both runs must return the same rows.
    """
    tbl = pa.table(
        {"labels": [["foo", None], ["foo"], ["bar", None], [None], ["bar"], []]}
    )
    dataset = lance.write_dataset(tbl, tmp_path / "dataset")

    filters = [
        "array_has_any(labels, ['foo'])",
        "array_has_all(labels, ['foo'])",
        "array_contains(labels, 'foo')",
        "NOT array_has_any(labels, ['foo'])",
        "NOT array_has_all(labels, ['foo'])",
        "NOT array_contains(labels, 'foo')",
    ]

    def run_filters():
        # Map each filter expression to the list of "labels" values it returns.
        return {
            f: dataset.to_table(filter=f).column("labels").to_pylist()
            for f in filters
        }

    # Baseline: no scalar index has been created on the dataset yet.
    expected = run_filters()

    dataset.create_scalar_index("labels", index_type="LABEL_LIST")

    # Same filters again, now with the LABEL_LIST index present.
    actual = run_filters()
    assert actual == expected


def test_label_list_index_null_list_match(tmp_path: Path):
    """Compare LABEL_LIST index results with a plain scan for NULL lists.

    Covers rows where the list itself is NULL while the elements of the other
    lists are not NULL. Results gathered before the index is created must equal
    the results gathered afterwards.
    """
    data = pa.table({"labels": [["foo"], ["bar"], None, []]})
    ds = lance.write_dataset(data, tmp_path / "dataset")

    predicates = [
        "array_has_any(labels, ['foo'])",
        "array_has_all(labels, ['foo'])",
        "array_contains(labels, 'foo')",
        # TODO(issue #5904): Enable after fixing NOT filters with whole-list NULLs
        # "NOT array_has_any(labels, ['foo'])",
        # "NOT array_has_all(labels, ['foo'])",
        # "NOT array_contains(labels, 'foo')",
    ]

    def collect():
        # Evaluate every predicate and record the matching "labels" values.
        rows_by_predicate = {}
        for predicate in predicates:
            matched = ds.to_table(filter=predicate)
            rows_by_predicate[predicate] = matched.column("labels").to_pylist()
        return rows_by_predicate

    # Baseline taken before any scalar index is created.
    baseline = collect()

    ds.create_scalar_index("labels", index_type="LABEL_LIST")

    indexed = collect()
    assert indexed == baseline


def test_label_list_index_null_literal_filters(tmp_path: Path):
    """Ensure NULL literal needles give the same rows with and without the index.

    Every filter uses NULL as the searched value. The rows returned before the
    LABEL_LIST index is created are the reference for the rows returned after.
    """
    data = pa.table(
        {"labels": [["foo", None], ["bar", None], [None], ["foo"], ["bar"], []]}
    )
    ds = lance.write_dataset(data, tmp_path / "dataset")

    predicates = [
        "array_has_any(labels, [NULL])",
        "array_has_all(labels, [NULL])",
        "array_contains(labels, NULL)",
        "NOT array_has_any(labels, [NULL])",
        "NOT array_has_all(labels, [NULL])",
        "NOT array_contains(labels, NULL)",
    ]

    def collect():
        # Evaluate every predicate and record the matching "labels" values.
        rows_by_predicate = {}
        for predicate in predicates:
            matched = ds.to_table(filter=predicate)
            rows_by_predicate[predicate] = matched.column("labels").to_pylist()
        return rows_by_predicate

    # Baseline taken before any scalar index is created.
    baseline = collect()

    ds.create_scalar_index("labels", index_type="LABEL_LIST")

    indexed = collect()
    assert indexed == baseline


def test_label_list_index_explain_null_literals(tmp_path: Path):
    """Check that explain_plan returns a string for list literals with NULLs.

    explain_plan should not panic when list literals include NULLs.
    """
    data = pa.table({"labels": [["foo", None], ["foo"]]})
    ds = lance.write_dataset(data, tmp_path / "dataset")
    ds.create_scalar_index("labels", index_type="LABEL_LIST")

    null_literal_filters = (
        "array_has_any(labels, [NULL])",
        "array_has_all(labels, [NULL])",
        "array_has_any(labels, ['foo', NULL])",
        "array_has_all(labels, ['foo', NULL])",
    )
    for predicate in null_literal_filters:
        plan = ds.scanner(filter=predicate).explain_plan()
        assert isinstance(plan, str)


def test_create_index_empty_dataset(tmp_path: Path):
# Creating an index on an empty dataset is (currently) not terribly useful but
# we shouldn't return strange errors.
Expand Down
4 changes: 2 additions & 2 deletions rust/lance-index/src/scalar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,7 @@ impl AnyQuery for LabelListQuery {
let offsets_buffer =
OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, labels_arr.len() as i32]));
let labels_list = ListArray::try_new(
Arc::new(Field::new("item", labels_arr.data_type().clone(), false)),
Arc::new(Field::new("item", labels_arr.data_type().clone(), true)),
offsets_buffer,
labels_arr,
None,
Expand All @@ -569,7 +569,7 @@ impl AnyQuery for LabelListQuery {
let offsets_buffer =
OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, labels_arr.len() as i32]));
let labels_list = ListArray::try_new(
Arc::new(Field::new("item", labels_arr.data_type().clone(), false)),
Arc::new(Field::new("item", labels_arr.data_type().clone(), true)),
offsets_buffer,
labels_arr,
None,
Expand Down
9 changes: 7 additions & 2 deletions rust/lance-index/src/scalar/label_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use datafusion_common::ScalarValue;
use deepsize::DeepSizeOf;
use futures::{stream::BoxStream, StreamExt, TryStream, TryStreamExt};
use lance_core::cache::LanceCache;
use lance_core::utils::mask::NullableRowAddrSet;
use lance_core::utils::mask::{NullableRowAddrSet, RowAddrTreeMap};
use lance_core::{Error, Result};
use roaring::RoaringBitmap;
use snafu::location;
Expand Down Expand Up @@ -45,7 +45,12 @@ trait LabelListSubIndex: ScalarIndex + DeepSizeOf {
) -> Result<NullableRowAddrSet> {
let result = self.search(query, metrics).await?;
match result {
SearchResult::Exact(row_ids) => Ok(row_ids),
SearchResult::Exact(row_ids) => {
// Label list semantics treat NULL elements as non-matches, so only TRUE/FALSE
// results should remain for array_has_any/array_has_all when the list itself
// is non-NULL. Clear nulls to avoid propagating element-level NULLs.
Ok(row_ids.with_nulls(RowAddrTreeMap::new()))
}
_ => Err(Error::Internal {
message: "Label list sub-index should return exact results".to_string(),
location: location!(),
Expand Down
14 changes: 3 additions & 11 deletions rust/lance-index/src/scalar/lance_format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1551,7 +1551,7 @@ pub mod tests {

// Test: Search for lists containing value 1
// Row 0: [1, 2] - contains 1 → TRUE
// Row 1: [3, null] - has null item, unknown if it matches → NULL
// Row 1: [3, null] - null elements are ignored → FALSE
// Row 2: [4] - doesn't contain 1 → FALSE
let query = LabelListQuery::HasAnyLabel(vec![ScalarValue::UInt8(Some(1))]);
let result = index.search(&query, &NoOpMetricsCollector).await.unwrap();
Expand All @@ -1570,17 +1570,9 @@ pub mod tests {
"Should find row 0 where list contains 1"
);

let null_row_ids = row_ids.null_rows();
assert!(
!null_row_ids.is_empty(),
"null_row_ids should not be empty - row 1 has null item"
);
let null_rows: Vec<u64> =
null_row_ids.row_addrs().unwrap().map(u64::from).collect();
assert_eq!(
null_rows,
vec![1],
"Should report row 1 as null because it contains a null item"
row_ids.null_rows().is_empty(),
"null_row_ids should be empty when null elements are ignored"
);
}
_ => panic!("Expected Exact search result"),
Expand Down