From 85790d2ecd03e10716075a908353bcc6f510e5b9 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Mon, 9 Feb 2026 06:13:43 +0800 Subject: [PATCH 1/5] fix: correct OR null semantics for nullable masks --- python/python/tests/test_scalar_index.py | 17 ++++++++++ rust/lance-core/src/utils/mask/nullable.rs | 36 +++++++++++++++++++--- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index ce6a66944c0..7b207b0dd7c 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -2010,6 +2010,23 @@ def test_scalar_index_with_nulls(tmp_path): assert len(result) == k +def test_scalar_index_or_with_nulls_matches_scan(tmp_path: Path): + data = pa.table({"c1": pa.array([None, 1], type=pa.int64())}) + filters = [ + "(c1 != 0) OR (c1 < 5)", + "NOT ((c1 != 0) OR (c1 < 5))", + ] + + ds_no = lance.write_dataset(data, tmp_path / "no") + expected = {f: ds_no.to_table(filter=f).num_rows for f in filters} + + for idx_type in ["BITMAP", "BTREE"]: + ds = lance.write_dataset(data, tmp_path / idx_type.lower()) + ds.create_scalar_index("c1", index_type=idx_type) + for f in filters: + assert ds.to_table(filter=f).num_rows == expected[f] + + def test_label_list_index(tmp_path: Path): tags = pa.array(["tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7"]) tag_list = pa.ListArray.from_arrays([0, 2, 4], tags) diff --git a/rust/lance-core/src/utils/mask/nullable.rs b/rust/lance-core/src/utils/mask/nullable.rs index 1f4b5b7dad3..d7ebb44297c 100644 --- a/rust/lance-core/src/utils/mask/nullable.rs +++ b/rust/lance-core/src/utils/mask/nullable.rs @@ -237,14 +237,18 @@ impl std::ops::BitOr for NullableRowAddrMask { } (Self::AllowList(allow), Self::BlockList(block)) | (Self::BlockList(block), Self::AllowList(allow)) => { + let allow_true = allow.selected.clone() - &allow.nulls; + let block_false = block.selected.clone() - &block.nulls; + let nulls = if allow.nulls.is_empty() && block.nulls.is_empty() { RowAddrTreeMap::new() // Fast path } else { - // null or null -> null (excluding rows that are true in either) - let allow_true = allow.selected.clone() - &allow.nulls; - ((allow.nulls | block.nulls) & block.selected.clone()) - allow_true + // NULL if (allow NULL and block FALSE) or (block NULL and allow not TRUE) + (allow.nulls.clone() & &block_false) | (block.nulls - &allow_true) }; - let selected = (block.selected - allow.selected) | &nulls; + // Remaining FALSE rows after OR (block FALSE minus allow TRUE/NULL) + let false_rows = block_false - &allow_true - &allow.nulls; + let selected = false_rows | &nulls; Self::BlockList(NullableRowAddrSet { selected, nulls }) } (Self::BlockList(a), Self::BlockList(b)) => { @@ -363,6 +367,30 @@ mod tests { assert_mask_selects(&result, &[], &[0, 1, 2, 3]); } + #[test] + fn test_or_allow_block_keeps_block_nulls() { + // Allow|Block OR must preserve NULLs from block even when block.selected is empty. + // allow: TRUE=[1], NULL=[0]; block: FALSE=[], NULL=[0] + let allow_mask = allow(&[1], &[0]); + let block_mask = block(&[], &[0]); + let result = allow_mask | block_mask; + + // Row 1 is TRUE; row 0 remains NULL (not selected) + assert_mask_selects(&result, &[1], &[0]); + } + + #[test] + fn test_or_allow_block_keeps_block_nulls_with_false_rows() { + // Ensure FALSE stays FALSE and NULL stays NULL when both appear on the block side. + // allow: TRUE=[2], NULL=[]; block: FALSE=[1], NULL=[0] + let allow_mask = allow(&[2], &[]); + let block_mask = block(&[1], &[0]); + let result = allow_mask | block_mask; + + // Row 2 is TRUE; row 1 is FALSE; row 0 remains NULL (not selected) + assert_mask_selects(&result, &[2], &[0, 1]); + } + #[test] fn test_row_selection_bit_or() { // [T, N, T, N, F, F, F] From 7cd85a13e1330e8311dbbb9fd12925787d5c3d81 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Mon, 9 Feb 2026 17:39:23 +0800 Subject: [PATCH 2/5] chore: simplify nullable OR selected computation --- rust/lance-core/src/utils/mask/nullable.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/rust/lance-core/src/utils/mask/nullable.rs b/rust/lance-core/src/utils/mask/nullable.rs index d7ebb44297c..585bd40a08a 100644 --- a/rust/lance-core/src/utils/mask/nullable.rs +++ b/rust/lance-core/src/utils/mask/nullable.rs @@ -243,12 +243,9 @@ impl std::ops::BitOr for NullableRowAddrMask { let nulls = if allow.nulls.is_empty() && block.nulls.is_empty() { RowAddrTreeMap::new() // Fast path } else { - // NULL if (allow NULL and block FALSE) or (block NULL and allow not TRUE) (allow.nulls.clone() & &block_false) | (block.nulls - &allow_true) }; - // Remaining FALSE rows after OR (block FALSE minus allow TRUE/NULL) - let false_rows = block_false - &allow_true - &allow.nulls; - let selected = false_rows | &nulls; + let selected = (block_false - &allow_true) | &nulls; Self::BlockList(NullableRowAddrSet { selected, nulls }) } (Self::BlockList(a), Self::BlockList(b)) => { From 947f29fc02d3f5b64e29d69edd4fe149e2131e63 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Mon, 9 Feb 2026 17:46:08 +0800 Subject: [PATCH 3/5] chore: add nullable OR comment --- rust/lance-core/src/utils/mask/nullable.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rust/lance-core/src/utils/mask/nullable.rs b/rust/lance-core/src/utils/mask/nullable.rs index 585bd40a08a..3575731e017 100644 --- a/rust/lance-core/src/utils/mask/nullable.rs +++ b/rust/lance-core/src/utils/mask/nullable.rs @@ -243,7 +243,9 @@ impl std::ops::BitOr for NullableRowAddrMask { let nulls = if allow.nulls.is_empty() && block.nulls.is_empty() { RowAddrTreeMap::new() // Fast path } else { - (allow.nulls.clone() & &block_false) | (block.nulls - &allow_true) + // NULL|FALSE=NULL, FALSE|NULL=NULL, NULL|NULL=NULL, TRUE|NULL=TRUE. + // So NULL rows are: (allow NULL & block FALSE) or (block NULL & allow not TRUE). + (allow.nulls & &block_false) | (block.nulls - &allow_true) }; let selected = (block_false - &allow_true) | &nulls; Self::BlockList(NullableRowAddrSet { selected, nulls }) From 1a77a094acee25049a94047c93d4d2b550350059 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Wed, 11 Feb 2026 17:45:43 +0800 Subject: [PATCH 4/5] test: add OR predicate cases to primitives.rs --- rust/lance/tests/query/primitives.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rust/lance/tests/query/primitives.rs b/rust/lance/tests/query/primitives.rs index b2b8b9db5c1..a6173bf83b7 100644 --- a/rust/lance/tests/query/primitives.rs +++ b/rust/lance/tests/query/primitives.rs @@ -79,6 +79,8 @@ async fn test_query_integer(#[case] data_type: DataType) { test_filter(&original, &ds, "NOT (value > 20)").await; test_filter(&original, &ds, "value is null").await; test_filter(&original, &ds, "value is not null").await; + test_filter(&original, &ds, "(value != 0) OR (value < 20)").await; + test_filter(&original, &ds, "NOT ((value != 0) OR (value < 20))").await; }) .await } From 114fe5efa48ac0822060521b7df56f09aac8ed36 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Wed, 11 Feb 2026 17:50:20 +0800 Subject: [PATCH 5/5] test: drop redundant scalar index OR+NULL Python test --- python/python/tests/test_scalar_index.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 7b207b0dd7c..ce6a66944c0 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -2010,23 +2010,6 @@ def test_scalar_index_with_nulls(tmp_path): assert len(result) == k -def test_scalar_index_or_with_nulls_matches_scan(tmp_path: Path): - data = pa.table({"c1": pa.array([None, 1], type=pa.int64())}) - filters = [ - "(c1 != 0) OR (c1 < 5)", - "NOT ((c1 != 0) OR (c1 < 5))", - ] - - ds_no = lance.write_dataset(data, tmp_path / "no") - expected = {f: ds_no.to_table(filter=f).num_rows for f in filters} - - for idx_type in ["BITMAP", "BTREE"]: - ds = lance.write_dataset(data, tmp_path / idx_type.lower()) - ds.create_scalar_index("c1", index_type=idx_type) - for f in filters: - assert ds.to_table(filter=f).num_rows == expected[f] - - def test_label_list_index(tmp_path: Path): tags = pa.array(["tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7"]) tag_list = pa.ListArray.from_arrays([0, 2, 4], tags)