From 29df1da74a01a329d25642a73a5160f5f3dbcbc1 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 25 Sep 2025 11:56:04 -0700 Subject: [PATCH 1/2] cowardly refuse to use scalar indexes on expressions with null literals --- python/python/tests/test_scalar_index.py | 17 ++---- rust/lance-index/src/scalar/expression.rs | 69 ++++++++++++++++++++++- 2 files changed, 73 insertions(+), 13 deletions(-) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index fbac4773205..a4872804a98 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -1710,26 +1710,19 @@ def test_null_handling(tmp_path: Path): ) dataset = lance.write_dataset(tbl, tmp_path / "dataset") - def check(has_index: bool): + def check(): assert dataset.to_table(filter="x IS NULL").num_rows == 1 assert dataset.to_table(filter="x IS NOT NULL").num_rows == 3 assert dataset.to_table(filter="x > 0").num_rows == 3 assert dataset.to_table(filter="x < 5").num_rows == 3 assert dataset.to_table(filter="x IN (1, 2)").num_rows == 2 - # Note: there is a bit of discrepancy here. Datafusion does not consider - # NULL==NULL when doing an IN operation due to classic SQL shenanigans. - # We should decide at some point which behavior we want and make this - # consistent. - if has_index: - assert dataset.to_table(filter="x IN (1, 2, NULL)").num_rows == 3 - else: - assert dataset.to_table(filter="x IN (1, 2, NULL)").num_rows == 2 + assert dataset.to_table(filter="x IN (1, 2, NULL)").num_rows == 2 - check(False) + check() dataset.create_scalar_index("x", index_type="BITMAP") - check(True) + check() dataset.create_scalar_index("x", index_type="BTREE") - check(True) + check() def test_nan_handling(tmp_path: Path): diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index 4d50e618d2d..9fe2a333430 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -253,6 +253,16 @@ impl ScalarQueryParser for SargableQueryParser { low: &Bound, high: &Bound, ) -> Option { + if let Bound::Included(val) | Bound::Excluded(val) = low { + if val.is_null() { + return None; + } + } + if let Bound::Included(val) | Bound::Excluded(val) = high { + if val.is_null() { + return None; + } + } let query = SargableQuery::Range(low.clone(), high.clone()); Some(IndexedExpression::index_query_with_recheck( column.to_string(), @@ -263,6 +273,9 @@ impl ScalarQueryParser for SargableQueryParser { } fn visit_in_list(&self, column: &str, in_list: &[ScalarValue]) -> Option { + if in_list.iter().any(|val| val.is_null()) { + return None; + } let query = SargableQuery::IsIn(in_list.to_vec()); Some(IndexedExpression::index_query_with_recheck( column.to_string(), @@ -296,6 +309,9 @@ impl ScalarQueryParser for SargableQueryParser { value: &ScalarValue, op: &Operator, ) -> Option { + if value.is_null() { + return None; + } let query = match op { Operator::Lt => SargableQuery::Range(Bound::Unbounded, Bound::Excluded(value.clone())), Operator::LtEq => { @@ -1704,6 +1720,7 @@ mod tests { expected: Option, optimize: bool, ) { + println!("Checking expression: {}", expr); let schema = Schema::new(vec![ Field::new("color", DataType::Utf8, false), Field::new("size", DataType::Float32, false), @@ -1941,6 +1958,7 @@ mod tests { Bound::Included(ScalarValue::UInt32(Some(10))), ), ); + // Small in-list (in-list with 3 or fewer items optimizes into or-chain) check_simple( &index_info, "aisle IN (5, 6, 7)", @@ -1971,6 +1989,42 @@ mod tests { ScalarValue::UInt32(Some(7)), ]), ); + check_simple( + &index_info, + "aisle IN (5, 6, 7, 8, 9)", + "aisle", + SargableQuery::IsIn(vec![ + ScalarValue::UInt32(Some(5)), + ScalarValue::UInt32(Some(6)), + ScalarValue::UInt32(Some(7)), + ScalarValue::UInt32(Some(8)), + ScalarValue::UInt32(Some(9)), + ]), + ); + check_simple_negated( + &index_info, + "NOT aisle IN (5, 6, 7, 8, 9)", + "aisle", + SargableQuery::IsIn(vec![ + ScalarValue::UInt32(Some(5)), + ScalarValue::UInt32(Some(6)), + ScalarValue::UInt32(Some(7)), + ScalarValue::UInt32(Some(8)), + ScalarValue::UInt32(Some(9)), + ]), + ); + check_simple_negated( + &index_info, + "aisle NOT IN (5, 6, 7, 8, 9)", + "aisle", + SargableQuery::IsIn(vec![ + ScalarValue::UInt32(Some(5)), + ScalarValue::UInt32(Some(6)), + ScalarValue::UInt32(Some(7)), + ScalarValue::UInt32(Some(8)), + ScalarValue::UInt32(Some(9)), + ]), + ); check_simple( &index_info, "on_sale is false", @@ -2101,6 +2155,19 @@ mod tests { ); // Non-normalized arithmetic (can use expression simplification) - check_no_index(&index_info, "aisle + 3 < 10") + check_no_index(&index_info, "aisle + 3 < 10"); + + // Currently we assume that the return of an index search tells us which rows are + // TRUE and all other rows are FALSE. This will need to change but for now it is + // safer to not support the following cases because the return value of non-matched + // rows is NULL and not FALSE. + check_no_index(&index_info, "aisle IN (5, 6, NULL)"); + // OR-list with NULL (in future DF version this will be optimized repr of + // small in-list with NULL so let's get ready for it) + check_no_index(&index_info, "aisle = 5 OR aisle = 6 OR NULL"); + check_no_index(&index_info, "aisle IN (5, 6, 7, 8, NULL)"); + check_no_index(&index_info, "aisle = NULL"); + check_no_index(&index_info, "aisle BETWEEN 5 AND NULL"); + check_no_index(&index_info, "aisle BETWEEN NULL AND 10"); } } From fe50ca75f85c7f6fffea4861b0c9b9d0ccb9b2e2 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 25 Sep 2025 13:31:53 -0700 Subject: [PATCH 2/2] Remove println --- rust/lance-index/src/scalar/expression.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index 9fe2a333430..047dddeaa93 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -1720,7 +1720,6 @@ mod tests { expected: Option, optimize: bool, ) { - println!("Checking expression: {}", expr); let schema = Schema::new(vec![ Field::new("color", DataType::Utf8, false), Field::new("size", DataType::Float32, false),