Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 5 additions & 12 deletions python/python/tests/test_scalar_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1710,26 +1710,19 @@ def test_null_handling(tmp_path: Path):
)
dataset = lance.write_dataset(tbl, tmp_path / "dataset")

def check(has_index: bool):
def check():
assert dataset.to_table(filter="x IS NULL").num_rows == 1
assert dataset.to_table(filter="x IS NOT NULL").num_rows == 3
assert dataset.to_table(filter="x > 0").num_rows == 3
assert dataset.to_table(filter="x < 5").num_rows == 3
assert dataset.to_table(filter="x IN (1, 2)").num_rows == 2
# Note: there is a bit of discrepancy here. Datafusion does not consider
# NULL==NULL when doing an IN operation due to classic SQL shenanigans.
# We should decide at some point which behavior we want and make this
# consistent.
if has_index:
assert dataset.to_table(filter="x IN (1, 2, NULL)").num_rows == 3
else:
assert dataset.to_table(filter="x IN (1, 2, NULL)").num_rows == 2
assert dataset.to_table(filter="x IN (1, 2, NULL)").num_rows == 2

check(False)
check()
dataset.create_scalar_index("x", index_type="BITMAP")
check(True)
check()
dataset.create_scalar_index("x", index_type="BTREE")
check(True)
check()


def test_nan_handling(tmp_path: Path):
Expand Down
68 changes: 67 additions & 1 deletion rust/lance-index/src/scalar/expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,16 @@ impl ScalarQueryParser for SargableQueryParser {
low: &Bound<ScalarValue>,
high: &Bound<ScalarValue>,
) -> Option<IndexedExpression> {
if let Bound::Included(val) | Bound::Excluded(val) = low {
if val.is_null() {
return None;
}
}
if let Bound::Included(val) | Bound::Excluded(val) = high {
if val.is_null() {
return None;
}
}
let query = SargableQuery::Range(low.clone(), high.clone());
Some(IndexedExpression::index_query_with_recheck(
column.to_string(),
Expand All @@ -263,6 +273,9 @@ impl ScalarQueryParser for SargableQueryParser {
}

fn visit_in_list(&self, column: &str, in_list: &[ScalarValue]) -> Option<IndexedExpression> {
if in_list.iter().any(|val| val.is_null()) {
return None;
}
let query = SargableQuery::IsIn(in_list.to_vec());
Some(IndexedExpression::index_query_with_recheck(
column.to_string(),
Expand Down Expand Up @@ -296,6 +309,9 @@ impl ScalarQueryParser for SargableQueryParser {
value: &ScalarValue,
op: &Operator,
) -> Option<IndexedExpression> {
if value.is_null() {
return None;
}
let query = match op {
Operator::Lt => SargableQuery::Range(Bound::Unbounded, Bound::Excluded(value.clone())),
Operator::LtEq => {
Expand Down Expand Up @@ -1941,6 +1957,7 @@ mod tests {
Bound::Included(ScalarValue::UInt32(Some(10))),
),
);
// Small in-list (an in-list with 3 or fewer items is optimized into an OR-chain)
check_simple(
&index_info,
"aisle IN (5, 6, 7)",
Expand Down Expand Up @@ -1971,6 +1988,42 @@ mod tests {
ScalarValue::UInt32(Some(7)),
]),
);
check_simple(
&index_info,
"aisle IN (5, 6, 7, 8, 9)",
"aisle",
SargableQuery::IsIn(vec![
ScalarValue::UInt32(Some(5)),
ScalarValue::UInt32(Some(6)),
ScalarValue::UInt32(Some(7)),
ScalarValue::UInt32(Some(8)),
ScalarValue::UInt32(Some(9)),
]),
);
check_simple_negated(
&index_info,
"NOT aisle IN (5, 6, 7, 8, 9)",
"aisle",
SargableQuery::IsIn(vec![
ScalarValue::UInt32(Some(5)),
ScalarValue::UInt32(Some(6)),
ScalarValue::UInt32(Some(7)),
ScalarValue::UInt32(Some(8)),
ScalarValue::UInt32(Some(9)),
]),
);
check_simple_negated(
&index_info,
"aisle NOT IN (5, 6, 7, 8, 9)",
"aisle",
SargableQuery::IsIn(vec![
ScalarValue::UInt32(Some(5)),
ScalarValue::UInt32(Some(6)),
ScalarValue::UInt32(Some(7)),
ScalarValue::UInt32(Some(8)),
ScalarValue::UInt32(Some(9)),
]),
);
check_simple(
&index_info,
"on_sale is false",
Expand Down Expand Up @@ -2101,6 +2154,19 @@ mod tests {
);

// Non-normalized arithmetic (can use expression simplification)
check_no_index(&index_info, "aisle + 3 < 10")
check_no_index(&index_info, "aisle + 3 < 10");

// Currently we assume that the result of an index search tells us which rows are
// TRUE and that all other rows are FALSE. This will need to change, but for now it
// is safer not to use the index for the following cases, because in them the correct
// value for non-matched rows is NULL rather than FALSE.
check_no_index(&index_info, "aisle IN (5, 6, NULL)");
// OR-chain containing NULL. A future DataFusion version will use this as the
// optimized representation of a small in-list with NULL, so cover it now.
check_no_index(&index_info, "aisle = 5 OR aisle = 6 OR NULL");
check_no_index(&index_info, "aisle IN (5, 6, 7, 8, NULL)");
check_no_index(&index_info, "aisle = NULL");
check_no_index(&index_info, "aisle BETWEEN 5 AND NULL");
check_no_index(&index_info, "aisle BETWEEN NULL AND 10");
}
}
Loading