Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions python/pyiceberg/expressions/visitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,3 +881,82 @@ def rewrite_to_dnf(expr: BooleanExpression) -> Tuple[BooleanExpression, ...]:
# (A AND NOT(B) AND C) OR (NOT(D) AND E AND F) OR (G)
expr_without_not = rewrite_not(expr)
return visit(expr_without_not, _RewriteToDNF())


class ExpressionToPlainFormat(BoundBooleanExpressionVisitor[List[Tuple[str, str, Any]]]):
def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> List[Tuple[str, str, Any]]:
return [(term.ref().field.name, "in", literals)]

def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> List[Tuple[str, str, Any]]:
return [(term.ref().field.name, "not in", literals)]

def visit_is_nan(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]:
return [(term.ref().field.name, "==", float("nan"))]

def visit_not_nan(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]:
return [(term.ref().field.name, "!=", float("nan"))]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NaN is always not equal to itself, at least in Java. Are we sure that this works?


def visit_is_null(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]:
return [(term.ref().field.name, "==", None)]

def visit_not_null(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]:
return [(term.ref().field.name, "!=", None)]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see anything in the docs that indicate this is the right way to pass this, so we should make sure there are tests for it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, and I have #6398 lined up to exactly test this. I'll revive the PR tomorrow.


def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]:
return [(term.ref().field.name, "==", literal.value)]

def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]:
return [(term.ref().field.name, "!=", literal.value)]

def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]:
return [(term.ref().field.name, ">=", literal.value)]

def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]:
return [(term.ref().field.name, ">", literal.value)]

def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]:
return [(term.ref().field.name, "<", literal.value)]

def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]:
return [(term.ref().field.name, "<=", literal.value)]

def visit_true(self) -> List[Tuple[str, str, Any]]:
return [] # Not supported
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is okay. Zero filters basically do the same thing as true. The problem is converting false into the same thing. I think for that, we should throw an exception because it cannot be safely handled.

Copy link
Contributor Author

@Fokko Fokko Jan 25, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm okay with raising an error. Keep in mind that for Dask (and also PyArrow) this is only used to skip Parquet pages, we have to do a final pass to filter on a row level anyway:

image

(from the abovementioned blog)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This also allows us to rewrite an expression to read PyArrow tables with filters: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html


def visit_false(self) -> List[Tuple[str, str, Any]]:
raise ValueError("Not supported: AlwaysFalse")

def visit_not(self, child_result: List[Tuple[str, str, Any]]) -> List[Tuple[str, str, Any]]:
raise ValueError(f"Not allowed: {child_result}")

def visit_and(
self, left_result: List[Tuple[str, str, Any]], right_result: List[Tuple[str, str, Any]]
) -> List[Tuple[str, str, Any]]:
return left_result + right_result

def visit_or(
self, left_result: List[Tuple[str, str, Any]], right_result: List[Tuple[str, str, Any]]
) -> List[Tuple[str, str, Any]]:
raise ValueError(f"Not allowed: {left_result} || {right_result}")


def expression_to_plain_format(expressions: Tuple[BooleanExpression, ...]) -> List[List[Tuple[str, str, Any]]]:
"""Formats a Disjunctive Normal Form expression into the format that can be fed into:

- https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
- https://docs.dask.org/en/stable/generated/dask.dataframe.read_parquet.html

Contrary to normal DNF that may contain Not expressions, but here they should have
been rewritten. This can be done using ``rewrite_not(...)``.

Keep in mind that this is only used for page skipping, and still needs to filter
on a row level.

Args:
expressions: Expression in Disjunctive Normal Form

Returns:
Formatter filter compatible with Dask and PyArrow
"""
# In the form of expr1 ∨ expr2 ∨ ... ∨ exprN
return [visit(expression, ExpressionToPlainFormat()) for expression in expressions]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this reuse the ExpressionToPlainFormat instance?

Copy link
Contributor Author

@Fokko Fokko Jan 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, good call! I've pulled this out of the loop

21 changes: 21 additions & 0 deletions python/tests/expressions/test_visitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
BooleanExpressionVisitor,
BoundBooleanExpressionVisitor,
_ManifestEvalVisitor,
expression_to_plain_format,
rewrite_not,
rewrite_to_dnf,
visit,
Expand Down Expand Up @@ -1451,3 +1452,23 @@ def test_to_dnf_and() -> None:
def test_to_dnf_not_and() -> None:
expr = Not(And(Not(EqualTo("Q", "b")), EqualTo("R", "c")))
assert rewrite_to_dnf(expr) == (EqualTo("Q", "b"), NotEqualTo("R", "c"))


def test_dnf_to_dask(table_schema_simple: Schema) -> None:
expr = (
BoundGreaterThan[str](
term=BoundReference(table_schema_simple.find_field(1), table_schema_simple.accessor_for_field(1)),
literal=literal("hello"),
),
And(
BoundIn[int](
term=BoundReference(table_schema_simple.find_field(2), table_schema_simple.accessor_for_field(2)),
literals={literal(1), literal(2), literal(3)},
),
BoundEqualTo[bool](
term=BoundReference(table_schema_simple.find_field(3), table_schema_simple.accessor_for_field(3)),
literal=literal(True),
),
),
)
assert expression_to_plain_format(expr) == [[("foo", ">", "hello")], [("bar", "in", {1, 2, 3}), ("baz", "==", True)]]