diff --git a/python/pyiceberg/expressions/visitors.py b/python/pyiceberg/expressions/visitors.py
index f312f12c3f1c..a76b183a98f2 100644
--- a/python/pyiceberg/expressions/visitors.py
+++ b/python/pyiceberg/expressions/visitors.py
@@ -881,3 +881,109 @@ def rewrite_to_dnf(expr: BooleanExpression) -> Tuple[BooleanExpression, ...]:
     # (A AND NOT(B) AND C) OR (NOT(D) AND E AND F) OR (G)
     expr_without_not = rewrite_not(expr)
     return visit(expr_without_not, _RewriteToDNF())
+
+
+class ExpressionToPlainFormat(BoundBooleanExpressionVisitor[List[Tuple[str, str, Any]]]):
+    """Flatten one bound conjunction into ``(column, operator, value)`` tuples.
+
+    Each predicate produces a single-element list and ``And`` concatenates the
+    lists of its children, so the result reads as ``pred1 AND pred2 AND ...``.
+    ``Not``, ``Or`` and ``AlwaysFalse`` cannot be expressed in this format and
+    raise ``ValueError``.
+
+    An empty list means "no constraint": it is returned for ``AlwaysTrue`` and
+    for the NaN predicates, which plain comparison tuples cannot represent.
+    """
+
+    def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> List[Tuple[str, str, Any]]:
+        return [(term.ref().field.name, "in", literals)]
+
+    def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> List[Tuple[str, str, Any]]:
+        return [(term.ref().field.name, "not in", literals)]
+
+    def visit_is_nan(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]:
+        # NaN never compares equal to anything, itself included, so an
+        # ``== nan`` tuple would match no rows at all. Emit no constraint
+        # instead; the predicate is still applied by the row-level filter.
+        return []
+
+    def visit_not_nan(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]:
+        # ``!= nan`` holds for every value, so it cannot prune anything.
+        return []
+
+    def visit_is_null(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]:
+        # NOTE(review): the meaning of ``== None`` depends on the consumer;
+        # confirm that PyArrow and Dask evaluate it as an is-null test.
+        return [(term.ref().field.name, "==", None)]
+
+    def visit_not_null(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]:
+        # NOTE(review): same caveat as ``visit_is_null`` for ``!= None``.
+        return [(term.ref().field.name, "!=", None)]
+
+    def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]:
+        return [(term.ref().field.name, "==", literal.value)]
+
+    def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]:
+        return [(term.ref().field.name, "!=", literal.value)]
+
+    def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]:
+        return [(term.ref().field.name, ">=", literal.value)]
+
+    def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]:
+        return [(term.ref().field.name, ">", literal.value)]
+
+    def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]:
+        return [(term.ref().field.name, "<", literal.value)]
+
+    def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]:
+        return [(term.ref().field.name, "<=", literal.value)]
+
+    def visit_true(self) -> List[Tuple[str, str, Any]]:
+        # AlwaysTrue adds no constraint to the conjunction.
+        return []
+
+    def visit_false(self) -> List[Tuple[str, str, Any]]:
+        raise ValueError("Not supported: AlwaysFalse")
+
+    def visit_not(self, child_result: List[Tuple[str, str, Any]]) -> List[Tuple[str, str, Any]]:
+        raise ValueError(f"Not allowed: {child_result}")
+
+    def visit_and(
+        self, left_result: List[Tuple[str, str, Any]], right_result: List[Tuple[str, str, Any]]
+    ) -> List[Tuple[str, str, Any]]:
+        return left_result + right_result
+
+    def visit_or(
+        self, left_result: List[Tuple[str, str, Any]], right_result: List[Tuple[str, str, Any]]
+    ) -> List[Tuple[str, str, Any]]:
+        raise ValueError(f"Not allowed: {left_result} || {right_result}")
+
+
+def expression_to_plain_format(expressions: Tuple[BooleanExpression, ...]) -> List[List[Tuple[str, str, Any]]]:
+    """Format a Disjunctive Normal Form expression as a list of tuple filters.
+
+    The result can be fed into the ``filters`` argument of:
+
+    - https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
+    - https://docs.dask.org/en/stable/generated/dask.dataframe.read_parquet.html
+
+    Unlike general DNF, the conjunctions must not contain ``Not`` expressions;
+    rewrite them away beforehand using ``rewrite_not(...)``.
+
+    Keep in mind that this is only used for page skipping, and the rows still
+    need to be filtered on a row level.
+
+    Args:
+        expressions: Expression in Disjunctive Normal Form, one element per
+            conjunction
+
+    Returns:
+        Filter compatible with Dask and PyArrow; the outer list is the
+        disjunction and every inner list is one conjunction. An inner list is
+        empty when its conjunction yields no constraint.
+
+    Raises:
+        ValueError: If a conjunction contains ``Not``, ``Or`` or ``AlwaysFalse``
+    """
+    # In the form of expr1 ∨ expr2 ∨ ... ∨ exprN
+    return [visit(expression, ExpressionToPlainFormat()) for expression in expressions]
diff --git a/python/tests/expressions/test_visitors.py b/python/tests/expressions/test_visitors.py
index 13e4d01eea35..40e97004ad0e 100644
--- a/python/tests/expressions/test_visitors.py
+++ b/python/tests/expressions/test_visitors.py
@@ -64,6 +64,7 @@
     BooleanExpressionVisitor,
     BoundBooleanExpressionVisitor,
     _ManifestEvalVisitor,
+    expression_to_plain_format,
     rewrite_not,
     rewrite_to_dnf,
     visit,
@@ -1451,3 +1452,23 @@ def test_to_dnf_and() -> None:
 def test_to_dnf_not_and() -> None:
     expr = Not(And(Not(EqualTo("Q", "b")), EqualTo("R", "c")))
     assert rewrite_to_dnf(expr) == (EqualTo("Q", "b"), NotEqualTo("R", "c"))
+
+
+def test_dnf_to_dask(table_schema_simple: Schema) -> None:
+    expr = (
+        BoundGreaterThan[str](
+            term=BoundReference(table_schema_simple.find_field(1), table_schema_simple.accessor_for_field(1)),
+            literal=literal("hello"),
+        ),
+        And(
+            BoundIn[int](
+                term=BoundReference(table_schema_simple.find_field(2), table_schema_simple.accessor_for_field(2)),
+                literals={literal(1), literal(2), literal(3)},
+            ),
+            BoundEqualTo[bool](
+                term=BoundReference(table_schema_simple.find_field(3), table_schema_simple.accessor_for_field(3)),
+                literal=literal(True),
+            ),
+        ),
+    )
+    assert expression_to_plain_format(expr) == [[("foo", ">", "hello")], [("bar", "in", {1, 2, 3}), ("baz", "==", True)]]