From 816cbe5ec779d9f816860f63586492e0d1124f3e Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Wed, 11 Jan 2023 17:59:33 +0100 Subject: [PATCH 1/4] Python: Add visitor to DNF expr into Dask format --- python/pyiceberg/expressions/visitors.py | 79 +++++++++++++++++++++++ python/tests/expressions/test_visitors.py | 21 ++++++ 2 files changed, 100 insertions(+) diff --git a/python/pyiceberg/expressions/visitors.py b/python/pyiceberg/expressions/visitors.py index f312f12c3f1c..5a41a9e4685c 100644 --- a/python/pyiceberg/expressions/visitors.py +++ b/python/pyiceberg/expressions/visitors.py @@ -881,3 +881,82 @@ def rewrite_to_dnf(expr: BooleanExpression) -> Tuple[BooleanExpression, ...]: # (A AND NOT(B) AND C) OR (NOT(D) AND E AND F) OR (G) expr_without_not = rewrite_not(expr) return visit(expr_without_not, _RewriteToDNF()) + + +class _to_dask_format(BoundBooleanExpressionVisitor[List[Tuple[str, str, Any]]]): + def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> List[Tuple[str, str, Any]]: + return [(term.ref().field.name, "in", literals)] + + def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> List[Tuple[str, str, Any]]: + return [(term.ref().field.name, "not in", literals)] + + def visit_is_nan(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]: + return [(term.ref().field.name, "==", float("nan"))] + + def visit_not_nan(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]: + return [(term.ref().field.name, "!=", float("nan"))] + + def visit_is_null(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]: + return [(term.ref().field.name, "==", None)] + + def visit_not_null(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]: + return [(term.ref().field.name, "!=", None)] + + def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + return [(term.ref().field.name, "==", literal.value)] + + def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + return [(term.ref().field.name, "!=", literal.value)] + + def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + return [(term.ref().field.name, ">=", literal.value)] + + def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + return [(term.ref().field.name, ">", literal.value)] + + def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + return [(term.ref().field.name, "<", literal.value)] + + def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + return [(term.ref().field.name, "<=", literal.value)] + + def visit_true(self) -> List[Tuple[str, str, Any]]: + return [] # Not supported + + def visit_false(self) -> List[Tuple[str, str, Any]]: + return [] # Not supported + + def visit_not(self, child_result: List[Tuple[str, str, Any]]) -> List[Tuple[str, str, Any]]: + raise ValueError(f"Not allowed: {child_result}") + + def visit_and( + self, left_result: List[Tuple[str, str, Any]], right_result: List[Tuple[str, str, Any]] + ) -> List[Tuple[str, str, Any]]: + return left_result + right_result + + def visit_or( + self, left_result: List[Tuple[str, str, Any]], right_result: List[Tuple[str, str, Any]] + ) -> List[Tuple[str, str, Any]]: + raise ValueError(f"Not allowed: {left_result} || {right_result}") + + +def dnf_to_dask(expressions: Tuple[BooleanExpression, ...]) -> List[List[Tuple[str, str, Any]]]: + """Formats a Disjunctive Normal Form expression into the format that can be fed into: + + - https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html + - https://docs.dask.org/en/stable/generated/dask.dataframe.read_parquet.html + + Contrary to normal DNF that may contain Not expressions, but here they should have + been rewritten. This can be done using ``rewrite_not(...)``. + + Keep in mind that this is only used for page skipping, and still needs to filter + on a row level. + + Args: + expressions: Expression in Disjunctive Normal Form + + Returns: + Formatter filter compatible with Dask and PyArrow + """ + # In the form of expr1 ∨ expr2 ∨ ... ∨ exprN + return [visit(expression, _to_dask_format()) for expression in expressions] diff --git a/python/tests/expressions/test_visitors.py b/python/tests/expressions/test_visitors.py index e79c353f8168..e08403de52a1 100644 --- a/python/tests/expressions/test_visitors.py +++ b/python/tests/expressions/test_visitors.py @@ -64,6 +64,7 @@ BooleanExpressionVisitor, BoundBooleanExpressionVisitor, _ManifestEvalVisitor, + dnf_to_dask, rewrite_not, rewrite_to_dnf, visit, @@ -1455,3 +1456,23 @@ def test_to_dnf_and() -> None: def test_to_dnf_not_and() -> None: expr = Not(And(Not(EqualTo("Q", "b")), EqualTo("R", "c"))) assert rewrite_to_dnf(expr) == (EqualTo("Q", "b"), NotEqualTo("R", "c")) + + +def test_dnf_to_dask(table_schema_simple: Schema) -> None: + expr = ( + BoundGreaterThan[str]( + term=BoundReference(table_schema_simple.find_field(1), table_schema_simple.accessor_for_field(1)), + literal=literal("hello"), + ), + And( + BoundIn[int]( + term=BoundReference(table_schema_simple.find_field(2), table_schema_simple.accessor_for_field(2)), + literals={literal(1), literal(2), literal(3)}, + ), + BoundEqualTo[bool]( + term=BoundReference(table_schema_simple.find_field(3), table_schema_simple.accessor_for_field(3)), + literal=literal(True), + ), + ), + ) + assert dnf_to_dask(expr) == [[("foo", ">", "hello")], [("bar", "in", {1, 2, 3}), ("baz", "==", True)]] From 9452f00e0e1f05b9af1b73b81390fb67a23f4b50 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Wed, 25 Jan 2023 22:28:04 +0100 Subject: [PATCH 2/4] Raise exception --- python/pyiceberg/expressions/visitors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyiceberg/expressions/visitors.py b/python/pyiceberg/expressions/visitors.py index 5a41a9e4685c..4a7f875c322d 100644 --- a/python/pyiceberg/expressions/visitors.py +++ b/python/pyiceberg/expressions/visitors.py @@ -924,7 +924,7 @@ def visit_true(self) -> List[Tuple[str, str, Any]]: return [] # Not supported def visit_false(self) -> List[Tuple[str, str, Any]]: - return [] # Not supported + raise ValueError("Not supported: AlwaysFalse") def visit_not(self, child_result: List[Tuple[str, str, Any]]) -> List[Tuple[str, str, Any]]: raise ValueError(f"Not allowed: {child_result}") From b44504e311d6bf13942bb2bbbe17fba7bba6ba37 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Wed, 25 Jan 2023 22:34:56 +0100 Subject: [PATCH 3/4] Fix naming --- python/pyiceberg/expressions/visitors.py | 2 +- python/tests/expressions/test_visitors.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyiceberg/expressions/visitors.py b/python/pyiceberg/expressions/visitors.py index 4a7f875c322d..b7305d215ab0 100644 --- a/python/pyiceberg/expressions/visitors.py +++ b/python/pyiceberg/expressions/visitors.py @@ -940,7 +940,7 @@ def visit_or( raise ValueError(f"Not allowed: {left_result} || {right_result}") -def dnf_to_dask(expressions: Tuple[BooleanExpression, ...]) -> List[List[Tuple[str, str, Any]]]: +def expr_to_dnf(expressions: Tuple[BooleanExpression, ...]) -> List[List[Tuple[str, str, Any]]]: """Formats a Disjunctive Normal Form expression into the format that can be fed into: - https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html diff --git a/python/tests/expressions/test_visitors.py b/python/tests/expressions/test_visitors.py index 50ff700be5ff..ece714b4e1b0 100644 --- a/python/tests/expressions/test_visitors.py +++ b/python/tests/expressions/test_visitors.py @@ -64,7 +64,7 @@ BooleanExpressionVisitor, BoundBooleanExpressionVisitor, _ManifestEvalVisitor, - dnf_to_dask, + expr_to_dnf, rewrite_not, rewrite_to_dnf, visit, @@ -1471,4 +1471,4 @@ def test_dnf_to_dask(table_schema_simple: Schema) -> None: ), ), ) - assert dnf_to_dask(expr) == [[("foo", ">", "hello")], [("bar", "in", {1, 2, 3}), ("baz", "==", True)]] + assert expr_to_dnf(expr) == [[("foo", ">", "hello")], [("bar", "in", {1, 2, 3}), ("baz", "==", True)]] From c7bbe182a1798079278afc04a1aa30fed87a3419 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 26 Jan 2023 17:31:36 +0100 Subject: [PATCH 4/4] Fix naming (again) --- python/pyiceberg/expressions/visitors.py | 6 +++--- python/tests/expressions/test_visitors.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/pyiceberg/expressions/visitors.py b/python/pyiceberg/expressions/visitors.py index b7305d215ab0..a76b183a98f2 100644 --- a/python/pyiceberg/expressions/visitors.py +++ b/python/pyiceberg/expressions/visitors.py @@ -883,7 +883,7 @@ def rewrite_to_dnf(expr: BooleanExpression) -> Tuple[BooleanExpression, ...]: return visit(expr_without_not, _RewriteToDNF()) -class _to_dask_format(BoundBooleanExpressionVisitor[List[Tuple[str, str, Any]]]): +class ExpressionToPlainFormat(BoundBooleanExpressionVisitor[List[Tuple[str, str, Any]]]): def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> List[Tuple[str, str, Any]]: return [(term.ref().field.name, "in", literals)] @@ -940,7 +940,7 @@ def visit_or( raise ValueError(f"Not allowed: {left_result} || {right_result}") -def expr_to_dnf(expressions: Tuple[BooleanExpression, ...]) -> List[List[Tuple[str, str, Any]]]: +def expression_to_plain_format(expressions: Tuple[BooleanExpression, ...]) -> List[List[Tuple[str, str, Any]]]: """Formats a Disjunctive Normal Form expression into the format that can be fed into: - https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html @@ -959,4 +959,4 @@ def expr_to_dnf(expressions: Tuple[BooleanExpression, ...]) -> List[List[Tuple[s Formatter filter compatible with Dask and PyArrow """ # In the form of expr1 ∨ expr2 ∨ ... ∨ exprN - return [visit(expression, _to_dask_format()) for expression in expressions] + return [visit(expression, ExpressionToPlainFormat()) for expression in expressions] diff --git a/python/tests/expressions/test_visitors.py b/python/tests/expressions/test_visitors.py index ece714b4e1b0..40e97004ad0e 100644 --- a/python/tests/expressions/test_visitors.py +++ b/python/tests/expressions/test_visitors.py @@ -64,7 +64,7 @@ BooleanExpressionVisitor, BoundBooleanExpressionVisitor, _ManifestEvalVisitor, - expr_to_dnf, + expression_to_plain_format, rewrite_not, rewrite_to_dnf, visit, @@ -1471,4 +1471,4 @@ def test_dnf_to_dask(table_schema_simple: Schema) -> None: ), ), ) - assert expr_to_dnf(expr) == [[("foo", ">", "hello")], [("bar", "in", {1, 2, 3}), ("baz", "==", True)]] + assert expression_to_plain_format(expr) == [[("foo", ">", "hello")], [("bar", "in", {1, 2, 3}), ("baz", "==", True)]]