From 9da77dede17619eeed294f030287d4ba324db747 Mon Sep 17 00:00:00 2001 From: Daniel Weeks Date: Sat, 21 Oct 2023 13:43:35 -0700 Subject: [PATCH 1/6] Update like statements to reflect sql behaviors --- pyiceberg/expressions/parser.py | 13 ++++++++++++- tests/expressions/test_parser.py | 23 ++++++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/pyiceberg/expressions/parser.py b/pyiceberg/expressions/parser.py index 45805331be..f47f85e459 100644 --- a/pyiceberg/expressions/parser.py +++ b/pyiceberg/expressions/parser.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import re from decimal import Decimal from pyparsing import ( @@ -78,6 +79,7 @@ identifier = Word(alphas, alphanums + "_$").set_results_name("identifier") column = DelimitedList(identifier, delim=".", combine=False).set_results_name("column") +like_regex = r'(?P<valid_wildcard>(?<!\\)%$)|(?P<invalid_wildcard>(?<!\\)%)' @column.set_parse_action def _(result: ParseResults) -> Reference: @@ -217,7 +219,16 @@ def _(result: ParseResults) -> BooleanExpression: @starts_with.set_parse_action def _(result: ParseResults) -> BooleanExpression: - return StartsWith(result.column, result.raw_quoted_string) + literal_like: StringLiteral = result.raw_quoted_string + + match = re.search(like_regex, literal_like.value) + + if match and match.groupdict()['invalid_wildcard']: + raise ValueError("LIKE expression only supports wildcard, '%', at the end of a string") + elif match and match.groupdict()['valid_wildcard']: + return StartsWith(result.column, StringLiteral(literal_like.value[:-1].replace('\\%', '%'))) + else: + return EqualTo(result.column, StringLiteral(literal_like.value.replace('\\%', '%'))) @not_starts_with.set_parse_action diff --git a/tests/expressions/test_parser.py b/tests/expressions/test_parser.py index 65415f2e9a..439f7c27ba 100644 --- a/tests/expressions/test_parser.py +++ b/tests/expressions/test_parser.py @@ -168,8 +168,29 @@ def test_multiple_and_or() -> None: ) ==
parser.parse("foo is not null and foo < 5 or (foo > 10 and foo < 100 and bar is null)") +def test_like_equality() -> None: + assert EqualTo("foo", "data") == parser.parse("foo LIKE 'data'") + assert EqualTo("foo", "data%") == parser.parse("foo LIKE 'data\\%'") + + def test_starts_with() -> None: - assert StartsWith("foo", "data") == parser.parse("foo LIKE 'data'") + assert StartsWith("foo", "data") == parser.parse("foo LIKE 'data%'") + assert StartsWith("foo", "some % data") == parser.parse("foo LIKE 'some \\% data%'") + assert StartsWith("foo", "some data%") == parser.parse("foo LIKE 'some data\\%%'") + + +def test_invalid_likes() -> None: + invalid_statements = [ + "foo LIKE '%data%'", + "foo LIKE 'da%ta'" + "foo LIKE '%data'" + ] + + for statement in invalid_statements: + with pytest.raises(ValueError) as exc_info: + parser.parse(statement) + + assert "LIKE expression only supports wildcard, '%', at the end of a string" in str(exc_info) def test_not_starts_with() -> None: From f07e21f8064f28ac50bb63789f18abddfb00721a Mon Sep 17 00:00:00 2001 From: Daniel Weeks Date: Sat, 21 Oct 2023 13:48:09 -0700 Subject: [PATCH 2/6] Codestyle --- pyiceberg/expressions/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/expressions/parser.py b/pyiceberg/expressions/parser.py index f47f85e459..ed7af3701d 100644 --- a/pyiceberg/expressions/parser.py +++ b/pyiceberg/expressions/parser.py @@ -223,7 +223,7 @@ def _(result: ParseResults) -> BooleanExpression: match = re.search(like_regex, literal_like.value) - if match and match.groupdict()['invalid_wildcard']: + if match and match.groupdict()['invalid_wildcard']: raise ValueError("LIKE expression only supports wildcard, '%', at the end of a string") elif match and match.groupdict()['valid_wildcard']: return StartsWith(result.column, StringLiteral(literal_like.value[:-1].replace('\\%', '%'))) From 94b735488734f9e7ac8d5a08162dada2d38cf608 Mon Sep 17 00:00:00 2001 From: Daniel Weeks Date: Sat, 21 Oct 
2023 13:53:20 -0700 Subject: [PATCH 3/6] Codestyle --- pyiceberg/expressions/parser.py | 1 + tests/expressions/test_parser.py | 6 +----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pyiceberg/expressions/parser.py b/pyiceberg/expressions/parser.py index ed7af3701d..594d406cd1 100644 --- a/pyiceberg/expressions/parser.py +++ b/pyiceberg/expressions/parser.py @@ -81,6 +81,7 @@ like_regex = r'(?P<valid_wildcard>(?<!\\)%$)|(?P<invalid_wildcard>(?<!\\)%)' + @column.set_parse_action def _(result: ParseResults) -> Reference: return Reference(result.column[-1]) diff --git a/tests/expressions/test_parser.py b/tests/expressions/test_parser.py index 439f7c27ba..a41600f7f7 100644 --- a/tests/expressions/test_parser.py +++ b/tests/expressions/test_parser.py @@ -180,11 +180,7 @@ def test_starts_with() -> None: def test_invalid_likes() -> None: - invalid_statements = [ - "foo LIKE '%data%'", - "foo LIKE 'da%ta'" - "foo LIKE '%data'" - ] + invalid_statements = ["foo LIKE '%data%'", "foo LIKE 'da%ta'" "foo LIKE '%data'"] for statement in invalid_statements: with pytest.raises(ValueError) as exc_info: From d36c82a2c87593399497874bdd999ca39d865985 Mon Sep 17 00:00:00 2001 From: Daniel Weeks Date: Sat, 21 Oct 2023 14:29:23 -0700 Subject: [PATCH 4/6] Handle NotStartsWith --- pyiceberg/expressions/parser.py | 17 ++++++++++------- tests/expressions/test_parser.py | 5 +++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pyiceberg/expressions/parser.py b/pyiceberg/expressions/parser.py index 594d406cd1..c2280f74f5 100644 --- a/pyiceberg/expressions/parser.py +++ b/pyiceberg/expressions/parser.py @@ -52,7 +52,6 @@ NotIn, NotNaN, NotNull, - NotStartsWith, Or, Reference, StartsWith, @@ -220,23 +219,27 @@ def _(result: ParseResults) -> BooleanExpression: @starts_with.set_parse_action def _(result: ParseResults) -> BooleanExpression: + return _evaluate_like_statement(result) + + +@not_starts_with.set_parse_action +def _(result: ParseResults) -> BooleanExpression: + return _evaluate_like_statement(result).__invert__() + + +def _evaluate_like_statement(result: ParseResults)
-> BooleanExpression: literal_like: StringLiteral = result.raw_quoted_string match = re.search(like_regex, literal_like.value) if match and match.groupdict()['invalid_wildcard']: - raise ValueError("LIKE expression only supports wildcard, '%', at the end of a string") + raise ValueError("LIKE expressions only supports wildcard, '%', at the end of a string") elif match and match.groupdict()['valid_wildcard']: return StartsWith(result.column, StringLiteral(literal_like.value[:-1].replace('\\%', '%'))) else: return EqualTo(result.column, StringLiteral(literal_like.value.replace('\\%', '%'))) -@not_starts_with.set_parse_action -def _(result: ParseResults) -> BooleanExpression: - return NotStartsWith(result.column, result.raw_quoted_string) - - predicate = (comparison | in_check | null_check | nan_check | starts_check | boolean).set_results_name("predicate") diff --git a/tests/expressions/test_parser.py b/tests/expressions/test_parser.py index a41600f7f7..71157e7cfa 100644 --- a/tests/expressions/test_parser.py +++ b/tests/expressions/test_parser.py @@ -186,11 +186,12 @@ def test_invalid_likes() -> None: with pytest.raises(ValueError) as exc_info: parser.parse(statement) - assert "LIKE expression only supports wildcard, '%', at the end of a string" in str(exc_info) + assert "LIKE expressions only supports wildcard, '%', at the end of a string" in str(exc_info) def test_not_starts_with() -> None: - assert NotStartsWith("foo", "data") == parser.parse("foo NOT LIKE 'data'") + assert NotEqualTo("foo", "data") == parser.parse("foo NOT LIKE 'data'") + assert NotStartsWith("foo", "data") == parser.parse("foo NOT LIKE 'data%'") def test_with_function() -> None: From cf7087cf4fbeb731eb38254bb0dc255ffaf90b54 Mon Sep 17 00:00:00 2001 From: Daniel Weeks Date: Sat, 21 Oct 2023 15:30:23 -0700 Subject: [PATCH 5/6] Update pyiceberg/expressions/parser.py Co-authored-by: Fokko Driesprong --- pyiceberg/expressions/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pyiceberg/expressions/parser.py b/pyiceberg/expressions/parser.py index c2280f74f5..8873907813 100644 --- a/pyiceberg/expressions/parser.py +++ b/pyiceberg/expressions/parser.py @@ -224,7 +224,7 @@ def _(result: ParseResults) -> BooleanExpression: @not_starts_with.set_parse_action def _(result: ParseResults) -> BooleanExpression: - return _evaluate_like_statement(result).__invert__() + return ~_evaluate_like_statement(result) def _evaluate_like_statement(result: ParseResults) -> BooleanExpression: From 4794b606ccc1f26871c34acb7d2841e57f4a83a6 Mon Sep 17 00:00:00 2001 From: Daniel Weeks Date: Sat, 21 Oct 2023 15:30:53 -0700 Subject: [PATCH 6/6] Update tests/expressions/test_parser.py Co-authored-by: Fokko Driesprong --- tests/expressions/test_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/expressions/test_parser.py b/tests/expressions/test_parser.py index 71157e7cfa..8257710f66 100644 --- a/tests/expressions/test_parser.py +++ b/tests/expressions/test_parser.py @@ -180,7 +180,7 @@ def test_starts_with() -> None: def test_invalid_likes() -> None: - invalid_statements = ["foo LIKE '%data%'", "foo LIKE 'da%ta'" "foo LIKE '%data'"] + invalid_statements = ["foo LIKE '%data%'", "foo LIKE 'da%ta'", "foo LIKE '%data'"] for statement in invalid_statements: with pytest.raises(ValueError) as exc_info: