diff --git a/pyiceberg/expressions/parser.py b/pyiceberg/expressions/parser.py
index 8873907813..9116b3f835 100644
--- a/pyiceberg/expressions/parser.py
+++ b/pyiceberg/expressions/parser.py
@@ -62,6 +62,7 @@
     LongLiteral,
     StringLiteral,
 )
+from pyiceberg.transforms import UnboundTransform
 from pyiceberg.typedef import L
 
 ParserElement.enablePackrat()
@@ -74,6 +75,8 @@
 NULL = CaselessKeyword("null")
 NAN = CaselessKeyword("nan")
 LIKE = CaselessKeyword("like")
+CAST = CaselessKeyword("cast")
+AS = CaselessKeyword("as")
 
 identifier = Word(alphas, alphanums + "_$").set_results_name("identifier")
 column = DelimitedList(identifier, delim=".", combine=False).set_results_name("column")
@@ -240,7 +243,21 @@ def _evaluate_like_statement(result: ParseResults) -> BooleanExpression:
         return EqualTo(result.column, StringLiteral(literal_like.value.replace('\\%', '%')))
 
 
-predicate = (comparison | in_check | null_check | nan_check | starts_check | boolean).set_results_name("predicate")
+cast_expression = (CAST + "(" + column + AS + identifier + Suppress(")")).set_results_name("cast")
+
+
+@cast_expression.set_parse_action
+def _(result: ParseResults) -> UnboundTransform[L]:
+    """Build an UnboundTransform from a parsed ``CAST(<column> AS <name>)`` expression."""
+    # NOTE(review): result[-1] appears to be the raw identifier token parsed after AS,
+    # while UnboundTransform annotates its second argument as a Transform -- confirm
+    # whether the name must be resolved to a Transform instance here.
+    return UnboundTransform(result.column, result[-1])
+
+
+predicate = (cast_expression | comparison | in_check | null_check | nan_check | starts_check | boolean).set_results_name(
+    "predicate"
+)
 
 
 def handle_not(result: ParseResults) -> Not:
diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py
index b9afae9a7e..89df93e4a4 100644
--- a/pyiceberg/transforms.py
+++ b/pyiceberg/transforms.py
@@ -20,7 +20,7 @@
 from abc import ABC, abstractmethod
 from enum import IntEnum
 from functools import singledispatch
-from typing import Any, Callable, Generic, Optional, TypeVar
+from typing import Any, Callable, Generic, Optional, Type, TypeVar
 from typing import Literal as LiteralType
 from uuid import UUID
 
@@ -38,6 +38,7 @@
     BoundNotIn,
     BoundNotStartsWith,
     BoundPredicate,
+    BoundReference,
     BoundSetPredicate,
     BoundStartsWith,
     BoundTerm,
@@ -49,6 +50,7 @@
     Reference,
     StartsWith,
     UnboundPredicate,
+    UnboundTerm,
 )
 from pyiceberg.expressions.literals import (
     DateLiteral,
@@ -58,7 +60,8 @@
     TimestampLiteral,
     literal,
 )
-from pyiceberg.typedef import IcebergRootModel, L
+from pyiceberg.schema import Schema
+from pyiceberg.typedef import IcebergRootModel, L, StructProtocol
 from pyiceberg.types import (
     BinaryType,
     DateType,
@@ -821,3 +824,41 @@ class BoundTransform(BoundTerm[L]):
     def __init__(self, term: BoundTerm[L], transform: Transform[L, Any]):
         self.term: BoundTerm[L] = term
         self.transform = transform
+
+    def eval(self, struct: StructProtocol) -> Any:
+        """Return the transformed value of the wrapped term, evaluated against a StructProtocol object."""
+        source_type = self.term.ref().field.field_type
+        return self.transform.transform(source_type)(self.term.eval(struct))
+
+    def ref(self) -> BoundReference[L]:
+        """Return the bound reference of the wrapped term."""
+        return self.term.ref()
+
+
+class UnboundTransform(UnboundTerm[L]):
+    """An unbound transform expression: a transform applied to a not-yet-bound term."""
+
+    transform: Transform[L, Any]
+
+    def __init__(self, term: UnboundTerm[L], transform: Transform[L, Any]):
+        self.term: UnboundTerm[L] = term
+        self.transform = transform
+
+    def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundTransform[L]:
+        """Bind the wrapped term to the schema and wrap the result in a BoundTransform.
+
+        Raises:
+            ValueError: If the transform cannot be applied to the type of the bound field.
+        """
+        bound_term = self.term.bind(schema, case_sensitive)
+        field_type = bound_term.ref().field.field_type
+
+        if not self.transform.can_transform(field_type):
+            raise ValueError(f"Cannot apply transform {self.transform} to field of type {field_type}")
+
+        return BoundTransform(bound_term, self.transform)
+
+    @property
+    def as_bound(self) -> Type[BoundTransform[L]]:
+        """Return the bound counterpart class, matching what bind() produces."""
+        return BoundTransform[L]
diff --git a/tests/expressions/test_parser.py b/tests/expressions/test_parser.py
index 3ce2f2226c..fe958ad059 100644
--- a/tests/expressions/test_parser.py
+++ b/tests/expressions/test_parser.py
@@ -39,6 +39,7 @@
     StartsWith,
     parser,
 )
+from pyiceberg.transforms import Reference
 
 
 def test_true() -> None:
@@ -199,3 +200,8 @@ def test_with_function() -> None:
         parser.parse("foo = 1 and lower(bar) = '2'")
 
     assert "Expected end of text, found 'and'" in str(exc_info)
+
+
+def test_cast() -> None:
+    cast = parser.parse("CAST(created_at as date)")
+    assert cast.term == Reference("created_at")