diff --git a/python/pyiceberg/avro/decoder.py b/python/pyiceberg/avro/decoder.py index cef9de7c7b25..663a193ce8f1 100644 --- a/python/pyiceberg/avro/decoder.py +++ b/python/pyiceberg/avro/decoder.py @@ -16,17 +16,12 @@ # under the License. import decimal import struct -from datetime import date, datetime, time +from datetime import datetime, time from io import SEEK_CUR from uuid import UUID from pyiceberg.io import InputStream -from pyiceberg.utils.datetime import ( - days_to_date, - micros_to_time, - micros_to_timestamp, - micros_to_timestamptz, -) +from pyiceberg.utils.datetime import micros_to_time, micros_to_timestamp, micros_to_timestamptz from pyiceberg.utils.decimal import unscaled_to_decimal STRUCT_FLOAT = struct.Struct(" str: """ return self.read_bytes().decode("utf-8") - def read_date_from_int(self) -> date: - """ - int is decoded as python date object. - int stores the number of days from - the unix epoch, 1 January 1970 (ISO calendar). - """ - return days_to_date(self.read_int()) - def read_uuid_from_fixed(self) -> UUID: """Reads a UUID as a fixed[16]""" return UUID(bytes=self.read(16)) diff --git a/python/pyiceberg/avro/reader.py b/python/pyiceberg/avro/reader.py index 136498ed342c..d1b114cc5b5e 100644 --- a/python/pyiceberg/avro/reader.py +++ b/python/pyiceberg/avro/reader.py @@ -28,7 +28,7 @@ from abc import abstractmethod from dataclasses import dataclass from dataclasses import field as dataclassfield -from datetime import date, datetime, time +from datetime import datetime, time from decimal import Decimal from typing import ( Any, @@ -156,8 +156,8 @@ def skip(self, decoder: BinaryDecoder) -> None: class DateReader(Reader): - def read(self, decoder: BinaryDecoder) -> date: - return decoder.read_date_from_int() + def read(self, decoder: BinaryDecoder) -> int: + return decoder.read_int() def skip(self, decoder: BinaryDecoder) -> None: decoder.skip_int() diff --git a/python/pyiceberg/typedef.py b/python/pyiceberg/typedef.py index 6ad668f006af..ccdeacad3051 100644 --- a/python/pyiceberg/typedef.py +++ b/python/pyiceberg/typedef.py @@ -98,3 +98,6 @@ def get(self, pos: int) -> Any: def __eq__(self, other: Any) -> bool: # For testing return True if isinstance(other, Record) and other._data == self._data else False + + def __repr__(self) -> str: + return "[" + ", ".join([repr(e) for e in self._data]) + "]" diff --git a/python/tests/avro/test_decoder.py b/python/tests/avro/test_decoder.py index ac38f388682a..e7232703766e 100644 --- a/python/tests/avro/test_decoder.py +++ b/python/tests/avro/test_decoder.py @@ -16,7 +16,7 @@ # under the License. from __future__ import annotations -from datetime import date, datetime, timezone +from datetime import datetime, timezone from decimal import Decimal from io import SEEK_SET from types import TracebackType @@ -171,12 +171,6 @@ def test_skip_double() -> None: assert mis.tell() == 8 -def test_read_date() -> None: - mis = MemoryInputStream(b"\xBC\x7D") - decoder = BinaryDecoder(mis) - assert decoder.read_date_from_int() == date(1991, 12, 27) - - def test_read_uuid_from_fixed() -> None: mis = MemoryInputStream(b"\x12\x34\x56\x78" * 4) decoder = BinaryDecoder(mis) diff --git a/python/tests/avro/test_reader.py b/python/tests/avro/test_reader.py index 252888233342..2d54b6e887cf 100644 --- a/python/tests/avro/test_reader.py +++ b/python/tests/avro/test_reader.py @@ -94,7 +94,15 @@ def test_read_header(generated_manifest_entry_file: str, iceberg_manifest_entry_ "type": { "type": "record", "name": "r102", - "fields": [{"field-id": 1000, "default": None, "name": "VendorID", "type": ["null", "int"]}], + "fields": [ + {"field-id": 1000, "default": None, "name": "VendorID", "type": ["null", "int"]}, + { + "field-id": 1001, + "default": None, + "name": "tpep_pickup_datetime", + "type": ["null", {"type": "int", "logicalType": "date"}], + }, + ], }, }, {"field-id": 103, "doc": "Number of records in the file", "name": "record_count", "type": "long"}, @@ -268,7 +276,7 @@ def test_read_manifest_entry_file(generated_manifest_entry_file: str) -> None: Record( "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet", "PARQUET", - Record(None), + Record(1, 1925), 19513, 388872, 67108864, diff --git a/python/tests/conftest.py b/python/tests/conftest.py index c656ebf4d0b3..888888da9f56 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -54,6 +54,7 @@ from pyiceberg.types import ( BinaryType, BooleanType, + DateType, DoubleType, FloatType, IntegerType, @@ -321,7 +322,7 @@ def catalog() -> InMemoryCatalog: "data_file": { "file_path": "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet", "file_format": "PARQUET", - "partition": {"VendorID": None}, + "partition": {"VendorID": 1, "tpep_pickup_datetime": 1925}, "record_count": 19513, "file_size_in_bytes": 388872, "block_size_in_bytes": 67108864, @@ -441,7 +442,7 @@ def catalog() -> InMemoryCatalog: "data_file": { "file_path": "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=1/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00002.parquet", "file_format": "PARQUET", - "partition": {"VendorID": 1}, + "partition": {"VendorID": 1, "tpep_pickup_datetime": 1925}, "record_count": 95050, "file_size_in_bytes": 1265950, "block_size_in_bytes": 67108864, @@ -714,7 +715,15 @@ def avro_schema_manifest_entry() -> Dict[str, Any]: "type": { "type": "record", "name": "r102", - "fields": [{"name": "VendorID", "type": ["null", "int"], "default": None, "field-id": 1000}], + "fields": [ + {"field-id": 1000, "default": None, "name": "VendorID", "type": ["null", "int"]}, + { + "field-id": 1001, + "default": None, + "name": "tpep_pickup_datetime", + "type": ["null", {"type": "int", "logicalType": "date"}], + }, + ], }, "field-id": 102, }, @@ -987,6 +996,12 @@ def iceberg_manifest_entry_schema() -> Schema: field_type=IntegerType(), required=False, ), + NestedField( + field_id=1001, + name="tpep_pickup_datetime", + field_type=DateType(), + required=False, + ), ), required=True, ), diff --git a/python/tests/utils/test_manifest.py b/python/tests/utils/test_manifest.py index 91c111abc15c..e78d0db237ba 100644 --- a/python/tests/utils/test_manifest.py +++ b/python/tests/utils/test_manifest.py @@ -35,236 +35,485 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None: input_file = PyArrowFileIO().new_input(location=generated_manifest_entry_file) - assert list(read_manifest_entry(input_file)) == [ - ManifestEntry( - status=1, - snapshot_id=8744736658442914487, - data_file=DataFile( - file_path="/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet", - file_format=FileFormat.PARQUET, - partition={"VendorID": None}, - record_count=19513, - file_size_in_bytes=388872, - block_size_in_bytes=67108864, - column_sizes={ - 1: 53, - 2: 98153, - 3: 98693, - 4: 53, - 5: 53, - 6: 53, - 7: 17425, - 8: 18528, - 9: 53, - 10: 44788, - 11: 35571, - 12: 53, - 13: 1243, - 14: 2355, - 15: 12750, - 16: 4029, - 17: 110, - 18: 47194, - 19: 2948, - }, - value_counts={ - 1: 19513, - 2: 19513, - 3: 19513, - 4: 19513, - 5: 19513, - 6: 19513, - 7: 19513, - 8: 19513, - 9: 19513, - 10: 19513, - 11: 19513, - 12: 19513, - 13: 19513, - 14: 19513, - 15: 19513, - 16: 19513, - 17: 19513, - 18: 19513, - 19: 19513, - }, - null_value_counts={ - 1: 19513, - 2: 0, - 3: 0, - 4: 19513, - 5: 19513, - 6: 19513, - 7: 0, - 8: 0, - 9: 19513, - 10: 0, - 11: 0, - 12: 19513, - 13: 0, - 14: 0, - 15: 0, - 16: 0, - 17: 0, - 18: 0, - 19: 0, - }, - nan_value_counts={16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0}, - lower_bounds={ - 2: b"2020-04-01 00:00", - 3: b"2020-04-01 00:12", - 7: b"\x03\x00\x00\x00", - 8: b"\x01\x00\x00\x00", - 10: b"\xf6(\\\x8f\xc2\x05S\xc0", - 11: b"\x00\x00\x00\x00\x00\x00\x00\x00", - 13: b"\x00\x00\x00\x00\x00\x00\x00\x00", - 14: b"\x00\x00\x00\x00\x00\x00\xe0\xbf", - 15: b")\\\x8f\xc2\xf5(\x08\xc0", - 16: b"\x00\x00\x00\x00\x00\x00\x00\x00", - 17: b"\x00\x00\x00\x00\x00\x00\x00\x00", - 18: b"\xf6(\\\x8f\xc2\xc5S\xc0", - 19: b"\x00\x00\x00\x00\x00\x00\x04\xc0", - }, - upper_bounds={ - 2: b"2020-04-30 23:5:", - 3: b"2020-05-01 00:41", - 7: b"\t\x01\x00\x00", - 8: b"\t\x01\x00\x00", - 10: b"\xcd\xcc\xcc\xcc\xcc,_@", - 11: b"\x1f\x85\xebQ\\\xe2\xfe@", - 13: b"\x00\x00\x00\x00\x00\x00\x12@", - 14: b"\x00\x00\x00\x00\x00\x00\xe0?", - 15: b"q=\n\xd7\xa3\xf01@", - 16: b"\x00\x00\x00\x00\x00`B@", - 17: b"333333\xd3?", - 18: b"\x00\x00\x00\x00\x00\x18b@", - 19: b"\x00\x00\x00\x00\x00\x00\x04@", - }, - key_metadata=None, - split_offsets=[4], - sort_order_id=0, + assert ( + list(read_manifest_entry(input_file)) + == [ + ManifestEntry( + status=ManifestEntryStatus.ADDED, + snapshot_id=8744736658442914487, + sequence_number=None, + data_file=DataFile( + content=DataFileContent.DATA, + file_path="/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet", + file_format=FileFormat.PARQUET, + partition={"VendorID": 1, "tpep_pickup_datetime": 1925}, + record_count=19513, + file_size_in_bytes=388872, + block_size_in_bytes=67108864, + column_sizes={ + 1: 53, + 2: 98153, + 3: 98693, + 4: 53, + 5: 53, + 6: 53, + 7: 17425, + 8: 18528, + 9: 53, + 10: 44788, + 11: 35571, + 12: 53, + 13: 1243, + 14: 2355, + 15: 12750, + 16: 4029, + 17: 110, + 18: 47194, + 19: 2948, + }, + value_counts={ + 1: 19513, + 2: 19513, + 3: 19513, + 4: 19513, + 5: 19513, + 6: 19513, + 7: 19513, + 8: 19513, + 9: 19513, + 10: 19513, + 11: 19513, + 12: 19513, + 13: 19513, + 14: 19513, + 15: 19513, + 16: 19513, + 17: 19513, + 18: 19513, + 19: 19513, + }, + null_value_counts={ + 1: 19513, + 2: 0, + 3: 0, + 4: 19513, + 5: 19513, + 6: 19513, + 7: 0, + 8: 0, + 9: 19513, + 10: 0, + 11: 0, + 12: 19513, + 13: 0, + 14: 0, + 15: 0, + 16: 0, + 17: 0, + 18: 0, + 19: 0, + }, + nan_value_counts={16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0}, + distinct_counts=None, + lower_bounds={ + 2: b"2020-04-01 00:00", + 3: b"2020-04-01 00:12", + 7: b"\x03\x00\x00\x00", + 8: b"\x01\x00\x00\x00", + 10: b"\xf6(\\\x8f\xc2\x05S\xc0", + 11: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 13: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 14: b"\x00\x00\x00\x00\x00\x00\xe0\xbf", + 15: b")\\\x8f\xc2\xf5(\x08\xc0", + 16: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 17: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 18: b"\xf6(\\\x8f\xc2\xc5S\xc0", + 19: b"\x00\x00\x00\x00\x00\x00\x04\xc0", + }, + upper_bounds={ + 2: b"2020-04-30 23:5:", + 3: b"2020-05-01 00:41", + 7: b"\t\x01\x00\x00", + 8: b"\t\x01\x00\x00", + 10: b"\xcd\xcc\xcc\xcc\xcc,_@", + 11: b"\x1f\x85\xebQ\\\xe2\xfe@", + 13: b"\x00\x00\x00\x00\x00\x00\x12@", + 14: b"\x00\x00\x00\x00\x00\x00\xe0?", + 15: b"q=\n\xd7\xa3\xf01@", + 16: b"\x00\x00\x00\x00\x00`B@", + 17: b"333333\xd3?", + 18: b"\x00\x00\x00\x00\x00\x18b@", + 19: b"\x00\x00\x00\x00\x00\x00\x04@", + }, + key_metadata=None, + split_offsets=[4], + equality_ids=None, + sort_order_id=0, + ), ), - ), - ManifestEntry( - status=1, - snapshot_id=8744736658442914487, - data_file=DataFile( - file_path="/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=1/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00002.parquet", - file_format=FileFormat.PARQUET, - partition={"VendorID": 1}, - record_count=95050, - file_size_in_bytes=1265950, - block_size_in_bytes=67108864, - column_sizes={ - 1: 318, - 2: 329806, - 3: 331632, - 4: 15343, - 5: 2351, - 6: 3389, - 7: 71269, - 8: 76429, - 9: 16383, - 10: 86992, - 11: 89608, - 12: 265, - 13: 19377, - 14: 1692, - 15: 76162, - 16: 4354, - 17: 759, - 18: 120650, - 19: 11804, - }, - value_counts={ - 1: 95050, - 2: 95050, - 3: 95050, - 4: 95050, - 5: 95050, - 6: 95050, - 7: 95050, - 8: 95050, - 9: 95050, - 10: 95050, - 11: 95050, - 12: 95050, - 13: 95050, - 14: 95050, - 15: 95050, - 16: 95050, - 17: 95050, - 18: 95050, - 19: 95050, - }, - null_value_counts={ - 1: 0, - 2: 0, - 3: 0, - 4: 0, - 5: 0, - 6: 0, - 7: 0, - 8: 0, - 9: 0, - 10: 0, - 11: 0, - 12: 95050, - 13: 0, - 14: 0, - 15: 0, - 16: 0, - 17: 0, - 18: 0, - 19: 0, - }, - nan_value_counts={16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0}, - lower_bounds={ - 1: b"\x01\x00\x00\x00", - 2: b"2020-04-01 00:00", - 3: b"2020-04-01 00:03", - 4: b"\x00\x00\x00\x00", - 5: b"\x01\x00\x00\x00", - 6: b"N", - 7: b"\x01\x00\x00\x00", - 8: b"\x01\x00\x00\x00", - 9: b"\x01\x00\x00\x00", - 10: b"\x00\x00\x00\x00\x00\x00\x00\x00", - 11: b"\x00\x00\x00\x00\x00\x00\x00\x00", - 13: b"\x00\x00\x00\x00\x00\x00\x00\x00", - 14: b"\x00\x00\x00\x00\x00\x00\x00\x00", - 15: b"\x00\x00\x00\x00\x00\x00\x00\x00", - 16: b"\x00\x00\x00\x00\x00\x00\x00\x00", - 17: b"\x00\x00\x00\x00\x00\x00\x00\x00", - 18: b"\x00\x00\x00\x00\x00\x00\x00\x00", - 19: b"\x00\x00\x00\x00\x00\x00\x00\x00", - }, - upper_bounds={ - 1: b"\x01\x00\x00\x00", - 2: b"2020-04-30 23:5:", - 3: b"2020-05-01 00:1:", - 4: b"\x06\x00\x00\x00", - 5: b"c\x00\x00\x00", - 6: b"Y", - 7: b"\t\x01\x00\x00", - 8: b"\t\x01\x00\x00", - 9: b"\x04\x00\x00\x00", - 10: b"\\\x8f\xc2\xf5(8\x8c@", - 11: b"\xcd\xcc\xcc\xcc\xcc,f@", - 13: b"\x00\x00\x00\x00\x00\x00\x1c@", - 14: b"\x9a\x99\x99\x99\x99\x99\xf1?", - 15: b"\x00\x00\x00\x00\x00\x00Y@", - 16: b"\x00\x00\x00\x00\x00\xb0X@", - 17: b"333333\xd3?", - 18: b"\xc3\xf5(\\\x8f:\x8c@", - 19: b"\x00\x00\x00\x00\x00\x00\x04@", - }, - key_metadata=None, - split_offsets=[4], - sort_order_id=0, + ManifestEntry( + status=ManifestEntryStatus.ADDED, + snapshot_id=8744736658442914487, + sequence_number=None, + data_file=DataFile( + content=DataFileContent.DATA, + file_path="/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=1/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00002.parquet", + file_format=FileFormat.PARQUET, + partition={"VendorID": 1, "tpep_pickup_datetime": 1925}, + record_count=95050, + file_size_in_bytes=1265950, + block_size_in_bytes=67108864, + column_sizes={ + 1: 318, + 2: 329806, + 3: 331632, + 4: 15343, + 5: 2351, + 6: 3389, + 7: 71269, + 8: 76429, + 9: 16383, + 10: 86992, + 11: 89608, + 12: 265, + 13: 19377, + 14: 1692, + 15: 76162, + 16: 4354, + 17: 759, + 18: 120650, + 19: 11804, + }, + value_counts={ + 1: 95050, + 2: 95050, + 3: 95050, + 4: 95050, + 5: 95050, + 6: 95050, + 7: 95050, + 8: 95050, + 9: 95050, + 10: 95050, + 11: 95050, + 12: 95050, + 13: 95050, + 14: 95050, + 15: 95050, + 16: 95050, + 17: 95050, + 18: 95050, + 19: 95050, + }, + null_value_counts={ + 1: 0, + 2: 0, + 3: 0, + 4: 0, + 5: 0, + 6: 0, + 7: 0, + 8: 0, + 9: 0, + 10: 0, + 11: 0, + 12: 95050, + 13: 0, + 14: 0, + 15: 0, + 16: 0, + 17: 0, + 18: 0, + 19: 0, + }, + nan_value_counts={16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0}, + distinct_counts=None, + lower_bounds={ + 1: b"\x01\x00\x00\x00", + 2: b"2020-04-01 00:00", + 3: b"2020-04-01 00:03", + 4: b"\x00\x00\x00\x00", + 5: b"\x01\x00\x00\x00", + 6: b"N", + 7: b"\x01\x00\x00\x00", + 8: b"\x01\x00\x00\x00", + 9: b"\x01\x00\x00\x00", + 10: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 11: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 13: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 14: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 15: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 16: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 17: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 18: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 19: b"\x00\x00\x00\x00\x00\x00\x00\x00", + }, + upper_bounds={ + 1: b"\x01\x00\x00\x00", + 2: b"2020-04-30 23:5:", + 3: b"2020-05-01 00:1:", + 4: b"\x06\x00\x00\x00", + 5: b"c\x00\x00\x00", + 6: b"Y", + 7: b"\t\x01\x00\x00", + 8: b"\t\x01\x00\x00", + 9: b"\x04\x00\x00\x00", + 10: b"\\\x8f\xc2\xf5(8\x8c@", + 11: b"\xcd\xcc\xcc\xcc\xcc,f@", + 13: b"\x00\x00\x00\x00\x00\x00\x1c@", + 14: b"\x9a\x99\x99\x99\x99\x99\xf1?", + 15: b"\x00\x00\x00\x00\x00\x00Y@", + 16: b"\x00\x00\x00\x00\x00\xb0X@", + 17: b"333333\xd3?", + 18: b"\xc3\xf5(\\\x8f:\x8c@", + 19: b"\x00\x00\x00\x00\x00\x00\x04@", + }, + key_metadata=None, + split_offsets=[4], + equality_ids=None, + sort_order_id=0, + ), ), - ), - ] + ] + != [ + ManifestEntry( + status=ManifestEntryStatus.ADDED, + snapshot_id=8744736658442914487, + sequence_number=None, + data_file=DataFile( + content=DataFileContent.DATA, + file_path="/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet", + file_format=FileFormat.PARQUET, + partition={"VendorID": 1, "tpep_pickup_datetime": None}, + record_count=19513, + file_size_in_bytes=388872, + block_size_in_bytes=67108864, + column_sizes={ + 1: 53, + 2: 98153, + 3: 98693, + 4: 53, + 5: 53, + 6: 53, + 7: 17425, + 8: 18528, + 9: 53, + 10: 44788, + 11: 35571, + 12: 53, + 13: 1243, + 14: 2355, + 15: 12750, + 16: 4029, + 17: 110, + 18: 47194, + 19: 2948, + }, + value_counts={ + 1: 19513, + 2: 19513, + 3: 19513, + 4: 19513, + 5: 19513, + 6: 19513, + 7: 19513, + 8: 19513, + 9: 19513, + 10: 19513, + 11: 19513, + 12: 19513, + 13: 19513, + 14: 19513, + 15: 19513, + 16: 19513, + 17: 19513, + 18: 19513, + 19: 19513, + }, + null_value_counts={ + 1: 19513, + 2: 0, + 3: 0, + 4: 19513, + 5: 19513, + 6: 19513, + 7: 0, + 8: 0, + 9: 19513, + 10: 0, + 11: 0, + 12: 19513, + 13: 0, + 14: 0, + 15: 0, + 16: 0, + 17: 0, + 18: 0, + 19: 0, + }, + nan_value_counts={16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0}, + distinct_counts=None, + lower_bounds={ + 2: b"2020-04-01 00:00", + 3: b"2020-04-01 00:12", + 7: b"\x03\x00\x00\x00", + 8: b"\x01\x00\x00\x00", + 10: b"\xf6(\\\x8f\xc2\x05S\xc0", + 11: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 13: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 14: b"\x00\x00\x00\x00\x00\x00\xe0\xbf", + 15: b")\\\x8f\xc2\xf5(\x08\xc0", + 16: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 17: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 18: b"\xf6(\\\x8f\xc2\xc5S\xc0", + 19: b"\x00\x00\x00\x00\x00\x00\x04\xc0", + }, + upper_bounds={ + 2: b"2020-04-30 23:5:", + 3: b"2020-05-01 00:41", + 7: b"\t\x01\x00\x00", + 8: b"\t\x01\x00\x00", + 10: b"\xcd\xcc\xcc\xcc\xcc,_@", + 11: b"\x1f\x85\xebQ\\\xe2\xfe@", + 13: b"\x00\x00\x00\x00\x00\x00\x12@", + 14: b"\x00\x00\x00\x00\x00\x00\xe0?", + 15: b"q=\n\xd7\xa3\xf01@", + 16: b"\x00\x00\x00\x00\x00`B@", + 17: b"333333\xd3?", + 18: b"\x00\x00\x00\x00\x00\x18b@", + 19: b"\x00\x00\x00\x00\x00\x00\x04@", + }, + key_metadata=None, + split_offsets=[4], + equality_ids=None, + sort_order_id=0, + ), + ), + ManifestEntry( + status=ManifestEntryStatus.ADDED, + snapshot_id=8744736658442914487, + sequence_number=None, + data_file=DataFile( + content=DataFileContent.DATA, + file_path="/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=1/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00002.parquet", + file_format=FileFormat.PARQUET, + partition={"VendorID": 1, "tpep_pickup_datetime": 1925}, + record_count=95050, + file_size_in_bytes=1265950, + block_size_in_bytes=67108864, + column_sizes={ + 1: 318, + 2: 329806, + 3: 331632, + 4: 15343, + 5: 2351, + 6: 3389, + 7: 71269, + 8: 76429, + 9: 16383, + 10: 86992, + 11: 89608, + 12: 265, + 13: 19377, + 14: 1692, + 15: 76162, + 16: 4354, + 17: 759, + 18: 120650, + 19: 11804, + }, + value_counts={ + 1: 95050, + 2: 95050, + 3: 95050, + 4: 95050, + 5: 95050, + 6: 95050, + 7: 95050, + 8: 95050, + 9: 95050, + 10: 95050, + 11: 95050, + 12: 95050, + 13: 95050, + 14: 95050, + 15: 95050, + 16: 95050, + 17: 95050, + 18: 95050, + 19: 95050, + }, + null_value_counts={ + 1: 0, + 2: 0, + 3: 0, + 4: 0, + 5: 0, + 6: 0, + 7: 0, + 8: 0, + 9: 0, + 10: 0, + 11: 0, + 12: 95050, + 13: 0, + 14: 0, + 15: 0, + 16: 0, + 17: 0, + 18: 0, + 19: 0, + }, + nan_value_counts={16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0}, + distinct_counts=None, + lower_bounds={ + 1: b"\x01\x00\x00\x00", + 2: b"2020-04-01 00:00", + 3: b"2020-04-01 00:03", + 4: b"\x00\x00\x00\x00", + 5: b"\x01\x00\x00\x00", + 6: b"N", + 7: b"\x01\x00\x00\x00", + 8: b"\x01\x00\x00\x00", + 9: b"\x01\x00\x00\x00", + 10: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 11: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 13: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 14: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 15: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 16: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 17: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 18: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 19: b"\x00\x00\x00\x00\x00\x00\x00\x00", + }, + upper_bounds={ + 1: b"\x01\x00\x00\x00", + 2: b"2020-04-30 23:5:", + 3: b"2020-05-01 00:1:", + 4: b"\x06\x00\x00\x00", + 5: b"c\x00\x00\x00", + 6: b"Y", + 7: b"\t\x01\x00\x00", + 8: b"\t\x01\x00\x00", + 9: b"\x04\x00\x00\x00", + 10: b"\\\x8f\xc2\xf5(8\x8c@", + 11: b"\xcd\xcc\xcc\xcc\xcc,f@", + 13: b"\x00\x00\x00\x00\x00\x00\x1c@", + 14: b"\x9a\x99\x99\x99\x99\x99\xf1?", + 15: b"\x00\x00\x00\x00\x00\x00Y@", + 16: b"\x00\x00\x00\x00\x00\xb0X@", + 17: b"333333\xd3?", + 18: b"\xc3\xf5(\\\x8f:\x8c@", + 19: b"\x00\x00\x00\x00\x00\x00\x04@", + }, + key_metadata=None, + split_offsets=[4], + equality_ids=None, + sort_order_id=0, + ), + ), + ] + ) def test_read_manifest_list(generated_manifest_file_file: str) -> None: @@ -328,7 +577,6 @@ def test_read_manifest(generated_manifest_file_file: str, generated_manifest_ent key_metadata=None, ) ] - actual = manifest_list[0].fetch_manifest_entry(io) expected = [ ManifestEntry( @@ -339,7 +587,7 @@ def test_read_manifest(generated_manifest_file_file: str, generated_manifest_ent content=DataFileContent.DATA, file_path="/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet", file_format=FileFormat.PARQUET, - partition={"VendorID": None}, + partition={"VendorID": 1, "tpep_pickup_datetime": 1925}, record_count=19513, file_size_in_bytes=388872, block_size_in_bytes=67108864, @@ -452,7 +700,7 @@ def test_read_manifest(generated_manifest_file_file: str, generated_manifest_ent content=DataFileContent.DATA, file_path="/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=1/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00002.parquet", file_format=FileFormat.PARQUET, - partition={"VendorID": 1}, + partition={"VendorID": 1, "tpep_pickup_datetime": 1925}, record_count=95050, file_size_in_bytes=1265950, block_size_in_bytes=67108864,