diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index defe5958c5..bf5749ce9b 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -31,13 +31,15 @@ Type, ) +from pydantic_core import to_json + from pyiceberg.avro.file import AvroFile, AvroOutputFile from pyiceberg.conversions import to_bytes from pyiceberg.exceptions import ValidationError from pyiceberg.io import FileIO, InputFile, OutputFile from pyiceberg.partitioning import PartitionSpec from pyiceberg.schema import Schema -from pyiceberg.typedef import EMPTY_DICT, Record, TableVersion +from pyiceberg.typedef import Record, TableVersion from pyiceberg.types import ( BinaryType, BooleanType, @@ -645,7 +647,6 @@ class ManifestWriter(ABC): _output_file: OutputFile _writer: AvroOutputFile[ManifestEntry] _snapshot_id: int - _meta: Dict[str, str] _added_files: int _added_rows: int _existing_files: int @@ -655,15 +656,12 @@ class ManifestWriter(ABC): _min_data_sequence_number: Optional[int] _partitions: List[Record] - def __init__( - self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int, meta: Dict[str, str] = EMPTY_DICT - ) -> None: + def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int) -> None: self.closed = False self._spec = spec self._schema = schema self._output_file = output_file self._snapshot_id = snapshot_id - self._meta = meta self._added_files = 0 self._added_rows = 0 @@ -697,6 +695,15 @@ def content(self) -> ManifestContent: ... @abstractmethod def version(self) -> TableVersion: ... + @property + def _meta(self) -> Dict[str, str]: + return { + "schema": self._schema.model_dump_json(), + "partition-spec": to_json(self._spec.fields).decode("utf-8"), + "partition-spec-id": str(self._spec.spec_id), + "format-version": str(self.version), + } + def _with_partition(self, format_version: TableVersion) -> Schema: data_file_type = data_file_with_partition( format_version=format_version, partition_type=self._spec.partition_type(self._schema) @@ -771,12 +778,6 @@ def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, schema, output_file, snapshot_id, - { - "schema": schema.model_dump_json(), - "partition-spec": spec.model_dump_json(), - "partition-spec-id": str(spec.spec_id), - "format-version": "1", - }, ) def content(self) -> ManifestContent: @@ -792,19 +793,7 @@ def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry: class ManifestWriterV2(ManifestWriter): def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int): - super().__init__( - spec, - schema, - output_file, - snapshot_id, - meta={ - "schema": schema.model_dump_json(), - "partition-spec": spec.model_dump_json(), - "partition-spec-id": str(spec.spec_id), - "format-version": "2", - "content": "data", - }, - ) + super().__init__(spec, schema, output_file, snapshot_id) def content(self) -> ManifestContent: return ManifestContent.DATA @@ -813,6 +802,13 @@ def content(self) -> ManifestContent: def version(self) -> TableVersion: return 2 + @property + def _meta(self) -> Dict[str, str]: + return { + **super()._meta, + "content": "data", + } + def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry: if entry.data_sequence_number is None: if entry.snapshot_id is not None and entry.snapshot_id != self._snapshot_id: diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index 8bb03cd80e..a812b384fc 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -348,8 +348,8 @@ def test_write_manifest( expected_metadata = { "schema": test_schema.model_dump_json(), - "partition-spec": test_spec.model_dump_json(), - "partition-spec-id": str(test_spec.spec_id), + "partition-spec": """[{"source-id":1,"field-id":1,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":2,"transform":"identity","name":"tpep_pickup_datetime"}]""", + "partition-spec-id": str(demo_manifest_file.partition_spec_id), "format-version": str(format_version), } _verify_metadata_with_fastavro(