From bfa8143ad58b24ad239d7567b7821e81947e28e8 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Tue, 3 Oct 2023 13:29:13 +0200 Subject: [PATCH 1/3] Add logic to generate a new snapshot-id --- pyiceberg/table/__init__.py | 21 +++++++++++++++++++++ tests/table/test_init.py | 6 ++++++ 2 files changed, 27 insertions(+) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 8443315a64..ea992475df 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -17,6 +17,7 @@ from __future__ import annotations import itertools +import uuid from abc import ABC, abstractmethod from copy import copy from dataclasses import dataclass @@ -498,6 +499,14 @@ def location(self) -> str: """Return the table's base location.""" return self.metadata.location + def new_snapshot_id(self) -> int: + """Generate a new snapshot-id that's not in use.""" + snapshot_id = _generate_snapshot_id() + while self.snapshot_by_id(snapshot_id) is not None: + snapshot_id = _generate_snapshot_id() + + return snapshot_id + def current_snapshot(self) -> Optional[Snapshot]: """Get the current snapshot for this table, or None if there is no current snapshot.""" if snapshot_id := self.metadata.current_snapshot_id: @@ -1566,3 +1575,15 @@ def _add_and_move_fields( elif len(moves) > 0: return _move_fields(fields, moves) return None if len(adds) == 0 else tuple(*fields, *adds) + + +def _generate_snapshot_id() -> int: + """Generate a new Snapshot ID from a UUID. + + Right shifting the 64 bits removes the MAC address and time + leaving only the part that's based on the clock (and has the + highest entropy). + + Returns: An 64 bit long + """ + return uuid.uuid4().int & (1 << 64) - 1 diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 8fd5e2bcdb..369df4fa92 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -41,6 +41,7 @@ StaticTable, Table, UpdateSchema, + _generate_snapshot_id, _match_deletes_to_datafile, ) from pyiceberg.table.metadata import INITIAL_SEQUENCE_NUMBER @@ -506,3 +507,8 @@ def test_add_nested_list_type_column(table: Table) -> None: element_required=False, ) assert new_schema.highest_field_id == 7 + + +def test_generate_snapshot_id(table: Table) -> None: + assert isinstance(_generate_snapshot_id(), int) + assert isinstance(table.new_snapshot_id(), int) From a5d56c0deccdf0a9d8f03474d578922aa8c27f74 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Tue, 3 Oct 2023 18:30:21 +0200 Subject: [PATCH 2/3] Use the xor approach --- pyiceberg/table/__init__.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index ea992475df..e117cbe96a 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -100,6 +100,8 @@ ALWAYS_TRUE = AlwaysTrue() TABLE_ROOT_ID = -1 +_JAVA_LONG_MAX = 9223372036854775807 + class Transaction: _table: Table @@ -1580,10 +1582,16 @@ def _add_and_move_fields( def _generate_snapshot_id() -> int: """Generate a new Snapshot ID from a UUID. - Right shifting the 64 bits removes the MAC address and time - leaving only the part that's based on the clock (and has the - highest entropy). - Returns: An 64 bit long """ - return uuid.uuid4().int & (1 << 64) - 1 + rnd_uuid = uuid.uuid4() + snapshot_id = int.from_bytes( + bytes(lhs ^ rhs for lhs, rhs in zip(rnd_uuid.bytes[0:8], rnd_uuid.bytes[8:16])), byteorder='little', signed=True + ) + + if snapshot_id < 0: + raise ValueError(f"Snapshot ID should not be negative: {snapshot_id}") + if snapshot_id > _JAVA_LONG_MAX: + raise ValueError(f"Snapshot ID should not be larger than signed 63 bit ({_JAVA_LONG_MAX}): {snapshot_id}") + + return snapshot_id From 11b3c5d603177328a28083fe0fcbf352efb6cb51 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 5 Oct 2023 10:11:49 +0200 Subject: [PATCH 3/3] Closer to the Java way of doing it --- pyiceberg/table/__init__.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index e117cbe96a..6e71a40c2d 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -100,8 +100,6 @@ ALWAYS_TRUE = AlwaysTrue() TABLE_ROOT_ID = -1 -_JAVA_LONG_MAX = 9223372036854775807 - class Transaction: _table: Table @@ -1588,10 +1586,6 @@ def _generate_snapshot_id() -> int: snapshot_id = int.from_bytes( bytes(lhs ^ rhs for lhs, rhs in zip(rnd_uuid.bytes[0:8], rnd_uuid.bytes[8:16])), byteorder='little', signed=True ) - - if snapshot_id < 0: - raise ValueError(f"Snapshot ID should not be negative: {snapshot_id}") - if snapshot_id > _JAVA_LONG_MAX: - raise ValueError(f"Snapshot ID should not be larger than signed 63 bit ({_JAVA_LONG_MAX}): {snapshot_id}") + snapshot_id = snapshot_id if snapshot_id >= 0 else snapshot_id * -1 return snapshot_id